Merge branch 'master' into pipe-video

Conflicts: src/gallium/include/pipe/p_format.h
author: Thomas Balling Sørensen <tball@tball-laptop.(none)> 2010-10-26 12:49:41 +0200
committer: Thomas Balling Sørensen <tball@tball-laptop.(none)> 2010-10-26 12:49:41 +0200
commit: dbf3a15313eed930a3d8fdde12e457259c43651b (patch)
tree: 78bc54fa866ff1e97e61802f52dabe3f4f3380b1 /src/gallium
parent: 1dccc4cfaa423f15ab582d2a0253a84a0ae0b9fa (diff)
parent: 547e7619aac74ae13bdaa7fdf403a4ceb5212467 (diff)
200 files changed, 9064 insertions, 4866 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 05096b12a86..ed96725d782 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -173,6 +173,7 @@ GALLIVM_SOURCES = \
         gallivm/lp_bld_struct.c \
         gallivm/lp_bld_swizzle.c \
         gallivm/lp_bld_tgsi_aos.c \
+        gallivm/lp_bld_tgsi_info.c \
         gallivm/lp_bld_tgsi_soa.c \
         gallivm/lp_bld_type.c \
         draw/draw_llvm.c \
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index a18f7c0b2a3..f22c8b96123 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -225,6 +225,7 @@ if env['llvm']:
     'gallivm/lp_bld_struct.c',
     'gallivm/lp_bld_swizzle.c',
     'gallivm/lp_bld_tgsi_aos.c',
+    'gallivm/lp_bld_tgsi_info.c',
     'gallivm/lp_bld_tgsi_soa.c',
     'gallivm/lp_bld_type.c',
     'draw/draw_llvm.c',
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 032fcbbc70a..39d82f32892 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -335,6 +335,7 @@ draw_set_mapped_constant_buffer(struct draw_context *draw,
    case PIPE_SHADER_VERTEX:
       draw->pt.user.vs_constants[slot] = buffer;
       draw->pt.user.vs_constants_size[slot] = size;
+      draw->pt.user.planes = (float (*) [12][4]) &(draw->plane[0]);
       draw_vs_set_constants(draw, slot, buffer, size);
       break;
    case PIPE_SHADER_GEOMETRY:
@@ -721,9 +722,9 @@ draw_set_mapped_texture(struct draw_context *draw,
                         unsigned sampler_idx,
                         uint32_t width, uint32_t height, uint32_t depth,
                         uint32_t last_level,
-                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        const void *data[DRAW_MAX_TEXTURE_LEVELS])
+                        uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        const void *data[PIPE_MAX_TEXTURE_LEVELS])
 {
 #ifdef HAVE_LLVM
    if(draw->llvm)
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 1f27cbf488a..ff4f753604f 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -49,7 +49,6 @@ struct draw_geometry_shader;
 struct draw_fragment_shader;
 struct tgsi_sampler;
 
-#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
 
 struct draw_context *draw_create( struct pipe_context *pipe );
 
@@ -120,9 +119,9 @@ draw_set_mapped_texture(struct draw_context *draw,
                         unsigned sampler_idx,
                         uint32_t width, uint32_t height, uint32_t depth,
                         uint32_t last_level,
-                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        const void *data[DRAW_MAX_TEXTURE_LEVELS]);
+                        uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        const void *data[PIPE_MAX_TEXTURE_LEVELS]);
 
 
 /*
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 7fb86d7cb27..140e596f994 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -31,6 +31,9 @@
 #include "draw_vs.h"
 
 #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_logic.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_swizzle.h"
 #include "gallivm/lp_bld_struct.h"
 #include "gallivm/lp_bld_type.h"
 #include "gallivm/lp_bld_flow.h"
@@ -43,7 +46,6 @@
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_dump.h"
 
-#include "util/u_cpu_detect.h"
 #include "util/u_math.h"
 #include "util/u_pointer.h"
 #include "util/u_string.h"
@@ -72,12 +74,12 @@ init_globals(struct draw_llvm *llvm)
       elem_types[DRAW_JIT_TEXTURE_DEPTH] = LLVMInt32Type();
       elem_types[DRAW_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32Type();
       elem_types[DRAW_JIT_TEXTURE_ROW_STRIDE] =
-         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+         LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS);
       elem_types[DRAW_JIT_TEXTURE_IMG_STRIDE] =
-         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+         LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS);
       elem_types[DRAW_JIT_TEXTURE_DATA] =
          LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0),
-                       DRAW_MAX_TEXTURE_LEVELS);
+                       PIPE_MAX_TEXTURE_LEVELS);
       elem_types[DRAW_JIT_TEXTURE_MIN_LOD] = LLVMFloatType();
       elem_types[DRAW_JIT_TEXTURE_MAX_LOD] = LLVMFloatType();
       elem_types[DRAW_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType();
@@ -128,12 +130,14 @@ init_globals(struct draw_llvm *llvm)
 
    /* struct draw_jit_context */
    {
-      LLVMTypeRef elem_types[3];
+      LLVMTypeRef elem_types[5];
       LLVMTypeRef context_type;
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
-      elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
-      elem_types[2] = LLVMArrayType(texture_type,
+      elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* gs_constants */
+      elem_types[2] = LLVMPointerType(LLVMArrayType(LLVMArrayType(LLVMFloatType(), 4), 12), 0); /* planes */
+      elem_types[3] = LLVMPointerType(LLVMFloatType(), 0); /* viewport */
+      elem_types[4] = LLVMArrayType(texture_type,
                                     PIPE_MAX_VERTEX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
@@ -142,6 +146,8 @@ init_globals(struct draw_llvm *llvm)
                              llvm->target, context_type, 0);
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, gs_constants,
                              llvm->target, context_type, 1);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, planes,
+                             llvm->target, context_type, 2);
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, textures,
                              llvm->target, context_type,
                              DRAW_JIT_CTX_TEXTURES);
@@ -267,13 +273,7 @@ draw_llvm_create(struct draw_context *draw)
          LLVMAddConstantPropagationPass(llvm->pass);
       }
 
-      if(util_cpu_caps.has_sse4_1) {
-         /* FIXME: There is a bug in this pass, whereby the combination of fptosi
-          * and sitofp (necessary for trunc/floor/ceil/round implementation)
-          * somehow becomes invalid code.
-          */
-         LLVMAddInstructionCombiningPass(llvm->pass);
-      }
+      LLVMAddInstructionCombiningPass(llvm->pass);
       LLVMAddGVNPass(llvm->pass);
    } else {
       /* We need at least this pass to prevent the backends to fail in
@@ -421,7 +421,7 @@ generate_fetch(LLVMBuilderRef builder,
                             "instance_divisor");
    }
 
-   /* limit index to min(inex, vb_max_index) */
+   /* limit index to min(index, vb_max_index) */
    cond = LLVMBuildICmp(builder, LLVMIntULE, index, vb_max_index, "");
    index = LLVMBuildSelect(builder, cond, index, vb_max_index, "");
 
@@ -550,19 +550,28 @@ static void
 store_aos(LLVMBuilderRef builder,
           LLVMValueRef io_ptr,
           LLVMValueRef index,
-          LLVMValueRef value)
+          LLVMValueRef value,
+          LLVMValueRef clipmask)
 {
    LLVMValueRef id_ptr = draw_jit_header_id(builder, io_ptr);
    LLVMValueRef data_ptr = draw_jit_header_data(builder, io_ptr);
    LLVMValueRef indices[3];
+   LLVMValueRef val, shift;
 
    indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
    indices[1] = index;
    indices[2] = LLVMConstInt(LLVMInt32Type(), 0, 0);
 
-   /* undefined vertex */
-   LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(),
-                                        0xffff, 0), id_ptr);
+   /* initialize vertex id:16 = 0xffff, pad:3 = 0, edgeflag:1 = 1 */
+   val = LLVMConstInt(LLVMInt32Type(), 0xffff1, 0); 
+   shift  = LLVMConstInt(LLVMInt32Type(), 12, 0);          
+   val = LLVMBuildShl(builder, val, shift, "");
+   /* add clipmask:12 */   
+   val = LLVMBuildOr(builder, val, clipmask, "");               
+
+   /* store vertex header */
+   LLVMBuildStore(builder, val, id_ptr);
+
 
 #if DEBUG_STORE
    lp_build_printf(builder, "    ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr);
@@ -617,7 +626,8 @@ store_aos_array(LLVMBuilderRef builder,
                 LLVMValueRef io_ptr,
                 LLVMValueRef aos[NUM_CHANNELS],
                 int attrib,
-                int num_outputs)
+                int num_outputs,
+                LLVMValueRef clipmask)
 {
    LLVMValueRef attr_index = LLVMConstInt(LLVMInt32Type(), attrib, 0);
    LLVMValueRef ind0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
@@ -625,7 +635,8 @@ store_aos_array(LLVMBuilderRef builder,
    LLVMValueRef ind2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
    LLVMValueRef ind3 = LLVMConstInt(LLVMInt32Type(), 3, 0);
    LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
-
+   LLVMValueRef clipmask0, clipmask1, clipmask2, clipmask3;
+   
    debug_assert(NUM_CHANNELS == 4);
 
    io0_ptr = LLVMBuildGEP(builder, io_ptr,
@@ -637,21 +648,31 @@ store_aos_array(LLVMBuilderRef builder,
    io3_ptr = LLVMBuildGEP(builder, io_ptr,
                           &ind3, 1, "");
 
+   clipmask0 = LLVMBuildExtractElement(builder, clipmask,
+                                       ind0, "");
+   clipmask1 = LLVMBuildExtractElement(builder, clipmask,
+                                       ind1, "");
+   clipmask2 = LLVMBuildExtractElement(builder, clipmask,
+                                       ind2, "");
+   clipmask3 = LLVMBuildExtractElement(builder, clipmask,
+                                       ind3, "");
+
 #if DEBUG_STORE
-   lp_build_printf(builder, "   io = %p, indexes[%d, %d, %d, %d]\n",
-                   io_ptr, ind0, ind1, ind2, ind3);
+   lp_build_printf(builder, "io = %p, indexes[%d, %d, %d, %d]\n, clipmask0 = %x, clipmask1 = %x, clipmask2 = %x, clipmask3 = %x\n",
+                   io_ptr, ind0, ind1, ind2, ind3, clipmask0, clipmask1, clipmask2, clipmask3);
 #endif
-
-   store_aos(builder, io0_ptr, attr_index, aos[0]);
-   store_aos(builder, io1_ptr, attr_index, aos[1]);
-   store_aos(builder, io2_ptr, attr_index, aos[2]);
-   store_aos(builder, io3_ptr, attr_index, aos[3]);
+   /* store for each of the 4 vertices */
+   store_aos(builder, io0_ptr, attr_index, aos[0], clipmask0);
+   store_aos(builder, io1_ptr, attr_index, aos[1], clipmask1);
+   store_aos(builder, io2_ptr, attr_index, aos[2], clipmask2);
+   store_aos(builder, io3_ptr, attr_index, aos[3], clipmask3);
 }
 
 static void
 convert_to_aos(LLVMBuilderRef builder,
                LLVMValueRef io,
                LLVMValueRef (*outputs)[NUM_CHANNELS],
+               LLVMValueRef clipmask,
                int num_outputs,
                int max_vertices)
 {
@@ -680,13 +701,305 @@ convert_to_aos(LLVMBuilderRef builder,
                       io,
                       aos,
                       attrib,
-                      num_outputs);
+                      num_outputs,
+                      clipmask);
    }
 #if DEBUG_STORE
    lp_build_printf(builder, "   # storing end\n");
 #endif
 }
 
+/*
+ * Stores original vertex positions in clip coordinates
+ * There is probably a more efficient way to do this, 4 floats at once
+ * rather than extracting each element one by one.
+ */
+static void
+store_clip(LLVMBuilderRef builder,
+           LLVMValueRef io_ptr,           
+           LLVMValueRef (*outputs)[NUM_CHANNELS])
+{
+   LLVMValueRef out[4];
+   LLVMValueRef indices[2]; 
+   LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
+   LLVMValueRef clip_ptr0, clip_ptr1, clip_ptr2, clip_ptr3;
+   LLVMValueRef clip0_ptr, clip1_ptr, clip2_ptr, clip3_ptr;    
+   LLVMValueRef out0elem, out1elem, out2elem, out3elem;
+   int i;
+
+   LLVMValueRef ind0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   LLVMValueRef ind1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+   LLVMValueRef ind2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+   LLVMValueRef ind3 = LLVMConstInt(LLVMInt32Type(), 3, 0);
+   
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indices[1] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   
+   out[0] = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 x1 x2 x3*/
+   out[1] = LLVMBuildLoad(builder, outputs[0][1], ""); /*y0 y1 y2 y3*/
+   out[2] = LLVMBuildLoad(builder, outputs[0][2], ""); /*z0 z1 z2 z3*/
+   out[3] = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/  
+
+   io0_ptr = LLVMBuildGEP(builder, io_ptr, &ind0, 1, "");
+   io1_ptr = LLVMBuildGEP(builder, io_ptr, &ind1, 1, "");
+   io2_ptr = LLVMBuildGEP(builder, io_ptr, &ind2, 1, "");
+   io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, "");
+
+   clip_ptr0 = draw_jit_header_clip(builder, io0_ptr);
+   clip_ptr1 = draw_jit_header_clip(builder, io1_ptr);
+   clip_ptr2 = draw_jit_header_clip(builder, io2_ptr);
+   clip_ptr3 = draw_jit_header_clip(builder, io3_ptr);
+
+   for (i = 0; i<4; i++){
+      clip0_ptr = LLVMBuildGEP(builder, clip_ptr0,
+                               indices, 2, ""); //x0
+      clip1_ptr = LLVMBuildGEP(builder, clip_ptr1,
+                               indices, 2, ""); //x1
+      clip2_ptr = LLVMBuildGEP(builder, clip_ptr2,
+                               indices, 2, ""); //x2
+      clip3_ptr = LLVMBuildGEP(builder, clip_ptr3,
+                               indices, 2, ""); //x3
+
+      out0elem = LLVMBuildExtractElement(builder, out[i],
+                                         ind0, ""); //x0
+      out1elem = LLVMBuildExtractElement(builder, out[i],
+                                         ind1, ""); //x1
+      out2elem = LLVMBuildExtractElement(builder, out[i],
+                                         ind2, ""); //x2
+      out3elem = LLVMBuildExtractElement(builder, out[i],
+                                         ind3, ""); //x3
+  
+      LLVMBuildStore(builder, out0elem, clip0_ptr);
+      LLVMBuildStore(builder, out1elem, clip1_ptr);
+      LLVMBuildStore(builder, out2elem, clip2_ptr);
+      LLVMBuildStore(builder, out3elem, clip3_ptr);
+
+      indices[1]= LLVMBuildAdd(builder, indices[1], ind1, "");
+   }
+
+}
+
+/* Equivalent of _mm_set1_ps(a)
+ */
+static LLVMValueRef vec4f_from_scalar(LLVMBuilderRef bld,
+				      LLVMValueRef a,
+				      const char *name)
+{
+   LLVMValueRef res = LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4));
+   int i;
+
+   for(i = 0; i < 4; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      res = LLVMBuildInsertElement(bld, res, a, index, i == 3 ? name : "");
+   }
+
+   return res;
+}
+
+/*
+ * Transforms the outputs for viewport mapping
+ */
+static void
+generate_viewport(struct draw_llvm *llvm,
+                  LLVMBuilderRef builder,
+                  LLVMValueRef (*outputs)[NUM_CHANNELS],
+                  LLVMValueRef context_ptr)
+{
+   int i;
+   struct lp_type f32_type = lp_type_float_vec(32);
+   LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/   
+   LLVMValueRef const1 = lp_build_const_vec(f32_type, 1.0);       /*1.0 1.0 1.0 1.0*/ 
+   LLVMValueRef vp_ptr = draw_jit_context_viewport(builder, context_ptr);
+
+   /* for 1/w convention*/
+   out3 = LLVMBuildFDiv(builder, const1, out3, "");
+   LLVMBuildStore(builder, out3, outputs[0][3]);
+  
+   /* Viewport Mapping */
+   for (i=0; i<3; i++){
+      LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /*x0 x1 x2 x3*/
+      LLVMValueRef scale;
+      LLVMValueRef trans;
+      LLVMValueRef scale_i;
+      LLVMValueRef trans_i;
+      LLVMValueRef index;
+      
+      index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      scale_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, "");
+
+      index = LLVMConstInt(LLVMInt32Type(), i+4, 0);
+      trans_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, "");
+
+      scale = vec4f_from_scalar(builder, LLVMBuildLoad(builder, scale_i, ""), "scale");
+      trans = vec4f_from_scalar(builder, LLVMBuildLoad(builder, trans_i, ""), "trans");
+
+      /* divide by w */
+      out = LLVMBuildFMul(builder, out, out3, "");
+      /* mult by scale */
+      out = LLVMBuildFMul(builder, out, scale, "");
+      /* add translation */
+      out = LLVMBuildFAdd(builder, out, trans, "");
+
+      /* store transformed outputs */
+      LLVMBuildStore(builder, out, outputs[0][i]);
+   }
+   
+}
+
+
+/*
+ * Returns clipmask as 4xi32 bitmask for the 4 vertices
+ */
+static LLVMValueRef 
+generate_clipmask(LLVMBuilderRef builder,
+                  LLVMValueRef (*outputs)[NUM_CHANNELS],
+                  boolean clip_xy,
+                  boolean clip_z,
+                  boolean clip_user,
+                  boolean clip_halfz,
+                  unsigned nr,
+                  LLVMValueRef context_ptr)
+{
+   LLVMValueRef mask; /* stores the <4xi32> clipmasks */     
+   LLVMValueRef test, temp; 
+   LLVMValueRef zero, shift;
+   LLVMValueRef pos_x, pos_y, pos_z, pos_w;
+   LLVMValueRef plane1, planes, plane_ptr, sum;
+
+   unsigned i;
+
+   struct lp_type f32_type = lp_type_float_vec(32); 
+
+   mask = lp_build_const_int_vec(lp_type_int_vec(32), 0);
+   temp = lp_build_const_int_vec(lp_type_int_vec(32), 0);
+   zero = lp_build_const_vec(f32_type, 0);                    /* 0.0f 0.0f 0.0f 0.0f */
+   shift = lp_build_const_int_vec(lp_type_int_vec(32), 1);    /* 1 1 1 1 */
+
+   /* Assuming position stored at output[0] */
+   pos_x = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 x1 x2 x3*/
+   pos_y = LLVMBuildLoad(builder, outputs[0][1], ""); /*y0 y1 y2 y3*/
+   pos_z = LLVMBuildLoad(builder, outputs[0][2], ""); /*z0 z1 z2 z3*/
+   pos_w = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/   
+
+   /* Cliptest, for hardwired planes */
+   if (clip_xy){
+      /* plane 1 */
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_x , pos_w);
+      temp = shift;
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = test;
+   
+      /* plane 2 */
+      test = LLVMBuildFAdd(builder, pos_x, pos_w, "");
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test);
+      temp = LLVMBuildShl(builder, temp, shift, "");
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = LLVMBuildOr(builder, mask, test, "");
+   
+      /* plane 3 */
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_y, pos_w);
+      temp = LLVMBuildShl(builder, temp, shift, "");
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = LLVMBuildOr(builder, mask, test, "");
+
+      /* plane 4 */
+      test = LLVMBuildFAdd(builder, pos_y, pos_w, "");
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test);
+      temp = LLVMBuildShl(builder, temp, shift, "");
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = LLVMBuildOr(builder, mask, test, "");
+   }
+
+   if (clip_z){
+      temp = lp_build_const_int_vec(lp_type_int_vec(32), 16);
+      if (clip_halfz){
+         /* plane 5 */
+         test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, pos_z);
+         test = LLVMBuildAnd(builder, test, temp, ""); 
+         mask = LLVMBuildOr(builder, mask, test, "");
+      }  
+      else{
+         /* plane 5 */
+         test = LLVMBuildFAdd(builder, pos_z, pos_w, "");
+         test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test);
+         test = LLVMBuildAnd(builder, test, temp, ""); 
+         mask = LLVMBuildOr(builder, mask, test, "");
+      }
+      /* plane 6 */
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_z, pos_w);
+      temp = LLVMBuildShl(builder, temp, shift, "");
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = LLVMBuildOr(builder, mask, test, "");
+   }   
+
+   if (clip_user){
+      LLVMValueRef planes_ptr = draw_jit_context_planes(builder, context_ptr);
+      LLVMValueRef indices[3];
+      temp = lp_build_const_int_vec(lp_type_int_vec(32), 32);
+
+      /* userclip planes */
+      for (i = 6; i < nr; i++) {
+         indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+         indices[1] = LLVMConstInt(LLVMInt32Type(), i, 0);
+
+         indices[2] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+         plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
+         plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_x");
+         planes = vec4f_from_scalar(builder, plane1, "plane4_x");
+         sum = LLVMBuildFMul(builder, planes, pos_x, "");
+
+         indices[2] = LLVMConstInt(LLVMInt32Type(), 1, 0);
+         plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
+         plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_y"); 
+         planes = vec4f_from_scalar(builder, plane1, "plane4_y");
+         test = LLVMBuildFMul(builder, planes, pos_y, "");
+         sum = LLVMBuildFAdd(builder, sum, test, "");
+         
+         indices[2] = LLVMConstInt(LLVMInt32Type(), 2, 0);
+         plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
+         plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_z"); 
+         planes = vec4f_from_scalar(builder, plane1, "plane4_z");
+         test = LLVMBuildFMul(builder, planes, pos_z, "");
+         sum = LLVMBuildFAdd(builder, sum, test, "");
+
+         indices[2] = LLVMConstInt(LLVMInt32Type(), 3, 0);
+         plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
+         plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_w"); 
+         planes = vec4f_from_scalar(builder, plane1, "plane4_w");
+         test = LLVMBuildFMul(builder, planes, pos_w, "");
+         sum = LLVMBuildFAdd(builder, sum, test, "");
+
+         test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, sum);
+         temp = LLVMBuildShl(builder, temp, shift, "");
+         test = LLVMBuildAnd(builder, test, temp, ""); 
+         mask = LLVMBuildOr(builder, mask, test, "");
+      }
+   }
+   return mask;
+}
+
+/*
+ * Returns boolean if any clipping has occurred
+ * Used zero/non-zero i32 value to represent boolean 
+ */
+static void
+clipmask_bool(LLVMBuilderRef builder, 
+              LLVMValueRef clipmask,
+              LLVMValueRef ret_ptr)
+{
+   LLVMValueRef ret = LLVMBuildLoad(builder, ret_ptr, "");   
+   LLVMValueRef temp;
+   int i;
+
+   for (i=0; i<4; i++){   
+      temp = LLVMBuildExtractElement(builder, clipmask,
+                                     LLVMConstInt(LLVMInt32Type(), i, 0) , "");
+      ret = LLVMBuildOr(builder, ret, temp, "");
+   }
+   
+   LLVMBuildStore(builder, ret, ret_ptr);
+}
+
 static void
 draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 {
@@ -706,7 +1019,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    void *code;
    struct lp_build_sampler_soa *sampler = 0;
-
+   LLVMValueRef ret, ret_ptr;
+   boolean bypass_viewport = variant->key.bypass_viewport;
+   boolean enable_cliptest = variant->key.clip_xy || 
+                             variant->key.clip_z  ||
+                             variant->key.clip_user;
+   
    arg_types[0] = llvm->context_ptr_type;           /* context */
    arg_types[1] = llvm->vertex_header_ptr_type;     /* vertex_header */
    arg_types[2] = llvm->buffer_ptr_type;            /* vbuffers */
@@ -716,7 +1034,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    arg_types[6] = llvm->vb_ptr_type;                /* pipe_vertex_buffer's */
    arg_types[7] = LLVMInt32Type();                  /* instance_id */
 
-   func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
+   func_type = LLVMFunctionType(LLVMInt32Type(), arg_types, Elements(arg_types), 0);
 
    variant->function = LLVMAddFunction(llvm->module, "draw_llvm_shader", func_type);
    LLVMSetFunctionCallConv(variant->function, LLVMCCallConv);
@@ -756,6 +1074,10 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 
    step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
 
+   /* function will return non-zero i32 value if any clipped vertices */     
+   ret_ptr = lp_build_alloca(builder, LLVMInt32Type(), "");   
+   LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), ret_ptr);
+
    /* code generated texture sampling */
    sampler = draw_llvm_sampler_soa_create(
       draw_llvm_variant_key_samplers(&variant->key),
@@ -770,6 +1092,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
       LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
       LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS] = { { 0 } };
       LLVMValueRef io;
+      LLVMValueRef clipmask;   /* holds the clipmask value */
       const LLVMValueRef (*ptr_aos)[NUM_CHANNELS];
 
       io_itr = LLVMBuildSub(builder, lp_loop.counter, start, "");
@@ -806,10 +1129,37 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
                   context_ptr,
                   sampler);
 
-      convert_to_aos(builder, io, outputs,
+      /* store original positions in clip before further manipulation */
+      store_clip(builder, io, outputs);
+
+      /* do cliptest */
+      if (enable_cliptest){
+         /* allocate clipmask, assign it integer type */
+         clipmask = generate_clipmask(builder, outputs,
+                                      variant->key.clip_xy,
+                                      variant->key.clip_z, 
+                                      variant->key.clip_user,
+                                      variant->key.clip_halfz,
+                                      variant->key.nr_planes,
+                                      context_ptr);
+         /* return clipping boolean value for function */
+         clipmask_bool(builder, clipmask, ret_ptr);
+      }
+      else{
+         clipmask = lp_build_const_int_vec(lp_type_int_vec(32), 0);    
+      }
+      
+      /* do viewport mapping */
+      if (!bypass_viewport){
+         generate_viewport(llvm, builder, outputs, context_ptr);
+      }
+
+      /* store clipmask in vertex header and positions in data */
+      convert_to_aos(builder, io, outputs, clipmask,
                      draw->vs.vertex_shader->info.num_outputs,
                      max_vertices);
    }
+
    lp_build_loop_end_cond(builder, end, step, LLVMIntUGE, &lp_loop);
 
    sampler->destroy(sampler);
@@ -819,8 +1169,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0);
 #endif
 
-   LLVMBuildRetVoid(builder);
-
+   ret = LLVMBuildLoad(builder, ret_ptr,"");
+   LLVMBuildRet(builder, ret);
+      
    LLVMDisposeBuilder(builder);
 
    /*
@@ -870,7 +1221,12 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    LLVMValueRef fetch_max;
    void *code;
    struct lp_build_sampler_soa *sampler = 0;
-
+   LLVMValueRef ret, ret_ptr;
+   boolean bypass_viewport = variant->key.bypass_viewport;
+   boolean enable_cliptest = variant->key.clip_xy || 
+                             variant->key.clip_z  ||
+                             variant->key.clip_user;
+   
    arg_types[0] = llvm->context_ptr_type;               /* context */
    arg_types[1] = llvm->vertex_header_ptr_type;         /* vertex_header */
    arg_types[2] = llvm->buffer_ptr_type;                /* vbuffers */
@@ -880,10 +1236,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    arg_types[6] = llvm->vb_ptr_type;                    /* pipe_vertex_buffer's */
    arg_types[7] = LLVMInt32Type();                      /* instance_id */
 
-   func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
+   func_type = LLVMFunctionType(LLVMInt32Type(), arg_types, Elements(arg_types), 0);
 
-   variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts",
-                                            func_type);
+   variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", func_type);
    LLVMSetFunctionCallConv(variant->function_elts, LLVMCCallConv);
    for(i = 0; i < Elements(arg_types); ++i)
       if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
@@ -929,11 +1284,16 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
                             LLVMConstInt(LLVMInt32Type(), 1, 0),
                             "fetch_max");
 
+   /* function returns non-zero i32 value if any clipped vertices */
+   ret_ptr = lp_build_alloca(builder, LLVMInt32Type(), ""); 
+   LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), ret_ptr);
+
    lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), &lp_loop);
    {
       LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
       LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS] = { { 0 } };
       LLVMValueRef io;
+      LLVMValueRef clipmask;   /* holds the clipmask value */
       const LLVMValueRef (*ptr_aos)[NUM_CHANNELS];
 
       io_itr = lp_loop.counter;
@@ -980,10 +1340,40 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
                   context_ptr,
                   sampler);
 
-      convert_to_aos(builder, io, outputs,
+      /* store original positions in clip before further manipulation */
+      store_clip(builder, io, outputs);
+
+      /* do cliptest */
+      if (enable_cliptest){
+         /* allocate clipmask, assign it integer type */
+         clipmask = generate_clipmask(builder, outputs,
+                                      variant->key.clip_xy,
+                                      variant->key.clip_z, 
+                                      variant->key.clip_user,
+                                      variant->key.clip_halfz,
+                                      variant->key.nr_planes,
+                                      context_ptr);
+         /* return clipping boolean value for function */
+         clipmask_bool(builder, clipmask, ret_ptr);
+      }
+      else{
+         clipmask = lp_build_const_int_vec(lp_type_int_vec(32), 0);
+      }
+      
+      /* do viewport mapping */
+      if (!bypass_viewport){
+         generate_viewport(llvm, builder, outputs, context_ptr);
+      }
+
+      /* store clipmask in vertex header, 
+       * original positions in clip 
+       * and transformed positions in data 
+       */   
+      convert_to_aos(builder, io, outputs, clipmask,
                      draw->vs.vertex_shader->info.num_outputs,
                      max_vertices);
    }
+
    lp_build_loop_end_cond(builder, fetch_count, step, LLVMIntUGE, &lp_loop);
 
    sampler->destroy(sampler);
@@ -993,8 +1383,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0);
 #endif
 
-   LLVMBuildRetVoid(builder);
-
+   ret = LLVMBuildLoad(builder, ret_ptr,"");   
+   LLVMBuildRet(builder, ret);
+   
    LLVMDisposeBuilder(builder);
 
    /*
@@ -1038,6 +1429,16 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
     */
    key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements;
 
+   /* will have to rig this up properly later */
+   key->clip_xy = llvm->draw->clip_xy;
+   key->clip_z = llvm->draw->clip_z;
+   key->clip_user = llvm->draw->clip_user;
+   key->bypass_viewport = llvm->draw->identity_viewport;
+   key->clip_halfz = !llvm->draw->rasterizer->gl_rasterization_rules;
+   key->need_edgeflags = (llvm->draw->vs.edgeflag_output ? TRUE : FALSE);
+   key->nr_planes = llvm->draw->nr_planes;
+   key->pad = 0;
+
    /* All variants of this shader will have the same value for
     * nr_samplers.  Not yet trying to compact away holes in the
     * sampler array.
@@ -1066,9 +1467,9 @@ draw_llvm_set_mapped_texture(struct draw_context *draw,
                              unsigned sampler_idx,
                              uint32_t width, uint32_t height, uint32_t depth,
                              uint32_t last_level,
-                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             const void *data[DRAW_MAX_TEXTURE_LEVELS])
+                             uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             const void *data[PIPE_MAX_TEXTURE_LEVELS])
 {
    unsigned j;
    struct draw_jit_texture *jit_tex;
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index d0a68ae412d..c3c30c07c64 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -41,7 +41,6 @@
 #include <llvm-c/Target.h>
 #include <llvm-c/ExecutionEngine.h>
 
-#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
 
 struct draw_llvm;
 struct llvm_vertex_shader;
@@ -52,9 +51,9 @@ struct draw_jit_texture
    uint32_t height;
    uint32_t depth;
    uint32_t last_level;
-   uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS];
-   uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS];
-   const void *data[DRAW_MAX_TEXTURE_LEVELS];
+   uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS];
+   uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS];
+   const void *data[PIPE_MAX_TEXTURE_LEVELS];
    float min_lod;
    float max_lod;
    float lod_bias;
@@ -97,7 +96,8 @@ struct draw_jit_context
 {
    const float *vs_constants;
    const float *gs_constants;
-
+   float (*planes) [12][4];
+   float *viewport;
 
    struct draw_jit_texture textures[PIPE_MAX_VERTEX_SAMPLERS];
 };
@@ -109,18 +109,22 @@ struct draw_jit_context
 #define draw_jit_context_gs_constants(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 1, "gs_constants")
 
-#define DRAW_JIT_CTX_TEXTURES 2
+#define draw_jit_context_planes(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 2, "planes")
 
-#define draw_jit_context_textures(_builder, _ptr) \
-   lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CTX_TEXTURES, "textures")
+#define draw_jit_context_viewport(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 3, "viewport")
 
+#define DRAW_JIT_CTX_TEXTURES 4
 
+#define draw_jit_context_textures(_builder, _ptr) \
+   lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CTX_TEXTURES, "textures")
 
 #define draw_jit_header_id(_builder, _ptr)              \
    lp_build_struct_get_ptr(_builder, _ptr, 0, "id")
 
 #define draw_jit_header_clip(_builder, _ptr) \
-   lp_build_struct_get(_builder, _ptr, 1, "clip")
+   lp_build_struct_get_ptr(_builder, _ptr, 1, "clip")
 
 #define draw_jit_header_data(_builder, _ptr)            \
    lp_build_struct_get_ptr(_builder, _ptr, 2, "data")
@@ -136,7 +140,7 @@ struct draw_jit_context
    lp_build_struct_get(_builder, _ptr, 2, "buffer_offset")
 
 
-typedef void
+typedef int
 (*draw_jit_vert_func)(struct draw_jit_context *context,
                       struct vertex_header *io,
                       const char *vbuffers[PIPE_MAX_ATTRIBS],
@@ -147,7 +151,7 @@ typedef void
                       unsigned instance_id);
 
 
-typedef void
+typedef int
 (*draw_jit_vert_func_elts)(struct draw_jit_context *context,
                            struct vertex_header *io,
                            const char *vbuffers[PIPE_MAX_ATTRIBS],
@@ -159,8 +163,16 @@ typedef void
 
 struct draw_llvm_variant_key
 {
-   unsigned nr_vertex_elements:16;
-   unsigned nr_samplers:16;
+   unsigned nr_vertex_elements:8;
+   unsigned nr_samplers:8;
+   unsigned clip_xy:1;
+   unsigned clip_z:1;
+   unsigned clip_user:1;
+   unsigned clip_halfz:1;
+   unsigned bypass_viewport:1;
+   unsigned need_edgeflags:1;
+   unsigned nr_planes:4;
+   unsigned pad:6;
 
    /* Variable number of vertex elements:
     */
@@ -290,8 +302,8 @@ draw_llvm_set_mapped_texture(struct draw_context *draw,
                              unsigned sampler_idx,
                              uint32_t width, uint32_t height, uint32_t depth,
                              uint32_t last_level,
-                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             const void *data[DRAW_MAX_TEXTURE_LEVELS]);
+                             uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             const void *data[PIPE_MAX_TEXTURE_LEVELS]);
 
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index d417f825a0f..54163d7f9eb 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -169,6 +169,9 @@ struct draw_context
          unsigned vs_constants_size[PIPE_MAX_CONSTANT_BUFFERS];
          const void *gs_constants[PIPE_MAX_CONSTANT_BUFFERS];
          unsigned gs_constants_size[PIPE_MAX_CONSTANT_BUFFERS];
+         
+         /* pointer to planes */
+         float (*planes)[12][4]; 
       } user;
 
       boolean test_fse;         /* enable FSE even though its not correct (eg for softpipe) */
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index f44bf2507c6..4078b2a07d0 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -287,6 +287,84 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
 }
 
 
+/** Helper code for below */
+#define PRIM_RESTART_LOOP(elements) \
+   do { \
+      for (i = start; i < end; i++) { \
+         if (elements[i] == info->restart_index) { \
+            if (cur_count > 0) { \
+               /* draw elts up to prev pos */ \
+               draw_pt_arrays(draw, prim, cur_start, cur_count); \
+            } \
+            /* begin new prim at next elt */ \
+            cur_start = i + 1; \
+            cur_count = 0; \
+         } \
+         else { \
+            cur_count++; \
+         } \
+      } \
+      if (cur_count > 0) { \
+         draw_pt_arrays(draw, prim, cur_start, cur_count); \
+      } \
+   } while (0)
+
+
+/**
+ * For drawing prims with primitive restart enabled.
+ * Scan for restart indexes and draw the runs of elements/vertices between
+ * the restarts.
+ */
+static void
+draw_pt_arrays_restart(struct draw_context *draw,
+                       const struct pipe_draw_info *info)
+{
+   const unsigned prim = info->mode;
+   const unsigned start = info->start;
+   const unsigned count = info->count;
+   const unsigned end = start + count;
+   unsigned i, cur_start, cur_count;
+
+   assert(info->primitive_restart);
+
+   if (draw->pt.user.elts) {
+      /* indexed prims (draw_elements) */
+      cur_start = start;
+      cur_count = 0;
+
+      switch (draw->pt.user.eltSize) {
+      case 1:
+         {
+            const ubyte *elt_ub = (const ubyte *) draw->pt.user.elts;
+            PRIM_RESTART_LOOP(elt_ub);
+         }
+         break;
+      case 2:
+         {
+            const ushort *elt_us = (const ushort *) draw->pt.user.elts;
+            PRIM_RESTART_LOOP(elt_us);
+         }
+         break;
+      case 4:
+         {
+            const uint *elt_ui = (const uint *) draw->pt.user.elts;
+            PRIM_RESTART_LOOP(elt_ui);
+         }
+         break;
+      default:
+         assert(0 && "bad eltSize in draw_arrays()");
+      }
+   }
+   else {
+      /* Non-indexed prims (draw_arrays).
+       * Primitive restart should have been handled in the state tracker.
+       */
+      draw_pt_arrays(draw, prim, start, count);
+   }
+}
+
+
+
 /**
  * Non-instanced drawing.
  * \sa draw_arrays_instanced
@@ -395,6 +473,12 @@ draw_vbo(struct draw_context *draw,
 
    for (instance = 0; instance < info->instance_count; instance++) {
       draw->instance_id = instance + info->start_instance;
-      draw_pt_arrays(draw, info->mode, info->start, info->count);
+
+      if (info->primitive_restart) {
+         draw_pt_arrays_restart(draw, info);
+      }
+      else {
+         draw_pt_arrays(draw, info->mode, info->start, info->count);
+      }
    }
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 77291e304e1..a53a768d029 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -175,6 +175,11 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
       draw->pt.user.vs_constants[0];
    fpme->llvm->jit_context.gs_constants =
       draw->pt.user.gs_constants[0];
+   fpme->llvm->jit_context.planes =
+      (float (*) [12][4]) draw->pt.user.planes[0];
+   fpme->llvm->jit_context.viewport =
+      (float *)draw->viewport.scale;
+    
 }
 
 
@@ -217,6 +222,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
    struct draw_vertex_info gs_vert_info;
    struct draw_vertex_info *vert_info;
    unsigned opt = fpme->opt;
+   unsigned clipped = 0;
 
    llvm_vert_info.count = fetch_info->count;
    llvm_vert_info.vertex_size = fpme->vertex_size;
@@ -230,7 +236,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
    }
 
    if (fetch_info->linear)
-      fpme->current_variant->jit_func( &fpme->llvm->jit_context,
+      clipped = fpme->current_variant->jit_func( &fpme->llvm->jit_context,
                                        llvm_vert_info.verts,
                                        (const char **)draw->pt.user.vbuffer,
                                        fetch_info->start,
@@ -239,7 +245,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
                                        draw->pt.vertex_buffer,
                                        draw->instance_id);
    else
-      fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context,
+      clipped = fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context,
                                             llvm_vert_info.verts,
                                             (const char **)draw->pt.user.vbuffer,
                                             fetch_info->elts,
@@ -266,6 +272,9 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
       FREE(vert_info->verts);
       vert_info = &gs_vert_info;
       prim_info = &gs_prim_info;
+
+      clipped = draw_pt_post_vs_run( fpme->post_vs, vert_info );
+
    }
 
    /* stream output needs to be done before clipping */
@@ -273,11 +282,11 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
 		    vert_info,
                     prim_info );
 
-   if (draw_pt_post_vs_run( fpme->post_vs, vert_info )) {
+   if (clipped) {
       opt |= PT_PIPELINE;
    }
 
-   /* Do we need to run the pipeline?
+   /* Do we need to run the pipeline? Now will come here if clipped
     */
    if (opt & PT_PIPELINE) {
       pipeline( fpme,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index e65c13e64b5..f9a12a41a1b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -983,6 +983,12 @@ enum lp_build_round_sse41_mode
 };
 
 
+/**
+ * Helper for SSE4.1's ROUNDxx instructions.
+ *
+ * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
+ * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
+ */
 static INLINE LLVMValueRef
 lp_build_round_sse41(struct lp_build_context *bld,
                      LLVMValueRef a,
@@ -1053,10 +1059,58 @@ lp_build_round_sse41(struct lp_build_context *bld,
 }
 
 
+static INLINE LLVMValueRef
+lp_build_iround_nearest_sse2(struct lp_build_context *bld,
+                             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMTypeRef ret_type = lp_build_int_vec_type(type);
+   const char *intrinsic;
+   LLVMValueRef res;
+
+   assert(type.floating);
+   /* using the double precision conversions is a bit more complicated */
+   assert(type.width == 32);
+
+   assert(lp_check_value(type, a));
+   assert(util_cpu_caps.has_sse2);
+
+   /* This is relying on MXCSR rounding mode, which should always be nearest. */
+   if (type.length == 1) {
+      LLVMTypeRef vec_type;
+      LLVMValueRef undef;
+      LLVMValueRef arg;
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+
+      vec_type = LLVMVectorType(bld->elem_type, 4);
+
+      intrinsic = "llvm.x86.sse.cvtss2si";
+
+      undef = LLVMGetUndef(vec_type);
+
+      arg = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, arg);
+   }
+   else {
+      assert(type.width*type.length == 128);
+
+      intrinsic = "llvm.x86.sse2.cvtps2dq";
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, a);
+   }
+
+   return res;
+}
+
+
 /**
- * Return the integer part of a float (vector) value.  The returned value is
- * a float (vector).
- * Ex: trunc(-1.5) = 1.0
+ * Return the integer part of a float (vector) value (== round toward zero).
+ * The returned value is a float (vector).
+ * Ex: trunc(-1.5) = -1.0
  */
 LLVMValueRef
 lp_build_trunc(struct lp_build_context *bld,
@@ -1181,9 +1235,9 @@ lp_build_fract(struct lp_build_context *bld,
 
 
 /**
- * Return the integer part of a float (vector) value.  The returned value is
- * an integer (vector).
- * Ex: itrunc(-1.5) = 1
+ * Return the integer part of a float (vector) value (== round toward zero).
+ * The returned value is an integer (vector).
+ * Ex: itrunc(-1.5) = -1
  */
 LLVMValueRef
 lp_build_itrunc(struct lp_build_context *bld,
@@ -1210,32 +1264,40 @@ lp_build_iround(struct lp_build_context *bld,
                 LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
    LLVMValueRef res;
 
    assert(type.floating);
 
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 &&
+   if (util_cpu_caps.has_sse2 &&
+       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+      return lp_build_iround_nearest_sse2(bld, a);
+   }
+   else if (util_cpu_caps.has_sse4_1 &&
        (type.length == 1 || type.width*type.length == 128)) {
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
    }
    else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
       LLVMValueRef half;
 
-      /* get sign bit */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-
-      /* sign * 0.5 */
       half = lp_build_const_vec(type, 0.5);
-      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
-      half = LLVMBuildOr(bld->builder, sign, half, "");
-      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
+
+      if (type.sign) {
+         LLVMTypeRef vec_type = bld->vec_type;
+         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+         LLVMValueRef sign;
+
+         /* get sign bit */
+         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+
+         /* sign * 0.5 */
+         half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
+         half = LLVMBuildOr(bld->builder, sign, half, "");
+         half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
+      }
 
       res = LLVMBuildFAdd(bld->builder, a, half, "");
    }
@@ -1256,7 +1318,7 @@ lp_build_ifloor(struct lp_build_context *bld,
                 LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
    LLVMValueRef res;
 
    assert(type.floating);
@@ -1267,27 +1329,31 @@ lp_build_ifloor(struct lp_build_context *bld,
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
    }
    else {
-      /* Take the sign bit and add it to 1 constant */
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      unsigned mantissa = lp_mantissa(type);
-      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
-      LLVMValueRef offset;
-
-      /* sign = a < 0 ? ~0 : 0 */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
-
-      /* offset = -0.99999(9)f */
-      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
-      offset = LLVMConstBitCast(offset, int_vec_type);
-
-      /* offset = a < 0 ? offset : 0.0f */
-      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
-
-      res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
+      res = a;
+
+      if (type.sign) {
+         /* Take the sign bit and add it to 1 constant */
+         LLVMTypeRef vec_type = bld->vec_type;
+         unsigned mantissa = lp_mantissa(type);
+         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+         LLVMValueRef sign;
+         LLVMValueRef offset;
+
+         /* sign = a < 0 ? ~0 : 0 */
+         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+         sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
+
+         /* offset = -0.99999(9)f */
+         offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
+         offset = LLVMConstBitCast(offset, int_vec_type);
+
+         /* offset = a < 0 ? offset : 0.0f */
+         offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+         offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
+
+         res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res");
+      }
    }
 
    /* round to nearest (toward zero) */
@@ -1307,7 +1373,7 @@ lp_build_iceil(struct lp_build_context *bld,
                LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef int_vec_type = bld->int_vec_type;
    LLVMValueRef res;
 
    assert(type.floating);
@@ -1318,25 +1384,28 @@ lp_build_iceil(struct lp_build_context *bld,
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
    }
    else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMTypeRef vec_type = bld->vec_type;
       unsigned mantissa = lp_mantissa(type);
-      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
-      LLVMValueRef sign;
       LLVMValueRef offset;
 
-      /* sign = a < 0 ? 0 : ~0 */
-      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
-      sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
-
       /* offset = 0.99999(9)f */
       offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
-      offset = LLVMConstBitCast(offset, int_vec_type);
 
-      /* offset = a < 0 ? 0.0 : offset */
-      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+      if (type.sign) {
+         LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+         LLVMValueRef sign;
+
+         /* sign = a < 0 ? 0 : ~0 */
+         sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+         sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+         sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
+         sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
+
+         /* offset = a < 0 ? 0.0 : offset */
+         offset = LLVMConstBitCast(offset, int_vec_type);
+         offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+         offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+      }
 
       res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
    }
@@ -1348,6 +1417,46 @@ lp_build_iceil(struct lp_build_context *bld,
 }
 
 
+/**
+ * Combined ifloor() & fract().
+ *
+ * Preferred to calling the functions separately, as it will ensure that the
+ * stratergy (floor() vs ifloor()) that results in less redundant work is used.
+ */
+void
+lp_build_ifloor_fract(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      LLVMValueRef *out_ipart,
+                      LLVMValueRef *out_fpart)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef ipart;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
+      /*
+       * floor() is easier.
+       */
+
+      ipart = lp_build_floor(bld, a);
+      *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart");
+      *out_ipart = LLVMBuildFPToSI(bld->builder, ipart, bld->int_vec_type, "ipart");
+   }
+   else {
+      /*
+       * ifloor() is easier.
+       */
+
+      *out_ipart = lp_build_ifloor(bld, a);
+      ipart = LLVMBuildSIToFP(bld->builder, *out_ipart, bld->vec_type, "ipart");
+      *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart");
+   }
+}
+
+
 LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
               LLVMValueRef a)
@@ -2157,6 +2266,71 @@ lp_build_exp2(struct lp_build_context *bld,
 
 
 /**
+ * Extract the exponent of a IEEE-754 floating point value.
+ *
+ * Optionally apply an integer bias.
+ *
+ * Result is an integer value with
+ *
+ *   ifloor(log2(x)) + bias
+ */
+LLVMValueRef
+lp_build_extract_exponent(struct lp_build_context *bld,
+                          LLVMValueRef x,
+                          int bias)
+{
+   const struct lp_type type = bld->type;
+   unsigned mantissa = lp_mantissa(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+
+   assert(lp_check_value(bld->type, x));
+
+   x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, "");
+
+   res = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), "");
+   res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(type, 255), "");
+   res = LLVMBuildSub(bld->builder, res, lp_build_const_int_vec(type, 127 - bias), "");
+
+   return res;
+}
+
+
+/**
+ * Extract the mantissa of the a floating.
+ *
+ * Result is a floating point value with
+ *
+ *   x / floor(log2(x))
+ */
+LLVMValueRef
+lp_build_extract_mantissa(struct lp_build_context *bld,
+                          LLVMValueRef x)
+{
+   const struct lp_type type = bld->type;
+   unsigned mantissa = lp_mantissa(type);
+   LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1);
+   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
+   LLVMValueRef res;
+
+   assert(lp_check_value(bld->type, x));
+
+   assert(type.floating);
+
+   x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, "");
+
+   /* res = x / 2**ipart */
+   res = LLVMBuildAnd(bld->builder, x, mantmask, "");
+   res = LLVMBuildOr(bld->builder, res, one, "");
+   res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, "");
+
+   return res;
+}
+
+
+
+/**
  * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
  * These coefficients can be generate with
  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
@@ -2275,3 +2449,62 @@ lp_build_log2(struct lp_build_context *bld,
    lp_build_log2_approx(bld, x, NULL, NULL, &res);
    return res;
 }
+
+
+/**
+ * Faster (and less accurate) log2.
+ *
+ *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
+ *
+ * Piece-wise linear approximation, with exact results when x is a
+ * power of two.
+ *
+ * See http://www.flipcode.com/archives/Fast_log_Function.shtml
+ */
+LLVMValueRef
+lp_build_fast_log2(struct lp_build_context *bld,
+                   LLVMValueRef x)
+{
+   LLVMValueRef ipart;
+   LLVMValueRef fpart;
+
+   assert(lp_check_value(bld->type, x));
+
+   assert(bld->type.floating);
+
+   /* ipart = floor(log2(x)) - 1 */
+   ipart = lp_build_extract_exponent(bld, x, -1);
+   ipart = LLVMBuildSIToFP(bld->builder, ipart, bld->vec_type, "");
+
+   /* fpart = x / 2**ipart */
+   fpart = lp_build_extract_mantissa(bld, x);
+
+   /* ipart + fpart */
+   return LLVMBuildFAdd(bld->builder, ipart, fpart, "");
+}
+
+
+/**
+ * Fast implementation of iround(log2(x)).
+ *
+ * Not an approximation -- it should give accurate results all the time.
+ */
+LLVMValueRef
+lp_build_ilog2(struct lp_build_context *bld,
+               LLVMValueRef x)
+{
+   LLVMValueRef sqrt2 = lp_build_const_vec(bld->type, M_SQRT2);
+   LLVMValueRef ipart;
+
+   assert(bld->type.floating);
+
+   assert(lp_check_value(bld->type, x));
+
+   /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
+   x = LLVMBuildFMul(bld->builder, x, sqrt2, "");
+
+   /* ipart = floor(log2(x) + 0.5)  */
+   ipart = lp_build_extract_exponent(bld, x, 0);
+
+   return ipart;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 31efa9921ce..c78b61decf0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -171,6 +171,12 @@ LLVMValueRef
 lp_build_itrunc(struct lp_build_context *bld,
                 LLVMValueRef a);
 
+void
+lp_build_ifloor_fract(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      LLVMValueRef *out_ipart,
+                      LLVMValueRef *out_fpart);
+
 LLVMValueRef
 lp_build_sqrt(struct lp_build_context *bld,
               LLVMValueRef a);
@@ -209,9 +215,26 @@ lp_build_exp2(struct lp_build_context *bld,
               LLVMValueRef a);
 
 LLVMValueRef
+lp_build_extract_exponent(struct lp_build_context *bld,
+                          LLVMValueRef x,
+                          int bias);
+
+LLVMValueRef
+lp_build_extract_mantissa(struct lp_build_context *bld,
+                          LLVMValueRef x);
+
+LLVMValueRef
 lp_build_log2(struct lp_build_context *bld,
               LLVMValueRef a);
 
+LLVMValueRef
+lp_build_fast_log2(struct lp_build_context *bld,
+                   LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ilog2(struct lp_build_context *bld,
+               LLVMValueRef x);
+
 void
 lp_build_exp2_approx(struct lp_build_context *bld,
                      LLVMValueRef x,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 8b477313d48..6967dd26225 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -63,6 +63,7 @@
 
 #include "util/u_debug.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -96,58 +97,104 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
    LLVMValueRef res;
    unsigned mantissa;
-   unsigned n;
-   unsigned long long ubound;
-   unsigned long long mask;
-   double scale;
-   double bias;
 
    assert(src_type.floating);
+   assert(dst_width <= src_type.width);
+   src_type.sign = FALSE;
 
    mantissa = lp_mantissa(src_type);
 
-   /* We cannot carry more bits than the mantissa */
-   n = MIN2(mantissa, dst_width);
+   if (dst_width <= mantissa) {
+      /*
+       * Apply magic coefficients that will make the desired result to appear
+       * in the lowest significant bits of the mantissa, with correct rounding.
+       *
+       * This only works if the destination width fits in the mantissa.
+       */
 
-   /* This magic coefficients will make the desired result to appear in the
-    * lowest significant bits of the mantissa.
-    */
-   ubound = ((unsigned long long)1 << n);
-   mask = ubound - 1;
-   scale = (double)mask/ubound;
-   bias = (double)((unsigned long long)1 << (mantissa - n));
+      unsigned long long ubound;
+      unsigned long long mask;
+      double scale;
+      double bias;
 
-   res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
-   res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), "");
-   res = LLVMBuildBitCast(builder, res, int_vec_type, "");
+      ubound = (1ULL << dst_width);
+      mask = ubound - 1;
+      scale = (double)mask/ubound;
+      bias = (double)(1ULL << (mantissa - dst_width));
 
-   if(dst_width > n) {
-      int shift = dst_width - n;
-      res = LLVMBuildShl(builder, res, lp_build_const_int_vec(src_type, shift), "");
+      res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
+      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), "");
+      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
+      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), "");
+   }
+   else if (dst_width == (mantissa + 1)) {
+      /*
+       * The destination width matches exactly what can be represented in
+       * floating point (i.e., mantissa + 1 bits). So do a straight
+       * multiplication followed by casting. No further rounding is necessary.
+       */
+
+      double scale;
 
-      /* TODO: Fill in the empty lower bits for additional precision? */
-      /* YES: this fixes progs/trivial/tri-z-eq.c.
-       * Otherwise vertex Z=1.0 values get converted to something like
-       * 0xfffffb00 and the test for equality with 0xffffffff fails.
+      scale = (double)((1ULL << dst_width) - 1);
+
+      res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
+      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
+   }
+   else {
+      /*
+       * The destination exceeds what can be represented in the floating point.
+       * So multiply by the largest power two we get away with, and when
+       * subtract the most significant bit to rescale to normalized values.
+       *
+       * The largest power of two factor we can get away is
+       * (1 << (src_type.width - 1)), because we need to use signed . In theory it
+       * should be (1 << (src_type.width - 2)), but IEEE 754 rules states
+       * INT_MIN should be returned in FPToSI, which is the correct result for
+       * values near 1.0!
+       *
+       * This means we get (src_type.width - 1) correct bits for values near 0.0,
+       * and (mantissa + 1) correct bits for values near 1.0. Equally or more
+       * important, we also get exact results for 0.0 and 1.0.
        */
-#if 0
-      {
-         LLVMValueRef msb;
-         msb = LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, dst_width - 1), "");
-         msb = LLVMBuildShl(builder, msb, lp_build_const_int_vec(src_type, shift), "");
-         msb = LLVMBuildSub(builder, msb, lp_build_const_int_vec(src_type, 1), "");
-         res = LLVMBuildOr(builder, res, msb, "");
-      }
-#elif 0
-      while(shift > 0) {
-         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, n), ""), "");
-         shift -= n;
-         n *= 2;
+
+      unsigned n = MIN2(src_type.width - 1, dst_width);
+
+      double scale = (double)(1ULL << n);
+      unsigned lshift = dst_width - n;
+      unsigned rshift = n;
+      LLVMValueRef lshifted;
+      LLVMValueRef rshifted;
+
+      res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
+      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
+
+      /*
+       * Align the most significant bit to its final place.
+       *
+       * This will cause 1.0 to overflow to 0, but the later adjustment will
+       * get it right.
+       */
+      if (lshift) {
+         lshifted = LLVMBuildShl(builder, res,
+                                 lp_build_const_int_vec(src_type, lshift), "");
+      } else {
+         lshifted = res;
       }
-#endif
+
+      /*
+       * Align the most significant bit to the right.
+       */
+      rshifted =  LLVMBuildAShr(builder, res,
+                                lp_build_const_int_vec(src_type, rshift), "");
+
+      /*
+       * Subtract the MSB to the LSB, therefore re-scaling from
+       * (1 << dst_width) to ((1 << dst_width) - 1).
+       */
+
+      res = LLVMBuildSub(builder, lshifted, rshifted, "");
    }
-   else
-      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), "");
 
    return res;
 }
@@ -177,6 +224,16 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
    assert(dst_type.floating);
 
+   /* Special-case int8->float, though most cases could be handled
+    * this way:
+    */
+   if (src_width == 8) {
+      scale = 1.0/255.0;
+      res = LLVMBuildSIToFP(builder, src, vec_type, "");
+      res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), "");
+      return res;
+   }
+
    mantissa = lp_mantissa(dst_type);
 
    n = MIN2(mantissa, src_width);
@@ -241,6 +298,87 @@ lp_build_conv(LLVMBuilderRef builder,
    }
    num_tmps = num_srcs;
 
+
+   /* Special case 4x4f --> 1x16ub 
+    */
+   if (src_type.floating == 1 &&
+       src_type.fixed    == 0 &&
+       src_type.sign     == 1 &&
+       src_type.norm     == 0 &&
+       src_type.width    == 32 &&
+       src_type.length   == 4 &&
+
+       dst_type.floating == 0 &&
+       dst_type.fixed    == 0 &&
+       dst_type.sign     == 0 &&
+       dst_type.norm     == 1 &&
+       dst_type.width    == 8 &&
+       dst_type.length   == 16 &&
+
+       util_cpu_caps.has_sse2)
+   {
+      int i;
+
+      for (i = 0; i < num_dsts; i++, src += 4) {
+         struct lp_type int16_type = dst_type;
+         struct lp_type int32_type = dst_type;
+         LLVMValueRef lo, hi;
+         LLVMValueRef src_int0;
+         LLVMValueRef src_int1;
+         LLVMValueRef src_int2;
+         LLVMValueRef src_int3;
+         LLVMTypeRef int16_vec_type;
+         LLVMTypeRef int32_vec_type;
+         LLVMTypeRef src_vec_type;
+         LLVMTypeRef dst_vec_type;
+         LLVMValueRef const_255f;
+         LLVMValueRef a, b, c, d;
+
+         int16_type.width *= 2;
+         int16_type.length /= 2;
+         int16_type.sign = 1;
+
+         int32_type.width *= 4;
+         int32_type.length /= 4;
+         int32_type.sign = 1;
+
+         src_vec_type   = lp_build_vec_type(src_type);
+         dst_vec_type   = lp_build_vec_type(dst_type);
+         int16_vec_type = lp_build_vec_type(int16_type);
+         int32_vec_type = lp_build_vec_type(int32_type);
+
+         const_255f = lp_build_const_vec(src_type, 255.0f);
+
+         a = LLVMBuildFMul(builder, src[0], const_255f, "");
+         b = LLVMBuildFMul(builder, src[1], const_255f, "");
+         c = LLVMBuildFMul(builder, src[2], const_255f, "");
+         d = LLVMBuildFMul(builder, src[3], const_255f, "");
+
+         {
+            struct lp_build_context bld;
+
+            bld.builder = builder;
+            bld.type = src_type;
+            bld.vec_type = src_vec_type;
+            bld.int_elem_type = lp_build_elem_type(int32_type);
+            bld.int_vec_type = int32_vec_type;
+            bld.undef = lp_build_undef(src_type);
+            bld.zero = lp_build_zero(src_type);
+            bld.one = lp_build_one(src_type);
+
+            src_int0 = lp_build_iround(&bld, a);
+            src_int1 = lp_build_iround(&bld, b);
+            src_int2 = lp_build_iround(&bld, c);
+            src_int3 = lp_build_iround(&bld, d);
+         }
+         /* relying on clamping behavior of sse2 intrinsics here */
+         lo = lp_build_pack2(builder, int32_type, int16_type, src_int0, src_int1);
+         hi = lp_build_pack2(builder, int32_type, int16_type, src_int2, src_int3);
+         dst[i] = lp_build_pack2(builder, int16_type, dst_type, lo, hi);
+      }
+      return; 
+   }
+
    /*
     * Clamp if necessary
     */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
index d3a5afff8c2..93e56553d7b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
@@ -57,6 +57,8 @@ lp_disassemble(const void* func)
 #ifdef HAVE_UDIS86
    ud_t ud_obj;
    uint64_t max_jmp_pc;
+   uint inst_no;
+   boolean emit_addrs = TRUE, emit_line_nos = FALSE;
 
    ud_init(&ud_obj);
 
@@ -76,13 +78,18 @@ lp_disassemble(const void* func)
 
    while (ud_disassemble(&ud_obj)) {
 
+      if (emit_addrs) {
 #ifdef PIPE_ARCH_X86
-      debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj));
+         debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj));
 #endif
 #ifdef PIPE_ARCH_X86_64
-      debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj));
+         debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj));
 #endif
-
+      }
+      else if (emit_line_nos) {
+         debug_printf("%6d:\t", inst_no);
+         inst_no++;
+      }
 #if 0
       debug_printf("%-16s ", ud_insn_hex(&ud_obj));
 #endif
@@ -115,8 +122,10 @@ lp_disassemble(const void* func)
          }
       }
 
-      if ((ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret) ||
-           ud_obj.mnemonic == UD_Iinvalid)
+      if (ud_obj.mnemonic == UD_Iinvalid ||
+          (ud_insn_off(&ud_obj) >= max_jmp_pc &&
+           (ud_obj.mnemonic == UD_Iret ||
+            ud_obj.mnemonic == UD_Ijmp)))
          break;
    }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 369c1bbf09a..eb11dcd4ef4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -36,11 +36,12 @@
 #include "util/u_string.h"
 
 
-#define GALLIVM_DEBUG_TGSI      0x1
-#define GALLIVM_DEBUG_IR        0x2
-#define GALLIVM_DEBUG_ASM       0x4
-#define GALLIVM_DEBUG_NO_OPT    0x8
-#define GALLIVM_DEBUG_PERF      0x10
+#define GALLIVM_DEBUG_TGSI          (1 << 0)
+#define GALLIVM_DEBUG_IR            (1 << 1)
+#define GALLIVM_DEBUG_ASM           (1 << 2)
+#define GALLIVM_DEBUG_NO_OPT        (1 << 3)
+#define GALLIVM_DEBUG_PERF          (1 << 4)
+#define GALLIVM_DEBUG_NO_BRILINEAR  (1 << 5)
 
 
 #ifdef DEBUG
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index 5bc9c741a88..a2cee199a01 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -38,273 +38,15 @@
 #include "lp_bld_flow.h"
 
 
-#define LP_BUILD_FLOW_MAX_VARIABLES 64
-#define LP_BUILD_FLOW_MAX_DEPTH 32
-
-/**
- * Enumeration of all possible flow constructs.
- */
-enum lp_build_flow_construct_kind {
-   LP_BUILD_FLOW_SCOPE,
-   LP_BUILD_FLOW_SKIP,
-   LP_BUILD_FLOW_IF
-};
-
-
-/**
- * Variable declaration scope.
- */
-struct lp_build_flow_scope
-{
-   /** Number of variables declared in this scope */
-   unsigned num_variables;
-};
-
-
-/**
- * Early exit. Useful to skip to the end of a function or block when
- * the execution mask becomes zero or when there is an error condition.
- */
-struct lp_build_flow_skip
-{
-   /** Block to skip to */
-   LLVMBasicBlockRef block;
-
-   /** Number of variables declared at the beginning */
-   unsigned num_variables;
-
-   LLVMValueRef *phi;  /**< array [num_variables] */
-};
-
-
-/**
- * if/else/endif.
- */
-struct lp_build_flow_if
-{
-   unsigned num_variables;
-
-   LLVMValueRef *phi;  /**< array [num_variables] */
-
-   LLVMValueRef condition;
-   LLVMBasicBlockRef entry_block, true_block, false_block, merge_block;
-};
-
-
-/**
- * Union of all possible flow constructs' data
- */
-union lp_build_flow_construct_data
-{
-   struct lp_build_flow_scope scope;
-   struct lp_build_flow_skip skip;
-   struct lp_build_flow_if ifthen;
-};
-
-
-/**
- * Element of the flow construct stack.
- */
-struct lp_build_flow_construct
-{
-   enum lp_build_flow_construct_kind kind;
-   union lp_build_flow_construct_data data;
-};
-
-
 /**
- * All necessary data to generate LLVM control flow constructs.
+ * Insert a new block, right where builder is pointing to.
  *
- * Besides keeping track of the control flow construct themselves we also
- * need to keep track of variables in order to generate SSA Phi values.
- */
-struct lp_build_flow_context
-{
-   LLVMBuilderRef builder;
-
-   /**
-    * Control flow stack.
-    */
-   struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
-   unsigned num_constructs;
-
-   /**
-    * Variable stack
-    */
-   LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES];
-   unsigned num_variables;
-};
-
-
-struct lp_build_flow_context *
-lp_build_flow_create(LLVMBuilderRef builder)
-{
-   struct lp_build_flow_context *flow;
-
-   flow = CALLOC_STRUCT(lp_build_flow_context);
-   if(!flow)
-      return NULL;
-
-   flow->builder = builder;
-
-   return flow;
-}
-
-
-void
-lp_build_flow_destroy(struct lp_build_flow_context *flow)
-{
-   assert(flow->num_constructs == 0);
-   assert(flow->num_variables == 0);
-   FREE(flow);
-}
-
-
-/**
- * Begin/push a new flow control construct, such as a loop, skip block
- * or variable scope.
- */
-static union lp_build_flow_construct_data *
-lp_build_flow_push(struct lp_build_flow_context *flow,
-                   enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH);
-   if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH)
-      return NULL;
-
-   flow->constructs[flow->num_constructs].kind = kind;
-   return &flow->constructs[flow->num_constructs++].data;
-}
-
-
-/**
- * Return the current/top flow control construct on the stack.
- * \param kind  the expected type of the top-most construct
- */
-static union lp_build_flow_construct_data *
-lp_build_flow_peek(struct lp_build_flow_context *flow,
-                   enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs);
-   if(!flow->num_constructs)
-      return NULL;
-
-   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
-   if(flow->constructs[flow->num_constructs - 1].kind != kind)
-      return NULL;
-
-   return &flow->constructs[flow->num_constructs - 1].data;
-}
-
-
-/**
- * End/pop the current/top flow control construct on the stack.
- * \param kind  the expected type of the top-most construct
- */
-static union lp_build_flow_construct_data *
-lp_build_flow_pop(struct lp_build_flow_context *flow,
-                  enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs);
-   if(!flow->num_constructs)
-      return NULL;
-
-   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
-   if(flow->constructs[flow->num_constructs - 1].kind != kind)
-      return NULL;
-
-   return &flow->constructs[--flow->num_constructs].data;
-}
-
-
-/**
- * Begin a variable scope.
+ * This is useful important not only for aesthetic reasons, but also for
+ * performance reasons, as frequently run blocks should be laid out next to
+ * each other and fall-throughs maximized.
  *
+ * See also llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp.
  *
- */
-void
-lp_build_flow_scope_begin(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_push(flow, LP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   scope->num_variables = 0;
-}
-
-
-/**
- * Declare a variable.
- *
- * A variable is a named entity which can have different LLVMValueRef's at
- * different points of the program. This is relevant for control flow because
- * when there are multiple branches to a same location we need to replace
- * the variable's value with a Phi function as explained in
- * http://en.wikipedia.org/wiki/Static_single_assignment_form .
- *
- * We keep track of variables by keeping around a pointer to where they're
- * current.
- *
- * There are a few cautions to observe:
- *
- * - Variable's value must not be NULL. If there is no initial value then
- *   LLVMGetUndef() should be used.
- *
- * - Variable's value must be kept up-to-date. If the variable is going to be
- *   modified by a function then a pointer should be passed so that its value
- *   is accurate. Failure to do this will cause some of the variables'
- *   transient values to be lost, leading to wrong results.
- *
- * - A program should be written from top to bottom, by always appending
- *   instructions to the bottom with a single LLVMBuilderRef. Inserting and/or
- *   modifying existing statements will most likely lead to wrong results.
- *
- */
-void
-lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
-                            LLVMValueRef *variable)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   assert(*variable);
-   if(!*variable)
-      return;
-
-   assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES);
-   if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES)
-      return;
-
-   flow->variables[flow->num_variables++] = variable;
-   ++scope->num_variables;
-}
-
-
-void
-lp_build_flow_scope_end(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   assert(flow->num_variables >= scope->num_variables);
-   if(flow->num_variables < scope->num_variables) {
-      flow->num_variables = 0;
-      return;
-   }
-
-   flow->num_variables -= scope->num_variables;
-}
-
-
-/**
  * Note: this function has no dependencies on the flow code and could
  * be used elsewhere.
  */
@@ -334,52 +76,18 @@ lp_build_insert_new_block(LLVMBuilderRef builder, const char *name)
 }
 
 
-static LLVMBasicBlockRef
-lp_build_flow_insert_block(struct lp_build_flow_context *flow)
-{
-   return lp_build_insert_new_block(flow->builder, "");
-}
-
-
 /**
  * Begin a "skip" block.  Inside this block we can test a condition and
  * skip to the end of the block if the condition is false.
  */
 void
-lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
+lp_build_flow_skip_begin(struct lp_build_skip_context *skip,
+                         LLVMBuilderRef builder)
 {
-   struct lp_build_flow_skip *skip;
-   LLVMBuilderRef builder;
-   unsigned i;
-
-   skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
+   skip->builder = builder;
 
    /* create new basic block */
-   skip->block = lp_build_flow_insert_block(flow);
-
-   skip->num_variables = flow->num_variables;
-   if(!skip->num_variables) {
-      skip->phi = NULL;
-      return;
-   }
-
-   /* Allocate a Phi node for each variable in this skip scope */
-   skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi);
-   if(!skip->phi) {
-      skip->num_variables = 0;
-      return;
-   }
-
-   builder = LLVMCreateBuilder();
-   LLVMPositionBuilderAtEnd(builder, skip->block);
-
-   /* create a Phi node for each variable */
-   for(i = 0; i < skip->num_variables; ++i)
-      skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
-
-   LLVMDisposeBuilder(builder);
+   skip->block = lp_build_insert_new_block(skip->builder, "skip");
 }
 
 
@@ -388,83 +96,50 @@ lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
  * skip block if the condition is true.
  */
 void
-lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+lp_build_flow_skip_cond_break(struct lp_build_skip_context *skip,
                               LLVMValueRef cond)
 {
-   struct lp_build_flow_skip *skip;
-   LLVMBasicBlockRef current_block;
    LLVMBasicBlockRef new_block;
-   unsigned i;
-
-   skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
 
-   current_block = LLVMGetInsertBlock(flow->builder);
-
-   new_block = lp_build_flow_insert_block(flow);
-
-   /* for each variable, update the Phi node with a (variable, block) pair */
-   for(i = 0; i < skip->num_variables; ++i) {
-      assert(*flow->variables[i]);
-      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
-      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
-   }
+   new_block = lp_build_insert_new_block(skip->builder, "");
 
    /* if cond is true, goto skip->block, else goto new_block */
-   LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
+   LLVMBuildCondBr(skip->builder, cond, skip->block, new_block);
 
-   LLVMPositionBuilderAtEnd(flow->builder, new_block);
+   LLVMPositionBuilderAtEnd(skip->builder, new_block);
 }
 
 
 void
-lp_build_flow_skip_end(struct lp_build_flow_context *flow)
+lp_build_flow_skip_end(struct lp_build_skip_context *skip)
 {
-   struct lp_build_flow_skip *skip;
-   LLVMBasicBlockRef current_block;
-   unsigned i;
-
-   skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
-
-   current_block = LLVMGetInsertBlock(flow->builder);
-
-   /* add (variable, block) tuples to the phi nodes */
-   for(i = 0; i < skip->num_variables; ++i) {
-      assert(*flow->variables[i]);
-      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
-      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
-      *flow->variables[i] = skip->phi[i];
-   }
-
    /* goto block */
-   LLVMBuildBr(flow->builder, skip->block);
-   LLVMPositionBuilderAtEnd(flow->builder, skip->block);
-
-   FREE(skip->phi);
+   LLVMBuildBr(skip->builder, skip->block);
+   LLVMPositionBuilderAtEnd(skip->builder, skip->block);
 }
 
 
 /**
  * Check if the mask predicate is zero.  If so, jump to the end of the block.
  */
-static void
+void
 lp_build_mask_check(struct lp_build_mask_context *mask)
 {
-   LLVMBuilderRef builder = mask->flow->builder;
+   LLVMBuilderRef builder = mask->skip.builder;
+   LLVMValueRef value;
    LLVMValueRef cond;
 
+   value = lp_build_mask_value(mask);
+
    /* cond = (mask == 0) */
    cond = LLVMBuildICmp(builder,
                         LLVMIntEQ,
-                        LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""),
+                        LLVMBuildBitCast(builder, value, mask->reg_type, ""),
                         LLVMConstNull(mask->reg_type),
                         "");
 
    /* if cond, goto end of block */
-   lp_build_flow_skip_cond_break(mask->flow, cond);
+   lp_build_flow_skip_cond_break(&mask->skip, cond);
 }
 
 
@@ -477,21 +152,27 @@ lp_build_mask_check(struct lp_build_mask_context *mask)
  */
 void
 lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    struct lp_build_flow_context *flow,
+                    LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef value)
 {
    memset(mask, 0, sizeof *mask);
 
-   mask->flow = flow;
    mask->reg_type = LLVMIntType(type.width * type.length);
-   mask->value = value;
+   mask->var = lp_build_alloca(builder,
+                               lp_build_int_vec_type(type),
+                               "execution_mask");
 
-   lp_build_flow_scope_begin(flow);
-   lp_build_flow_scope_declare(flow, &mask->value);
-   lp_build_flow_skip_begin(flow);
+   LLVMBuildStore(builder, value, mask->var);
 
-   lp_build_mask_check(mask);
+   lp_build_flow_skip_begin(&mask->skip, builder);
+}
+
+
+LLVMValueRef
+lp_build_mask_value(struct lp_build_mask_context *mask)
+{
+   return LLVMBuildLoad(mask->skip.builder, mask->var, "");
 }
 
 
@@ -504,9 +185,10 @@ void
 lp_build_mask_update(struct lp_build_mask_context *mask,
                      LLVMValueRef value)
 {
-   mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, "");
-
-   lp_build_mask_check(mask);
+   value = LLVMBuildAnd(mask->skip.builder,
+                        lp_build_mask_value(mask),
+                        value, "");
+   LLVMBuildStore(mask->skip.builder, value, mask->var);
 }
 
 
@@ -516,9 +198,8 @@ lp_build_mask_update(struct lp_build_mask_context *mask,
 LLVMValueRef
 lp_build_mask_end(struct lp_build_mask_context *mask)
 {
-   lp_build_flow_skip_end(mask->flow);
-   lp_build_flow_scope_end(mask->flow);
-   return mask->value;
+   lp_build_flow_skip_end(&mask->skip);
+   return lp_build_mask_value(mask);
 }
 
 
@@ -528,59 +209,27 @@ lp_build_loop_begin(LLVMBuilderRef builder,
                     LLVMValueRef start,
                     struct lp_build_loop_state *state)
 {
-   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   LLVMValueRef function = LLVMGetBasicBlockParent(block);
+   state->block = lp_build_insert_new_block(builder, "loop_begin");
 
-   state->block = LLVMAppendBasicBlock(function, "loop");
+   state->counter_var = lp_build_alloca(builder, LLVMTypeOf(start), "loop_counter");
+
+   LLVMBuildStore(builder, start, state->counter_var);
 
    LLVMBuildBr(builder, state->block);
 
    LLVMPositionBuilderAtEnd(builder, state->block);
 
-   state->counter = LLVMBuildPhi(builder, LLVMTypeOf(start), "");
-
-   LLVMAddIncoming(state->counter, &start, &block, 1);
-
+   state->counter = LLVMBuildLoad(builder, state->counter_var, "");
 }
 
 
 void
-lp_build_loop_end(LLVMBuilderRef builder,
-                  LLVMValueRef end,
-                  LLVMValueRef step,
-                  struct lp_build_loop_state *state)
-{
-   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   LLVMValueRef function = LLVMGetBasicBlockParent(block);
-   LLVMValueRef next;
-   LLVMValueRef cond;
-   LLVMBasicBlockRef after_block;
-
-   if (!step)
-      step = LLVMConstInt(LLVMTypeOf(end), 1, 0);
-
-   next = LLVMBuildAdd(builder, state->counter, step, "");
-
-   cond = LLVMBuildICmp(builder, LLVMIntNE, next, end, "");
-
-   after_block = LLVMAppendBasicBlock(function, "");
-
-   LLVMBuildCondBr(builder, cond, after_block, state->block);
-
-   LLVMAddIncoming(state->counter, &next, &block, 1);
-
-   LLVMPositionBuilderAtEnd(builder, after_block);
-}
-
-void
 lp_build_loop_end_cond(LLVMBuilderRef builder,
                        LLVMValueRef end,
                        LLVMValueRef step,
-                       int llvm_cond,
+                       LLVMIntPredicate llvm_cond,
                        struct lp_build_loop_state *state)
 {
-   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   LLVMValueRef function = LLVMGetBasicBlockParent(block);
    LLVMValueRef next;
    LLVMValueRef cond;
    LLVMBasicBlockRef after_block;
@@ -590,15 +239,27 @@ lp_build_loop_end_cond(LLVMBuilderRef builder,
 
    next = LLVMBuildAdd(builder, state->counter, step, "");
 
+   LLVMBuildStore(builder, next, state->counter_var);
+
    cond = LLVMBuildICmp(builder, llvm_cond, next, end, "");
 
-   after_block = LLVMAppendBasicBlock(function, "");
+   after_block = lp_build_insert_new_block(builder, "loop_end");
 
    LLVMBuildCondBr(builder, cond, after_block, state->block);
 
-   LLVMAddIncoming(state->counter, &next, &block, 1);
-
    LLVMPositionBuilderAtEnd(builder, after_block);
+
+   state->counter = LLVMBuildLoad(builder, state->counter_var, "");
+}
+
+
+void
+lp_build_loop_end(LLVMBuilderRef builder,
+                  LLVMValueRef end,
+                  LLVMValueRef step,
+                  struct lp_build_loop_state *state)
+{
+   lp_build_loop_end_cond(builder, end, step, LLVMIntNE, state);
 }
 
 
@@ -616,24 +277,16 @@ lp_build_loop_end_cond(LLVMBuilderRef builder,
 
   Is built with:
 
-     LLVMValueRef x = LLVMGetUndef();  // or something else
+     // x needs an alloca variable
+     x = lp_build_alloca(builder, type, "x");
 
-     flow = lp_build_flow_create(builder);
 
-        lp_build_flow_scope_begin(flow);
+     lp_build_if(ctx, builder, cond);
+        LLVMBuildStore(LLVMBuildAdd(1, 2), x);
+     lp_build_else(ctx);
+        LLVMBuildStore(LLVMBuildAdd(2, 3). x);
+     lp_build_endif(ctx);
 
-           // x needs a phi node
-           lp_build_flow_scope_declare(flow, &x);
-
-           lp_build_if(ctx, flow, builder, cond);
-              x = LLVMAdd(1, 2);
-           lp_build_else(ctx);
-              x = LLVMAdd(2, 3);
-           lp_build_endif(ctx);
-
-        lp_build_flow_scope_end(flow);
-
-     lp_build_flow_destroy(flow);
  */
 
 
@@ -642,47 +295,19 @@ lp_build_loop_end_cond(LLVMBuilderRef builder,
  * Begin an if/else/endif construct.
  */
 void
-lp_build_if(struct lp_build_if_state *ctx,
-            struct lp_build_flow_context *flow,
+lp_build_if(struct lp_build_if_state *ifthen,
             LLVMBuilderRef builder,
             LLVMValueRef condition)
 {
    LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   struct lp_build_flow_if *ifthen;
-   unsigned i;
-
-   memset(ctx, 0, sizeof(*ctx));
-   ctx->builder = builder;
-   ctx->flow = flow;
 
-   /* push/create new scope */
-   ifthen = &lp_build_flow_push(flow, LP_BUILD_FLOW_IF)->ifthen;
-   assert(ifthen);
-
-   ifthen->num_variables = flow->num_variables;
+   memset(ifthen, 0, sizeof *ifthen);
+   ifthen->builder = builder;
    ifthen->condition = condition;
    ifthen->entry_block = block;
 
-   /* create a Phi node for each variable in this flow scope */
-   ifthen->phi = MALLOC(ifthen->num_variables * sizeof(*ifthen->phi));
-   if (!ifthen->phi) {
-      ifthen->num_variables = 0;
-      return;
-   }
-
    /* create endif/merge basic block for the phi functions */
    ifthen->merge_block = lp_build_insert_new_block(builder, "endif-block");
-   LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
-
-   /* create a phi node for each variable */
-   for (i = 0; i < flow->num_variables; i++) {
-      ifthen->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
-
-      /* add add the initial value of the var from the entry block */
-      if (!LLVMIsUndef(*flow->variables[i]))
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i],
-                         &ifthen->entry_block, 1);
-   }
 
    /* create/insert true_block before merge_block */
    ifthen->true_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-true-block");
@@ -696,27 +321,16 @@ lp_build_if(struct lp_build_if_state *ctx,
  * Begin else-part of a conditional
  */
 void
-lp_build_else(struct lp_build_if_state *ctx)
+lp_build_else(struct lp_build_if_state *ifthen)
 {
-   struct lp_build_flow_context *flow = ctx->flow;
-   struct lp_build_flow_if *ifthen;
-   unsigned i;
-
-   ifthen = &lp_build_flow_peek(flow, LP_BUILD_FLOW_IF)->ifthen;
-   assert(ifthen);
-
-   /* for each variable, update the Phi node with a (variable, block) pair */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
-   for (i = 0; i < flow->num_variables; i++) {
-      assert(*flow->variables[i]);
-      LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1);
-   }
+   /* Append an unconditional Br(anch) instruction on the true_block */
+   LLVMBuildBr(ifthen->builder, ifthen->merge_block);
 
    /* create/insert false_block before the merge block */
    ifthen->false_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-false-block");
 
    /* successive code goes into the else block */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->false_block);
+   LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->false_block);
 }
 
 
@@ -724,75 +338,30 @@ lp_build_else(struct lp_build_if_state *ctx)
  * End a conditional.
  */
 void
-lp_build_endif(struct lp_build_if_state *ctx)
+lp_build_endif(struct lp_build_if_state *ifthen)
 {
-   struct lp_build_flow_context *flow = ctx->flow;
-   struct lp_build_flow_if *ifthen;
-   LLVMBasicBlockRef curBlock = LLVMGetInsertBlock(ctx->builder);
-   unsigned i;
-
-   ifthen = &lp_build_flow_pop(flow, LP_BUILD_FLOW_IF)->ifthen;
-   assert(ifthen);
-
    /* Insert branch to the merge block from current block */
-   LLVMBuildBr(ctx->builder, ifthen->merge_block);
+   LLVMBuildBr(ifthen->builder, ifthen->merge_block);
 
-   if (ifthen->false_block) {
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
-      /* for each variable, update the Phi node with a (variable, block) pair */
-      for (i = 0; i < flow->num_variables; i++) {
-         assert(*flow->variables[i]);
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &curBlock, 1);
-         /* replace the variable ref with the phi function */
-         *flow->variables[i] = ifthen->phi[i];
-      }
-   }
-   else {
-      /* no else clause */
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
-      for (i = 0; i < flow->num_variables; i++) {
-         assert(*flow->variables[i]);
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1);
-
-         /* replace the variable ref with the phi function */
-         *flow->variables[i] = ifthen->phi[i];
-      }
-   }
-
-   FREE(ifthen->phi);
-
-   /***
-    *** Now patch in the various branch instructions.
-    ***/
+   /*
+    * Now patch in the various branch instructions.
+    */
 
    /* Insert the conditional branch instruction at the end of entry_block */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->entry_block);
+   LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->entry_block);
    if (ifthen->false_block) {
       /* we have an else clause */
-      LLVMBuildCondBr(ctx->builder, ifthen->condition,
+      LLVMBuildCondBr(ifthen->builder, ifthen->condition,
                       ifthen->true_block, ifthen->false_block);
    }
    else {
       /* no else clause */
-      LLVMBuildCondBr(ctx->builder, ifthen->condition,
+      LLVMBuildCondBr(ifthen->builder, ifthen->condition,
                       ifthen->true_block, ifthen->merge_block);
    }
 
-   /* Insert branch from end of true_block to merge_block */
-   if (ifthen->false_block) {
-      /* Append an unconditional Br(anch) instruction on the true_block */
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
-      LLVMBuildBr(ctx->builder, ifthen->merge_block);
-   }
-   else {
-      /* No else clause.
-       * Note that we've already inserted the branch at the end of
-       * true_block.  See the very first LLVMBuildBr() call in this function.
-       */
-   }
-
    /* Resume building code at end of the ifthen->merge_block */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
+   LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->merge_block);
 }
 
 
@@ -830,6 +399,7 @@ lp_build_alloca(LLVMBuilderRef builder,
    }
 
    res = LLVMBuildAlloca(first_builder, type, name);
+   LLVMBuildStore(builder, LLVMConstNull(type), res);
 
    LLVMDisposeBuilder(first_builder);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index fffb493a93b..e729ee6eaac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -41,52 +41,49 @@
 struct lp_type;
 
 
-struct lp_build_flow_context;
-
-
-struct lp_build_flow_context *
-lp_build_flow_create(LLVMBuilderRef builder);
-
-void
-lp_build_flow_destroy(struct lp_build_flow_context *flow);
-
-void
-lp_build_flow_scope_begin(struct lp_build_flow_context *flow);
-
-void
-lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
-                            LLVMValueRef *variable);
+/**
+ * Early exit. Useful to skip to the end of a function or block when
+ * the execution mask becomes zero or when there is an error condition.
+ */
+struct lp_build_skip_context
+{
+   LLVMBuilderRef builder;
 
-void
-lp_build_flow_scope_end(struct lp_build_flow_context *flow);
+   /** Block to skip to */
+   LLVMBasicBlockRef block;
+};
 
 void
-lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
+lp_build_flow_skip_begin(struct lp_build_skip_context *ctx,
+                         LLVMBuilderRef builder);
 
 void
-lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+lp_build_flow_skip_cond_break(struct lp_build_skip_context *ctx,
                               LLVMValueRef cond);
 
 void
-lp_build_flow_skip_end(struct lp_build_flow_context *flow);
+lp_build_flow_skip_end(struct lp_build_skip_context *ctx);
 
 
 struct lp_build_mask_context
 {
-   struct lp_build_flow_context *flow;
+   struct lp_build_skip_context skip;
 
    LLVMTypeRef reg_type;
 
-   LLVMValueRef value;
+   LLVMValueRef var;
 };
 
 
 void
 lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    struct lp_build_flow_context *flow,
+                    LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef value);
 
+LLVMValueRef
+lp_build_mask_value(struct lp_build_mask_context *mask);
+
 /**
  * Bitwise AND the mask with the given value, if a previous mask was set.
  */
@@ -94,6 +91,9 @@ void
 lp_build_mask_update(struct lp_build_mask_context *mask,
                      LLVMValueRef value);
 
+void
+lp_build_mask_check(struct lp_build_mask_context *mask);
+
 LLVMValueRef
 lp_build_mask_end(struct lp_build_mask_context *mask);
 
@@ -108,6 +108,7 @@ lp_build_mask_end(struct lp_build_mask_context *mask);
 struct lp_build_loop_state
 {
   LLVMBasicBlockRef block;
+  LLVMValueRef counter_var;
   LLVMValueRef counter;
 };
 
@@ -128,22 +129,28 @@ void
 lp_build_loop_end_cond(LLVMBuilderRef builder,
                        LLVMValueRef end,
                        LLVMValueRef step,
-                       int cond, /* LLVM condition */
+                       LLVMIntPredicate cond,
                        struct lp_build_loop_state *state);
 
 
 
 
+/**
+ * if/else/endif.
+ */
 struct lp_build_if_state
 {
    LLVMBuilderRef builder;
-   struct lp_build_flow_context *flow;
+   LLVMValueRef condition;
+   LLVMBasicBlockRef entry_block;
+   LLVMBasicBlockRef true_block;
+   LLVMBasicBlockRef false_block;
+   LLVMBasicBlockRef merge_block;
 };
 
 
 void
 lp_build_if(struct lp_build_if_state *ctx,
-            struct lp_build_flow_context *flow,
             LLVMBuilderRef builder,
             LLVMValueRef condition);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 761f33b578d..5598ca5c489 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -44,6 +44,7 @@ static const struct debug_named_value lp_bld_debug_flags[] = {
    { "asm",    GALLIVM_DEBUG_ASM, NULL },
    { "nopt",   GALLIVM_DEBUG_NO_OPT, NULL },
    { "perf",   GALLIVM_DEBUG_PERF, NULL },
+   { "no_brilinear", GALLIVM_DEBUG_NO_BRILINEAR, NULL },
    DEBUG_NAMED_VALUE_END
 };
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index f26fdac4663..0b4b1ca7d11 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -47,4 +47,10 @@ lp_build_init(void);
 extern void
 lp_func_delete_body(LLVMValueRef func);
 
+
+extern LLVMValueRef
+lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
+                       const char *Name);
+
+
 #endif /* !LP_BLD_INIT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index d5c62a3f734..026b60ac36e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -92,9 +92,23 @@ lp_build_compare(LLVMBuilderRef builder,
    if(func == PIPE_FUNC_ALWAYS)
       return ones;
 
-   /* TODO: optimize the constant case */
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   /*
+    * There are no unsigned integer comparison instructions in SSE.
+    */
 
-   /* XXX: It is not clear if we should use the ordered or unordered operators */
+   if (!type.floating && !type.sign &&
+       type.width * type.length == 128 &&
+       util_cpu_caps.has_sse2 &&
+       (func == PIPE_FUNC_LESS ||
+        func == PIPE_FUNC_LEQUAL ||
+        func == PIPE_FUNC_GREATER ||
+        func == PIPE_FUNC_GEQUAL) &&
+       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
+         debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
+                      __FUNCTION__, type.length, type.width);
+   }
+#endif
 
 #if HAVE_LLVM < 0x0207
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
@@ -225,6 +239,8 @@ lp_build_compare(LLVMBuilderRef builder,
 #endif
 #endif /* HAVE_LLVM < 0x0207 */
 
+   /* XXX: It is not clear if we should use the ordered or unordered operators */
+
    if(type.floating) {
       LLVMRealPredicate op;
       switch(func) {
@@ -446,10 +462,12 @@ lp_build_select(struct lp_build_context *bld,
       LLVMTypeRef arg_type;
       LLVMValueRef args[3];
 
-      if (type.width == 64) {
+      if (type.floating &&
+          type.width == 64) {
          intrinsic = "llvm.x86.sse41.blendvpd";
          arg_type = LLVMVectorType(LLVMDoubleType(), 2);
-      } else if (type.width == 32) {
+      } else if (type.floating &&
+                 type.width == 32) {
          intrinsic = "llvm.x86.sse41.blendvps";
          arg_type = LLVMVectorType(LLVMFloatType(), 4);
       } else {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 48baf7c425c..f56ddee7fd7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -178,3 +178,13 @@ lp_func_delete_body(LLVMValueRef FF)
    llvm::Function *func = llvm::unwrap<llvm::Function>(FF);
    func->deleteBody();
 }
+
+
+extern "C"
+LLVMValueRef
+lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
+                       const char *Name)
+{
+   return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name));
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.c b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
index 153ba5b15b1..f418e96aff4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_printf.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
@@ -29,6 +29,8 @@
 
 #include "util/u_debug.h"
 #include "util/u_memory.h"
+#include "util/u_string.h"
+#include "lp_bld_const.h"
 #include "lp_bld_printf.h"
 
 
@@ -119,3 +121,22 @@ lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...)
    return LLVMBuildCall(builder, func_printf, params, argcount + 1, "");
 }
 
+
+
+/**
+ * Print a float[4] vector.
+ */
+LLVMValueRef
+lp_build_print_vec4(LLVMBuilderRef builder, const char *msg, LLVMValueRef vec)
+{
+   char format[1000];
+   LLVMValueRef x, y, z, w;
+
+   x = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(0), "");
+   y = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(1), "");
+   z = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(2), "");
+   w = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(3), "");
+
+   util_snprintf(format, sizeof(format), "%s %%f %%f %%f %%f\n", msg);
+   return lp_build_printf(builder, format, x, y, z, w);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.h b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
index 83bd8f1d557..b6222c62ebe 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_printf.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
@@ -35,5 +35,9 @@
 LLVMValueRef lp_build_const_string_variable(LLVMModuleRef module, const char *str, int len);
 LLVMValueRef lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...);
 
+LLVMValueRef
+lp_build_print_vec4(LLVMBuilderRef builder, const char *msg, LLVMValueRef vec);
+
+
 #endif
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index 7b1088939b9..c18c8b47100 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -81,11 +81,15 @@ LLVMValueRef
 lp_build_scalar_ddx(struct lp_build_context *bld,
                     LLVMValueRef a)
 {
-   LLVMValueRef idx_left  = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0);
-   LLVMValueRef idx_right = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_RIGHT, 0);
-   LLVMValueRef a_left  = LLVMBuildExtractElement(bld->builder, a, idx_left, "");
-   LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, "");
-   return lp_build_sub(bld, a_right, a_left);
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMValueRef idx_left  = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_LEFT, 0);
+   LLVMValueRef idx_right = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_RIGHT, 0);
+   LLVMValueRef a_left  = LLVMBuildExtractElement(bld->builder, a, idx_left, "left");
+   LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, "right");
+   if (bld->type.floating)
+      return LLVMBuildFSub(bld->builder, a_right, a_left, "ddx");
+   else
+      return LLVMBuildSub(bld->builder, a_right, a_left, "ddx");
 }
 
 
@@ -93,9 +97,13 @@ LLVMValueRef
 lp_build_scalar_ddy(struct lp_build_context *bld,
                     LLVMValueRef a)
 {
-   LLVMValueRef idx_top    = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0);
-   LLVMValueRef idx_bottom = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_BOTTOM_LEFT, 0);
-   LLVMValueRef a_top    = LLVMBuildExtractElement(bld->builder, a, idx_top, "");
-   LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, "");
-   return lp_build_sub(bld, a_bottom, a_top);
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMValueRef idx_top    = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_LEFT, 0);
+   LLVMValueRef idx_bottom = LLVMConstInt(i32t, LP_BLD_QUAD_BOTTOM_LEFT, 0);
+   LLVMValueRef a_top    = LLVMBuildExtractElement(bld->builder, a, idx_top, "top");
+   LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, "bottom");
+   if (bld->type.floating)
+      return LLVMBuildFSub(bld->builder, a_bottom, a_top, "ddy");
+   else
+      return LLVMBuildSub(bld->builder, a_bottom, a_top, "ddy");
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index aee94c1b866..844d1d935b5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -39,12 +39,19 @@
 #include "lp_bld_arit.h"
 #include "lp_bld_const.h"
 #include "lp_bld_debug.h"
+#include "lp_bld_printf.h"
 #include "lp_bld_flow.h"
 #include "lp_bld_sample.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_type.h"
 
 
+/*
+ * Bri-linear factor. Should be greater than one.
+ */
+#define BRILINEAR_FACTOR 2
+
+
 /**
  * Does the given texture wrap mode allow sampling the texture border color?
  * XXX maybe move this into gallium util code.
@@ -126,17 +133,32 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->wrap_r            = sampler->wrap_r;
    state->min_img_filter    = sampler->min_img_filter;
    state->mag_img_filter    = sampler->mag_img_filter;
-   if (view->last_level) {
+
+   if (view->last_level && sampler->max_lod > 0.0f) {
       state->min_mip_filter = sampler->min_mip_filter;
    } else {
       state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
    }
 
-   /* If min_lod == max_lod we can greatly simplify mipmap selection.
-    * This is a case that occurs during automatic mipmap generation.
-    */
-   if (sampler->min_lod == sampler->max_lod) {
-      state->min_max_lod_equal = 1;
+   if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      if (sampler->lod_bias != 0.0f) {
+         state->lod_bias_non_zero = 1;
+      }
+
+      /* If min_lod == max_lod we can greatly simplify mipmap selection.
+       * This is a case that occurs during automatic mipmap generation.
+       */
+      if (sampler->min_lod == sampler->max_lod) {
+         state->min_max_lod_equal = 1;
+      } else {
+         if (sampler->min_lod > 0.0f) {
+            state->apply_min_lod = 1;
+         }
+
+         if (sampler->max_lod < (float)view->last_level) {
+            state->apply_max_lod = 1;
+         }
+      }
    }
 
    state->compare_mode      = sampler->compare_mode;
@@ -153,6 +175,220 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 /**
+ * Generate code to compute coordinate gradient (rho).
+ * \param ddx  partial derivatives of (s, t, r, q) with respect to X
+ * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
+ *
+ * XXX: The resulting rho is scalar, so we ignore all but the first element of
+ * derivatives that are passed by the shader.
+ */
+static LLVMValueRef
+lp_build_rho(struct lp_build_sample_context *bld,
+             const LLVMValueRef ddx[4],
+             const LLVMValueRef ddy[4])
+{
+   struct lp_build_context *float_size_bld = &bld->float_size_bld;
+   struct lp_build_context *float_bld = &bld->float_bld;
+   const unsigned dims = bld->dims;
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
+   LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0);
+   LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
+   LLVMValueRef rho_x, rho_y;
+   LLVMValueRef rho_vec;
+   LLVMValueRef float_size;
+   LLVMValueRef rho;
+
+   dsdx = ddx[0];
+   dsdy = ddy[0];
+
+   if (dims <= 1) {
+      rho_x = dsdx;
+      rho_y = dsdy;
+   }
+   else {
+      rho_x = float_size_bld->undef;
+      rho_y = float_size_bld->undef;
+
+      rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dsdx, index0, "");
+      rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dsdy, index0, "");
+
+      dtdx = ddx[1];
+      dtdy = ddy[1];
+
+      rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dtdx, index1, "");
+      rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dtdy, index1, "");
+
+      if (dims >= 3) {
+         drdx = ddx[2];
+         drdy = ddy[2];
+
+         rho_x = LLVMBuildInsertElement(bld->builder, rho_x, drdx, index2, "");
+         rho_y = LLVMBuildInsertElement(bld->builder, rho_y, drdy, index2, "");
+      }
+   }
+
+   rho_x = lp_build_abs(float_size_bld, rho_x);
+   rho_y = lp_build_abs(float_size_bld, rho_y);
+
+   rho_vec = lp_build_max(float_size_bld, rho_x, rho_y);
+
+   float_size = lp_build_int_to_float(float_size_bld, bld->int_size);
+
+   rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size);
+
+   if (dims <= 1) {
+      rho = rho_vec;
+   }
+   else {
+      if (dims >= 2) {
+         LLVMValueRef rho_s, rho_t, rho_r;
+
+         rho_s = LLVMBuildExtractElement(bld->builder, rho_vec, index0, "");
+         rho_t = LLVMBuildExtractElement(bld->builder, rho_vec, index1, "");
+
+         rho = lp_build_max(float_bld, rho_s, rho_t);
+
+         if (dims >= 3) {
+            rho_r = LLVMBuildExtractElement(bld->builder, rho_vec, index0, "");
+            rho = lp_build_max(float_bld, rho, rho_r);
+         }
+      }
+   }
+
+   return rho;
+}
+
+
+/*
+ * Bri-linear lod computation
+ *
+ * Use a piece-wise linear approximation of log2 such that:
+ * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
+ * - linear approximation for values in the neighborhood of 0.5, 1.5., etc,
+ *   with the steepness specified in 'factor'
+ * - exact result for 0.5, 1.5, etc.
+ *
+ *
+ *   1.0 -              /----*
+ *                     /
+ *                    /
+ *                   /
+ *   0.5 -          *
+ *                 /
+ *                /
+ *               /
+ *   0.0 - *----/
+ *
+ *         |                 |
+ *        2^0               2^1
+ *
+ * This is a technique also commonly used in hardware:
+ * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
+ *
+ * TODO: For correctness, this should only be applied when texture is known to
+ * have regular mipmaps, i.e., mipmaps derived from the base level.
+ *
+ * TODO: This could be done in fixed point, where applicable.
+ */
+static void
+lp_build_brilinear_lod(struct lp_build_context *bld,
+                       LLVMValueRef lod,
+                       double factor,
+                       LLVMValueRef *out_lod_ipart,
+                       LLVMValueRef *out_lod_fpart)
+{
+   LLVMValueRef lod_fpart;
+   double pre_offset = (factor - 0.5)/factor - 0.5;
+   double post_offset = 1 - factor;
+
+   if (0) {
+      lp_build_printf(bld->builder, "lod = %f\n", lod);
+   }
+
+   lod = lp_build_add(bld, lod,
+                      lp_build_const_vec(bld->type, pre_offset));
+
+   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);
+
+   lod_fpart = lp_build_mul(bld, lod_fpart,
+                            lp_build_const_vec(bld->type, factor));
+
+   lod_fpart = lp_build_add(bld, lod_fpart,
+                            lp_build_const_vec(bld->type, post_offset));
+
+   /*
+    * It's not necessary to clamp lod_fpart since:
+    * - the above expression will never produce numbers greater than one.
+    * - the mip filtering branch is only taken if lod_fpart is positive
+    */
+
+   *out_lod_fpart = lod_fpart;
+
+   if (0) {
+      lp_build_printf(bld->builder, "lod_ipart = %i\n", *out_lod_ipart);
+      lp_build_printf(bld->builder, "lod_fpart = %f\n\n", *out_lod_fpart);
+   }
+}
+
+
+/*
+ * Combined log2 and brilinear lod computation.
+ *
+ * It's in all identical to calling lp_build_fast_log2() and
+ * lp_build_brilinear_lod() above, but by combining we can compute the interger
+ * and fractional part independently.
+ */
+static void
+lp_build_brilinear_rho(struct lp_build_context *bld,
+                       LLVMValueRef rho,
+                       double factor,
+                       LLVMValueRef *out_lod_ipart,
+                       LLVMValueRef *out_lod_fpart)
+{
+   LLVMValueRef lod_ipart;
+   LLVMValueRef lod_fpart;
+
+   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
+   const double post_offset = 1 - 2*factor;
+
+   assert(bld->type.floating);
+
+   assert(lp_check_value(bld->type, rho));
+
+   /*
+    * The pre factor will make the intersections with the exact powers of two
+    * happen precisely where we want then to be, which means that the integer
+    * part will not need any post adjustments.
+    */
+   rho = lp_build_mul(bld, rho,
+                      lp_build_const_vec(bld->type, pre_factor));
+
+   /* ipart = ifloor(log2(rho)) */
+   lod_ipart = lp_build_extract_exponent(bld, rho, 0);
+
+   /* fpart = rho / 2**ipart */
+   lod_fpart = lp_build_extract_mantissa(bld, rho);
+
+   lod_fpart = lp_build_mul(bld, lod_fpart,
+                            lp_build_const_vec(bld->type, factor));
+
+   lod_fpart = lp_build_add(bld, lod_fpart,
+                            lp_build_const_vec(bld->type, post_offset));
+
+   /*
+    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
+    * - the above expression will never produce numbers greater than one.
+    * - the mip filtering branch is only taken if lod_fpart is positive
+    */
+
+   *out_lod_ipart = lod_ipart;
+   *out_lod_fpart = lod_fpart;
+}
+
+
+/**
  * Generate code to compute texture level of detail (lambda).
  * \param ddx  partial derivatives of (s, t, r, q) with respect to X
  * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
@@ -165,85 +401,81 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
  * XXX: The resulting lod is scalar, so ignore all but the first element of
  * derivatives, lod_bias, etc that are passed by the shader.
  */
-LLVMValueRef
+void
 lp_build_lod_selector(struct lp_build_sample_context *bld,
                       unsigned unit,
                       const LLVMValueRef ddx[4],
                       const LLVMValueRef ddy[4],
                       LLVMValueRef lod_bias, /* optional */
                       LLVMValueRef explicit_lod, /* optional */
-                      LLVMValueRef width,
-                      LLVMValueRef height,
-                      LLVMValueRef depth)
+                      unsigned mip_filter,
+                      LLVMValueRef *out_lod_ipart,
+                      LLVMValueRef *out_lod_fpart)
 
 {
-   LLVMValueRef min_lod =
-      bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit);
+   struct lp_build_context *float_bld = &bld->float_bld;
+   LLVMValueRef lod;
+
+   *out_lod_ipart = bld->int_bld.zero;
+   *out_lod_fpart = bld->float_bld.zero;
 
    if (bld->static_state->min_max_lod_equal) {
       /* User is forcing sampling from a particular mipmap level.
        * This is hit during mipmap generation.
        */
-      return min_lod;
+      LLVMValueRef min_lod =
+         bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit);
+
+      lod = min_lod;
    }
    else {
-      struct lp_build_context *float_bld = &bld->float_bld;
       LLVMValueRef sampler_lod_bias =
          bld->dynamic_state->lod_bias(bld->dynamic_state, bld->builder, unit);
-      LLVMValueRef max_lod =
-         bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit);
       LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
-      LLVMValueRef lod;
 
       if (explicit_lod) {
          lod = LLVMBuildExtractElement(bld->builder, explicit_lod,
                                        index0, "");
       }
       else {
-         const int dims = texture_dims(bld->static_state->target);
-         LLVMValueRef dsdx, dsdy;
-         LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL;
          LLVMValueRef rho;
 
-         dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx");
-         dsdx = lp_build_abs(float_bld, dsdx);
-         dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy");
-         dsdy = lp_build_abs(float_bld, dsdy);
-         if (dims > 1) {
-            dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx");
-            dtdx = lp_build_abs(float_bld, dtdx);
-            dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy");
-            dtdy = lp_build_abs(float_bld, dtdy);
-            if (dims > 2) {
-               drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx");
-               drdx = lp_build_abs(float_bld, drdx);
-               drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy");
-               drdy = lp_build_abs(float_bld, drdy);
-            }
-         }
+         rho = lp_build_rho(bld, ddx, ddy);
 
-         /* Compute rho = max of all partial derivatives scaled by texture size.
-          * XXX this could be vectorized somewhat
+         /*
+          * Compute lod = log2(rho)
           */
-         rho = LLVMBuildFMul(bld->builder,
-                            lp_build_max(float_bld, dsdx, dsdy),
-                            lp_build_int_to_float(float_bld, width), "");
-         if (dims > 1) {
-            LLVMValueRef max;
-            max = LLVMBuildFMul(bld->builder,
-                               lp_build_max(float_bld, dtdx, dtdy),
-                               lp_build_int_to_float(float_bld, height), "");
-            rho = lp_build_max(float_bld, rho, max);
-            if (dims > 2) {
-               max = LLVMBuildFMul(bld->builder,
-                                  lp_build_max(float_bld, drdx, drdy),
-                                  lp_build_int_to_float(float_bld, depth), "");
-               rho = lp_build_max(float_bld, rho, max);
+
+         if (!lod_bias &&
+             !bld->static_state->lod_bias_non_zero &&
+             !bld->static_state->apply_max_lod &&
+             !bld->static_state->apply_min_lod) {
+            /*
+             * Special case when there are no post-log2 adjustments, which
+             * saves instructions but keeping the integer and fractional lod
+             * computations separate from the start.
+             */
+
+            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
+                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
+               *out_lod_ipart = lp_build_ilog2(float_bld, rho);
+               *out_lod_fpart = bld->float_bld.zero;
+               return;
+            }
+            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
+                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
+               lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR,
+                                      out_lod_ipart, out_lod_fpart);
+               return;
             }
          }
 
-         /* compute lod = log2(rho) */
-         lod = lp_build_log2(float_bld, rho);
+         if (0) {
+            lod = lp_build_log2(float_bld, rho);
+         }
+         else {
+            lod = lp_build_fast_log2(float_bld, rho);
+         }
 
          /* add shader lod bias */
          if (lod_bias) {
@@ -254,13 +486,43 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
       }
 
       /* add sampler lod bias */
-      lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias");
+      if (bld->static_state->lod_bias_non_zero)
+         lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias");
+
 
       /* clamp lod */
-      lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
+      if (bld->static_state->apply_max_lod) {
+         LLVMValueRef max_lod =
+            bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit);
+
+         lod = lp_build_min(float_bld, lod, max_lod);
+      }
+      if (bld->static_state->apply_min_lod) {
+         LLVMValueRef min_lod =
+            bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit);
+
+         lod = lp_build_max(float_bld, lod, min_lod);
+      }
+   }
 
-      return lod;
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
+         lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR,
+                                out_lod_ipart, out_lod_fpart);
+      }
+      else {
+         lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart);
+      }
+
+      lp_build_name(*out_lod_fpart, "lod_fpart");
+   }
+   else {
+      *out_lod_ipart = lp_build_iround(float_bld, lod);
    }
+
+   lp_build_name(*out_lod_ipart, "lod_ipart");
+
+   return;
 }
 
 
@@ -274,10 +536,9 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
 void
 lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                            unsigned unit,
-                           LLVMValueRef lod,
+                           LLVMValueRef lod_ipart,
                            LLVMValueRef *level_out)
 {
-   struct lp_build_context *float_bld = &bld->float_bld;
    struct lp_build_context *int_bld = &bld->int_bld;
    LLVMValueRef last_level, level;
 
@@ -287,7 +548,7 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
                                                bld->builder, unit);
 
    /* convert float lod to integer */
-   level = lp_build_iround(float_bld, lod);
+   level = lod_ipart;
 
    /* clamp level to legal range of levels */
    *level_out = lp_build_clamp(int_bld, level, zero, last_level);
@@ -302,32 +563,61 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
 void
 lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                            unsigned unit,
-                           LLVMValueRef lod,
+                           LLVMValueRef lod_ipart,
+                           LLVMValueRef *lod_fpart_inout,
                            LLVMValueRef *level0_out,
-                           LLVMValueRef *level1_out,
-                           LLVMValueRef *weight_out)
+                           LLVMValueRef *level1_out)
 {
-   struct lp_build_context *float_bld = &bld->float_bld;
+   LLVMBuilderRef builder = bld->builder;
    struct lp_build_context *int_bld = &bld->int_bld;
-   LLVMValueRef last_level, level;
+   struct lp_build_context *float_bld = &bld->float_bld;
+   LLVMValueRef last_level;
+   LLVMValueRef clamp_min;
+   LLVMValueRef clamp_max;
+
+   *level0_out = lod_ipart;
+   *level1_out = lp_build_add(int_bld, lod_ipart, int_bld->one);
 
    last_level = bld->dynamic_state->last_level(bld->dynamic_state,
                                                bld->builder, unit);
 
-   /* convert float lod to integer */
-   level = lp_build_ifloor(float_bld, lod);
-
-   /* compute level 0 and clamp to legal range of levels */
-   *level0_out = lp_build_clamp(int_bld, level,
-                                int_bld->zero,
-                                last_level);
-   /* compute level 1 and clamp to legal range of levels */
-   level = lp_build_add(int_bld, level, int_bld->one);
-   *level1_out = lp_build_clamp(int_bld, level,
-                                int_bld->zero,
-                                last_level);
-
-   *weight_out = lp_build_fract(float_bld, lod);
+   /*
+    * Clamp both lod_ipart and lod_ipart + 1 to [0, last_level], with the
+    * minimum number of comparisons, and zeroing lod_fpart in the extreme
+    * ends in the process.
+    */
+
+   /* lod_ipart < 0 */
+   clamp_min = LLVMBuildICmp(builder, LLVMIntSLT,
+                             lod_ipart, int_bld->zero,
+                             "clamp_lod_to_zero");
+
+   *level0_out = LLVMBuildSelect(builder, clamp_min,
+                                 int_bld->zero, *level0_out, "");
+
+   *level1_out = LLVMBuildSelect(builder, clamp_min,
+                                 int_bld->zero, *level1_out, "");
+
+   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min,
+                                      float_bld->zero, *lod_fpart_inout, "");
+
+   /* lod_ipart >= last_level */
+   clamp_max = LLVMBuildICmp(builder, LLVMIntSGE,
+                             lod_ipart, last_level,
+                             "clamp_lod_to_last");
+
+   *level0_out = LLVMBuildSelect(builder, clamp_max,
+                                 last_level, *level0_out, "");
+
+   *level1_out = LLVMBuildSelect(builder, clamp_max,
+                                 last_level, *level1_out, "");
+
+   *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max,
+                                      float_bld->zero, *lod_fpart_inout, "");
+
+   lp_build_name(*level0_out, "sampler%u_miplevel0", unit);
+   lp_build_name(*level1_out, "sampler%u_miplevel1", unit);
+   lp_build_name(*lod_fpart_inout, "sampler%u_mipweight", unit);
 }
 
 
@@ -338,12 +628,12 @@ lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
  */
 LLVMValueRef
 lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
-                          LLVMValueRef data_array, LLVMValueRef level)
+                          LLVMValueRef level)
 {
    LLVMValueRef indexes[2], data_ptr;
    indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
    indexes[1] = level;
-   data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, "");
+   data_ptr = LLVMBuildGEP(bld->builder, bld->data_array, indexes, 2, "");
    data_ptr = LLVMBuildLoad(bld->builder, data_ptr, "");
    return data_ptr;
 }
@@ -351,10 +641,10 @@ lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
 
 LLVMValueRef
 lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
-                                LLVMValueRef data_array, int level)
+                                int level)
 {
    LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
-   return lp_build_get_mipmap_level(bld, data_array, lvl);
+   return lp_build_get_mipmap_level(bld, lvl);
 }
 
 
@@ -363,18 +653,22 @@ lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
  * Return max(1, base_size >> level);
  */
 static LLVMValueRef
-lp_build_minify(struct lp_build_sample_context *bld,
+lp_build_minify(struct lp_build_context *bld,
                 LLVMValueRef base_size,
                 LLVMValueRef level)
 {
-   if (level == bld->int_coord_bld.zero) {
+   assert(lp_check_value(bld->type, base_size));
+   assert(lp_check_value(bld->type, level));
+
+   if (level == bld->zero) {
       /* if we're using mipmap level zero, no minification is needed */
       return base_size;
    }
    else {
       LLVMValueRef size =
          LLVMBuildLShr(bld->builder, base_size, level, "minify");
-      size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one);
+      assert(bld->type.sign);
+      size = lp_build_max(bld, size, bld->one);
       return size;
    }
 }
@@ -405,71 +699,113 @@ lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
  */
 void
 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
-                            unsigned dims,
-                            LLVMValueRef width_vec,
-                            LLVMValueRef height_vec,
-                            LLVMValueRef depth_vec,
-                            LLVMValueRef ilevel0,
-                            LLVMValueRef ilevel1,
-                            LLVMValueRef row_stride_array,
-                            LLVMValueRef img_stride_array,
-                            LLVMValueRef *width0_vec,
-                            LLVMValueRef *width1_vec,
-                            LLVMValueRef *height0_vec,
-                            LLVMValueRef *height1_vec,
-                            LLVMValueRef *depth0_vec,
-                            LLVMValueRef *depth1_vec,
-                            LLVMValueRef *row_stride0_vec,
-                            LLVMValueRef *row_stride1_vec,
-                            LLVMValueRef *img_stride0_vec,
-                            LLVMValueRef *img_stride1_vec)
+                            LLVMValueRef ilevel,
+                            LLVMValueRef *out_size,
+                            LLVMValueRef *row_stride_vec,
+                            LLVMValueRef *img_stride_vec)
 {
-   const unsigned mip_filter = bld->static_state->min_mip_filter;
-   LLVMValueRef ilevel0_vec, ilevel1_vec;
+   const unsigned dims = bld->dims;
+   LLVMValueRef ilevel_vec;
 
-   ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0);
-   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR)
-      ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1);
+   ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
 
    /*
-    * Compute width, height, depth at mipmap level 'ilevel0'
+    * Compute width, height, depth at mipmap level 'ilevel'
     */
-   *width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec);
+   *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
+
    if (dims >= 2) {
-      *height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec);
-      *row_stride0_vec = lp_build_get_level_stride_vec(bld,
-                                                       row_stride_array,
-                                                       ilevel0);
+      *row_stride_vec = lp_build_get_level_stride_vec(bld,
+                                                      bld->row_stride_array,
+                                                      ilevel);
       if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
-         *img_stride0_vec = lp_build_get_level_stride_vec(bld,
-                                                          img_stride_array,
-                                                          ilevel0);
-         if (dims == 3) {
-            *depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec);
-         }
+         *img_stride_vec = lp_build_get_level_stride_vec(bld,
+                                                         bld->img_stride_array,
+                                                         ilevel);
       }
    }
-   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-      /* compute width, height, depth for second mipmap level at 'ilevel1' */
-      *width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec);
-      if (dims >= 2) {
-         *height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec);
-         *row_stride1_vec = lp_build_get_level_stride_vec(bld,
-                                                          row_stride_array,
-                                                          ilevel1);
-         if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
-            *img_stride1_vec = lp_build_get_level_stride_vec(bld,
-                                                             img_stride_array,
-                                                             ilevel1);
-            if (dims == 3) {
-               *depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec);
-            }
-         }
+}
+
+
+/**
+ * Extract and broadcast texture size.
+ *
+ * @param size_type   type of the texture size vector (either
+ *                    bld->int_size_type or bld->float_size_type)
+ * @param coord_type  type of the texture size vector (either
+ *                    bld->int_coord_type or bld->coord_type)
+ * @param int_size    vector with the integer texture size (width, height,
+ *                    depth)
+ */
+void
+lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
+                             struct lp_type size_type,
+                             struct lp_type coord_type,
+                             LLVMValueRef size,
+                             LLVMValueRef *out_width,
+                             LLVMValueRef *out_height,
+                             LLVMValueRef *out_depth)
+{
+   const unsigned dims = bld->dims;
+   LLVMTypeRef i32t = LLVMInt32Type();
+
+   *out_width = lp_build_extract_broadcast(bld->builder,
+                                           size_type,
+                                           coord_type,
+                                           size,
+                                           LLVMConstInt(i32t, 0, 0));
+   if (dims >= 2) {
+      *out_height = lp_build_extract_broadcast(bld->builder,
+                                               size_type,
+                                               coord_type,
+                                               size,
+                                               LLVMConstInt(i32t, 1, 0));
+      if (dims == 3) {
+         *out_depth = lp_build_extract_broadcast(bld->builder,
+                                                 size_type,
+                                                 coord_type,
+                                                 size,
+                                                 LLVMConstInt(i32t, 2, 0));
       }
    }
 }
 
 
+/**
+ * Unnormalize coords.
+ *
+ * @param int_size  vector with the integer texture size (width, height, depth)
+ */
+void
+lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
+                             LLVMValueRef flt_size,
+                             LLVMValueRef *s,
+                             LLVMValueRef *t,
+                             LLVMValueRef *r)
+{
+   const unsigned dims = bld->dims;
+   LLVMValueRef width;
+   LLVMValueRef height;
+   LLVMValueRef depth;
+
+   lp_build_extract_image_sizes(bld,
+                                bld->float_size_type,
+                                bld->coord_type,
+                                flt_size,
+                                &width,
+                                &height,
+                                &depth);
+
+   /* s = s * width, t = t * height */
+   *s = lp_build_mul(&bld->coord_bld, *s, width);
+   if (dims >= 2) {
+      *t = lp_build_mul(&bld->coord_bld, *t, height);
+      if (dims >= 3) {
+         *r = lp_build_mul(&bld->coord_bld, *r, depth);
+      }
+   }
+}
+
 
 /** Helper used by lp_build_cube_lookup() */
 static LLVMValueRef
@@ -588,25 +924,16 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
    rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, "");
 
    {
-      struct lp_build_flow_context *flow_ctx;
       struct lp_build_if_state if_ctx;
+      LLVMValueRef face_s_var;
+      LLVMValueRef face_t_var;
+      LLVMValueRef face_var;
 
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
-
-      *face_s = bld->coord_bld.undef;
-      *face_t = bld->coord_bld.undef;
-      *face = bld->int_bld.undef;
+      face_s_var = lp_build_alloca(bld->builder, bld->coord_bld.vec_type, "face_s_var");
+      face_t_var = lp_build_alloca(bld->builder, bld->coord_bld.vec_type, "face_t_var");
+      face_var = lp_build_alloca(bld->builder, bld->int_bld.vec_type, "face_var");
 
-      lp_build_name(*face_s, "face_s");
-      lp_build_name(*face_t, "face_t");
-      lp_build_name(*face, "face");
-
-      lp_build_flow_scope_declare(flow_ctx, face_s);
-      lp_build_flow_scope_declare(flow_ctx, face_t);
-      lp_build_flow_scope_declare(flow_ctx, face);
-
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz);
+      lp_build_if(&if_ctx, bld->builder, arx_ge_ary_arz);
       {
          /* +/- X face */
          LLVMValueRef sign = lp_build_sgn(float_bld, rx);
@@ -616,57 +943,52 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
          *face = lp_build_cube_face(bld, rx,
                                     PIPE_TEX_FACE_POS_X,
                                     PIPE_TEX_FACE_NEG_X);
+         LLVMBuildStore(bld->builder, *face_s, face_s_var);
+         LLVMBuildStore(bld->builder, *face_t, face_t_var);
+         LLVMBuildStore(bld->builder, *face, face_var);
       }
       lp_build_else(&if_ctx);
       {
-         struct lp_build_flow_context *flow_ctx2;
          struct lp_build_if_state if_ctx2;
 
-         LLVMValueRef face_s2 = bld->coord_bld.undef;
-         LLVMValueRef face_t2 = bld->coord_bld.undef;
-         LLVMValueRef face2 = bld->int_bld.undef;
-
-         flow_ctx2 = lp_build_flow_create(bld->builder);
-         lp_build_flow_scope_begin(flow_ctx2);
-         lp_build_flow_scope_declare(flow_ctx2, &face_s2);
-         lp_build_flow_scope_declare(flow_ctx2, &face_t2);
-         lp_build_flow_scope_declare(flow_ctx2, &face2);
-
          ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
 
-         lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz);
+         lp_build_if(&if_ctx2, bld->builder, ary_ge_arx_arz);
          {
             /* +/- Y face */
             LLVMValueRef sign = lp_build_sgn(float_bld, ry);
             LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
-            face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
-            face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
-            face2 = lp_build_cube_face(bld, ry,
+            *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
+            *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
+            *face = lp_build_cube_face(bld, ry,
                                        PIPE_TEX_FACE_POS_Y,
                                        PIPE_TEX_FACE_NEG_Y);
+            LLVMBuildStore(bld->builder, *face_s, face_s_var);
+            LLVMBuildStore(bld->builder, *face_t, face_t_var);
+            LLVMBuildStore(bld->builder, *face, face_var);
          }
          lp_build_else(&if_ctx2);
          {
             /* +/- Z face */
             LLVMValueRef sign = lp_build_sgn(float_bld, rz);
             LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
-            face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
-            face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
-            face2 = lp_build_cube_face(bld, rz,
+            *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
+            *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+            *face = lp_build_cube_face(bld, rz,
                                        PIPE_TEX_FACE_POS_Z,
                                        PIPE_TEX_FACE_NEG_Z);
+            LLVMBuildStore(bld->builder, *face_s, face_s_var);
+            LLVMBuildStore(bld->builder, *face_t, face_t_var);
+            LLVMBuildStore(bld->builder, *face, face_var);
          }
          lp_build_endif(&if_ctx2);
-         lp_build_flow_scope_end(flow_ctx2);
-         lp_build_flow_destroy(flow_ctx2);
-         *face_s = face_s2;
-         *face_t = face_t2;
-         *face = face2;
       }
 
       lp_build_endif(&if_ctx);
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
+
+      *face_s = LLVMBuildLoad(bld->builder, face_s_var, "face_s");
+      *face_t = LLVMBuildLoad(bld->builder, face_t_var, "face_t");
+      *face   = LLVMBuildLoad(bld->builder, face_var, "face");
    }
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 4d2eeaa5eb4..ffed27cee83 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -83,6 +83,9 @@ struct lp_sampler_static_state
    unsigned compare_func:3;
    unsigned normalized_coords:1;
    unsigned min_max_lod_equal:1;  /**< min_lod == max_lod ? */
+   unsigned lod_bias_non_zero:1;
+   unsigned apply_min_lod:1;  /**< min_lod > 0 ? */
+   unsigned apply_max_lod:1;  /**< max_lod < last_level ? */
 };
 
 
@@ -176,6 +179,9 @@ struct lp_build_sample_context
 
    const struct util_format_description *format_desc;
 
+   /* See texture_dims() */
+   unsigned dims;
+
    /** regular scalar float type */
    struct lp_type float_type;
    struct lp_build_context float_bld;
@@ -191,17 +197,32 @@ struct lp_build_sample_context
    struct lp_type coord_type;
    struct lp_build_context coord_bld;
 
-   /** Unsigned integer coordinates */
-   struct lp_type uint_coord_type;
-   struct lp_build_context uint_coord_bld;
-
    /** Signed integer coordinates */
    struct lp_type int_coord_type;
    struct lp_build_context int_coord_bld;
 
+   /** Unsigned integer texture size */
+   struct lp_type int_size_type;
+   struct lp_build_context int_size_bld;
+
+   /** Unsigned integer texture size */
+   struct lp_type float_size_type;
+   struct lp_build_context float_size_bld;
+
    /** Output texels type and build context */
    struct lp_type texel_type;
    struct lp_build_context texel_bld;
+
+   /* Common dynamic state values */
+   LLVMValueRef width;
+   LLVMValueRef height;
+   LLVMValueRef depth;
+   LLVMValueRef row_stride_array;
+   LLVMValueRef img_stride_array;
+   LLVMValueRef data_array;
+
+   /** Integer vector with texture width, height, depth */
+   LLVMValueRef int_size;
 };
 
 
@@ -238,7 +259,7 @@ apply_sampler_swizzle(struct lp_build_sample_context *bld,
 }
 
 
-static INLINE int
+static INLINE unsigned
 texture_dims(enum pipe_texture_target tex)
 {
    switch (tex) {
@@ -271,16 +292,16 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
                         const struct pipe_sampler_state *sampler);
 
 
-LLVMValueRef
+void
 lp_build_lod_selector(struct lp_build_sample_context *bld,
                       unsigned unit,
                       const LLVMValueRef ddx[4],
                       const LLVMValueRef ddy[4],
                       LLVMValueRef lod_bias, /* optional */
                       LLVMValueRef explicit_lod, /* optional */
-                      LLVMValueRef width,
-                      LLVMValueRef height,
-                      LLVMValueRef depth);
+                      unsigned mip_filter,
+                      LLVMValueRef *out_lod_ipart,
+                      LLVMValueRef *out_lod_fpart);
 
 void
 lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
@@ -291,40 +312,44 @@ lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
 void
 lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
                            unsigned unit,
-                           LLVMValueRef lod,
+                           LLVMValueRef lod_ipart,
+                           LLVMValueRef *lod_fpart_inout,
                            LLVMValueRef *level0_out,
-                           LLVMValueRef *level1_out,
-                           LLVMValueRef *weight_out);
+                           LLVMValueRef *level1_out);
 
 LLVMValueRef
 lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
-                          LLVMValueRef data_array, LLVMValueRef level);
+                          LLVMValueRef level);
 
 LLVMValueRef
 lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
-                                LLVMValueRef data_array, int level);
+                                int level);
 
 
 void
 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
-                            unsigned dims,
-                            LLVMValueRef width_vec,
-                            LLVMValueRef height_vec,
-                            LLVMValueRef depth_vec,
-                            LLVMValueRef ilevel0,
-                            LLVMValueRef ilevel1,
-                            LLVMValueRef row_stride_array,
-                            LLVMValueRef img_stride_array,
-                            LLVMValueRef *width0_vec,
-                            LLVMValueRef *width1_vec,
-                            LLVMValueRef *height0_vec,
-                            LLVMValueRef *height1_vec,
-                            LLVMValueRef *depth0_vec,
-                            LLVMValueRef *depth1_vec,
-                            LLVMValueRef *row_stride0_vec,
-                            LLVMValueRef *row_stride1_vec,
-                            LLVMValueRef *img_stride0_vec,
-                            LLVMValueRef *img_stride1_vec);
+                            LLVMValueRef ilevel,
+                            LLVMValueRef *out_size_vec,
+                            LLVMValueRef *row_stride_vec,
+                            LLVMValueRef *img_stride_vec);
+
+
+void
+lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
+                             struct lp_type size_type,
+                             struct lp_type coord_type,
+                             LLVMValueRef size,
+                             LLVMValueRef *out_width,
+                             LLVMValueRef *out_height,
+                             LLVMValueRef *out_depth);
+
+
+void
+lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
+                             LLVMValueRef flt_size,
+                             LLVMValueRef *s,
+                             LLVMValueRef *t,
+                             LLVMValueRef *r);
 
 
 void
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index 49a6eed615f..d6831a580b3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -45,6 +45,7 @@
 #include "lp_bld_const.h"
 #include "lp_bld_conv.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_bitarit.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_pack.h"
@@ -80,11 +81,10 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
                                  LLVMValueRef *out_offset,
                                  LLVMValueRef *out_i)
 {
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef length_minus_one;
 
-   length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
 
    switch(wrap_mode) {
    case PIPE_TEX_WRAP_REPEAT:
@@ -92,7 +92,7 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
          coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
       else {
          /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
          coord = LLVMBuildAdd(bld->builder, coord, bias, "");
          coord = LLVMBuildURem(bld->builder, coord, length, "");
       }
@@ -113,7 +113,7 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
       assert(0);
    }
 
-   lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride,
+   lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
                                   out_offset, out_i);
 }
 
@@ -146,7 +146,6 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                 LLVMValueRef *i0,
                                 LLVMValueRef *i1)
 {
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef length_minus_one;
    LLVMValueRef lmask, umask, mask;
@@ -188,8 +187,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
     * multiplication.
     */
 
-   *i0 = uint_coord_bld->zero;
-   *i1 = uint_coord_bld->zero;
+   *i0 = int_coord_bld->zero;
+   *i1 = int_coord_bld->zero;
 
    length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
 
@@ -200,7 +199,7 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
       }
       else {
          /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
          coord0 = LLVMBuildAdd(bld->builder, coord0, bias, "");
          coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
       }
@@ -208,9 +207,9 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
       mask = lp_build_compare(bld->builder, int_coord_bld->type,
                               PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
 
-      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
       *offset1 = LLVMBuildAnd(bld->builder,
-                              lp_build_add(uint_coord_bld, *offset0, stride),
+                              lp_build_add(int_coord_bld, *offset0, stride),
                               mask, "");
       break;
 
@@ -225,8 +224,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
 
       mask = LLVMBuildAnd(bld->builder, lmask, umask, "");
 
-      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
-      *offset1 = lp_build_add(uint_coord_bld,
+      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
+      *offset1 = lp_build_add(int_coord_bld,
                               *offset0,
                               LLVMBuildAnd(bld->builder, stride, mask, ""));
       break;
@@ -239,8 +238,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
    default:
       assert(0);
-      *offset0 = uint_coord_bld->zero;
-      *offset1 = uint_coord_bld->zero;
+      *offset0 = int_coord_bld->zero;
+      *offset1 = int_coord_bld->zero;
       break;
    }
 }
@@ -253,9 +252,7 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
-                              LLVMValueRef width_vec,
-                              LLVMValueRef height_vec,
-                              LLVMValueRef depth_vec,
+                              LLVMValueRef int_size,
                               LLVMValueRef row_stride_vec,
                               LLVMValueRef img_stride_vec,
                               LLVMValueRef data_ptr,
@@ -265,12 +262,13 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                               LLVMValueRef *colors_lo,
                               LLVMValueRef *colors_hi)
 {
-   const int dims = texture_dims(bld->static_state->target);
+   const unsigned dims = bld->dims;
    LLVMBuilderRef builder = bld->builder;
    struct lp_build_context i32, h16, u8n;
    LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
    LLVMValueRef i32_c8;
-   LLVMValueRef s_ipart, t_ipart, r_ipart;
+   LLVMValueRef width_vec, height_vec, depth_vec;
+   LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
    LLVMValueRef x_stride;
    LLVMValueRef x_offset, offset;
    LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
@@ -283,30 +281,33 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
    h16_vec_type = lp_build_vec_type(h16.type);
    u8n_vec_type = lp_build_vec_type(u8n.type);
 
+   lp_build_extract_image_sizes(bld,
+                                bld->int_size_type,
+                                bld->int_coord_type,
+                                int_size,
+                                &width_vec,
+                                &height_vec,
+                                &depth_vec);
+
    if (bld->static_state->normalized_coords) {
-      /* s = s * width, t = t * height */
-      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
-      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
-                                              coord_vec_type, "");
-      s = lp_build_mul(&bld->coord_bld, s, fp_width);
-      if (dims >= 2) {
-         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
-                                                  coord_vec_type, "");
-         t = lp_build_mul(&bld->coord_bld, t, fp_height);
-         if (dims >= 3) {
-            LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
-                                                    coord_vec_type, "");
-            r = lp_build_mul(&bld->coord_bld, r, fp_depth);
-         }
-      }
-   }
+      LLVMValueRef scaled_size;
+      LLVMValueRef flt_size;
 
-   /* scale coords by 256 (8 fractional bits) */
-   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
-   if (dims >= 2)
-      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
-   if (dims >= 3)
-      r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+      /* scale size by 256 (8 fractional bits) */
+      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);
+
+      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);
+
+      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
+   }
+   else {
+      /* scale coords by 256 (8 fractional bits) */
+      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+      if (dims >= 2)
+         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+      if (dims >= 3)
+         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+   }
 
    /* convert float to int */
    s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
@@ -324,7 +325,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
       r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
 
    /* get pixel, row, image strides */
-   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+   x_stride = lp_build_const_vec(bld->int_coord_bld.type,
                                  bld->format_desc->block.bits/8);
 
    /* Do texcoord wrapping, compute texel offset */
@@ -343,7 +344,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                                        bld->static_state->pot_height,
                                        bld->static_state->wrap_t,
                                        &y_offset, &y_subcoord);
-      offset = lp_build_add(&bld->uint_coord_bld, offset, y_offset);
+      offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
       if (dims >= 3) {
          LLVMValueRef z_offset;
          lp_build_sample_wrap_nearest_int(bld,
@@ -352,13 +353,13 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                                           bld->static_state->pot_height,
                                           bld->static_state->wrap_r,
                                           &z_offset, &z_subcoord);
-         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
+         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
       }
       else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
          LLVMValueRef z_offset;
          /* The r coord is the cube face in [0,5] */
-         z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
-         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
+         z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
+         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
       }
    }
 
@@ -417,9 +418,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
-                             LLVMValueRef width_vec,
-                             LLVMValueRef height_vec,
-                             LLVMValueRef depth_vec,
+                             LLVMValueRef int_size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
@@ -429,14 +428,15 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                              LLVMValueRef *colors_lo,
                              LLVMValueRef *colors_hi)
 {
-   const int dims = texture_dims(bld->static_state->target);
+   const unsigned dims = bld->dims;
    LLVMBuilderRef builder = bld->builder;
    struct lp_build_context i32, h16, u8n;
    LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
    LLVMValueRef i32_c8, i32_c128, i32_c255;
+   LLVMValueRef width_vec, height_vec, depth_vec;
    LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
-   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
-   LLVMValueRef r_ipart, r_fpart, r_fpart_lo, r_fpart_hi;
+   LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL;
+   LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL;
    LLVMValueRef x_stride, y_stride, z_stride;
    LLVMValueRef x_offset0, x_offset1;
    LLVMValueRef y_offset0, y_offset1;
@@ -458,30 +458,33 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    h16_vec_type = lp_build_vec_type(h16.type);
    u8n_vec_type = lp_build_vec_type(u8n.type);
 
+   lp_build_extract_image_sizes(bld,
+                                bld->int_size_type,
+                                bld->int_coord_type,
+                                int_size,
+                                &width_vec,
+                                &height_vec,
+                                &depth_vec);
+
    if (bld->static_state->normalized_coords) {
-      /* s = s * width, t = t * height */
-      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
-      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
-                                              coord_vec_type, "");
-      s = lp_build_mul(&bld->coord_bld, s, fp_width);
-      if (dims >= 2) {
-         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
-                                                  coord_vec_type, "");
-         t = lp_build_mul(&bld->coord_bld, t, fp_height);
-      }
-      if (dims >= 3) {
-         LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
-                                                 coord_vec_type, "");
-         r = lp_build_mul(&bld->coord_bld, r, fp_depth);
-      }
-   }
+      LLVMValueRef scaled_size;
+      LLVMValueRef flt_size;
 
-   /* scale coords by 256 (8 fractional bits) */
-   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
-   if (dims >= 2)
-      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
-   if (dims >= 3)
-      r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+      /* scale size by 256 (8 fractional bits) */
+      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);
+
+      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);
+
+      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
+   }
+   else {
+      /* scale coords by 256 (8 fractional bits) */
+      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+      if (dims >= 2)
+         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+      if (dims >= 3)
+         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+   }
 
    /* convert float to int */
    s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
@@ -517,7 +520,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");
 
    /* get pixel, row and image strides */
-   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+   x_stride = lp_build_const_vec(bld->int_coord_bld.type,
                                  bld->format_desc->block.bits/8);
    y_stride = row_stride_vec;
    z_stride = img_stride_vec;
@@ -548,9 +551,9 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
 
       for (z = 0; z < 2; z++) {
          for (x = 0; x < 2; x++) {
-            offset[z][0][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[z][0][x], y_offset0);
-            offset[z][1][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[z][1][x], y_offset1);
          }
       }
@@ -566,20 +569,20 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                                       &z_subcoord[0], &z_subcoord[1]);
       for (y = 0; y < 2; y++) {
          for (x = 0; x < 2; x++) {
-            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[0][y][x], z_offset0);
-            offset[1][y][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[1][y][x], z_offset1);
          }
       }
    }
    else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
       LLVMValueRef z_offset;
-      z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
+      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
       for (y = 0; y < 2; y++) {
          for (x = 0; x < 2; x++) {
             /* The r coord is the cube face in [0,5] */
-            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[0][y][x], z_offset);
          }
       }
@@ -781,76 +784,124 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                        LLVMValueRef s,
                        LLVMValueRef t,
                        LLVMValueRef r,
+                       LLVMValueRef ilevel0,
+                       LLVMValueRef ilevel1,
                        LLVMValueRef lod_fpart,
-                       LLVMValueRef width0_vec,
-                       LLVMValueRef width1_vec,
-                       LLVMValueRef height0_vec,
-                       LLVMValueRef height1_vec,
-                       LLVMValueRef depth0_vec,
-                       LLVMValueRef depth1_vec,
-                       LLVMValueRef row_stride0_vec,
-                       LLVMValueRef row_stride1_vec,
-                       LLVMValueRef img_stride0_vec,
-                       LLVMValueRef img_stride1_vec,
-                       LLVMValueRef data_ptr0,
-                       LLVMValueRef data_ptr1,
-                       LLVMValueRef *colors_lo,
-                       LLVMValueRef *colors_hi)
+                       LLVMValueRef colors_lo_var,
+                       LLVMValueRef colors_hi_var)
 {
+   LLVMBuilderRef builder = bld->builder;
+   LLVMValueRef size0;
+   LLVMValueRef size1;
+   LLVMValueRef row_stride0_vec;
+   LLVMValueRef row_stride1_vec;
+   LLVMValueRef img_stride0_vec;
+   LLVMValueRef img_stride1_vec;
+   LLVMValueRef data_ptr0;
+   LLVMValueRef data_ptr1;
    LLVMValueRef colors0_lo, colors0_hi;
    LLVMValueRef colors1_lo, colors1_hi;
 
+
+   /* sample the first mipmap level */
+   lp_build_mipmap_level_sizes(bld, ilevel0,
+                               &size0,
+                               &row_stride0_vec, &img_stride0_vec);
+   data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
-      /* sample the first mipmap level */
       lp_build_sample_image_nearest(bld,
-                                    width0_vec, height0_vec, depth0_vec,
+                                    size0,
                                     row_stride0_vec, img_stride0_vec,
                                     data_ptr0, s, t, r,
                                     &colors0_lo, &colors0_hi);
-
-      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level */
-         lp_build_sample_image_nearest(bld,
-                                       width1_vec, height1_vec, depth1_vec,
-                                       row_stride1_vec, img_stride1_vec,
-                                       data_ptr1, s, t, r,
-                                       &colors1_lo, &colors1_hi);
-      }
    }
    else {
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
-
-      /* sample the first mipmap level */
       lp_build_sample_image_linear(bld,
-                                   width0_vec, height0_vec, depth0_vec,
+                                   size0,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r,
                                    &colors0_lo, &colors0_hi);
-
-      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level */
-         lp_build_sample_image_linear(bld,
-                                      width1_vec, height1_vec, depth1_vec,
-                                      row_stride1_vec, img_stride1_vec,
-                                      data_ptr1, s, t, r,
-                                      &colors1_lo, &colors1_hi);
-      }
    }
 
+   /* Store the first level's colors in the output variables */
+   LLVMBuildStore(builder, colors0_lo, colors_lo_var);
+   LLVMBuildStore(builder, colors0_hi, colors_hi_var);
+
    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-      /* interpolate samples from the two mipmap levels */
-      struct lp_build_context h16;
-      lp_build_context_init(&h16, bld->builder, lp_type_ufixed(16));
-
-      *colors_lo = lp_build_lerp(&h16, lod_fpart,
-                                 colors0_lo, colors1_lo);
-      *colors_hi = lp_build_lerp(&h16, lod_fpart,
-                                 colors0_hi, colors1_hi);
-   }
-   else {
-      /* use first/only level's colors */
-      *colors_lo = colors0_lo;
-      *colors_hi = colors0_hi;
+      LLVMValueRef h16_scale = LLVMConstReal(LLVMFloatType(), 256.0);
+      LLVMTypeRef i32_type = LLVMIntType(32);
+      struct lp_build_if_state if_ctx;
+      LLVMValueRef need_lerp;
+
+      lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, "");
+      lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16");
+
+      /* need_lerp = lod_fpart > 0 */
+      need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
+                                lod_fpart, LLVMConstNull(i32_type),
+                                "need_lerp");
+
+      lp_build_if(&if_ctx, builder, need_lerp);
+      {
+         struct lp_build_context h16_bld;
+
+         lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16));
+
+         /* sample the second mipmap level */
+         lp_build_mipmap_level_sizes(bld, ilevel1,
+                                     &size1,
+                                     &row_stride1_vec, &img_stride1_vec);
+         data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
+         if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+            lp_build_sample_image_nearest(bld,
+                                          size1,
+                                          row_stride1_vec, img_stride1_vec,
+                                          data_ptr1, s, t, r,
+                                          &colors1_lo, &colors1_hi);
+         }
+         else {
+            lp_build_sample_image_linear(bld,
+                                         size1,
+                                         row_stride1_vec, img_stride1_vec,
+                                         data_ptr1, s, t, r,
+                                         &colors1_lo, &colors1_hi);
+         }
+
+         /* interpolate samples from the two mipmap levels */
+
+         lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
+         lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
+
+#if HAVE_LLVM == 0x208
+         /* This is a work-around for a bug in LLVM 2.8.
+          * Evidently, something goes wrong in the construction of the
+          * lod_fpart short[8] vector.  Adding this no-effect shuffle seems
+          * to force the vector to be properly constructed.
+          * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
+          */
+         {
+            LLVMValueRef shuffles[8], shuffle;
+            int i;
+            assert(h16_bld.type.length <= Elements(shuffles));
+            for (i = 0; i < h16_bld.type.length; i++)
+               shuffles[i] = lp_build_const_int32(2 * (i & 1));
+            shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
+            lod_fpart = LLVMBuildShuffleVector(builder,
+                                               lod_fpart, lod_fpart,
+                                               shuffle, "");
+         }
+#endif
+
+         colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
+                                    colors0_lo, colors1_lo);
+         colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
+                                    colors0_hi, colors1_hi);
+
+         LLVMBuildStore(builder, colors0_lo, colors_lo_var);
+         LLVMBuildStore(builder, colors0_hi, colors_hi_var);
+      }
+      lp_build_endif(&if_ctx);
    }
 }
 
@@ -871,35 +922,22 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
                     const LLVMValueRef *ddy,
                     LLVMValueRef lod_bias, /* optional */
                     LLVMValueRef explicit_lod, /* optional */
-                    LLVMValueRef width,
-                    LLVMValueRef height,
-                    LLVMValueRef depth,
-                    LLVMValueRef width_vec,
-                    LLVMValueRef height_vec,
-                    LLVMValueRef depth_vec,
-                    LLVMValueRef row_stride_array,
-                    LLVMValueRef img_stride_array,
-                    LLVMValueRef data_array,
                     LLVMValueRef texel_out[4])
 {
-   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
    LLVMBuilderRef builder = bld->builder;
    const unsigned mip_filter = bld->static_state->min_mip_filter;
    const unsigned min_filter = bld->static_state->min_img_filter;
    const unsigned mag_filter = bld->static_state->mag_img_filter;
-   const int dims = texture_dims(bld->static_state->target);
-   LLVMValueRef lod = NULL, lod_fpart = NULL;
+   const unsigned dims = bld->dims;
+   LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
    LLVMValueRef ilevel0, ilevel1 = NULL;
-   LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
-   LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
-   LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
-   LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
-   LLVMValueRef data_ptr0, data_ptr1 = NULL;
    LLVMValueRef packed, packed_lo, packed_hi;
    LLVMValueRef unswizzled[4];
    LLVMValueRef face_ddx[4], face_ddy[4];
-   struct lp_build_context h16;
-   LLVMTypeRef h16_vec_type;
+   struct lp_build_context h16_bld;
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMValueRef i32t_zero = LLVMConstInt(i32t, 0, 0);
 
    /* we only support the common/simple wrap modes at this time */
    assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
@@ -910,9 +948,7 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
 
 
    /* make 16-bit fixed-pt builder context */
-   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
-   h16_vec_type = lp_build_vec_type(h16.type);
-
+   lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16));
 
    /* cube face selection, compute pre-face coords, etc. */
    if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
@@ -924,19 +960,18 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
       r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
 
       /* recompute ddx, ddy using the new (s,t) face texcoords */
-      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
-      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
       face_ddx[2] = NULL;
       face_ddx[3] = NULL;
-      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
-      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
       face_ddy[2] = NULL;
       face_ddy[3] = NULL;
       ddx = face_ddx;
       ddy = face_ddy;
    }
 
-
    /*
     * Compute the level of detail (float).
     */
@@ -945,15 +980,16 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
       /* Need to compute lod either to choose mipmap levels or to
        * distinguish between minification/magnification with one mipmap level.
        */
-      lod = lp_build_lod_selector(bld, unit, ddx, ddy,
-                                  lod_bias, explicit_lod,
-                                  width, height, depth);
+      lp_build_lod_selector(bld, unit, ddx, ddy,
+                            lod_bias, explicit_lod,
+                            mip_filter,
+                            &lod_ipart, &lod_fpart);
+   } else {
+      lod_ipart = i32t_zero;
    }
 
    /*
     * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
-    * If mipfilter=linear, also compute the weight between the two
-    * mipmap levels: lod_fpart
     */
    switch (mip_filter) {
    default:
@@ -966,135 +1002,81 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
           * We should be able to set ilevel0 = const(0) but that causes
           * bad x86 code to be emitted.
           */
-         lod = lp_build_const_elem(bld->coord_bld.type, 0.0);
-         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+         assert(lod_ipart);
+         lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
       }
       else {
-         ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+         ilevel0 = i32t_zero;
       }
       break;
    case PIPE_TEX_MIPFILTER_NEAREST:
-      assert(lod);
-      lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+      assert(lod_ipart);
+      lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
       break;
    case PIPE_TEX_MIPFILTER_LINEAR:
-      {
-         LLVMValueRef f256 = LLVMConstReal(LLVMFloatType(), 256.0);
-         LLVMValueRef i255 = lp_build_const_int32(255);
-         LLVMTypeRef i16_type = LLVMIntType(16);
-
-         assert(lod);
-
-         lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1,
-                                    &lod_fpart);
-         lod_fpart = LLVMBuildFMul(builder, lod_fpart, f256, "");
-         lod_fpart = lp_build_ifloor(&bld->float_bld, lod_fpart);
-         lod_fpart = LLVMBuildAnd(builder, lod_fpart, i255, "");
-         lod_fpart = LLVMBuildTrunc(builder, lod_fpart, i16_type, "");
-         lod_fpart = lp_build_broadcast_scalar(&h16, lod_fpart);
-
-         /* the lod_fpart values will be fixed pt values in [0,1) */
-      }
+      assert(lod_ipart);
+      assert(lod_fpart);
+      lp_build_linear_mip_levels(bld, unit,
+                                 lod_ipart, &lod_fpart,
+                                 &ilevel0, &ilevel1);
       break;
    }
 
-   /* compute image size(s) of source mipmap level(s) */
-   lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec,
-                               ilevel0, ilevel1,
-                               row_stride_array, img_stride_array,
-                               &width0_vec, &width1_vec,
-                               &height0_vec, &height1_vec,
-                               &depth0_vec, &depth1_vec,
-                               &row_stride0_vec, &row_stride1_vec,
-                               &img_stride0_vec, &img_stride1_vec);
-
    /*
-    * Get pointer(s) to image data for mipmap level(s).
+    * Get/interpolate texture colors.
     */
-   data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
-   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-      data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1);
-   }
 
+   packed_lo = lp_build_alloca(builder, h16_bld.vec_type, "packed_lo");
+   packed_hi = lp_build_alloca(builder, h16_bld.vec_type, "packed_hi");
 
-   /*
-    * Get/interpolate texture colors.
-    */
    if (min_filter == mag_filter) {
       /* no need to distinquish between minification and magnification */
-      lp_build_sample_mipmap(bld, min_filter, mip_filter,
-                             s, t, r, lod_fpart,
-                             width0_vec, width1_vec,
-                             height0_vec, height1_vec,
-                             depth0_vec, depth1_vec,
-                             row_stride0_vec, row_stride1_vec,
-                             img_stride0_vec, img_stride1_vec,
-                             data_ptr0, data_ptr1,
-                             &packed_lo, &packed_hi);
+      lp_build_sample_mipmap(bld,
+                             min_filter, mip_filter,
+                             s, t, r,
+                             ilevel0, ilevel1, lod_fpart,
+                             packed_lo, packed_hi);
    }
    else {
       /* Emit conditional to choose min image filter or mag image filter
        * depending on the lod being > 0 or <= 0, respectively.
        */
-      struct lp_build_flow_context *flow_ctx;
       struct lp_build_if_state if_ctx;
       LLVMValueRef minify;
 
-      flow_ctx = lp_build_flow_create(builder);
-      lp_build_flow_scope_begin(flow_ctx);
+      /* minify = lod >= 0.0 */
+      minify = LLVMBuildICmp(builder, LLVMIntSGE,
+                             lod_ipart, int_bld->zero, "");
 
-      packed_lo = LLVMGetUndef(h16_vec_type);
-      packed_hi = LLVMGetUndef(h16_vec_type);
-
-      lp_build_flow_scope_declare(flow_ctx, &packed_lo);
-      lp_build_flow_scope_declare(flow_ctx, &packed_hi);
-
-      /* minify = lod > 0.0 */
-      minify = LLVMBuildFCmp(builder, LLVMRealUGE,
-                             lod, float_bld->zero, "");
-
-      lp_build_if(&if_ctx, flow_ctx, builder, minify);
+      lp_build_if(&if_ctx, builder, minify);
       {
          /* Use the minification filter */
-         lp_build_sample_mipmap(bld, min_filter, mip_filter,
-                                s, t, r, lod_fpart,
-                                width0_vec, width1_vec,
-                                height0_vec, height1_vec,
-                                depth0_vec, depth1_vec,
-                                row_stride0_vec, row_stride1_vec,
-                                img_stride0_vec, img_stride1_vec,
-                                data_ptr0, data_ptr1,
-                                &packed_lo, &packed_hi);
+         lp_build_sample_mipmap(bld,
+                                min_filter, mip_filter,
+                                s, t, r,
+                                ilevel0, ilevel1, lod_fpart,
+                                packed_lo, packed_hi);
       }
       lp_build_else(&if_ctx);
       {
          /* Use the magnification filter */
-         lp_build_sample_mipmap(bld, mag_filter, mip_filter,
-                                s, t, r, lod_fpart,
-                                width0_vec, width1_vec,
-                                height0_vec, height1_vec,
-                                depth0_vec, depth1_vec,
-                                row_stride0_vec, row_stride1_vec,
-                                img_stride0_vec, img_stride1_vec,
-                                data_ptr0, data_ptr1,
-                                &packed_lo, &packed_hi);
+         lp_build_sample_mipmap(bld, 
+                                mag_filter, PIPE_TEX_MIPFILTER_NONE,
+                                s, t, r,
+                                i32t_zero, NULL, NULL,
+                                packed_lo, packed_hi);
       }
       lp_build_endif(&if_ctx);
-
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
    }
 
-   /* combine 'packed_lo', 'packed_hi' into 'packed' */
-   {
-      struct lp_build_context h16, u8n;
-
-      lp_build_context_init(&h16, builder, lp_type_ufixed(16));
-      lp_build_context_init(&u8n, builder, lp_type_unorm(8));
-
-      packed = lp_build_pack2(builder, h16.type, u8n.type,
-                              packed_lo, packed_hi);
-   }
+   /*
+    * combine the values stored in 'packed_lo' and 'packed_hi' variables
+    * into 'packed'
+    */
+   packed = lp_build_pack2(builder,
+                           h16_bld.type, lp_type_unorm(8),
+                           LLVMBuildLoad(builder, packed_lo, ""),
+                           LLVMBuildLoad(builder, packed_hi, ""));
 
    /*
     * Convert to SoA and swizzle.
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
index e1045bbbc21..5d9ecac4d50 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
@@ -50,15 +50,6 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
                     const LLVMValueRef *ddy,
                     LLVMValueRef lod_bias, /* optional */
                     LLVMValueRef explicit_lod, /* optional */
-                    LLVMValueRef width,
-                    LLVMValueRef height,
-                    LLVMValueRef depth,
-                    LLVMValueRef width_vec,
-                    LLVMValueRef height_vec,
-                    LLVMValueRef depth_vec,
-                    LLVMValueRef row_stride_array,
-                    LLVMValueRef img_stride_array,
-                    LLVMValueRef data_array,
                     LLVMValueRef texel_out[4]);
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 36a77d3aff0..53cc0c5f345 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -82,7 +82,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                           LLVMValueRef texel_out[4])
 {
    const struct lp_sampler_static_state *static_state = bld->static_state;
-   const int dims = texture_dims(static_state->target);
+   const unsigned dims = bld->dims;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef offset;
    LLVMValueRef i, j;
@@ -131,7 +131,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
    }
 
    /* convert x,y,z coords to linear offset from start of texture, in bytes */
-   lp_build_sample_offset(&bld->uint_coord_bld,
+   lp_build_sample_offset(&bld->int_coord_bld,
                           bld->format_desc,
                           x, y, z, y_stride, z_stride,
                           &offset, &i, &j);
@@ -145,7 +145,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
        * coords which are out of bounds to become zero.  Zero's guaranteed
        * to be inside the texture image.
        */
-      offset = lp_build_andnot(&bld->uint_coord_bld, offset, use_border);
+      offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
    }
 
    lp_build_fetch_rgba_soa(bld->builder,
@@ -202,11 +202,7 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef fract, flr, isOdd;
 
-   /* fract = coord - floor(coord) */
-   fract = lp_build_sub(coord_bld, coord, lp_build_floor(coord_bld, coord));
-
-   /* flr = ifloor(coord); */
-   flr = lp_build_ifloor(coord_bld, coord);
+   lp_build_ifloor_fract(coord_bld, coord, &flr, &fract);
 
    /* isOdd = flr & 1 */
    isOdd = LLVMBuildAnd(bld->builder, flr, int_coord_bld->one, "");
@@ -234,6 +230,7 @@ static void
 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
                             LLVMValueRef coord,
                             LLVMValueRef length,
+                            LLVMValueRef length_f,
                             boolean is_pot,
                             unsigned wrap_mode,
                             LLVMValueRef *x0_out,
@@ -242,10 +239,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 {
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
    LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
-   LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
-   LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
    LLVMValueRef coord0, coord1, weight;
 
    switch(wrap_mode) {
@@ -253,23 +248,25 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       /* mul by size and subtract 0.5 */
       coord = lp_build_mul(coord_bld, coord, length_f);
       coord = lp_build_sub(coord_bld, coord, half);
-      /* convert to int */
-      coord0 = lp_build_ifloor(coord_bld, coord);
-      coord1 = lp_build_add(uint_coord_bld, coord0, uint_coord_bld->one);
-      /* compute lerp weight */
-      weight = lp_build_fract(coord_bld, coord);
+      /* convert to int, compute lerp weight */
+      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
       /* repeat wrap */
       if (is_pot) {
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
          coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
          coord1 = LLVMBuildAnd(bld->builder, coord1, length_minus_one, "");
       }
       else {
          /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
+         LLVMValueRef mask;
          coord0 = LLVMBuildAdd(bld->builder, coord0, bias, "");
-         coord1 = LLVMBuildAdd(bld->builder, coord1, bias, "");
          coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
-         coord1 = LLVMBuildURem(bld->builder, coord1, length, "");
+         mask = lp_build_compare(bld->builder, int_coord_bld->type,
+                                 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+         coord1 = LLVMBuildAnd(bld->builder,
+                              lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
+                              mask, "");
       }
       break;
 
@@ -284,53 +281,47 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 
       coord = lp_build_sub(coord_bld, coord, half);
 
-      weight = lp_build_fract(coord_bld, coord);
-      coord0 = lp_build_ifloor(coord_bld, coord);
+      /* convert to int, compute lerp weight */
+      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       break;
 
    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      if (bld->static_state->normalized_coords) {
-         /* clamp to [0,1] */
-         coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, coord_bld->one);
-         /* mul by tex size and subtract 0.5 */
-         coord = lp_build_mul(coord_bld, coord, length_f);
+      {
+         struct lp_build_context abs_coord_bld = bld->coord_bld;
+         abs_coord_bld.type.sign = FALSE;
+
+         if (bld->static_state->normalized_coords) {
+            /* mul by tex size */
+            coord = lp_build_mul(coord_bld, coord, length_f);
+         }
+         /* clamp to length max */
+         coord = lp_build_min(coord_bld, coord, length_f);
+         /* subtract 0.5 */
          coord = lp_build_sub(coord_bld, coord, half);
+         /* clamp to [0, length - 0.5] */
+         coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+         /* convert to int, compute lerp weight */
+         lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+         /* coord1 = min(coord1, length-1) */
+         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+         break;
       }
-      else {
-         LLVMValueRef min, max;
-         /* clamp to [0.5, length - 0.5] */
-         min = half;
-         max = lp_build_sub(coord_bld, length_f, min);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-      }
-      /* compute lerp weight */
-      weight = lp_build_fract(coord_bld, coord);
-      /* coord0 = floor(coord); */
-      coord0 = lp_build_ifloor(coord_bld, coord);
-      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
-      /* coord0 = max(coord0, 0) */
-      coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
-      /* coord1 = min(coord1, length-1) */
-      coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
-      break;
 
    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
       {
-         LLVMValueRef min, max;
+         LLVMValueRef min;
          if (bld->static_state->normalized_coords) {
             /* scale coord to length */
             coord = lp_build_mul(coord_bld, coord, length_f);
          }
-         /* clamp to [-0.5, length + 0.5] */
-         min = lp_build_const_vec(coord_bld->type, -0.5F);
-         max = lp_build_sub(coord_bld, length_f, min);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
+         /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */
          coord = lp_build_sub(coord_bld, coord, half);
-         /* compute lerp weight */
-         weight = lp_build_fract(coord_bld, coord);
-         /* convert to int */
-         coord0 = lp_build_ifloor(coord_bld, coord);
+         min = lp_build_const_vec(coord_bld->type, -1.0F);
+         coord = lp_build_clamp(coord_bld, coord, min, length_f);
+         /* convert to int, compute lerp weight */
+         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       }
       break;
@@ -343,11 +334,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       coord = lp_build_mul(coord_bld, coord, length_f);
       coord = lp_build_sub(coord_bld, coord, half);
 
-      /* compute lerp weight */
-      weight = lp_build_fract(coord_bld, coord);
-
-      /* convert to int coords */
-      coord0 = lp_build_ifloor(coord_bld, coord);
+      /* convert to int, compute lerp weight */
+      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
 
       /* coord0 = max(coord0, 0) */
@@ -369,15 +357,16 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 
       coord = lp_build_sub(coord_bld, coord, half);
 
-      weight = lp_build_fract(coord_bld, coord);
-      coord0 = lp_build_ifloor(coord_bld, coord);
+      /* convert to int, compute lerp weight */
+      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
       {
          LLVMValueRef min, max;
-
+         struct lp_build_context abs_coord_bld = bld->coord_bld;
+         abs_coord_bld.type.sign = FALSE;
          coord = lp_build_abs(coord_bld, coord);
 
          if (bld->static_state->normalized_coords) {
@@ -392,16 +381,14 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 
          coord = lp_build_sub(coord_bld, coord, half);
 
-         weight = lp_build_fract(coord_bld, coord);
-         coord0 = lp_build_ifloor(coord_bld, coord);
+         /* convert to int, compute lerp weight */
+         lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       }
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
       {
-         LLVMValueRef min, max;
-
          coord = lp_build_abs(coord_bld, coord);
 
          if (bld->static_state->normalized_coords) {
@@ -409,15 +396,13 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
             coord = lp_build_mul(coord_bld, coord, length_f);
          }
 
-         /* clamp to [-0.5, length + 0.5] */
-         min = lp_build_negate(coord_bld, half);
-         max = lp_build_sub(coord_bld, length_f, min);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-
+         /* was: clamp to [-0.5, length + 0.5] then sub 0.5 */
+         /* skip -0.5 clamp (always positive), do sub first */
          coord = lp_build_sub(coord_bld, coord, half);
+         coord = lp_build_min(coord_bld, coord, length_f);
 
-         weight = lp_build_fract(coord_bld, coord);
-         coord0 = lp_build_ifloor(coord_bld, coord);
+         /* convert to int, compute lerp weight */
+         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       }
       break;
@@ -446,14 +431,13 @@ static LLVMValueRef
 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef coord,
                              LLVMValueRef length,
+                             LLVMValueRef length_f,
                              boolean is_pot,
                              unsigned wrap_mode)
 {
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
-   LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
    LLVMValueRef icoord;
    
    switch(wrap_mode) {
@@ -464,7 +448,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          icoord = LLVMBuildAnd(bld->builder, icoord, length_minus_one, "");
       else {
          /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
          icoord = LLVMBuildAdd(bld->builder, icoord, bias, "");
          icoord = LLVMBuildURem(bld->builder, icoord, length, "");
       }
@@ -478,7 +462,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
       }
 
       /* floor */
-      icoord = lp_build_ifloor(coord_bld, coord);
+      /* use itrunc instead since we clamp to 0 anyway */
+      icoord = lp_build_itrunc(coord_bld, coord);
 
       /* clamp to [0, length - 1]. */
       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
@@ -512,7 +497,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
       assert(bld->static_state->normalized_coords);
       coord = lp_build_mul(coord_bld, coord, length_f);
 
-      icoord = lp_build_ifloor(coord_bld, coord);
+      /* itrunc == ifloor here */
+      icoord = lp_build_itrunc(coord_bld, coord);
 
       /* clamp to [0, length - 1] */
       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
@@ -527,7 +513,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
 
-      icoord = lp_build_ifloor(coord_bld, coord);
+      /* itrunc == ifloor here */
+      icoord = lp_build_itrunc(coord_bld, coord);
 
       /* clamp to [0, length - 1] */
       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
@@ -541,7 +528,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
 
-      icoord = lp_build_ifloor(coord_bld, coord);
+      /* itrunc == ifloor here */
+      icoord = lp_build_itrunc(coord_bld, coord);
 
       /* clamp to [0, length] */
       icoord = lp_build_min(int_coord_bld, icoord, length);
@@ -563,9 +551,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
 static void
 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                               unsigned unit,
-                              LLVMValueRef width_vec,
-                              LLVMValueRef height_vec,
-                              LLVMValueRef depth_vec,
+                              LLVMValueRef size,
                               LLVMValueRef row_stride_vec,
                               LLVMValueRef img_stride_vec,
                               LLVMValueRef data_ptr,
@@ -574,25 +560,46 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                               LLVMValueRef r,
                               LLVMValueRef colors_out[4])
 {
-   const int dims = texture_dims(bld->static_state->target);
+   const unsigned dims = bld->dims;
+   LLVMValueRef width_vec;
+   LLVMValueRef height_vec;
+   LLVMValueRef depth_vec;
+   LLVMValueRef flt_size;
+   LLVMValueRef flt_width_vec;
+   LLVMValueRef flt_height_vec;
+   LLVMValueRef flt_depth_vec;
    LLVMValueRef x, y, z;
 
+   lp_build_extract_image_sizes(bld,
+                                bld->int_size_type,
+                                bld->int_coord_type,
+                                size,
+                                &width_vec, &height_vec, &depth_vec);
+
+   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
+
+   lp_build_extract_image_sizes(bld,
+                                bld->float_size_type,
+                                bld->coord_type,
+                                flt_size,
+                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);
+
    /*
     * Compute integer texcoords.
     */
-   x = lp_build_sample_wrap_nearest(bld, s, width_vec,
+   x = lp_build_sample_wrap_nearest(bld, s, width_vec, flt_width_vec,
                                     bld->static_state->pot_width,
                                     bld->static_state->wrap_s);
    lp_build_name(x, "tex.x.wrapped");
 
    if (dims >= 2) {
-      y = lp_build_sample_wrap_nearest(bld, t, height_vec,
+      y = lp_build_sample_wrap_nearest(bld, t, height_vec, flt_height_vec,
                                        bld->static_state->pot_height,
                                        bld->static_state->wrap_t);
       lp_build_name(y, "tex.y.wrapped");
 
       if (dims == 3) {
-         z = lp_build_sample_wrap_nearest(bld, r, depth_vec,
+         z = lp_build_sample_wrap_nearest(bld, r, depth_vec, flt_depth_vec,
                                           bld->static_state->pot_depth,
                                           bld->static_state->wrap_r);
          lp_build_name(z, "tex.z.wrapped");
@@ -626,9 +633,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
 static void
 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                              unsigned unit,
-                             LLVMValueRef width_vec,
-                             LLVMValueRef height_vec,
-                             LLVMValueRef depth_vec,
+                             LLVMValueRef size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
@@ -637,16 +642,37 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                              LLVMValueRef r,
                              LLVMValueRef colors_out[4])
 {
-   const int dims = texture_dims(bld->static_state->target);
+   const unsigned dims = bld->dims;
+   LLVMValueRef width_vec;
+   LLVMValueRef height_vec;
+   LLVMValueRef depth_vec;
+   LLVMValueRef flt_size;
+   LLVMValueRef flt_width_vec;
+   LLVMValueRef flt_height_vec;
+   LLVMValueRef flt_depth_vec;
    LLVMValueRef x0, y0, z0, x1, y1, z1;
    LLVMValueRef s_fpart, t_fpart, r_fpart;
    LLVMValueRef neighbors[2][2][4];
    int chan;
 
+   lp_build_extract_image_sizes(bld,
+                                bld->int_size_type,
+                                bld->int_coord_type,
+                                size,
+                                &width_vec, &height_vec, &depth_vec);
+
+   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
+
+   lp_build_extract_image_sizes(bld,
+                                bld->float_size_type,
+                                bld->coord_type,
+                                flt_size,
+                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);
+
    /*
     * Compute integer texcoords.
     */
-   lp_build_sample_wrap_linear(bld, s, width_vec,
+   lp_build_sample_wrap_linear(bld, s, width_vec, flt_width_vec,
                                bld->static_state->pot_width,
                                bld->static_state->wrap_s,
                                &x0, &x1, &s_fpart);
@@ -654,7 +680,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    lp_build_name(x1, "tex.x1.wrapped");
 
    if (dims >= 2) {
-      lp_build_sample_wrap_linear(bld, t, height_vec,
+      lp_build_sample_wrap_linear(bld, t, height_vec, flt_height_vec,
                                   bld->static_state->pot_height,
                                   bld->static_state->wrap_t,
                                   &y0, &y1, &t_fpart);
@@ -662,7 +688,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       lp_build_name(y1, "tex.y1.wrapped");
 
       if (dims == 3) {
-         lp_build_sample_wrap_linear(bld, r, depth_vec,
+         lp_build_sample_wrap_linear(bld, r, depth_vec, flt_depth_vec,
                                      bld->static_state->pot_depth,
                                      bld->static_state->wrap_r,
                                      &z0, &z1, &r_fpart);
@@ -799,69 +825,92 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                        LLVMValueRef s,
                        LLVMValueRef t,
                        LLVMValueRef r,
+                       LLVMValueRef ilevel0,
+                       LLVMValueRef ilevel1,
                        LLVMValueRef lod_fpart,
-                       LLVMValueRef width0_vec,
-                       LLVMValueRef width1_vec,
-                       LLVMValueRef height0_vec,
-                       LLVMValueRef height1_vec,
-                       LLVMValueRef depth0_vec,
-                       LLVMValueRef depth1_vec,
-                       LLVMValueRef row_stride0_vec,
-                       LLVMValueRef row_stride1_vec,
-                       LLVMValueRef img_stride0_vec,
-                       LLVMValueRef img_stride1_vec,
-                       LLVMValueRef data_ptr0,
-                       LLVMValueRef data_ptr1,
                        LLVMValueRef *colors_out)
 {
+   LLVMBuilderRef builder = bld->builder;
+   LLVMValueRef size0;
+   LLVMValueRef size1;
+   LLVMValueRef row_stride0_vec;
+   LLVMValueRef row_stride1_vec;
+   LLVMValueRef img_stride0_vec;
+   LLVMValueRef img_stride1_vec;
+   LLVMValueRef data_ptr0;
+   LLVMValueRef data_ptr1;
    LLVMValueRef colors0[4], colors1[4];
-   int chan;
+   unsigned chan;
 
+   /* sample the first mipmap level */
+   lp_build_mipmap_level_sizes(bld, ilevel0,
+                               &size0,
+                               &row_stride0_vec, &img_stride0_vec);
+   data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
-      /* sample the first mipmap level */
       lp_build_sample_image_nearest(bld, unit,
-                                    width0_vec, height0_vec, depth0_vec,
+                                    size0,
                                     row_stride0_vec, img_stride0_vec,
-                                    data_ptr0, s, t, r, colors0);
-
-      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level */
-         lp_build_sample_image_nearest(bld, unit,
-                                       width1_vec, height1_vec, depth1_vec,
-                                       row_stride1_vec, img_stride1_vec,
-                                       data_ptr1, s, t, r, colors1);
-      }
+                                    data_ptr0, s, t, r,
+                                    colors0);
    }
    else {
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
-
-      /* sample the first mipmap level */
       lp_build_sample_image_linear(bld, unit,
-                                   width0_vec, height0_vec, depth0_vec,
+                                   size0,
                                    row_stride0_vec, img_stride0_vec,
-                                   data_ptr0, s, t, r, colors0);
+                                   data_ptr0, s, t, r,
+                                   colors0);
+   }
 
-      if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level */
-         lp_build_sample_image_linear(bld, unit,
-                                      width1_vec, height1_vec, depth1_vec,
-                                      row_stride1_vec, img_stride1_vec,
-                                      data_ptr1, s, t, r, colors1);
-      }
+   /* Store the first level's colors in the output variables */
+   for (chan = 0; chan < 4; chan++) {
+       LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
    }
 
    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-      /* interpolate samples from the two mipmap levels */
-      for (chan = 0; chan < 4; chan++) {
-         colors_out[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
+      struct lp_build_if_state if_ctx;
+      LLVMValueRef need_lerp;
+
+      /* need_lerp = lod_fpart > 0 */
+      need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
+                                lod_fpart,
+                                bld->float_bld.zero,
+                                "need_lerp");
+
+      lp_build_if(&if_ctx, builder, need_lerp);
+      {
+         /* sample the second mipmap level */
+         lp_build_mipmap_level_sizes(bld, ilevel1,
+                                     &size1,
+                                     &row_stride1_vec, &img_stride1_vec);
+         data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
+         if (img_filter == PIPE_TEX_FILTER_NEAREST) {
+            lp_build_sample_image_nearest(bld, unit,
+                                          size1,
+                                          row_stride1_vec, img_stride1_vec,
+                                          data_ptr1, s, t, r,
+                                          colors1);
+         }
+         else {
+            lp_build_sample_image_linear(bld, unit,
+                                         size1,
+                                         row_stride1_vec, img_stride1_vec,
+                                         data_ptr1, s, t, r,
+                                         colors1);
+         }
+
+         /* interpolate samples from the two mipmap levels */
+
+         lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart);
+
+         for (chan = 0; chan < 4; chan++) {
+            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
                                           colors0[chan], colors1[chan]);
+            LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
+         }
       }
-   }
-   else {
-      /* use first/only level's colors */
-      for (chan = 0; chan < 4; chan++) {
-         colors_out[chan] = colors0[chan];
-      }
+      lp_build_endif(&if_ctx);
    }
 }
 
@@ -882,30 +931,20 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
                         const LLVMValueRef *ddy,
                         LLVMValueRef lod_bias, /* optional */
                         LLVMValueRef explicit_lod, /* optional */
-                        LLVMValueRef width,
-                        LLVMValueRef height,
-                        LLVMValueRef depth,
-                        LLVMValueRef width_vec,
-                        LLVMValueRef height_vec,
-                        LLVMValueRef depth_vec,
-                        LLVMValueRef row_stride_array,
-                        LLVMValueRef img_stride_array,
-                        LLVMValueRef data_array,
                         LLVMValueRef *colors_out)
 {
-   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMBuilderRef builder = bld->builder;
    const unsigned mip_filter = bld->static_state->min_mip_filter;
    const unsigned min_filter = bld->static_state->min_img_filter;
    const unsigned mag_filter = bld->static_state->mag_img_filter;
-   const int dims = texture_dims(bld->static_state->target);
-   LLVMValueRef lod = NULL, lod_fpart = NULL;
+   LLVMValueRef lod_ipart = NULL, lod_fpart = NULL;
    LLVMValueRef ilevel0, ilevel1 = NULL;
-   LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
-   LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
-   LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
-   LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
-   LLVMValueRef data_ptr0, data_ptr1 = NULL;
    LLVMValueRef face_ddx[4], face_ddy[4];
+   LLVMValueRef texels[4];
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMValueRef i32t_zero = LLVMConstInt(i32t, 0, 0);
+   unsigned chan;
 
    /*
    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
@@ -924,12 +963,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
 
       /* recompute ddx, ddy using the new (s,t) face texcoords */
-      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
-      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
       face_ddx[2] = NULL;
       face_ddx[3] = NULL;
-      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
-      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
       face_ddy[2] = NULL;
       face_ddy[3] = NULL;
       ddx = face_ddx;
@@ -944,126 +983,100 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       /* Need to compute lod either to choose mipmap levels or to
        * distinguish between minification/magnification with one mipmap level.
        */
-      lod = lp_build_lod_selector(bld, unit, ddx, ddy,
-                                  lod_bias, explicit_lod,
-                                  width, height, depth);
+      lp_build_lod_selector(bld, unit, ddx, ddy,
+                            lod_bias, explicit_lod,
+                            mip_filter,
+                            &lod_ipart, &lod_fpart);
+   } else {
+      lod_ipart = i32t_zero;
    }
 
    /*
-    * Compute integer mipmap level(s) to fetch texels from.
+    * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
     */
-   if (mip_filter == PIPE_TEX_MIPFILTER_NONE) {
+   switch (mip_filter) {
+   default:
+      assert(0 && "bad mip_filter value in lp_build_sample_soa()");
+      /* fall-through */
+   case PIPE_TEX_MIPFILTER_NONE:
       /* always use mip level 0 */
       if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
          /* XXX this is a work-around for an apparent bug in LLVM 2.7.
           * We should be able to set ilevel0 = const(0) but that causes
           * bad x86 code to be emitted.
           */
-         lod = lp_build_const_elem(bld->coord_bld.type, 0.0);
-         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+         assert(lod_ipart);
+         lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
       }
       else {
-         ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
-      }
-   }
-   else {
-      assert(lod);
-      if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
-      }
-      else {
-         assert(mip_filter == PIPE_TEX_MIPFILTER_LINEAR);
-         lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1,
-                                    &lod_fpart);
-         lod_fpart = lp_build_broadcast_scalar(&bld->coord_bld, lod_fpart);
+         ilevel0 = i32t_zero;
       }
+      break;
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      assert(lod_ipart);
+      lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0);
+      break;
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      assert(lod_ipart);
+      assert(lod_fpart);
+      lp_build_linear_mip_levels(bld, unit,
+                                 lod_ipart, &lod_fpart,
+                                 &ilevel0, &ilevel1);
+      break;
    }
 
-   /* compute image size(s) of source mipmap level(s) */
-   lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec,
-                               ilevel0, ilevel1,
-                               row_stride_array, img_stride_array,
-                               &width0_vec, &width1_vec,
-                               &height0_vec, &height1_vec,
-                               &depth0_vec, &depth1_vec,
-                               &row_stride0_vec, &row_stride1_vec,
-                               &img_stride0_vec, &img_stride1_vec);
-
    /*
-    * Get pointer(s) to image data for mipmap level(s).
+    * Get/interpolate texture colors.
     */
-   data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
-   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-      data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1);
+
+   for (chan = 0; chan < 4; ++chan) {
+     texels[chan] = lp_build_alloca(builder, bld->texel_bld.vec_type, "");
+     lp_build_name(texels[chan], "sampler%u_texel_%c_var", unit, "xyzw"[chan]);
    }
 
-   /*
-    * Get/interpolate texture colors.
-    */
    if (min_filter == mag_filter) {
       /* no need to distinquish between minification and magnification */
       lp_build_sample_mipmap(bld, unit,
-                             min_filter, mip_filter, s, t, r, lod_fpart,
-                             width0_vec, width1_vec,
-                             height0_vec, height1_vec,
-                             depth0_vec, depth1_vec,
-                             row_stride0_vec, row_stride1_vec,
-                             img_stride0_vec, img_stride1_vec,
-                             data_ptr0, data_ptr1,
-                             colors_out);
+                             min_filter, mip_filter,
+                             s, t, r,
+                             ilevel0, ilevel1, lod_fpart,
+                             texels);
    }
    else {
       /* Emit conditional to choose min image filter or mag image filter
-       * depending on the lod being >0 or <= 0, respectively.
+       * depending on the lod being > 0 or <= 0, respectively.
        */
-      struct lp_build_flow_context *flow_ctx;
       struct lp_build_if_state if_ctx;
       LLVMValueRef minify;
 
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
+      /* minify = lod >= 0.0 */
+      minify = LLVMBuildICmp(builder, LLVMIntSGE,
+                             lod_ipart, int_bld->zero, "");
 
-      lp_build_flow_scope_declare(flow_ctx, &colors_out[0]);
-      lp_build_flow_scope_declare(flow_ctx, &colors_out[1]);
-      lp_build_flow_scope_declare(flow_ctx, &colors_out[2]);
-      lp_build_flow_scope_declare(flow_ctx, &colors_out[3]);
-
-      /* minify = lod > 0.0 */
-      minify = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
-                             lod, float_bld->zero, "");
-
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, minify);
+      lp_build_if(&if_ctx, builder, minify);
       {
          /* Use the minification filter */
          lp_build_sample_mipmap(bld, unit,
                                 min_filter, mip_filter,
-                                s, t, r, lod_fpart,
-                                width0_vec, width1_vec,
-                                height0_vec, height1_vec,
-                                depth0_vec, depth1_vec,
-                                row_stride0_vec, row_stride1_vec,
-                                img_stride0_vec, img_stride1_vec,
-                                data_ptr0, data_ptr1,
-                                colors_out);
+                                s, t, r,
+                                ilevel0, ilevel1, lod_fpart,
+                                texels);
       }
       lp_build_else(&if_ctx);
       {
          /* Use the magnification filter */
          lp_build_sample_mipmap(bld, unit,
-                                mag_filter, mip_filter,
-                                s, t, r, lod_fpart,
-                                width0_vec, width1_vec,
-                                height0_vec, height1_vec,
-                                depth0_vec, depth1_vec,
-                                row_stride0_vec, row_stride1_vec,
-                                img_stride0_vec, img_stride1_vec,
-                                data_ptr0, data_ptr1,
-                                colors_out);
+                                mag_filter, PIPE_TEX_MIPFILTER_NONE,
+                                s, t, r,
+                                i32t_zero, NULL, NULL,
+                                texels);
       }
       lp_build_endif(&if_ctx);
+   }
 
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
+   for (chan = 0; chan < 4; ++chan) {
+     colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
+     lp_build_name(colors_out[chan], "sampler%u_texel_%c", unit, "xyzw"[chan]);
    }
 }
 
@@ -1147,12 +1160,10 @@ lp_build_sample_soa(LLVMBuilderRef builder,
                     LLVMValueRef explicit_lod, /* optional */
                     LLVMValueRef texel_out[4])
 {
+   unsigned dims = texture_dims(static_state->target);
    struct lp_build_sample_context bld;
-   LLVMValueRef width, width_vec;
-   LLVMValueRef height, height_vec;
-   LLVMValueRef depth, depth_vec;
-   LLVMValueRef row_stride_array, img_stride_array;
-   LLVMValueRef data_array;
+   LLVMTypeRef i32t = LLVMInt32Type();
+
    LLVMValueRef s;
    LLVMValueRef t;
    LLVMValueRef r;
@@ -1171,12 +1182,15 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    bld.static_state = static_state;
    bld.dynamic_state = dynamic_state;
    bld.format_desc = util_format_description(static_state->format);
+   bld.dims = dims;
 
    bld.float_type = lp_type_float(32);
    bld.int_type = lp_type_int(32);
    bld.coord_type = type;
-   bld.uint_coord_type = lp_uint_type(type);
    bld.int_coord_type = lp_int_type(type);
+   bld.float_size_type = lp_type_float(32);
+   bld.float_size_type.length = dims > 1 ? 4 : 1;
+   bld.int_size_type = lp_int_type(bld.float_size_type);
    bld.texel_type = type;
 
    float_vec_type = lp_type_float_vec(32);
@@ -1185,27 +1199,40 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    lp_build_context_init(&bld.float_vec_bld, builder, float_vec_type);
    lp_build_context_init(&bld.int_bld, builder, bld.int_type);
    lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
-   lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type);
    lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
+   lp_build_context_init(&bld.int_size_bld, builder, bld.int_size_type);
+   lp_build_context_init(&bld.float_size_bld, builder, bld.float_size_type);
    lp_build_context_init(&bld.texel_bld, builder, bld.texel_type);
 
    /* Get the dynamic state */
-   width = dynamic_state->width(dynamic_state, builder, unit);
-   height = dynamic_state->height(dynamic_state, builder, unit);
-   depth = dynamic_state->depth(dynamic_state, builder, unit);
-   row_stride_array = dynamic_state->row_stride(dynamic_state, builder, unit);
-   img_stride_array = dynamic_state->img_stride(dynamic_state, builder, unit);
-   data_array = dynamic_state->data_ptr(dynamic_state, builder, unit);
+   bld.width = dynamic_state->width(dynamic_state, builder, unit);
+   bld.height = dynamic_state->height(dynamic_state, builder, unit);
+   bld.depth = dynamic_state->depth(dynamic_state, builder, unit);
+   bld.row_stride_array = dynamic_state->row_stride(dynamic_state, builder, unit);
+   bld.img_stride_array = dynamic_state->img_stride(dynamic_state, builder, unit);
+   bld.data_array = dynamic_state->data_ptr(dynamic_state, builder, unit);
    /* Note that data_array is an array[level] of pointers to texture images */
 
    s = coords[0];
    t = coords[1];
    r = coords[2];
 
-   /* width, height, depth as uint vectors */
-   width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
-   height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
-   depth_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, depth);
+   /* width, height, depth as single int vector */
+   if (dims <= 1) {
+      bld.int_size = bld.width;
+   }
+   else {
+      bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef,
+                                            bld.width, LLVMConstInt(i32t, 0, 0), "");
+      if (dims >= 2) {
+         bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
+                                               bld.height, LLVMConstInt(i32t, 1, 0), "");
+         if (dims >= 3) {
+            bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
+                                                  bld.depth, LLVMConstInt(i32t, 2, 0), "");
+         }
+      }
+   }
 
    if (0) {
       /* For debug: no-op texture sampling */
@@ -1217,10 +1244,7 @@ lp_build_sample_soa(LLVMBuilderRef builder,
       /* do sampling/filtering with fixed pt arithmetic */
       lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy,
                           lod_bias, explicit_lod,
-                          width, height, depth,
-                          width_vec, height_vec, depth_vec,
-                          row_stride_array, img_stride_array,
-                          data_array, texel_out);
+                          texel_out);
    }
 
    else {
@@ -1238,10 +1262,6 @@ lp_build_sample_soa(LLVMBuilderRef builder,
 
       lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy,
                               lod_bias, explicit_lod,
-                              width, height, depth,
-                              width_vec, height_vec, depth_vec,
-                              row_stride_array, img_stride_array,
-                              data_array,
                               texel_out);
    }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 2e9e8386de0..4685a90e418 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -101,6 +101,83 @@ lp_build_broadcast_scalar(struct lp_build_context *bld,
 
 
 /**
+ * Combined extract and broadcast (or a mere shuffle when the two types match)
+ */
+LLVMValueRef
+lp_build_extract_broadcast(LLVMBuilderRef builder,
+                           struct lp_type src_type,
+                           struct lp_type dst_type,
+                           LLVMValueRef vector,
+                           LLVMValueRef index)
+{
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMValueRef res;
+
+   assert(src_type.floating == dst_type.floating);
+   assert(src_type.width    == dst_type.width);
+
+   assert(lp_check_value(src_type, vector));
+   assert(LLVMTypeOf(index) == i32t);
+
+   if (src_type.length == 1) {
+      if (dst_type.length == 1) {
+         /*
+          * Trivial scalar -> scalar.
+          */
+
+         res = vector;
+      }
+      else {
+         /*
+          * Broadcast scalar -> vector.
+          */
+
+         res = lp_build_broadcast(builder,
+                                  lp_build_vec_type(dst_type),
+                                  vector);
+      }
+   }
+   else {
+      if (dst_type.length == src_type.length) {
+         /*
+          * Special shuffle of the same size.
+          */
+
+         LLVMValueRef shuffle;
+         shuffle = lp_build_broadcast(builder,
+                                      LLVMVectorType(i32t, dst_type.length),
+                                      index);
+         res = LLVMBuildShuffleVector(builder, vector,
+                                      LLVMGetUndef(lp_build_vec_type(dst_type)),
+                                      shuffle, "");
+      }
+      else {
+         LLVMValueRef scalar;
+         scalar = LLVMBuildExtractElement(builder, vector, index, "");
+         if (dst_type.length == 1) {
+            /*
+             * Trivial extract scalar from vector.
+             */
+
+            res = scalar;
+         }
+         else {
+            /*
+             * General case of different sized vectors.
+             */
+
+            res = lp_build_broadcast(builder,
+                                     lp_build_vec_type(dst_type),
+                                     vector);
+         }
+      }
+   }
+
+   return res;
+}
+
+
+/**
  * Swizzle one channel into all other three channels.
  */
 LLVMValueRef
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index f9b6a5e7258..fdea8442aef 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -55,6 +55,14 @@ lp_build_broadcast_scalar(struct lp_build_context *bld,
                           LLVMValueRef scalar);
 
 
+LLVMValueRef
+lp_build_extract_broadcast(LLVMBuilderRef builder,
+                           struct lp_type src_type,
+                           struct lp_type dst_type,
+                           LLVMValueRef vector,
+                           LLVMValueRef index);
+
+
 /**
  * Broadcast one channel of a vector composed of arrays of XYZW structures into
  * all four channel.
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 97318b3456c..a4d3b750c3c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -36,6 +36,9 @@
 #define LP_BLD_TGSI_H
 
 #include "gallivm/lp_bld.h"
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
 
 
 struct tgsi_token;
@@ -55,6 +58,75 @@ enum lp_build_tex_modifier {
 
 
 /**
+ * Describe a channel of a register.
+ *
+ * The value can be a:
+ * - immediate value (i.e. derived from a IMM register)
+ * - CONST[n].x/y/z/w
+ * - IN[n].x/y/z/w
+ * - undetermined (when .file == TGSI_FILE_NULL)
+ *
+ * This is one of the analysis results, and is used to described
+ * the output color in terms of inputs.
+ */
+struct lp_tgsi_channel_info
+{
+   unsigned file:4; /* TGSI_FILE_* */
+   unsigned swizzle:3; /* PIPE_SWIZZLE_x */
+   union {
+      uint32_t index;
+      float value; /* for TGSI_FILE_IMMEDIATE */
+   } u;
+};
+
+
+/**
+ * Describe a texture sampler interpolator.
+ *
+ * The interpolation is described in terms of regular inputs.
+ */
+struct lp_tgsi_texture_info
+{
+   struct lp_tgsi_channel_info coord[4];
+   unsigned target:8; /* TGSI_TEXTURE_* */
+   unsigned unit:8;  /* Sampler unit */
+   unsigned modifier:8; /* LP_BLD_TEX_MODIFIER_* */
+};
+
+
+struct lp_tgsi_info
+{
+   struct tgsi_shader_info base;
+
+   /*
+    * Whether any of the texture opcodes access a register file other than
+    * TGSI_FILE_INPUT.
+    *
+    * We could also handle TGSI_FILE_CONST/IMMEDIATE here, but there is little
+    * benefit.
+    */
+   unsigned indirect_textures:1;
+
+   /*
+    * Texture opcode description. Aimed at detecting and described direct
+    * texture opcodes.
+    */
+   unsigned num_texs;
+   struct lp_tgsi_texture_info tex[PIPE_MAX_SAMPLERS];
+
+   /*
+    * Output description. Aimed at detecting and describing simple blit
+    * shaders.
+    */
+   struct lp_tgsi_channel_info output[PIPE_MAX_SHADER_OUTPUTS][4];
+
+   /*
+    * Shortcut pointers into the above (for fragment shaders).
+    */
+   const struct lp_tgsi_channel_info *cbuf[PIPE_MAX_COLOR_BUFS];
+};
+
+/**
  * Sampler code generation interface.
  *
  * Although texture sampling is a requirement for TGSI translation, it is
@@ -97,6 +169,11 @@ struct lp_build_sampler_aos
 
 
 void
+lp_build_tgsi_info(const struct tgsi_token *tokens,
+                   struct lp_tgsi_info *info);
+
+
+void
 lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const struct tgsi_token *tokens,
                   struct lp_type type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index d5f963be58d..c3c082b2b95 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -513,7 +513,7 @@ emit_instruction(
 {
    LLVMValueRef src0, src1, src2;
    LLVMValueRef tmp0, tmp1;
-   LLVMValueRef dst0;
+   LLVMValueRef dst0 = NULL;
 
    /*
     * Stores and write masks are handled in a general fashion after the long
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
new file mode 100644
index 00000000000..ad514463de0
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -0,0 +1,479 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_dump.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_tgsi.h"
+
+
+/**
+ * Analysis context.
+ *
+ * This is where we keep store the value of each channel of the IMM/TEMP/OUT
+ * register values, as we walk the shader.
+ */
+struct analysis_context
+{
+   struct lp_tgsi_info *info;
+
+   unsigned num_imms;
+   float imm[32][4];
+
+   struct lp_tgsi_channel_info temp[32][4];
+};
+
+
+/**
+ * Describe the specified channel of the src register.
+ */
+static void
+analyse_src(struct analysis_context *ctx,
+            struct lp_tgsi_channel_info *chan_info,
+            const struct tgsi_src_register *src,
+            unsigned chan)
+{
+   chan_info->file = TGSI_FILE_NULL;
+   if (!src->Indirect && !src->Absolute && !src->Negate) {
+      unsigned swizzle = tgsi_util_get_src_register_swizzle(src, chan);
+      if (src->File == TGSI_FILE_TEMPORARY) {
+         if (src->Index < Elements(ctx->temp)) {
+            *chan_info = ctx->temp[src->Index][swizzle];
+         }
+      } else {
+         chan_info->file = src->File;
+         if (src->File == TGSI_FILE_IMMEDIATE) {
+            assert(src->Index < Elements(ctx->imm));
+            if (src->Index < Elements(ctx->imm)) {
+               chan_info->u.value = ctx->imm[src->Index][swizzle];
+            }
+         } else {
+            chan_info->u.index = src->Index;
+            chan_info->swizzle = swizzle;
+         }
+      }
+   }
+}
+
+
+/**
+ * Whether this register channel refers to a specific immediate value.
+ */
+static boolean
+is_immediate(const struct lp_tgsi_channel_info *chan_info, float value)
+{
+   return chan_info->file == TGSI_FILE_IMMEDIATE &&
+          chan_info->u.value == value;
+}
+
+
+static void
+analyse_tex(struct analysis_context *ctx,
+            const struct tgsi_full_instruction *inst,
+            enum lp_build_tex_modifier modifier)
+{
+   struct lp_tgsi_info *info = ctx->info;
+   unsigned chan;
+
+   if (info->num_texs < Elements(info->tex)) {
+      struct lp_tgsi_texture_info *tex_info = &info->tex[info->num_texs];
+      bool indirect = FALSE;
+      unsigned readmask = 0;
+
+      tex_info->target = inst->Texture.Texture;
+      switch (inst->Texture.Texture) {
+      case TGSI_TEXTURE_1D:
+         readmask = TGSI_WRITEMASK_X;
+         break;
+      case TGSI_TEXTURE_2D:
+      case TGSI_TEXTURE_RECT:
+         readmask = TGSI_WRITEMASK_XY;
+         break;
+      case TGSI_TEXTURE_SHADOW1D:
+      case TGSI_TEXTURE_SHADOW2D:
+      case TGSI_TEXTURE_SHADOWRECT:
+      case TGSI_TEXTURE_3D:
+      case TGSI_TEXTURE_CUBE:
+         readmask = TGSI_WRITEMASK_XYZ;
+         break;
+      default:
+         assert(0);
+         return;
+      }
+
+      if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+         /* We don't track explicit derivatives, although we could */
+         indirect = TRUE;
+         tex_info->unit = inst->Src[3].Register.Index;
+      }  else {
+         if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED ||
+             modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
+             modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
+            readmask |= TGSI_WRITEMASK_W;
+         }
+         tex_info->unit = inst->Src[1].Register.Index;
+      }
+
+      for (chan = 0; chan < 4; ++chan) {
+         struct lp_tgsi_channel_info *chan_info = &tex_info->coord[chan];
+         if (readmask & (1 << chan)) {
+            analyse_src(ctx, chan_info, &inst->Src[0].Register, chan);
+            if (chan_info->file != TGSI_FILE_INPUT) {
+               indirect = TRUE;
+            }
+         } else {
+            memset(chan_info, 0, sizeof *chan_info);
+         }
+      }
+
+      if (indirect) {
+         info->indirect_textures = TRUE;
+      }
+
+      ++info->num_texs;
+   } else {
+      info->indirect_textures = TRUE;
+   }
+}
+
+
+/**
+ * Process an instruction, and update the register values accordingly.
+ */
+static void
+analyse_instruction(struct analysis_context *ctx,
+                    struct tgsi_full_instruction *inst)
+{
+   struct lp_tgsi_info *info = ctx->info;
+   struct lp_tgsi_channel_info (*regs)[4];
+   unsigned max_regs;
+   unsigned i;
+   unsigned index;
+   unsigned chan;
+
+   for (i = 0; i < inst->Instruction.NumDstRegs; ++i) {
+      const struct tgsi_dst_register *dst = &inst->Dst[i].Register;
+
+      /*
+       * Get the lp_tgsi_channel_info array corresponding to the destination
+       * register file.
+       */
+
+      if (dst->File == TGSI_FILE_TEMPORARY) {
+         regs = ctx->temp;
+         max_regs = Elements(ctx->temp);
+      } else if (dst->File == TGSI_FILE_OUTPUT) {
+         regs = info->output;
+         max_regs = Elements(info->output);
+      } else if (dst->File == TGSI_FILE_ADDRESS ||
+                 dst->File == TGSI_FILE_PREDICATE) {
+         continue;
+      } else {
+         assert(0);
+         continue;
+      }
+
+      /*
+       * Detect direct TEX instructions
+       */
+
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_TEX:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_NONE);
+         break;
+      case TGSI_OPCODE_TXD:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV);
+         break;
+      case TGSI_OPCODE_TXB:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS);
+         break;
+      case TGSI_OPCODE_TXL:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD);
+         break;
+      case TGSI_OPCODE_TXP:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_PROJECTED);
+         break;
+      default:
+         break;
+      }
+
+      /*
+       * Keep track of assignments and writes
+       */
+
+      if (dst->Indirect) {
+         /*
+          * It could be any register index so clear all register indices.
+          */
+
+         for (chan = 0; chan < 4; ++chan) {
+            if (dst->WriteMask & (1 << chan)) {
+               for (index = 0; index < max_regs; ++index) {
+                  regs[index][chan].file = TGSI_FILE_NULL;
+               }
+            }
+         }
+      } else if (dst->Index < max_regs) {
+         /*
+          * Update this destination register value.
+          */
+
+         struct lp_tgsi_channel_info res[4];
+
+         memset(res, 0, sizeof res);
+
+         if (!inst->Instruction.Predicate &&
+             !inst->Instruction.Saturate) {
+            for (chan = 0; chan < 4; ++chan) {
+               if (dst->WriteMask & (1 << chan)) {
+                  if (inst->Instruction.Opcode == TGSI_OPCODE_MOV) {
+                     analyse_src(ctx, &res[chan],
+                                 &inst->Src[0].Register, chan);
+                  } else if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) {
+                     /*
+                      * Propagate values across 1.0 and 0.0 multiplications.
+                      */
+
+                     struct lp_tgsi_channel_info src0;
+                     struct lp_tgsi_channel_info src1;
+
+                     analyse_src(ctx, &src0, &inst->Src[0].Register, chan);
+                     analyse_src(ctx, &src1, &inst->Src[1].Register, chan);
+
+                     if (is_immediate(&src0, 0.0f)) {
+                        res[chan] = src0;
+                     } else if (is_immediate(&src1, 0.0f)) {
+                        res[chan] = src1;
+                     } else if (is_immediate(&src0, 1.0f)) {
+                        res[chan] = src1;
+                     } else if (is_immediate(&src1, 1.0f)) {
+                        res[chan] = src0;
+                     }
+                  }
+               }
+            }
+         }
+
+         for (chan = 0; chan < 4; ++chan) {
+            if (dst->WriteMask & (1 << chan)) {
+               regs[dst->Index][chan] = res[chan];
+            }
+         }
+      }
+   }
+
+   /*
+    * Clear all temporaries information in presence of a control flow opcode.
+    */
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_IFC:
+   case TGSI_OPCODE_ELSE:
+   case TGSI_OPCODE_ENDIF:
+   case TGSI_OPCODE_BGNLOOP:
+   case TGSI_OPCODE_BRK:
+   case TGSI_OPCODE_BREAKC:
+   case TGSI_OPCODE_CONT:
+   case TGSI_OPCODE_ENDLOOP:
+   case TGSI_OPCODE_CALLNZ:
+   case TGSI_OPCODE_CAL:
+   case TGSI_OPCODE_BGNSUB:
+   case TGSI_OPCODE_ENDSUB:
+   case TGSI_OPCODE_SWITCH:
+   case TGSI_OPCODE_CASE:
+   case TGSI_OPCODE_DEFAULT:
+   case TGSI_OPCODE_ENDSWITCH:
+   case TGSI_OPCODE_RET:
+   case TGSI_OPCODE_END:
+      /* XXX: Are there more cases? */
+      memset(&ctx->temp, 0, sizeof ctx->temp);
+      memset(&info->output, 0, sizeof info->output);
+   default:
+      break;
+   }
+}
+
+
+static INLINE void
+dump_info(const struct tgsi_token *tokens,
+          struct lp_tgsi_info *info)
+{
+   unsigned index;
+   unsigned chan;
+
+   tgsi_dump(tokens, 0);
+
+   for (index = 0; index < info->num_texs; ++index) {
+      const struct lp_tgsi_texture_info *tex_info = &info->tex[index];
+      debug_printf("TEX[%u] =", index);
+      for (chan = 0; chan < 4; ++chan) {
+         const struct lp_tgsi_channel_info *chan_info =
+               &tex_info->coord[chan];
+         if (chan_info->file != TGSI_FILE_NULL) {
+            debug_printf(" %s[%u].%c",
+                         tgsi_file_names[chan_info->file],
+                         chan_info->u.index,
+                         "xyzw01"[chan_info->swizzle]);
+         } else {
+            debug_printf(" _");
+         }
+      }
+      debug_printf(", SAMP[%u], %s\n",
+                   tex_info->unit,
+                   tgsi_texture_names[tex_info->target]);
+   }
+
+   for (index = 0; index < PIPE_MAX_SHADER_OUTPUTS; ++index) {
+      for (chan = 0; chan < 4; ++chan) {
+         const struct lp_tgsi_channel_info *chan_info =
+               &info->output[index][chan];
+         if (chan_info->file != TGSI_FILE_NULL) {
+            debug_printf("OUT[%u].%c = ", index, "xyzw"[chan]);
+            if (chan_info->file == TGSI_FILE_IMMEDIATE) {
+               debug_printf("%f", chan_info->u.value);
+            } else {
+               const char *file_name;
+               switch (chan_info->file) {
+               case TGSI_FILE_CONSTANT:
+                  file_name = "CONST";
+                  break;
+               case TGSI_FILE_INPUT:
+                  file_name = "IN";
+                  break;
+               default:
+                  file_name = "???";
+                  break;
+               }
+               debug_printf("%s[%u].%c",
+                            file_name,
+                            chan_info->u.index,
+                            "xyzw01"[chan_info->swizzle]);
+            }
+            debug_printf("\n");
+         }
+      }
+   }
+}
+
+
+/**
+ * Detect any direct relationship between the output color
+ */
+void
+lp_build_tgsi_info(const struct tgsi_token *tokens,
+                   struct lp_tgsi_info *info)
+{
+   struct tgsi_parse_context parse;
+   struct analysis_context ctx;
+   unsigned index;
+   unsigned chan;
+
+   memset(info, 0, sizeof *info);
+
+   tgsi_scan_shader(tokens, &info->base);
+
+   memset(&ctx, 0, sizeof ctx);
+   ctx.info = info;
+
+   tgsi_parse_init(&parse, tokens);
+
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         {
+            struct tgsi_full_instruction *inst =
+                  &parse.FullToken.FullInstruction;
+
+            if (inst->Instruction.Opcode == TGSI_OPCODE_END ||
+                inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) {
+               /* We reached the end of main function body. */
+               goto finished;
+            }
+
+            analyse_instruction(&ctx, inst);
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         {
+            const unsigned size =
+                  parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+            assert(size <= 4);
+            if (ctx.num_imms < Elements(ctx.imm)) {
+               for (chan = 0; chan < size; ++chan) {
+                  ctx.imm[ctx.num_imms][chan] =
+                        parse.FullToken.FullImmediate.u[chan].Float;
+               }
+               ++ctx.num_imms;
+            }
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+finished:
+
+   tgsi_parse_free(&parse);
+
+
+   /*
+    * Link the output color values.
+    */
+
+   for (index = 0; index < PIPE_MAX_COLOR_BUFS; ++index) {
+      const struct lp_tgsi_channel_info null_output[4];
+      info->cbuf[index] = null_output;
+   }
+
+   for (index = 0; index < info->base.num_outputs; ++index) {
+      unsigned semantic_name = info->base.output_semantic_name[index];
+      unsigned semantic_index = info->base.output_semantic_index[index];
+      if (semantic_name == TGSI_SEMANTIC_COLOR &&
+          semantic_index < PIPE_MAX_COLOR_BUFS) {
+         info->cbuf[semantic_index] = info->output[index];
+      }
+   }
+
+   if (gallivm_debug & GALLIVM_DEBUG_TGSI) {
+      dump_info(tokens, info);
+   }
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 441aebae298..3c318cc8c80 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -887,21 +887,25 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    }
 
    if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+      LLVMTypeRef i32t = LLVMInt32Type();
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
       for (i = 0; i < num_coords; i++) {
-         ddx[i] = emit_fetch( bld, inst, 1, i );
-         ddy[i] = emit_fetch( bld, inst, 2, i );
+         LLVMValueRef src1 = emit_fetch( bld, inst, 1, i );
+         LLVMValueRef src2 = emit_fetch( bld, inst, 2, i );
+         ddx[i] = LLVMBuildExtractElement(bld->base.builder, src1, index0, "");
+         ddy[i] = LLVMBuildExtractElement(bld->base.builder, src2, index0, "");
       }
       unit = inst->Src[3].Register.Index;
    }  else {
       for (i = 0; i < num_coords; i++) {
-         ddx[i] = lp_build_ddx( &bld->base, coords[i] );
-         ddy[i] = lp_build_ddy( &bld->base, coords[i] );
+         ddx[i] = lp_build_scalar_ddx( &bld->base, coords[i] );
+         ddy[i] = lp_build_scalar_ddy( &bld->base, coords[i] );
       }
       unit = inst->Src[1].Register.Index;
    }
    for (i = num_coords; i < 3; i++) {
-      ddx[i] = bld->base.undef;
-      ddy[i] = bld->base.undef;
+      ddx[i] = LLVMGetUndef(bld->base.elem_type);
+      ddy[i] = LLVMGetUndef(bld->base.elem_type);
    }
 
    bld->sampler->emit_fetch_texel(bld->sampler,
@@ -913,6 +917,43 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
                                   texel);
 }
 
+static boolean
+near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
+		   int pc)
+{
+   int i;
+
+   for (i = 0; i < 5; i++) {
+      unsigned opcode;
+
+      if (pc + i >= bld->info->num_instructions)
+	 return TRUE;
+
+      opcode = bld->instructions[pc + i].Instruction.Opcode;
+
+      if (opcode == TGSI_OPCODE_END)
+	 return TRUE;
+
+      if (opcode == TGSI_OPCODE_TEX ||
+	  opcode == TGSI_OPCODE_TXP ||
+	  opcode == TGSI_OPCODE_TXD ||
+	  opcode == TGSI_OPCODE_TXB ||
+	  opcode == TGSI_OPCODE_TXL ||
+	  opcode == TGSI_OPCODE_TXF ||
+	  opcode == TGSI_OPCODE_TXQ ||
+	  opcode == TGSI_OPCODE_CAL ||
+	  opcode == TGSI_OPCODE_CALLNZ ||
+	  opcode == TGSI_OPCODE_IF ||
+	  opcode == TGSI_OPCODE_IFC ||
+	  opcode == TGSI_OPCODE_BGNLOOP ||
+	  opcode == TGSI_OPCODE_SWITCH)
+	 return FALSE;
+   }
+
+   return TRUE;
+}
+
+
 
 /**
  * Kill fragment if any of the src register values are negative.
@@ -920,7 +961,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
 static void
 emit_kil(
    struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst )
+   const struct tgsi_full_instruction *inst,
+   int pc)
 {
    const struct tgsi_full_src_register *reg = &inst->Src[0];
    LLVMValueRef terms[NUM_CHANNELS];
@@ -959,8 +1001,12 @@ emit_kil(
       }
    }
 
-   if(mask)
+   if(mask) {
       lp_build_mask_update(bld->mask, mask);
+
+      if (!near_end_of_shader(bld, pc))
+	 lp_build_mask_check(bld->mask);
+   }
 }
 
 
@@ -972,7 +1018,8 @@ emit_kil(
  */
 static void
 emit_kilp(struct lp_build_tgsi_soa_context *bld,
-          const struct tgsi_full_instruction *inst)
+          const struct tgsi_full_instruction *inst,
+	  int pc)
 {
    LLVMValueRef mask;
 
@@ -983,10 +1030,14 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld,
       mask = LLVMBuildNot(bld->base.builder, bld->exec_mask.exec_mask, "kilp");
    }
    else {
-      mask = bld->base.zero;
+      LLVMValueRef zero = LLVMConstNull(bld->base.int_vec_type);
+      mask = zero;
    }
 
    lp_build_mask_update(bld->mask, mask);
+
+   if (!near_end_of_shader(bld, pc))
+      lp_build_mask_check(bld->mask);
 }
 
 static void
@@ -1535,12 +1586,12 @@ emit_instruction(
 
    case TGSI_OPCODE_KILP:
       /* predicated kill */
-      emit_kilp( bld, inst );
+      emit_kilp( bld, inst, (*pc)-1 );
       break;
 
    case TGSI_OPCODE_KIL:
       /* conditional kill */
-      emit_kil( bld, inst );
+      emit_kil( bld, inst, (*pc)-1 );
       break;
 
    case TGSI_OPCODE_PK2H:
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index ef4b306cb67..330838d23cf 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -97,7 +97,7 @@ void (*ppc_get_func(struct ppc_function *p))(void)
       return (void (*)(void)) NULL;
    else
 #endif
-      return (void (*)(void)) p->store;
+      return (void (*)(void)) pointer_to_func(p->store);
 }
 
 
diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
index 036c1ee48a8..34bfa527db0 100644
--- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
@@ -23,26 +23,13 @@
 #include "cell/ppu/cell_public.h"
 #endif
 
+
 static INLINE struct pipe_screen *
-sw_screen_create(struct sw_winsys *winsys)
+sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
 {
-   const char *default_driver;
-   const char *driver;
    struct pipe_screen *screen = NULL;
 
 #if defined(GALLIUM_CELL)
-   default_driver = "cell";
-#elif defined(GALLIUM_LLVMPIPE)
-   default_driver = "llvmpipe";
-#elif defined(GALLIUM_SOFTPIPE)
-   default_driver = "softpipe";
-#else
-   default_driver = "";
-#endif
-
-   driver = debug_get_option("GALLIUM_DRIVER", default_driver);
-
-#if defined(GALLIUM_CELL)
    if (screen == NULL && strcmp(driver, "cell") == 0)
       screen = cell_create_screen(winsys);
 #endif
@@ -60,4 +47,26 @@ sw_screen_create(struct sw_winsys *winsys)
    return screen;
 }
 
+
+static INLINE struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys)
+{
+   const char *default_driver;
+   const char *driver;
+
+#if defined(GALLIUM_CELL)
+   default_driver = "cell";
+#elif defined(GALLIUM_LLVMPIPE)
+   default_driver = "llvmpipe";
+#elif defined(GALLIUM_SOFTPIPE)
+   default_driver = "softpipe";
+#else
+   default_driver = "";
+#endif
+
+   driver = debug_get_option("GALLIUM_DRIVER", default_driver);
+   return sw_screen_create_named(winsys, driver);
+}
+
+
 #endif
diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
index 0b4e7404034..e4effa713e9 100644
--- a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
@@ -13,22 +13,28 @@ static INLINE struct pipe_screen *
 sw_screen_wrap(struct pipe_screen *screen)
 {
    struct sw_winsys *sws;
-   struct pipe_screen *sw_screen;
+   struct pipe_screen *sw_screen = NULL;
+   const char *driver;
 
-   sws = wrapper_sw_winsys_warp_pipe_screen(screen);
+   driver = debug_get_option("GALLIUM_DRIVER", "native");
+   if (strcmp(driver, "native") == 0)
+      return screen;
+
+   sws = wrapper_sw_winsys_wrap_pipe_screen(screen);
    if (!sws)
       goto err;
 
-   sw_screen = sw_screen_create(sws);
-   if (sw_screen == screen)
+   sw_screen = sw_screen_create_named(sws, driver);
+
+   if (!sw_screen)
       goto err_winsys;
 
    return sw_screen;
 
 err_winsys:
-   sws->destroy(sws);
+   return wrapper_sw_winsys_dewrap_pipe_screen(sws);
 err:
-  return screen;
+   return screen;
 }
 
 #endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index f71ffb70308..77bde86684e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -90,7 +90,8 @@ static const char *processor_type_names[] =
    "GEOM"
 };
 
-static const char *file_names[TGSI_FILE_COUNT] =
+const char *
+tgsi_file_names[TGSI_FILE_COUNT] =
 {
    "NULL",
    "CONST",
@@ -125,7 +126,8 @@ static const char *semantic_names[] =
    "FACE",
    "EDGEFLAG",
    "PRIM_ID",
-   "INSTANCEID"
+   "INSTANCEID",
+   "STENCIL"
 };
 
 static const char *immediate_type_names[] =
@@ -135,7 +137,8 @@ static const char *immediate_type_names[] =
    "INT32"
 };
 
-static const char *swizzle_names[] =
+const char *
+tgsi_swizzle_names[] =
 {
    "x",
    "y",
@@ -143,7 +146,8 @@ static const char *swizzle_names[] =
    "w"
 };
 
-static const char *texture_names[] =
+const char *
+tgsi_texture_names[] =
 {
    "UNKNOWN",
    "1D",
@@ -201,15 +205,15 @@ _dump_register_src(
    struct dump_ctx *ctx,
    const struct tgsi_full_src_register *src )
 {
-   ENM(src->Register.File, file_names);
+   ENM(src->Register.File, tgsi_file_names);
    if (src->Register.Dimension) {
       if (src->Dimension.Indirect) {
          CHR( '[' );
-         ENM( src->DimIndirect.File, file_names );
+         ENM( src->DimIndirect.File, tgsi_file_names );
          CHR( '[' );
          SID( src->DimIndirect.Index );
          TXT( "]." );
-         ENM( src->DimIndirect.SwizzleX, swizzle_names );
+         ENM( src->DimIndirect.SwizzleX, tgsi_swizzle_names );
          if (src->Dimension.Index != 0) {
             if (src->Dimension.Index > 0)
                CHR( '+' );
@@ -224,11 +228,11 @@ _dump_register_src(
    }
    if (src->Register.Indirect) {
       CHR( '[' );
-      ENM( src->Indirect.File, file_names );
+      ENM( src->Indirect.File, tgsi_file_names );
       CHR( '[' );
       SID( src->Indirect.Index );
       TXT( "]." );
-      ENM( src->Indirect.SwizzleX, swizzle_names );
+      ENM( src->Indirect.SwizzleX, tgsi_swizzle_names );
       if (src->Register.Index != 0) {
          if (src->Register.Index > 0)
             CHR( '+' );
@@ -248,15 +252,15 @@ _dump_register_dst(
    struct dump_ctx *ctx,
    const struct tgsi_full_dst_register *dst )
 {
-   ENM(dst->Register.File, file_names);
+   ENM(dst->Register.File, tgsi_file_names);
    if (dst->Register.Dimension) {
       if (dst->Dimension.Indirect) {
          CHR( '[' );
-         ENM( dst->DimIndirect.File, file_names );
+         ENM( dst->DimIndirect.File, tgsi_file_names );
          CHR( '[' );
          SID( dst->DimIndirect.Index );
          TXT( "]." );
-         ENM( dst->DimIndirect.SwizzleX, swizzle_names );
+         ENM( dst->DimIndirect.SwizzleX, tgsi_swizzle_names );
          if (dst->Dimension.Index != 0) {
             if (dst->Dimension.Index > 0)
                CHR( '+' );
@@ -271,11 +275,11 @@ _dump_register_dst(
    }
    if (dst->Register.Indirect) {
       CHR( '[' );
-      ENM( dst->Indirect.File, file_names );
+      ENM( dst->Indirect.File, tgsi_file_names );
       CHR( '[' );
       SID( dst->Indirect.Index );
       TXT( "]." );
-      ENM( dst->Indirect.SwizzleX, swizzle_names );
+      ENM( dst->Indirect.SwizzleX, tgsi_swizzle_names );
       if (dst->Register.Index != 0) {
          if (dst->Register.Index > 0)
             CHR( '+' );
@@ -351,7 +355,7 @@ iter_declaration(
 
    TXT( "DCL " );
 
-   ENM(decl->Declaration.File, file_names);
+   ENM(decl->Declaration.File, tgsi_file_names);
 
    /* all geometry shader inputs are two dimensional */
    if (decl->Declaration.File == TGSI_FILE_INPUT &&
@@ -585,10 +589,10 @@ iter_instruction(
           inst->Predicate.SwizzleZ != TGSI_SWIZZLE_Z ||
           inst->Predicate.SwizzleW != TGSI_SWIZZLE_W) {
          CHR( '.' );
-         ENM( inst->Predicate.SwizzleX, swizzle_names );
-         ENM( inst->Predicate.SwizzleY, swizzle_names );
-         ENM( inst->Predicate.SwizzleZ, swizzle_names );
-         ENM( inst->Predicate.SwizzleW, swizzle_names );
+         ENM( inst->Predicate.SwizzleX, tgsi_swizzle_names );
+         ENM( inst->Predicate.SwizzleY, tgsi_swizzle_names );
+         ENM( inst->Predicate.SwizzleZ, tgsi_swizzle_names );
+         ENM( inst->Predicate.SwizzleW, tgsi_swizzle_names );
       }
 
       TXT( ") " );
@@ -641,10 +645,10 @@ iter_instruction(
           src->Register.SwizzleZ != TGSI_SWIZZLE_Z ||
           src->Register.SwizzleW != TGSI_SWIZZLE_W) {
          CHR( '.' );
-         ENM( src->Register.SwizzleX, swizzle_names );
-         ENM( src->Register.SwizzleY, swizzle_names );
-         ENM( src->Register.SwizzleZ, swizzle_names );
-         ENM( src->Register.SwizzleW, swizzle_names );
+         ENM( src->Register.SwizzleX, tgsi_swizzle_names );
+         ENM( src->Register.SwizzleY, tgsi_swizzle_names );
+         ENM( src->Register.SwizzleZ, tgsi_swizzle_names );
+         ENM( src->Register.SwizzleW, tgsi_swizzle_names );
       }
 
       if (src->Register.Absolute)
@@ -655,7 +659,7 @@ iter_instruction(
 
    if (inst->Instruction.Texture) {
       TXT( ", " );
-      ENM( inst->Texture.Texture, texture_names );
+      ENM( inst->Texture.Texture, tgsi_texture_names );
    }
 
    switch (inst->Instruction.Opcode) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h
index dd78b361007..fc0429ad8d9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h
@@ -35,6 +35,15 @@
 extern "C" {
 #endif
 
+extern const char *
+tgsi_file_names[TGSI_FILE_COUNT];
+
+extern const char *
+tgsi_swizzle_names[];
+
+extern const char *
+tgsi_texture_names[];
+
 void
 tgsi_dump_str(
    const struct tgsi_token *tokens,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 90198a4f604..6585da3e838 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -147,6 +147,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                   info->input_semantic_name[reg] = (ubyte)fulldecl->Semantic.Name;
                   info->input_semantic_index[reg] = (ubyte)fulldecl->Semantic.Index;
                   info->input_interpolate[reg] = (ubyte)fulldecl->Declaration.Interpolate;
+                  info->input_centroid[reg] = (ubyte)fulldecl->Declaration.Centroid;
                   info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Declaration.CylindricalWrap;
                   info->num_inputs++;
                }
@@ -157,9 +158,11 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
 
                   /* extra info for special outputs */
                   if (procType == TGSI_PROCESSOR_FRAGMENT &&
-                      fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
-                     info->writes_z = TRUE;
-                  }
+                      fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION)
+                        info->writes_z = TRUE;
+                  if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                      fulldecl->Semantic.Name == TGSI_SEMANTIC_STENCIL)
+                        info->writes_stencil = TRUE;
                   if (procType == TGSI_PROCESSOR_VERTEX &&
                       fulldecl->Semantic.Name == TGSI_SEMANTIC_EDGEFLAG) {
                      info->writes_edgeflag = TRUE;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index f8aa90cf065..104097fbc03 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -45,6 +45,7 @@ struct tgsi_shader_info
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
    ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_centroid[PIPE_MAX_SHADER_INPUTS];
    ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
    ubyte input_cylindrical_wrap[PIPE_MAX_SHADER_INPUTS];
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
@@ -60,6 +61,7 @@ struct tgsi_shader_info
    uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
 
    boolean writes_z;  /**< does fragment shader write Z value? */
+   boolean writes_stencil; /**< does fragment shader write stencil value? */
    boolean writes_edgeflag; /**< vertex shader outputs edgeflag */
    boolean uses_kill;  /**< KIL or KILP instruction used? */
 
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index a75380228b1..850ef39ef21 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -68,6 +68,33 @@ struct translate_key {
 };
 
 
+struct translate;
+
+
+typedef void (PIPE_CDECL *run_elts_func)(struct translate *,
+                                         const unsigned *elts,
+                                         unsigned count,
+                                         unsigned instance_id,
+                                         void *output_buffer);
+
+typedef void (PIPE_CDECL *run_elts16_func)(struct translate *,
+                                           const uint16_t *elts,
+                                           unsigned count,
+                                           unsigned instance_id,
+                                           void *output_buffer);
+
+typedef void (PIPE_CDECL *run_elts8_func)(struct translate *,
+                                          const uint8_t *elts,
+                                          unsigned count,
+                                          unsigned instance_id,
+                                          void *output_buffer);
+
+typedef void (PIPE_CDECL *run_func)(struct translate *,
+                                    unsigned start,
+                                    unsigned count,
+                                    unsigned instance_id,
+                                    void *output_buffer);
+
 struct translate {
    struct translate_key key;
 
@@ -79,42 +106,14 @@ struct translate {
 		       unsigned stride,
 		       unsigned max_index );
 
-   void (PIPE_CDECL *run_elts)( struct translate *,
-                                const unsigned *elts,
-                                unsigned count,
-                                unsigned instance_id,
-                                void *output_buffer);
-
-   void (PIPE_CDECL *run_elts16)( struct translate *,
-                                const uint16_t *elts,
-                                unsigned count,
-                                unsigned instance_id,
-                                void *output_buffer);
-
-   void (PIPE_CDECL *run_elts8)( struct translate *,
-                                const uint8_t *elts,
-                                unsigned count,
-                                unsigned instance_id,
-                                void *output_buffer);
-
-   void (PIPE_CDECL *run)( struct translate *,
-                           unsigned start,
-                           unsigned count,
-                           unsigned instance_id,
-                           void *output_buffer);
+   run_elts_func run_elts;
+   run_elts16_func run_elts16;
+   run_elts8_func run_elts8;
+   run_func run;
 };
 
 
 
-#if 0
-struct translate_context *translate_context_create( void );
-void translate_context_destroy( struct translate_context * );
-
-struct translate *translate_lookup_or_create( struct translate_context *tctx,
-					      const struct translate_key *key );
-#endif
-
-
 struct translate *translate_create( const struct translate_key *key );
 
 boolean translate_is_output_format_supported(enum pipe_format format);
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index f8bf5b46692..ef7f4be4c3e 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -1495,19 +1495,19 @@ struct translate *translate_sse2_create( const struct translate_key *key )
    if (!build_vertex_emit(p, &p->elt8_func, 1))
       goto fail;
 
-   p->translate.run = (void*)x86_get_func(&p->linear_func);
+   p->translate.run = (run_func) x86_get_func(&p->linear_func);
    if (p->translate.run == NULL)
       goto fail;
 
-   p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
+   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
    if (p->translate.run_elts == NULL)
       goto fail;
 
-   p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
+   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
    if (p->translate.run_elts16 == NULL)
       goto fail;
 
-   p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
+   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
    if (p->translate.run_elts8 == NULL)
       goto fail;
 
diff --git a/src/gallium/auxiliary/util/u_dl.c b/src/gallium/auxiliary/util/u_dl.c
index 220860ebf4b..aca435d6cad 100644
--- a/src/gallium/auxiliary/util/u_dl.c
+++ b/src/gallium/auxiliary/util/u_dl.c
@@ -38,6 +38,7 @@
 #endif
 
 #include "u_dl.h"
+#include "u_pointer.h"
 
 
 struct util_dl_library *
@@ -58,7 +59,7 @@ util_dl_get_proc_address(struct util_dl_library *library,
                          const char *procname)
 {
 #if defined(PIPE_OS_UNIX)
-   return (util_dl_proc)dlsym((void *)library, procname);
+   return (util_dl_proc) pointer_to_func(dlsym((void *)library, procname));
 #elif defined(PIPE_OS_WINDOWS)
    return (util_dl_proc)GetProcAddress((HMODULE)library, procname);
 #else
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 0811280b97b..8e5d4487a67 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -109,9 +109,12 @@ PIPE_FORMAT_Z32_UNORM               , plain, 1, 1, un32,     ,     ,     , x___,
 PIPE_FORMAT_Z32_FLOAT               , plain, 1, 1, f32 ,     ,     ,     , x___, zs
 PIPE_FORMAT_Z24_UNORM_S8_USCALED    , plain, 1, 1, un24, u8  ,     ,     , xy__, zs
 PIPE_FORMAT_S8_USCALED_Z24_UNORM    , plain, 1, 1, u8 ,  un24,     ,     , yx__, zs
+PIPE_FORMAT_X24S8_USCALED           , plain, 1, 1, x24,  u8  ,     ,     , _y__, zs
+PIPE_FORMAT_S8X24_USCALED           , plain, 1, 1, u8  , x24 ,     ,     , _x__, zs
 PIPE_FORMAT_Z24X8_UNORM             , plain, 1, 1, un24, x8  ,     ,     , x___, zs
 PIPE_FORMAT_X8Z24_UNORM             , plain, 1, 1, x8  , un24,     ,     , y___, zs
 PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED , plain, 1, 1, f32,  u8  , x24 ,     , xy__, zs
+PIPE_FORMAT_X32_S8X24_USCALED       , plain, 1, 1, x32,  u8  , x24 ,     , _y__, zs
 
 # YUV formats
 # http://www.fourcc.org/yuv.php#UYVY
diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c
index 792d69c214c..80081e22f7c 100644
--- a/src/gallium/auxiliary/util/u_format_zs.c
+++ b/src/gallium/auxiliary/util/u_format_zs.c
@@ -918,3 +918,56 @@ util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned d
    }
 }
 
+
+void
+util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_z24_unorm_s8_uscaled_unpack_s_8uscaled(dst_row, dst_stride,
+						      src_row, src_stride,
+						      width, height);
+}
+
+void
+util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_z24_unorm_s8_uscaled_pack_s_8uscaled(dst_row, dst_stride,
+						    src_row, src_stride,
+						    width, height);
+}
+
+void
+util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_s8_uscaled_z24_unorm_unpack_s_8uscaled(dst_row, dst_stride,
+						      src_row, src_stride,
+						      width, height);
+}
+
+void
+util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_s8_uscaled_z24_unorm_pack_s_8uscaled(dst_row, dst_stride,
+						      src_row, src_stride,
+						      width, height);
+}
+
+void
+util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+						const uint8_t *src_row, unsigned src_stride,
+						unsigned width, unsigned height)
+{
+   util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(dst_row, dst_stride,
+							 src_row, src_stride,
+							 width, height);
+
+}
+
+void
+util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+					      const uint8_t *src_row, unsigned src_stride,
+					      unsigned width, unsigned height)
+{
+   util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(dst_row, dst_stride,
+                                                       src_row, src_stride,
+						       width, height);
+}
diff --git a/src/gallium/auxiliary/util/u_format_zs.h b/src/gallium/auxiliary/util/u_format_zs.h
index 650db4b95fd..1604cc3eee2 100644
--- a/src/gallium/auxiliary/util/u_format_zs.h
+++ b/src/gallium/auxiliary/util/u_format_zs.h
@@ -192,5 +192,21 @@ util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned
 void
 util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
 
+void
+util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
 
+void
+util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_sride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
 #endif /* U_FORMAT_ZS_H_ */
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 69a76814945..37294b7203f 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -118,6 +118,11 @@ __inline double __cdecl atan2(double val)
 #endif
 
 
+#ifndef M_SQRT2
+#define M_SQRT2 1.41421356237309504880
+#endif
+
+
 #if defined(_MSC_VER) 
 
 #if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index c90b0fdbc3f..5378f2d782f 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -434,8 +434,8 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color *
 /* Integer versions of util_pack_z and util_pack_z_stencil - useful for
  * constructing clear masks.
  */
-static INLINE uint
-util_pack_uint_z(enum pipe_format format, unsigned z)
+static INLINE uint32_t
+util_pack_mask_z(enum pipe_format format, uint32_t z)
 {
    switch (format) {
    case PIPE_FORMAT_Z16_UNORM:
@@ -452,29 +452,32 @@ util_pack_uint_z(enum pipe_format format, unsigned z)
    case PIPE_FORMAT_S8_USCALED:
       return 0;
    default:
-      debug_print_format("gallium: unhandled format in util_pack_z()", format);
+      debug_print_format("gallium: unhandled format in util_pack_mask_z()", format);
       assert(0);
       return 0;
    }
 }
 
-static INLINE uint
-util_pack_uint_z_stencil(enum pipe_format format, double z, uint s)
+static INLINE uint32_t
+util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s)
 {
-   unsigned packed = util_pack_uint_z(format, z);
-
-   s &= 0xff;
+   uint32_t packed = util_pack_mask_z(format, z);
 
    switch (format) {
    case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-      return packed | (s << 24);
+      packed |= (uint32_t)s << 24;
+      break;
    case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-      return packed | s;
+      packed |= s;
+      break;
    case PIPE_FORMAT_S8_USCALED:
-      return packed | s;
+      packed |= s;
+      break;
    default:
-      return packed;
+      break;
    }
+
+   return packed;
 }
 
 
@@ -482,9 +485,11 @@ util_pack_uint_z_stencil(enum pipe_format format, double z, uint s)
 /**
  * Note: it's assumed that z is in [0,1]
  */
-static INLINE uint
+static INLINE uint32_t
 util_pack_z(enum pipe_format format, double z)
 {
+   union fi fui;
+
    if (z == 0.0)
       return 0;
 
@@ -492,24 +497,25 @@ util_pack_z(enum pipe_format format, double z)
    case PIPE_FORMAT_Z16_UNORM:
       if (z == 1.0)
          return 0xffff;
-      return (uint) (z * 0xffff);
+      return (uint32_t) (z * 0xffff);
    case PIPE_FORMAT_Z32_UNORM:
       /* special-case to avoid overflow */
       if (z == 1.0)
          return 0xffffffff;
-      return (uint) (z * 0xffffffff);
+      return (uint32_t) (z * 0xffffffff);
    case PIPE_FORMAT_Z32_FLOAT:
-      return (uint)z;
+      fui.f = (float)z;
+      return fui.ui;
    case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
    case PIPE_FORMAT_Z24X8_UNORM:
       if (z == 1.0)
          return 0xffffff;
-      return (uint) (z * 0xffffff);
+      return (uint32_t) (z * 0xffffff);
    case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
    case PIPE_FORMAT_X8Z24_UNORM:
       if (z == 1.0)
          return 0xffffff00;
-      return ((uint) (z * 0xffffff)) << 8;
+      return ((uint32_t) (z * 0xffffff)) << 8;
    case PIPE_FORMAT_S8_USCALED:
       /* this case can get it via util_pack_z_stencil() */
       return 0;
@@ -525,14 +531,14 @@ util_pack_z(enum pipe_format format, double z)
  * Pack Z and/or stencil values into a 32-bit value described by format.
  * Note: it's assumed that z is in [0,1] and s in [0,255]
  */
-static INLINE uint
-util_pack_z_stencil(enum pipe_format format, double z, uint s)
+static INLINE uint32_t
+util_pack_z_stencil(enum pipe_format format, double z, uint8_t s)
 {
-   unsigned packed = util_pack_z(format, z);
+   uint32_t packed = util_pack_z(format, z);
 
    switch (format) {
    case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-      packed |= s << 24;
+      packed |= (uint32_t)s << 24;
       break;
    case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
       packed |= s;
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 03198c91da4..1df6c872677 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -71,6 +71,96 @@ _mm_castps_si128(__m128 a)
 
 #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
 
+union m128i {
+   __m128i m;
+   ubyte ub[16];
+   ushort us[8];
+   uint ui[4];
+};
+
+static INLINE void u_print_epi8(const char *name, __m128i r)
+{
+   union { __m128i m; ubyte ub[16]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x\n",
+                name,
+                u.ub[0],  u.ub[1],  u.ub[2],  u.ub[3],
+                u.ub[4],  u.ub[5],  u.ub[6],  u.ub[7],
+                u.ub[8],  u.ub[9],  u.ub[10], u.ub[11],
+                u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
+}
+
+static INLINE void u_print_epi16(const char *name, __m128i r)
+{
+   union { __m128i m; ushort us[8]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x\n",
+                name,
+                u.us[0],  u.us[1],  u.us[2],  u.us[3],
+                u.us[4],  u.us[5],  u.us[6],  u.us[7]);
+}
+
+static INLINE void u_print_epi32(const char *name, __m128i r)
+{
+   union { __m128i m; uint ui[4]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%08x/"
+                "%08x/"
+                "%08x/"
+                "%08x\n",
+                name,
+                u.ui[0],  u.ui[1],  u.ui[2],  u.ui[3]);
+}
+
+static INLINE void u_print_ps(const char *name, __m128 r)
+{
+   union { __m128 m; float f[4]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%f/"
+                "%f/"
+                "%f/"
+                "%f\n",
+                name,
+                u.f[0],  u.f[1],  u.f[2],  u.f[3]);
+}
+
+
+#define U_DUMP_EPI32(a) u_print_epi32(#a, a)
+#define U_DUMP_EPI16(a) u_print_epi16(#a, a)
+#define U_DUMP_EPI8(a)  u_print_epi8(#a, a)
+#define U_DUMP_PS(a)    u_print_ps(#a, a)
+
+
 
 #if defined(PIPE_ARCH_SSSE3)
 
@@ -98,6 +188,68 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
 #endif /* !PIPE_ARCH_SSSE3 */
 
 
-#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+
+
+/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * I suspect this works fine for us because one of our operands is
+ * always positive, but not sure that this can be used for general
+ * signed integer multiplication.
+ *
+ * This seems close enough to the speed of SSE4 and the real
+ * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
+ * dependency at this point.
+ */
+static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
+{
+   __m128i a4   = _mm_srli_epi64(a, 32);  /* shift by one dword */
+   __m128i b4   = _mm_srli_epi64(b, 32);  /* shift by one dword */
+   __m128i ba   = _mm_mul_epu32(b, a);   /* multply dwords 0, 2 */
+   __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */
+
+   /* Interleave the results, either with shuffles or (slightly
+    * faster) direct bit operations:
+    */
+#if 0
+   __m128i ba8             = _mm_shuffle_epi32(ba, 8);
+   __m128i b4a48           = _mm_shuffle_epi32(b4a4, 8);
+   __m128i result          = _mm_unpacklo_epi32(ba8, b4a48);
+#else
+   __m128i mask            = _mm_setr_epi32(~0,0,~0,0);
+   __m128i ba_mask         = _mm_and_si128(ba, mask);
+   __m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32);
+   __m128i result          = _mm_or_si128(ba_mask, b4a4_mask_shift);
+#endif
+
+   return result;
+}
+
+
+static INLINE void
+transpose4_epi32(const __m128i * restrict a,
+                 const __m128i * restrict b,
+                 const __m128i * restrict c,
+                 const __m128i * restrict d,
+                 __m128i * restrict o,
+                 __m128i * restrict p,
+                 __m128i * restrict q,
+                 __m128i * restrict r)
+{
+  __m128i t0 = _mm_unpacklo_epi32(*a, *b);
+  __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+  __m128i t2 = _mm_unpackhi_epi32(*a, *b);
+  __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+  *o = _mm_unpacklo_epi64(t0, t1);
+  *p = _mm_unpackhi_epi64(t0, t1);
+  *q = _mm_unpacklo_epi64(t2, t3);
+  *r = _mm_unpackhi_epi64(t2, t3);
+}
+
+#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
+
+
+#endif /* PIPE_ARCH_SSE */
 
 #endif /* U_SSE_H_ */
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index f7aa1403d08..44cadbfcdd0 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -217,6 +217,81 @@ z24s8_get_tile_rgba(const unsigned *src,
    }
 }
 
+/*** PIPE_FORMAT_S8X24_USCALED ***/
+
+/**
+ * Return S component as four uint32_t in [0..255].  Z part ignored.
+ */
+static void
+s8x24_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float)((*src++ >> 24) & 0xff);
+      }
+
+      p += dst_stride;
+   }
+}
+
+/*** PIPE_FORMAT_X24S8_USCALED ***/
+
+/**
+ * Return S component as four uint32_t in [0..255].  Z part ignored.
+ */
+static void
+x24s8_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float)(*src++ & 0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/**
+ * Return S component as four uint32_t in [0..255].  Z part ignored.
+ */
+static void
+s8_get_tile_rgba(const unsigned char *src,
+		 unsigned w, unsigned h,
+		 float *p,
+		 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float)(*src++ & 0xff);
+      }
+      p += dst_stride;
+   }
+}
 
 /*** PIPE_FORMAT_Z32_FLOAT ***/
 
@@ -261,10 +336,19 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_Z24X8_UNORM:
       s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
+   case PIPE_FORMAT_S8_USCALED:
+      s8_get_tile_rgba((unsigned char *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_X24S8_USCALED:
+      s8x24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
    case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
    case PIPE_FORMAT_X8Z24_UNORM:
       z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
+   case PIPE_FORMAT_S8X24_USCALED:
+      x24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
    case PIPE_FORMAT_Z32_FLOAT:
       z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
       break;
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index 5342fc25dc1..e09a1304c4d 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -156,6 +156,15 @@ If there is an index buffer bound, and ``indexed`` field is true, all vertex
 indices will be looked up in the index buffer.  ``min_index``, ``max_index``,
 and ``index_bias`` apply after index lookup.
 
+When drawing indexed primitives, the primitive restart index can be
+used to draw disjoint primitive strips.  For example, several separate
+line strips can be drawn by designating a special index value as the
+restart index.  The ``primitive_restart`` flag enables/disables this
+feature.  The ``restart_index`` field specifies the restart index value.
+
+When primitive restart is in use, array indexes are compared to the
+restart index before adding the index_bias offset.
+
 If a given vertex element has ``instance_divisor`` set to 0, it is said
 it contains per-vertex data and effective vertex attribute address needs
 to be recalculated for every index.
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 9e02d43ab74..d99ed7c6d6b 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -1415,6 +1415,12 @@ Edge flags are used to control which lines or points are actually
 drawn when the polygon mode converts triangles/quads/polygons into
 points or lines.
 
+TGSI_SEMANTIC_STENCIL
+""""""""""""""""""""""
+
+For fragment shaders, this semantic label indicates than an output
+is a writable stencil reference value. Only the Y component is writable.
+This allows the fragment shader to change the fragments stencilref value.
 
 
 Properties
@@ -1493,6 +1499,8 @@ well.
 | Z                  | XXX TBD      | (z, z, z, 1)       | (0, z, 0, 1) |
 |                    |              | [#depth-tex-mode]_ |              |
 +--------------------+--------------+--------------------+--------------+
+| S                  | (s, s, s, s) | unknown            | unknown      |
++--------------------+--------------+--------------------+--------------+
 
 .. [#envmap-bumpmap] http://www.opengl.org/registry/specs/ATI/envmap_bumpmap.txt
 .. [#depth-tex-mode] the default is (z, z, z, 1) but may also be (0, 0, 0, z)
diff --git a/src/gallium/drivers/galahad/glhd_context.c b/src/gallium/drivers/galahad/glhd_context.c
index ff6d2aa00ab..50f66079c2a 100644
--- a/src/gallium/drivers/galahad/glhd_context.c
+++ b/src/gallium/drivers/galahad/glhd_context.c
@@ -641,7 +641,7 @@ galahad_set_index_buffer(struct pipe_context *_pipe,
          break;
       default:
          glhd_warn("index buffer %p has unrecognized index size %d",
-               _ib->buffer, _ib->index_size);
+                   (void *) _ib->buffer, _ib->index_size);
          break;
       }
    }
@@ -1013,7 +1013,7 @@ galahad_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
 
    glhd_pipe->pipe = pipe;
 
-   glhd_warn("Created context %p", glhd_pipe);
+   glhd_warn("Created context %p", (void *) glhd_pipe);
 
    return &glhd_pipe->base;
 }
diff --git a/src/gallium/drivers/galahad/glhd_screen.c b/src/gallium/drivers/galahad/glhd_screen.c
index 288941b1066..b6cc41d908b 100644
--- a/src/gallium/drivers/galahad/glhd_screen.c
+++ b/src/gallium/drivers/galahad/glhd_screen.c
@@ -370,7 +370,7 @@ galahad_screen_create(struct pipe_screen *screen)
 
    glhd_screen->screen = screen;
 
-   glhd_warn("Created screen %p", glhd_screen);
+   glhd_warn("Created screen %p", (void *) glhd_screen);
 
    return &glhd_screen->base;
 }
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 55b877b4ab9..08da2286b05 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -28,8 +28,6 @@ C_SOURCES = \
 	lp_scene_queue.c \
 	lp_screen.c \
 	lp_setup.c \
-	lp_setup_coef.c \
-	lp_setup_coef_intrin.c \
 	lp_setup_line.c \
 	lp_setup_point.c \
 	lp_setup_tri.c \
@@ -38,6 +36,7 @@ C_SOURCES = \
 	lp_state_clip.c \
 	lp_state_derived.c \
 	lp_state_fs.c \
+	lp_state_setup.c \
 	lp_state_gs.c \
 	lp_state_rasterizer.c \
 	lp_state_sampler.c \
@@ -63,12 +62,12 @@ PROGS := lp_test_format	\
 # Need this for the lp_test_*.o files
 CLEAN_EXTRA = *.o
 
+include ../../Makefile.template
+
 lp_test_sincos.o : sse_mathfun.h
 
 PROGS_DEPS := ../../auxiliary/libgallium.a
 
-include ../../Makefile.template
-
 lp_tile_soa.c: lp_tile_soa.py ../../auxiliary/util/u_format_parse.py ../../auxiliary/util/u_format_pack.py ../../auxiliary/util/u_format.csv
 	python lp_tile_soa.py ../../auxiliary/util/u_format.csv > $@
 
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 650435f0f19..49950153a4f 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -27,13 +27,7 @@ env.Depends('lp_tile_soa.c', [
 ])
 
 
-# Only enable SSSE3 for lp_tile_soa_sse3.c
-ssse3_env = env.Clone()
-if env['gcc'] \
-   and distutils.version.LooseVersion(env['CCVERSION']) >= distutils.version.LooseVersion('4.3') \
-   and env['machine'] in ('x86', 'x86_64') :
-    ssse3_env.Append(CCFLAGS = ['-mssse3'])
-lp_tile_soa_os = ssse3_env.SharedObject('lp_tile_soa.c')
+lp_tile_soa_os = env.SharedObject('lp_tile_soa.c')
 
 
 llvmpipe = env.ConvenienceLibrary(
@@ -64,13 +58,12 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_setup_line.c',
 		'lp_setup_point.c',
 		'lp_setup_tri.c',
-		'lp_setup_coef.c',
-		'lp_setup_coef_intrin.c',
 		'lp_setup_vbuf.c',
 		'lp_state_blend.c',
 		'lp_state_clip.c',
 		'lp_state_derived.c',
 		'lp_state_fs.c',
+		'lp_state_setup.c',
 		'lp_state_gs.c',
 		'lp_state_rasterizer.c',
 		'lp_state_sampler.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
index e28efe778f9..e50643790c8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
@@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder,
                     struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
-                    LLVMValueRef ref)
+                    LLVMValueRef ref,
+                    boolean do_branch)
 {
    struct lp_build_context bld;
    LLVMValueRef test;
@@ -60,4 +61,7 @@ lp_build_alpha_test(LLVMBuilderRef builder,
    lp_build_name(test, "alpha_mask");
 
    lp_build_mask_update(mask, test);
+
+   if (do_branch)
+      lp_build_mask_check(mask);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
index 44603b418c0..27ca8aad4d4 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder,
                     struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
-                    LLVMValueRef ref);
+                    LLVMValueRef ref,
+                    boolean do_branch);
 
 
 #endif /* !LP_BLD_ALPHA_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 7561899a74e..7eb76d4fb31 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -53,15 +53,8 @@
  *  ... ... ... ... ... ... ... ... ...
  *
  *
- * Stencil test:
- * Two-sided stencil test is supported but probably not as efficient as
- * it could be.  Currently, we use if/then/else constructs to do the
- * operations for front vs. back-facing polygons.  We could probably do
- * both the front and back arithmetic then use a Select() instruction to
- * choose the result depending on polyon orientation.  We'd have to
- * measure performance both ways and see which is better.
- *
  * @author Jose Fonseca <[email protected]>
+ * @author Brian Paul <[email protected]>
  */
 
 #include "pipe/p_state.h"
@@ -71,6 +64,7 @@
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_bitarit.h"
 #include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_conv.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_intr.h"
@@ -128,57 +122,32 @@ lp_build_stencil_test_single(struct lp_build_context *bld,
 /**
  * Do the one or two-sided stencil test comparison.
  * \sa lp_build_stencil_test_single
- * \param face  an integer indicating front (+) or back (-) facing polygon.
- *              If NULL, assume front-facing.
+ * \param front_facing  an integer vector mask, indicating front (~0) or back
+ *                      (0) facing polygon. If NULL, assume front-facing.
  */
 static LLVMValueRef
 lp_build_stencil_test(struct lp_build_context *bld,
                       const struct pipe_stencil_state stencil[2],
                       LLVMValueRef stencilRefs[2],
                       LLVMValueRef stencilVals,
-                      LLVMValueRef face)
+                      LLVMValueRef front_facing)
 {
    LLVMValueRef res;
 
    assert(stencil[0].enabled);
 
-   if (stencil[1].enabled && face) {
-      /* do two-sided test */
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-      LLVMValueRef front_facing;
-      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
-      LLVMValueRef result = bld->undef;
-
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
+   /* do front face test */
+   res = lp_build_stencil_test_single(bld, &stencil[0],
+                                      stencilRefs[0], stencilVals);
 
-      lp_build_flow_scope_declare(flow_ctx, &result);
+   if (stencil[1].enabled && front_facing) {
+      /* do back face test */
+      LLVMValueRef back_res;
 
-      /* front_facing = face > 0.0 */
-      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
-
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
-      {
-         result = lp_build_stencil_test_single(bld, &stencil[0],
-                                               stencilRefs[0], stencilVals);
-      }
-      lp_build_else(&if_ctx);
-      {
-         result = lp_build_stencil_test_single(bld, &stencil[1],
-                                               stencilRefs[1], stencilVals);
-      }
-      lp_build_endif(&if_ctx);
+      back_res = lp_build_stencil_test_single(bld, &stencil[1],
+                                              stencilRefs[1], stencilVals);
 
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
-
-      res = result;
-   }
-   else {
-      /* do single-side test */
-      res = lp_build_stencil_test_single(bld, &stencil[0],
-                                         stencilRefs[0], stencilVals);
+      res = lp_build_select(bld, front_facing, res, back_res);
    }
 
    return res;
@@ -195,14 +164,12 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
                            const struct pipe_stencil_state *stencil,
                            enum stencil_op op,
                            LLVMValueRef stencilRef,
-                           LLVMValueRef stencilVals,
-                           LLVMValueRef mask)
+                           LLVMValueRef stencilVals)
 
 {
-   const unsigned stencilMax = 255; /* XXX fix */
    struct lp_type type = bld->type;
    LLVMValueRef res;
-   LLVMValueRef max = lp_build_const_int_vec(type, stencilMax);
+   LLVMValueRef max = lp_build_const_int_vec(type, 0xff);
    unsigned stencil_op;
 
    assert(type.sign);
@@ -255,19 +222,7 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
       break;
    default:
       assert(0 && "bad stencil op mode");
-      res = NULL;
-   }
-
-   if (stencil->writemask != stencilMax) {
-      /* mask &= stencil->writemask */
-      LLVMValueRef writemask = lp_build_const_int_vec(type, stencil->writemask);
-      mask = LLVMBuildAnd(bld->builder, mask, writemask, "");
-      /* res = (res & mask) | (stencilVals & ~mask) */
-      res = lp_build_select_bitwise(bld, writemask, res, stencilVals);
-   }
-   else {
-      /* res = mask ? res : stencilVals */
-      res = lp_build_select(bld, mask, res, stencilVals);
+      res = bld->undef;
    }
 
    return res;
@@ -284,49 +239,40 @@ lp_build_stencil_op(struct lp_build_context *bld,
                     LLVMValueRef stencilRefs[2],
                     LLVMValueRef stencilVals,
                     LLVMValueRef mask,
-                    LLVMValueRef face)
+                    LLVMValueRef front_facing)
 
 {
-   assert(stencil[0].enabled);
-
-   if (stencil[1].enabled && face) {
-      /* do two-sided op */
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-      LLVMValueRef front_facing;
-      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
-      LLVMValueRef result = bld->undef;
+   LLVMValueRef res;
 
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
+   assert(stencil[0].enabled);
 
-      lp_build_flow_scope_declare(flow_ctx, &result);
+   /* do front face op */
+   res = lp_build_stencil_op_single(bld, &stencil[0], op,
+                                     stencilRefs[0], stencilVals);
 
-      /* front_facing = face > 0.0 */
-      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
+   if (stencil[1].enabled && front_facing) {
+      /* do back face op */
+      LLVMValueRef back_res;
 
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
-      {
-         result = lp_build_stencil_op_single(bld, &stencil[0], op,
-                                             stencilRefs[0], stencilVals, mask);
-      }
-      lp_build_else(&if_ctx);
-      {
-         result = lp_build_stencil_op_single(bld, &stencil[1], op,
-                                             stencilRefs[1], stencilVals, mask);
-      }
-      lp_build_endif(&if_ctx);
+      back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
+                                            stencilRefs[1], stencilVals);
 
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
+      res = lp_build_select(bld, front_facing, res, back_res);
+   }
 
-      return result;
+   if (stencil->writemask != 0xff) {
+      /* mask &= stencil->writemask */
+      LLVMValueRef writemask = lp_build_const_int_vec(bld->type, stencil->writemask);
+      mask = LLVMBuildAnd(bld->builder, mask, writemask, "");
+      /* res = (res & mask) | (stencilVals & ~mask) */
+      res = lp_build_select_bitwise(bld, writemask, res, stencilVals);
    }
    else {
-      /* do single-sided op */
-      return lp_build_stencil_op_single(bld, &stencil[0], op,
-                                        stencilRefs[0], stencilVals, mask);
+      /* res = mask ? res : stencilVals */
+      res = lp_build_select(bld, mask, res, stencilVals);
    }
+
+   return res;
 }
 
 
@@ -358,8 +304,13 @@ lp_depth_type(const struct util_format_description *format_desc,
    }
    else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
       assert(format_desc->block.bits <= 32);
-      if(format_desc->channel[swizzle].normalized)
-         type.norm = TRUE;
+      assert(format_desc->channel[swizzle].normalized);
+      if (format_desc->channel[swizzle].size < format_desc->block.bits) {
+         /* Prefer signed integers when possible, as SSE has less support
+          * for unsigned comparison;
+          */
+         type.sign = TRUE;
+      }
    }
    else
       assert(0);
@@ -381,7 +332,7 @@ lp_depth_type(const struct util_format_description *format_desc,
  */
 static boolean
 get_z_shift_and_mask(const struct util_format_description *format_desc,
-                     unsigned *shift, unsigned *mask)
+                     unsigned *shift, unsigned *width, unsigned *mask)
 {
    const unsigned total_bits = format_desc->block.bits;
    unsigned z_swizzle;
@@ -397,12 +348,14 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
    if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
       return FALSE;
 
+   *width = format_desc->channel[z_swizzle].size;
+
    padding_right = 0;
    for (chan = 0; chan < z_swizzle; ++chan)
       padding_right += format_desc->channel[chan].size;
 
    padding_left =
-      total_bits - (padding_right + format_desc->channel[z_swizzle].size);
+      total_bits - (padding_right + *width);
 
    if (padding_left || padding_right) {
       unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1;
@@ -413,7 +366,7 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
       *mask = 0xffffffff;
    }
 
-   *shift = padding_left;
+   *shift = padding_right;
 
    return TRUE;
 }
@@ -457,7 +410,7 @@ get_s_shift_and_mask(const struct util_format_description *format_desc,
  * \param maskvalue is the depth test mask.
  * \param counter is a pointer of the uint32 counter.
  */
-static void
+void
 lp_build_occlusion_count(LLVMBuilderRef builder,
                          struct lp_type type,
                          LLVMValueRef maskvalue,
@@ -494,31 +447,57 @@ lp_build_occlusion_count(LLVMBuilderRef builder,
  * \param format_desc  description of the depth/stencil surface
  * \param mask  the alive/dead pixel mask for the quad (vector)
  * \param stencil_refs  the front/back stencil ref values (scalar)
- * \param z_src  the incoming depth/stencil values (a 2x2 quad)
+ * \param z_src  the incoming depth/stencil values (a 2x2 quad, float32)
  * \param zs_dst_ptr  pointer to depth/stencil values in framebuffer
- * \param facing  contains float value indicating front/back facing polygon
+ * \param facing  contains boolean value indicating front/back facing polygon
  */
 void
 lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             const struct pipe_depth_state *depth,
                             const struct pipe_stencil_state stencil[2],
-                            struct lp_type type,
+                            struct lp_type z_src_type,
                             const struct util_format_description *format_desc,
                             struct lp_build_mask_context *mask,
                             LLVMValueRef stencil_refs[2],
                             LLVMValueRef z_src,
                             LLVMValueRef zs_dst_ptr,
                             LLVMValueRef face,
-                            LLVMValueRef counter)
+                            LLVMValueRef *zs_value,
+                            boolean do_branch)
 {
-   struct lp_build_context bld;
-   struct lp_build_context sbld;
+   struct lp_type z_type;
+   struct lp_build_context z_bld;
+   struct lp_build_context s_bld;
    struct lp_type s_type;
+   unsigned z_shift = 0, z_width = 0, z_mask = 0;
    LLVMValueRef zs_dst, z_dst = NULL;
    LLVMValueRef stencil_vals = NULL;
    LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
    LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
-   LLVMValueRef orig_mask = mask->value;
+   LLVMValueRef orig_mask = lp_build_mask_value(mask);
+   LLVMValueRef front_facing = NULL;
+
+
+   /*
+    * Depths are expected to be between 0 and 1, even if they are stored in
+    * floats. Setting these bits here will ensure that the lp_build_conv() call
+    * below won't try to unnecessarily clamp the incoming values.
+    */
+   if(z_src_type.floating) {
+      z_src_type.sign = FALSE;
+      z_src_type.norm = TRUE;
+   }
+   else {
+      assert(!z_src_type.sign);
+      assert(z_src_type.norm);
+   }
+
+   /* Pick the depth type. */
+   z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+
+   /* FIXME: Cope with a depth test type with a different bit width. */
+   assert(z_type.width == z_src_type.width);
+   assert(z_type.length == z_src_type.length);
 
    /* Sanity checking */
    {
@@ -540,8 +519,8 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       }
 
       assert(z_swizzle < 4);
-      assert(format_desc->block.bits == type.width);
-      if (type.floating) {
+      assert(format_desc->block.bits == z_type.width);
+      if (z_type.floating) {
          assert(z_swizzle == 0);
          assert(format_desc->channel[z_swizzle].type ==
                 UTIL_FORMAT_TYPE_FLOAT);
@@ -552,54 +531,56 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          assert(format_desc->channel[z_swizzle].type ==
                 UTIL_FORMAT_TYPE_UNSIGNED);
          assert(format_desc->channel[z_swizzle].normalized);
-         assert(!type.fixed);
-         assert(!type.sign);
-         assert(type.norm);
+         assert(!z_type.fixed);
       }
    }
 
 
    /* Setup build context for Z vals */
-   lp_build_context_init(&bld, builder, type);
+   lp_build_context_init(&z_bld, builder, z_type);
 
    /* Setup build context for stencil vals */
-   s_type = lp_type_int_vec(type.width);
-   lp_build_context_init(&sbld, builder, s_type);
+   s_type = lp_type_int_vec(z_type.width);
+   lp_build_context_init(&s_bld, builder, s_type);
 
    /* Load current z/stencil value from z/stencil buffer */
+   zs_dst_ptr = LLVMBuildBitCast(builder,
+                                 zs_dst_ptr,
+                                 LLVMPointerType(z_bld.vec_type, 0), "");
    zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
 
-   lp_build_name(zs_dst, "zsbufval");
+   lp_build_name(zs_dst, "zs_dst");
 
 
    /* Compute and apply the Z/stencil bitmasks and shifts.
     */
    {
-      unsigned z_shift, z_mask;
       unsigned s_shift, s_mask;
 
-      if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
-         if (z_shift) {
-            LLVMValueRef shift = lp_build_const_int_vec(type, z_shift);
-            z_src = LLVMBuildLShr(builder, z_src, shift, "");
-         }
-
+      if (get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask)) {
          if (z_mask != 0xffffffff) {
-            LLVMValueRef mask = lp_build_const_int_vec(type, z_mask);
-            z_src = LLVMBuildAnd(builder, z_src, mask, "");
-            z_dst = LLVMBuildAnd(builder, zs_dst, mask, "");
-            z_bitmask = mask;  /* used below */
+            z_bitmask = lp_build_const_int_vec(z_type, z_mask);
          }
-         else {
+
+         /*
+          * Align the framebuffer Z 's LSB to the right.
+          */
+         if (z_shift) {
+            LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
+            z_dst = LLVMBuildLShr(builder, zs_dst, shift, "z_dst");
+         } else if (z_bitmask) {
+	    /* TODO: Instead of loading a mask from memory and ANDing, it's
+	     * probably faster to just shake the bits with two shifts. */
+            z_dst = LLVMBuildAnd(builder, zs_dst, z_bitmask, "z_dst");
+         } else {
             z_dst = zs_dst;
+            lp_build_name(z_dst, "z_dst");
          }
-
-         lp_build_name(z_dst, "zsbuf.z");
       }
 
       if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
          if (s_shift) {
-            LLVMValueRef shift = lp_build_const_int_vec(type, s_shift);
+            LLVMValueRef shift = lp_build_const_int_vec(s_type, s_shift);
             stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
             stencil_shift = shift;  /* used below */
          }
@@ -608,35 +589,85 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          }
 
          if (s_mask != 0xffffffff) {
-            LLVMValueRef mask = lp_build_const_int_vec(type, s_mask);
+            LLVMValueRef mask = lp_build_const_int_vec(s_type, s_mask);
             stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
          }
 
-         lp_build_name(stencil_vals, "stencil");
+         lp_build_name(stencil_vals, "s_dst");
       }
    }
 
-
    if (stencil[0].enabled) {
+
+      if (face) {
+         LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
+
+         /* front_facing = face != 0 ? ~0 : 0 */
+         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
+         front_facing = LLVMBuildSExt(builder, front_facing,
+                                      LLVMIntType(s_bld.type.length*s_bld.type.width),
+                                      "");
+         front_facing = LLVMBuildBitCast(builder, front_facing,
+                                         s_bld.int_vec_type, "");
+      }
+
       /* convert scalar stencil refs into vectors */
-      stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
-      stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);
+      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
+      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);
 
-      s_pass_mask = lp_build_stencil_test(&sbld, stencil,
-                                          stencil_refs, stencil_vals, face);
+      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
+                                          stencil_refs, stencil_vals,
+                                          front_facing);
 
       /* apply stencil-fail operator */
       {
-         LLVMValueRef s_fail_mask = lp_build_andnot(&bld, orig_mask, s_pass_mask);
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
+         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                             stencil_refs, stencil_vals,
-                                            s_fail_mask, face);
+                                            s_fail_mask, front_facing);
       }
    }
 
    if (depth->enabled) {
+      /*
+       * Convert fragment Z to the desired type, aligning the LSB to the right.
+       */
+
+      assert(z_type.width == z_src_type.width);
+      assert(z_type.length == z_src_type.length);
+      assert(lp_check_value(z_src_type, z_src));
+      if (z_src_type.floating) {
+         /*
+          * Convert from floating point values
+          */
+
+         if (!z_type.floating) {
+            z_src = lp_build_clamped_float_to_unsigned_norm(builder,
+                                                            z_src_type,
+                                                            z_width,
+                                                            z_src);
+         }
+      } else {
+         /*
+          * Convert from unsigned normalized values.
+          */
+
+         assert(!z_src_type.sign);
+         assert(!z_src_type.fixed);
+         assert(z_src_type.norm);
+         assert(!z_type.floating);
+         if (z_src_type.width > z_width) {
+            LLVMValueRef shift = lp_build_const_int_vec(z_src_type,
+                                                        z_src_type.width - z_width);
+            z_src = LLVMBuildLShr(builder, z_src, shift, "");
+         }
+      }
+      assert(lp_check_value(z_type, z_src));
+
+      lp_build_name(z_src, "z_src");
+
       /* compare src Z to dst Z, returning 'pass' mask */
-      z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst);
+      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
 
       if (!stencil[0].enabled) {
          /* We can potentially skip all remaining operations here, but only
@@ -644,28 +675,28 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
           * buffer values.  Don't need to update Z buffer values.
           */
          lp_build_mask_update(mask, z_pass);
+
+         if (do_branch) {
+            lp_build_mask_check(mask);
+            do_branch = FALSE;
+         }
       }
 
       if (depth->writemask) {
-         LLVMValueRef zselectmask = mask->value;
+         LLVMValueRef zselectmask;
 
          /* mask off bits that failed Z test */
-         zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, "");
+         zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
 
          /* mask off bits that failed stencil test */
          if (s_pass_mask) {
             zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
          }
 
-         /* if combined Z/stencil format, mask off the stencil bits */
-         if (z_bitmask) {
-            zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, "");
-         }
-
          /* Mix the old and new Z buffer values.
-          * z_dst[i] = (zselectmask[i] & z_src[i]) | (~zselectmask[i] & z_dst[i])
+          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
           */
-         z_dst = lp_build_select_bitwise(&bld, zselectmask, z_src, z_dst);
+         z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
       }
 
       if (stencil[0].enabled) {
@@ -673,33 +704,35 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          LLVMValueRef z_fail_mask, z_pass_mask;
 
          /* apply Z-fail operator */
-         z_fail_mask = lp_build_andnot(&bld, orig_mask, z_pass);
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
+         z_fail_mask = lp_build_andnot(&z_bld, orig_mask, z_pass);
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                             stencil_refs, stencil_vals,
-                                            z_fail_mask, face);
+                                            z_fail_mask, front_facing);
 
          /* apply Z-pass operator */
-         z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, "");
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+         z_pass_mask = LLVMBuildAnd(z_bld.builder, orig_mask, z_pass, "");
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                             stencil_refs, stencil_vals,
-                                            z_pass_mask, face);
+                                            z_pass_mask, front_facing);
       }
    }
    else {
       /* No depth test: apply Z-pass operator to stencil buffer values which
        * passed the stencil test.
        */
-      s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
-      stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+      s_pass_mask = LLVMBuildAnd(s_bld.builder, orig_mask, s_pass_mask, "");
+      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                          stencil_refs, stencil_vals,
-                                         s_pass_mask, face);
+                                         s_pass_mask, front_facing);
    }
 
-   /* The Z bits are already in the right place but we may need to shift the
-    * stencil bits before ORing Z with Stencil to make the final pixel value.
-    */
+   /* Put Z and ztencil bits in the right place */
+   if (z_dst && z_shift) {
+      LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
+      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
+   }
    if (stencil_vals && stencil_shift)
-      stencil_vals = LLVMBuildShl(bld.builder, stencil_vals,
+      stencil_vals = LLVMBuildShl(s_bld.builder, stencil_vals,
                                   stencil_shift, "");
 
    /* Finally, merge/store the z/stencil values */
@@ -707,13 +740,13 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
        (stencil[0].enabled && stencil[0].writemask)) {
 
       if (z_dst && stencil_vals)
-         zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, "");
+         zs_dst = LLVMBuildOr(z_bld.builder, z_dst, stencil_vals, "");
       else if (z_dst)
          zs_dst = z_dst;
       else
          zs_dst = stencil_vals;
 
-      LLVMBuildStore(builder, zs_dst, zs_dst_ptr);
+      *zs_value = zs_dst;
    }
 
    if (s_pass_mask)
@@ -722,6 +755,47 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    if (depth->enabled && stencil[0].enabled)
       lp_build_mask_update(mask, z_pass);
 
-   if (counter)
-      lp_build_occlusion_count(builder, type, mask->value, counter);
+   if (do_branch)
+      lp_build_mask_check(mask);
+
+}
+
+
+void
+lp_build_depth_write(LLVMBuilderRef builder,
+                     const struct util_format_description *format_desc,
+                     LLVMValueRef zs_dst_ptr,
+                     LLVMValueRef zs_value)
+{
+   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr,
+                                 LLVMPointerType(LLVMTypeOf(zs_value), 0), "");
+
+   LLVMBuildStore(builder, zs_value, zs_dst_ptr);
+}
+
+
+void
+lp_build_deferred_depth_write(LLVMBuilderRef builder,
+                              struct lp_type z_src_type,
+                              const struct util_format_description *format_desc,
+                              struct lp_build_mask_context *mask,
+                              LLVMValueRef zs_dst_ptr,
+                              LLVMValueRef zs_value)
+{
+   struct lp_type z_type;
+   struct lp_build_context z_bld;
+   LLVMValueRef z_dst;
+
+   /* XXX: pointlessly redo type logic:
+    */
+   z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+   lp_build_context_init(&z_bld, builder, z_type);
+
+   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr,
+                                 LLVMPointerType(z_bld.vec_type, 0), "");
+
+   z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
+   z_dst = lp_build_select(&z_bld, lp_build_mask_value(mask), zs_value, z_dst);
+
+   LLVMBuildStore(builder, z_dst, zs_dst_ptr);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index e257a5bd7d0..a54ef3a711e 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -61,7 +61,27 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             LLVMValueRef zs_src,
                             LLVMValueRef zs_dst_ptr,
                             LLVMValueRef facing,
-                            LLVMValueRef counter);
+                            LLVMValueRef *zs_value,
+                            boolean do_branch);
 
+void
+lp_build_depth_write(LLVMBuilderRef builder,
+                     const struct util_format_description *format_desc,
+                     LLVMValueRef zs_dst_ptr,
+                     LLVMValueRef zs_value);
+
+void
+lp_build_deferred_depth_write(LLVMBuilderRef builder,
+                              struct lp_type z_src_type,
+                              const struct util_format_description *format_desc,
+                              struct lp_build_mask_context *mask,
+                              LLVMValueRef zs_dst_ptr,
+                              LLVMValueRef zs_value);
+
+void
+lp_build_occlusion_count(LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef maskvalue,
+                         LLVMValueRef counter);
 
 #endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 2a374f8c390..c9da8900d0c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -206,7 +206,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
             dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
 
             /*
-             * a = a0 + x * dadx + y * dady
+             * a = a0 + (x * dadx + y * dady)
              */
 
             if (attrib == 0 && chan == 0) {
@@ -219,11 +219,11 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
                a = a0;
                if (interp != LP_INTERP_CONSTANT &&
                    interp != LP_INTERP_FACING) {
-                  LLVMValueRef tmp;
-                  tmp = LLVMBuildFMul(builder, bld->x, dadx, "");
-                  a = LLVMBuildFAdd(builder, a, tmp, "");
-                  tmp = LLVMBuildFMul(builder, bld->y, dady, "");
-                  a = LLVMBuildFAdd(builder, a, tmp, "");
+                  LLVMValueRef ax, ay, axy;
+                  ax = LLVMBuildFMul(builder, bld->x, dadx, "");
+                  ay = LLVMBuildFMul(builder, bld->y, dady, "");
+                  axy = LLVMBuildFAdd(builder, ax, ay, "");
+                  a = LLVMBuildFAdd(builder, a, axy, "");
                }
             }
 
@@ -272,7 +272,10 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
  * This is called when we move from one quad to the next.
  */
 static void
-attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
+attribs_update(struct lp_build_interp_soa_context *bld,
+               int quad_index,
+               int start,
+               int end)
 {
    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    LLVMValueRef shuffle = lp_build_const_int_vec(coeff_bld->type, quad_index);
@@ -282,7 +285,7 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
 
    assert(quad_index < 4);
 
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+   for(attrib = start; attrib < end; ++attrib) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
@@ -350,6 +353,14 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
                }
 #endif
 
+               if (attrib == 0 && chan == 2) {
+                  /* FIXME: Depth values can exceed 1.0, due to the fact that
+                   * setup interpolation coefficients refer to (0,0) which causes
+                   * precision loss. So we must clamp to 1.0 here to avoid artifacts
+                   */
+                  a = lp_build_min(coeff_bld, a, coeff_bld->one);
+               }
+
                attrib_name(a, attrib, chan, "");
             }
             bld->attribs[attrib][chan] = a;
@@ -434,8 +445,6 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
    pos_init(bld, x0, y0);
 
    coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
-
-   attribs_update(bld, 0);
 }
 
 
@@ -443,10 +452,20 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
  * Advance the position and inputs to the given quad within the block.
  */
 void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
-                           int quad_index)
+lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
+                                  int quad_index)
+{
+   assert(quad_index < 4);
+
+   attribs_update(bld, quad_index, 1, bld->num_attribs);
+}
+
+void
+lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
+                                  int quad_index)
 {
    assert(quad_index < 4);
 
-   attribs_update(bld, quad_index);
+   attribs_update(bld, quad_index, 0, 1);
 }
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 3054030f739..a7ebdd1bfa2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -46,7 +46,31 @@
 
 #include "tgsi/tgsi_exec.h"
 
-#include "lp_setup.h"
+/**
+ * Describes how to compute the interpolation coefficients (a0, dadx, dady)
+ * from the vertices passed into our triangle/line/point functions by the
+ * draw module.
+ *
+ * Vertices are treated as an array of float[4] values, indexed by
+ * src_index.
+ *
+ * LP_INTERP_COLOR is translated to either LP_INTERP_CONSTANT or
+ * LINEAR depending on flatshade state.
+ */
+enum lp_interp {
+   LP_INTERP_CONSTANT,
+   LP_INTERP_COLOR,
+   LP_INTERP_LINEAR,
+   LP_INTERP_PERSPECTIVE,
+   LP_INTERP_POSITION,
+   LP_INTERP_FACING
+};
+
+struct lp_shader_input {
+   ushort interp:4;       /* enum lp_interp */
+   ushort usage_mask:4;   /* bitmask of TGSI_WRITEMASK_x flags */
+   ushort src_index:8;    /* where to find values in incoming vertices */
+};
 
 
 struct lp_build_interp_soa_context
@@ -89,7 +113,11 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          LLVMValueRef y);
 
 void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
+lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
+                           int quad_index);
+
+void
+lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
                            int quad_index);
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 39f2c6085ef..763432ed712 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -82,6 +82,8 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
       }
    }
 
+   lp_delete_setup_variants(llvmpipe);
+
    align_free( llvmpipe );
 }
 
@@ -108,6 +110,7 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
    memset(llvmpipe, 0, sizeof *llvmpipe);
 
    make_empty_list(&llvmpipe->fs_variants_list);
+   make_empty_list(&llvmpipe->setup_variants_list);
 
    llvmpipe->pipe.winsys = screen->winsys;
    llvmpipe->pipe.screen = screen;
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 34fa20e204a..db09c95b272 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -39,6 +39,7 @@
 #include "lp_jit.h"
 #include "lp_setup.h"
 #include "lp_state_fs.h"
+#include "lp_state_setup.h"
 
 
 struct llvmpipe_vbuf_render;
@@ -48,6 +49,7 @@ struct lp_fragment_shader;
 struct lp_vertex_shader;
 struct lp_blend_state;
 struct lp_setup_context;
+struct lp_setup_variant;
 struct lp_velems_state;
 
 struct llvmpipe_context {
@@ -105,12 +107,9 @@ struct llvmpipe_context {
    /** Which vertex shader output slot contains point size */
    int psize_slot;
 
-   /** Fragment shader input interpolation info */
-   unsigned num_inputs;
-   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
-
    /** The tiling engine */
    struct lp_setup_context *setup;
+   struct lp_setup_variant setup_variant;
 
    /** The primitive drawing context */
    struct draw_context *draw;
@@ -120,6 +119,9 @@ struct llvmpipe_context {
 
    struct lp_fs_variant_list_item fs_variants_list;
    unsigned nr_fs_variants;
+
+   struct lp_setup_variant_list_item setup_variants_list;
+   unsigned nr_setup_variants;
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h
index bb538b2bd83..3626ce4a86c 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.h
+++ b/src/gallium/drivers/llvmpipe/lp_flush.h
@@ -32,6 +32,7 @@
 
 struct pipe_context;
 struct pipe_fence_handle;
+struct pipe_resource;
 
 void
 llvmpipe_flush(struct pipe_context *pipe,
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 04b12dedccf..c540f9b3628 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,7 +36,6 @@
 #include <llvm-c/Transforms/Scalar.h>
 
 #include "util/u_memory.h"
-#include "util/u_cpu_detect.h"
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_debug.h"
 #include "lp_screen.h"
@@ -162,9 +161,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
 void
 lp_jit_screen_cleanup(struct llvmpipe_screen *screen)
 {
-   if(screen->engine)
-      LLVMDisposeExecutionEngine(screen->engine);
-
    if(screen->pass)
       LLVMDisposePassManager(screen->pass);
 }
@@ -190,13 +186,7 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
       LLVMAddCFGSimplificationPass(screen->pass);
       LLVMAddPromoteMemoryToRegisterPass(screen->pass);
       LLVMAddConstantPropagationPass(screen->pass);
-      if(util_cpu_caps.has_sse4_1) {
-         /* FIXME: There is a bug in this pass, whereby the combination of fptosi
-          * and sitofp (necessary for trunc/floor/ceil/round implementation)
-          * somehow becomes invalid code.
-          */
-         LLVMAddInstructionCombiningPass(screen->pass);
-      }
+      LLVMAddInstructionCombiningPass(screen->pass);
       LLVMAddGVNPass(screen->pass);
    } else {
       /* We need at least this pass to prevent the backends to fail in
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 16e04fce0cd..114f21f2d16 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -144,7 +144,7 @@ typedef void
 (*lp_jit_frag_func)(const struct lp_jit_context *context,
                     uint32_t x,
                     uint32_t y,
-                    float facing,
+                    uint32_t facing,
                     const void *a0,
                     const void *dadx,
                     const void *dady,
diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h b/src/gallium/drivers/llvmpipe/lp_limits.h
index d1c431475d8..2538164ffaa 100644
--- a/src/gallium/drivers/llvmpipe/lp_limits.h
+++ b/src/gallium/drivers/llvmpipe/lp_limits.h
@@ -72,4 +72,14 @@
  */
 #define LP_MAX_SHADER_VARIANTS 1024
 
+/**
+ * Max number of setup variants that will be kept around.
+ *
+ * These are determined by the combination of the fragment shader
+ * input signature and a small amount of rasterization state (eg
+ * flatshading).  It is likely that many active fragment shaders will
+ * share the same setup variant.
+ */
+#define LP_MAX_SETUP_VARIANTS 64
+
 #endif /* LP_LIMITS_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index d7e6415e139..d358a983943 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -211,8 +211,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
 {
    const struct lp_scene *scene = task->scene;
-   unsigned clear_value = arg.clear_zstencil.value;
-   unsigned clear_mask = arg.clear_zstencil.mask;
+   uint32_t clear_value = arg.clear_zstencil.value;
+   uint32_t clear_mask = arg.clear_zstencil.mask;
    const unsigned height = TILE_SIZE / TILE_VECTOR_HEIGHT;
    const unsigned width = TILE_SIZE * TILE_VECTOR_HEIGHT;
    const unsigned block_size = scene->zsbuf.blocksize;
@@ -220,7 +220,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
    uint8_t *dst;
    unsigned i, j;
 
-   LP_DBG(DEBUG_RAST, "%s 0x%x%x\n", __FUNCTION__, clear_value, clear_mask);
+   LP_DBG(DEBUG_RAST, "%s: value=0x%08x, mask=0x%08x\n",
+           __FUNCTION__, clear_value, clear_mask);
 
    /*
     * Clear the aera of the swizzled depth/depth buffer matching this tile, in
@@ -232,16 +233,31 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
 
    dst = task->depth_tile;
 
+   clear_value &= clear_mask;
+
    switch (block_size) {
    case 1:
+      assert(clear_mask == 0xff);
       memset(dst, (uint8_t) clear_value, height * width);
       break;
    case 2:
-      for (i = 0; i < height; i++) {
-         uint16_t *row = (uint16_t *)dst;
-         for (j = 0; j < width; j++)
-            *row++ = (uint16_t) clear_value;
-         dst += dst_stride;
+      if (clear_mask == 0xffff) {
+         for (i = 0; i < height; i++) {
+            uint16_t *row = (uint16_t *)dst;
+            for (j = 0; j < width; j++)
+               *row++ = (uint16_t) clear_value;
+            dst += dst_stride;
+         }
+      }
+      else {
+         for (i = 0; i < height; i++) {
+            uint16_t *row = (uint16_t *)dst;
+            for (j = 0; j < width; j++) {
+               uint16_t tmp = ~clear_mask & *row;
+               *row++ = clear_value | tmp;
+            }
+            dst += dst_stride;
+         }
       }
       break;
    case 4:
@@ -258,7 +274,7 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
             uint32_t *row = (uint32_t *)dst;
             for (j = 0; j < width; j++) {
                uint32_t tmp = ~clear_mask & *row;
-               *row++ = (clear_value & clear_mask) | tmp;
+               *row++ = clear_value | tmp;
             }
             dst += dst_stride;
          }
@@ -318,7 +334,7 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
 {
    const struct lp_scene *scene = task->scene;
    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
-   const struct lp_rast_state *state = inputs->state;
+   const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    const unsigned tile_x = task->x, tile_y = task->y;
    unsigned x, y;
@@ -349,10 +365,10 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
          BEGIN_JIT_CALL(state);
          variant->jit_function[RAST_WHOLE]( &state->jit_context,
                                             tile_x + x, tile_y + y,
-                                            inputs->facing,
-                                            inputs->a0,
-                                            inputs->dadx,
-                                            inputs->dady,
+                                            inputs->frontfacing,
+                                            GET_A0(inputs),
+                                            GET_DADX(inputs),
+                                            GET_DADY(inputs),
                                             color,
                                             depth,
                                             0xffff,
@@ -398,7 +414,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
                          unsigned x, unsigned y,
                          unsigned mask)
 {
-   const struct lp_rast_state *state = inputs->state;
+   const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    const struct lp_scene *scene = task->scene;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
@@ -430,10 +446,10 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
    BEGIN_JIT_CALL(state);
    variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
                                          x, y,
-                                         inputs->facing,
-                                         inputs->a0,
-                                         inputs->dadx,
-                                         inputs->dady,
+                                         inputs->frontfacing,
+                                         GET_A0(inputs),
+                                         GET_DADX(inputs),
+                                         GET_DADY(inputs),
                                          color,
                                          depth,
                                          mask,
@@ -474,6 +490,14 @@ lp_rast_end_query(struct lp_rasterizer_task *task,
 }
 
 
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+                  const union lp_rast_cmd_arg arg)
+{
+   task->state = arg.state;
+}
+
+
 
 /**
  * Set top row and left column of the tile's pixels to white.  For debugging.
@@ -581,10 +605,12 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] =
    lp_rast_triangle_8,
    lp_rast_triangle_3_4,
    lp_rast_triangle_3_16,
+   lp_rast_triangle_4_16,
    lp_rast_shade_tile,
    lp_rast_shade_tile_opaque,
    lp_rast_begin_query,
    lp_rast_end_query,
+   lp_rast_set_state,
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c55b97a9d13..a64c152cf83 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -78,30 +78,28 @@ struct lp_rast_state {
  * These pointers point into the bin data buffer.
  */
 struct lp_rast_shader_inputs {
-   float facing;     /** Positive for front-facing, negative for back-facing */
-   unsigned disable:1;  /** Partially binned, disable this command */
-   unsigned opaque:1;   /** Is opaque */
-
-   float (*a0)[4];
-   float (*dadx)[4];
-   float (*dady)[4];
-
-   const struct lp_rast_state *state;
+   unsigned frontfacing:1;      /** True for front-facing */
+   unsigned disable:1;          /** Partially binned, disable this command */
+   unsigned opaque:1;           /** Is opaque */
+   unsigned pad0:29;            /* wasted space */
+   unsigned stride;             /* how much to advance data between a0, dadx, dady */
+   unsigned pad2;               /* wasted space */
+   unsigned pad3;               /* wasted space */
+   /* followed by a0, dadx, dady and planes[] */
 };
 
-
+/* Note: the order of these values is important as they are loaded by
+ * sse code in rasterization:
+ */
 struct lp_rast_plane {
-   /* one-pixel sized trivial accept offsets for each plane */
-   int ei;
-
-   /* one-pixel sized trivial reject offsets for each plane */
-   int eo;
-
    /* edge function values at minx,miny ?? */
    int c;
 
    int dcdx;
    int dcdy;
+
+   /* one-pixel sized trivial reject offsets for each plane */
+   int eo;
 };
 
 /**
@@ -111,17 +109,24 @@ struct lp_rast_plane {
  * Objects of this type are put into the lp_setup_context::data buffer.
  */
 struct lp_rast_triangle {
-   /* inputs for the shader */
-   struct lp_rast_shader_inputs inputs;
-
 #ifdef DEBUG
    float v[3][2];
+   float pad0;
+   float pad1;
 #endif
 
-   struct lp_rast_plane plane[8]; /* NOTE: may allocate fewer planes */
+   /* inputs for the shader */
+   struct lp_rast_shader_inputs inputs;
+   /* planes are also allocated here */
 };
 
 
+#define GET_A0(inputs) ((float (*)[4])((inputs)+1))
+#define GET_DADX(inputs) ((float (*)[4])((char *)((inputs) + 1) + (inputs)->stride))
+#define GET_DADY(inputs) ((float (*)[4])((char *)((inputs) + 1) + 2 * (inputs)->stride))
+#define GET_PLANES(tri) ((struct lp_rast_plane *)((char *)(&(tri)->inputs + 1) + 3 * (tri)->inputs.stride))
+
+
 
 struct lp_rasterizer *
 lp_rast_create( unsigned num_threads );
@@ -149,9 +154,10 @@ union lp_rast_cmd_arg {
    const struct lp_rast_state *set_state;
    uint8_t clear_color[4];
    struct {
-      unsigned value;
-      unsigned mask;
+      uint32_t value;
+      uint32_t mask;
    } clear_zstencil;
+   const struct lp_rast_state *state;
    struct lp_fence *fence;
    struct llvmpipe_query *query_obj;
 };
@@ -238,12 +244,14 @@ lp_rast_arg_null( void )
 #define LP_RAST_OP_TRIANGLE_8        0x9
 #define LP_RAST_OP_TRIANGLE_3_4      0xa
 #define LP_RAST_OP_TRIANGLE_3_16     0xb
-#define LP_RAST_OP_SHADE_TILE        0xc
-#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xd
-#define LP_RAST_OP_BEGIN_QUERY       0xe
-#define LP_RAST_OP_END_QUERY         0xf
-
-#define LP_RAST_OP_MAX               0x10
+#define LP_RAST_OP_TRIANGLE_4_16     0xc
+#define LP_RAST_OP_SHADE_TILE        0xd
+#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xe
+#define LP_RAST_OP_BEGIN_QUERY       0xf
+#define LP_RAST_OP_END_QUERY         0x10
+#define LP_RAST_OP_SET_STATE         0x11
+
+#define LP_RAST_OP_MAX               0x12
 #define LP_RAST_OP_MASK              0xff
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
index 9fc78645a3a..64ac616f629 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
@@ -12,6 +12,7 @@ static INLINE int u_bit_scan(unsigned *mask)
 struct tile {
    int coverage;
    int overdraw;
+   const struct lp_rast_state *state;
    char data[TILE_SIZE][TILE_SIZE];
 };
 
@@ -42,10 +43,12 @@ static const char *cmd_names[LP_RAST_OP_MAX] =
    "triangle_8",
    "triangle_3_4",
    "triangle_3_16",
+   "triangle_4_16",
    "shade_tile",
    "shade_tile_opaque",
    "begin_query",
    "end_query",
+   "set_state",
 };
 
 static const char *cmd_name(unsigned cmd)
@@ -55,31 +58,31 @@ static const char *cmd_name(unsigned cmd)
 }
 
 static const struct lp_fragment_shader_variant *
-get_variant(  const struct cmd_block *block,
-              int k )
+get_variant( const struct lp_rast_state *state,
+             const struct cmd_block *block,
+             int k )
 {
    if (block->cmd[k] == LP_RAST_OP_SHADE_TILE ||
-       block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE)
-      return  block->arg[k].shade_tile->state->variant;
-
-   if (block->cmd[k] == LP_RAST_OP_TRIANGLE_1 ||
+       block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE ||
+       block->cmd[k] == LP_RAST_OP_TRIANGLE_1 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_2 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_3 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_4 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_5 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_6 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_7)
-      return block->arg[k].triangle.tri->inputs.state->variant;
+      return state->variant;
 
    return NULL;
 }
 
 
 static boolean
-is_blend( const struct cmd_block *block,
+is_blend( const struct lp_rast_state *state,
+          const struct cmd_block *block,
           int k )
 {
-   const struct lp_fragment_shader_variant *variant = get_variant(block, k);
+   const struct lp_fragment_shader_variant *variant = get_variant(state, block, k);
 
    if (variant)
       return  variant->key.blend.rt[0].blend_enable;
@@ -92,6 +95,7 @@ is_blend( const struct cmd_block *block,
 static void
 debug_bin( const struct cmd_bin *bin )
 {
+   const struct lp_rast_state *state = NULL;
    const struct cmd_block *head = bin->head;
    int i, j = 0;
 
@@ -99,9 +103,12 @@ debug_bin( const struct cmd_bin *bin )
                 
    while (head) {
       for (i = 0; i < head->count; i++, j++) {
+         if (head->cmd[i] == LP_RAST_OP_SET_STATE)
+            state = head->arg[i].state;
+
          debug_printf("%d: %s %s\n", j,
                       cmd_name(head->cmd[i]),
-                      is_blend(head, i) ? "blended" : "");
+                      is_blend(state, head, i) ? "blended" : "");
       }
       head = head->next;
    }
@@ -133,7 +140,7 @@ debug_shade_tile(int x, int y,
                  char val)
 {
    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
-   boolean blend = inputs->state->variant->key.blend.rt[0].blend_enable;
+   boolean blend = tile->state->variant->key.blend.rt[0].blend_enable;
    unsigned i,j;
 
    if (inputs->disable)
@@ -171,11 +178,12 @@ debug_triangle(int tilex, int tiley,
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    unsigned plane_mask = arg.triangle.plane_mask;
+   const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
    struct lp_rast_plane plane[8];
    int x, y;
    int count = 0;
    unsigned i, nr_planes = 0;
-   boolean blend = tri->inputs.state->variant->key.blend.rt[0].blend_enable;
+   boolean blend = tile->state->variant->key.blend.rt[0].blend_enable;
 
    if (tri->inputs.disable) {
       /* This triangle was partially binned and has been disabled */
@@ -183,7 +191,7 @@ debug_triangle(int tilex, int tiley,
    }
 
    while (plane_mask) {
-      plane[nr_planes] = tri->plane[u_bit_scan(&plane_mask)];
+      plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)];
       plane[nr_planes].c = (plane[nr_planes].c +
                             plane[nr_planes].dcdy * tiley -
                             plane[nr_planes].dcdx * tilex);
@@ -232,15 +240,19 @@ do_debug_bin( struct tile *tile,
    memset(tile->data, ' ', sizeof tile->data);
    tile->coverage = 0;
    tile->overdraw = 0;
+   tile->state = NULL;
 
    for (block = bin->head; block; block = block->next) {
       for (k = 0; k < block->count; k++, j++) {
-         boolean blend = is_blend(block, k);
+         boolean blend = is_blend(tile->state, block, k);
          char val = get_label(j);
          int count = 0;
             
          if (print_cmds)
             debug_printf("%c: %15s", val, cmd_name(block->cmd[k]));
+
+         if (block->cmd[k] == LP_RAST_OP_SET_STATE)
+            tile->state = block->arg[k].state;
          
          if (block->cmd[k] == LP_RAST_OP_CLEAR_COLOR ||
              block->cmd[k] == LP_RAST_OP_CLEAR_ZSTENCIL)
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 7370119e966..b30408f097b 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -77,6 +77,7 @@ struct cmd_bin;
 struct lp_rasterizer_task
 {
    const struct cmd_bin *bin;
+   const struct lp_rast_state *state;
 
    struct lp_scene *scene;
    unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
@@ -244,7 +245,7 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
                          unsigned x, unsigned y )
 {
    const struct lp_scene *scene = task->scene;
-   const struct lp_rast_state *state = inputs->state;
+   const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
    void *depth;
@@ -260,10 +261,10 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
    BEGIN_JIT_CALL(state);
    variant->jit_function[RAST_WHOLE]( &state->jit_context,
                                       x, y,
-                                      inputs->facing,
-                                      inputs->a0,
-                                      inputs->dadx,
-                                      inputs->dady,
+                                      inputs->frontfacing,
+                                      GET_A0(inputs),
+                                      GET_DADX(inputs),
+                                      GET_DADY(inputs),
                                       color,
                                       depth,
                                       0xffff,
@@ -293,6 +294,14 @@ void lp_rast_triangle_3_4(struct lp_rasterizer_task *,
 
 void lp_rast_triangle_3_16( struct lp_rasterizer_task *, 
                             const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_4_16( struct lp_rasterizer_task *, 
+                            const union lp_rast_cmd_arg );
+
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+                  const union lp_rast_cmd_arg arg);
+ 
 void
 lp_debug_bin( const struct cmd_bin *bin );
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index a1f309d4b01..042c315635e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -123,6 +123,16 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
 }
 
 void
+lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<4)-1;
+   lp_rast_triangle_3(task, arg2);
+}
+
+void
 lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
 {
@@ -230,144 +240,207 @@ sign_bits4(const __m128i *cstep, int cdiff)
 }
 
 
-/* Special case for 3 plane triangle which is contained entirely
- * within a 16x16 block.
- */
+#define NR_PLANES 3
+
+
+
+
+
+
+
 void
 lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
-   unsigned mask = arg.triangle.plane_mask;
-   const int x = task->x + (mask & 0xff);
-   const int y = task->y + (mask >> 8);
-   unsigned outmask, inmask, partmask, partial_mask;
-   unsigned j;
-   __m128i cstep4[3][4];
-
-   outmask = 0;                 /* outside one or more trivial reject planes */
-   partmask = 0;                /* outside one or more trivial accept planes */
-
-   for (j = 0; j < 3; j++) {
-      const int dcdx = -plane[j].dcdx * 4;
-      const int dcdy = plane[j].dcdy * 4;
-      __m128i xdcdy = _mm_set1_epi32(dcdy);
-
-      cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3);
-      cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy);
-      cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy);
-      cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy);
-
-      {
-	 const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
-	 const int cox = plane[j].eo * 4;
-	 const int cio = plane[j].ei * 4 - 1;
-
-	 outmask |= sign_bits4(cstep4[j], c + cox);
-	 partmask |= sign_bits4(cstep4[j], c + cio);
-      }
-   }
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+   unsigned i, j;
+
+   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+   unsigned nr = 0;
+
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = _mm_setzero_si128();
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+   __m128i rej4;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+   
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+   
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+                    &c, &dcdx, &dcdy, &rej4);
+
+   /* Adjust dcdx;
+    */
+   dcdx = _mm_sub_epi32(zero, dcdx);
 
-   if (outmask == 0xffff)
-      return;
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+   rej4 = _mm_slli_epi32(rej4, 2);
 
-   /* Mask of sub-blocks which are inside all trivial accept planes:
-    */
-   inmask = ~partmask & 0xffff;
+   dcdx2 = _mm_add_epi32(dcdx, dcdx);
+   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
 
-   /* Mask of sub-blocks which are inside all trivial reject planes,
-    * but outside at least one trivial accept plane:
-    */
-   partial_mask = partmask & ~outmask;
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
 
-   assert((partial_mask & inmask) == 0);
+   for (i = 0; i < 4; i++) {
+      __m128i cx = c;
 
-   /* Iterate over partials:
-    */
-   while (partial_mask) {
-      int i = ffs(partial_mask) - 1;
-      int ix = (i & 3) * 4;
-      int iy = (i >> 2) * 4;
-      int px = x + ix;
-      int py = y + iy; 
-      unsigned mask = 0xffff;
-
-      partial_mask &= ~(1 << i);
-
-      for (j = 0; j < 3; j++) {
-         const int cx = (plane[j].c 
-			 - plane[j].dcdx * px
-			 + plane[j].dcdy * py) * 4;
-
-	 mask &= ~sign_bits4(cstep4[j], cx);
-      }
+      for (j = 0; j < 4; j++) {
+         __m128i c4rej = _mm_add_epi32(cx, rej4);
+         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);
 
-      if (mask)
-	 lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask);
-   }
+         /* if (is_zero(rej_masks)) */
+         if (_mm_movemask_epi8(rej_masks) == 0) {
+            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
+            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
+            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);
 
-   /* Iterate over fulls: 
-    */
-   while (inmask) {
-      int i = ffs(inmask) - 1;
-      int ix = (i & 3) * 4;
-      int iy = (i >> 2) * 4;
-      int px = x + ix;
-      int py = y + iy; 
+            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
+
+            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+            __m128i c_01 = _mm_packs_epi32(c_0, c_1);
+
+            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
 
-      inmask &= ~(1 << i);
+            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
 
-      block_full_4(task, tri, px, py);
+            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+            unsigned mask = _mm_movemask_epi8(c_0123);
+
+            out[nr].i = i;
+            out[nr].j = j;
+            out[nr].mask = mask;
+            if (mask != 0xffff)
+               nr++;
+         }
+         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
+      }
+
+      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
    }
+
+   for (i = 0; i < nr; i++)
+      lp_rast_shade_quads_mask(task,
+                               &tri->inputs,
+                               x + 4 * out[i].j,
+                               y + 4 * out[i].i,
+                               0xffff & ~out[i].mask);
 }
 
 
+
+
+
 void
 lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
-		     const union lp_rast_cmd_arg arg)
+                     const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
-   unsigned mask = arg.triangle.plane_mask;
-   const int x = task->x + (mask & 0xff);
-   const int y = task->y + (mask >> 8);
-   unsigned j;
-
-   /* Iterate over partials:
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = _mm_setzero_si128();
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+   
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+   
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+                    &c, &dcdx, &dcdy, &unused);
+
+   /* Adjust dcdx;
     */
+   dcdx = _mm_sub_epi32(zero, dcdx);
+
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+
+   dcdx2 = _mm_add_epi32(dcdx, dcdx);
+   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
+
+
    {
-      unsigned mask = 0xffff;
+      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
+      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
+      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
+      
+      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
 
-      for (j = 0; j < 3; j++) {
-	 const int cx = (plane[j].c 
-			 - plane[j].dcdx * x
-			 + plane[j].dcdy * y);
+      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
 
-	 const int dcdx = -plane[j].dcdx;
-	 const int dcdy = plane[j].dcdy;
-	 __m128i xdcdy = _mm_set1_epi32(dcdy);
+      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+      __m128i c_01 = _mm_packs_epi32(c_0, c_1);
 
-	 __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3);
-	 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
-	 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
-	 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
 
-	 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
-	 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
-	 __m128i result = _mm_packs_epi16(cstep01, cstep23);
+      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
 
-	 /* Extract the sign bits
-	  */
-	 mask &= ~_mm_movemask_epi8(result);
-      }
+      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
 
-      if (mask)
-	 lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
+      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+      unsigned mask = _mm_movemask_epi8(c_0123);
+
+      if (mask != 0xffff)
+         lp_rast_shade_quads_mask(task,
+                                  &tri->inputs,
+                                  x,
+                                  y,
+                                  0xffff & ~mask);
    }
 }
 
-
+#undef NR_PLANES
 #endif
 
 
@@ -383,10 +456,13 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
 
 #define TAG(x) x##_3
 #define NR_PLANES 3
+/*#define TRI_4 lp_rast_triangle_3_4*/
+/*#define TRI_16 lp_rast_triangle_3_16*/
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_4
 #define NR_PLANES 4
+#define TRI_16 lp_rast_triangle_4_16
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_5
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 9830a43ba55..4825d651c04 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -82,7 +82,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
       const int dcdx = -plane[j].dcdx * 4;
       const int dcdy = plane[j].dcdy * 4;
       const int cox = plane[j].eo * 4;
-      const int cio = plane[j].ei * 4 - 1;
+      const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+      const int cio = ei * 4 - 1;
 
       build_masks(c[j] + cox,
 		  cio - cox,
@@ -156,6 +157,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    unsigned plane_mask = arg.triangle.plane_mask;
+   const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
    const int x = task->x, y = task->y;
    struct lp_rast_plane plane[NR_PLANES];
    int c[NR_PLANES];
@@ -172,7 +174,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 
    while (plane_mask) {
       int i = ffs(plane_mask) - 1;
-      plane[j] = tri->plane[i];
+      plane[j] = tri_plane[i];
       plane_mask &= ~(1 << i);
       c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
 
@@ -180,7 +182,8 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 	 const int dcdx = -plane[j].dcdx * 16;
 	 const int dcdy = plane[j].dcdy * 16;
 	 const int cox = plane[j].eo * 16;
-	 const int cio = plane[j].ei * 16 - 1;
+         const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+         const int cio = ei * 16 - 1;
 
 	 build_masks(c[j] + cox,
 		     cio - cox,
@@ -245,6 +248,133 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
    }
 }
 
+#if defined(PIPE_ARCH_SSE) && defined(TRI_16)
+/* XXX: special case this when intersection is not required.
+ *      - tile completely within bbox,
+ *      - bbox completely within tile.
+ */
+void
+TRI_16(struct lp_rasterizer_task *task,
+       const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   unsigned mask = arg.triangle.plane_mask;
+   unsigned outmask, partial_mask;
+   unsigned j;
+   __m128i cstep4[NR_PLANES][4];
+
+   int x = (mask & 0xff);
+   int y = (mask >> 8);
+
+   outmask = 0;                 /* outside one or more trivial reject planes */
+   
+   x += task->x;
+   y += task->y;
+
+   for (j = 0; j < NR_PLANES; j++) {
+      const int dcdx = -plane[j].dcdx * 4;
+      const int dcdy = plane[j].dcdy * 4;
+      __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+      cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3);
+      cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy);
+      cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy);
+      cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy);
+
+      {
+	 const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
+	 const int cox = plane[j].eo * 4;
+
+	 outmask |= sign_bits4(cstep4[j], c + cox);
+      }
+   }
+
+   if (outmask == 0xffff)
+      return;
+
+
+   /* Mask of sub-blocks which are inside all trivial reject planes,
+    * but outside at least one trivial accept plane:
+    */
+   partial_mask = 0xffff & ~outmask;
+
+   /* Iterate over partials:
+    */
+   while (partial_mask) {
+      int i = ffs(partial_mask) - 1;
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
+      unsigned mask = 0xffff;
+
+      partial_mask &= ~(1 << i);
+
+      for (j = 0; j < NR_PLANES; j++) {
+         const int cx = (plane[j].c - 1
+			 - plane[j].dcdx * px
+			 + plane[j].dcdy * py) * 4;
+
+	 mask &= ~sign_bits4(cstep4[j], cx);
+      }
+
+      if (mask)
+	 lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask);
+   }
+}
+#endif
+
+#if defined(PIPE_ARCH_SSE) && defined(TRI_4)
+void
+TRI_4(struct lp_rasterizer_task *task,
+      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   unsigned mask = arg.triangle.plane_mask;
+   const int x = task->x + (mask & 0xff);
+   const int y = task->y + (mask >> 8);
+   unsigned j;
+
+   /* Iterate over partials:
+    */
+   {
+      unsigned mask = 0xffff;
+
+      for (j = 0; j < NR_PLANES; j++) {
+	 const int cx = (plane[j].c 
+			 - plane[j].dcdx * x
+			 + plane[j].dcdy * y);
+
+	 const int dcdx = -plane[j].dcdx;
+	 const int dcdy = plane[j].dcdy;
+	 __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+	 __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3);
+	 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+	 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+	 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+	 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+	 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+	 __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+	 /* Extract the sign bits
+	  */
+	 mask &= ~_mm_movemask_epi8(result);
+      }
+
+      if (mask)
+	 lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
+   }
+}
+#endif
+
+
+
 #undef TAG
+#undef TRI_4
+#undef TRI_16
 #undef NR_PLANES
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index 8b504f23a33..a4fdf7cff36 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -203,7 +203,9 @@ lp_scene_end_rasterization(struct lp_scene *scene )
    for (i = 0; i < scene->tiles_x; i++) {
       for (j = 0; j < scene->tiles_y; j++) {
          struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
-         bin->head = bin->tail = NULL;
+         bin->head = NULL;
+         bin->tail = NULL;
+         bin->last_state = NULL;
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index dbef7692e42..622c522f11a 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -41,6 +41,7 @@
 #include "lp_debug.h"
 
 struct lp_scene_queue;
+struct lp_rast_state;
 
 /* We're limited to 2K by 2K for 32bit fixed point rasterization.
  * Will need a 64-bit version for larger framebuffers.
@@ -94,6 +95,7 @@ struct data_block {
 struct cmd_bin {
    ushort x;
    ushort y;
+   const struct lp_rast_state *last_state;       /* most recent state set in bin */
    struct cmd_block *head;
    struct cmd_block *tail;
 };
@@ -297,7 +299,7 @@ lp_scene_bin_command( struct lp_scene *scene,
 
    assert(x < scene->tiles_x);
    assert(y < scene->tiles_y);
-   assert(cmd <= LP_RAST_OP_END_QUERY);
+   assert(cmd < LP_RAST_OP_MAX);
 
    if (tail == NULL || tail->count == CMD_BLOCK_MAX) {
       tail = lp_scene_new_cmd_block( scene, bin );
@@ -318,6 +320,30 @@ lp_scene_bin_command( struct lp_scene *scene,
 }
 
 
+static INLINE boolean
+lp_scene_bin_cmd_with_state( struct lp_scene *scene,
+                             unsigned x, unsigned y,
+                             const struct lp_rast_state *state,
+                             unsigned cmd,
+                             union lp_rast_cmd_arg arg )
+{
+   struct cmd_bin *bin = lp_scene_get_bin(scene, x, y);
+
+   if (state != bin->last_state) {
+      bin->last_state = state;
+      if (!lp_scene_bin_command(scene, x, y,
+                                LP_RAST_OP_SET_STATE,
+                                lp_rast_arg_state(state)))
+         return FALSE;
+   }
+
+   if (!lp_scene_bin_command( scene, x, y, cmd, arg ))
+      return FALSE;
+
+   return TRUE;
+}
+
+
 /* Add a command to all active bins.
  */
 static INLINE boolean
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 96633d93654..ad0ea75b3a3 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -158,6 +158,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
    case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
       return 0;
+   case PIPE_CAP_PRIMITIVE_RESTART:
+      return 1;
    case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
       return 1;
    case PIPE_CAP_DEPTH_CLAMP:
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 5ff11a33632..6118434d3d3 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -56,7 +56,7 @@
 #include "draw/draw_vbuf.h"
 
 
-static void set_scene_state( struct lp_setup_context *, enum setup_state,
+static boolean set_scene_state( struct lp_setup_context *, enum setup_state,
                              const char *reason);
 static boolean try_update_scene_state( struct lp_setup_context *setup );
 
@@ -167,7 +167,7 @@ lp_setup_rasterize_scene( struct lp_setup_context *setup )
 
 
 
-static void
+static boolean
 begin_binning( struct lp_setup_context *setup )
 {
    struct lp_scene *scene = setup->scene;
@@ -181,6 +181,8 @@ begin_binning( struct lp_setup_context *setup )
    /* Always create a fence:
     */
    scene->fence = lp_fence_create(MAX2(1, setup->num_threads));
+   if (!scene->fence)
+      return FALSE;
 
    /* Initialize the bin flags and x/y coords:
     */
@@ -192,7 +194,8 @@ begin_binning( struct lp_setup_context *setup )
    }
 
    ok = try_update_scene_state(setup);
-   assert(ok);
+   if (!ok)
+      return FALSE;
 
    if (setup->fb.zsbuf &&
        ((setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) &&
@@ -208,7 +211,8 @@ begin_binning( struct lp_setup_context *setup )
          ok = lp_scene_bin_everywhere( scene, 
                                        LP_RAST_OP_CLEAR_COLOR, 
                                        setup->clear.color );
-         assert(ok);
+         if (!ok)
+            return FALSE;
       }
    }
 
@@ -216,12 +220,14 @@ begin_binning( struct lp_setup_context *setup )
       if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) {
          if (!need_zsload)
             scene->has_depthstencil_clear = TRUE;
+
          ok = lp_scene_bin_everywhere( scene,
                                        LP_RAST_OP_CLEAR_ZSTENCIL,
                                        lp_rast_arg_clearzs(
                                           setup->clear.zsvalue,
                                           setup->clear.zsmask));
-         assert(ok);
+         if (!ok)
+            return FALSE;
       }
    }
 
@@ -229,15 +235,16 @@ begin_binning( struct lp_setup_context *setup )
       ok = lp_scene_bin_everywhere( scene,
                                     LP_RAST_OP_BEGIN_QUERY,
                                     lp_rast_arg_query(setup->active_query) );
-      assert(ok);
+      if (!ok)
+         return FALSE;
    }
-      
 
    setup->clear.flags = 0;
    setup->clear.zsmask = 0;
    setup->clear.zsvalue = 0;
 
    LP_DBG(DEBUG_SETUP, "%s done\n", __FUNCTION__);
+   return TRUE;
 }
 
 
@@ -246,12 +253,12 @@ begin_binning( struct lp_setup_context *setup )
  *
  * TODO: fast path for fullscreen clears and no triangles.
  */
-static void
+static boolean
 execute_clears( struct lp_setup_context *setup )
 {
    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
-   begin_binning( setup );
+   return begin_binning( setup );
 }
 
 const char *states[] = {
@@ -262,7 +269,7 @@ const char *states[] = {
 };
 
 
-static void
+static boolean
 set_scene_state( struct lp_setup_context *setup,
                  enum setup_state new_state,
                  const char *reason)
@@ -270,7 +277,7 @@ set_scene_state( struct lp_setup_context *setup,
    unsigned old_state = setup->state;
 
    if (old_state == new_state)
-      return;
+      return TRUE;
    
    if (LP_DEBUG & DEBUG_SCENE) {
       debug_printf("%s old %s new %s%s%s\n",
@@ -294,12 +301,14 @@ set_scene_state( struct lp_setup_context *setup,
       break;
 
    case SETUP_ACTIVE:
-      begin_binning( setup );
+      if (!begin_binning( setup ))
+         goto fail;
       break;
 
    case SETUP_FLUSHED:
       if (old_state == SETUP_CLEARED)
-         execute_clears( setup );
+         if (!execute_clears( setup ))
+            goto fail;
 
       lp_setup_rasterize_scene( setup );
       assert(setup->scene == NULL);
@@ -307,9 +316,21 @@ set_scene_state( struct lp_setup_context *setup,
 
    default:
       assert(0 && "invalid setup state mode");
+      goto fail;
    }
 
    setup->state = new_state;
+   return TRUE;
+
+fail:
+   if (setup->scene) {
+      lp_scene_end_rasterization(setup->scene);
+      setup->scene = NULL;
+   }
+
+   setup->state = SETUP_FLUSHED;
+   lp_setup_reset( setup );
+   return FALSE;
 }
 
 
@@ -377,16 +398,19 @@ lp_setup_try_clear( struct lp_setup_context *setup,
    }
 
    if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
-      unsigned zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
-      unsigned smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
+      uint32_t zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
+      uint32_t smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
 
       zsvalue = util_pack_z_stencil(setup->fb.zsbuf->format,
                                     depth,
                                     stencil);
 
-      zsmask = util_pack_uint_z_stencil(setup->fb.zsbuf->format,
+
+      zsmask = util_pack_mask_z_stencil(setup->fb.zsbuf->format,
                                         zmask,
                                         smask);
+
+      zsvalue &= zsmask;
    }
 
    if (setup->state == SETUP_ACTIVE) {
@@ -431,7 +455,7 @@ lp_setup_try_clear( struct lp_setup_context *setup,
       if (flags & PIPE_CLEAR_COLOR) {
          memcpy(setup->clear.color.clear_color,
                 &color_arg,
-                sizeof color_arg);
+                sizeof setup->clear.color.clear_color);
       }
    }
    
@@ -502,14 +526,12 @@ lp_setup_set_point_state( struct lp_setup_context *setup,
 }
 
 void
-lp_setup_set_fs_inputs( struct lp_setup_context *setup,
-                        const struct lp_shader_input *input,
-                        unsigned nr )
+lp_setup_set_setup_variant( struct lp_setup_context *setup,
+			    const struct lp_setup_variant *variant)
 {
-   LP_DBG(DEBUG_SETUP, "%s %p %u\n", __FUNCTION__, (void *) input, nr);
-
-   memcpy( setup->fs.input, input, nr * sizeof input[0] );
-   setup->fs.nr_inputs = nr;
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+   
+   setup->setup.variant = variant;
 }
 
 void
@@ -875,7 +897,7 @@ try_update_scene_state( struct lp_setup_context *setup )
    return TRUE;
 }
 
-void
+boolean
 lp_setup_update_state( struct lp_setup_context *setup,
                        boolean update_scene )
 {
@@ -897,22 +919,47 @@ lp_setup_update_state( struct lp_setup_context *setup,
       setup->psize = lp->psize_slot;
 
       assert(lp->dirty == 0);
+
+      assert(lp->setup_variant.key.size == 
+	     setup->setup.variant->key.size);
+
+      assert(memcmp(&lp->setup_variant.key,
+		    &setup->setup.variant->key,
+		    setup->setup.variant->key.size) == 0);
    }
 
-   if (update_scene)
-      set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ );
+   if (update_scene) {
+      if (!set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ ))
+         return FALSE;
+   }
 
    /* Only call into update_scene_state() if we already have a
     * scene:
     */
    if (update_scene && setup->scene) {
       assert(setup->state == SETUP_ACTIVE);
-      if (!try_update_scene_state(setup)) {
-         lp_setup_flush_and_restart(setup);
-         if (!try_update_scene_state(setup))
-            assert(0);
-      }
+
+      if (try_update_scene_state(setup))
+         return TRUE;
+
+      /* Update failed, try to restart the scene.
+       *
+       * Cannot call lp_setup_flush_and_restart() directly here
+       * because of potential recursion.
+       */
+      if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__))
+         return FALSE;
+
+      if (!set_scene_state(setup, SETUP_ACTIVE, __FUNCTION__))
+         return FALSE;
+
+      if (!setup->scene)
+         return FALSE;
+
+      return try_update_scene_state(setup);
    }
+
+   return TRUE;
 }
 
 
@@ -1016,12 +1063,12 @@ lp_setup_begin_query(struct lp_setup_context *setup,
                                    LP_RAST_OP_BEGIN_QUERY,
                                    lp_rast_arg_query(pq))) {
 
-         lp_setup_flush_and_restart(setup);
+         if (!lp_setup_flush_and_restart(setup))
+            return;
 
          if (!lp_scene_bin_everywhere(setup->scene,
                                       LP_RAST_OP_BEGIN_QUERY,
                                       lp_rast_arg_query(pq))) {
-            assert(0);
             return;
          }
       }
@@ -1065,14 +1112,20 @@ lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq)
 }
 
 
-void
+boolean
 lp_setup_flush_and_restart(struct lp_setup_context *setup)
 {
    if (0) debug_printf("%s\n", __FUNCTION__);
 
    assert(setup->state == SETUP_ACTIVE);
-   set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__);
-   lp_setup_update_state(setup, TRUE);
+
+   if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__))
+      return FALSE;
+   
+   if (!lp_setup_update_state(setup, TRUE))
+      return FALSE;
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index 25dab78f64f..ebb18f81344 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -33,28 +33,6 @@
 struct draw_context;
 struct vertex_info;
 
-enum lp_interp {
-   LP_INTERP_CONSTANT,
-   LP_INTERP_LINEAR,
-   LP_INTERP_PERSPECTIVE,
-   LP_INTERP_POSITION,
-   LP_INTERP_FACING
-};
-
-
-/**
- * Describes how to compute the interpolation coefficients (a0, dadx, dady)
- * from the vertices passed into our triangle/line/point functions by the
- * draw module.
- *
- * Vertices are treated as an array of float[4] values, indexed by
- * src_index.
- */
-struct lp_shader_input {
-   enum lp_interp interp;       /* how to interpolate values */
-   unsigned src_index;          /* where to find values in incoming vertices */
-   unsigned usage_mask;         /* bitmask of TGSI_WRITEMASK_x flags */
-};
 
 struct pipe_resource;
 struct pipe_query;
@@ -66,7 +44,7 @@ struct lp_fragment_shader_variant;
 struct lp_jit_context;
 struct llvmpipe_query;
 struct pipe_fence_handle;
-
+struct lp_setup_variant;
 
 struct lp_setup_context *
 lp_setup_create( struct pipe_context *pipe,
@@ -111,9 +89,8 @@ lp_setup_set_point_state( struct lp_setup_context *setup,
                           uint sprite_coord_origin);
 
 void
-lp_setup_set_fs_inputs( struct lp_setup_context *setup,
-                        const struct lp_shader_input *interp,
-                        unsigned nr );
+lp_setup_set_setup_variant( struct lp_setup_context *setup,
+			    const struct lp_setup_variant *variant );
 
 void
 lp_setup_set_fs_variant( struct lp_setup_context *setup,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
deleted file mode 100644
index 8dc2688ddb6..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010, VMware.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/*
- * Binning code for triangles
- */
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "lp_perf.h"
-#include "lp_setup_context.h"
-#include "lp_setup_coef.h"
-#include "lp_rast.h"
-#include "lp_state_fs.h"
-
-#if !defined(PIPE_ARCH_SSE)
-
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- */
-static void constant_coef( struct lp_rast_shader_inputs *inputs,
-                           unsigned slot,
-			   const float value,
-                           unsigned i )
-{
-   inputs->a0[slot][i] = value;
-   inputs->dadx[slot][i] = 0.0f;
-   inputs->dady[slot][i] = 0.0f;
-}
-
-
-
-static void linear_coef( struct lp_rast_shader_inputs *inputs,
-                         const struct lp_tri_info *info,
-                         unsigned slot,
-                         unsigned vert_attr,
-                         unsigned i)
-{
-   float a0 = info->v0[vert_attr][i];
-   float a1 = info->v1[vert_attr][i];
-   float a2 = info->v2[vert_attr][i];
-
-   float da01 = a0 - a1;
-   float da20 = a2 - a0;
-   float dadx = (da01 * info->dy20_ooa - info->dy01_ooa * da20);
-   float dady = (da20 * info->dx01_ooa - info->dx20_ooa * da01);
-
-   inputs->dadx[slot][i] = dadx;
-   inputs->dady[slot][i] = dady;
-
-   /* calculate a0 as the value which would be sampled for the
-    * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
-    *
-    * this is neat but unfortunately not a good way to do things for
-    * triangles with very large values of dadx or dady as it will
-    * result in the subtraction and re-addition from a0 of a very
-    * large number, which means we'll end up loosing a lot of the
-    * fractional bits and precision from a0.  the way to fix this is
-    * to define a0 as the sample at a pixel center somewhere near vmin
-    * instead - i'll switch to this later.
-    */
-   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
-				   dady * info->y0_center);
-}
-
-
-/**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a triangle.
- * We basically multiply the vertex value by 1/w before computing
- * the plane coefficients (a0, dadx, dady).
- * Later, when we compute the value at a particular fragment position we'll
- * divide the interpolated value by the interpolated W at that fragment.
- */
-static void perspective_coef( struct lp_rast_shader_inputs *inputs,
-                              const struct lp_tri_info *info,
-                              unsigned slot,
-			      unsigned vert_attr,
-                              unsigned i)
-{
-   /* premultiply by 1/w  (v[0][3] is always 1/w):
-    */
-   float a0 = info->v0[vert_attr][i] * info->v0[0][3];
-   float a1 = info->v1[vert_attr][i] * info->v1[0][3];
-   float a2 = info->v2[vert_attr][i] * info->v2[0][3];
-   float da01 = a0 - a1;
-   float da20 = a2 - a0;
-   float dadx = da01 * info->dy20_ooa - info->dy01_ooa * da20;
-   float dady = da20 * info->dx01_ooa - info->dx20_ooa * da01;
-
-   inputs->dadx[slot][i] = dadx;
-   inputs->dady[slot][i] = dady;
-   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
-				   dady * info->y0_center);
-}
-
-
-/**
- * Special coefficient setup for gl_FragCoord.
- * X and Y are trivial
- * Z and W are copied from position_coef which should have already been computed.
- * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
- */
-static void
-setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs,
-                     const struct lp_tri_info *info,
-                     unsigned slot,
-                     unsigned usage_mask)
-{
-   /*X*/
-   if (usage_mask & TGSI_WRITEMASK_X) {
-      inputs->a0[slot][0] = 0.0;
-      inputs->dadx[slot][0] = 1.0;
-      inputs->dady[slot][0] = 0.0;
-   }
-
-   /*Y*/
-   if (usage_mask & TGSI_WRITEMASK_Y) {
-      inputs->a0[slot][1] = 0.0;
-      inputs->dadx[slot][1] = 0.0;
-      inputs->dady[slot][1] = 1.0;
-   }
-
-   /*Z*/
-   if (usage_mask & TGSI_WRITEMASK_Z) {
-      linear_coef(inputs, info, slot, 0, 2);
-   }
-
-   /*W*/
-   if (usage_mask & TGSI_WRITEMASK_W) {
-      linear_coef(inputs, info, slot, 0, 3);
-   }
-}
-
-
-/**
- * Setup the fragment input attribute with the front-facing value.
- * \param frontface  is the triangle front facing?
- */
-static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
-                               unsigned slot,
-                               boolean frontface,
-                               unsigned usage_mask)
-{
-   /* convert TRUE to 1.0 and FALSE to -1.0 */
-   if (usage_mask & TGSI_WRITEMASK_X)
-      constant_coef( inputs, slot, 2.0f * frontface - 1.0f, 0 );
-
-   if (usage_mask & TGSI_WRITEMASK_Y)
-      constant_coef( inputs, slot, 0.0f, 1 ); /* wasted */
-
-   if (usage_mask & TGSI_WRITEMASK_Z)
-      constant_coef( inputs, slot, 0.0f, 2 ); /* wasted */
-
-   if (usage_mask & TGSI_WRITEMASK_W)
-      constant_coef( inputs, slot, 0.0f, 3 ); /* wasted */
-}
-
-
-/**
- * Compute the tri->coef[] array dadx, dady, a0 values.
- */
-void lp_setup_tri_coef( struct lp_setup_context *setup,
-			struct lp_rast_shader_inputs *inputs,
-                        const float (*v0)[4],
-                        const float (*v1)[4],
-                        const float (*v2)[4],
-                        boolean frontfacing)
-{
-   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
-   unsigned slot;
-   unsigned i;
-   struct lp_tri_info info;
-   float dx01 = v0[0][0] - v1[0][0];
-   float dy01 = v0[0][1] - v1[0][1];
-   float dx20 = v2[0][0] - v0[0][0];
-   float dy20 = v2[0][1] - v0[0][1];
-   float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01);
-
-   info.v0 = v0;
-   info.v1 = v1;
-   info.v2 = v2;
-   info.frontfacing = frontfacing;
-   info.x0_center = v0[0][0] - setup->pixel_offset;
-   info.y0_center = v0[0][1] - setup->pixel_offset;
-   info.dx01_ooa  = dx01 * oneoverarea;
-   info.dx20_ooa  = dx20 * oneoverarea;
-   info.dy01_ooa  = dy01 * oneoverarea;
-   info.dy20_ooa  = dy20 * oneoverarea;
-
-
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-      unsigned usage_mask = setup->fs.input[slot].usage_mask;
-
-      switch (setup->fs.input[slot].interp) {
-      case LP_INTERP_CONSTANT:
-         if (setup->flatshade_first) {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               if (usage_mask & (1 << i))
-                  constant_coef(inputs, slot+1, info.v0[vert_attr][i], i);
-         }
-         else {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               if (usage_mask & (1 << i))
-                  constant_coef(inputs, slot+1, info.v2[vert_attr][i], i);
-         }
-         break;
-
-      case LP_INTERP_LINEAR:
-         for (i = 0; i < NUM_CHANNELS; i++)
-            if (usage_mask & (1 << i))
-               linear_coef(inputs, &info, slot+1, vert_attr, i);
-         break;
-
-      case LP_INTERP_PERSPECTIVE:
-         for (i = 0; i < NUM_CHANNELS; i++)
-            if (usage_mask & (1 << i))
-               perspective_coef(inputs, &info, slot+1, vert_attr, i);
-         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
-         break;
-
-      case LP_INTERP_POSITION:
-         /*
-          * The generated pixel interpolators will pick up the coeffs from
-          * slot 0, so all need to ensure that the usage mask is covers all
-          * usages.
-          */
-         fragcoord_usage_mask |= usage_mask;
-         break;
-
-      case LP_INTERP_FACING:
-         setup_facing_coef(inputs, slot+1, info.frontfacing, usage_mask);
-         break;
-
-      default:
-         assert(0);
-      }
-   }
-
-   /* The internal position input is in slot zero:
-    */
-   setup_fragcoord_coef(inputs, &info, 0, fragcoord_usage_mask);
-}
-
-#else
-extern void lp_setup_coef_dummy(void);
-void lp_setup_coef_dummy(void)
-{
-}
-
-#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.h b/src/gallium/drivers/llvmpipe/lp_setup_coef.h
deleted file mode 100644
index 87a3255ccc6..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * The setup code is concerned with point/line/triangle setup and
- * putting commands/data into the bins.
- */
-
-
-#ifndef LP_SETUP_COEF_H
-#define LP_SETUP_COEF_H
-
-
-struct lp_tri_info {
-
-   float x0_center;
-   float y0_center;
-
-   /* turn these into an aligned float[4] */
-   float dy01_ooa;
-   float dy20_ooa;
-   float dx01_ooa;
-   float dx20_ooa;
-
-   const float (*v0)[4];
-   const float (*v1)[4];
-   const float (*v2)[4];
-
-   boolean frontfacing;		/* remove eventually */
-};
-
-void lp_setup_tri_coef( struct lp_setup_context *setup,
-			struct lp_rast_shader_inputs *inputs,
-                        const float (*v0)[4],
-                        const float (*v1)[4],
-                        const float (*v2)[4],
-                        boolean frontfacing);
-
-#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
deleted file mode 100644
index 3742fd672b2..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 VMware.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/*
- * Binning code for triangles
- */
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "lp_perf.h"
-#include "lp_setup_context.h"
-#include "lp_setup_coef.h"
-#include "lp_rast.h"
-
-#if defined(PIPE_ARCH_SSE)
-#include <emmintrin.h>
-
-
-static void constant_coef4( struct lp_rast_shader_inputs *inputs,
-			    const struct lp_tri_info *info,
-			    unsigned slot,
-			    const float *attr)
-{
-   *(__m128 *)inputs->a0[slot]   = *(__m128 *)attr;
-   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);
-   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
-}
-
-
-
-/**
- * Setup the fragment input attribute with the front-facing value.
- * \param frontface  is the triangle front facing?
- */
-static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
-			       const struct lp_tri_info *info,
-			       unsigned slot )
-{
-   /* XXX: just pass frontface directly to the shader, don't bother
-    * treating it as an input.
-    */
-   __m128 a0 = _mm_setr_ps(info->frontfacing ? 1.0 : -1.0,
-			   0, 0, 0);
-
-   *(__m128 *)inputs->a0[slot]   = a0;
-   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);
-   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
-}
-
-
-
-static void calc_coef4( struct lp_rast_shader_inputs *inputs,
-			const struct lp_tri_info *info,
-			unsigned slot,
-			__m128 a0,
-			__m128 a1,
-			__m128 a2)
-{
-   __m128 da01          = _mm_sub_ps(a0, a1);
-   __m128 da20          = _mm_sub_ps(a2, a0);
-
-   __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dy20_ooa));
-   __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dy01_ooa));   
-   __m128 dadx          = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa);
-
-   __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dx20_ooa));
-   __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dx01_ooa));
-   __m128 dady          = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa);
-
-   __m128 dadx_x0       = _mm_mul_ps(dadx, _mm_set1_ps(info->x0_center));
-   __m128 dady_y0       = _mm_mul_ps(dady, _mm_set1_ps(info->y0_center));
-   __m128 attr_v0       = _mm_add_ps(dadx_x0, dady_y0);
-   __m128 attr_0        = _mm_sub_ps(a0, attr_v0);
-
-   *(__m128 *)inputs->a0[slot]   = attr_0;
-   *(__m128 *)inputs->dadx[slot] = dadx;
-   *(__m128 *)inputs->dady[slot] = dady;
-}
-
-
-static void linear_coef( struct lp_rast_shader_inputs *inputs,
-                         const struct lp_tri_info *info,
-                         unsigned slot,
-                         unsigned vert_attr)
-{
-   __m128 a0 = *(const __m128 *)info->v0[vert_attr];
-   __m128 a1 = *(const __m128 *)info->v1[vert_attr];
-   __m128 a2 = *(const __m128 *)info->v2[vert_attr];
-
-   calc_coef4(inputs, info, slot, a0, a1, a2);
-}
-
-
-
-/**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a triangle.
- * We basically multiply the vertex value by 1/w before computing
- * the plane coefficients (a0, dadx, dady).
- * Later, when we compute the value at a particular fragment position we'll
- * divide the interpolated value by the interpolated W at that fragment.
- */
-static void perspective_coef( struct lp_rast_shader_inputs *inputs,
-                              const struct lp_tri_info *info,
-                              unsigned slot,
-			      unsigned vert_attr)
-{
-   /* premultiply by 1/w  (v[0][3] is always 1/w):
-    */
-   __m128 a0 = *(const __m128 *)info->v0[vert_attr];
-   __m128 a1 = *(const __m128 *)info->v1[vert_attr];
-   __m128 a2 = *(const __m128 *)info->v2[vert_attr];
-
-   __m128 a0_oow = _mm_mul_ps(a0, _mm_set1_ps(info->v0[0][3]));
-   __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(info->v1[0][3]));
-   __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(info->v2[0][3]));
-
-   calc_coef4(inputs, info, slot, a0_oow, a1_oow, a2_oow);
-}
-
-
-
-
-
-/**
- * Compute the inputs-> dadx, dady, a0 values.
- */
-void lp_setup_tri_coef( struct lp_setup_context *setup,
-			struct lp_rast_shader_inputs *inputs,
-                        const float (*v0)[4],
-                        const float (*v1)[4],
-                        const float (*v2)[4],
-                        boolean frontfacing)
-{
-   unsigned slot;
-   struct lp_tri_info info;
-   float dx01 = v0[0][0] - v1[0][0];
-   float dy01 = v0[0][1] - v1[0][1];
-   float dx20 = v2[0][0] - v0[0][0];
-   float dy20 = v2[0][1] - v0[0][1];
-   float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01);
-
-   info.v0 = v0;
-   info.v1 = v1;
-   info.v2 = v2;
-   info.frontfacing = frontfacing;
-   info.x0_center = v0[0][0] - setup->pixel_offset;
-   info.y0_center = v0[0][1] - setup->pixel_offset;
-   info.dx01_ooa  = dx01 * oneoverarea;
-   info.dx20_ooa  = dx20 * oneoverarea;
-   info.dy01_ooa  = dy01 * oneoverarea;
-   info.dy20_ooa  = dy20 * oneoverarea;
-
-
-   /* The internal position input is in slot zero:
-    */
-   linear_coef(inputs, &info, 0, 0);
-
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-
-      switch (setup->fs.input[slot].interp) {
-      case LP_INTERP_CONSTANT:
-         if (setup->flatshade_first) {
-	    constant_coef4(inputs, &info, slot+1, info.v0[vert_attr]);
-         }
-         else {
-	    constant_coef4(inputs, &info, slot+1, info.v2[vert_attr]);
-         }
-         break;
-
-      case LP_INTERP_LINEAR:
-	 linear_coef(inputs, &info, slot+1, vert_attr);
-         break;
-
-      case LP_INTERP_PERSPECTIVE:
-	 perspective_coef(inputs, &info, slot+1, vert_attr);
-         break;
-
-      case LP_INTERP_POSITION:
-         /*
-          * The generated pixel interpolators will pick up the coeffs from
-          * slot 0.
-          */
-         break;
-
-      case LP_INTERP_FACING:
-         setup_facing_coef(inputs, &info, slot+1);
-         break;
-
-      default:
-         assert(0);
-      }
-   }
-}
-
-#else
-extern void lp_setup_coef_dummy(void);
-void lp_setup_coef_dummy(void)
-{
-}
-#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 8506ed2dc9e..dc2533bedc5 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -39,6 +39,7 @@
 #include "lp_rast.h"
 #include "lp_tile_soa.h"        /* for TILE_SIZE */
 #include "lp_scene.h"
+#include "lp_bld_interp.h"	/* for struct lp_shader_input */
 
 #include "draw/draw_vbuf.h"
 #include "util/u_rect.h"
@@ -49,6 +50,8 @@
 #define LP_SETUP_NEW_SCISSOR     0x08
 
 
+struct lp_setup_variant;
+
 
 /** Max number of scenes */
 #define MAX_SCENES 2
@@ -118,9 +121,6 @@ struct lp_setup_context
    } state;
    
    struct {
-      struct lp_shader_input input[PIPE_MAX_ATTRIBS];
-      unsigned nr_inputs;
-
       const struct lp_rast_state *stored; /**< what's in the scene */
       struct lp_rast_state current;  /**< currently set state */
       struct pipe_resource *current_tex[PIPE_MAX_SAMPLERS];
@@ -139,6 +139,10 @@ struct lp_setup_context
    } blend_color;
 
 
+   struct {
+      const struct lp_setup_variant *variant;
+   } setup;
+
    unsigned dirty;   /**< bitmask of LP_SETUP_NEW_x bits */
 
    void (*point)( struct lp_setup_context *,
@@ -160,12 +164,12 @@ void lp_setup_choose_point( struct lp_setup_context *setup );
 
 void lp_setup_init_vbuf(struct lp_setup_context *setup);
 
-void lp_setup_update_state( struct lp_setup_context *setup,
+boolean lp_setup_update_state( struct lp_setup_context *setup,
                             boolean update_scene);
 
 void lp_setup_destroy( struct lp_setup_context *setup );
 
-void lp_setup_flush_and_restart(struct lp_setup_context *setup);
+boolean lp_setup_flush_and_restart(struct lp_setup_context *setup);
 
 void
 lp_setup_print_triangle(struct lp_setup_context *setup,
@@ -181,7 +185,7 @@ lp_setup_print_vertex(struct lp_setup_context *setup,
 
 struct lp_rast_triangle *
 lp_setup_alloc_triangle(struct lp_scene *scene,
-                        unsigned nr_inputs,
+                        unsigned num_inputs,
                         unsigned nr_planes,
                         unsigned *tri_size);
 
@@ -191,6 +195,4 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                        const struct u_rect *bbox,
                        int nr_planes );
 
-void lp_setup_flush_and_restart(struct lp_setup_context *setup);
-
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 156bd633754..827413bb33e 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -35,6 +35,7 @@
 #include "lp_setup_context.h"
 #include "lp_rast.h"
 #include "lp_state_fs.h"
+#include "lp_state_setup.h"
 
 #define NUM_CHANNELS 4
 
@@ -46,6 +47,10 @@ struct lp_line_info {
 
    const float (*v1)[4];
    const float (*v2)[4];
+
+   float (*a0)[4];
+   float (*dadx)[4];
+   float (*dady)[4];
 };
 
 
@@ -53,14 +58,14 @@ struct lp_line_info {
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  */
 static void constant_coef( struct lp_setup_context *setup,
-                           struct lp_rast_triangle *tri,
+                           struct lp_line_info *info,
                            unsigned slot,
                            const float value,
                            unsigned i )
 {
-   tri->inputs.a0[slot][i] = value;
-   tri->inputs.dadx[slot][i] = 0.0f;
-   tri->inputs.dady[slot][i] = 0.0f;
+   info->a0[slot][i] = value;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
 }
 
 
@@ -69,7 +74,6 @@ static void constant_coef( struct lp_setup_context *setup,
  * for a triangle.
  */
 static void linear_coef( struct lp_setup_context *setup,
-                         struct lp_rast_triangle *tri,
                          struct lp_line_info *info,
                          unsigned slot,
                          unsigned vert_attr,
@@ -82,10 +86,10 @@ static void linear_coef( struct lp_setup_context *setup,
    float dadx = da21 * info->dx * info->oneoverarea;
    float dady = da21 * info->dy * info->oneoverarea;
 
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;  
+   info->dadx[slot][i] = dadx;
+   info->dady[slot][i] = dady;  
    
-   tri->inputs.a0[slot][i] = (a1 -
+   info->a0[slot][i] = (a1 -
                               (dadx * (info->v1[0][0] - setup->pixel_offset) +
                                dady * (info->v1[0][1] - setup->pixel_offset)));
 }
@@ -100,7 +104,6 @@ static void linear_coef( struct lp_setup_context *setup,
  * divide the interpolated value by the interpolated W at that fragment.
  */
 static void perspective_coef( struct lp_setup_context *setup,
-                              struct lp_rast_triangle *tri,
                               struct lp_line_info *info,
                               unsigned slot,
                               unsigned vert_attr,
@@ -115,43 +118,42 @@ static void perspective_coef( struct lp_setup_context *setup,
    float dadx = da21 * info->dx * info->oneoverarea;
    float dady = da21 * info->dy * info->oneoverarea;
 
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;
+   info->dadx[slot][i] = dadx;
+   info->dady[slot][i] = dady;
    
-   tri->inputs.a0[slot][i] = (a1 -
-                              (dadx * (info->v1[0][0] - setup->pixel_offset) +
-                               dady * (info->v1[0][1] - setup->pixel_offset)));
+   info->a0[slot][i] = (a1 -
+                        (dadx * (info->v1[0][0] - setup->pixel_offset) +
+                         dady * (info->v1[0][1] - setup->pixel_offset)));
 }
 
 static void
 setup_fragcoord_coef( struct lp_setup_context *setup,
-                      struct lp_rast_triangle *tri,
                       struct lp_line_info *info,
                       unsigned slot,
                       unsigned usage_mask)
 {
    /*X*/
    if (usage_mask & TGSI_WRITEMASK_X) {
-      tri->inputs.a0[slot][0] = 0.0;
-      tri->inputs.dadx[slot][0] = 1.0;
-      tri->inputs.dady[slot][0] = 0.0;
+      info->a0[slot][0] = 0.0;
+      info->dadx[slot][0] = 1.0;
+      info->dady[slot][0] = 0.0;
    }
 
    /*Y*/
    if (usage_mask & TGSI_WRITEMASK_Y) {
-      tri->inputs.a0[slot][1] = 0.0;
-      tri->inputs.dadx[slot][1] = 0.0;
-      tri->inputs.dady[slot][1] = 1.0;
+      info->a0[slot][1] = 0.0;
+      info->dadx[slot][1] = 0.0;
+      info->dady[slot][1] = 1.0;
    }
 
    /*Z*/
    if (usage_mask & TGSI_WRITEMASK_Z) {
-      linear_coef(setup, tri, info, slot, 0, 2);
+      linear_coef(setup, info, slot, 0, 2);
    }
 
    /*W*/
    if (usage_mask & TGSI_WRITEMASK_W) {
-      linear_coef(setup, tri, info, slot, 0, 3);
+      linear_coef(setup, info, slot, 0, 3);
    }
 }
 
@@ -159,43 +161,43 @@ setup_fragcoord_coef( struct lp_setup_context *setup,
  * Compute the tri->coef[] array dadx, dady, a0 values.
  */
 static void setup_line_coefficients( struct lp_setup_context *setup,
-                                     struct lp_rast_triangle *tri,
                                      struct lp_line_info *info)
 {
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
    unsigned slot;
 
    /* setup interpolation for all the remaining attributes:
     */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      unsigned vert_attr = key->inputs[slot].src_index;
+      unsigned usage_mask = key->inputs[slot].usage_mask;
       unsigned i;
            
-      switch (setup->fs.input[slot].interp) {
+      switch (key->inputs[slot].interp) {
       case LP_INTERP_CONSTANT:
-         if (setup->flatshade_first) {
+         if (key->flatshade_first) {
             for (i = 0; i < NUM_CHANNELS; i++)
                if (usage_mask & (1 << i))
-                  constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i);
+                  constant_coef(setup, info, slot+1, info->v1[vert_attr][i], i);
          }
          else {
             for (i = 0; i < NUM_CHANNELS; i++)
                if (usage_mask & (1 << i))
-                  constant_coef(setup, tri, slot+1, info->v2[vert_attr][i], i);
+                  constant_coef(setup, info, slot+1, info->v2[vert_attr][i], i);
          }
          break;
 
       case LP_INTERP_LINEAR:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               linear_coef(setup, tri, info, slot+1, vert_attr, i);
+               linear_coef(setup, info, slot+1, vert_attr, i);
          break;
 
       case LP_INTERP_PERSPECTIVE:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               perspective_coef(setup, tri, info, slot+1, vert_attr, i);
+               perspective_coef(setup, info, slot+1, vert_attr, i);
          fragcoord_usage_mask |= TGSI_WRITEMASK_W;
          break;
 
@@ -211,7 +213,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
       case LP_INTERP_FACING:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               constant_coef(setup, tri, slot+1, 1.0, i);
+               constant_coef(setup, info, slot+1, 1.0, i);
          break;
 
       default:
@@ -221,7 +223,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
 
    /* The internal position input is in slot zero:
     */
-   setup_fragcoord_coef(setup, tri, info, 0,
+   setup_fragcoord_coef(setup, info, 0,
                         fragcoord_usage_mask);
 }
 
@@ -241,14 +243,15 @@ print_line(struct lp_setup_context *setup,
            const float (*v1)[4],
            const float (*v2)[4])
 {
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    uint i;
 
    debug_printf("llvmpipe line\n");
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
+   for (i = 0; i < 1 + key->num_inputs; i++) {
       debug_printf("  v1[%d]:  %f %f %f %f\n", i,
                    v1[i][0], v1[i][1], v1[i][2], v1[i][3]);
    }
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
+   for (i = 0; i < 1 + key->num_inputs; i++) {
       debug_printf("  v2[%d]:  %f %f %f %f\n", i,
                    v2[i][0], v2[i][1], v2[i][2], v2[i][3]);
    }
@@ -275,7 +278,9 @@ try_setup_line( struct lp_setup_context *setup,
                const float (*v2)[4])
 {
    struct lp_scene *scene = setup->scene;
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    struct lp_rast_triangle *line;
+   struct lp_rast_plane *plane;
    struct lp_line_info info;
    float width = MAX2(1.0, setup->line_width);
    struct u_rect bbox;
@@ -475,7 +480,7 @@ try_setup_line( struct lp_setup_context *setup,
       else {
          /* do intersection test */
          float xintersect = fracf(v2[0][0]) + y2diff * dxdy;
-         draw_end = (xintersect < 1.0 && xintersect > 0.0);
+         draw_end = (xintersect < 1.0 && xintersect >= 0.0);
       }
 
       /* Are we already drawing start/end?
@@ -513,7 +518,7 @@ try_setup_line( struct lp_setup_context *setup,
             x_offset_end = y_offset_end * dxdy;
          }
       }
- 
+
       /* x/y positions in fixed point */
       x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) - fixed_width/2;
       x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2;
@@ -567,7 +572,7 @@ try_setup_line( struct lp_setup_context *setup,
    u_rect_find_intersection(&setup->draw_region, &bbox);
 
    line = lp_setup_alloc_triangle(scene,
-                                  setup->fs.nr_inputs,
+                                  key->num_inputs,
                                   nr_planes,
                                   &tri_bytes);
    if (!line)
@@ -581,33 +586,35 @@ try_setup_line( struct lp_setup_context *setup,
 #endif
 
    /* calculate the deltas */
-   line->plane[0].dcdy = x[0] - x[1];
-   line->plane[1].dcdy = x[1] - x[2];
-   line->plane[2].dcdy = x[2] - x[3];
-   line->plane[3].dcdy = x[3] - x[0];
+   plane = GET_PLANES(line);
+   plane[0].dcdy = x[0] - x[1];
+   plane[1].dcdy = x[1] - x[2];
+   plane[2].dcdy = x[2] - x[3];
+   plane[3].dcdy = x[3] - x[0];
 
-   line->plane[0].dcdx = y[0] - y[1];
-   line->plane[1].dcdx = y[1] - y[2];
-   line->plane[2].dcdx = y[2] - y[3];
-   line->plane[3].dcdx = y[3] - y[0];
+   plane[0].dcdx = y[0] - y[1];
+   plane[1].dcdx = y[1] - y[2];
+   plane[2].dcdx = y[2] - y[3];
+   plane[3].dcdx = y[3] - y[0];
 
 
    /* Setup parameter interpolants:
     */
-   setup_line_coefficients( setup, line, &info); 
+   info.a0 = GET_A0(&line->inputs);
+   info.dadx = GET_DADX(&line->inputs);
+   info.dady = GET_DADY(&line->inputs);
+   setup_line_coefficients(setup, &info); 
 
-   line->inputs.facing = 1.0F;
-   line->inputs.state = setup->fs.stored;
+   line->inputs.frontfacing = TRUE;
    line->inputs.disable = FALSE;
    line->inputs.opaque = FALSE;
 
    for (i = 0; i < 4; i++) {
-      struct lp_rast_plane *plane = &line->plane[i];
 
       /* half-edge constants, will be interated over the whole render
        * target.
        */
-      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
+      plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
 
       
       /* correct for top-left vs. bottom-left fill convention.  
@@ -623,38 +630,34 @@ try_setup_line( struct lp_setup_context *setup,
        * to its usual method, in which case it will probably want
        * to use the opposite, top-left convention.
        */         
-      if (plane->dcdx < 0) {
+      if (plane[i].dcdx < 0) {
          /* both fill conventions want this - adjust for left edges */
-         plane->c++;            
+         plane[i].c++;            
       }
-      else if (plane->dcdx == 0) {
+      else if (plane[i].dcdx == 0) {
          if (setup->pixel_offset == 0) {
             /* correct for top-left fill convention:
              */
-            if (plane->dcdy > 0) plane->c++;
+            if (plane[i].dcdy > 0) plane[i].c++;
          }
          else {
             /* correct for bottom-left fill convention:
              */
-            if (plane->dcdy < 0) plane->c++;
+            if (plane[i].dcdy < 0) plane[i].c++;
          }
       }
 
-      plane->dcdx *= FIXED_ONE;
-      plane->dcdy *= FIXED_ONE;
+      plane[i].dcdx *= FIXED_ONE;
+      plane[i].dcdy *= FIXED_ONE;
 
       /* find trivial reject offsets for each edge for a single-pixel
        * sized block.  These will be scaled up at each recursive level to
        * match the active blocksize.  Scaling in this way works best if
        * the blocks are square.
        */
-      plane->eo = 0;
-      if (plane->dcdx < 0) plane->eo -= plane->dcdx;
-      if (plane->dcdy > 0) plane->eo += plane->dcdy;
-
-      /* Calculate trivial accept offsets from the above.
-       */
-      plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+      plane[i].eo = 0;
+      if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
+      if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
    }
 
 
@@ -677,29 +680,25 @@ try_setup_line( struct lp_setup_context *setup,
     * these planes elsewhere.
     */
    if (nr_planes == 8) {
-      line->plane[4].dcdx = -1;
-      line->plane[4].dcdy = 0;
-      line->plane[4].c = 1-bbox.x0;
-      line->plane[4].ei = 0;
-      line->plane[4].eo = 1;
-
-      line->plane[5].dcdx = 1;
-      line->plane[5].dcdy = 0;
-      line->plane[5].c = bbox.x1+1;
-      line->plane[5].ei = -1;
-      line->plane[5].eo = 0;
-
-      line->plane[6].dcdx = 0;
-      line->plane[6].dcdy = 1;
-      line->plane[6].c = 1-bbox.y0;
-      line->plane[6].ei = 0;
-      line->plane[6].eo = 1;
-
-      line->plane[7].dcdx = 0;
-      line->plane[7].dcdy = -1;
-      line->plane[7].c = bbox.y1+1;
-      line->plane[7].ei = -1;
-      line->plane[7].eo = 0;
+      plane[4].dcdx = -1;
+      plane[4].dcdy = 0;
+      plane[4].c = 1-bbox.x0;
+      plane[4].eo = 1;
+
+      plane[5].dcdx = 1;
+      plane[5].dcdy = 0;
+      plane[5].c = bbox.x1+1;
+      plane[5].eo = 0;
+
+      plane[6].dcdx = 0;
+      plane[6].dcdy = 1;
+      plane[6].c = 1-bbox.y0;
+      plane[6].eo = 1;
+
+      plane[7].dcdx = 0;
+      plane[7].dcdy = -1;
+      plane[7].c = bbox.y1+1;
+      plane[7].eo = 0;
    }
 
    return lp_setup_bin_triangle(setup, line, &bbox, nr_planes);
@@ -712,10 +711,11 @@ static void lp_setup_line( struct lp_setup_context *setup,
 {
    if (!try_setup_line( setup, v0, v1 ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
 
       if (!try_setup_line( setup, v0, v1 ))
-         assert(0);
+         return;
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 3b217f95447..146f1bd07ca 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -35,6 +35,7 @@
 #include "lp_perf.h"
 #include "lp_rast.h"
 #include "lp_state_fs.h"
+#include "lp_state_setup.h"
 #include "tgsi/tgsi_scan.h"
 
 #define NUM_CHANNELS 4
@@ -45,6 +46,10 @@ struct point_info {
    int dx01, dx12;
 
    const float (*v0)[4];
+
+   float (*a0)[4];
+   float (*dadx)[4];
+   float (*dady)[4];
 };   
 
 
@@ -53,14 +58,37 @@ struct point_info {
  */
 static void
 constant_coef(struct lp_setup_context *setup,
-              struct lp_rast_triangle *point,
+              struct point_info *info,
               unsigned slot,
               const float value,
               unsigned i)
 {
-   point->inputs.a0[slot][i] = value;
-   point->inputs.dadx[slot][i] = 0.0f;
-   point->inputs.dady[slot][i] = 0.0f;
+   info->a0[slot][i] = value;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
+}
+
+
+static void
+point_persp_coeff(struct lp_setup_context *setup,
+                  const struct point_info *info,
+                  unsigned slot,
+                  unsigned i)
+{
+   /*
+    * Fragment shader expects pre-multiplied w for LP_INTERP_PERSPECTIVE. A
+    * better stratergy would be to take the primitive in consideration when
+    * generating the fragment shader key, and therefore avoid the per-fragment
+    * perspective divide.
+    */
+
+   float w0 = info->v0[0][3];
+
+   assert(i < 4);
+
+   info->a0[slot][i] = info->v0[slot][i]*w0;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
 }
 
 
@@ -69,17 +97,19 @@ constant_coef(struct lp_setup_context *setup,
  * \param slot  the vertex attribute slot to setup
  * \param i  the attribute channel in [0,3]
  * \param sprite_coord_origin  one of PIPE_SPRITE_COORD_x
- * \param perspective_proj  will the TEX instruction do a divide by Q?
+ * \param perspective  does the shader expects pre-multiplied w, i.e.,
+ *    LP_INTERP_PERSPECTIVE is specified in the shader key
  */
 static void
 texcoord_coef(struct lp_setup_context *setup,
-              struct lp_rast_triangle *point,
               const struct point_info *info,
               unsigned slot,
               unsigned i,
               unsigned sprite_coord_origin,
-              boolean perspective_proj)
+              boolean perspective)
 {
+   float w0 = info->v0[0][3];
+
    assert(i < 4);
 
    if (i == 0) {
@@ -88,21 +118,14 @@ texcoord_coef(struct lp_setup_context *setup,
       float x0 = info->v0[0][0] - setup->pixel_offset;
       float y0 = info->v0[0][1] - setup->pixel_offset;
 
-      point->inputs.dadx[slot][0] = dadx;
-      point->inputs.dady[slot][0] = dady;
-      point->inputs.a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
+      info->dadx[slot][0] = dadx;
+      info->dady[slot][0] = dady;
+      info->a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
 
-      if (!perspective_proj) {
-         /* Divide coefficients by vertex.w here.
-          *
-          * It would be clearer to always multiply by w0 above and
-          * then divide it out for perspective projection here, but
-          * doing it this way involves less algebra.
-          */
-         float w0 = info->v0[0][3];
-         point->inputs.dadx[slot][0] *= w0;
-         point->inputs.dady[slot][0] *= w0;
-         point->inputs.a0[slot][0] *= w0;
+      if (perspective) {
+         info->dadx[slot][0] *= w0;
+         info->dady[slot][0] *= w0;
+         info->a0[slot][0] *= w0;
       }
    }
    else if (i == 1) {
@@ -115,26 +138,25 @@ texcoord_coef(struct lp_setup_context *setup,
          dady = -dady;
       }
 
-      point->inputs.dadx[slot][1] = dadx;
-      point->inputs.dady[slot][1] = dady;
-      point->inputs.a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
+      info->dadx[slot][1] = dadx;
+      info->dady[slot][1] = dady;
+      info->a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
 
-      if (!perspective_proj) {
-         float w0 = info->v0[0][3];
-         point->inputs.dadx[slot][1] *= w0;
-         point->inputs.dady[slot][1] *= w0;
-         point->inputs.a0[slot][1] *= w0;
+      if (perspective) {
+         info->dadx[slot][1] *= w0;
+         info->dady[slot][1] *= w0;
+         info->a0[slot][1] *= w0;
       }
    }
    else if (i == 2) {
-      point->inputs.a0[slot][2] = 0.0f;
-      point->inputs.dadx[slot][2] = 0.0f;
-      point->inputs.dady[slot][2] = 0.0f;
+      info->a0[slot][2] = 0.0f;
+      info->dadx[slot][2] = 0.0f;
+      info->dady[slot][2] = 0.0f;
    }
    else {
-      point->inputs.a0[slot][3] = 1.0f;
-      point->inputs.dadx[slot][3] = 0.0f;
-      point->inputs.dady[slot][3] = 0.0f;
+      info->a0[slot][3] = perspective ? w0 : 1.0f;
+      info->dadx[slot][3] = 0.0f;
+      info->dady[slot][3] = 0.0f;
    }
 }
 
@@ -147,33 +169,32 @@ texcoord_coef(struct lp_setup_context *setup,
  */
 static void
 setup_point_fragcoord_coef(struct lp_setup_context *setup,
-                           struct lp_rast_triangle *point,
-                           const struct point_info *info,
+                           struct point_info *info,
                            unsigned slot,
                            unsigned usage_mask)
 {
    /*X*/
    if (usage_mask & TGSI_WRITEMASK_X) {
-      point->inputs.a0[slot][0] = 0.0;
-      point->inputs.dadx[slot][0] = 1.0;
-      point->inputs.dady[slot][0] = 0.0;
+      info->a0[slot][0] = 0.0;
+      info->dadx[slot][0] = 1.0;
+      info->dady[slot][0] = 0.0;
    }
 
    /*Y*/
    if (usage_mask & TGSI_WRITEMASK_Y) {
-      point->inputs.a0[slot][1] = 0.0;
-      point->inputs.dadx[slot][1] = 0.0;
-      point->inputs.dady[slot][1] = 1.0;
+      info->a0[slot][1] = 0.0;
+      info->dadx[slot][1] = 0.0;
+      info->dady[slot][1] = 1.0;
    }
 
    /*Z*/
    if (usage_mask & TGSI_WRITEMASK_Z) {
-      constant_coef(setup, point, slot, info->v0[0][2], 2);
+      constant_coef(setup, info, slot, info->v0[0][2], 2);
    }
 
    /*W*/
    if (usage_mask & TGSI_WRITEMASK_W) {
-      constant_coef(setup, point, slot, info->v0[0][3], 3);
+      constant_coef(setup, info, slot, info->v0[0][3], 3);
    }
 }
 
@@ -183,21 +204,27 @@ setup_point_fragcoord_coef(struct lp_setup_context *setup,
  */
 static void   
 setup_point_coefficients( struct lp_setup_context *setup,
-                          struct lp_rast_triangle *point,
-                          const struct point_info *info)
+                          struct point_info *info)
 {
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    const struct lp_fragment_shader *shader = setup->fs.current.variant->shader;
    unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
    unsigned slot;
 
    /* setup interpolation for all the remaining attributes:
     */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      unsigned vert_attr = key->inputs[slot].src_index;
+      unsigned usage_mask = key->inputs[slot].usage_mask;
+      enum lp_interp interp = key->inputs[slot].interp;
+      boolean perspective = !!(interp == LP_INTERP_PERSPECTIVE);
       unsigned i;
+
+      if (perspective & usage_mask) {
+         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+      }
       
-      switch (setup->fs.input[slot].interp) {
+      switch (interp) {
       case LP_INTERP_POSITION:
          /*
           * The generated pixel interpolators will pick up the coeffs from
@@ -210,39 +237,45 @@ setup_point_coefficients( struct lp_setup_context *setup,
       case LP_INTERP_LINEAR:
          /* Sprite tex coords may use linear interpolation someday */
          /* fall-through */
-
       case LP_INTERP_PERSPECTIVE:
          /* check if the sprite coord flag is set for this attribute.
           * If so, set it up so it up so x and y vary from 0 to 1.
           */
-         if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
-            const int index = shader->info.input_semantic_index[slot];
+         if (shader->info.base.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
+            unsigned semantic_index = shader->info.base.input_semantic_index[slot];
             /* Note that sprite_coord enable is a bitfield of
              * PIPE_MAX_SHADER_OUTPUTS bits.
              */
-            if (index < PIPE_MAX_SHADER_OUTPUTS &&
-                (setup->sprite_coord_enable & (1 << index))) {
-               for (i = 0; i < NUM_CHANNELS; i++)
-                  if (usage_mask & (1 << i))
-                     texcoord_coef(setup, point, info, slot + 1, i,
+            if (semantic_index < PIPE_MAX_SHADER_OUTPUTS &&
+                (setup->sprite_coord_enable & (1 << semantic_index))) {
+               for (i = 0; i < NUM_CHANNELS; i++) {
+                  if (usage_mask & (1 << i)) {
+                     texcoord_coef(setup, info, slot + 1, i,
                                    setup->sprite_coord_origin,
-                                   (usage_mask & TGSI_WRITEMASK_W));
-               fragcoord_usage_mask |= TGSI_WRITEMASK_W;
-               break;                     
+                                   perspective);
+                  }
+               }
+               break;
             }
          }
-         /* FALLTHROUGH */
+         /* fall-through */
       case LP_INTERP_CONSTANT:
          for (i = 0; i < NUM_CHANNELS; i++) {
-            if (usage_mask & (1 << i))
-               constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i);
+            if (usage_mask & (1 << i)) {
+               if (perspective) {
+                  point_persp_coeff(setup, info, slot+1, i);
+               }
+               else {
+                  constant_coef(setup, info, slot+1, info->v0[vert_attr][i], i);
+               }
+            }
          }
          break;
 
       case LP_INTERP_FACING:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               constant_coef(setup, point, slot+1, 1.0, i);
+               constant_coef(setup, info, slot+1, 1.0, i);
          break;
 
       default:
@@ -253,7 +286,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
 
    /* The internal position input is in slot zero:
     */
-   setup_point_fragcoord_coef(setup, point, info, 0,
+   setup_point_fragcoord_coef(setup, info, 0,
                               fragcoord_usage_mask);
 }
 
@@ -270,6 +303,7 @@ try_setup_point( struct lp_setup_context *setup,
                  const float (*v0)[4] )
 {
    /* x/y positions in fixed point */
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    const int sizeAttr = setup->psize;
    const float size
       = (setup->point_size_per_vertex && sizeAttr > 0) ? v0[sizeAttr][0]
@@ -321,7 +355,7 @@ try_setup_point( struct lp_setup_context *setup,
    u_rect_find_intersection(&setup->draw_region, &bbox);
 
    point = lp_setup_alloc_triangle(scene,
-                                   setup->fs.nr_inputs,
+                                   key->num_inputs,
                                    nr_planes,
                                    &bytes);
    if (!point)
@@ -337,40 +371,40 @@ try_setup_point( struct lp_setup_context *setup,
    info.dx12 = fixed_width;
    info.dy01 = fixed_width;
    info.dy12 = 0;
+   info.a0 = GET_A0(&point->inputs);
+   info.dadx = GET_DADX(&point->inputs);
+   info.dady = GET_DADY(&point->inputs);
    
    /* Setup parameter interpolants:
     */
-   setup_point_coefficients(setup, point, &info);
+   setup_point_coefficients(setup, &info);
 
-   point->inputs.facing = 1.0F;
-   point->inputs.state = setup->fs.stored;
+   point->inputs.frontfacing = TRUE;
    point->inputs.disable = FALSE;
    point->inputs.opaque = FALSE;
 
    {
-      point->plane[0].dcdx = -1;
-      point->plane[0].dcdy = 0;
-      point->plane[0].c = 1-bbox.x0;
-      point->plane[0].ei = 0;
-      point->plane[0].eo = 1;
-
-      point->plane[1].dcdx = 1;
-      point->plane[1].dcdy = 0;
-      point->plane[1].c = bbox.x1+1;
-      point->plane[1].ei = -1;
-      point->plane[1].eo = 0;
-
-      point->plane[2].dcdx = 0;
-      point->plane[2].dcdy = 1;
-      point->plane[2].c = 1-bbox.y0;
-      point->plane[2].ei = 0;
-      point->plane[2].eo = 1;
-
-      point->plane[3].dcdx = 0;
-      point->plane[3].dcdy = -1;
-      point->plane[3].c = bbox.y1+1;
-      point->plane[3].ei = -1;
-      point->plane[3].eo = 0;
+      struct lp_rast_plane *plane = GET_PLANES(point);
+
+      plane[0].dcdx = -1;
+      plane[0].dcdy = 0;
+      plane[0].c = 1-bbox.x0;
+      plane[0].eo = 1;
+
+      plane[1].dcdx = 1;
+      plane[1].dcdy = 0;
+      plane[1].c = bbox.x1+1;
+      plane[1].eo = 0;
+
+      plane[2].dcdx = 0;
+      plane[2].dcdy = 1;
+      plane[2].c = 1-bbox.y0;
+      plane[2].eo = 1;
+
+      plane[3].dcdx = 0;
+      plane[3].dcdy = -1;
+      plane[3].c = bbox.y1+1;
+      plane[3].eo = 0;
    }
 
    return lp_setup_bin_triangle(setup, point, &bbox, nr_planes);
@@ -383,10 +417,11 @@ lp_setup_point(struct lp_setup_context *setup,
 {
    if (!try_setup_point( setup, v0 ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
 
       if (!try_setup_point( setup, v0 ))
-         assert(0);
+         return;
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 9016bb8e249..4ab0b72a574 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -32,15 +32,18 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_rect.h"
+#include "util/u_sse.h"
 #include "lp_perf.h"
 #include "lp_setup_context.h"
-#include "lp_setup_coef.h"
 #include "lp_rast.h"
 #include "lp_state_fs.h"
+#include "lp_state_setup.h"
 
 #define NUM_CHANNELS 4
 
-
+#if defined(PIPE_ARCH_SSE)
+#include <emmintrin.h>
+#endif
    
 static INLINE int
 subpixel_snap(float a)
@@ -65,7 +68,7 @@ fixed_to_float(int a)
  * immediately after it.
  * The memory is allocated from the per-scene pool, not per-tile.
  * \param tri_size  returns number of bytes allocated
- * \param nr_inputs  number of fragment shader inputs
+ * \param num_inputs  number of fragment shader inputs
  * \return pointer to triangle space
  */
 struct lp_rast_triangle *
@@ -75,22 +78,23 @@ lp_setup_alloc_triangle(struct lp_scene *scene,
                         unsigned *tri_size)
 {
    unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
+   unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane);
    struct lp_rast_triangle *tri;
-   unsigned tri_bytes, bytes;
-   char *inputs;
 
-   tri_bytes = align(Offset(struct lp_rast_triangle, plane[nr_planes]), 16);
-   bytes = tri_bytes + (3 * input_array_sz);
+   *tri_size = (sizeof(struct lp_rast_triangle) +
+                3 * input_array_sz +
+                plane_sz);
 
-   tri = lp_scene_alloc_aligned( scene, bytes, 16 );
+   tri = lp_scene_alloc_aligned( scene, *tri_size, 16 );
+   if (tri == NULL)
+      return NULL;
 
-   if (tri) {
-      inputs = ((char *)tri) + tri_bytes;
-      tri->inputs.a0   = (float (*)[4]) inputs;
-      tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz);
-      tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz);
+   tri->inputs.stride = input_array_sz;
 
-      *tri_size = bytes;
+   {
+      char *a = (char *)tri;
+      char *b = (char *)&GET_PLANES(tri)[nr_planes];
+      assert(b - a == *tri_size);
    }
 
    return tri;
@@ -101,25 +105,26 @@ lp_setup_print_vertex(struct lp_setup_context *setup,
                       const char *name,
                       const float (*v)[4])
 {
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    int i, j;
 
    debug_printf("   wpos (%s[0]) xyzw %f %f %f %f\n",
                 name,
                 v[0][0], v[0][1], v[0][2], v[0][3]);
 
-   for (i = 0; i < setup->fs.nr_inputs; i++) {
-      const float *in = v[setup->fs.input[i].src_index];
+   for (i = 0; i < key->num_inputs; i++) {
+      const float *in = v[key->inputs[i].src_index];
 
       debug_printf("  in[%d] (%s[%d]) %s%s%s%s ",
                    i, 
-                   name, setup->fs.input[i].src_index,
-                   (setup->fs.input[i].usage_mask & 0x1) ? "x" : " ",
-                   (setup->fs.input[i].usage_mask & 0x2) ? "y" : " ",
-                   (setup->fs.input[i].usage_mask & 0x4) ? "z" : " ",
-                   (setup->fs.input[i].usage_mask & 0x8) ? "w" : " ");
+                   name, key->inputs[i].src_index,
+                   (key->inputs[i].usage_mask & 0x1) ? "x" : " ",
+                   (key->inputs[i].usage_mask & 0x2) ? "y" : " ",
+                   (key->inputs[i].usage_mask & 0x4) ? "z" : " ",
+                   (key->inputs[i].usage_mask & 0x8) ? "w" : " ");
 
       for (j = 0; j < 4; j++)
-         if (setup->fs.input[i].usage_mask & (1<<j))
+         if (key->inputs[i].usage_mask & (1<<j))
             debug_printf("%.5f ", in[j]);
 
       debug_printf("\n");
@@ -200,14 +205,16 @@ lp_setup_whole_tile(struct lp_setup_context *setup,
       }
 
       LP_COUNT(nr_shade_opaque_64);
-      return lp_scene_bin_command( scene, tx, ty,
-                                   LP_RAST_OP_SHADE_TILE_OPAQUE,
-                                   lp_rast_arg_inputs(inputs) );
+      return lp_scene_bin_cmd_with_state( scene, tx, ty,
+                                          setup->fs.stored,
+                                          LP_RAST_OP_SHADE_TILE_OPAQUE,
+                                          lp_rast_arg_inputs(inputs) );
    } else {
       LP_COUNT(nr_shade_64);
-      return lp_scene_bin_command( scene, tx, ty,
-                                   LP_RAST_OP_SHADE_TILE,
-                                   lp_rast_arg_inputs(inputs) );
+      return lp_scene_bin_cmd_with_state( scene, tx, ty,
+                                          setup->fs.stored, 
+                                          LP_RAST_OP_SHADE_TILE,
+                                          lp_rast_arg_inputs(inputs) );
    }
 }
 
@@ -225,13 +232,13 @@ do_triangle_ccw(struct lp_setup_context *setup,
 		boolean frontfacing )
 {
    struct lp_scene *scene = setup->scene;
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    struct lp_rast_triangle *tri;
-   int x[3];
-   int y[3];
-   int area;
+   struct lp_rast_plane *plane;
+   int x[4];
+   int y[4];
    struct u_rect bbox;
    unsigned tri_bytes;
-   int i;
    int nr_planes = 3;
 
    if (0)
@@ -248,10 +255,12 @@ do_triangle_ccw(struct lp_setup_context *setup,
    x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
    x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
    x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
+   x[3] = 0;
    y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
    y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
    y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
-
+   y[3] = 0;
+   
 
    /* Bounding rectangle (in pixels) */
    {
@@ -289,13 +298,13 @@ do_triangle_ccw(struct lp_setup_context *setup,
    u_rect_find_intersection(&setup->draw_region, &bbox);
 
    tri = lp_setup_alloc_triangle(scene,
-                                 setup->fs.nr_inputs,
+                                 key->num_inputs,
                                  nr_planes,
                                  &tri_bytes);
    if (!tri)
       return FALSE;
 
-#ifdef DEBUG
+#if 0
    tri->v[0][0] = v0[0][0];
    tri->v[1][0] = v1[0][0];
    tri->v[2][0] = v2[0][0];
@@ -304,92 +313,172 @@ do_triangle_ccw(struct lp_setup_context *setup,
    tri->v[2][1] = v2[0][1];
 #endif
 
-   tri->plane[0].dcdy = x[0] - x[1];
-   tri->plane[1].dcdy = x[1] - x[2];
-   tri->plane[2].dcdy = x[2] - x[0];
-
-   tri->plane[0].dcdx = y[0] - y[1];
-   tri->plane[1].dcdx = y[1] - y[2];
-   tri->plane[2].dcdx = y[2] - y[0];
-
-   area = (tri->plane[0].dcdy * tri->plane[2].dcdx -
-           tri->plane[2].dcdy * tri->plane[0].dcdx);
-
    LP_COUNT(nr_tris);
 
-   /* Cull non-ccw and zero-sized triangles. 
-    *
-    * XXX: subject to overflow??
-    */
-   if (area <= 0) {
-      lp_scene_putback_data( scene, tri_bytes );
-      LP_COUNT(nr_culled_tris);
-      return TRUE;
-   }
-
    /* Setup parameter interpolants:
     */
-   lp_setup_tri_coef( setup, &tri->inputs, v0, v1, v2, frontfacing );
-
-   tri->inputs.facing = frontfacing ? 1.0F : -1.0F;
+   setup->setup.variant->jit_function( v0,
+				       v1,
+				       v2,
+				       frontfacing,
+				       GET_A0(&tri->inputs),
+				       GET_DADX(&tri->inputs),
+				       GET_DADY(&tri->inputs) );
+
+   tri->inputs.frontfacing = frontfacing;
    tri->inputs.disable = FALSE;
    tri->inputs.opaque = setup->fs.current.variant->opaque;
-   tri->inputs.state = setup->fs.stored;
 
-  
-   for (i = 0; i < 3; i++) {
-      struct lp_rast_plane *plane = &tri->plane[i];
+   if (0)
+      lp_dump_setup_coef(&setup->setup.variant->key,
+			 (const float (*)[4])GET_A0(&tri->inputs),
+			 (const float (*)[4])GET_DADX(&tri->inputs),
+			 (const float (*)[4])GET_DADY(&tri->inputs));
+
+   plane = GET_PLANES(tri);
+
+#if defined(PIPE_ARCH_SSE)
+   {
+      __m128i vertx, verty;
+      __m128i shufx, shufy;
+      __m128i dcdx, dcdy, c;
+      __m128i unused;
+      __m128i dcdx_neg_mask;
+      __m128i dcdy_neg_mask;
+      __m128i dcdx_zero_mask;
+      __m128i top_left_flag;
+      __m128i c_inc_mask, c_inc;
+      __m128i eo, p0, p1, p2;
+      __m128i zero = _mm_setzero_si128();
+
+      vertx = _mm_loadu_si128((__m128i *)x); /* vertex x coords */
+      verty = _mm_loadu_si128((__m128i *)y); /* vertex y coords */
+
+      shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
+      shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
+
+      dcdx = _mm_sub_epi32(verty, shufy);
+      dcdy = _mm_sub_epi32(vertx, shufx);
+
+      dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
+      dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero);
+      dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
+
+      top_left_flag = _mm_set1_epi32((setup->pixel_offset == 0) ? ~0 : 0);
 
-      /* half-edge constants, will be interated over the whole render
-       * target.
+      c_inc_mask = _mm_or_si128(dcdx_neg_mask,
+                                _mm_and_si128(dcdx_zero_mask,
+                                              _mm_xor_si128(dcdy_neg_mask,
+                                                            top_left_flag)));
+
+      c_inc = _mm_srli_epi32(c_inc_mask, 31);
+
+      c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
+                        mm_mullo_epi32(dcdy, verty));
+
+      c = _mm_add_epi32(c, c_inc);
+
+      /* Scale up to match c:
        */
-      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
-
-      /* correct for top-left vs. bottom-left fill convention.  
-       *
-       * note that we're overloading gl_rasterization_rules to mean
-       * both (0.5,0.5) pixel centers *and* bottom-left filling
-       * convention.
-       *
-       * GL actually has a top-left filling convention, but GL's
-       * notion of "top" differs from gallium's...
-       *
-       * Also, sometimes (in FBO cases) GL will render upside down
-       * to its usual method, in which case it will probably want
-       * to use the opposite, top-left convention.
-       */         
-      if (plane->dcdx < 0) {
-         /* both fill conventions want this - adjust for left edges */
-         plane->c++;            
-      }
-      else if (plane->dcdx == 0) {
-         if (setup->pixel_offset == 0) {
-            /* correct for top-left fill convention:
-             */
-            if (plane->dcdy > 0) plane->c++;
+      dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
+      dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);
+
+      /* Calculate trivial reject values:
+       */
+      eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
+                         _mm_and_si128(dcdx_neg_mask, dcdx));
+
+      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
+      /* Pointless transpose which gets undone immediately in
+       * rasterization:
+       */
+      transpose4_epi32(&c, &dcdx, &dcdy, &eo,
+                       &p0, &p1, &p2, &unused);
+
+      _mm_store_si128((__m128i *)&plane[0], p0);
+      _mm_store_si128((__m128i *)&plane[1], p1);
+      _mm_store_si128((__m128i *)&plane[2], p2);
+   }
+#else
+   {
+      int i;
+      plane[0].dcdy = x[0] - x[1];
+      plane[1].dcdy = x[1] - x[2];
+      plane[2].dcdy = x[2] - x[0];
+      plane[0].dcdx = y[0] - y[1];
+      plane[1].dcdx = y[1] - y[2];
+      plane[2].dcdx = y[2] - y[0];
+  
+      for (i = 0; i < 3; i++) {
+         /* half-edge constants, will be interated over the whole render
+          * target.
+          */
+         plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
+
+         /* correct for top-left vs. bottom-left fill convention.  
+          *
+          * note that we're overloading gl_rasterization_rules to mean
+          * both (0.5,0.5) pixel centers *and* bottom-left filling
+          * convention.
+          *
+          * GL actually has a top-left filling convention, but GL's
+          * notion of "top" differs from gallium's...
+          *
+          * Also, sometimes (in FBO cases) GL will render upside down
+          * to its usual method, in which case it will probably want
+          * to use the opposite, top-left convention.
+          */         
+         if (plane[i].dcdx < 0) {
+            /* both fill conventions want this - adjust for left edges */
+            plane[i].c++;            
          }
-         else {
-            /* correct for bottom-left fill convention:
-             */
-            if (plane->dcdy < 0) plane->c++;
+         else if (plane[i].dcdx == 0) {
+            if (setup->pixel_offset == 0) {
+               /* correct for top-left fill convention:
+                */
+               if (plane[i].dcdy > 0) plane[i].c++;
+            }
+            else {
+               /* correct for bottom-left fill convention:
+                */
+               if (plane[i].dcdy < 0) plane[i].c++;
+            }
          }
-      }
 
-      plane->dcdx *= FIXED_ONE;
-      plane->dcdy *= FIXED_ONE;
+         plane[i].dcdx *= FIXED_ONE;
+         plane[i].dcdy *= FIXED_ONE;
 
-      /* find trivial reject offsets for each edge for a single-pixel
-       * sized block.  These will be scaled up at each recursive level to
-       * match the active blocksize.  Scaling in this way works best if
-       * the blocks are square.
-       */
-      plane->eo = 0;
-      if (plane->dcdx < 0) plane->eo -= plane->dcdx;
-      if (plane->dcdy > 0) plane->eo += plane->dcdy;
+         /* find trivial reject offsets for each edge for a single-pixel
+          * sized block.  These will be scaled up at each recursive level to
+          * match the active blocksize.  Scaling in this way works best if
+          * the blocks are square.
+          */
+         plane[i].eo = 0;
+         if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
+         if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
+      }
+   }
+#endif
 
-      /* Calculate trivial accept offsets from the above.
-       */
-      plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+   if (0) {
+      debug_printf("p0: %08x/%08x/%08x/%08x\n",
+                   plane[0].c,
+                   plane[0].dcdx,
+                   plane[0].dcdy,
+                   plane[0].eo);
+      
+      debug_printf("p1: %08x/%08x/%08x/%08x\n",
+                   plane[1].c,
+                   plane[1].dcdx,
+                   plane[1].dcdy,
+                   plane[1].eo);
+      
+      debug_printf("p0: %08x/%08x/%08x/%08x\n",
+                   plane[2].c,
+                   plane[2].dcdx,
+                   plane[2].dcdy,
+                   plane[2].eo);
    }
 
 
@@ -412,29 +501,25 @@ do_triangle_ccw(struct lp_setup_context *setup,
     * these planes elsewhere.
     */
    if (nr_planes == 7) {
-      tri->plane[3].dcdx = -1;
-      tri->plane[3].dcdy = 0;
-      tri->plane[3].c = 1-bbox.x0;
-      tri->plane[3].ei = 0;
-      tri->plane[3].eo = 1;
-
-      tri->plane[4].dcdx = 1;
-      tri->plane[4].dcdy = 0;
-      tri->plane[4].c = bbox.x1+1;
-      tri->plane[4].ei = -1;
-      tri->plane[4].eo = 0;
-
-      tri->plane[5].dcdx = 0;
-      tri->plane[5].dcdy = 1;
-      tri->plane[5].c = 1-bbox.y0;
-      tri->plane[5].ei = 0;
-      tri->plane[5].eo = 1;
-
-      tri->plane[6].dcdx = 0;
-      tri->plane[6].dcdy = -1;
-      tri->plane[6].c = bbox.y1+1;
-      tri->plane[6].ei = -1;
-      tri->plane[6].eo = 0;
+      plane[3].dcdx = -1;
+      plane[3].dcdy = 0;
+      plane[3].c = 1-bbox.x0;
+      plane[3].eo = 1;
+
+      plane[4].dcdx = 1;
+      plane[4].dcdy = 0;
+      plane[4].c = bbox.x1+1;
+      plane[4].eo = 0;
+
+      plane[5].dcdx = 0;
+      plane[5].dcdy = 1;
+      plane[5].c = 1-bbox.y0;
+      plane[5].eo = 1;
+
+      plane[6].dcdx = 0;
+      plane[6].dcdy = -1;
+      plane[6].c = bbox.y1+1;
+      plane[6].eo = 0;
    }
 
    return lp_setup_bin_triangle( setup, tri, &bbox, nr_planes );
@@ -487,51 +572,58 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    int sz = floor_pot((bbox->x1 - (bbox->x0 & ~3)) |
 		      (bbox->y1 - (bbox->y0 & ~3)));
 
-   if (nr_planes == 3) {
-      if (sz < 4 && dx < 64)
-      {
-	 /* Triangle is contained in a single 4x4 stamp:
-	  */
-	 int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8);
-
-	 return lp_scene_bin_command( scene,
-				      bbox->x0/64, bbox->y0/64,
-				      LP_RAST_OP_TRIANGLE_3_4,
-				      lp_rast_arg_triangle(tri, mask) );
-      }
-
-      if (sz < 16 && dx < 64)
-      {
-	 int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8);
-
-	 /* Triangle is contained in a single 16x16 block:
-	  */
-	 return lp_scene_bin_command( scene,
-				      bbox->x0/64, bbox->y0/64,
-                                      LP_RAST_OP_TRIANGLE_3_16,
-                                      lp_rast_arg_triangle(tri, mask) );
-      }
-   }
-
-
    /* Determine which tile(s) intersect the triangle's bounding box
     */
    if (dx < TILE_SIZE)
    {
       int ix0 = bbox->x0 / TILE_SIZE;
       int iy0 = bbox->y0 / TILE_SIZE;
+      int px = bbox->x0 & 63 & ~3;
+      int py = bbox->y0 & 63 & ~3;
+      int mask = px | (py << 8);
 
       assert(iy0 == bbox->y1 / TILE_SIZE &&
 	     ix0 == bbox->x1 / TILE_SIZE);
 
+      if (nr_planes == 3) {
+         if (sz < 4)
+         {
+            /* Triangle is contained in a single 4x4 stamp:
+             */
+            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
+                                                setup->fs.stored,
+                                                LP_RAST_OP_TRIANGLE_3_4,
+                                                lp_rast_arg_triangle(tri, mask) );
+         }
+
+         if (sz < 16)
+         {
+            /* Triangle is contained in a single 16x16 block:
+             */
+            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
+                                                setup->fs.stored,
+                                                LP_RAST_OP_TRIANGLE_3_16,
+                                                lp_rast_arg_triangle(tri, mask) );
+         }
+      }
+      else if (nr_planes == 4 && sz < 16) 
+      {
+         return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
+                                            setup->fs.stored,
+                                            LP_RAST_OP_TRIANGLE_4_16,
+                                            lp_rast_arg_triangle(tri, mask) );
+      }
+
+
       /* Triangle is contained in a single tile:
        */
-      return lp_scene_bin_command( scene, ix0, iy0,
-                                   lp_rast_tri_tab[nr_planes], 
-                                   lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
+      return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored,
+                                          lp_rast_tri_tab[nr_planes], 
+                                          lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
    }
    else
    {
+      struct lp_rast_plane *plane = GET_PLANES(tri);
       int c[MAX_PLANES];
       int ei[MAX_PLANES];
       int eo[MAX_PLANES];
@@ -545,14 +637,17 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
       int iy1 = bbox->y1 / TILE_SIZE;
       
       for (i = 0; i < nr_planes; i++) {
-         c[i] = (tri->plane[i].c + 
-                 tri->plane[i].dcdy * iy0 * TILE_SIZE - 
-                 tri->plane[i].dcdx * ix0 * TILE_SIZE);
-
-         ei[i] = tri->plane[i].ei << TILE_ORDER;
-         eo[i] = tri->plane[i].eo << TILE_ORDER;
-         xstep[i] = -(tri->plane[i].dcdx << TILE_ORDER);
-         ystep[i] = tri->plane[i].dcdy << TILE_ORDER;
+         c[i] = (plane[i].c + 
+                 plane[i].dcdy * iy0 * TILE_SIZE - 
+                 plane[i].dcdx * ix0 * TILE_SIZE);
+
+         ei[i] = (plane[i].dcdy - 
+                  plane[i].dcdx - 
+                  plane[i].eo) << TILE_ORDER;
+
+         eo[i] = plane[i].eo << TILE_ORDER;
+         xstep[i] = -(plane[i].dcdx << TILE_ORDER);
+         ystep[i] = plane[i].dcdy << TILE_ORDER;
       }
 
 
@@ -594,9 +689,11 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                 */
                int count = util_bitcount(partial);
                in = TRUE;
-               if (!lp_scene_bin_command( scene, x, y,
-                                          lp_rast_tri_tab[count], 
-                                          lp_rast_arg_triangle(tri, partial) ))
+               
+               if (!lp_scene_bin_cmd_with_state( scene, x, y,
+                                                 setup->fs.stored,
+                                                 lp_rast_tri_tab[count], 
+                                                 lp_rast_arg_triangle(tri, partial) ))
                   goto fail;
 
                LP_COUNT(nr_partially_covered_64);
@@ -635,40 +732,62 @@ fail:
 
 
 /**
- * Draw triangle if it's CW, cull otherwise.
+ * Try to draw the triangle, restart the scene on failure.
  */
-static void triangle_cw( struct lp_setup_context *setup,
-			 const float (*v0)[4],
-			 const float (*v1)[4],
-			 const float (*v2)[4] )
+static void retry_triangle_ccw( struct lp_setup_context *setup,
+                                const float (*v0)[4],
+                                const float (*v1)[4],
+                                const float (*v2)[4],
+                                boolean front)
 {
-   if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
+   if (!do_triangle_ccw( setup, v0, v1, v2, front ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
 
-      if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
-         assert(0);
+      if (!do_triangle_ccw( setup, v0, v1, v2, front ))
+         return;
    }
 }
 
+static INLINE float
+calc_area(const float (*v0)[4],
+          const float (*v1)[4],
+          const float (*v2)[4])
+{
+   float dx01 = v0[0][0] - v1[0][0];
+   float dy01 = v0[0][1] - v1[0][1];
+   float dx20 = v2[0][0] - v0[0][0];
+   float dy20 = v2[0][1] - v0[0][1];
+   return dx01 * dy20 - dx20 * dy01;
+}
+
 
 /**
- * Draw triangle if it's CCW, cull otherwise.
+ * Draw triangle if it's CW, cull otherwise.
  */
-static void triangle_ccw( struct lp_setup_context *setup,
+static void triangle_cw( struct lp_setup_context *setup,
 			 const float (*v0)[4],
 			 const float (*v1)[4],
 			 const float (*v2)[4] )
 {
-   if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
-   {
-      lp_setup_flush_and_restart(setup);
-      if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
-         assert(0);
-   }
+   float area = calc_area(v0, v1, v2);
+
+   if (area < 0.0f) 
+      retry_triangle_ccw(setup, v0, v2, v1, !setup->ccw_is_frontface);
 }
 
 
+static void triangle_ccw( struct lp_setup_context *setup,
+                          const float (*v0)[4],
+                          const float (*v1)[4],
+                          const float (*v2)[4])
+{
+   float area = calc_area(v0, v1, v2);
+
+   if (area > 0.0f) 
+      retry_triangle_ccw(setup, v0, v1, v2, setup->ccw_is_frontface);
+}
 
 /**
  * Draw triangle whether it's CW or CCW.
@@ -678,18 +797,12 @@ static void triangle_both( struct lp_setup_context *setup,
 			   const float (*v1)[4],
 			   const float (*v2)[4] )
 {
-   /* edge vectors e = v0 - v2, f = v1 - v2 */
-   const float ex = v0[0][0] - v2[0][0];
-   const float ey = v0[0][1] - v2[0][1];
-   const float fx = v1[0][0] - v2[0][0];
-   const float fy = v1[0][1] - v2[0][1];
-
-   /* det = cross(e,f).z */
-   const float det = ex * fy - ey * fx;
-   if (det < 0.0f) 
-      triangle_ccw( setup, v0, v1, v2 );
-   else if (det > 0.0f)
-      triangle_cw( setup, v0, v1, v2 );
+   float area = calc_area(v0, v1, v2);
+
+   if (area > 0.0f) 
+      retry_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface );
+   else if (area < 0.0f)
+      retry_triangle_ccw( setup, v0, v2, v1, !setup->ccw_is_frontface );
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
index 6308561f242..9c1f0fe7939 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -141,7 +141,8 @@ lp_setup_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
    const boolean flatshade_first = setup->flatshade_first;
    unsigned i;
 
-   lp_setup_update_state(setup, TRUE);
+   if (!lp_setup_update_state(setup, TRUE))
+      return;
 
    switch (setup->prim) {
    case PIPE_PRIM_POINTS:
@@ -338,7 +339,8 @@ lp_setup_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
    const boolean flatshade_first = setup->flatshade_first;
    unsigned i;
 
-   lp_setup_update_state(setup, TRUE);
+   if (!lp_setup_update_state(setup, TRUE))
+      return;
 
    switch (setup->prim) {
    case PIPE_PRIM_POINTS:
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 86313e1c484..7893e9cdc0c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -97,6 +97,9 @@ llvmpipe_set_framebuffer_state(struct pipe_context *,
 void
 llvmpipe_update_fs(struct llvmpipe_context *lp);
 
+void 
+llvmpipe_update_setup(struct llvmpipe_context *lp);
+
 void
 llvmpipe_update_derived(struct llvmpipe_context *llvmpipe);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index bb059d04599..0f5f7369e04 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -50,12 +50,13 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 {
    const struct lp_fragment_shader *lpfs = llvmpipe->fs;
    struct vertex_info *vinfo = &llvmpipe->vertex_info;
-   struct lp_shader_input *inputs = llvmpipe->inputs;
    unsigned vs_index;
    uint i;
 
    /*
-    * Match FS inputs against VS outputs, emitting the necessary attributes.
+    * Match FS inputs against VS outputs, emitting the necessary
+    * attributes.  Could cache these structs and look them up with a
+    * combination of fragment shader, vertex shader ids.
     */
 
    vinfo->num_attribs = 0;
@@ -66,72 +67,18 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 
    draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
 
-   for (i = 0; i < lpfs->info.num_inputs; i++) {
+   for (i = 0; i < lpfs->info.base.num_inputs; i++) {
       /*
        * Search for each input in current vs output:
        */
 
       vs_index = draw_find_shader_output(llvmpipe->draw,
-                                         lpfs->info.input_semantic_name[i],
-                                         lpfs->info.input_semantic_index[i]);
-      if (vs_index < 0) {
-         /*
-          * This can happen with sprite coordinates - the vertex
-          * shader doesn't need to provide an output as we generate
-          * them internally.  However, lets keep pretending that there
-          * is something there to not confuse other code.
-          */
-         vs_index = 0;
-      }
-
-      /* This can be pre-computed, except for flatshade:
-       */
-      inputs[i].usage_mask = lpfs->info.input_usage_mask[i];
-
-      switch (lpfs->info.input_interpolate[i]) {
-      case TGSI_INTERPOLATE_CONSTANT:
-         inputs[i].interp = LP_INTERP_CONSTANT;
-         break;
-      case TGSI_INTERPOLATE_LINEAR:
-         inputs[i].interp = LP_INTERP_LINEAR;
-         break;
-      case TGSI_INTERPOLATE_PERSPECTIVE:
-         inputs[i].interp = LP_INTERP_PERSPECTIVE;
-         break;
-      default:
-         assert(0);
-         break;
-      }
-
-      switch (lpfs->info.input_semantic_name[i]) {
-      case TGSI_SEMANTIC_FACE:
-         inputs[i].interp = LP_INTERP_FACING;
-         break;
-      case TGSI_SEMANTIC_POSITION:
-         /* Position was already emitted above
-          */
-         inputs[i].interp = LP_INTERP_POSITION;
-         inputs[i].src_index = 0;
-         continue;
-      case TGSI_SEMANTIC_COLOR:
-         /* Colors are linearly inputs[i].interpolated in the fragment shader
-          * even when flatshading is active.  This just tells the
-          * setup module to use coefficients with ddx==0 and
-          * ddy==0.
-          */
-         if (llvmpipe->rasterizer->flatshade)
-            inputs[i].interp = LP_INTERP_CONSTANT;
-         break;
-
-      default:
-         break;
-      }
+                                         lpfs->info.base.input_semantic_name[i],
+                                         lpfs->info.base.input_semantic_index[i]);
 
       /*
        * Emit the requested fs attribute for all but position.
        */
-
-      inputs[i].src_index = vinfo->num_attribs;
       draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
    }
 
@@ -145,15 +92,8 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
    }
 
-   llvmpipe->num_inputs = lpfs->info.num_inputs;
-
    draw_compute_vertex_size(vinfo);
-
    lp_setup_set_vertex_info(llvmpipe->setup, vinfo);
-
-   lp_setup_set_fs_inputs(llvmpipe->setup,
-                          inputs,
-                          lpfs->info.num_inputs);
 }
 
 
@@ -190,6 +130,10 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
                           LP_NEW_QUERY))
       llvmpipe_update_fs( llvmpipe );
 
+   if (llvmpipe->dirty & (LP_NEW_FS |
+			  LP_NEW_RASTERIZER))
+      llvmpipe_update_setup( llvmpipe );
+
    if (llvmpipe->dirty & LP_NEW_BLEND_COLOR)
       lp_setup_set_blend_color(llvmpipe->setup,
                                &llvmpipe->blend_color);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index f0a15e11b9b..9fbedac165f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -99,74 +99,12 @@
 
 
 #include <llvm-c/Analysis.h>
+#include <llvm-c/BitWriter.h>
 
 
 static unsigned fs_no = 0;
 
 
-/**
- * Generate the depth /stencil test code.
- */
-static void
-generate_depth_stencil(LLVMBuilderRef builder,
-                       const struct lp_fragment_shader_variant_key *key,
-                       struct lp_type src_type,
-                       struct lp_build_mask_context *mask,
-                       LLVMValueRef stencil_refs[2],
-                       LLVMValueRef src,
-                       LLVMValueRef dst_ptr,
-                       LLVMValueRef facing,
-                       LLVMValueRef counter)
-{
-   const struct util_format_description *format_desc;
-   struct lp_type dst_type;
-
-   if (!key->depth.enabled && !key->stencil[0].enabled && !key->stencil[1].enabled)
-      return;
-
-   format_desc = util_format_description(key->zsbuf_format);
-   assert(format_desc);
-
-   /*
-    * Depths are expected to be between 0 and 1, even if they are stored in
-    * floats. Setting these bits here will ensure that the lp_build_conv() call
-    * below won't try to unnecessarily clamp the incoming values.
-    */
-   if(src_type.floating) {
-      src_type.sign = FALSE;
-      src_type.norm = TRUE;
-   }
-   else {
-      assert(!src_type.sign);
-      assert(src_type.norm);
-   }
-
-   /* Pick the depth type. */
-   dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
-
-   /* FIXME: Cope with a depth test type with a different bit width. */
-   assert(dst_type.width == src_type.width);
-   assert(dst_type.length == src_type.length);
-
-   /* Convert fragment Z from float to integer */
-   lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1);
-
-   dst_ptr = LLVMBuildBitCast(builder,
-                              dst_ptr,
-                              LLVMPointerType(lp_build_vec_type(dst_type), 0), "");
-   lp_build_depth_stencil_test(builder,
-                               &key->depth,
-                               key->stencil,
-                               dst_type,
-                               format_desc,
-                               mask,
-                               stencil_refs,
-                               src,
-                               dst_ptr,
-                               facing,
-                               counter);
-}
-
 
 /**
  * Expand the relevent bits of mask_input to a 4-dword mask for the 
@@ -248,6 +186,26 @@ generate_quad_mask(LLVMBuilderRef builder,
 }
 
 
+#define EARLY_DEPTH_TEST  0x1
+#define LATE_DEPTH_TEST   0x2
+#define EARLY_DEPTH_WRITE 0x4
+#define LATE_DEPTH_WRITE  0x8
+
+static int
+find_output_by_semantic( const struct tgsi_shader_info *info,
+			 unsigned semantic,
+			 unsigned index )
+{
+   int i;
+
+   for (i = 0; i < info->num_outputs; i++)
+      if (info->output_semantic_name[i] == semantic &&
+	  info->output_semantic_index[i] == index)
+	 return i;
+
+   return -1;
+}
+
 
 /**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
@@ -255,14 +213,13 @@ generate_quad_mask(LLVMBuilderRef builder,
  * \param partial_mask  if 1, do mask_input testing
  */
 static void
-generate_fs(struct llvmpipe_context *lp,
-            struct lp_fragment_shader *shader,
+generate_fs(struct lp_fragment_shader *shader,
             const struct lp_fragment_shader_variant_key *key,
             LLVMBuilderRef builder,
             struct lp_type type,
             LLVMValueRef context_ptr,
             unsigned i,
-            const struct lp_build_interp_soa_context *interp,
+            struct lp_build_interp_soa_context *interp,
             struct lp_build_sampler_soa *sampler,
             LLVMValueRef *pmask,
             LLVMValueRef (*color)[4],
@@ -272,18 +229,52 @@ generate_fs(struct llvmpipe_context *lp,
             LLVMValueRef mask_input,
             LLVMValueRef counter)
 {
+   const struct util_format_description *zs_format_desc = NULL;
    const struct tgsi_token *tokens = shader->base.tokens;
    LLVMTypeRef vec_type;
    LLVMValueRef consts_ptr;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
-   LLVMValueRef z = interp->pos[2];
+   LLVMValueRef z;
+   LLVMValueRef zs_value = NULL;
    LLVMValueRef stencil_refs[2];
-   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
-   boolean early_depth_stencil_test;
+   boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
+                            shader->info.base.num_inputs < 3 &&
+                            shader->info.base.num_instructions < 8);
    unsigned attrib;
    unsigned chan;
    unsigned cbuf;
+   unsigned depth_mode;
+
+   if (key->depth.enabled ||
+       key->stencil[0].enabled ||
+       key->stencil[1].enabled) {
+
+      zs_format_desc = util_format_description(key->zsbuf_format);
+      assert(zs_format_desc);
+
+      if (!shader->info.base.writes_z) {
+         if (key->alpha.enabled || shader->info.base.uses_kill)
+            /* With alpha test and kill, can do the depth test early
+             * and hopefully eliminate some quads.  But need to do a
+             * special deferred depth write once the final mask value
+             * is known.
+             */
+            depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
+         else
+            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
+      }
+      else {
+         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
+      }
+
+      if (!(key->depth.enabled && key->depth.writemask) &&
+          !(key->stencil[0].enabled && key->stencil[0].writemask))
+         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
+   }
+   else {
+      depth_mode = 0;
+   }
 
    assert(i < 4);
 
@@ -294,20 +285,14 @@ generate_fs(struct llvmpipe_context *lp,
 
    consts_ptr = lp_jit_context_constants(builder, context_ptr);
 
-   flow = lp_build_flow_create(builder);
-
    memset(outputs, 0, sizeof outputs);
 
-   lp_build_flow_scope_begin(flow);
-
    /* Declare the color and z variables */
    for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-	 color[cbuf][chan] = LLVMGetUndef(vec_type);
-	 lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
+	 color[cbuf][chan] = lp_build_alloca(builder, vec_type, "color");
       }
    }
-   lp_build_flow_scope_declare(flow, &z);
 
    /* do triangle edge testing */
    if (partial_mask) {
@@ -319,74 +304,126 @@ generate_fs(struct llvmpipe_context *lp,
    }
 
    /* 'mask' will control execution based on quad's pixel alive/killed state */
-   lp_build_mask_begin(&mask, flow, type, *pmask);
-
-   early_depth_stencil_test =
-      (key->depth.enabled || key->stencil[0].enabled) &&
-      !key->alpha.enabled &&
-      !shader->info.uses_kill &&
-      !shader->info.writes_z;
-
-   if (early_depth_stencil_test)
-      generate_depth_stencil(builder, key,
-                             type, &mask,
-                             stencil_refs, z, depth_ptr, facing, counter);
+   lp_build_mask_begin(&mask, builder, type, *pmask);
+
+   if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
+      lp_build_mask_check(&mask);
+
+   lp_build_interp_soa_update_pos(interp, i);
+   z = interp->pos[2];
+
+   if (depth_mode & EARLY_DEPTH_TEST) {
+      lp_build_depth_stencil_test(builder,
+                                  &key->depth,
+                                  key->stencil,
+                                  type,
+                                  zs_format_desc,
+                                  &mask,
+                                  stencil_refs,
+                                  z,
+                                  depth_ptr, facing,
+                                  &zs_value,
+                                  !simple_shader);
+
+      if (depth_mode & EARLY_DEPTH_WRITE) {
+         lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+      }
+   }
 
+   lp_build_interp_soa_update_inputs(interp, i);
+   
+   /* Build the actual shader */
    lp_build_tgsi_soa(builder, tokens, type, &mask,
                      consts_ptr, interp->pos, interp->inputs,
-                     outputs, sampler, &shader->info);
+                     outputs, sampler, &shader->info.base);
 
-   /* loop over fragment shader outputs/results */
-   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
-      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-         if(outputs[attrib][chan]) {
-            LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
-            lp_build_name(out, "output%u.%u.%c", i, attrib, "xyzw"[chan]);
-
-            switch (shader->info.output_semantic_name[attrib]) {
-            case TGSI_SEMANTIC_COLOR:
-               {
-                  unsigned cbuf = shader->info.output_semantic_index[attrib];
-
-                  lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
-
-                  /* Alpha test */
-		  /* XXX: should only test the final assignment to alpha */
-                  if (cbuf == 0 && chan == 3 && key->alpha.enabled) {
-                     LLVMValueRef alpha = out;
-                     LLVMValueRef alpha_ref_value;
-                     alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
-                     alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
-                     lp_build_alpha_test(builder, key->alpha.func, type,
-                                         &mask, alpha, alpha_ref_value);
-                  }
-
-		  color[cbuf][chan] = out;
-                  break;
-               }
-
-            case TGSI_SEMANTIC_POSITION:
-               if(chan == 2)
-                  z = out;
-               break;
-            }
-         }
+
+   /* Alpha test */
+   if (key->alpha.enabled) {
+      int color0 = find_output_by_semantic(&shader->info.base,
+                                           TGSI_SEMANTIC_COLOR,
+                                           0);
+
+      if (color0 != -1 && outputs[color0][3]) {
+         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
+         LLVMValueRef alpha_ref_value;
+
+         alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
+         alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
+
+         lp_build_alpha_test(builder, key->alpha.func, type,
+                             &mask, alpha, alpha_ref_value,
+                             (depth_mode & LATE_DEPTH_TEST) != 0);
       }
    }
 
-   if (!early_depth_stencil_test)
-      generate_depth_stencil(builder, key,
-                             type, &mask,
-                             stencil_refs, z, depth_ptr, facing, counter);
+   /* Late Z test */
+   if (depth_mode & LATE_DEPTH_TEST) { 
+      int pos0 = find_output_by_semantic(&shader->info.base,
+                                         TGSI_SEMANTIC_POSITION,
+                                         0);
+         
+      if (pos0 != -1 && outputs[pos0][2]) {
+         z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
+      }
 
-   lp_build_mask_end(&mask);
+      lp_build_depth_stencil_test(builder,
+                                  &key->depth,
+                                  key->stencil,
+                                  type,
+                                  zs_format_desc,
+                                  &mask,
+                                  stencil_refs,
+                                  z,
+                                  depth_ptr, facing,
+                                  &zs_value,
+                                  !simple_shader);
+      /* Late Z write */
+      if (depth_mode & LATE_DEPTH_WRITE) {
+         lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+      }
+   }
+   else if ((depth_mode & EARLY_DEPTH_TEST) &&
+            (depth_mode & LATE_DEPTH_WRITE))
+   {
+      /* Need to apply a reduced mask to the depth write.  Reload the
+       * depth value, update from zs_value with the new mask value and
+       * write that out.
+       */
+      lp_build_deferred_depth_write(builder,
+                                    type,
+                                    zs_format_desc,
+                                    &mask,
+                                    depth_ptr,
+                                    zs_value);
+   }
 
-   lp_build_flow_scope_end(flow);
 
-   lp_build_flow_destroy(flow);
+   /* Color write  */
+   for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
+   {
+      if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR &&
+          shader->info.base.output_semantic_index[attrib] < key->nr_cbufs)
+      {
+         unsigned cbuf = shader->info.base.output_semantic_index[attrib];
+         for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+            if(outputs[attrib][chan]) {
+               /* XXX: just initialize outputs to point at colors[] and
+                * skip this.
+                */
+               LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
+               lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
+               LLVMBuildStore(builder, out, color[cbuf][chan]);
+            }
+         }
+      }
+   }
 
-   *pmask = mask.value;
+   if (counter)
+      lp_build_occlusion_count(builder, type,
+                               lp_build_mask_value(&mask), counter);
 
+   *pmask = lp_build_mask_end(&mask);
 }
 
 
@@ -407,10 +444,10 @@ generate_blend(const struct pipe_blend_state *blend,
                LLVMValueRef context_ptr,
                LLVMValueRef mask,
                LLVMValueRef *src,
-               LLVMValueRef dst_ptr)
+               LLVMValueRef dst_ptr,
+               boolean do_branch)
 {
    struct lp_build_context bld;
-   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask_ctx;
    LLVMTypeRef vec_type;
    LLVMValueRef const_ptr;
@@ -421,10 +458,9 @@ generate_blend(const struct pipe_blend_state *blend,
 
    lp_build_context_init(&bld, builder, type);
 
-   flow = lp_build_flow_create(builder);
-
-   /* we'll use this mask context to skip blending if all pixels are dead */
-   lp_build_mask_begin(&mask_ctx, flow, type, mask);
+   lp_build_mask_begin(&mask_ctx, builder, type, mask);
+   if (do_branch)
+      lp_build_mask_check(&mask_ctx);
 
    vec_type = lp_build_vec_type(type);
 
@@ -457,7 +493,6 @@ generate_blend(const struct pipe_blend_state *blend,
    }
 
    lp_build_mask_end(&mask_ctx);
-   lp_build_flow_destroy(flow);
 }
 
 
@@ -468,13 +503,13 @@ generate_blend(const struct pipe_blend_state *blend,
  * 2x2 pixels.
  */
 static void
-generate_fragment(struct llvmpipe_context *lp,
+generate_fragment(struct llvmpipe_screen *screen,
                   struct lp_fragment_shader *shader,
                   struct lp_fragment_shader_variant *variant,
                   unsigned partial_mask)
 {
-   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
    const struct lp_fragment_shader_variant_key *key = &variant->key;
+   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
    char func_name[256];
    struct lp_type fs_type;
    struct lp_type blend_type;
@@ -502,11 +537,24 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMValueRef blend_mask;
    LLVMValueRef function;
    LLVMValueRef facing;
+   const struct util_format_description *zs_format_desc;
    unsigned num_fs;
    unsigned i;
    unsigned chan;
    unsigned cbuf;
 
+   /* Adjust color input interpolation according to flatshade state:
+    */
+   memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]);
+   for (i = 0; i < shader->info.base.num_inputs; i++) {
+      if (inputs[i].interp == LP_INTERP_COLOR) {
+	 if (key->flatshade)
+	    inputs[i].interp = LP_INTERP_CONSTANT;
+	 else
+	    inputs[i].interp = LP_INTERP_LINEAR;
+      }
+   }
+
 
    /* TODO: actually pick these based on the fs and color buffer
     * characteristics. */
@@ -542,12 +590,12 @@ generate_fragment(struct llvmpipe_context *lp,
    arg_types[0] = screen->context_ptr_type;            /* context */
    arg_types[1] = LLVMInt32Type();                     /* x */
    arg_types[2] = LLVMInt32Type();                     /* y */
-   arg_types[3] = LLVMFloatType();                     /* facing */
+   arg_types[3] = LLVMInt32Type();                     /* facing */
    arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
    arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
-   arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+   arg_types[8] = LLVMPointerType(LLVMInt8Type(), 0);  /* depth */
    arg_types[9] = LLVMInt32Type();                     /* mask_input */
    arg_types[10] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */
 
@@ -558,7 +606,6 @@ generate_fragment(struct llvmpipe_context *lp,
 
    variant->function[partial_mask] = function;
 
-
    /* XXX: need to propagate noalias down into color param now we are
     * passing a pointer-to-pointer?
     */
@@ -606,8 +653,8 @@ generate_fragment(struct llvmpipe_context *lp,
     * already included in the shader key.
     */
    lp_build_interp_soa_init(&interp, 
-                            lp->num_inputs,
-                            lp->inputs,
+                            shader->info.base.num_inputs,
+                            inputs,
                             builder, fs_type,
                             a0_ptr, dadx_ptr, dady_ptr,
                             x, y);
@@ -616,17 +663,18 @@ generate_fragment(struct llvmpipe_context *lp,
    sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
 
    /* loop over quads in the block */
+   zs_format_desc = util_format_description(key->zsbuf_format);
+
    for(i = 0; i < num_fs; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef depth_offset = LLVMConstInt(LLVMInt32Type(),
+                                               i*fs_type.length*zs_format_desc->block.bits/8,
+                                               0);
       LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS];
       LLVMValueRef depth_ptr_i;
 
-      if(i != 0)
-         lp_build_interp_soa_update(&interp, i);
+      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &depth_offset, 1, "");
 
-      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
-
-      generate_fs(lp, shader, key,
+      generate_fs(shader, key,
                   builder,
                   fs_type,
                   context_ptr,
@@ -660,9 +708,18 @@ generate_fragment(struct llvmpipe_context *lp,
        * Convert the fs's output color and mask to fit to the blending type. 
        */
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH];
+         
+         for (i = 0; i < num_fs; i++) {
+            fs_color_vals[i] =
+               LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
+         }
+
 	 lp_build_conv(builder, fs_type, blend_type,
-		       fs_out_color[cbuf][chan], num_fs,
+                       fs_color_vals,
+                       num_fs,
 		       &blend_in_color[chan], 1);
+
 	 lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
       }
 
@@ -685,14 +742,23 @@ generate_fragment(struct llvmpipe_context *lp,
       /*
        * Blending.
        */
-      generate_blend(&key->blend,
-                     rt,
-		     builder,
-		     blend_type,
-		     context_ptr,
-		     blend_mask,
-		     blend_in_color,
-		     color_ptr);
+      {
+         /* Could the 4x4 have been killed?
+          */
+         boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) &&
+                              !key->alpha.enabled &&
+                              !shader->info.base.uses_kill);
+
+         generate_blend(&key->blend,
+                        rt,
+                        builder,
+                        blend_type,
+                        context_ptr,
+                        blend_mask,
+                        blend_in_color,
+                        color_ptr,
+                        do_branch);
+      }
    }
 
 #ifdef PIPE_ARCH_X86
@@ -717,12 +783,17 @@ generate_fragment(struct llvmpipe_context *lp,
    /* Apply optimizations to LLVM IR */
    LLVMRunFunctionPassManager(screen->pass, function);
 
-   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+   if ((gallivm_debug & GALLIVM_DEBUG_IR) || (LP_DEBUG & DEBUG_FS)) {
       /* Print the LLVM IR to stderr */
       lp_debug_dump_value(function);
       debug_printf("\n");
    }
 
+   /* Dump byte code to a file */
+   if (0) {
+      LLVMWriteBitcodeToFile(lp_build_module, "llvmpipe.bc");
+   }
+
    /*
     * Translate the LLVM IR into machine code.
     */
@@ -731,7 +802,7 @@ generate_fragment(struct llvmpipe_context *lp,
 
       variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f);
 
-      if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+      if ((gallivm_debug & GALLIVM_DEBUG_ASM) || (LP_DEBUG & DEBUG_FS)) {
          lp_disassemble(f);
       }
       lp_func_delete_body(function);
@@ -746,6 +817,9 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
 
    debug_printf("fs variant %p:\n", (void *) key);
 
+   if (key->flatshade) {
+      debug_printf("flatshade = 1\n");
+   }
    for (i = 0; i < key->nr_cbufs; ++i) {
       debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
    }
@@ -770,6 +844,10 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
       debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
    }
 
+   if (key->occlusion_count) {
+      debug_printf("occlusion_count = 1\n");
+   }
+
    if (key->blend.logicop_enable) {
       debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
    }
@@ -782,31 +860,33 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
       debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
    }
    debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
-   for (i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
-      if (key->sampler[i].format) {
-         debug_printf("sampler[%u] = \n", i);
-         debug_printf("  .format = %s\n",
-                      util_format_name(key->sampler[i].format));
-         debug_printf("  .target = %s\n",
-                      util_dump_tex_target(key->sampler[i].target, TRUE));
-         debug_printf("  .pot = %u %u %u\n",
-                      key->sampler[i].pot_width,
-                      key->sampler[i].pot_height,
-                      key->sampler[i].pot_depth);
-         debug_printf("  .wrap = %s %s %s\n",
-                      util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
-                      util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
-                      util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
-         debug_printf("  .min_img_filter = %s\n",
-                      util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
-         debug_printf("  .min_mip_filter = %s\n",
-                      util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
-         debug_printf("  .mag_img_filter = %s\n",
-                      util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
-         if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
-            debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
-         debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
-      }
+   for (i = 0; i < key->nr_samplers; ++i) {
+      debug_printf("sampler[%u] = \n", i);
+      debug_printf("  .format = %s\n",
+                   util_format_name(key->sampler[i].format));
+      debug_printf("  .target = %s\n",
+                   util_dump_tex_target(key->sampler[i].target, TRUE));
+      debug_printf("  .pot = %u %u %u\n",
+                   key->sampler[i].pot_width,
+                   key->sampler[i].pot_height,
+                   key->sampler[i].pot_depth);
+      debug_printf("  .wrap = %s %s %s\n",
+                   util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+                   util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+                   util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+      debug_printf("  .min_img_filter = %s\n",
+                   util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+      debug_printf("  .min_mip_filter = %s\n",
+                   util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+      debug_printf("  .mag_img_filter = %s\n",
+                   util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+      if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+         debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
+      debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+      debug_printf("  .min_max_lod_equal = %u\n", key->sampler[i].min_max_lod_equal);
+      debug_printf("  .lod_bias_non_zero = %u\n", key->sampler[i].lod_bias_non_zero);
+      debug_printf("  .apply_min_lod = %u\n", key->sampler[i].apply_min_lod);
+      debug_printf("  .apply_max_lod = %u\n", key->sampler[i].apply_max_lod);
    }
 }
 
@@ -823,7 +903,7 @@ lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant)
 }
 
 static struct lp_fragment_shader_variant *
-generate_variant(struct llvmpipe_context *lp,
+generate_variant(struct llvmpipe_screen *screen,
                  struct lp_fragment_shader *shader,
                  const struct lp_fragment_shader_variant_key *key)
 {
@@ -861,7 +941,7 @@ generate_variant(struct llvmpipe_context *lp,
          !key->stencil[0].enabled &&
          !key->alpha.enabled &&
          !key->depth.enabled &&
-         !shader->info.uses_kill
+         !shader->info.base.uses_kill
          ? TRUE : FALSE;
 
 
@@ -869,11 +949,11 @@ generate_variant(struct llvmpipe_context *lp,
       lp_debug_fs_variant(variant);
    }
 
-   generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
+   generate_fragment(screen, shader, variant, RAST_EDGE_TEST);
 
    if (variant->opaque) {
       /* Specialized shader, which doesn't need to read the color buffer. */
-      generate_fragment(lp, shader, variant, RAST_WHOLE);
+      generate_fragment(screen, shader, variant, RAST_WHOLE);
    } else {
       variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
    }
@@ -889,6 +969,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
    struct lp_fragment_shader *shader;
    int nr_samplers;
+   int i;
 
    shader = CALLOC_STRUCT(lp_fragment_shader);
    if (!shader)
@@ -898,7 +979,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
    make_empty_list(&shader->variants);
 
    /* get/save the summary info for this shader */
-   tgsi_scan_shader(templ->tokens, &shader->info);
+   lp_build_tgsi_info(templ->tokens, &shader->info);
 
    /* we need to keep a local copy of the tokens */
    shader->base.tokens = tgsi_dup_tokens(templ->tokens);
@@ -910,18 +991,58 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
       return NULL;
    }
 
-   nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+   nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
 
    shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
 				     sampler[nr_samplers]);
 
+   for (i = 0; i < shader->info.base.num_inputs; i++) {
+      shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i];
+
+      switch (shader->info.base.input_interpolate[i]) {
+      case TGSI_INTERPOLATE_CONSTANT:
+	 shader->inputs[i].interp = LP_INTERP_CONSTANT;
+	 break;
+      case TGSI_INTERPOLATE_LINEAR:
+	 shader->inputs[i].interp = LP_INTERP_LINEAR;
+	 break;
+      case TGSI_INTERPOLATE_PERSPECTIVE:
+	 shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
+	 break;
+      default:
+	 assert(0);
+	 break;
+      }
+
+      switch (shader->info.base.input_semantic_name[i]) {
+      case TGSI_SEMANTIC_COLOR:
+         /* Colors may be either linearly or constant interpolated in
+	  * the fragment shader, but that information isn't available
+	  * here.  Mark color inputs and fix them up later.
+          */
+	 shader->inputs[i].interp = LP_INTERP_COLOR;
+         break;
+      case TGSI_SEMANTIC_FACE:
+	 shader->inputs[i].interp = LP_INTERP_FACING;
+	 break;
+      case TGSI_SEMANTIC_POSITION:
+	 /* Position was already emitted above
+	  */
+	 shader->inputs[i].interp = LP_INTERP_POSITION;
+	 shader->inputs[i].src_index = 0;
+	 continue;
+      }
+
+      shader->inputs[i].src_index = i+1;
+   }
+
    if (LP_DEBUG & DEBUG_TGSI) {
       unsigned attrib;
       debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader);
       tgsi_dump(templ->tokens, 0);
       debug_printf("usage masks:\n");
-      for (attrib = 0; attrib < shader->info.num_inputs; ++attrib) {
-         unsigned usage_mask = shader->info.input_usage_mask[attrib];
+      for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) {
+         unsigned usage_mask = shader->info.base.input_usage_mask[attrib];
          debug_printf("  IN[%u].%s%s%s%s\n",
                       attrib,
                       usage_mask & TGSI_WRITEMASK_X ? "x" : "",
@@ -1150,10 +1271,10 @@ make_variant_key(struct llvmpipe_context *lp,
 
    /* This value will be the same for all the variants of a given shader:
     */
-   key->nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+   key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
 
    for(i = 0; i < key->nr_samplers; ++i) {
-      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
+      if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
          lp_sampler_static_state(&key->sampler[i],
 				 lp->fragment_sampler_views[i],
 				 lp->sampler[i]);
@@ -1168,6 +1289,7 @@ make_variant_key(struct llvmpipe_context *lp,
 void 
 llvmpipe_update_fs(struct llvmpipe_context *lp)
 {
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
    struct lp_fragment_shader *shader = lp->fs;
    struct lp_fragment_shader_variant_key key;
    struct lp_fragment_shader_variant *variant = NULL;
@@ -1208,7 +1330,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
       }
       t0 = os_time_get();
 
-      variant = generate_variant(lp, shader, &key);
+      variant = generate_variant(screen, shader, &key);
 
       t1 = os_time_get();
       dt = t1 - t0;
@@ -1228,6 +1350,10 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
 
 
 
+
+
+
+
 void
 llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 4999b8dca1a..7d58c4936c7 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -34,6 +34,8 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h" /* for tgsi_shader_info */
 #include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */
+#include "gallivm/lp_bld_tgsi.h" /* for lp_tgsi_info */
+#include "lp_bld_interp.h" /* for struct lp_shader_input */
 
 
 struct tgsi_token;
@@ -96,7 +98,7 @@ struct lp_fragment_shader
 {
    struct pipe_shader_state base;
 
-   struct tgsi_shader_info info;
+   struct lp_tgsi_info info;
 
    struct lp_fs_variant_list_item variants;
 
@@ -107,6 +109,9 @@ struct lp_fragment_shader
    unsigned no;
    unsigned variants_created;
    unsigned variants_cached;
+
+   /** Fragment shader input interpolation info */
+   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 17a4a0ed02d..1dd866195d3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -246,9 +246,9 @@ llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp,
                                  struct pipe_sampler_view **views)
 {
    unsigned i;
-   uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS];
-   uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS];
-   const void *data[DRAW_MAX_TEXTURE_LEVELS];
+   uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS];
+   uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS];
+   const void *data[PIPE_MAX_TEXTURE_LEVELS];
 
    assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
    if (!num)
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
new file mode 100644
index 00000000000..2c8b8b9a928
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -0,0 +1,759 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "os/os_time.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_intr.h"
+#include <llvm-c/Analysis.h>	/* for LLVMVerifyFunction */
+
+#include "lp_perf.h"
+#include "lp_debug.h"
+#include "lp_flush.h"
+#include "lp_screen.h"
+#include "lp_context.h"
+#include "lp_setup_context.h"
+#include "lp_rast.h"
+#include "lp_state.h"
+#include "lp_state_fs.h"
+#include "lp_state_setup.h"
+
+
+
+/* currently organized to interpolate full float[4] attributes even
+ * when some elements are unused.  Later, can pack vertex data more
+ * closely.
+ */
+
+
+struct lp_setup_args
+{
+   /* Function arguments:
+    */
+   LLVMValueRef v0;
+   LLVMValueRef v1;
+   LLVMValueRef v2;
+   LLVMValueRef facing;		/* boolean */
+   LLVMValueRef a0;
+   LLVMValueRef dadx;
+   LLVMValueRef dady;
+
+   /* Derived:
+    */
+   LLVMValueRef x0_center;
+   LLVMValueRef y0_center;
+   LLVMValueRef dy20_ooa;
+   LLVMValueRef dy01_ooa;
+   LLVMValueRef dx20_ooa;
+   LLVMValueRef dx01_ooa;
+};
+
+static LLVMTypeRef type4f(void)
+{
+   return LLVMVectorType(LLVMFloatType(), 4);
+}
+
+
+/* Equivalent of _mm_setr_ps(a,b,c,d)
+ */
+static LLVMValueRef vec4f(LLVMBuilderRef bld,
+			  LLVMValueRef a, LLVMValueRef b, LLVMValueRef c, LLVMValueRef d,
+			  const char *name)
+{
+   LLVMValueRef i0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   LLVMValueRef i1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+   LLVMValueRef i2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+   LLVMValueRef i3 = LLVMConstInt(LLVMInt32Type(), 3, 0);
+
+   LLVMValueRef res = LLVMGetUndef(type4f());
+
+   res = LLVMBuildInsertElement(bld, res, a, i0, "");
+   res = LLVMBuildInsertElement(bld, res, b, i1, "");
+   res = LLVMBuildInsertElement(bld, res, c, i2, "");
+   res = LLVMBuildInsertElement(bld, res, d, i3, name);
+
+   return res;
+}
+
+/* Equivalent of _mm_set1_ps(a)
+ */
+static LLVMValueRef vec4f_from_scalar(LLVMBuilderRef bld,
+				      LLVMValueRef a,
+				      const char *name)
+{
+   LLVMValueRef res = LLVMGetUndef(type4f());
+   int i;
+
+   for(i = 0; i < 4; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      res = LLVMBuildInsertElement(bld, res, a, index, i == 3 ? name : "");
+   }
+
+   return res;
+}
+
+static void
+store_coef(LLVMBuilderRef builder,
+	   struct lp_setup_args *args,
+	   unsigned slot,
+	   LLVMValueRef a0,
+	   LLVMValueRef dadx,
+	   LLVMValueRef dady)
+{
+   LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), slot, 0);
+   
+   LLVMBuildStore(builder,
+		  a0, 
+		  LLVMBuildGEP(builder, args->a0, &idx, 1, ""));
+
+   LLVMBuildStore(builder,
+		  dadx, 
+		  LLVMBuildGEP(builder, args->dadx, &idx, 1, ""));
+
+   LLVMBuildStore(builder,
+		  dady, 
+		  LLVMBuildGEP(builder, args->dady, &idx, 1, ""));
+}
+
+
+
+static void 
+emit_constant_coef4( LLVMBuilderRef builder,
+		     struct lp_setup_args *args,
+		     unsigned slot,
+		     LLVMValueRef vert,
+		     unsigned attr)
+{
+   LLVMValueRef zero      = LLVMConstReal(LLVMFloatType(), 0.0);
+   LLVMValueRef zerovec   = vec4f_from_scalar(builder, zero, "zero");
+   LLVMValueRef idx       = LLVMConstInt(LLVMInt32Type(), attr, 0);
+   LLVMValueRef attr_ptr  = LLVMBuildGEP(builder, vert, &idx, 1, "attr_ptr");
+   LLVMValueRef vert_attr = LLVMBuildLoad(builder, attr_ptr, "vert_attr");
+
+   store_coef(builder, args, slot, vert_attr, zerovec, zerovec);
+}
+
+
+
+/**
+ * Setup the fragment input attribute with the front-facing value.
+ * \param frontface  is the triangle front facing?
+ */
+static void 
+emit_facing_coef( LLVMBuilderRef builder,
+		  struct lp_setup_args *args,
+		  unsigned slot )
+{
+   LLVMValueRef a0_0 = args->facing;
+   LLVMValueRef a0_0f = LLVMBuildSIToFP(builder, a0_0, LLVMFloatType(), "");
+   LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
+   LLVMValueRef a0      = vec4f(builder, a0_0f, zero, zero, zero, "facing");
+   LLVMValueRef zerovec = vec4f_from_scalar(builder, zero, "zero");
+
+   store_coef(builder, args, slot, a0, zerovec, zerovec);
+}
+
+
+static LLVMValueRef
+vert_attrib(LLVMBuilderRef b,
+	    LLVMValueRef vert,
+	    int attr,
+	    int elem,
+	    const char *name)
+{
+   LLVMValueRef idx[2];
+   idx[0] = LLVMConstInt(LLVMInt32Type(), attr, 0);
+   idx[1] = LLVMConstInt(LLVMInt32Type(), elem, 0);
+   return LLVMBuildLoad(b, LLVMBuildGEP(b, vert, idx, 2, ""), name);
+}
+
+
+
+static void 
+emit_coef4( LLVMBuilderRef b,
+	    struct lp_setup_args *args,
+	    unsigned slot,
+	    LLVMValueRef a0,
+	    LLVMValueRef a1,
+	    LLVMValueRef a2)
+{
+   LLVMValueRef dy20_ooa = args->dy20_ooa;
+   LLVMValueRef dy01_ooa = args->dy01_ooa;
+   LLVMValueRef dx20_ooa = args->dx20_ooa;
+   LLVMValueRef dx01_ooa = args->dx01_ooa;
+   LLVMValueRef x0_center = args->x0_center;
+   LLVMValueRef y0_center = args->y0_center;
+
+   /* XXX: using fsub, fmul on vector types -- does this work??
+    */
+   LLVMValueRef da01 = LLVMBuildFSub(b, a0, a1, "da01");
+   LLVMValueRef da20 = LLVMBuildFSub(b, a2, a0, "da20");
+
+   /* Calculate dadx (vec4f)
+    */
+   LLVMValueRef da01_dy20_ooa = LLVMBuildFMul(b, da01, dy20_ooa, "da01_dy20_ooa");
+   LLVMValueRef da20_dy01_ooa = LLVMBuildFMul(b, da20, dy01_ooa, "da20_dy01_ooa");
+   LLVMValueRef dadx          = LLVMBuildFSub(b, da01_dy20_ooa, da20_dy01_ooa, "dadx");
+
+   /* Calculate dady (vec4f)
+    */
+   LLVMValueRef da01_dx20_ooa = LLVMBuildFMul(b, da01, dx20_ooa, "da01_dx20_ooa");
+   LLVMValueRef da20_dx01_ooa = LLVMBuildFMul(b, da20, dx01_ooa, "da20_dx01_ooa");
+   LLVMValueRef dady          = LLVMBuildFSub(b, da20_dx01_ooa, da01_dx20_ooa, "dady");
+
+   /* Calculate a0 - the attribute value at the origin
+    */
+   LLVMValueRef dadx_x0       = LLVMBuildFMul(b, dadx, x0_center, "dadx_x0"); 
+   LLVMValueRef dady_y0       = LLVMBuildFMul(b, dady, y0_center, "dady_y0"); 
+   LLVMValueRef attr_v0       = LLVMBuildFAdd(b, dadx_x0, dady_y0, "attr_v0"); 
+   LLVMValueRef attr_0        = LLVMBuildFSub(b, a0, attr_v0, "attr_0"); 
+
+   store_coef(b, args, slot, attr_0, dadx, dady);
+}
+
+
+static void 
+emit_linear_coef( LLVMBuilderRef b,
+		  struct lp_setup_args *args,
+		  unsigned slot,
+		  unsigned vert_attr)
+{
+   LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0);
+
+   LLVMValueRef a0 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a");
+   LLVMValueRef a1 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a");
+   LLVMValueRef a2 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a");
+
+   emit_coef4(b, args, slot, a0, a1, a2);
+}
+
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void 
+emit_perspective_coef( LLVMBuilderRef b,
+		       struct lp_setup_args *args,
+		       unsigned slot,
+		       unsigned vert_attr)
+{
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0);
+
+   LLVMValueRef v0a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a");
+   LLVMValueRef v1a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a");
+   LLVMValueRef v2a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a");
+
+   LLVMValueRef v0_oow = vec4f_from_scalar(b, vert_attrib(b, args->v0, 0, 3, ""), "v0_oow");
+   LLVMValueRef v1_oow = vec4f_from_scalar(b, vert_attrib(b, args->v1, 0, 3, ""), "v1_oow");
+   LLVMValueRef v2_oow = vec4f_from_scalar(b, vert_attrib(b, args->v2, 0, 3, ""), "v2_oow");
+
+   LLVMValueRef v0_oow_v0a = LLVMBuildFMul(b, v0a, v0_oow, "v0_oow_v0a");
+   LLVMValueRef v1_oow_v1a = LLVMBuildFMul(b, v1a, v1_oow, "v1_oow_v1a");
+   LLVMValueRef v2_oow_v2a = LLVMBuildFMul(b, v2a, v2_oow, "v2_oow_v2a");
+
+   emit_coef4(b, args, slot, v0_oow_v0a, v1_oow_v1a, v2_oow_v2a);
+}
+
+
+static void
+emit_position_coef( LLVMBuilderRef builder,
+		    struct lp_setup_args *args,
+		    int slot, int attrib )
+{
+   emit_linear_coef(builder, args, slot, attrib);
+}
+
+
+
+
+/**
+ * Compute the inputs-> dadx, dady, a0 values.
+ */
+static void 
+emit_tri_coef( LLVMBuilderRef builder,
+	       const struct lp_setup_variant_key *key,
+	       struct lp_setup_args *args )
+{
+   unsigned slot;
+
+   /* The internal position input is in slot zero:
+    */
+   emit_position_coef(builder, args, 0, 0);
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      unsigned vert_attr = key->inputs[slot].src_index;
+
+      switch (key->inputs[slot].interp) {
+      case LP_INTERP_CONSTANT:
+	 if (key->flatshade_first) {
+	    emit_constant_coef4(builder, args, slot+1, args->v0, vert_attr);
+	 }
+	 else {
+	    emit_constant_coef4(builder, args, slot+1, args->v2, vert_attr);
+	 }
+	 break;
+
+      case LP_INTERP_LINEAR:
+	 emit_linear_coef(builder, args, slot+1, vert_attr);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+	 emit_perspective_coef(builder, args, slot+1, vert_attr);
+         break;
+
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0.
+          */
+         break;
+
+      case LP_INTERP_FACING:
+         emit_facing_coef(builder, args, slot+1);
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+}
+
+
+/* XXX: This is generic code, share with fs/vs codegen:
+ */
+static lp_jit_setup_triangle
+finalize_function(struct llvmpipe_screen *screen,
+		  LLVMBuilderRef builder,
+		  LLVMValueRef function)
+{
+   void *f;
+
+   /* Verify the LLVM IR.  If invalid, dump and abort */
+#ifdef DEBUG
+   if (LLVMVerifyFunction(function, LLVMPrintMessageAction)) {
+      if (1)
+         lp_debug_dump_value(function);
+      abort();
+   }
+#endif
+
+   /* Apply optimizations to LLVM IR */
+   LLVMRunFunctionPassManager(screen->pass, function);
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR)
+   {
+      /* Print the LLVM IR to stderr */
+      lp_debug_dump_value(function);
+      debug_printf("\n");
+   }
+
+   /*
+    * Translate the LLVM IR into machine code.
+    */
+   f = LLVMGetPointerToGlobal(screen->engine, function);
+
+   if (gallivm_debug & GALLIVM_DEBUG_ASM)
+   {
+      lp_disassemble(f);
+   }
+
+   lp_func_delete_body(function);
+
+   return f;
+}
+
+/* XXX: Generic code:
+ */
+static void
+lp_emit_emms(LLVMBuilderRef builder)
+{
+#ifdef PIPE_ARCH_X86
+   /* Avoid corrupting the FPU stack on 32bit OSes. */
+   lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0);
+#endif
+}
+
+
+/* XXX: generic code:
+ */
+static void
+set_noalias(LLVMBuilderRef builder,
+	    LLVMValueRef function,
+	    const LLVMTypeRef *arg_types,
+	    int nr_args)
+{
+   int i;
+   for(i = 0; i < Elements(arg_types); ++i)
+      if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
+         LLVMAddAttribute(LLVMGetParam(function, i),
+			  LLVMNoAliasAttribute);
+}
+
+static void
+init_args(LLVMBuilderRef b, 
+	  struct lp_setup_args *args,
+	  const struct lp_setup_variant *variant)
+{
+   LLVMValueRef v0_x = vert_attrib(b, args->v0, 0, 0, "v0_x");
+   LLVMValueRef v0_y = vert_attrib(b, args->v0, 0, 1, "v0_y");
+
+   LLVMValueRef v1_x = vert_attrib(b, args->v1, 0, 0, "v1_x");
+   LLVMValueRef v1_y = vert_attrib(b, args->v1, 0, 1, "v1_y");
+
+   LLVMValueRef v2_x = vert_attrib(b, args->v2, 0, 0, "v2_x");
+   LLVMValueRef v2_y = vert_attrib(b, args->v2, 0, 1, "v2_y");
+
+   LLVMValueRef pixel_center = LLVMConstReal(LLVMFloatType(),
+					     variant->key.pixel_center_half ? 0.5 : 0);
+
+   LLVMValueRef x0_center = LLVMBuildFSub(b, v0_x, pixel_center, "x0_center" );
+   LLVMValueRef y0_center = LLVMBuildFSub(b, v0_y, pixel_center, "y0_center" );
+   
+   LLVMValueRef dx01 = LLVMBuildFSub(b, v0_x, v1_x, "dx01");
+   LLVMValueRef dy01 = LLVMBuildFSub(b, v0_y, v1_y, "dy01");
+   LLVMValueRef dx20 = LLVMBuildFSub(b, v2_x, v0_x, "dx20");
+   LLVMValueRef dy20 = LLVMBuildFSub(b, v2_y, v0_y, "dy20");
+
+   LLVMValueRef one  = LLVMConstReal(LLVMFloatType(), 1.0);
+   LLVMValueRef e    = LLVMBuildFMul(b, dx01, dy20, "e");
+   LLVMValueRef f    = LLVMBuildFMul(b, dx20, dy01, "f");
+   LLVMValueRef ooa  = LLVMBuildFDiv(b, one, LLVMBuildFSub(b, e, f, ""), "ooa");
+
+   LLVMValueRef dy20_ooa = LLVMBuildFMul(b, dy20, ooa, "dy20_ooa");
+   LLVMValueRef dy01_ooa = LLVMBuildFMul(b, dy01, ooa, "dy01_ooa");
+   LLVMValueRef dx20_ooa = LLVMBuildFMul(b, dx20, ooa, "dx20_ooa");
+   LLVMValueRef dx01_ooa = LLVMBuildFMul(b, dx01, ooa, "dx01_ooa");
+
+   args->dy20_ooa  = vec4f_from_scalar(b, dy20_ooa, "dy20_ooa_4f");
+   args->dy01_ooa  = vec4f_from_scalar(b, dy01_ooa, "dy01_ooa_4f");
+
+   args->dx20_ooa  = vec4f_from_scalar(b, dx20_ooa, "dx20_ooa_4f");
+   args->dx01_ooa  = vec4f_from_scalar(b, dx01_ooa, "dx01_ooa_4f");
+
+   args->x0_center = vec4f_from_scalar(b, x0_center, "x0_center_4f");
+   args->y0_center = vec4f_from_scalar(b, y0_center, "y0_center_4f");
+}
+
+/**
+ * Generate the runtime callable function for the coefficient calculation.
+ *
+ */
+static struct lp_setup_variant *
+generate_setup_variant(struct llvmpipe_screen *screen,
+		       struct lp_setup_variant_key *key)
+{
+   struct lp_setup_variant *variant = NULL;
+   struct lp_setup_args args;
+   char func_name[256];
+   LLVMTypeRef vec4f_type;
+   LLVMTypeRef func_type;
+   LLVMTypeRef arg_types[7];
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+   int64_t t0, t1;
+
+   if (0)
+      goto fail;
+
+   variant = CALLOC_STRUCT(lp_setup_variant);
+   if (variant == NULL)
+      goto fail;
+
+   if (LP_DEBUG & DEBUG_COUNTERS) {
+      t0 = os_time_get();
+   }
+
+   memcpy(&variant->key, key, key->size);
+   variant->list_item_global.base = variant;
+
+   util_snprintf(func_name, sizeof(func_name), "fs%u_setup%u",
+		 0,
+		 variant->no);
+
+   /* Currently always deal with full 4-wide vertex attributes from
+    * the vertices.
+    */
+
+   vec4f_type = LLVMVectorType(LLVMFloatType(), 4);
+
+   arg_types[0] = LLVMPointerType(vec4f_type, 0);        /* v0 */
+   arg_types[1] = LLVMPointerType(vec4f_type, 0);        /* v1 */
+   arg_types[2] = LLVMPointerType(vec4f_type, 0);        /* v2 */
+   arg_types[3] = LLVMInt32Type();			/* facing */
+   arg_types[4] = LLVMPointerType(vec4f_type, 0);	/* a0, aligned */
+   arg_types[5] = LLVMPointerType(vec4f_type, 0);	/* dadx, aligned */
+   arg_types[6] = LLVMPointerType(vec4f_type, 0);	/* dady, aligned */
+
+   func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
+
+   variant->function = LLVMAddFunction(screen->module, func_name, func_type);
+   if (!variant->function)
+      goto fail;
+
+   LLVMSetFunctionCallConv(variant->function, LLVMCCallConv);
+
+   args.v0       = LLVMGetParam(variant->function, 0);
+   args.v1       = LLVMGetParam(variant->function, 1);
+   args.v2       = LLVMGetParam(variant->function, 2);
+   args.facing   = LLVMGetParam(variant->function, 3);
+   args.a0       = LLVMGetParam(variant->function, 4);
+   args.dadx     = LLVMGetParam(variant->function, 5);
+   args.dady     = LLVMGetParam(variant->function, 6);
+
+   lp_build_name(args.v0, "in_v0");
+   lp_build_name(args.v1, "in_v1");
+   lp_build_name(args.v2, "in_v2");
+   lp_build_name(args.facing, "in_facing");
+   lp_build_name(args.a0, "out_a0");
+   lp_build_name(args.dadx, "out_dadx");
+   lp_build_name(args.dady, "out_dady");
+
+   /*
+    * Function body
+    */
+   block = LLVMAppendBasicBlock(variant->function, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   set_noalias(builder, variant->function, arg_types, Elements(arg_types));
+   init_args(builder, &args, variant);
+   emit_tri_coef(builder, &variant->key, &args);
+
+   lp_emit_emms(builder);
+   LLVMBuildRetVoid(builder);
+   LLVMDisposeBuilder(builder);
+
+   variant->jit_function = finalize_function(screen, builder,
+					     variant->function);
+   if (!variant->jit_function)
+      goto fail;
+
+   /*
+    * Update timing information:
+    */
+   if (LP_DEBUG & DEBUG_COUNTERS) {
+      t1 = os_time_get();
+      LP_COUNT_ADD(llvm_compile_time, t1 - t0);
+      LP_COUNT_ADD(nr_llvm_compiles, 1);
+   }
+   
+   return variant;
+
+fail:
+   if (variant) {
+      if (variant->function) {
+	 if (variant->jit_function)
+	    LLVMFreeMachineCodeForFunction(screen->engine,
+					   variant->function);
+	 LLVMDeleteFunction(variant->function);
+      }
+      FREE(variant);
+   }
+   
+   return NULL;
+}
+
+
+
+static void
+lp_make_setup_variant_key(struct llvmpipe_context *lp,
+			  struct lp_setup_variant_key *key)
+{
+   struct lp_fragment_shader *fs = lp->fs;
+   unsigned i;
+
+   assert(sizeof key->inputs[0] == sizeof(ushort));
+   
+   key->num_inputs = fs->info.base.num_inputs;
+   key->flatshade_first = lp->rasterizer->flatshade_first;
+   key->pixel_center_half = lp->rasterizer->gl_rasterization_rules;
+   key->size = Offset(struct lp_setup_variant_key,
+		      inputs[key->num_inputs]);
+   key->pad = 0;
+
+   memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]);
+   for (i = 0; i < key->num_inputs; i++) {
+      if (key->inputs[i].interp == LP_INTERP_COLOR) {
+	 if (lp->rasterizer->flatshade)
+	    key->inputs[i].interp = LP_INTERP_CONSTANT;
+	 else
+	    key->inputs[i].interp = LP_INTERP_LINEAR;
+      }
+   }
+
+}
+
+
+static void
+remove_setup_variant(struct llvmpipe_context *lp,
+		     struct lp_setup_variant *variant)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      debug_printf("llvmpipe: del setup_variant #%u total %u\n",
+		   variant->no, lp->nr_setup_variants);
+   }
+
+   if (variant->function) {
+      if (variant->jit_function)
+	 LLVMFreeMachineCodeForFunction(screen->engine,
+					variant->function);
+      LLVMDeleteFunction(variant->function);
+   }
+
+   remove_from_list(&variant->list_item_global);
+   lp->nr_setup_variants--;
+   FREE(variant);
+}
+
+
+
+/* When the number of setup variants exceeds a threshold, cull a
+ * fraction (currently a quarter) of them.
+ */
+static void
+cull_setup_variants(struct llvmpipe_context *lp)
+{
+   struct pipe_context *pipe = &lp->pipe;
+   int i;
+
+   /*
+    * XXX: we need to flush the context until we have some sort of reference
+    * counting in fragment shaders as they may still be binned
+    * Flushing alone might not be sufficient we need to wait on it too.
+    */
+   llvmpipe_finish(pipe, __FUNCTION__);
+
+   for (i = 0; i < LP_MAX_SETUP_VARIANTS / 4; i++) {
+      struct lp_setup_variant_list_item *item = last_elem(&lp->setup_variants_list);
+      remove_setup_variant(lp, item->base);
+   }
+}
+
+
+/**
+ * Update fragment/vertex shader linkage state.  This is called just
+ * prior to drawing something when some fragment-related state has
+ * changed.
+ */
+void 
+llvmpipe_update_setup(struct llvmpipe_context *lp)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+
+   struct lp_setup_variant_key *key = &lp->setup_variant.key;
+   struct lp_setup_variant *variant = NULL;
+   struct lp_setup_variant_list_item *li;
+
+   lp_make_setup_variant_key(lp, key);
+
+   foreach(li, &lp->setup_variants_list) {
+      if(li->base->key.size == key->size &&
+	 memcmp(&li->base->key, key, key->size) == 0) {
+         variant = li->base;
+         break;
+      }
+   }
+
+   if (variant) {
+      move_to_head(&lp->setup_variants_list, &variant->list_item_global);
+   }
+   else {
+      if (lp->nr_setup_variants >= LP_MAX_SETUP_VARIANTS) {
+	 cull_setup_variants(lp);
+      }
+
+      variant = generate_setup_variant(screen, key);
+      insert_at_head(&lp->setup_variants_list, &variant->list_item_global);
+      lp->nr_setup_variants++;
+   }
+
+   lp_setup_set_setup_variant(lp->setup,
+			      variant);
+}
+
+void
+lp_delete_setup_variants(struct llvmpipe_context *lp)
+{
+   struct lp_setup_variant_list_item *li;
+   li = first_elem(&lp->setup_variants_list);
+   while(!at_end(&lp->setup_variants_list, li)) {
+      struct lp_setup_variant_list_item *next = next_elem(li);
+      remove_setup_variant(lp, li->base);
+      li = next;
+   }
+}
+
+void
+lp_dump_setup_coef( const struct lp_setup_variant_key *key,
+		    const float (*sa0)[4],
+		    const float (*sdadx)[4],
+		    const float (*sdady)[4])
+{
+   int i, slot;
+
+   for (i = 0; i < NUM_CHANNELS; i++) {
+      float a0   = sa0  [0][i];
+      float dadx = sdadx[0][i];
+      float dady = sdady[0][i];
+
+      debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n",
+		   "xyzw"[i],
+		   a0, dadx, dady);
+   }
+
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      unsigned usage_mask = key->inputs[slot].usage_mask;
+      for (i = 0; i < NUM_CHANNELS; i++) {
+	 if (usage_mask & (1 << i)) {
+	    float a0   = sa0  [1 + slot][i];
+	    float dadx = sdadx[1 + slot][i];
+	    float dady = sdady[1 + slot][i];
+
+	    debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n",
+			 slot,
+			 "xyzw"[i],
+			 a0, dadx, dady);
+	 }
+      }
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h
new file mode 100644
index 00000000000..b0c81baa75f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -0,0 +1,80 @@
+#ifndef LP_STATE_SETUP_H
+#define LP_STATE_SETUP_H
+
+#include "lp_bld_interp.h"
+
+
+struct llvmpipe_context;
+struct lp_setup_variant;
+
+struct lp_setup_variant_list_item
+{
+   struct lp_setup_variant *base;
+   struct lp_setup_variant_list_item *next, *prev;
+};
+
+
+struct lp_setup_variant_key {   
+   unsigned num_inputs:8;
+   unsigned flatshade_first:1;
+   unsigned pixel_center_half:1;
+   unsigned pad:7;
+   unsigned size:16;
+   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
+};
+
+
+typedef void (*lp_jit_setup_triangle)( const float (*v0)[4],
+				       const float (*v1)[4],
+				       const float (*v2)[4],
+				       boolean front_facing,
+				       float (*a0)[4],
+				       float (*dadx)[4],
+				       float (*dady)[4] );
+
+
+
+
+/* At this stage, for a given variant key, we create a
+ * draw_vertex_info struct telling the draw module how to format the
+ * vertices, and an llvm-generated function which calculates the
+ * attribute interpolants (a0, dadx, dady) from three of those
+ * vertices.
+ */
+struct lp_setup_variant {
+   struct lp_setup_variant_key key;
+   
+   struct lp_setup_variant_list_item list_item_global;
+
+   /* XXX: this is a pointer to the LLVM IR.  Once jit_function is
+    * generated, we never need to use the IR again - need to find a
+    * way to release this data without destroying the generated
+    * assembly.
+    */
+   LLVMValueRef function;
+
+   /* The actual generated setup function:
+    */
+   lp_jit_setup_triangle jit_function;
+
+   unsigned no;
+};
+
+void lp_setup_tri_fallback( const float (*v0)[4],
+			    const float (*v1)[4],
+			    const float (*v2)[4],
+			    boolean front_facing,
+			    float (*a0)[4],
+			    float (*dadx)[4],
+			    float (*dady)[4],
+			    const struct lp_setup_variant_key *key );
+
+void lp_delete_setup_variants(struct llvmpipe_context *lp);
+
+void
+lp_dump_setup_coef( const struct lp_setup_variant_key *key,
+		    const float (*sa0)[4],
+		    const float (*sdadx)[4],
+		    const float (*sdady)[4]);
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c
index 57b0ee57767..816518e5081 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_round.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_round.c
@@ -75,10 +75,7 @@ add_test(LLVMModuleRef module, const char *name, lp_func_t lp_func)
    LLVMValueRef ret;
    struct lp_build_context bld;
 
-   bld.builder = builder;
-   bld.type.floating = 1;
-   bld.type.width = 32;
-   bld.type.length = 4;
+   lp_build_context_init(&bld, builder, lp_float32_vec4_type());
 
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
 
@@ -100,9 +97,10 @@ printv(char* string, v4sf value)
            f[0], f[1], f[2], f[3]);
 }
 
-static void
+static boolean
 compare(v4sf x, v4sf y)
 {
+   boolean success = TRUE;
    float *xp = (float *) &x;
    float *yp = (float *) &y;
    if (xp[0] != yp[0] ||
@@ -110,7 +108,9 @@ compare(v4sf x, v4sf y)
        xp[2] != yp[2] ||
        xp[3] != yp[3]) {
       printf(" Incorrect result! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n");
+      success = FALSE;
    }
+   return success;
 }
 
 
@@ -171,9 +171,12 @@ test_round(unsigned verbose, FILE *fp)
       LLVMDumpModule(module);
 
    for (i = 0; i < 3; i++) {
+      /* NOTE: There are several acceptable rules for x.5 rounding: ceiling,
+       * nearest even, etc. So we avoid testing such corner cases here.
+       */
       v4sf xvals[3] = {
          {-10.0, -1, 0, 12.0},
-         {-1.5, -0.25, 1.25, 2.5},
+         {-1.49, -0.25, 1.25, 2.51},
          {-0.99, -0.01, 0.01, 0.99}
       };
       v4sf x = xvals[i];
@@ -191,7 +194,7 @@ test_round(unsigned verbose, FILE *fp)
       y = round_func(x);
       printv("C round(x)   ", ref);
       printv("LLVM round(x)", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
 
       refp[0] = trunc(xp[0]);
       refp[1] = trunc(xp[1]);
@@ -200,7 +203,7 @@ test_round(unsigned verbose, FILE *fp)
       y = trunc_func(x);
       printv("C trunc(x)   ", ref);
       printv("LLVM trunc(x)", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
 
       refp[0] = floor(xp[0]);
       refp[1] = floor(xp[1]);
@@ -209,7 +212,7 @@ test_round(unsigned verbose, FILE *fp)
       y = floor_func(x);
       printv("C floor(x)   ", ref);
       printv("LLVM floor(x)", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
 
       refp[0] = ceil(xp[0]);
       refp[1] = ceil(xp[1]);
@@ -218,7 +221,7 @@ test_round(unsigned verbose, FILE *fp)
       y = ceil_func(x);
       printv("C ceil(x)    ", ref);
       printv("LLVM ceil(x) ", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
    }
 
    LLVMFreeMachineCodeForFunction(engine, test_round);
@@ -247,11 +250,7 @@ test_round(unsigned verbose, FILE *fp)
 boolean
 test_all(unsigned verbose, FILE *fp)
 {
-   boolean success = TRUE;
-
-   test_round(verbose, fp);
-
-   return success;
+   return test_round(verbose, fp);
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_sincos.c b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
index 7ab357f162e..79939b1a393 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_sincos.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
@@ -72,10 +72,7 @@ add_sincos_test(LLVMModuleRef module, boolean sin)
    LLVMValueRef ret;
    struct lp_build_context bld;
 
-   bld.builder = builder;
-   bld.type.floating = 1;
-   bld.type.width = 32;
-   bld.type.length = 4;
+   lp_build_context_init(&bld, builder, lp_float32_vec4_type());
 
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
index 2ba39052aba..e49f9c62fe7 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -289,172 +289,141 @@ def generate_format_write(format, src_channel, src_native_type, src_suffix):
     print
     
 
-def generate_ssse3():
+def generate_sse2():
     print '''
 #if defined(PIPE_ARCH_SSE)
 
 #include "util/u_sse.h"
 
-static void
-lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
-                                         const uint8_t *src, unsigned src_stride,
-                                         unsigned x0, unsigned y0)
+static ALWAYS_INLINE void 
+swz4( const __m128i * restrict x, 
+      const __m128i * restrict y, 
+      const __m128i * restrict z, 
+      const __m128i * restrict w, 
+      __m128i * restrict a, 
+      __m128i * restrict b, 
+      __m128i * restrict c, 
+      __m128i * restrict d)
 {
+   __m128i i, j, k, l;
+   __m128i m, n, o, p;
+   __m128i e, f, g, h;
+
+   m = _mm_unpacklo_epi8(*x,*y);
+   n = _mm_unpackhi_epi8(*x,*y);
+   o = _mm_unpacklo_epi8(*z,*w);
+   p = _mm_unpackhi_epi8(*z,*w);
+
+   i = _mm_unpacklo_epi16(m,n);
+   j = _mm_unpackhi_epi16(m,n);
+   k = _mm_unpacklo_epi16(o,p);
+   l = _mm_unpackhi_epi16(o,p);
+
+   e = _mm_unpacklo_epi8(i,j);
+   f = _mm_unpackhi_epi8(i,j);
+   g = _mm_unpacklo_epi8(k,l);
+   h = _mm_unpackhi_epi8(k,l);
+
+   *a = _mm_unpacklo_epi64(e,g);
+   *b = _mm_unpackhi_epi64(e,g);
+   *c = _mm_unpacklo_epi64(f,h);
+   *d = _mm_unpackhi_epi64(f,h);
+}
+
+static ALWAYS_INLINE void
+unswz4( const __m128i * restrict a, 
+        const __m128i * restrict b, 
+        const __m128i * restrict c, 
+        const __m128i * restrict d, 
+        __m128i * restrict x, 
+        __m128i * restrict y, 
+        __m128i * restrict z, 
+        __m128i * restrict w)
+{
+   __m128i i, j, k, l;
+   __m128i m, n, o, p;
+
+   i = _mm_unpacklo_epi8(*a,*b);
+   j = _mm_unpackhi_epi8(*a,*b);
+   k = _mm_unpacklo_epi8(*c,*d);
+   l = _mm_unpackhi_epi8(*c,*d);
+
+   m = _mm_unpacklo_epi16(i,k);
+   n = _mm_unpackhi_epi16(i,k);
+   o = _mm_unpacklo_epi16(j,l);
+   p = _mm_unpackhi_epi16(j,l);
+
+   *x = _mm_unpacklo_epi64(m,n);
+   *y = _mm_unpackhi_epi64(m,n);
+   *z = _mm_unpacklo_epi64(o,p);
+   *w = _mm_unpackhi_epi64(o,p);
+}
 
+static void
+lp_tile_b8g8r8a8_unorm_swizzle_4ub_sse2(uint8_t * restrict dst,
+                                        const uint8_t * restrict src, unsigned src_stride,
+                                        unsigned x0, unsigned y0)
+{
+   __m128i *dst128 = (__m128i *) dst;
    unsigned x, y;
-   __m128i *pdst = (__m128i*) dst;
-   const uint8_t *ysrc0 = src + y0*src_stride + x0*sizeof(uint32_t);
-   unsigned int tile_stridex = src_stride*(TILE_VECTOR_HEIGHT - 1) - sizeof(uint32_t)*TILE_VECTOR_WIDTH;
-   unsigned int tile_stridey = src_stride*TILE_VECTOR_HEIGHT;
-
-   const __m128i shuffle00 = _mm_setr_epi8(0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle01 = _mm_setr_epi8(0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle02 = _mm_setr_epi8(0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle03 = _mm_setr_epi8(0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-
-   const __m128i shuffle10 = _mm_setr_epi8(0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle11 = _mm_setr_epi8(0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle12 = _mm_setr_epi8(0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle13 = _mm_setr_epi8(0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-
-   const __m128i shuffle20 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff);
-   const __m128i shuffle21 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff);
-   const __m128i shuffle22 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff);
-   const __m128i shuffle23 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff);
-
-   const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e);
-   const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d);
-   const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c);
-   const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f);
-
-   for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
-      __m128i line0 = *(__m128i*)ysrc0;
-      const uint8_t *ysrc = ysrc0 + src_stride;
-      ysrc0 += tile_stridey;
-
-      for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
-         __m128i r, g, b, a, line1;
-         line1 = *(__m128i*)ysrc;
-         PIPE_READ_WRITE_BARRIER();
-         ysrc += src_stride;
-         r = _mm_shuffle_epi8(line0, shuffle00);
-         g = _mm_shuffle_epi8(line0, shuffle01);
-         b = _mm_shuffle_epi8(line0, shuffle02);
-         a = _mm_shuffle_epi8(line0, shuffle03);
-
-         line0 = *(__m128i*)ysrc;
-         PIPE_READ_WRITE_BARRIER();
-         ysrc += src_stride;
-         r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle10));
-         g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle11));
-         b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle12));
-         a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle13));
-
-         line1 = *(__m128i*)ysrc;
-         PIPE_READ_WRITE_BARRIER();
-         ysrc -= tile_stridex;
-         r = _mm_or_si128(r, _mm_shuffle_epi8(line0, shuffle20));
-         g = _mm_or_si128(g, _mm_shuffle_epi8(line0, shuffle21));
-         b = _mm_or_si128(b, _mm_shuffle_epi8(line0, shuffle22));
-         a = _mm_or_si128(a, _mm_shuffle_epi8(line0, shuffle23));
-
-         if (x + 1 < TILE_SIZE) {
-            line0 = *(__m128i*)ysrc;
-            ysrc += src_stride;
-         }
-
-         PIPE_READ_WRITE_BARRIER();
-         r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle30));
-         g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle31));
-         b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle32));
-         a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle33));
-
-         *pdst++ = r;
-         *pdst++ = g;
-         *pdst++ = b;
-         *pdst++ = a;
+   
+   src += y0 * src_stride;
+   src += x0 * sizeof(uint32_t);
+
+   for (y = 0; y < TILE_SIZE; y += 4) {
+      const uint8_t *src_row = src;
+
+      for (x = 0; x < TILE_SIZE; x += 4) {
+         swz4((const __m128i *) (src_row + 0 * src_stride),
+              (const __m128i *) (src_row + 1 * src_stride),
+              (const __m128i *) (src_row + 2 * src_stride),
+              (const __m128i *) (src_row + 3 * src_stride),
+              dst128 + 2,     /* b */
+              dst128 + 1,     /* g */
+              dst128 + 0,     /* r */
+              dst128 + 3);    /* a */
+
+         dst128 += 4;
+         src_row += sizeof(__m128i);
       }
-   }
 
+      src += 4 * src_stride;
+   }
 }
 
 static void
-lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
-                                          uint8_t *dst, unsigned dst_stride,
+lp_tile_b8g8r8a8_unorm_unswizzle_4ub_sse2(const uint8_t * restrict src,
+                                          uint8_t * restrict dst, unsigned dst_stride,
                                           unsigned x0, unsigned y0)
 {
    unsigned int x, y;
-   const __m128i *psrc = (__m128i*) src;
-   const __m128i *end = (__m128i*) (src + (y0 + TILE_SIZE - 1)*dst_stride + (x0 + TILE_SIZE - 1)*sizeof(uint32_t));
-   uint8_t *pdst = dst + y0 * dst_stride + x0 * sizeof(uint32_t);
-   __m128i c0 = *psrc++;
-   __m128i c1;
-
-   const __m128i shuffle00 = _mm_setr_epi8(0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff);
-   const __m128i shuffle01 = _mm_setr_epi8(0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff);
-   const __m128i shuffle02 = _mm_setr_epi8(0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff);
-   const __m128i shuffle03 = _mm_setr_epi8(0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff);
-
-   const __m128i shuffle10 = _mm_setr_epi8(0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff);
-   const __m128i shuffle11 = _mm_setr_epi8(0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff);
-   const __m128i shuffle12 = _mm_setr_epi8(0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff);
-   const __m128i shuffle13 = _mm_setr_epi8(0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff);
-
-   const __m128i shuffle20 = _mm_setr_epi8(0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff,0xff);
-   const __m128i shuffle21 = _mm_setr_epi8(0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff,0xff);
-   const __m128i shuffle22 = _mm_setr_epi8(0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff,0xff);
-   const __m128i shuffle23 = _mm_setr_epi8(0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff,0xff);
-
-   const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05);
-   const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07);
-   const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d);
-   const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f);
-
-   for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
-      __m128i *tile = (__m128i*) pdst;
-      pdst += dst_stride * TILE_VECTOR_HEIGHT;
-      for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
-         uint8_t *linep = (uint8_t*) (tile++);
-         __m128i line0, line1, line2, line3;
-
-         c1 = *psrc++; /* r */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_shuffle_epi8(c0, shuffle00);
-         line1 = _mm_shuffle_epi8(c0, shuffle01);
-         line2 = _mm_shuffle_epi8(c0, shuffle02);
-         line3 = _mm_shuffle_epi8(c0, shuffle03);
-
-         c0 = *psrc++; /* g */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle10));
-         line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle11));
-         line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle12));
-         line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle13));
-
-         c1 = *psrc++; /* b */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c0, shuffle20));
-         line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c0, shuffle21));
-         line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c0, shuffle22));
-         line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c0, shuffle23));
-
-         if (psrc != end)
-                 c0 = *psrc++; /* a */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle30));
-         line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle31));
-         line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle32));
-         line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle33));
-
-         *(__m128i*) (linep) = line0;
-         *(__m128i*) (((char*)linep) + dst_stride) = line1;
-         *(__m128i*) (((char*)linep) + 2 * dst_stride) = line2;
-         *(__m128i*) (((char*)linep) + 3 * dst_stride) = line3;
+   const __m128i *src128 = (const __m128i *) src;
+   
+   dst += y0 * dst_stride;
+   dst += x0 * sizeof(uint32_t);
+   
+   for (y = 0; y < TILE_SIZE; y += 4) {
+      const uint8_t *dst_row = dst;
+
+      for (x = 0; x < TILE_SIZE; x += 4) {
+         unswz4( &src128[2],     /* b */
+                 &src128[1],     /* g */
+                 &src128[0],     /* r */
+                 &src128[3],     /* a */
+                 (__m128i *) (dst_row + 0 * dst_stride),
+                 (__m128i *) (dst_row + 1 * dst_stride),
+                 (__m128i *) (dst_row + 2 * dst_stride),
+                 (__m128i *) (dst_row + 3 * dst_stride));
+
+         src128 += 4;
+         dst_row += sizeof(__m128i);;
       }
+
+      dst += 4 * dst_stride;
    }
 }
 
-#endif /* PIPE_ARCH_SSSE3 */
+#endif /* PIPE_ARCH_SSE */
 '''
 
 
@@ -479,7 +448,7 @@ def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix):
             func_name = 'lp_tile_%s_swizzle_%s' % (format.short_name(), dst_suffix)
             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
                 print '#ifdef PIPE_ARCH_SSE'
-                print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
+                print '      func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name)
                 print '#else'
                 print '      func = %s;' % (func_name,)
                 print '#endif'
@@ -517,7 +486,7 @@ def generate_unswizzle(formats, src_channel, src_native_type, src_suffix):
             func_name = 'lp_tile_%s_unswizzle_%s' % (format.short_name(), src_suffix)
             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
                 print '#ifdef PIPE_ARCH_SSE'
-                print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
+                print '      func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name)
                 print '#else'
                 print '      func = %s;' % (func_name,)
                 print '#endif'
@@ -577,7 +546,7 @@ def main():
     print '};'
     print
 
-    generate_ssse3()
+    generate_sse2()
 
     channel = Channel(UNSIGNED, True, 8)
     native_type = 'uint8_t'
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index ebb21a6e5a3..a9426df686f 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -236,7 +236,7 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 	int ret;
 
 	ret = nouveau_channel_alloc(dev, 0xbeef0201, 0xbeef0202,
-				    &screen->channel);
+				    512*1024, &screen->channel);
 	if (ret)
 		return ret;
 	screen->device = dev;
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index ac69c7848e9..bf6a577188b 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -26,6 +26,9 @@
 #define NOUVEAU_MSG(fmt, args...) \
 	fprintf(stderr, "nouveau: "fmt, ##args);
 
+#define nouveau_bo_tile_layout(nvbo) \
+	((nvbo)->tile_flags & NOUVEAU_BO_TILE_LAYOUT_MASK)
+
 /* Constant buffer assignment */
 #define NV50_CB_PMISC		0
 #define NV50_CB_PVP		1
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index 3f3166261b1..f70c138fe1a 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -92,7 +92,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst)
  		return 1;
 	}
 
- 	if (!bo->tile_flags) {
+	if (!nouveau_bo_tile_layout(bo)) {
  		BEGIN_RING(chan, eng2d, mthd, 2);
  		OUT_RING  (chan, format);
  		OUT_RING  (chan, 1);
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index f973cf24b98..0cc2f4a837f 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -45,7 +45,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 
 	WAIT_RING (chan, 14);
 
-	if (!src_bo->tile_flags) {
+	if (!nouveau_bo_tile_layout(src_bo)) {
 		BEGIN_RING(chan, m2mf,
 			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_IN, 1);
 		OUT_RING  (chan, 1);
@@ -64,7 +64,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 		OUT_RING  (chan, sz); /* copying only 1 zslice per call */
 	}
 
-	if (!dst_bo->tile_flags) {
+	if (!nouveau_bo_tile_layout(dst_bo)) {
 		BEGIN_RING(chan, m2mf,
 			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_OUT, 1);
 		OUT_RING  (chan, 1);
@@ -95,14 +95,14 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 			NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 2);
 		OUT_RELOCl(chan, src_bo, src_offset, src_reloc);
 		OUT_RELOCl(chan, dst_bo, dst_offset, dst_reloc);
-		if (src_bo->tile_flags) {
+		if (nouveau_bo_tile_layout(src_bo)) {
 			BEGIN_RING(chan, m2mf,
 				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN, 1);
 			OUT_RING  (chan, (sy << 16) | (sx * cpp));
 		} else {
 			src_offset += (line_count * src_pitch);
 		}
-		if (dst_bo->tile_flags) {
+		if (nouveau_bo_tile_layout(dst_bo)) {
 			BEGIN_RING(chan, m2mf,
 				NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT, 1);
 			OUT_RING  (chan, (dy << 16) | (dx * cpp));
@@ -280,7 +280,7 @@ nv50_upload_sifc(struct nv50_context *nv50,
 
 	MARK_RING (chan, 32, 2); /* flush on lack of space or relocs */
 
-	if (bo->tile_flags) {
+	if (nouveau_bo_tile_layout(bo)) {
 		BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 5);
 		OUT_RING  (chan, dst_format);
 		OUT_RING  (chan, 0);
diff --git a/src/gallium/drivers/r300/r300_debug.c b/src/gallium/drivers/r300/r300_debug.c
index 145a7985da3..f78fe34790c 100644
--- a/src/gallium/drivers/r300/r300_debug.c
+++ b/src/gallium/drivers/r300/r300_debug.c
@@ -29,6 +29,7 @@
 static const struct debug_named_value debug_options[] = {
     { "fp", DBG_FP, "Log fragment program compilation" },
     { "vp", DBG_VP, "Log vertex program compilation" },
+    { "pstat", DBG_P_STAT, "Log vertex/fragment program stats" },
     { "draw", DBG_DRAW, "Log draw calls" },
     { "swtcl", DBG_SWTCL, "Log SWTCL-specific info" },
     { "rsblock", DBG_RS_BLOCK, "Log rasterizer registers" },
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index d9d4a9304df..c91532eb7b3 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -378,7 +378,8 @@ static void r300_translate_fragment_shader(
     /* Setup the compiler. */
     memset(&compiler, 0, sizeof(compiler));
     rc_init(&compiler.Base);
-    compiler.Base.Debug = DBG_ON(r300, DBG_FP);
+    DBG_ON(r300, DBG_FP) ? compiler.Base.Debug |= RC_DBG_LOG : 0;
+    DBG_ON(r300, DBG_P_STAT) ? compiler.Base.Debug |= RC_DBG_STATS : 0;
 
     compiler.code = &shader->code;
     compiler.state = shader->compare_state;
@@ -395,7 +396,7 @@ static void r300_translate_fragment_shader(
 
     find_output_registers(&compiler, shader);
 
-    if (compiler.Base.Debug) {
+    if (compiler.Base.Debug & RC_DBG_LOG) {
         DBG(r300, DBG_FP, "r300: Initial fragment program\n");
         tgsi_dump(tokens, 0);
     }
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 7f41ff0e2ec..d445df408dd 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -124,6 +124,9 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_INDEP_BLEND_FUNC:
         case PIPE_CAP_DEPTH_CLAMP: /* XXX implemented, but breaks Regnum Online */
         case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
+        case PIPE_CAP_SHADER_STENCIL_EXPORT:
+        case PIPE_CAP_STREAM_OUTPUT:
+        case PIPE_CAP_PRIMITIVE_RESTART:
             return 0;
 
         /* Texturing. */
@@ -153,8 +156,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
             return 0;
         default:
-            fprintf(stderr, "r300: Implementation error: Bad param %d\n",
-                param);
+            debug_printf("r300: Warning: Unknown CAP %d in get_param.\n",
+                         param);
             return 0;
     }
 }
@@ -264,8 +267,8 @@ static float r300_get_paramf(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_MAX_TEXTURE_LOD_BIAS:
             return 16.0f;
         default:
-            fprintf(stderr, "r300: Implementation error: Bad paramf %d\n",
-                param);
+            debug_printf("r300: Warning: Unknown CAP %d in get_paramf.\n",
+                         param);
             return 0.0f;
     }
 }
diff --git a/src/gallium/drivers/r300/r300_screen.h b/src/gallium/drivers/r300/r300_screen.h
index dc2bc7e8279..8b7f1fab61b 100644
--- a/src/gallium/drivers/r300/r300_screen.h
+++ b/src/gallium/drivers/r300/r300_screen.h
@@ -102,6 +102,7 @@ r300_winsys_screen(struct pipe_screen *screen) {
 #define DBG_NO_CBZB     (1 << 21)
 /* Statistics. */
 #define DBG_STATS       (1 << 24)
+#define DBG_P_STAT      (1 << 25)
 /*@}*/
 
 static INLINE boolean SCREEN_DBG_ON(struct r300_screen * screen, unsigned flags)
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 904736ef06d..730ef7a3ee2 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -620,14 +620,19 @@ static uint32_t r300_get_border_color(enum pipe_format format,
             }
             break;
 
-        default:
-            /* I think the fat formats (16, 32) are specified
-             * as the 8-bit ones. I am not sure how compressed formats
-             * work here. */
+        case 8:
             r = ((float_to_ubyte(border_swizzled[0]) & 0xff) << 0) |
                 ((float_to_ubyte(border_swizzled[1]) & 0xff) << 8) |
                 ((float_to_ubyte(border_swizzled[2]) & 0xff) << 16) |
                 ((float_to_ubyte(border_swizzled[3]) & 0xff) << 24);
+            break;
+
+        case 16:
+            r = ((float_to_ubyte(border_swizzled[2]) & 0xff) << 0) |
+                ((float_to_ubyte(border_swizzled[1]) & 0xff) << 8) |
+                ((float_to_ubyte(border_swizzled[0]) & 0xff) << 16) |
+                ((float_to_ubyte(border_swizzled[3]) & 0xff) << 24);
+            break;
     }
 
     return r;
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index e2b9af9d018..65696555ac3 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -202,7 +202,8 @@ void r300_translate_vertex_shader(struct r300_context *r300,
     memset(&compiler, 0, sizeof(compiler));
     rc_init(&compiler.Base);
 
-    compiler.Base.Debug = DBG_ON(r300, DBG_VP);
+    DBG_ON(r300, DBG_VP) ? compiler.Base.Debug |= RC_DBG_LOG : 0;
+    DBG_ON(r300, DBG_P_STAT) ? compiler.Base.Debug |= RC_DBG_STATS : 0;
     compiler.code = &vs->code;
     compiler.UserData = vs;
     compiler.Base.is_r500 = r300->screen->caps.is_r500;
@@ -214,7 +215,7 @@ void r300_translate_vertex_shader(struct r300_context *r300,
     compiler.Base.max_alu_insts = r300->screen->caps.is_r500 ? 1024 : 256;
     compiler.Base.remove_unused_constants = TRUE;
 
-    if (compiler.Base.Debug) {
+    if (compiler.Base.Debug & RC_DBG_LOG) {
         DBG(r300, DBG_VP, "r300: Initial vertex program\n");
         tgsi_dump(vs->state.tokens, 0);
     }
diff --git a/src/gallium/drivers/r600/Makefile b/src/gallium/drivers/r600/Makefile
index ede0bb2ec45..a484f38e9f1 100644
--- a/src/gallium/drivers/r600/Makefile
+++ b/src/gallium/drivers/r600/Makefile
@@ -19,6 +19,8 @@ C_SOURCES = \
 	r600_texture.c \
 	r700_asm.c \
 	evergreen_state.c \
-	eg_asm.c
+	eg_asm.c \
+	r600_translate.c \
+	r600_state_common.c
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/r600/SConscript b/src/gallium/drivers/r600/SConscript
index b6d9b7d8d22..bf0ad8571ba 100644
--- a/src/gallium/drivers/r600/SConscript
+++ b/src/gallium/drivers/r600/SConscript
@@ -18,6 +18,7 @@ r600 = env.ConvenienceLibrary(
     source = [
         'r600_asm.c',
         'r600_buffer.c',
+        'r600_blit.c',
         'r600_helper.c',
         'r600_pipe.c',
         'r600_query.c',
diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index 52b7189e9e5..c30f09c394b 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -36,8 +36,13 @@ int eg_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
 	case (EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
 	case (EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
-		  S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache0_mode);
+			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache0_mode) |
+			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache0_bank) |
+			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache1_bank);
 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
+			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache1_mode) |
+			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache0_addr) |
+			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache1_addr) |
 					S_SQ_CF_ALU_WORD1_BARRIER(1) |
 					S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
 		break;
diff --git a/src/gallium/drivers/r600/eg_state_inlines.h b/src/gallium/drivers/r600/eg_state_inlines.h
index 81094904485..be81c28b43f 100644
--- a/src/gallium/drivers/r600/eg_state_inlines.h
+++ b/src/gallium/drivers/r600/eg_state_inlines.h
@@ -25,6 +25,7 @@
 
 #include "util/u_format.h"
 #include "evergreend.h"
+#include "r600_formats.h"
 
 static INLINE uint32_t r600_translate_blend_function(int blend_func)
 {
@@ -276,6 +277,14 @@ static inline uint32_t r600_translate_dbformat(enum pipe_format format)
 	}
 }
 
+static inline uint32_t r600_translate_stencilformat(enum pipe_format format)
+{
+	if (format == PIPE_FORMAT_Z24_UNORM_S8_USCALED)
+		return 1;
+	else
+		return 0;
+}
+
 static inline uint32_t r600_translate_colorswap(enum pipe_format format)
 {
 	switch (format) {
@@ -301,6 +310,12 @@ static inline uint32_t r600_translate_colorswap(enum pipe_format format)
 
 	case PIPE_FORMAT_Z16_UNORM:
 		return V_028C70_SWAP_STD;
+
+	case PIPE_FORMAT_R8G8_UNORM:
+		return V_028C70_SWAP_STD;
+
+	case PIPE_FORMAT_R16_UNORM:
+		return V_028C70_SWAP_STD;
 		/* 32-bit buffers. */
 
 	case PIPE_FORMAT_A8B8G8R8_SRGB:
@@ -338,6 +353,9 @@ static inline uint32_t r600_translate_colorswap(enum pipe_format format)
 	case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
 		return V_028C70_SWAP_STD_REV;
 
+	case PIPE_FORMAT_R16G16_UNORM:
+		return V_028C70_SWAP_STD;
+
 		/* 64-bit buffers. */
 	case PIPE_FORMAT_R16G16B16A16_UNORM:
 	case PIPE_FORMAT_R16G16B16A16_SNORM:
@@ -382,6 +400,12 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format)
 	case PIPE_FORMAT_Z16_UNORM:
 		return V_028C70_COLOR_16;
 
+	case PIPE_FORMAT_R8G8_UNORM:
+		return V_028C70_COLOR_8_8;
+
+	case PIPE_FORMAT_R16_UNORM:
+		return V_028C70_COLOR_16;
+
 		/* 32-bit buffers. */
 	case PIPE_FORMAT_A8B8G8R8_SRGB:
 	case PIPE_FORMAT_A8B8G8R8_UNORM:
@@ -419,6 +443,7 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format)
 		return V_028C70_COLOR_16_16_FLOAT;
 
 	case PIPE_FORMAT_R16G16_SSCALED:
+	case PIPE_FORMAT_R16G16_UNORM:
 		return V_028C70_COLOR_16_16;
 
 		/* 64-bit buffers. */
@@ -499,32 +524,32 @@ static INLINE uint32_t r600_translate_vertex_data_type(enum pipe_format format)
 		case 16:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_030008_FMT_16_FLOAT;
+				result = FMT_16_FLOAT;
 				break;
 			case 2:
-				result = V_030008_FMT_16_16_FLOAT;
+				result = FMT_16_16_FLOAT;
 				break;
 			case 3:
-				result = V_030008_FMT_16_16_16_FLOAT;
+				result = FMT_16_16_16_FLOAT;
 				break;
 			case 4:
-				result = V_030008_FMT_16_16_16_16_FLOAT;
+				result = FMT_16_16_16_16_FLOAT;
 				break;
 			}
 			break;
 		case 32:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_030008_FMT_32_FLOAT;
+				result = FMT_32_FLOAT;
 				break;
 			case 2:
-				result = V_030008_FMT_32_32_FLOAT;
+				result = FMT_32_32_FLOAT;
 				break;
 			case 3:
-				result = V_030008_FMT_32_32_32_FLOAT;
+				result = FMT_32_32_32_FLOAT;
 				break;
 			case 4:
-				result = V_030008_FMT_32_32_32_32_FLOAT;
+				result = FMT_32_32_32_32_FLOAT;
 				break;
 			}
 			break;
@@ -540,48 +565,48 @@ static INLINE uint32_t r600_translate_vertex_data_type(enum pipe_format format)
 		case 8:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_030008_FMT_8;
+				result = FMT_8;
 				break;
 			case 2:
-				result = V_030008_FMT_8_8;
+				result = FMT_8_8;
 				break;
 			case 3:
 //				result = V_038008_FMT_8_8_8; /* fails piglit draw-vertices test */
 //				break;
 			case 4:
-				result = V_030008_FMT_8_8_8_8;
+				result = FMT_8_8_8_8;
 				break;
 			}
 			break;
 		case 16:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_030008_FMT_16;
+				result = FMT_16;
 				break;
 			case 2:
-				result = V_030008_FMT_16_16;
+				result = FMT_16_16;
 				break;
 			case 3:
 //				result = V_038008_FMT_16_16_16; /* fails piglit draw-vertices test */
 //				break;
 			case 4:
-				result = V_030008_FMT_16_16_16_16;
+				result = FMT_16_16_16_16;
 				break;
 			}
 			break;
 		case 32:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_030008_FMT_32;
+				result = FMT_32;
 				break;
 			case 2:
-				result = V_030008_FMT_32_32;
+				result = FMT_32_32;
 				break;
 			case 3:
-				result = V_030008_FMT_32_32_32;
+				result = FMT_32_32_32;
 				break;
 			case 4:
-				result = V_030008_FMT_32_32_32_32;
+				result = FMT_32_32_32_32;
 				break;
 			}
 			break;
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 147d4f372e6..72223485067 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -39,6 +39,7 @@
 #include <util/u_pack_color.h>
 #include <util/u_memory.h>
 #include <util/u_inlines.h>
+#include <util/u_framebuffer.h>
 #include <pipebuffer/pb_buffer.h>
 #include "r600.h"
 #include "evergreend.h"
@@ -136,20 +137,6 @@ static void *evergreen_create_blend_state(struct pipe_context *ctx,
 	return rstate;
 }
 
-static void evergreen_bind_blend_state(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_blend *blend = (struct r600_pipe_blend *)state;
-	struct r600_pipe_state *rstate;
-
-	if (state == NULL)
-		return;
-	rstate = &blend->rstate;
-	rctx->states[rstate->id] = rstate;
-	rctx->cb_target_mask = blend->cb_target_mask;
-	r600_context_pipe_state_set(&rctx->ctx, rstate);
-}
-
 static void *evergreen_create_dsa_state(struct pipe_context *ctx,
 				   const struct pipe_depth_stencil_alpha_state *state)
 {
@@ -241,6 +228,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	struct r600_pipe_state *rstate;
 	unsigned tmp;
 	unsigned prov_vtx = 1, polygon_dual_mode;
+	unsigned clip_rule;
 
 	if (rs == NULL) {
 		return NULL;
@@ -250,6 +238,8 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
 
+	clip_rule = state->scissor ? 0xAAAA : 0xFFFF;
+
 	/* offset */
 	rs->offset_units = state->offset_units;
 	rs->offset_scale = state->offset_scale * 12.0f;
@@ -257,7 +247,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	rstate->id = R600_PIPE_STATE_RASTERIZER;
 	if (state->flatshade_first)
 		prov_vtx = 0;
-	tmp = 0x00000001;
+	tmp = S_0286D4_FLAT_SHADE_ENA(1);
 	if (state->sprite_coord_enable) {
 		tmp |= S_0286D4_PNT_SPRITE_ENA(1) |
 			S_0286D4_PNT_SPRITE_OVRD_X(2) |
@@ -299,39 +289,10 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate, R_028C18_PA_CL_GB_HORZ_DISC_ADJ, 0x3F800000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 0x0, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_028C08_PA_SU_VTX_CNTL, 0x00000005, 0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate, R_02820C_PA_SC_CLIPRECT_RULE, clip_rule, 0xFFFFFFFF, NULL);
 	return rstate;
 }
 
-static void evergreen_bind_rs_state(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer *)state;
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	if (state == NULL)
-		return;
-
-	rctx->flatshade = rs->flatshade;
-	rctx->sprite_coord_enable = rs->sprite_coord_enable;
-	rctx->rasterizer = rs;
-
-	rctx->states[rs->rstate.id] = &rs->rstate;
-	r600_context_pipe_state_set(&rctx->ctx, &rs->rstate);
-}
-
-static void evergreen_delete_rs_state(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer *)state;
-
-	if (rctx->rasterizer == rs) {
-		rctx->rasterizer = NULL;
-	}
-	if (rctx->states[rs->rstate.id] == &rs->rstate) {
-		rctx->states[rs->rstate.id] = NULL;
-	}
-	free(rs);
-}
-
 static void *evergreen_create_sampler_state(struct pipe_context *ctx,
 					const struct pipe_sampler_state *state)
 {
@@ -372,28 +333,6 @@ static void *evergreen_create_sampler_state(struct pipe_context *ctx,
 	return rstate;
 }
 
-static void *evergreen_create_vertex_elements(struct pipe_context *ctx,
-				unsigned count,
-				const struct pipe_vertex_element *elements)
-{
-	struct r600_vertex_element *v = CALLOC_STRUCT(r600_vertex_element);
-
-	assert(count < 32);
-	v->count = count;
-	v->refcount = 1;
-	memcpy(v->elements, elements, count * sizeof(struct pipe_vertex_element));
-	return v;
-}
-
-static void evergreen_sampler_view_destroy(struct pipe_context *ctx,
-				      struct pipe_sampler_view *state)
-{
-	struct r600_pipe_sampler_view *resource = (struct r600_pipe_sampler_view *)state;
-
-	pipe_resource_reference(&state->texture, NULL);
-	FREE(resource);
-}
-
 static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_context *ctx,
 							struct pipe_resource *texture,
 							const struct pipe_sampler_view *state)
@@ -424,15 +363,15 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte
 	swizzle[1] = state->swizzle_g;
 	swizzle[2] = state->swizzle_b;
 	swizzle[3] = state->swizzle_a;
-	format = r600_translate_texformat(texture->format,
+	format = r600_translate_texformat(state->format,
 					  swizzle,
 					  &word4, &yuv_format);
 	if (format == ~0) {
 		format = 0;
 	}
-	desc = util_format_description(texture->format);
+	desc = util_format_description(state->format);
 	if (desc == NULL) {
-		R600_ERR("unknow format %d\n", texture->format);
+		R600_ERR("unknow format %d\n", state->format);
 	}
 	tmp = (struct r600_resource_texture*)texture;
 	rbuffer = &tmp->resource;
@@ -446,7 +385,7 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte
 		bo[0] = rbuffer->bo;
 		bo[1] = rbuffer->bo;
 	}
-	pitch = align(tmp->pitch[0] / tmp->bpt, 8);
+	pitch = align(tmp->pitch_in_pixels[0], 8);
 
 	/* FIXME properly handle first level != 0 */
 	r600_pipe_state_add_reg(rstate, R_030000_RESOURCE0_WORD0,
@@ -464,7 +403,6 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte
 	r600_pipe_state_add_reg(rstate, R_030010_RESOURCE0_WORD4,
 				word4 | S_030010_NUM_FORMAT_ALL(V_030010_SQ_NUM_FORMAT_NORM) |
 				S_030010_SRF_MODE_ALL(V_030010_SFR_MODE_NO_ZERO) |
-				S_030010_REQUEST_SIZE(1) |
 				S_030010_BASE_LEVEL(state->first_level), 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_030014_RESOURCE0_WORD5,
 				S_030014_LAST_LEVEL(state->last_level) |
@@ -481,32 +419,42 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte
 static void evergreen_set_vs_sampler_view(struct pipe_context *ctx, unsigned count,
 					struct pipe_sampler_view **views)
 {
-	/* TODO */
-	assert(1);
-}
-
-static void evergreen_set_ps_sampler_view(struct pipe_context *ctx, unsigned count,
-					struct pipe_sampler_view **views)
-{
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_sampler_view **resource = (struct r600_pipe_sampler_view **)views;
 
 	for (int i = 0; i < count; i++) {
 		if (resource[i]) {
-			evergreen_context_pipe_state_set_ps_resource(&rctx->ctx, &resource[i]->state, i);
+			evergreen_context_pipe_state_set_vs_resource(&rctx->ctx, &resource[i]->state, i + PIPE_MAX_ATTRIBS);
 		}
 	}
 }
 
-static void evergreen_bind_state(struct pipe_context *ctx, void *state)
+static void evergreen_set_ps_sampler_view(struct pipe_context *ctx, unsigned count,
+					struct pipe_sampler_view **views)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_state *rstate = (struct r600_pipe_state *)state;
-
-	if (state == NULL)
-		return;
-	rctx->states[rstate->id] = rstate;
-	r600_context_pipe_state_set(&rctx->ctx, rstate);
+	struct r600_pipe_sampler_view **resource = (struct r600_pipe_sampler_view **)views;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		if (&rctx->ps_samplers.views[i]->base != views[i]) {
+			if (resource[i])
+				evergreen_context_pipe_state_set_ps_resource(&rctx->ctx, &resource[i]->state, i);
+			else
+				evergreen_context_pipe_state_set_ps_resource(&rctx->ctx, NULL, i);
+
+			pipe_sampler_view_reference(
+				(struct pipe_sampler_view **)&rctx->ps_samplers.views[i],
+				views[i]);
+		}
+	}
+	for (i = count; i < NUM_TEX_UNITS; i++) {
+		if (rctx->ps_samplers.views[i]) {
+			evergreen_context_pipe_state_set_ps_resource(&rctx->ctx, NULL, i);
+			pipe_sampler_view_reference((struct pipe_sampler_view **)&rctx->ps_samplers.views[i], NULL);
+		}
+	}
+	rctx->ps_samplers.n_views = count;
 }
 
 static void evergreen_bind_ps_sampler(struct pipe_context *ctx, unsigned count, void **states)
@@ -514,6 +462,10 @@ static void evergreen_bind_ps_sampler(struct pipe_context *ctx, unsigned count,
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state **rstates = (struct r600_pipe_state **)states;
 
+
+	memcpy(rctx->ps_samplers.samplers, states, sizeof(void*) * count);
+	rctx->ps_samplers.n_samplers = count;
+
 	for (int i = 0; i < count; i++) {
 		evergreen_context_pipe_state_set_ps_sampler(&rctx->ctx, rstates[i], i);
 	}
@@ -524,37 +476,11 @@ static void evergreen_bind_vs_sampler(struct pipe_context *ctx, unsigned count,
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state **rstates = (struct r600_pipe_state **)states;
 
-	/* TODO implement */
 	for (int i = 0; i < count; i++) {
 		evergreen_context_pipe_state_set_vs_sampler(&rctx->ctx, rstates[i], i);
 	}
 }
 
-static void evergreen_delete_state(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_state *rstate = (struct r600_pipe_state *)state;
-
-	if (rctx->states[rstate->id] == rstate) {
-		rctx->states[rstate->id] = NULL;
-	}
-	for (int i = 0; i < rstate->nregs; i++) {
-		r600_bo_reference(rctx->radeon, &rstate->regs[i].bo, NULL);
-	}
-	free(rstate);
-}
-
-static void evergreen_delete_vertex_element(struct pipe_context *ctx, void *state)
-{
-	struct r600_vertex_element *v = (struct r600_vertex_element*)state;
-
-	if (v == NULL)
-		return;
-	if (--v->refcount)
-		return;
-	free(v);
-}
-
 static void evergreen_set_clip_state(struct pipe_context *ctx,
 				const struct pipe_clip_state *state)
 {
@@ -590,19 +516,6 @@ static void evergreen_set_clip_state(struct pipe_context *ctx,
 	r600_context_pipe_state_set(&rctx->ctx, rstate);
 }
 
-static void evergreen_bind_vertex_elements(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_vertex_element *v = (struct r600_vertex_element*)state;
-
-	evergreen_delete_vertex_element(ctx, rctx->vertex_elements);
-	rctx->vertex_elements = v;
-	if (v) {
-		v->refcount++;
-//		rctx->vs_rebuild = TRUE;
-	}
-}
-
 static void evergreen_set_polygon_stipple(struct pipe_context *ctx,
 					 const struct pipe_poly_stipple *state)
 {
@@ -626,18 +539,6 @@ static void evergreen_set_scissor_state(struct pipe_context *ctx,
 	tl = S_028240_TL_X(state->minx) | S_028240_TL_Y(state->miny);
 	br = S_028244_BR_X(state->maxx) | S_028244_BR_Y(state->maxy);
 	r600_pipe_state_add_reg(rstate,
-				R_028030_PA_SC_SCREEN_SCISSOR_TL, tl,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_028034_PA_SC_SCREEN_SCISSOR_BR, br,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_028204_PA_SC_WINDOW_SCISSOR_TL, tl,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_028208_PA_SC_WINDOW_SCISSOR_BR, br,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
 				R_028210_PA_SC_CLIPRECT_0_TL, tl,
 				0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
@@ -661,15 +562,6 @@ static void evergreen_set_scissor_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate,
 				R_02822C_PA_SC_CLIPRECT_3_BR, br,
 				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_028200_PA_SC_WINDOW_OFFSET, 0x00000000,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_02820C_PA_SC_CLIPRECT_RULE, 0x0000FFFF,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_028230_PA_SC_EDGERULE, 0xAAAAAAAA,
-				0xFFFFFFFF, NULL);
 
 	free(rctx->states[R600_PIPE_STATE_SCISSOR]);
 	rctx->states[R600_PIPE_STATE_SCISSOR] = rstate;
@@ -733,6 +625,7 @@ static void evergreen_cb(struct r600_pipe_context *rctx, struct r600_pipe_state
 {
 	struct r600_resource_texture *rtex;
 	struct r600_resource *rbuffer;
+	struct r600_surface *surf;
 	unsigned level = state->cbufs[cb]->level;
 	unsigned pitch, slice;
 	unsigned color_info;
@@ -740,14 +633,15 @@ static void evergreen_cb(struct r600_pipe_context *rctx, struct r600_pipe_state
 	const struct util_format_description *desc;
 	struct r600_bo *bo[3];
 
+	surf = (struct r600_surface *)state->cbufs[cb];
 	rtex = (struct r600_resource_texture*)state->cbufs[cb]->texture;
 	rbuffer = &rtex->resource;
 	bo[0] = rbuffer->bo;
 	bo[1] = rbuffer->bo;
 	bo[2] = rbuffer->bo;
 
-	pitch = (rtex->pitch[level] / rtex->bpt) / 8 - 1;
-	slice = (rtex->pitch[level] / rtex->bpt) * state->cbufs[cb]->height / 64 - 1;
+	pitch = rtex->pitch_in_pixels[level] / 8 - 1;
+	slice = rtex->pitch_in_pixels[level] * surf->aligned_height / 64 - 1;
 	ntype = 0;
 	desc = util_format_description(rtex->resource.base.b.format);
 	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
@@ -794,32 +688,49 @@ static void evergreen_db(struct r600_pipe_context *rctx, struct r600_pipe_state
 {
 	struct r600_resource_texture *rtex;
 	struct r600_resource *rbuffer;
+	struct r600_surface *surf;
 	unsigned level;
-	unsigned pitch, slice, format;
+	unsigned pitch, slice, format, stencil_format;
 
 	if (state->zsbuf == NULL)
 		return;
 
+	level = state->zsbuf->level;
+
+	surf = (struct r600_surface *)state->zsbuf;
 	rtex = (struct r600_resource_texture*)state->zsbuf->texture;
 	rtex->tiled = 1;
-	rtex->array_mode = 2;
+	rtex->array_mode[level] = 2;
 	rtex->tile_type = 1;
 	rtex->depth = 1;
 	rbuffer = &rtex->resource;
 
-	level = state->zsbuf->level;
-	pitch = (rtex->pitch[level] / rtex->bpt) / 8 - 1;
-	slice = (rtex->pitch[level] / rtex->bpt) * state->zsbuf->height / 64 - 1;
+	pitch = rtex->pitch_in_pixels[level] / 8 - 1;
+	slice = rtex->pitch_in_pixels[level] * surf->aligned_height / 64 - 1;
 	format = r600_translate_dbformat(state->zsbuf->texture->format);
+	stencil_format = r600_translate_stencilformat(state->zsbuf->texture->format);
 
 	r600_pipe_state_add_reg(rstate, R_028048_DB_Z_READ_BASE,
 				(state->zsbuf->offset + r600_bo_offset(rbuffer->bo)) >> 8, 0xFFFFFFFF, rbuffer->bo);
 	r600_pipe_state_add_reg(rstate, R_028050_DB_Z_WRITE_BASE,
 				(state->zsbuf->offset  + r600_bo_offset(rbuffer->bo)) >> 8, 0xFFFFFFFF, rbuffer->bo);
-//	r600_pipe_state_add_reg(rstate, R_028014_DB_HTILE_DATA_BASE, state->zsbuf->offset >> 8, 0xFFFFFFFF, rbuffer->bo);
+
+	if (stencil_format) {
+		uint32_t stencil_offset;
+
+		stencil_offset = ((surf->aligned_height * rtex->pitch_in_bytes[level]) + 255) & ~255;
+		r600_pipe_state_add_reg(rstate, R_02804C_DB_STENCIL_READ_BASE,
+					(state->zsbuf->offset + stencil_offset + r600_bo_offset(rbuffer->bo)) >> 8, 0xFFFFFFFF, rbuffer->bo);
+		r600_pipe_state_add_reg(rstate, R_028054_DB_STENCIL_WRITE_BASE,
+					(state->zsbuf->offset + stencil_offset + r600_bo_offset(rbuffer->bo)) >> 8, 0xFFFFFFFF, rbuffer->bo);
+	}
+
 	r600_pipe_state_add_reg(rstate, R_028008_DB_DEPTH_VIEW, 0x00000000, 0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate, R_028044_DB_STENCIL_INFO,
+				S_028044_FORMAT(stencil_format), 0xFFFFFFFF, rbuffer->bo);
+
 	r600_pipe_state_add_reg(rstate, R_028040_DB_Z_INFO,
-				S_028040_ARRAY_MODE(rtex->array_mode) | S_028040_FORMAT(format),
+				S_028040_ARRAY_MODE(rtex->array_mode[level]) | S_028040_FORMAT(format),
 				0xFFFFFFFF, rbuffer->bo);
 	r600_pipe_state_add_reg(rstate, R_028058_DB_DEPTH_SIZE,
 				S_028058_PITCH_TILE_MAX(pitch),
@@ -841,14 +752,9 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 
 	/* unreference old buffer and reference new one */
 	rstate->id = R600_PIPE_STATE_FRAMEBUFFER;
-	for (int i = 0; i < rctx->framebuffer.nr_cbufs; i++) {
-		pipe_surface_reference(&rctx->framebuffer.cbufs[i], NULL);
-	}
-	for (int i = 0; i < state->nr_cbufs; i++) {
-		pipe_surface_reference(&rctx->framebuffer.cbufs[i], state->cbufs[i]);
-	}
-	pipe_surface_reference(&rctx->framebuffer.zsbuf, state->zsbuf);
-	rctx->framebuffer = *state;
+
+	util_copy_framebuffer_state(&rctx->framebuffer, state);
+
 	rctx->pframebuffer = &rctx->framebuffer;
 
 	/* build states */
@@ -881,6 +787,24 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate,
 				R_028254_PA_SC_VPORT_SCISSOR_0_BR, br,
 				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028030_PA_SC_SCREEN_SCISSOR_TL, tl,
+				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028034_PA_SC_SCREEN_SCISSOR_BR, br,
+				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028204_PA_SC_WINDOW_SCISSOR_TL, tl,
+				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028208_PA_SC_WINDOW_SCISSOR_BR, br,
+				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028200_PA_SC_WINDOW_OFFSET, 0x00000000,
+				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028230_PA_SC_EDGERULE, 0xAAAAAAAA,
+				0xFFFFFFFF, NULL);
 
 	r600_pipe_state_add_reg(rstate, R_028238_CB_TARGET_MASK,
 				0x00000000, target_mask, NULL);
@@ -896,40 +820,6 @@ static void evergreen_set_framebuffer_state(struct pipe_context *ctx,
 	r600_context_pipe_state_set(&rctx->ctx, rstate);
 }
 
-static void evergreen_set_index_buffer(struct pipe_context *ctx,
-				  const struct pipe_index_buffer *ib)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	if (ib) {
-		pipe_resource_reference(&rctx->index_buffer.buffer, ib->buffer);
-		memcpy(&rctx->index_buffer, ib, sizeof(rctx->index_buffer));
-	} else {
-		pipe_resource_reference(&rctx->index_buffer.buffer, NULL);
-		memset(&rctx->index_buffer, 0, sizeof(rctx->index_buffer));
-	}
-
-	/* TODO make this more like a state */
-}
-
-static void evergreen_set_vertex_buffers(struct pipe_context *ctx, unsigned count,
-					const struct pipe_vertex_buffer *buffers)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	for (int i = 0; i < rctx->nvertex_buffer; i++) {
-		pipe_resource_reference(&rctx->vertex_buffer[i].buffer, NULL);
-	}
-	memcpy(rctx->vertex_buffer, buffers, sizeof(struct pipe_vertex_buffer) * count);
-	for (int i = 0; i < count; i++) {
-		rctx->vertex_buffer[i].buffer = NULL;
-		if (r600_buffer_is_user_buffer(buffers[i].buffer))
-			rctx->any_user_vbs = TRUE;
-		pipe_resource_reference(&rctx->vertex_buffer[i].buffer, buffers[i].buffer);
-	}
-	rctx->nvertex_buffer = count;
-}
-
 static void evergreen_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
 					struct pipe_resource *buffer)
 {
@@ -965,84 +855,31 @@ static void evergreen_set_constant_buffer(struct pipe_context *ctx, uint shader,
 	}
 }
 
-static void *evergreen_create_shader_state(struct pipe_context *ctx,
-					const struct pipe_shader_state *state)
-{
-	struct r600_pipe_shader *shader =  CALLOC_STRUCT(r600_pipe_shader);
-	int r;
-
-	r =  r600_pipe_shader_create(ctx, shader, state->tokens);
-	if (r) {
-		return NULL;
-	}
-	return shader;
-}
-
-static void evergreen_bind_ps_shader(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	/* TODO delete old shader */
-	rctx->ps_shader = (struct r600_pipe_shader *)state;
-}
-
-static void evergreen_bind_vs_shader(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	/* TODO delete old shader */
-	rctx->vs_shader = (struct r600_pipe_shader *)state;
-}
-
-static void evergreen_delete_ps_shader(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_shader *shader = (struct r600_pipe_shader *)state;
-
-	if (rctx->ps_shader == shader) {
-		rctx->ps_shader = NULL;
-	}
-	/* TODO proper delete */
-	free(shader);
-}
-
-static void evergreen_delete_vs_shader(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_shader *shader = (struct r600_pipe_shader *)state;
-
-	if (rctx->vs_shader == shader) {
-		rctx->vs_shader = NULL;
-	}
-	/* TODO proper delete */
-	free(shader);
-}
-
 void evergreen_init_state_functions(struct r600_pipe_context *rctx)
 {
 	rctx->context.create_blend_state = evergreen_create_blend_state;
 	rctx->context.create_depth_stencil_alpha_state = evergreen_create_dsa_state;
-	rctx->context.create_fs_state = evergreen_create_shader_state;
+	rctx->context.create_fs_state = r600_create_shader_state;
 	rctx->context.create_rasterizer_state = evergreen_create_rs_state;
 	rctx->context.create_sampler_state = evergreen_create_sampler_state;
 	rctx->context.create_sampler_view = evergreen_create_sampler_view;
-	rctx->context.create_vertex_elements_state = evergreen_create_vertex_elements;
-	rctx->context.create_vs_state = evergreen_create_shader_state;
-	rctx->context.bind_blend_state = evergreen_bind_blend_state;
-	rctx->context.bind_depth_stencil_alpha_state = evergreen_bind_state;
+	rctx->context.create_vertex_elements_state = r600_create_vertex_elements;
+	rctx->context.create_vs_state = r600_create_shader_state;
+	rctx->context.bind_blend_state = r600_bind_blend_state;
+	rctx->context.bind_depth_stencil_alpha_state = r600_bind_state;
 	rctx->context.bind_fragment_sampler_states = evergreen_bind_ps_sampler;
-	rctx->context.bind_fs_state = evergreen_bind_ps_shader;
-	rctx->context.bind_rasterizer_state = evergreen_bind_rs_state;
-	rctx->context.bind_vertex_elements_state = evergreen_bind_vertex_elements;
+	rctx->context.bind_fs_state = r600_bind_ps_shader;
+	rctx->context.bind_rasterizer_state = r600_bind_rs_state;
+	rctx->context.bind_vertex_elements_state = r600_bind_vertex_elements;
 	rctx->context.bind_vertex_sampler_states = evergreen_bind_vs_sampler;
-	rctx->context.bind_vs_state = evergreen_bind_vs_shader;
-	rctx->context.delete_blend_state = evergreen_delete_state;
-	rctx->context.delete_depth_stencil_alpha_state = evergreen_delete_state;
-	rctx->context.delete_fs_state = evergreen_delete_ps_shader;
-	rctx->context.delete_rasterizer_state = evergreen_delete_rs_state;
-	rctx->context.delete_sampler_state = evergreen_delete_state;
-	rctx->context.delete_vertex_elements_state = evergreen_delete_vertex_element;
-	rctx->context.delete_vs_state = evergreen_delete_vs_shader;
+	rctx->context.bind_vs_state = r600_bind_vs_shader;
+	rctx->context.delete_blend_state = r600_delete_state;
+	rctx->context.delete_depth_stencil_alpha_state = r600_delete_state;
+	rctx->context.delete_fs_state = r600_delete_ps_shader;
+	rctx->context.delete_rasterizer_state = r600_delete_rs_state;
+	rctx->context.delete_sampler_state = r600_delete_state;
+	rctx->context.delete_vertex_elements_state = r600_delete_vertex_element;
+	rctx->context.delete_vs_state = r600_delete_vs_shader;
 	rctx->context.set_blend_color = evergreen_set_blend_color;
 	rctx->context.set_clip_state = evergreen_set_clip_state;
 	rctx->context.set_constant_buffer = evergreen_set_constant_buffer;
@@ -1052,11 +889,11 @@ void evergreen_init_state_functions(struct r600_pipe_context *rctx)
 	rctx->context.set_sample_mask = evergreen_set_sample_mask;
 	rctx->context.set_scissor_state = evergreen_set_scissor_state;
 	rctx->context.set_stencil_ref = evergreen_set_stencil_ref;
-	rctx->context.set_vertex_buffers = evergreen_set_vertex_buffers;
-	rctx->context.set_index_buffer = evergreen_set_index_buffer;
+	rctx->context.set_vertex_buffers = r600_set_vertex_buffers;
+	rctx->context.set_index_buffer = r600_set_index_buffer;
 	rctx->context.set_vertex_sampler_views = evergreen_set_vs_sampler_view;
 	rctx->context.set_viewport_state = evergreen_set_viewport_state;
-	rctx->context.sampler_view_destroy = evergreen_sampler_view_destroy;
+	rctx->context.sampler_view_destroy = r600_sampler_view_destroy;
 }
 
 void evergreen_init_config(struct r600_pipe_context *rctx)
@@ -1339,6 +1176,12 @@ void evergreen_draw(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	struct r600_draw rdraw;
 	struct r600_pipe_state vgt;
 	struct r600_drawl draw;
+	boolean translate = FALSE;
+
+	if (rctx->vertex_elements->incompatible_layout) {
+		r600_begin_vertex_translate(rctx);
+		translate = TRUE;
+	}
 
 	if (rctx->any_user_vbs) {
 		r600_upload_user_buffers(rctx);
@@ -1415,11 +1258,11 @@ void evergreen_draw(struct pipe_context *ctx, const struct pipe_draw_info *info)
 			vertex_buffer->buffer_offset +
 			r600_bo_offset(rbuffer->bo);
 
-		format = r600_translate_vertex_data_type(rctx->vertex_elements->elements[i].src_format);
+		format = r600_translate_vertex_data_type(rctx->vertex_elements->hw_format[i]);
 
 		word2 = format | S_030008_STRIDE(vertex_buffer->stride);
 
-		word3 = r600_translate_vertex_data_swizzle(rctx->vertex_elements->elements[i].src_format);
+		word3 = r600_translate_vertex_data_swizzle(rctx->vertex_elements->hw_format[i]);
 
 		r600_pipe_state_add_reg(rstate, R_030000_RESOURCE0_WORD0, offset, 0xFFFFFFFF, rbuffer->bo);
 		r600_pipe_state_add_reg(rstate, R_030004_RESOURCE0_WORD1, rbuffer->size - offset - 1, 0xFFFFFFFF, NULL);
@@ -1500,6 +1343,9 @@ void evergreen_draw(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	}
 	evergreen_context_draw(&rctx->ctx, &rdraw);
 
+	if (translate)
+		r600_end_vertex_translate(rctx);
+
 	pipe_resource_reference(&draw.index_buffer, NULL);
 }
 
@@ -1508,23 +1354,39 @@ void evergreen_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state *rstate = &shader->rstate;
 	struct r600_shader *rshader = &shader->shader;
-	unsigned i, tmp, exports_ps, num_cout, spi_ps_in_control_0, spi_input_z;
-	boolean have_pos = FALSE, have_face = FALSE;
+	unsigned i, tmp, exports_ps, num_cout, spi_ps_in_control_0, spi_input_z, spi_ps_in_control_1;
+	int pos_index = -1, face_index = -1;
+	int ninterp = 0;
+	boolean have_linear = FALSE, have_centroid = FALSE, have_perspective = FALSE;
+	unsigned spi_baryc_cntl;
 
 	/* clear previous register */
 	rstate->nregs = 0;
 
 	for (i = 0; i < rshader->ninput; i++) {
 		tmp = S_028644_SEMANTIC(r600_find_vs_semantic_index(&rctx->vs_shader->shader, rshader, i));
+		/* evergreen NUM_INTERP only contains values interpolated into the LDS,
+		   POSITION goes via GPRs from the SC so isn't counted */
 		if (rshader->input[i].name == TGSI_SEMANTIC_POSITION)
-			have_pos = TRUE;
+			pos_index = i;
+		else if (rshader->input[i].name == TGSI_SEMANTIC_FACE)
+			face_index = i;
+		else {
+			if (rshader->input[i].interpolate == TGSI_INTERPOLATE_LINEAR ||
+			    rshader->input[i].interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
+				ninterp++;
+			if (rshader->input[i].interpolate == TGSI_INTERPOLATE_LINEAR)
+				have_linear = TRUE;
+			if (rshader->input[i].interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
+				have_perspective = TRUE;
+			if (rshader->input[i].centroid)
+				have_centroid = TRUE;
+		}
 		if (rshader->input[i].name == TGSI_SEMANTIC_COLOR ||
 		    rshader->input[i].name == TGSI_SEMANTIC_BCOLOR ||
 		    rshader->input[i].name == TGSI_SEMANTIC_POSITION) {
 			tmp |= S_028644_FLAT_SHADE(rshader->flat_shade);
 		}
-		if (rshader->input[i].name == TGSI_SEMANTIC_FACE)
-			have_face = TRUE;
 		if (rshader->input[i].name == TGSI_SEMANTIC_GENERIC &&
 			rctx->sprite_coord_enable & (1 << rshader->input[i].sid)) {
 			tmp |= S_028644_PT_SPRITE_TEX(1);
@@ -1532,17 +1394,23 @@ void evergreen_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader
 		r600_pipe_state_add_reg(rstate, R_028644_SPI_PS_INPUT_CNTL_0 + i * 4, tmp, 0xFFFFFFFF, NULL);
 	}
 	for (i = 0; i < rshader->noutput; i++) {
-		if (rshader->input[i].name == TGSI_SEMANTIC_POSITION)
+		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION)
 			r600_pipe_state_add_reg(rstate,
 						R_02880C_DB_SHADER_CONTROL,
 						S_02880C_Z_EXPORT_ENABLE(1),
 						S_02880C_Z_EXPORT_ENABLE(1), NULL);
+		if (rshader->output[i].name == TGSI_SEMANTIC_STENCIL)
+			r600_pipe_state_add_reg(rstate,
+						R_02880C_DB_SHADER_CONTROL,
+						S_02880C_STENCIL_EXPORT_ENABLE(1),
+						S_02880C_STENCIL_EXPORT_ENABLE(1), NULL);
 	}
 
 	exports_ps = 0;
 	num_cout = 0;
 	for (i = 0; i < rshader->noutput; i++) {
-		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION)
+		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION ||
+		    rshader->output[i].name == TGSI_SEMANTIC_STENCIL)
 			exports_ps |= 1;
 		else if (rshader->output[i].name == TGSI_SEMANTIC_COLOR) {
 			num_cout++;
@@ -1554,19 +1422,49 @@ void evergreen_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader
 		exports_ps = 2;
 	}
 
-	spi_ps_in_control_0 = S_0286CC_NUM_INTERP(rshader->ninput) |
-				S_0286CC_PERSP_GRADIENT_ENA(1);
+	if (ninterp == 0) {
+		ninterp = 1;
+		have_perspective = TRUE;
+	}
+
+	spi_ps_in_control_0 = S_0286CC_NUM_INTERP(ninterp) |
+		              S_0286CC_PERSP_GRADIENT_ENA(have_perspective) |
+		              S_0286CC_LINEAR_GRADIENT_ENA(have_linear);
 	spi_input_z = 0;
-	if (have_pos) {
-		spi_ps_in_control_0 |=  S_0286CC_POSITION_ENA(1);
+	if (pos_index != -1) {
+		spi_ps_in_control_0 |=  S_0286CC_POSITION_ENA(1) |
+			S_0286CC_POSITION_CENTROID(rshader->input[pos_index].centroid) |
+			S_0286CC_POSITION_ADDR(rshader->input[pos_index].gpr);
 		spi_input_z |= 1;
 	}
+
+	spi_ps_in_control_1 = 0;
+	if (face_index != -1) {
+		spi_ps_in_control_1 |= S_0286D0_FRONT_FACE_ENA(1) |
+			S_0286D0_FRONT_FACE_ADDR(rshader->input[face_index].gpr);
+	}
+
+	spi_baryc_cntl = 0;
+	if (have_perspective)
+		spi_baryc_cntl |= S_0286E0_PERSP_CENTER_ENA(1) |
+				  S_0286E0_PERSP_CENTROID_ENA(have_centroid);
+	if (have_linear)
+		spi_baryc_cntl |= S_0286E0_LINEAR_CENTER_ENA(1) |
+			          S_0286E0_LINEAR_CENTROID_ENA(have_centroid);
+				
 	r600_pipe_state_add_reg(rstate, R_0286CC_SPI_PS_IN_CONTROL_0,
 				spi_ps_in_control_0, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_0286D0_SPI_PS_IN_CONTROL_1,
-				S_0286D0_FRONT_FACE_ENA(have_face), 0xFFFFFFFF, NULL);
+				spi_ps_in_control_1, 0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate, R_0286E4_SPI_PS_IN_CONTROL_2,
+				0, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_0286D8_SPI_INPUT_Z, spi_input_z, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
+				R_0286E0_SPI_BARYC_CNTL,
+				spi_baryc_cntl,
+				0xFFFFFFFF, NULL);
+
+	r600_pipe_state_add_reg(rstate,
 				R_028840_SQ_PGM_START_PS,
 				(r600_bo_offset(shader->bo)) >> 8, 0xFFFFFFFF, shader->bo);
 	r600_pipe_state_add_reg(rstate,
@@ -1581,11 +1479,6 @@ void evergreen_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shader
 	r600_pipe_state_add_reg(rstate,
 				R_02884C_SQ_PGM_EXPORTS_PS,
 				exports_ps, 0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_0286E0_SPI_BARYC_CNTL,
-				S_0286E0_PERSP_CENTROID_ENA(1) |
-				S_0286E0_LINEAR_CENTROID_ENA(1),
-				0xFFFFFFFF, NULL);
 
 	if (rshader->uses_kill) {
 		/* only set some bits here, the other bits are set in the dsa state */
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 9971dded782..8e96f9355e6 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -686,6 +686,9 @@
 #define   S_02880C_Z_EXPORT_ENABLE(x)                  (((x) & 0x1) << 0)
 #define   G_02880C_Z_EXPORT_ENABLE(x)                  (((x) >> 0) & 0x1)
 #define   C_02880C_Z_EXPORT_ENABLE                     0xFFFFFFFE
+#define   S_02880C_STENCIL_EXPORT_ENABLE(x)            (((x) & 0x1) << 1)
+#define   G_02880C_STENCIL_EXPORT_ENABLE(x)            (((x) >> 1) & 0x1)
+#define   C_02880C_STENCIL_EXPORT_ENABLE               0xFFFFFFFD
 #define   S_02880C_Z_ORDER(x)                          (((x) & 0x3) << 4)
 #define   G_02880C_Z_ORDER(x)                          (((x) >> 4) & 0x3)
 #define   C_02880C_Z_ORDER                             0xFFFFFCFF
@@ -984,9 +987,6 @@
 #define   S_030010_ENDIAN_SWAP(x)                      (((x) & 0x3) << 12)
 #define   G_030010_ENDIAN_SWAP(x)                      (((x) >> 12) & 0x3)
 #define   C_030010_ENDIAN_SWAP                         0xFFFFCFFF
-#define   S_030010_REQUEST_SIZE(x)                     (((x) & 0x3) << 14)
-#define   G_030010_REQUEST_SIZE(x)                     (((x) >> 14) & 0x3)
-#define   C_030010_REQUEST_SIZE                        0xFFFF3FFF
 #define   S_030010_DST_SEL_X(x)                        (((x) & 0x7) << 16)
 #define   G_030010_DST_SEL_X(x)                        (((x) >> 16) & 0x7)
 #define   C_030010_DST_SEL_X                           0xFFF8FFFF
@@ -1050,45 +1050,6 @@
 #define   S_030008_DATA_FORMAT(x)                      (((x) & 0x3F) << 20)
 #define   G_030008_DATA_FORMAT(x)                      (((x) >> 20) & 0x3F)
 #define   C_030008_DATA_FORMAT                         0xFC0FFFFF
-#define     V_030008_FMT_INVALID                     0x00000000
-#define     V_030008_FMT_8                           0x00000001
-#define     V_030008_FMT_4_4                         0x00000002
-#define     V_030008_FMT_3_3_2                       0x00000003
-#define     V_030008_FMT_16                          0x00000005
-#define     V_030008_FMT_16_FLOAT                    0x00000006
-#define     V_030008_FMT_8_8                         0x00000007
-#define     V_030008_FMT_5_6_5                       0x00000008
-#define     V_030008_FMT_6_5_5                       0x00000009
-#define     V_030008_FMT_1_5_5_5                     0x0000000A
-#define     V_030008_FMT_4_4_4_4                     0x0000000B
-#define     V_030008_FMT_5_5_5_1                     0x0000000C
-#define     V_030008_FMT_32                          0x0000000D
-#define     V_030008_FMT_32_FLOAT                    0x0000000E
-#define     V_030008_FMT_16_16                       0x0000000F
-#define     V_030008_FMT_16_16_FLOAT                 0x00000010
-#define     V_030008_FMT_8_24                        0x00000011
-#define     V_030008_FMT_8_24_FLOAT                  0x00000012
-#define     V_030008_FMT_24_8                        0x00000013
-#define     V_030008_FMT_24_8_FLOAT                  0x00000014
-#define     V_030008_FMT_10_11_11                    0x00000015
-#define     V_030008_FMT_10_11_11_FLOAT              0x00000016
-#define     V_030008_FMT_11_11_10                    0x00000017
-#define     V_030008_FMT_11_11_10_FLOAT              0x00000018
-#define     V_030008_FMT_2_10_10_10                  0x00000019
-#define     V_030008_FMT_8_8_8_8                     0x0000001A
-#define     V_030008_FMT_10_10_10_2                  0x0000001B
-#define     V_030008_FMT_X24_8_32_FLOAT              0x0000001C
-#define     V_030008_FMT_32_32                       0x0000001D
-#define     V_030008_FMT_32_32_FLOAT                 0x0000001E
-#define     V_030008_FMT_16_16_16_16                 0x0000001F
-#define     V_030008_FMT_16_16_16_16_FLOAT           0x00000020
-#define     V_030008_FMT_32_32_32_32                 0x00000022
-#define     V_030008_FMT_32_32_32_32_FLOAT           0x00000023
-#define     V_030008_FMT_8_8_8                       0x0000002c
-#define     V_030008_FMT_16_16_16                    0x0000002d
-#define     V_030008_FMT_16_16_16_FLOAT              0x0000002e
-#define     V_030008_FMT_32_32_32                    0x0000002f
-#define     V_030008_FMT_32_32_32_FLOAT              0x00000030
 #define   S_030008_NUM_FORMAT_ALL(x)                   (((x) & 0x3) << 26)
 #define   G_030008_NUM_FORMAT_ALL(x)                   (((x) >> 26) & 0x3)
 #define   C_030008_NUM_FORMAT_ALL                      0xF3FFFFFF
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index 630177d6add..62d983269f5 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -99,15 +99,22 @@ enum chip_class {
 	EVERGREEN,
 };
 
+struct r600_tiling_info {
+	unsigned num_channels;
+	unsigned num_banks;
+	unsigned group_bytes;
+};
+
 enum radeon_family r600_get_family(struct radeon *rw);
 enum chip_class r600_get_family_class(struct radeon *radeon);
+struct r600_tiling_info *r600_get_tiling_info(struct radeon *radeon);
 
 /* r600_bo.c */
 struct r600_bo;
 struct r600_bo *r600_bo(struct radeon *radeon,
 				  unsigned size, unsigned alignment, unsigned usage);
 struct r600_bo *r600_bo_handle(struct radeon *radeon,
-					 unsigned handle);
+			       unsigned handle, unsigned *array_mode);
 void *r600_bo_map(struct radeon *radeon, struct r600_bo *bo, unsigned usage, void *ctx);
 void r600_bo_unmap(struct radeon *radeon, struct r600_bo *bo);
 void r600_bo_reference(struct radeon *radeon, struct r600_bo **dst,
@@ -233,6 +240,10 @@ struct r600_context {
 	u32			*pm4;
 	struct list_head	query_list;
 	unsigned		num_query_running;
+	unsigned		fence;
+	struct list_head	fenced_bo;
+	unsigned		*cfence;
+	struct r600_bo		*fence_bo;
 };
 
 struct r600_draw {
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index d13da0ef638..c22628423bd 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -701,9 +701,14 @@ static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
 	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
 	case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
-			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache0_mode);
+			S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache0_mode) |
+			S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache0_bank) |
+			S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache1_bank);
 
 		bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
+			S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache1_mode) |
+			S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache0_addr) |
+			S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache1_addr) |
 					S_SQ_CF_ALU_WORD1_BARRIER(1) |
 					S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chiprev == 0 ? cf->r6xx_uses_waterfall : 0) |
 					S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
@@ -871,3 +876,39 @@ int r600_bc_build(struct r600_bc *bc)
 	}
 	return 0;
 }
+
+void r600_bc_clear(struct r600_bc *bc)
+{
+	struct r600_bc_cf *cf, *next_cf;
+
+	free(bc->bytecode);
+	bc->bytecode = NULL;
+
+	LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
+		struct r600_bc_alu *alu, *next_alu;
+		struct r600_bc_tex *tex, *next_tex;
+		struct r600_bc_tex *vtx, *next_vtx;
+
+		LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
+			free(alu);
+		}
+
+		LIST_INITHEAD(&cf->alu);
+
+		LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
+			free(tex);
+		}
+
+		LIST_INITHEAD(&cf->tex);
+
+		LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
+			free(vtx);
+		}
+
+		LIST_INITHEAD(&cf->vtx);
+
+		free(cf);
+	}
+
+	LIST_INITHEAD(&cf->list);
+}
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index bebc7c15b00..25cda16837d 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -132,6 +132,11 @@ struct r600_bc_cf {
 	unsigned			pop_count;
 	unsigned			cf_addr; /* control flow addr */
 	unsigned			kcache0_mode;
+	unsigned			kcache1_mode;
+	unsigned			kcache0_addr;
+	unsigned			kcache1_addr;
+	unsigned			kcache0_bank;
+	unsigned			kcache1_bank;
 	unsigned			r6xx_uses_waterfall;
 	struct list_head		alu;
 	struct list_head		tex;
@@ -185,6 +190,7 @@ int eg_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf);
 
 /* r600_asm.c */
 int r600_bc_init(struct r600_bc *bc, enum radeon_family family);
+void r600_bc_clear(struct r600_bc *bc);
 int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu);
 int r600_bc_add_literal(struct r600_bc *bc, const u32 *value);
 int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx);
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 4bf44a171af..50d47060c1a 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -22,12 +22,22 @@
  */
 #include <util/u_surface.h>
 #include <util/u_blitter.h>
+#include <util/u_format.h>
 #include "r600_pipe.h"
 
-static void r600_blitter_save_states(struct pipe_context *ctx)
+enum r600_blitter_op /* bitmask */
+{
+    R600_CLEAR         = 1,
+    R600_CLEAR_SURFACE = 2,
+    R600_COPY          = 4
+};
+
+static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 
+	r600_context_queries_suspend(&rctx->ctx);
+
 	util_blitter_save_blend(rctx->blitter, rctx->states[R600_PIPE_STATE_BLEND]);
 	util_blitter_save_depth_stencil_alpha(rctx->blitter, rctx->states[R600_PIPE_STATE_DSA]);
 	if (rctx->states[R600_PIPE_STATE_STENCIL_REF]) {
@@ -47,25 +57,34 @@ static void r600_blitter_save_states(struct pipe_context *ctx)
 
 	rctx->vertex_elements = NULL;
 
-	/* TODO queries */
+	if (op & (R600_CLEAR_SURFACE | R600_COPY))
+		util_blitter_save_framebuffer(rctx->blitter, &rctx->framebuffer);
+
+	if (op & R600_COPY) {
+		util_blitter_save_fragment_sampler_states(
+			rctx->blitter, rctx->ps_samplers.n_samplers,
+			(void**)rctx->ps_samplers.samplers);
+
+		util_blitter_save_fragment_sampler_views(
+			rctx->blitter, rctx->ps_samplers.n_views,
+			(struct pipe_sampler_view**)rctx->ps_samplers.views);
+	}
+
+}
+
+static void r600_blitter_end(struct pipe_context *ctx)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	r600_context_queries_resume(&rctx->ctx);
 }
 
 int r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *texture)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct pipe_framebuffer_state fb = *rctx->pframebuffer;
 	struct pipe_surface *zsurf, *cbsurf;
 	int level = 0;
 	float depth = 1.0f;
 
-	r600_context_queries_suspend(&rctx->ctx);
-	for (int i = 0; i < fb.nr_cbufs; i++) {
-		fb.cbufs[i] = NULL;
-		pipe_surface_reference(&fb.cbufs[i], rctx->pframebuffer->cbufs[i]);
-	}
-	fb.zsbuf = NULL;
-	pipe_surface_reference(&fb.zsbuf, rctx->pframebuffer->zsbuf);
-
 	zsurf = ctx->screen->get_tex_surface(ctx->screen, &texture->resource.base.b, 0, level, 0,
 					     PIPE_BIND_DEPTH_STENCIL);
 
@@ -73,22 +92,17 @@ int r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_te
 			(struct pipe_resource*)texture->flushed_depth_texture,
 			0, level, 0, PIPE_BIND_RENDER_TARGET);
 
-	r600_blitter_save_states(ctx);
-	util_blitter_save_framebuffer(rctx->blitter, &fb);
-
 	if (rctx->family == CHIP_RV610 || rctx->family == CHIP_RV630 ||
-		rctx->family == CHIP_RV620 || rctx->family == CHIP_RV635)
+	    rctx->family == CHIP_RV620 || rctx->family == CHIP_RV635)
 		depth = 0.0f;
 
+	r600_blitter_begin(ctx, R600_CLEAR_SURFACE);
 	util_blitter_custom_depth_stencil(rctx->blitter, zsurf, cbsurf, rctx->custom_dsa_flush, depth);
+	r600_blitter_end(ctx);
 
 	pipe_surface_reference(&zsurf, NULL);
 	pipe_surface_reference(&cbsurf, NULL);
-	for (int i = 0; i < fb.nr_cbufs; i++) {
-		pipe_surface_reference(&fb.cbufs[i], NULL);
-	}
-	pipe_surface_reference(&fb.zsbuf, NULL);
-	r600_context_queries_resume(&rctx->ctx);
+
 
 	return 0;
 }
@@ -99,12 +113,11 @@ static void r600_clear(struct pipe_context *ctx, unsigned buffers,
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct pipe_framebuffer_state *fb = &rctx->framebuffer;
 
-	r600_context_queries_suspend(&rctx->ctx);
-	r600_blitter_save_states(ctx);
+	r600_blitter_begin(ctx, R600_CLEAR);
 	util_blitter_clear(rctx->blitter, fb->width, fb->height,
 				fb->nr_cbufs, buffers, rgba, depth,
 				stencil);
-	r600_context_queries_resume(&rctx->ctx);
+	r600_blitter_end(ctx);
 }
 
 static void r600_clear_render_target(struct pipe_context *ctx,
@@ -114,13 +127,11 @@ static void r600_clear_render_target(struct pipe_context *ctx,
 				     unsigned width, unsigned height)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct pipe_framebuffer_state *fb = &rctx->framebuffer;
 
-	r600_context_queries_suspend(&rctx->ctx);
-	util_blitter_save_framebuffer(rctx->blitter, fb);
+	r600_blitter_begin(ctx, R600_CLEAR_SURFACE);
 	util_blitter_clear_render_target(rctx->blitter, dst, rgba,
 					 dstx, dsty, width, height);
-	r600_context_queries_resume(&rctx->ctx);
+	r600_blitter_end(ctx);
 }
 
 static void r600_clear_depth_stencil(struct pipe_context *ctx,
@@ -132,16 +143,34 @@ static void r600_clear_depth_stencil(struct pipe_context *ctx,
 				     unsigned width, unsigned height)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct pipe_framebuffer_state *fb = &rctx->framebuffer;
 
-	r600_context_queries_suspend(&rctx->ctx);
-	util_blitter_save_framebuffer(rctx->blitter, fb);
+	r600_blitter_begin(ctx, R600_CLEAR_SURFACE);
 	util_blitter_clear_depth_stencil(rctx->blitter, dst, clear_flags, depth, stencil,
 					 dstx, dsty, width, height);
-	r600_context_queries_resume(&rctx->ctx);
+	r600_blitter_end(ctx);
 }
 
 
+
+/* Copy a block of pixels from one surface to another using HW. */
+static void r600_hw_copy_region(struct pipe_context *ctx,
+                                struct pipe_resource *dst,
+                                struct pipe_subresource subdst,
+                                unsigned dstx, unsigned dsty, unsigned dstz,
+                                struct pipe_resource *src,
+                                struct pipe_subresource subsrc,
+                                unsigned srcx, unsigned srcy, unsigned srcz,
+                                unsigned width, unsigned height)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+
+	r600_blitter_begin(ctx, R600_COPY);
+	util_blitter_copy_region(rctx->blitter, dst, subdst, dstx, dsty, dstz,
+				 src, subsrc, srcx, srcy, srcz, width, height,
+				 TRUE);
+	r600_blitter_end(ctx);
+}
+
 static void r600_resource_copy_region(struct pipe_context *ctx,
 				      struct pipe_resource *dst,
 				      struct pipe_subresource subdst,
@@ -151,8 +180,16 @@ static void r600_resource_copy_region(struct pipe_context *ctx,
 				      unsigned srcx, unsigned srcy, unsigned srcz,
 				      unsigned width, unsigned height)
 {
-	util_resource_copy_region(ctx, dst, subdst, dstx, dsty, dstz,
-				  src, subsrc, srcx, srcy, srcz, width, height);
+	boolean is_depth;
+	/* there is something wrong with depth resource copies at the moment so avoid them for now */
+	is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
+	if (is_depth)
+		util_resource_copy_region(ctx, dst, subdst, dstx, dsty, dstz,
+					  src, subsrc, srcx, srcy, srcz, width, height);
+	else
+		r600_hw_copy_region(ctx, dst, subdst, dstx, dsty, dstz,
+				    src, subsrc, srcx, srcy, srcz, width, height);
+
 }
 
 void r600_init_blit_functions(struct r600_pipe_context *rctx)
diff --git a/src/gallium/drivers/r600/r600_buffer.c b/src/gallium/drivers/r600/r600_buffer.c
index 2bfa4e22fec..455aa2e81f6 100644
--- a/src/gallium/drivers/r600/r600_buffer.c
+++ b/src/gallium/drivers/r600/r600_buffer.c
@@ -227,7 +227,7 @@ struct pipe_resource *r600_buffer_from_handle(struct pipe_screen *screen,
 	struct r600_resource *rbuffer;
 	struct r600_bo *bo = NULL;
 
-	bo = r600_bo_handle(rw, whandle->handle);
+	bo = r600_bo_handle(rw, whandle->handle, NULL);
 	if (bo == NULL) {
 		return NULL;
 	}
diff --git a/src/gallium/drivers/r600/r600_formats.h b/src/gallium/drivers/r600/r600_formats.h
new file mode 100644
index 00000000000..0c91a212384
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_formats.h
@@ -0,0 +1,56 @@
+#ifndef R600_FORMATS_H
+#define R600_FORMATS_H
+
+/* list of formats from R700 ISA document - apply across GPUs in different registers */
+#define     FMT_INVALID                     0x00000000
+#define     FMT_8                           0x00000001
+#define     FMT_4_4                         0x00000002
+#define     FMT_3_3_2                       0x00000003
+#define     FMT_16                          0x00000005
+#define     FMT_16_FLOAT                    0x00000006
+#define     FMT_8_8                         0x00000007
+#define     FMT_5_6_5                       0x00000008
+#define     FMT_6_5_5                       0x00000009
+#define     FMT_1_5_5_5                     0x0000000A
+#define     FMT_4_4_4_4                     0x0000000B
+#define     FMT_5_5_5_1                     0x0000000C
+#define     FMT_32                          0x0000000D
+#define     FMT_32_FLOAT                    0x0000000E
+#define     FMT_16_16                       0x0000000F
+#define     FMT_16_16_FLOAT                 0x00000010
+#define     FMT_8_24                        0x00000011
+#define     FMT_8_24_FLOAT                  0x00000012
+#define     FMT_24_8                        0x00000013
+#define     FMT_24_8_FLOAT                  0x00000014
+#define     FMT_10_11_11                    0x00000015
+#define     FMT_10_11_11_FLOAT              0x00000016
+#define     FMT_11_11_10                    0x00000017
+#define     FMT_11_11_10_FLOAT              0x00000018
+#define     FMT_2_10_10_10                  0x00000019
+#define     FMT_8_8_8_8                     0x0000001A
+#define     FMT_10_10_10_2                  0x0000001B
+#define     FMT_X24_8_32_FLOAT              0x0000001C
+#define     FMT_32_32                       0x0000001D
+#define     FMT_32_32_FLOAT                 0x0000001E
+#define     FMT_16_16_16_16                 0x0000001F
+#define     FMT_16_16_16_16_FLOAT           0x00000020
+#define     FMT_32_32_32_32                 0x00000022
+#define     FMT_32_32_32_32_FLOAT           0x00000023
+#define     FMT_1                           0x00000025
+#define     FMT_GB_GR                       0x00000027
+#define     FMT_BG_RG                       0x00000028
+#define     FMT_32_AS_8                     0x00000029
+#define     FMT_32_AS_8_8                   0x0000002a
+#define     FMT_5_9_9_9_SHAREDEXP           0x0000002b
+#define     FMT_8_8_8                       0x0000002c
+#define     FMT_16_16_16                    0x0000002d
+#define     FMT_16_16_16_FLOAT              0x0000002e
+#define     FMT_32_32_32                    0x0000002f
+#define     FMT_32_32_32_FLOAT              0x00000030
+#define     FMT_BC1                         0x00000031
+#define     FMT_BC2                         0x00000032
+#define     FMT_BC3                         0x00000033
+#define     FMT_BC4                         0x00000034
+#define     FMT_BC5                         0x00000035
+
+#endif
diff --git a/src/gallium/drivers/r600/r600_opcodes.h b/src/gallium/drivers/r600/r600_opcodes.h
index 0cf9c1c401c..4f9b39a7fdc 100644
--- a/src/gallium/drivers/r600/r600_opcodes.h
+++ b/src/gallium/drivers/r600/r600_opcodes.h
@@ -233,12 +233,6 @@
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL                      0x00000012
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RNDNE                     0x00000013
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR                     0x00000014
-/* same up to here */
-/*
-#define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA                      0x00000015
-#define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR                0x00000016
-#define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT                  0x00000018
-*/
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT                  0x00000015
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT                  0x00000016
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT                  0x00000017
@@ -336,9 +330,11 @@
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED_64      0x00000098
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_64                   0x00000099
 /* TODO Fill in more ALU */
+#define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR          0x000000B1
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4                      0x000000BE
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE                 0x000000BF
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE                      0x000000C0
+#define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT                  0x000000CC
 
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY                      0x000000D6
 #define     EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW                      0x000000D7
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 0589652f705..bea7ef5df84 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -85,6 +85,11 @@ static void r600_destroy_context(struct pipe_context *context)
 	u_upload_destroy(rctx->upload_vb);
 	u_upload_destroy(rctx->upload_ib);
 
+	if (rctx->tran.translate_cache)
+		translate_cache_destroy(rctx->tran.translate_cache);
+
+	FREE(rctx->ps_resource);
+	FREE(rctx->vs_resource);
 	FREE(rctx);
 }
 
@@ -171,6 +176,24 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, void
 		return NULL;
 	}
 
+	rctx->tran.translate_cache = translate_cache_create();
+	if (rctx->tran.translate_cache == NULL) {
+		FREE(rctx);
+		return NULL;
+	}
+	
+	rctx->vs_resource = CALLOC(R600_RESOURCE_ARRAY_SIZE, sizeof(struct r600_pipe_state));
+	if (!rctx->vs_resource) {
+		FREE(rctx);
+		return NULL;
+	}
+
+	rctx->ps_resource = CALLOC(R600_RESOURCE_ARRAY_SIZE, sizeof(struct r600_pipe_state));
+	if (!rctx->ps_resource) {
+		FREE(rctx);
+		return NULL;
+	}
+
 	class = r600_get_family_class(rctx->radeon);
 	if (class == R600 || class == R700)
 		rctx->custom_dsa_flush = r600_create_db_flush_dsa(rctx);
@@ -242,6 +265,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_INDEP_BLEND_ENABLE:
 	case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
 	case PIPE_CAP_DEPTH_CLAMP:
+	case PIPE_CAP_SHADER_STENCIL_EXPORT:
 		return 1;
 
 	/* Unsupported features (boolean caps). */
@@ -257,7 +281,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 		return 14;
 	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
 		/* FIXME allow this once infrastructure is there */
-		return 0;
+		return 16;
 	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
 	case PIPE_CAP_MAX_COMBINED_SAMPLERS:
 		return 16;
@@ -428,5 +452,7 @@ struct pipe_screen *r600_screen_create(struct radeon *radeon)
 	r600_init_screen_texture_functions(&rscreen->screen);
 	r600_init_screen_resource_functions(&rscreen->screen);
 
+	rscreen->tiling_info = r600_get_tiling_info(radeon);
+
 	return &rscreen->screen;
 }
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index c46029a5617..1c691f6b764 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -30,6 +30,7 @@
 #include <pipe/p_screen.h>
 #include <pipe/p_context.h>
 #include <util/u_math.h>
+#include "translate/translate_cache.h"
 #include "r600.h"
 #include "r600_public.h"
 #include "r600_shader.h"
@@ -58,6 +59,7 @@ enum r600_pipe_state_id {
 struct r600_screen {
 	struct pipe_screen		screen;
 	struct radeon			*radeon;
+	struct r600_tiling_info		*tiling_info;
 };
 
 struct r600_pipe_sampler_view {
@@ -81,8 +83,10 @@ struct r600_pipe_blend {
 struct r600_vertex_element
 {
 	unsigned			count;
-	unsigned			refcount;
-	struct pipe_vertex_element	elements[32];
+	struct pipe_vertex_element	elements[PIPE_MAX_ATTRIBS];
+	enum pipe_format		hw_format[PIPE_MAX_ATTRIBS];
+	unsigned			hw_format_size[PIPE_MAX_ATTRIBS];
+	boolean incompatible_layout;
 };
 
 struct r600_pipe_shader {
@@ -92,6 +96,29 @@ struct r600_pipe_shader {
 	struct r600_vertex_element	vertex_elements;
 };
 
+/* needed for blitter save */
+#define NUM_TEX_UNITS 16
+
+struct r600_textures_info {
+	struct r600_pipe_sampler_view   *views[NUM_TEX_UNITS];
+	unsigned                        n_views;
+	void				*samplers[NUM_TEX_UNITS];
+	unsigned                        n_samplers;
+};
+
+struct r600_translate_context {
+	/* Translate cache for incompatible vertex offset/stride/format fallback. */
+	struct translate_cache *translate_cache;
+
+	/* The vertex buffer slot containing the translated buffer. */
+	unsigned vb_slot;
+	/* Saved and new vertex element state. */
+	void *saved_velems, *new_velems;
+};
+
+#define R600_CONSTANT_ARRAY_SIZE 256
+#define R600_RESOURCE_ARRAY_SIZE 160
+
 struct r600_pipe_context {
 	struct pipe_context		context;
 	struct blitter_context		*blitter;
@@ -112,12 +139,8 @@ struct r600_pipe_context {
 	struct pipe_stencil_ref		stencil_ref;
 	struct pipe_viewport_state	viewport;
 	struct pipe_clip_state		clip;
-	unsigned			vs_nconst;
-	unsigned			ps_nconst;
-	struct r600_pipe_state		vs_const[256];
-	struct r600_pipe_state		ps_const[256];
-	struct r600_pipe_state		vs_resource[160];
-	struct r600_pipe_state		ps_resource[160];
+	struct r600_pipe_state		*vs_resource;
+	struct r600_pipe_state		*ps_resource;
 	struct r600_pipe_state		config;
 	struct r600_pipe_shader 	*ps_shader;
 	struct r600_pipe_shader 	*vs_shader;
@@ -130,6 +153,11 @@ struct r600_pipe_context {
 	struct u_upload_mgr		*upload_vb;
 	struct u_upload_mgr		*upload_ib;
 	unsigned			any_user_vbs;
+	struct r600_textures_info       ps_samplers;
+
+	unsigned vb_max_index;
+	struct r600_translate_context tran;
+
 };
 
 struct r600_drawl {
@@ -180,6 +208,7 @@ void r600_init_context_resource_functions(struct r600_pipe_context *r600);
 /* r600_shader.c */
 int r600_pipe_shader_update(struct pipe_context *ctx, struct r600_pipe_shader *shader);
 int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *shader, const struct tgsi_token *tokens);
+void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader);
 int r600_find_vs_semantic_index(struct r600_shader *vs,
 				struct r600_shader *ps, int id);
 
@@ -187,10 +216,6 @@ int r600_find_vs_semantic_index(struct r600_shader *vs,
 void r600_init_state_functions(struct r600_pipe_context *rctx);
 void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info);
 void r600_init_config(struct r600_pipe_context *rctx);
-void r600_translate_index_buffer(struct r600_pipe_context *r600,
-					struct pipe_resource **index_buffer,
-					unsigned *index_size,
-					unsigned *start, unsigned count);
 void *r600_create_db_flush_dsa(struct r600_pipe_context *rctx);
 /* r600_helper.h */
 int r600_conv_pipe_prim(unsigned pprim, unsigned *prim);
@@ -201,6 +226,38 @@ uint32_t r600_translate_texformat(enum pipe_format format,
 				  const unsigned char *swizzle_view, 
 				  uint32_t *word4_p, uint32_t *yuv_format_p);
 
+/* r600_translate.c */
+void r600_begin_vertex_translate(struct r600_pipe_context *rctx);
+void r600_end_vertex_translate(struct r600_pipe_context *rctx);
+void r600_translate_index_buffer(struct r600_pipe_context *r600,
+				 struct pipe_resource **index_buffer,
+				 unsigned *index_size,
+				 unsigned *start, unsigned count);
+
+/* r600_state_common.c */
+void r600_set_index_buffer(struct pipe_context *ctx,
+			   const struct pipe_index_buffer *ib);
+void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count,
+			     const struct pipe_vertex_buffer *buffers);
+void *r600_create_vertex_elements(struct pipe_context *ctx,
+				  unsigned count,
+				  const struct pipe_vertex_element *elements);
+void r600_delete_vertex_element(struct pipe_context *ctx, void *state);
+void r600_bind_blend_state(struct pipe_context *ctx, void *state);
+void r600_bind_rs_state(struct pipe_context *ctx, void *state);
+void r600_delete_rs_state(struct pipe_context *ctx, void *state);
+void r600_sampler_view_destroy(struct pipe_context *ctx,
+			       struct pipe_sampler_view *state);
+void r600_bind_state(struct pipe_context *ctx, void *state);
+void r600_delete_state(struct pipe_context *ctx, void *state);
+void r600_bind_vertex_elements(struct pipe_context *ctx, void *state);
+
+void *r600_create_shader_state(struct pipe_context *ctx,
+			       const struct pipe_shader_state *state);
+void r600_bind_ps_shader(struct pipe_context *ctx, void *state);
+void r600_bind_vs_shader(struct pipe_context *ctx, void *state);
+void r600_delete_ps_shader(struct pipe_context *ctx, void *state);
+void r600_delete_vs_shader(struct pipe_context *ctx, void *state);
 /*
  * common helpers
  */
diff --git a/src/gallium/drivers/r600/r600_resource.h b/src/gallium/drivers/r600/r600_resource.h
index f6377ea802b..d152285815c 100644
--- a/src/gallium/drivers/r600/r600_resource.h
+++ b/src/gallium/drivers/r600/r600_resource.h
@@ -25,6 +25,9 @@
 
 #include "util/u_transfer.h"
 
+/* flag to indicate a resource is to be used as a transfer so should not be tiled */
+#define R600_RESOURCE_FLAG_TRANSFER     PIPE_RESOURCE_FLAG_DRV_PRIV
+
 /* Texture transfer. */
 struct r600_transfer {
 	/* Base class. */
@@ -49,16 +52,14 @@ struct r600_resource {
 
 struct r600_resource_texture {
 	struct r600_resource		resource;
-	unsigned long			offset[PIPE_MAX_TEXTURE_LEVELS];
-	unsigned long			pitch[PIPE_MAX_TEXTURE_LEVELS];
-	unsigned long			width[PIPE_MAX_TEXTURE_LEVELS];
-	unsigned long			height[PIPE_MAX_TEXTURE_LEVELS];
-	unsigned long			layer_size[PIPE_MAX_TEXTURE_LEVELS];
-	unsigned long			pitch_override;
-	unsigned long			bpt;
-	unsigned long			size;
+	unsigned			offset[PIPE_MAX_TEXTURE_LEVELS];
+	unsigned			pitch_in_bytes[PIPE_MAX_TEXTURE_LEVELS];
+	unsigned			pitch_in_pixels[PIPE_MAX_TEXTURE_LEVELS];
+	unsigned			layer_size[PIPE_MAX_TEXTURE_LEVELS];
+	unsigned			array_mode[PIPE_MAX_TEXTURE_LEVELS];
+	unsigned			pitch_override;
+	unsigned			size;
 	unsigned			tiled;
-	unsigned			array_mode;
 	unsigned			tile_type;
 	unsigned			depth;
 	unsigned			dirty;
@@ -126,4 +127,9 @@ void* r600_texture_transfer_map(struct pipe_context *ctx,
 void r600_texture_transfer_unmap(struct pipe_context *ctx,
 				 struct pipe_transfer* transfer);
 
+struct r600_surface {
+	struct pipe_surface base;
+	unsigned aligned_height;
+};
+
 #endif
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 016e75bd52b..4106587398b 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -107,24 +107,28 @@ static void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shade
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state *rstate = &shader->rstate;
 	struct r600_shader *rshader = &shader->shader;
-	unsigned i, tmp, exports_ps, num_cout, spi_ps_in_control_0, spi_input_z;
-	boolean have_pos = FALSE, have_face = FALSE;
+	unsigned i, tmp, exports_ps, num_cout, spi_ps_in_control_0, spi_input_z, spi_ps_in_control_1;
+	int pos_index = -1, face_index = -1;
 
 	/* clear previous register */
 	rstate->nregs = 0;
 
 	for (i = 0; i < rshader->ninput; i++) {
 		tmp = S_028644_SEMANTIC(r600_find_vs_semantic_index(&rctx->vs_shader->shader, rshader, i));
-		tmp |= S_028644_SEL_CENTROID(1);
+		if (rshader->input[i].centroid)
+			tmp |= S_028644_SEL_CENTROID(1);
+		if (rshader->input[i].interpolate == TGSI_INTERPOLATE_LINEAR)
+			tmp |= S_028644_SEL_LINEAR(1);
+
 		if (rshader->input[i].name == TGSI_SEMANTIC_POSITION)
-			have_pos = TRUE;
+			pos_index = i;
 		if (rshader->input[i].name == TGSI_SEMANTIC_COLOR ||
 		    rshader->input[i].name == TGSI_SEMANTIC_BCOLOR ||
 		    rshader->input[i].name == TGSI_SEMANTIC_POSITION) {
 			tmp |= S_028644_FLAT_SHADE(rshader->flat_shade);
 		}
 		if (rshader->input[i].name == TGSI_SEMANTIC_FACE)
-			have_face = TRUE;
+			face_index = i;
 		if (rshader->input[i].name == TGSI_SEMANTIC_GENERIC &&
 			rctx->sprite_coord_enable & (1 << rshader->input[i].sid)) {
 			tmp |= S_028644_PT_SPRITE_TEX(1);
@@ -132,17 +136,22 @@ static void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shade
 		r600_pipe_state_add_reg(rstate, R_028644_SPI_PS_INPUT_CNTL_0 + i * 4, tmp, 0xFFFFFFFF, NULL);
 	}
 	for (i = 0; i < rshader->noutput; i++) {
-		if (rshader->input[i].name == TGSI_SEMANTIC_POSITION)
+		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION)
 			r600_pipe_state_add_reg(rstate,
 						R_02880C_DB_SHADER_CONTROL,
 						S_02880C_Z_EXPORT_ENABLE(1),
 						S_02880C_Z_EXPORT_ENABLE(1), NULL);
+		if (rshader->output[i].name == TGSI_SEMANTIC_STENCIL)
+			r600_pipe_state_add_reg(rstate,
+						R_02880C_DB_SHADER_CONTROL,
+						S_02880C_STENCIL_REF_EXPORT_ENABLE(1),
+						S_02880C_STENCIL_REF_EXPORT_ENABLE(1), NULL);
 	}
 
 	exports_ps = 0;
 	num_cout = 0;
 	for (i = 0; i < rshader->noutput; i++) {
-		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION)
+		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION || rshader->output[i].name == TGSI_SEMANTIC_STENCIL)
 			exports_ps |= 1;
 		else if (rshader->output[i].name == TGSI_SEMANTIC_COLOR) {
 			num_cout++;
@@ -157,13 +166,22 @@ static void r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_pipe_shade
 	spi_ps_in_control_0 = S_0286CC_NUM_INTERP(rshader->ninput) |
 				S_0286CC_PERSP_GRADIENT_ENA(1);
 	spi_input_z = 0;
-	if (have_pos) {
-		spi_ps_in_control_0 |=  S_0286CC_POSITION_ENA(1) |
-					S_0286CC_BARYC_SAMPLE_CNTL(1);
+	if (pos_index != -1) {
+		spi_ps_in_control_0 |= (S_0286CC_POSITION_ENA(1) |
+					S_0286CC_POSITION_CENTROID(rshader->input[pos_index].centroid) |
+					S_0286CC_POSITION_ADDR(rshader->input[pos_index].gpr) |
+					S_0286CC_BARYC_SAMPLE_CNTL(1));
 		spi_input_z |= 1;
 	}
+
+	spi_ps_in_control_1 = 0;
+	if (face_index != -1) {
+		spi_ps_in_control_1 |= S_0286D0_FRONT_FACE_ENA(1) |
+			S_0286D0_FRONT_FACE_ADDR(rshader->input[face_index].gpr);
+	}
+
 	r600_pipe_state_add_reg(rstate, R_0286CC_SPI_PS_IN_CONTROL_0, spi_ps_in_control_0, 0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate, R_0286D0_SPI_PS_IN_CONTROL_1, S_0286D0_FRONT_FACE_ENA(have_face), 0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate, R_0286D0_SPI_PS_IN_CONTROL_1, spi_ps_in_control_1, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_0286D8_SPI_INPUT_Z, spi_input_z, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
 				R_028840_SQ_PGM_START_PS,
@@ -252,7 +270,7 @@ static int r600_shader_update(struct pipe_context *ctx, struct r600_pipe_shader
 	}
 	rshader->vertex_elements = *rctx->vertex_elements;
 	for (i = 0; i < rctx->vertex_elements->count; i++) {
-		resource_format[nresources++] = rctx->vertex_elements->elements[i].src_format;
+		resource_format[nresources++] = rctx->vertex_elements->hw_format[i];
 	}
 	r600_bo_reference(rctx->radeon, &rshader->bo, NULL);
 	LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
@@ -320,6 +338,18 @@ int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *s
 	return 0;
 }
 
+void
+r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+
+	r600_bo_reference(rctx->radeon, &shader->bo, NULL);
+
+	r600_bc_clear(&shader->shader.bc);
+
+	/* FIXME: is there more stuff to free? */
+}
+
 /*
  * tgsi -> r600 shader
  */
@@ -339,6 +369,11 @@ struct r600_shader_ctx {
 	u32					*literals;
 	u32					nliterals;
 	u32					max_driver_temp_used;
+	/* needed for evergreen interpolation */
+	boolean                                 input_centroid;
+	boolean                                 input_linear;
+	boolean                                 input_perspective;
+	int					num_interp_gpr;
 };
 
 struct r600_shader_tgsi_instruction {
@@ -371,11 +406,9 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx)
 	}
 #endif
 	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
-		if (i->Src[j].Register.Dimension ||
-			i->Src[j].Register.Absolute) {
-			R600_ERR("unsupported src %d (dimension %d|absolute %d)\n", j,
-				 i->Src[j].Register.Dimension,
-				 i->Src[j].Register.Absolute);
+		if (i->Src[j].Register.Dimension) {
+			R600_ERR("unsupported src %d (dimension %d)\n", j,
+				 i->Src[j].Register.Dimension);
 			return -EINVAL;
 		}
 	}
@@ -388,10 +421,33 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx)
 	return 0;
 }
 
-static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int gpr)
+static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
 {
 	int i, r;
 	struct r600_bc_alu alu;
+	int gpr = 0, base_chan = 0;
+	int ij_index = 0;
+
+	if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
+		ij_index = 0;
+		if (ctx->shader->input[input].centroid)
+			ij_index++;
+	} else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
+		ij_index = 0;
+		/* if we have perspective add one */
+		if (ctx->input_perspective)  {
+			ij_index++;
+			/* if we have perspective centroid */
+			if (ctx->input_centroid)
+				ij_index++;
+		}
+		if (ctx->shader->input[input].centroid)
+			ij_index++;
+	}
+		
+	/* work out gpr and base_chan from index */
+	gpr = ij_index / 2;
+	base_chan = (2 * (ij_index % 2)) + 1;
 
 	for (i = 0; i < 8; i++) {
 		memset(&alu, 0, sizeof(struct r600_bc_alu));
@@ -402,13 +458,16 @@ static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int gpr)
 			alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY;
 
 		if ((i > 1) && (i < 6)) {
-			alu.dst.sel = ctx->shader->input[gpr].gpr;
+			alu.dst.sel = ctx->shader->input[input].gpr;
 			alu.dst.write = 1;
 		}
 
 		alu.dst.chan = i % 4;
-		alu.src[0].chan = (1 - (i % 2));
-		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + gpr;
+
+		alu.src[0].sel = gpr;
+		alu.src[0].chan = (base_chan - (i % 2));
+
+		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
 
 		alu.bank_swizzle_force = SQ_ALU_VEC_210;
 		if ((i % 4) == 3)
@@ -434,6 +493,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		ctx->shader->input[i].name = d->Semantic.Name;
 		ctx->shader->input[i].sid = d->Semantic.Index;
 		ctx->shader->input[i].interpolate = d->Declaration.Interpolate;
+		ctx->shader->input[i].centroid = d->Declaration.Centroid;
 		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + i;
 		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
 			/* turn input into fetch */
@@ -457,7 +517,12 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		}
 		if (ctx->type == TGSI_PROCESSOR_FRAGMENT && ctx->bc->chiprev == 2) {
 			/* turn input into interpolate on EG */
-			evergreen_interp_alu(ctx, i);
+			if (ctx->shader->input[i].name != TGSI_SEMANTIC_POSITION) {
+				if (ctx->shader->input[i].interpolate > 0) {
+					ctx->shader->input[i].lds_pos = ctx->shader->nlds++;
+					evergreen_interp_alu(ctx, i);
+				}
+			}
 		}
 		break;
 	case TGSI_FILE_OUTPUT:
@@ -484,6 +549,53 @@ static int r600_get_temp(struct r600_shader_ctx *ctx)
 	return ctx->temp_reg + ctx->max_driver_temp_used++;
 }
 
+/* 
+ * for evergreen we need to scan the shader to find the number of GPRs we need to
+ * reserve for interpolation.
+ *
+ * we need to know if we are going to emit
+ * any centroid inputs
+ * if perspective and linear are required
+*/
+static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
+{
+	int i;
+	int num_baryc;
+
+	ctx->input_linear = FALSE;
+	ctx->input_perspective = FALSE;
+	ctx->input_centroid = FALSE;
+	ctx->num_interp_gpr = 1;
+
+	/* any centroid inputs */
+	for (i = 0; i < ctx->info.num_inputs; i++) {
+		/* skip position/face */
+		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
+		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
+			continue;
+		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
+			ctx->input_linear = TRUE;
+		if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
+			ctx->input_perspective = TRUE;
+		if (ctx->info.input_centroid[i])
+			ctx->input_centroid = TRUE;
+	}
+
+	num_baryc = 0;
+	/* ignoring sample for now */
+	if (ctx->input_perspective)
+		num_baryc++;
+	if (ctx->input_linear)
+		num_baryc++;
+	if (ctx->input_centroid)
+		num_baryc *= 2;
+
+	ctx->num_interp_gpr += (num_baryc + 1) >> 1;
+
+	/* TODO PULL MODEL and LINE STIPPLE, FIXED PT POS */
+	return ctx->num_interp_gpr;
+}
+
 int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *shader)
 {
 	struct tgsi_full_immediate *immediate;
@@ -529,6 +641,9 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
 	}
+	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chiprev == 2) {
+		ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
+	}
 	ctx.file_offset[TGSI_FILE_OUTPUT] = ctx.file_offset[TGSI_FILE_INPUT] +
 						ctx.info.file_count[TGSI_FILE_INPUT];
 	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
@@ -625,7 +740,14 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
 				output[i].array_base = 61;
 				output[i].swizzle_x = 2;
-				output[i].swizzle_y = output[i].swizzle_z = output[i].swizzle_w = 7;
+				output[i].swizzle_y = 7;
+				output[i].swizzle_z = output[i].swizzle_w = 7;
+				output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+			} else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
+				output[i].array_base = 61;
+				output[i].swizzle_x = 7;
+				output[i].swizzle_y = 1;
+				output[i].swizzle_z = output[i].swizzle_w = 7;
 				output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 			} else {
 				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
@@ -731,6 +853,7 @@ static int tgsi_src(struct r600_shader_ctx *ctx,
 	if (tgsi_src->Register.Indirect)
 		r600_src->rel = V_SQ_REL_RELATIVE;
 	r600_src->neg = tgsi_src->Register.Negate;
+	r600_src->abs = tgsi_src->Register.Absolute;
 	r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
 	return 0;
 }
@@ -793,6 +916,7 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx, struct r600_bc_alu_s
 				alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
 				alu.src[0].sel = r600_src[i].sel;
 				alu.src[0].chan = k;
+				alu.src[0].rel = r600_src[i].rel;
 				alu.dst.sel = treg;
 				alu.dst.chan = k;
 				alu.dst.write = 1;
@@ -803,6 +927,7 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx, struct r600_bc_alu_s
 					return r;
 			}
 			r600_src[i].sel = treg;
+			r600_src[i].rel =0;
 			j--;
 		}
 	}
@@ -1843,7 +1968,7 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
 			memset(&alu, 0, sizeof(struct r600_bc_alu));
 			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV);
 			alu.src[0].sel = src_gpr;
-			alu.src[0].chan = i;
+			alu.src[0].chan = tgsi_chan(&inst->Src[0], i);
 			alu.dst.sel = ctx->temp_reg;
 			alu.dst.chan = i;
 			if (i == 3)
@@ -1863,8 +1988,10 @@ static int tgsi_tex(struct r600_shader_ctx *ctx)
 
 	memset(&tex, 0, sizeof(struct r600_bc_tex));
 	tex.inst = opcode;
-	tex.resource_id = ctx->file_offset[inst->Src[1].Register.File] + inst->Src[1].Register.Index;
-	tex.sampler_id = tex.resource_id;
+	tex.sampler_id = ctx->file_offset[inst->Src[1].Register.File] + inst->Src[1].Register.Index;
+	tex.resource_id = tex.sampler_id;
+	if (ctx->shader->processor_type == TGSI_PROCESSOR_VERTEX)
+		tex.resource_id += PIPE_MAX_ATTRIBS;
 	tex.src_gpr = src_gpr;
 	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
 	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
@@ -2172,7 +2299,7 @@ static int tgsi_xpd(struct r600_shader_ctx *ctx)
 static int tgsi_exp(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
-	struct r600_bc_alu_src r600_src[3];
+	struct r600_bc_alu_src r600_src[3] = { { 0 } };
 	struct r600_bc_alu alu;
 	int r;
 
@@ -2495,7 +2622,40 @@ static int tgsi_log(struct r600_shader_ctx *ctx)
 }
 
 /* r6/7 only for now */
-static int tgsi_arl(struct r600_shader_ctx *ctx)
+static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
+{
+	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	struct r600_bc_alu alu;
+	int r;
+	
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+
+	alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR;
+	r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
+	if (r)
+		return r;
+	alu.src[0].chan = tgsi_chan(&inst->Src[0], 0);
+	alu.last = 1;
+	alu.dst.chan = 0;
+	alu.dst.sel = ctx->temp_reg;
+	alu.dst.write = 1;
+	r = r600_bc_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
+	if (r)
+		return r;
+	memset(&alu, 0, sizeof(struct r600_bc_alu));
+	alu.inst = EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT;
+	r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]);
+	if (r)
+		return r;
+	alu.src[0].sel = ctx->temp_reg;
+	alu.src[0].chan = 0;
+	alu.last = 1;
+	r = r600_bc_add_alu_type(ctx->bc, &alu, CTX_INST(V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
+	if (r)
+		return r;
+	return 0;
+}
+static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
 {
 	/* TODO from r600c, ar values don't persist between clauses */
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
@@ -2840,7 +3000,7 @@ static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
 }
 
 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
-	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_arl},
+	{TGSI_OPCODE_ARL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_r600_arl},
 	{TGSI_OPCODE_MOV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
 	{TGSI_OPCODE_LIT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
 
@@ -2921,7 +3081,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_NRM,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DIV,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DP2,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
-	{TGSI_OPCODE_TXL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
 	{TGSI_OPCODE_BRK,	0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
 	{TGSI_OPCODE_IF,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
 	/* gap */
@@ -3004,7 +3164,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 };
 
 static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
-	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_ARL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_eg_arl},
 	{TGSI_OPCODE_MOV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2},
 	{TGSI_OPCODE_LIT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit},
 	{TGSI_OPCODE_RCP,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate},
@@ -3079,7 +3239,7 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_NRM,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DIV,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DP2,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp},
-	{TGSI_OPCODE_TXL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_TXL,	0, SQ_TEX_INST_SAMPLE_L, tgsi_tex},
 	{TGSI_OPCODE_BRK,	0, EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont},
 	{TGSI_OPCODE_IF,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if},
 	/* gap */
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 6e2620f2012..f8bc5951395 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -31,6 +31,8 @@ struct r600_shader_io {
 	unsigned		done;
 	int			sid;
 	unsigned		interpolate;
+	boolean                 centroid;
+	unsigned		lds_pos; /* for evergreen */
 };
 
 struct r600_shader {
@@ -39,6 +41,7 @@ struct r600_shader {
 	boolean			flat_shade;
 	unsigned		ninput;
 	unsigned		noutput;
+	unsigned		nlds;
 	struct r600_shader_io	input[32];
 	struct r600_shader_io	output[32];
 	enum radeon_family	family;
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index b55c3450059..b3e0d493217 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -37,7 +37,7 @@
 #include <util/u_memory.h>
 #include <util/u_inlines.h>
 #include <util/u_upload_mgr.h>
-#include <util/u_index_modify.h>
+#include <util/u_framebuffer.h>
 #include <pipebuffer/pb_buffer.h>
 #include "r600.h"
 #include "r600d.h"
@@ -98,7 +98,7 @@ static void r600_draw_common(struct r600_drawl *draw)
 			vertex_buffer->buffer_offset +
 			r600_bo_offset(rbuffer->bo);
 
-		format = r600_translate_vertex_data_type(rctx->vertex_elements->elements[i].src_format);
+		format = r600_translate_vertex_data_type(rctx->vertex_elements->hw_format[i]);
 
 		word2 = format | S_038008_STRIDE(vertex_buffer->stride);
 
@@ -181,40 +181,21 @@ static void r600_draw_common(struct r600_drawl *draw)
 	r600_context_draw(&rctx->ctx, &rdraw);
 }
 
-void r600_translate_index_buffer(struct r600_pipe_context *r600,
-					struct pipe_resource **index_buffer,
-					unsigned *index_size,
-					unsigned *start, unsigned count)
-{
-	switch (*index_size) {
-	case 1:
-		util_shorten_ubyte_elts(&r600->context, index_buffer, 0, *start, count);
-		*index_size = 2;
-		*start = 0;
-		break;
-
-	case 2:
-		if (*start % 2 != 0) {
-			util_rebuild_ushort_elts(&r600->context, index_buffer, 0, *start, count);
-			*start = 0;
-		}
-		break;
-
-	case 4:
-		break;
-	}
-}
-
 void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_drawl draw;
+	boolean translate = FALSE;
+
+	if (rctx->vertex_elements->incompatible_layout) {
+		r600_begin_vertex_translate(rctx);
+		translate = TRUE;
+	}
 
 	if (rctx->any_user_vbs) {
 		r600_upload_user_buffers(rctx);
 		rctx->any_user_vbs = FALSE;
 	}
-
 	memset(&draw, 0, sizeof(struct r600_drawl));
 	draw.ctx = ctx;
 	draw.mode = info->mode;
@@ -245,6 +226,9 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	}
 	r600_draw_common(&draw);
 
+	if (translate)
+		r600_end_vertex_translate(rctx);
+
 	pipe_resource_reference(&draw.index_buffer, NULL);
 }
 
@@ -340,20 +324,6 @@ static void *r600_create_blend_state(struct pipe_context *ctx,
 	return rstate;
 }
 
-static void r600_bind_blend_state(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_blend *blend = (struct r600_pipe_blend *)state;
-	struct r600_pipe_state *rstate;
-
-	if (state == NULL)
-		return;
-	rstate = &blend->rstate;
-	rctx->states[rstate->id] = rstate;
-	rctx->cb_target_mask = blend->cb_target_mask;
-	r600_context_pipe_state_set(&rctx->ctx, rstate);
-}
-
 static void *r600_create_dsa_state(struct pipe_context *ctx,
 				   const struct pipe_depth_stencil_alpha_state *state)
 {
@@ -446,6 +416,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	struct r600_pipe_state *rstate;
 	unsigned tmp;
 	unsigned prov_vtx = 1, polygon_dual_mode;
+	unsigned clip_rule;
 
 	if (rs == NULL) {
 		return NULL;
@@ -455,6 +426,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
 
+	clip_rule = state->scissor ? 0xAAAA : 0xFFFF;
 	/* offset */
 	rs->offset_units = state->offset_units;
 	rs->offset_scale = state->offset_scale * 12.0f;
@@ -462,7 +434,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	rstate->id = R600_PIPE_STATE_RASTERIZER;
 	if (state->flatshade_first)
 		prov_vtx = 0;
-	tmp = 0x00000001;
+	tmp = S_0286D4_FLAT_SHADE_ENA(1);
 	if (state->sprite_coord_enable) {
 		tmp |= S_0286D4_PNT_SPRITE_ENA(1) |
 			S_0286D4_PNT_SPRITE_OVRD_X(2) |
@@ -496,7 +468,10 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	tmp = (unsigned)(state->point_size * 8.0);
 	r600_pipe_state_add_reg(rstate, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp), 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_028A04_PA_SU_POINT_MINMAX, 0x80000000, 0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate, R_028A08_PA_SU_LINE_CNTL, 0x00000008, 0xFFFFFFFF, NULL);
+
+	tmp = (unsigned)(state->line_width * 8.0);
+	r600_pipe_state_add_reg(rstate, R_028A08_PA_SU_LINE_CNTL, S_028A08_WIDTH(tmp), 0xFFFFFFFF, NULL);
+
 	r600_pipe_state_add_reg(rstate, R_028A0C_PA_SC_LINE_STIPPLE, 0x00000005, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_028A48_PA_SC_MPASS_PS_CNTL, 0x00000000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_028C00_PA_SC_LINE_CNTL, 0x00000400, 0xFFFFFFFF, NULL);
@@ -505,37 +480,9 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate, R_028C14_PA_CL_GB_HORZ_CLIP_ADJ, 0x3F800000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_028C18_PA_CL_GB_HORZ_DISC_ADJ, 0x3F800000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_028DFC_PA_SU_POLY_OFFSET_CLAMP, 0x00000000, 0xFFFFFFFF, NULL);
-	return rstate;
-}
-
-static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer *)state;
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	if (state == NULL)
-		return;
-
-	rctx->flatshade = rs->flatshade;
-	rctx->sprite_coord_enable = rs->sprite_coord_enable;
-	rctx->rasterizer = rs;
+	r600_pipe_state_add_reg(rstate, R_02820C_PA_SC_CLIPRECT_RULE, clip_rule, 0xFFFFFFFF, NULL);
 
-	rctx->states[rs->rstate.id] = &rs->rstate;
-	r600_context_pipe_state_set(&rctx->ctx, &rs->rstate);
-}
-
-static void r600_delete_rs_state(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer *)state;
-
-	if (rctx->rasterizer == rs) {
-		rctx->rasterizer = NULL;
-	}
-	if (rctx->states[rs->rstate.id] == &rs->rstate) {
-		rctx->states[rs->rstate.id] = NULL;
-	}
-	free(rs);
+	return rstate;
 }
 
 static void *r600_create_sampler_state(struct pipe_context *ctx,
@@ -574,28 +521,6 @@ static void *r600_create_sampler_state(struct pipe_context *ctx,
 	return rstate;
 }
 
-static void *r600_create_vertex_elements(struct pipe_context *ctx,
-				unsigned count,
-				const struct pipe_vertex_element *elements)
-{
-	struct r600_vertex_element *v = CALLOC_STRUCT(r600_vertex_element);
-
-	assert(count < 32);
-	v->count = count;
-	v->refcount = 1;
-	memcpy(v->elements, elements, count * sizeof(struct pipe_vertex_element));
-	return v;
-}
-
-static void r600_sampler_view_destroy(struct pipe_context *ctx,
-				      struct pipe_sampler_view *state)
-{
-	struct r600_pipe_sampler_view *resource = (struct r600_pipe_sampler_view *)state;
-
-	pipe_resource_reference(&state->texture, NULL);
-	FREE(resource);
-}
-
 static struct pipe_sampler_view *r600_create_sampler_view(struct pipe_context *ctx,
 							struct pipe_resource *texture,
 							const struct pipe_sampler_view *state)
@@ -626,15 +551,15 @@ static struct pipe_sampler_view *r600_create_sampler_view(struct pipe_context *c
 	swizzle[1] = state->swizzle_g;
 	swizzle[2] = state->swizzle_b;
 	swizzle[3] = state->swizzle_a;
-	format = r600_translate_texformat(texture->format,
+	format = r600_translate_texformat(state->format,
 					  swizzle,
 					  &word4, &yuv_format);
 	if (format == ~0) {
 		format = 0;
 	}
-	desc = util_format_description(texture->format);
+	desc = util_format_description(state->format);
 	if (desc == NULL) {
-		R600_ERR("unknow format %d\n", texture->format);
+		R600_ERR("unknow format %d\n", state->format);
 	}
 	tmp = (struct r600_resource_texture*)texture;
 	rbuffer = &tmp->resource;
@@ -648,7 +573,11 @@ static struct pipe_sampler_view *r600_create_sampler_view(struct pipe_context *c
 		bo[0] = rbuffer->bo;
 		bo[1] = rbuffer->bo;
 	}
-	pitch = align(tmp->pitch[0] / tmp->bpt, 8);
+	pitch = align(tmp->pitch_in_pixels[0], 8);
+	if (tmp->tiled) {
+		array_mode = tmp->array_mode[0];
+		tile_type = tmp->tile_type;
+	}
 
 	/* FIXME properly handle first level != 0 */
 	r600_pipe_state_add_reg(rstate, R_038000_RESOURCE0_WORD0,
@@ -683,32 +612,43 @@ static struct pipe_sampler_view *r600_create_sampler_view(struct pipe_context *c
 static void r600_set_vs_sampler_view(struct pipe_context *ctx, unsigned count,
 					struct pipe_sampler_view **views)
 {
-	/* TODO */
-	assert(1);
-}
-
-static void r600_set_ps_sampler_view(struct pipe_context *ctx, unsigned count,
-					struct pipe_sampler_view **views)
-{
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_sampler_view **resource = (struct r600_pipe_sampler_view **)views;
 
 	for (int i = 0; i < count; i++) {
 		if (resource[i]) {
-			r600_context_pipe_state_set_ps_resource(&rctx->ctx, &resource[i]->state, i);
+			r600_context_pipe_state_set_vs_resource(&rctx->ctx, &resource[i]->state, i + PIPE_MAX_ATTRIBS);
 		}
 	}
 }
 
-static void r600_bind_state(struct pipe_context *ctx, void *state)
+static void r600_set_ps_sampler_view(struct pipe_context *ctx, unsigned count,
+					struct pipe_sampler_view **views)
 {
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_state *rstate = (struct r600_pipe_state *)state;
+	struct r600_pipe_sampler_view **resource = (struct r600_pipe_sampler_view **)views;
+	int i;
 
-	if (state == NULL)
-		return;
-	rctx->states[rstate->id] = rstate;
-	r600_context_pipe_state_set(&rctx->ctx, rstate);
+	for (i = 0; i < count; i++) {
+		if (&rctx->ps_samplers.views[i]->base != views[i]) {
+			if (resource[i])
+				r600_context_pipe_state_set_ps_resource(&rctx->ctx, &resource[i]->state, i);
+			else
+				r600_context_pipe_state_set_ps_resource(&rctx->ctx, NULL, i);
+
+			pipe_sampler_view_reference(
+				(struct pipe_sampler_view **)&rctx->ps_samplers.views[i],
+				views[i]);
+
+		}
+	}
+	for (i = count; i < NUM_TEX_UNITS; i++) {
+		if (rctx->ps_samplers.views[i]) {
+			r600_context_pipe_state_set_ps_resource(&rctx->ctx, NULL, i);
+			pipe_sampler_view_reference((struct pipe_sampler_view **)&rctx->ps_samplers.views[i], NULL);
+		}
+	}
+	rctx->ps_samplers.n_views = count;
 }
 
 static void r600_bind_ps_sampler(struct pipe_context *ctx, unsigned count, void **states)
@@ -716,6 +656,9 @@ static void r600_bind_ps_sampler(struct pipe_context *ctx, unsigned count, void
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state **rstates = (struct r600_pipe_state **)states;
 
+	memcpy(rctx->ps_samplers.samplers, states, sizeof(void*) * count);
+	rctx->ps_samplers.n_samplers = count;
+
 	for (int i = 0; i < count; i++) {
 		r600_context_pipe_state_set_ps_sampler(&rctx->ctx, rstates[i], i);
 	}
@@ -726,37 +669,11 @@ static void r600_bind_vs_sampler(struct pipe_context *ctx, unsigned count, void
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	struct r600_pipe_state **rstates = (struct r600_pipe_state **)states;
 
-	/* TODO implement */
 	for (int i = 0; i < count; i++) {
 		r600_context_pipe_state_set_vs_sampler(&rctx->ctx, rstates[i], i);
 	}
 }
 
-static void r600_delete_state(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_state *rstate = (struct r600_pipe_state *)state;
-
-	if (rctx->states[rstate->id] == rstate) {
-		rctx->states[rstate->id] = NULL;
-	}
-	for (int i = 0; i < rstate->nregs; i++) {
-		r600_bo_reference(rctx->radeon, &rstate->regs[i].bo, NULL);
-	}
-	free(rstate);
-}
-
-static void r600_delete_vertex_element(struct pipe_context *ctx, void *state)
-{
-	struct r600_vertex_element *v = (struct r600_vertex_element*)state;
-
-	if (v == NULL)
-		return;
-	if (--v->refcount)
-		return;
-	free(v);
-}
-
 static void r600_set_clip_state(struct pipe_context *ctx,
 				const struct pipe_clip_state *state)
 {
@@ -792,19 +709,6 @@ static void r600_set_clip_state(struct pipe_context *ctx,
 	r600_context_pipe_state_set(&rctx->ctx, rstate);
 }
 
-static void r600_bind_vertex_elements(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_vertex_element *v = (struct r600_vertex_element*)state;
-
-	r600_delete_vertex_element(ctx, rctx->vertex_elements);
-	rctx->vertex_elements = v;
-	if (v) {
-		v->refcount++;
-//		rctx->vs_rebuild = TRUE;
-	}
-}
-
 static void r600_set_polygon_stipple(struct pipe_context *ctx,
 					 const struct pipe_poly_stipple *state)
 {
@@ -828,18 +732,6 @@ static void r600_set_scissor_state(struct pipe_context *ctx,
 	tl = S_028240_TL_X(state->minx) | S_028240_TL_Y(state->miny) | S_028240_WINDOW_OFFSET_DISABLE(1);
 	br = S_028244_BR_X(state->maxx) | S_028244_BR_Y(state->maxy);
 	r600_pipe_state_add_reg(rstate,
-				R_028030_PA_SC_SCREEN_SCISSOR_TL, tl,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_028034_PA_SC_SCREEN_SCISSOR_BR, br,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_028204_PA_SC_WINDOW_SCISSOR_TL, tl,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_028208_PA_SC_WINDOW_SCISSOR_BR, br,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
 				R_028210_PA_SC_CLIPRECT_0_TL, tl,
 				0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
@@ -863,17 +755,6 @@ static void r600_set_scissor_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate,
 				R_02822C_PA_SC_CLIPRECT_3_BR, br,
 				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_028200_PA_SC_WINDOW_OFFSET, 0x00000000,
-				0xFFFFFFFF, NULL);
-	r600_pipe_state_add_reg(rstate,
-				R_02820C_PA_SC_CLIPRECT_RULE, 0x0000FFFF,
-				0xFFFFFFFF, NULL);
-	if (rctx->family >= CHIP_RV770) {
-		r600_pipe_state_add_reg(rstate,
-					R_028230_PA_SC_EDGERULE, 0xAAAAAAAA,
-					0xFFFFFFFF, NULL);
-	}
 
 	free(rctx->states[R600_PIPE_STATE_SCISSOR]);
 	rctx->states[R600_PIPE_STATE_SCISSOR] = rstate;
@@ -937,6 +818,7 @@ static void r600_cb(struct r600_pipe_context *rctx, struct r600_pipe_state *rsta
 {
 	struct r600_resource_texture *rtex;
 	struct r600_resource *rbuffer;
+	struct r600_surface *surf;
 	unsigned level = state->cbufs[cb]->level;
 	unsigned pitch, slice;
 	unsigned color_info;
@@ -944,14 +826,15 @@ static void r600_cb(struct r600_pipe_context *rctx, struct r600_pipe_state *rsta
 	const struct util_format_description *desc;
 	struct r600_bo *bo[3];
 
+	surf = (struct r600_surface *)state->cbufs[cb];
 	rtex = (struct r600_resource_texture*)state->cbufs[cb]->texture;
 	rbuffer = &rtex->resource;
 	bo[0] = rbuffer->bo;
 	bo[1] = rbuffer->bo;
 	bo[2] = rbuffer->bo;
 
-	pitch = (rtex->pitch[level] / rtex->bpt) / 8 - 1;
-	slice = (rtex->pitch[level] / rtex->bpt) * state->cbufs[cb]->height / 64 - 1;
+	pitch = rtex->pitch_in_pixels[level] / 8 - 1;
+	slice = rtex->pitch_in_pixels[level] * surf->aligned_height / 64 - 1;
 	ntype = 0;
 	desc = util_format_description(rtex->resource.base.b.format);
 	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
@@ -961,6 +844,7 @@ static void r600_cb(struct r600_pipe_context *rctx, struct r600_pipe_state *rsta
 	swap = r600_translate_colorswap(rtex->resource.base.b.format);
 	color_info = S_0280A0_FORMAT(format) |
 		S_0280A0_COMP_SWAP(swap) |
+		S_0280A0_ARRAY_MODE(rtex->array_mode[level]) |
 		S_0280A0_BLEND_CLAMP(1) |
 		S_0280A0_NUMBER_TYPE(ntype);
 	if (desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) 
@@ -996,22 +880,25 @@ static void r600_db(struct r600_pipe_context *rctx, struct r600_pipe_state *rsta
 {
 	struct r600_resource_texture *rtex;
 	struct r600_resource *rbuffer;
+	struct r600_surface *surf;
 	unsigned level;
 	unsigned pitch, slice, format;
 
 	if (state->zsbuf == NULL)
 		return;
 
+	level = state->zsbuf->level;
+
+	surf = (struct r600_surface *)state->zsbuf;
 	rtex = (struct r600_resource_texture*)state->zsbuf->texture;
 	rtex->tiled = 1;
-	rtex->array_mode = 2;
+	rtex->array_mode[level] = 2;
 	rtex->tile_type = 1;
 	rtex->depth = 1;
 	rbuffer = &rtex->resource;
 
-	level = state->zsbuf->level;
-	pitch = (rtex->pitch[level] / rtex->bpt) / 8 - 1;
-	slice = (rtex->pitch[level] / rtex->bpt) * state->zsbuf->height / 64 - 1;
+	pitch = rtex->pitch_in_pixels[level] / 8 - 1;
+	slice = rtex->pitch_in_pixels[level] * surf->aligned_height / 64 - 1;
 	format = r600_translate_dbformat(state->zsbuf->texture->format);
 
 	r600_pipe_state_add_reg(rstate, R_02800C_DB_DEPTH_BASE,
@@ -1021,10 +908,10 @@ static void r600_db(struct r600_pipe_context *rctx, struct r600_pipe_state *rsta
 				0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_028004_DB_DEPTH_VIEW, 0x00000000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_028010_DB_DEPTH_INFO,
-				S_028010_ARRAY_MODE(rtex->array_mode) | S_028010_FORMAT(format),
+				S_028010_ARRAY_MODE(rtex->array_mode[level]) | S_028010_FORMAT(format),
 				0xFFFFFFFF, rbuffer->bo);
 	r600_pipe_state_add_reg(rstate, R_028D34_DB_PREFETCH_LIMIT,
-				(state->zsbuf->height / 8) - 1, 0xFFFFFFFF, NULL);
+				(surf->aligned_height / 8) - 1, 0xFFFFFFFF, NULL);
 }
 
 static void r600_set_framebuffer_state(struct pipe_context *ctx,
@@ -1039,14 +926,9 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 
 	/* unreference old buffer and reference new one */
 	rstate->id = R600_PIPE_STATE_FRAMEBUFFER;
-	for (int i = 0; i < rctx->framebuffer.nr_cbufs; i++) {
-		pipe_surface_reference(&rctx->framebuffer.cbufs[i], NULL);
-	}
-	for (int i = 0; i < state->nr_cbufs; i++) {
-		pipe_surface_reference(&rctx->framebuffer.cbufs[i], state->cbufs[i]);
-	}
-	pipe_surface_reference(&rctx->framebuffer.zsbuf, state->zsbuf);
-	rctx->framebuffer = *state;
+
+	util_copy_framebuffer_state(&rctx->framebuffer, state);
+	
 	rctx->pframebuffer = &rctx->framebuffer;
 
 	/* build states */
@@ -1070,6 +952,18 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 	br = S_028244_BR_X(state->width) | S_028244_BR_Y(state->height);
 
 	r600_pipe_state_add_reg(rstate,
+				R_028030_PA_SC_SCREEN_SCISSOR_TL, tl,
+				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028034_PA_SC_SCREEN_SCISSOR_BR, br,
+				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028204_PA_SC_WINDOW_SCISSOR_TL, tl,
+				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028208_PA_SC_WINDOW_SCISSOR_BR, br,
+				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
 				R_028240_PA_SC_GENERIC_SCISSOR_TL, tl,
 				0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate,
@@ -1081,6 +975,14 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 	r600_pipe_state_add_reg(rstate,
 				R_028254_PA_SC_VPORT_SCISSOR_0_BR, br,
 				0xFFFFFFFF, NULL);
+	r600_pipe_state_add_reg(rstate,
+				R_028200_PA_SC_WINDOW_OFFSET, 0x00000000,
+				0xFFFFFFFF, NULL);
+	if (rctx->family >= CHIP_RV770) {
+		r600_pipe_state_add_reg(rstate,
+					R_028230_PA_SC_EDGERULE, 0xAAAAAAAA,
+					0xFFFFFFFF, NULL);
+	}
 
 	r600_pipe_state_add_reg(rstate, R_0287A0_CB_SHADER_CONTROL,
 				shader_control, 0xFFFFFFFF, NULL);
@@ -1110,40 +1012,6 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx,
 	r600_context_pipe_state_set(&rctx->ctx, rstate);
 }
 
-static void r600_set_index_buffer(struct pipe_context *ctx,
-				  const struct pipe_index_buffer *ib)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	if (ib) {
-		pipe_resource_reference(&rctx->index_buffer.buffer, ib->buffer);
-		memcpy(&rctx->index_buffer, ib, sizeof(rctx->index_buffer));
-	} else {
-		pipe_resource_reference(&rctx->index_buffer.buffer, NULL);
-		memset(&rctx->index_buffer, 0, sizeof(rctx->index_buffer));
-	}
-
-	/* TODO make this more like a state */
-}
-
-static void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count,
-					const struct pipe_vertex_buffer *buffers)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	for (int i = 0; i < rctx->nvertex_buffer; i++) {
-		pipe_resource_reference(&rctx->vertex_buffer[i].buffer, NULL);
-	}
-	memcpy(rctx->vertex_buffer, buffers, sizeof(struct pipe_vertex_buffer) * count);
-	for (int i = 0; i < count; i++) {
-		rctx->vertex_buffer[i].buffer = NULL;
-		if (r600_buffer_is_user_buffer(buffers[i].buffer))
-			rctx->any_user_vbs = TRUE;
-		pipe_resource_reference(&rctx->vertex_buffer[i].buffer, buffers[i].buffer);
-	}
-	rctx->nvertex_buffer = count;
-}
-
 static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
 					struct pipe_resource *buffer)
 {
@@ -1179,59 +1047,6 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint
 	}
 }
 
-static void *r600_create_shader_state(struct pipe_context *ctx,
-					const struct pipe_shader_state *state)
-{
-	struct r600_pipe_shader *shader =  CALLOC_STRUCT(r600_pipe_shader);
-	int r;
-
-	r =  r600_pipe_shader_create(ctx, shader, state->tokens);
-	if (r) {
-		return NULL;
-	}
-	return shader;
-}
-
-static void r600_bind_ps_shader(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	/* TODO delete old shader */
-	rctx->ps_shader = (struct r600_pipe_shader *)state;
-}
-
-static void r600_bind_vs_shader(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-
-	/* TODO delete old shader */
-	rctx->vs_shader = (struct r600_pipe_shader *)state;
-}
-
-static void r600_delete_ps_shader(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_shader *shader = (struct r600_pipe_shader *)state;
-
-	if (rctx->ps_shader == shader) {
-		rctx->ps_shader = NULL;
-	}
-	/* TODO proper delete */
-	free(shader);
-}
-
-static void r600_delete_vs_shader(struct pipe_context *ctx, void *state)
-{
-	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
-	struct r600_pipe_shader *shader = (struct r600_pipe_shader *)state;
-
-	if (rctx->vs_shader == shader) {
-		rctx->vs_shader = NULL;
-	}
-	/* TODO proper delete */
-	free(shader);
-}
-
 void r600_init_state_functions(struct r600_pipe_context *rctx)
 {
 	rctx->context.create_blend_state = r600_create_blend_state;
@@ -1480,14 +1295,14 @@ void r600_init_config(struct r600_pipe_context *rctx)
 		r600_pipe_state_add_reg(rstate, R_009830_DB_DEBUG, 0x00000000, 0xFFFFFFFF, NULL);
 		r600_pipe_state_add_reg(rstate, R_009838_DB_WATERMARKS, 0x00420204, 0xFFFFFFFF, NULL);
 		r600_pipe_state_add_reg(rstate, R_0286C8_SPI_THREAD_GROUPING, 0x00000000, 0xFFFFFFFF, NULL);
-		r600_pipe_state_add_reg(rstate, R_028A4C_PA_SC_MODE_CNTL, 0x00514000, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_028A4C_PA_SC_MODE_CNTL, 0x00514002, 0xFFFFFFFF, NULL);
 	} else {
 		r600_pipe_state_add_reg(rstate, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, 0x00000000, 0xFFFFFFFF, NULL);
 		r600_pipe_state_add_reg(rstate, R_009508_TA_CNTL_AUX, 0x07000003, 0xFFFFFFFF, NULL);
 		r600_pipe_state_add_reg(rstate, R_009830_DB_DEBUG, 0x82000000, 0xFFFFFFFF, NULL);
 		r600_pipe_state_add_reg(rstate, R_009838_DB_WATERMARKS, 0x01020204, 0xFFFFFFFF, NULL);
 		r600_pipe_state_add_reg(rstate, R_0286C8_SPI_THREAD_GROUPING, 0x00000001, 0xFFFFFFFF, NULL);
-		r600_pipe_state_add_reg(rstate, R_028A4C_PA_SC_MODE_CNTL, 0x00004010, 0xFFFFFFFF, NULL);
+		r600_pipe_state_add_reg(rstate, R_028A4C_PA_SC_MODE_CNTL, 0x00004012, 0xFFFFFFFF, NULL);
 	}
 	r600_pipe_state_add_reg(rstate, R_0288A8_SQ_ESGS_RING_ITEMSIZE, 0x00000000, 0xFFFFFFFF, NULL);
 	r600_pipe_state_add_reg(rstate, R_0288AC_SQ_GSVS_RING_ITEMSIZE, 0x00000000, 0xFFFFFFFF, NULL);
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
new file mode 100644
index 00000000000..210420e823b
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2010 Red Hat Inc.
+ *           2010 Jerome Glisse
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Dave Airlie <[email protected]>
+ *          Jerome Glisse <[email protected]>
+ */
+#include <util/u_memory.h>
+#include <util/u_format.h>
+#include <pipebuffer/pb_buffer.h>
+#include "r600_pipe.h"
+
+/* common state between evergreen and r600 */
+void r600_bind_blend_state(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_pipe_blend *blend = (struct r600_pipe_blend *)state;
+	struct r600_pipe_state *rstate;
+
+	if (state == NULL)
+		return;
+	rstate = &blend->rstate;
+	rctx->states[rstate->id] = rstate;
+	rctx->cb_target_mask = blend->cb_target_mask;
+	r600_context_pipe_state_set(&rctx->ctx, rstate);
+}
+
+void r600_bind_rs_state(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer *)state;
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+
+	if (state == NULL)
+		return;
+
+	rctx->flatshade = rs->flatshade;
+	rctx->sprite_coord_enable = rs->sprite_coord_enable;
+	rctx->rasterizer = rs;
+
+	rctx->states[rs->rstate.id] = &rs->rstate;
+	r600_context_pipe_state_set(&rctx->ctx, &rs->rstate);
+}
+
+void r600_delete_rs_state(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer *)state;
+
+	if (rctx->rasterizer == rs) {
+		rctx->rasterizer = NULL;
+	}
+	if (rctx->states[rs->rstate.id] == &rs->rstate) {
+		rctx->states[rs->rstate.id] = NULL;
+	}
+	free(rs);
+}
+
+void r600_sampler_view_destroy(struct pipe_context *ctx,
+			       struct pipe_sampler_view *state)
+{
+	struct r600_pipe_sampler_view *resource = (struct r600_pipe_sampler_view *)state;
+
+	pipe_resource_reference(&state->texture, NULL);
+	FREE(resource);
+}
+
+void r600_bind_state(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_pipe_state *rstate = (struct r600_pipe_state *)state;
+
+	if (state == NULL)
+		return;
+	rctx->states[rstate->id] = rstate;
+	r600_context_pipe_state_set(&rctx->ctx, rstate);
+}
+
+void r600_delete_state(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_pipe_state *rstate = (struct r600_pipe_state *)state;
+
+	if (rctx->states[rstate->id] == rstate) {
+		rctx->states[rstate->id] = NULL;
+	}
+	for (int i = 0; i < rstate->nregs; i++) {
+		r600_bo_reference(rctx->radeon, &rstate->regs[i].bo, NULL);
+	}
+	free(rstate);
+}
+
+void r600_bind_vertex_elements(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_vertex_element *v = (struct r600_vertex_element*)state;
+
+	rctx->vertex_elements = v;
+	if (v) {
+//		rctx->vs_rebuild = TRUE;
+	}
+}
+
+void r600_delete_vertex_element(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+
+	FREE(state);
+
+	if (rctx->vertex_elements == state)
+		rctx->vertex_elements = NULL;
+}
+
+
+void r600_set_index_buffer(struct pipe_context *ctx,
+			   const struct pipe_index_buffer *ib)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+
+	if (ib) {
+		pipe_resource_reference(&rctx->index_buffer.buffer, ib->buffer);
+		memcpy(&rctx->index_buffer, ib, sizeof(rctx->index_buffer));
+	} else {
+		pipe_resource_reference(&rctx->index_buffer.buffer, NULL);
+		memset(&rctx->index_buffer, 0, sizeof(rctx->index_buffer));
+	}
+
+	/* TODO make this more like a state */
+}
+
+void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count,
+			     const struct pipe_vertex_buffer *buffers)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct pipe_vertex_buffer *vbo;
+	unsigned max_index = (unsigned)-1;
+
+	for (int i = 0; i < rctx->nvertex_buffer; i++) {
+		pipe_resource_reference(&rctx->vertex_buffer[i].buffer, NULL);
+	}
+	memcpy(rctx->vertex_buffer, buffers, sizeof(struct pipe_vertex_buffer) * count);
+
+	for (int i = 0; i < count; i++) {
+		vbo = (struct pipe_vertex_buffer*)&buffers[i];
+
+		rctx->vertex_buffer[i].buffer = NULL;
+		if (r600_buffer_is_user_buffer(buffers[i].buffer))
+			rctx->any_user_vbs = TRUE;
+		pipe_resource_reference(&rctx->vertex_buffer[i].buffer, buffers[i].buffer);
+
+		if (vbo->max_index == ~0) {
+			if (!vbo->stride)
+				vbo->max_index = 1;
+			else
+				vbo->max_index = (vbo->buffer->width0 - vbo->buffer_offset) / vbo->stride;
+		}
+		max_index = MIN2(vbo->max_index, max_index);
+	}
+	rctx->nvertex_buffer = count;
+	rctx->vb_max_index = max_index;
+}
+
+
+#define FORMAT_REPLACE(what, withwhat) \
+	case PIPE_FORMAT_##what: *format = PIPE_FORMAT_##withwhat; break
+
+void *r600_create_vertex_elements(struct pipe_context *ctx,
+				  unsigned count,
+				  const struct pipe_vertex_element *elements)
+{
+	struct r600_vertex_element *v = CALLOC_STRUCT(r600_vertex_element);
+	int i;
+	enum pipe_format *format;
+
+	assert(count < 32);
+	if (!v)
+		return NULL;
+
+	v->count = count;
+	memcpy(v->elements, elements, count * sizeof(struct pipe_vertex_element));
+
+	for (i = 0; i < count; i++) {
+		v->hw_format[i] = v->elements[i].src_format;
+		format = &v->hw_format[i];
+
+		switch (*format) {
+                    FORMAT_REPLACE(R64_FLOAT,           R32_FLOAT);
+                    FORMAT_REPLACE(R64G64_FLOAT,        R32G32_FLOAT);
+                    FORMAT_REPLACE(R64G64B64_FLOAT,     R32G32B32_FLOAT);
+                    FORMAT_REPLACE(R64G64B64A64_FLOAT,  R32G32B32A32_FLOAT);
+		default:;
+		}
+		v->incompatible_layout =
+			v->incompatible_layout ||
+			v->elements[i].src_format != v->hw_format[i] ||
+			v->elements[i].src_offset % 4 != 0;
+
+                v->hw_format_size[i] =
+			align(util_format_get_blocksize(v->hw_format[i]), 4);
+	}
+
+	return v;
+}
+
+void *r600_create_shader_state(struct pipe_context *ctx,
+			       const struct pipe_shader_state *state)
+{
+	struct r600_pipe_shader *shader =  CALLOC_STRUCT(r600_pipe_shader);
+	int r;
+
+	r =  r600_pipe_shader_create(ctx, shader, state->tokens);
+	if (r) {
+		return NULL;
+	}
+	return shader;
+}
+
+void r600_bind_ps_shader(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+
+	/* TODO delete old shader */
+	rctx->ps_shader = (struct r600_pipe_shader *)state;
+}
+
+void r600_bind_vs_shader(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+
+	/* TODO delete old shader */
+	rctx->vs_shader = (struct r600_pipe_shader *)state;
+}
+
+void r600_delete_ps_shader(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_pipe_shader *shader = (struct r600_pipe_shader *)state;
+
+	if (rctx->ps_shader == shader) {
+		rctx->ps_shader = NULL;
+	}
+
+	r600_pipe_shader_destroy(ctx, shader);
+	free(shader);
+}
+
+void r600_delete_vs_shader(struct pipe_context *ctx, void *state)
+{
+	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+	struct r600_pipe_shader *shader = (struct r600_pipe_shader *)state;
+
+	if (rctx->vs_shader == shader) {
+		rctx->vs_shader = NULL;
+	}
+
+	r600_pipe_shader_destroy(ctx, shader);
+	free(shader);
+}
diff --git a/src/gallium/drivers/r600/r600_state_inlines.h b/src/gallium/drivers/r600/r600_state_inlines.h
index 81f285027e6..1c1978f8abb 100644
--- a/src/gallium/drivers/r600/r600_state_inlines.h
+++ b/src/gallium/drivers/r600/r600_state_inlines.h
@@ -25,6 +25,7 @@
 
 #include "util/u_format.h"
 #include "r600d.h"
+#include "r600_formats.h"
 
 static INLINE uint32_t r600_translate_blend_function(int blend_func)
 {
@@ -303,6 +304,10 @@ static inline uint32_t r600_translate_colorswap(enum pipe_format format)
 		return V_0280A0_SWAP_STD;
 
 	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_R8G8_UNORM:
+		return V_0280A0_SWAP_STD;
+
+	case PIPE_FORMAT_R16_UNORM:
 		return V_0280A0_SWAP_STD;
 
 		/* 32-bit buffers. */
@@ -342,16 +347,19 @@ static inline uint32_t r600_translate_colorswap(enum pipe_format format)
 	case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
 		return V_0280A0_SWAP_STD_REV;
 
+	case PIPE_FORMAT_R16G16_UNORM:
+		return V_0280A0_SWAP_STD;
+
 		/* 64-bit buffers. */
 	case PIPE_FORMAT_R16G16B16A16_UNORM:
 	case PIPE_FORMAT_R16G16B16A16_SNORM:
-		//		return V_0280A0_COLOR_16_16_16_16;
+		//		return FMT_16_16_16_16;
 	case PIPE_FORMAT_R16G16B16A16_FLOAT:
-		//		return V_0280A0_COLOR_16_16_16_16_FLOAT;
+		//		return FMT_16_16_16_16_FLOAT;
 
 		/* 128-bit buffers. */
 	case PIPE_FORMAT_R32G32B32A32_FLOAT:
-		//		return V_0280A0_COLOR_32_32_32_32_FLOAT;
+		//		return FMT_32_32_32_32_FLOAT;
 		return 0;
 	default:
 		R600_ERR("unsupported colorswap format %d\n", format);
@@ -387,8 +395,12 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format)
 		return V_0280A0_COLOR_16;
 
 	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_R8G8_UNORM:
 		return V_0280A0_COLOR_8_8;
 
+	case PIPE_FORMAT_R16_UNORM:
+		return V_0280A0_COLOR_16;
+
 		/* 32-bit buffers. */
 	case PIPE_FORMAT_A8B8G8R8_SRGB:
 	case PIPE_FORMAT_A8B8G8R8_UNORM:
@@ -426,6 +438,7 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format)
 		return V_0280A0_COLOR_16_16_FLOAT;
 
 	case PIPE_FORMAT_R16G16_SSCALED:
+	case PIPE_FORMAT_R16G16_UNORM:
 		return V_0280A0_COLOR_16_16;
 
 
@@ -510,32 +523,32 @@ static INLINE uint32_t r600_translate_vertex_data_type(enum pipe_format format)
                 case 16:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_038008_FMT_16_FLOAT;
+				result = FMT_16_FLOAT;
 				break;
 			case 2:
-				result = V_038008_FMT_16_16_FLOAT;
+				result = FMT_16_16_FLOAT;
 				break;
 			case 3:
-				result = V_038008_FMT_16_16_16_FLOAT;
+				result = FMT_16_16_16_FLOAT;
 				break;
 			case 4:
-				result = V_038008_FMT_16_16_16_16_FLOAT;
+				result = FMT_16_16_16_16_FLOAT;
 				break;
 			}
 			break;
                 case 32:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_038008_FMT_32_FLOAT;
+				result = FMT_32_FLOAT;
 				break;
 			case 2:
-				result = V_038008_FMT_32_32_FLOAT;
+				result = FMT_32_32_FLOAT;
 				break;
 			case 3:
-				result = V_038008_FMT_32_32_32_FLOAT;
+				result = FMT_32_32_32_FLOAT;
 				break;
 			case 4:
-				result = V_038008_FMT_32_32_32_32_FLOAT;
+				result = FMT_32_32_32_32_FLOAT;
 				break;
 			}
 			break;
@@ -551,48 +564,48 @@ static INLINE uint32_t r600_translate_vertex_data_type(enum pipe_format format)
                 case 8:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_038008_FMT_8;
+				result = FMT_8;
 				break;
 			case 2:
-				result = V_038008_FMT_8_8;
+				result = FMT_8_8;
 				break;
 			case 3:
-			//	result = V_038008_FMT_8_8_8; /* fails piglit draw-vertices test */
+			//	result = FMT_8_8_8; /* fails piglit draw-vertices test */
 			//	break;
 			case 4:
-				result = V_038008_FMT_8_8_8_8;
+				result = FMT_8_8_8_8;
 				break;
 			}
 			break;
                 case 16:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_038008_FMT_16;
+				result = FMT_16;
 				break;
 			case 2:
-				result = V_038008_FMT_16_16;
+				result = FMT_16_16;
 				break;
 			case 3:
-			//	result = V_038008_FMT_16_16_16; /* fails piglit draw-vertices test */
+			//	result = FMT_16_16_16; /* fails piglit draw-vertices test */
 			//	break;
 			case 4:
-				result = V_038008_FMT_16_16_16_16;
+				result = FMT_16_16_16_16;
 				break;
 			}
 			break;
                 case 32:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_038008_FMT_32;
+				result = FMT_32;
 				break;
 			case 2:
-				result = V_038008_FMT_32_32;
+				result = FMT_32_32;
 				break;
 			case 3:
-				result = V_038008_FMT_32_32_32;
+				result = FMT_32_32_32;
 				break;
 			case 4:
-				result = V_038008_FMT_32_32_32_32;
+				result = FMT_32_32_32_32;
 				break;
 			}
 			break;
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
index c46bfa66ba1..4ebd5b754b3 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -35,6 +35,7 @@
 #include "r600_resource.h"
 #include "r600_state_inlines.h"
 #include "r600d.h"
+#include "r600_formats.h"
 
 extern struct u_resource_vtbl r600_texture_vtbl;
 
@@ -53,11 +54,30 @@ static void r600_copy_from_tiled_texture(struct pipe_context *ctx, struct r600_t
 				transfer->box.width, transfer->box.height);
 }
 
-static unsigned long r600_texture_get_offset(struct r600_resource_texture *rtex,
+
+/* Copy from a detiled texture to a tiled one. */
+static void r600_copy_into_tiled_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
+{
+	struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
+	struct pipe_resource *texture = transfer->resource;
+	struct pipe_subresource subsrc;
+
+	subsrc.face = 0;
+	subsrc.level = 0;
+	ctx->resource_copy_region(ctx, texture, transfer->sr,
+				  transfer->box.x, transfer->box.y, transfer->box.z,
+				  rtransfer->linear_texture, subsrc,
+				  0, 0, 0,
+				  transfer->box.width, transfer->box.height);
+
+	ctx->flush(ctx, 0, NULL);
+}
+
+static unsigned r600_texture_get_offset(struct r600_resource_texture *rtex,
 					unsigned level, unsigned zslice,
 					unsigned face)
 {
-	unsigned long offset = rtex->offset[level];
+	unsigned offset = rtex->offset[level];
 
 	switch (rtex->resource.base.b.target) {
 	case PIPE_TEXTURE_3D:
@@ -72,22 +92,158 @@ static unsigned long r600_texture_get_offset(struct r600_resource_texture *rtex,
 	}
 }
 
-static void r600_setup_miptree(struct r600_resource_texture *rtex, enum chip_class chipc)
+static unsigned r600_get_pixel_alignment(struct pipe_screen *screen,
+					 enum pipe_format format,
+					 unsigned array_mode)
+{
+	struct r600_screen* rscreen = (struct r600_screen *)screen;
+	unsigned pixsize = util_format_get_blocksize(format);
+	int p_align;
+
+	switch(array_mode) {
+	case V_038000_ARRAY_1D_TILED_THIN1:
+		p_align = MAX2(8,
+			       ((rscreen->tiling_info->group_bytes / 8 / pixsize)));
+		break;
+	case V_038000_ARRAY_2D_TILED_THIN1:
+		p_align = MAX2(rscreen->tiling_info->num_banks,
+			       (((rscreen->tiling_info->group_bytes / 8 / pixsize)) *
+				rscreen->tiling_info->num_banks));
+		break;
+	case 0:
+	default:
+		p_align = 64;
+		break;
+	}
+	return p_align;
+}
+
+static unsigned r600_get_height_alignment(struct pipe_screen *screen,
+					  unsigned array_mode)
+{
+	struct r600_screen* rscreen = (struct r600_screen *)screen;
+	int h_align;
+
+	switch (array_mode) {
+	case V_038000_ARRAY_2D_TILED_THIN1:
+		h_align = rscreen->tiling_info->num_channels * 8;
+		break;
+	case V_038000_ARRAY_1D_TILED_THIN1:
+		h_align = 8;
+		break;
+	default:
+		h_align = 1;
+		break;
+	}
+	return h_align;
+}
+
+static unsigned mip_minify(unsigned size, unsigned level)
+{
+	unsigned val;
+	val = u_minify(size, level);
+	if (level > 0)
+		val = util_next_power_of_two(val);
+	return val;
+}
+
+static unsigned r600_texture_get_stride(struct pipe_screen *screen,
+					struct r600_resource_texture *rtex,
+					unsigned level)
+{
+	struct pipe_resource *ptex = &rtex->resource.base.b;
+	struct radeon *radeon = (struct radeon *)screen->winsys;
+	enum chip_class chipc = r600_get_family_class(radeon);
+	unsigned width, stride, tile_width;
+	
+	if (rtex->pitch_override)
+		return rtex->pitch_override;
+
+	width = mip_minify(ptex->width0, level);
+	if (util_format_is_plain(ptex->format)) {
+		tile_width = r600_get_pixel_alignment(screen, ptex->format,
+						      rtex->array_mode[level]);
+		width = align(width, tile_width);
+	}
+	stride = util_format_get_stride(ptex->format, width);
+	if (chipc == EVERGREEN)
+		stride = align(stride, 512);
+	return stride;
+}
+
+static unsigned r600_texture_get_nblocksy(struct pipe_screen *screen,
+					  struct r600_resource_texture *rtex,
+					  unsigned level)
 {
 	struct pipe_resource *ptex = &rtex->resource.base.b;
-	unsigned long w, h, pitch, size, layer_size, i, offset;
+	unsigned height, tile_height;
 
-	rtex->bpt = util_format_get_blocksize(ptex->format);
-	for (i = 0, offset = 0; i <= ptex->last_level; i++) {
-		w = u_minify(ptex->width0, i);
-		h = u_minify(ptex->height0, i);
-		h = util_next_power_of_two(h);
-		pitch = util_format_get_stride(ptex->format, align(w, 64));
-		if (chipc == EVERGREEN)
-			pitch = align(pitch, 512);
+	height = mip_minify(ptex->height0, level);
+	if (util_format_is_plain(ptex->format)) {
+		tile_height = r600_get_height_alignment(screen,
+							rtex->array_mode[level]);
+		height = align(height, tile_height);
+	}
+	return util_format_get_nblocksy(ptex->format, height);
+}
+
+/* Get a width in pixels from a stride in bytes. */
+static unsigned pitch_to_width(enum pipe_format format,
+                                unsigned pitch_in_bytes)
+{
+    return (pitch_in_bytes / util_format_get_blocksize(format)) *
+            util_format_get_blockwidth(format);
+}
+
+static void r600_texture_set_array_mode(struct pipe_screen *screen,
+					struct r600_resource_texture *rtex,
+					unsigned level, unsigned array_mode)
+{
+	struct pipe_resource *ptex = &rtex->resource.base.b;
+
+	switch (array_mode) {
+	case V_0280A0_ARRAY_LINEAR_GENERAL:
+	case V_0280A0_ARRAY_LINEAR_ALIGNED:
+	case V_0280A0_ARRAY_1D_TILED_THIN1:
+	default:
+		rtex->array_mode[level] = array_mode;
+		break;
+	case V_0280A0_ARRAY_2D_TILED_THIN1:
+	{
+		unsigned w, h, tile_height, tile_width;
+
+		tile_height = r600_get_height_alignment(screen, array_mode);
+		tile_width = r600_get_pixel_alignment(screen, ptex->format, array_mode);
+
+		w = mip_minify(ptex->width0, level);
+		h = mip_minify(ptex->height0, level);
+		if (w < tile_width || h < tile_height)
+			rtex->array_mode[level] = V_0280A0_ARRAY_1D_TILED_THIN1;
 		else
-			pitch = align(pitch, 256);
-		layer_size = pitch * h;
+			rtex->array_mode[level] = array_mode;
+	}
+	break;
+	}
+}
+
+static void r600_setup_miptree(struct pipe_screen *screen,
+			       struct r600_resource_texture *rtex,
+			       unsigned array_mode)
+{
+	struct pipe_resource *ptex = &rtex->resource.base.b;
+	struct radeon *radeon = (struct radeon *)screen->winsys;
+	enum chip_class chipc = r600_get_family_class(radeon);
+	unsigned pitch, size, layer_size, i, offset;
+	unsigned nblocksy;
+
+	for (i = 0, offset = 0; i <= ptex->last_level; i++) {
+		r600_texture_set_array_mode(screen, rtex, i, array_mode);
+
+		pitch = r600_texture_get_stride(screen, rtex, i);
+		nblocksy = r600_texture_get_nblocksy(screen, rtex, i);
+
+		layer_size = pitch * nblocksy;
+
 		if (ptex->target == PIPE_TEXTURE_CUBE) {
 			if (chipc >= R700)
 				size = layer_size * 8;
@@ -98,41 +254,69 @@ static void r600_setup_miptree(struct r600_resource_texture *rtex, enum chip_cla
 			size = layer_size * u_minify(ptex->depth0, i);
 		rtex->offset[i] = offset;
 		rtex->layer_size[i] = layer_size;
-		rtex->pitch[i] = pitch;
-		rtex->width[i] = w;
-		rtex->height[i] = h;
+		rtex->pitch_in_bytes[i] = pitch;
+		rtex->pitch_in_pixels[i] = pitch_to_width(ptex->format, pitch);
 		offset += size;
 	}
 	rtex->size = offset;
 }
 
-struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
-						const struct pipe_resource *templ)
+static struct r600_resource_texture *
+r600_texture_create_object(struct pipe_screen *screen,
+			   const struct pipe_resource *base,
+			   unsigned array_mode,
+			   unsigned pitch_in_bytes_override,
+			   unsigned max_buffer_size,
+			   struct r600_bo *bo)
 {
 	struct r600_resource_texture *rtex;
 	struct r600_resource *resource;
 	struct radeon *radeon = (struct radeon *)screen->winsys;
 
 	rtex = CALLOC_STRUCT(r600_resource_texture);
-	if (!rtex) {
+	if (rtex == NULL)
 		return NULL;
-	}
+
 	resource = &rtex->resource;
-	resource->base.b = *templ;
+	resource->base.b = *base;
 	resource->base.vtbl = &r600_texture_vtbl;
 	pipe_reference_init(&resource->base.b.reference, 1);
 	resource->base.b.screen = screen;
-	r600_setup_miptree(rtex, r600_get_family_class(radeon));
-
-	/* FIXME alignment 4096 enought ? too much ? */
+	resource->bo = bo;
 	resource->domain = r600_domain_from_usage(resource->base.b.bind);
+	rtex->pitch_override = pitch_in_bytes_override;
+
+	if (array_mode)
+		rtex->tiled = 1;
+	r600_setup_miptree(screen, rtex, array_mode);
+
 	resource->size = rtex->size;
-	resource->bo = r600_bo(radeon, rtex->size, 4096, 0);
-	if (resource->bo == NULL) {
-		FREE(rtex);
-		return NULL;
+
+	if (!resource->bo) {
+		resource->bo = r600_bo(radeon, rtex->size, 4096, 0);
+		if (!resource->bo) {
+			FREE(rtex);
+			return NULL;
+		}
+	}
+	return rtex;
+}
+
+struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
+						const struct pipe_resource *templ)
+{
+	unsigned array_mode = 0;
+
+	if (debug_get_bool_option("R600_FORCE_TILING", FALSE)) {
+		if (!(templ->flags & R600_RESOURCE_FLAG_TRANSFER) &&
+		    !(templ->bind & PIPE_BIND_SCANOUT)) {
+			array_mode = V_038000_ARRAY_2D_TILED_THIN1;
+		}
 	}
-	return &resource->base.b;
+
+	return (struct pipe_resource *)r600_texture_create_object(screen, templ, array_mode,
+								  0, 0, NULL);
+
 }
 
 static void r600_texture_destroy(struct pipe_screen *screen,
@@ -157,24 +341,27 @@ static struct pipe_surface *r600_get_tex_surface(struct pipe_screen *screen,
 						unsigned zslice, unsigned flags)
 {
 	struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture;
-	struct pipe_surface *surface = CALLOC_STRUCT(pipe_surface);
-	unsigned long offset;
+	struct r600_surface *surface = CALLOC_STRUCT(r600_surface);
+	unsigned offset, tile_height;
 
 	if (surface == NULL)
 		return NULL;
 	offset = r600_texture_get_offset(rtex, level, zslice, face);
-	pipe_reference_init(&surface->reference, 1);
-	pipe_resource_reference(&surface->texture, texture);
-	surface->format = texture->format;
-	surface->width = u_minify(texture->width0, level);
-	surface->height = u_minify(texture->height0, level);
-	surface->offset = offset;
-	surface->usage = flags;
-	surface->zslice = zslice;
-	surface->texture = texture;
-	surface->face = face;
-	surface->level = level;
-	return surface;
+	pipe_reference_init(&surface->base.reference, 1);
+	pipe_resource_reference(&surface->base.texture, texture);
+	surface->base.format = texture->format;
+	surface->base.width = mip_minify(texture->width0, level);
+	surface->base.height = mip_minify(texture->height0, level);
+	surface->base.offset = offset;
+	surface->base.usage = flags;
+	surface->base.zslice = zslice;
+	surface->base.texture = texture;
+	surface->base.face = face;
+	surface->base.level = level;
+
+	tile_height = r600_get_height_alignment(screen, rtex->array_mode[level]);
+	surface->aligned_height = align(surface->base.height, tile_height);
+	return &surface->base;
 }
 
 static void r600_tex_surface_destroy(struct pipe_surface *surface)
@@ -183,46 +370,29 @@ static void r600_tex_surface_destroy(struct pipe_surface *surface)
 	FREE(surface);
 }
 
+
 struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen,
 					       const struct pipe_resource *templ,
 					       struct winsys_handle *whandle)
 {
 	struct radeon *rw = (struct radeon*)screen->winsys;
-	struct r600_resource_texture *rtex;
-	struct r600_resource *resource;
 	struct r600_bo *bo = NULL;
+	unsigned array_mode = 0;
 
 	/* Support only 2D textures without mipmaps */
 	if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) ||
 	      templ->depth0 != 1 || templ->last_level != 0)
 		return NULL;
 
-	rtex = CALLOC_STRUCT(r600_resource_texture);
-	if (rtex == NULL)
-		return NULL;
-
-	bo = r600_bo_handle(rw, whandle->handle);
+	bo = r600_bo_handle(rw, whandle->handle, &array_mode);
 	if (bo == NULL) {
-		FREE(rtex);
 		return NULL;
 	}
 
-	resource = &rtex->resource;
-	resource->base.b = *templ;
-	resource->base.vtbl = &r600_texture_vtbl;
-	pipe_reference_init(&resource->base.b.reference, 1);
-	resource->base.b.screen = screen;
-	resource->bo = bo;
-	rtex->depth = 0;
-	rtex->pitch_override = whandle->stride;
-	rtex->bpt = util_format_get_blocksize(templ->format);
-	rtex->pitch[0] = whandle->stride;
-	rtex->width[0] = templ->width0;
-	rtex->height[0] = templ->height0;
-	rtex->offset[0] = 0;
-	rtex->size = align(rtex->pitch[0] * templ->height0, 64);
-
-	return &resource->base.b;
+	return (struct pipe_resource *)r600_texture_create_object(screen, templ, array_mode,
+								  whandle->stride,
+								  0,
+								  bo);
 }
 
 static unsigned int r600_texture_is_referenced(struct pipe_context *context,
@@ -248,14 +418,14 @@ int r600_texture_depth_flush(struct pipe_context *ctx,
 	resource.format = texture->format;
 	resource.width0 = texture->width0;
 	resource.height0 = texture->height0;
-	resource.depth0 = 0;
+	resource.depth0 = 1;
 	resource.last_level = 0;
 	resource.nr_samples = 0;
 	resource.usage = PIPE_USAGE_DYNAMIC;
 	resource.bind = 0;
-	resource.flags = 0;
+	resource.flags = R600_RESOURCE_FLAG_TRANSFER;
 
-	resource.bind |= PIPE_BIND_RENDER_TARGET;
+	resource.bind |= PIPE_BIND_DEPTH_STENCIL;
 
 	rtex->flushed_depth_texture = (struct r600_resource_texture *)ctx->screen->resource_create(ctx->screen, &resource);
 	if (rtex->flushed_depth_texture == NULL) {
@@ -286,8 +456,6 @@ struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
 	trans->transfer.sr = sr;
 	trans->transfer.usage = usage;
 	trans->transfer.box = *box;
-	trans->transfer.stride = rtex->pitch[sr.level];
-	trans->offset = r600_texture_get_offset(rtex, sr.level, box->z, sr.face);
 	if (rtex->depth) {
 		r = r600_texture_depth_flush(ctx, texture);
 		if (r < 0) {
@@ -301,12 +469,12 @@ struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
 		resource.format = texture->format;
 		resource.width0 = box->width;
 		resource.height0 = box->height;
-		resource.depth0 = 0;
+		resource.depth0 = 1;
 		resource.last_level = 0;
 		resource.nr_samples = 0;
 		resource.usage = PIPE_USAGE_DYNAMIC;
 		resource.bind = 0;
-		resource.flags = 0;
+		resource.flags = R600_RESOURCE_FLAG_TRANSFER;
 		/* For texture reading, the temporary (detiled) texture is used as
 		 * a render target when blitting from a tiled texture. */
 		if (usage & PIPE_TRANSFER_READ) {
@@ -325,6 +493,9 @@ struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
 			FREE(trans);
 			return NULL;
 		}
+
+		trans->transfer.stride =
+		  ((struct r600_resource_texture *)trans->linear_texture)->pitch_in_bytes[0];
 		if (usage & PIPE_TRANSFER_READ) {
 			/* We cannot map a tiled texture directly because the data is
 			 * in a different order, therefore we do detiling using a blit. */
@@ -332,7 +503,10 @@ struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx,
 			/* Always referenced in the blit. */
 			ctx->flush(ctx, 0, NULL);
 		}
+		return &trans->transfer;
 	}
+	trans->transfer.stride = rtex->pitch_in_bytes[sr.level];
+	trans->offset = r600_texture_get_offset(rtex, sr.level, box->z, sr.face);
 	return &trans->transfer;
 }
 
@@ -343,12 +517,12 @@ void r600_texture_transfer_destroy(struct pipe_context *ctx,
 	struct r600_resource_texture *rtex = (struct r600_resource_texture*)transfer->resource;
 
 	if (rtransfer->linear_texture) {
+		if (transfer->usage & PIPE_TRANSFER_WRITE) {
+			r600_copy_into_tiled_texture(ctx, rtransfer);
+		}
 		pipe_resource_reference(&rtransfer->linear_texture, NULL);
 	}
 	if (rtex->flushed_depth_texture) {
-		if (transfer->usage & PIPE_TRANSFER_WRITE) {
-			// TODO
-		}
 		pipe_resource_reference((struct pipe_resource **)&rtex->flushed_depth_texture, NULL);
 	}
 	pipe_resource_reference(&transfer->resource, NULL);
@@ -362,7 +536,7 @@ void* r600_texture_transfer_map(struct pipe_context *ctx,
 	struct r600_bo *bo;
 	enum pipe_format format = transfer->resource->format;
 	struct radeon *radeon = (struct radeon *)ctx->screen->winsys;
-	unsigned long offset = 0;
+	unsigned offset = 0;
 	char *map;
 
 	if (rtransfer->linear_texture) {
@@ -500,15 +674,23 @@ uint32_t r600_translate_texformat(enum pipe_format format,
 	case UTIL_FORMAT_COLORSPACE_ZS:
 		switch (format) {
 		case PIPE_FORMAT_Z16_UNORM:
-			result = V_0280A0_COLOR_16;
+			result = FMT_16;
 			goto out_word4;
+		case PIPE_FORMAT_X24S8_USCALED:
+			word4 |= S_038010_NUM_FORMAT_ALL(V_038010_SQ_NUM_FORMAT_INT);
 		case PIPE_FORMAT_Z24X8_UNORM:
 		case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-			result = V_0280A0_COLOR_8_24;
+			result = FMT_8_24;
 			goto out_word4;
+		case PIPE_FORMAT_S8X24_USCALED:
+			word4 |= S_038010_NUM_FORMAT_ALL(V_038010_SQ_NUM_FORMAT_INT);
 		case PIPE_FORMAT_X8Z24_UNORM:
 		case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
-			result = V_0280A0_COLOR_24_8;
+			result = FMT_24_8;
+			goto out_word4;
+		case PIPE_FORMAT_S8_USCALED:
+			result = V_0280A0_COLOR_8;
+			word4 |= S_038010_NUM_FORMAT_ALL(V_038010_SQ_NUM_FORMAT_INT);
 			goto out_word4;
 		default:
 			goto out_unknown;
@@ -562,7 +744,7 @@ uint32_t r600_translate_texformat(enum pipe_format format,
 			if (desc->channel[0].size == 5 &&
 			    desc->channel[1].size == 6 &&
 			    desc->channel[2].size == 5) {
-				result = V_0280A0_COLOR_5_6_5;
+				result = FMT_5_6_5;
 				goto out_word4;
 			}
 			goto out_unknown;
@@ -571,14 +753,14 @@ uint32_t r600_translate_texformat(enum pipe_format format,
 			    desc->channel[1].size == 5 &&
 			    desc->channel[2].size == 5 &&
 			    desc->channel[3].size == 1) {
-				result = V_0280A0_COLOR_1_5_5_5;
+				result = FMT_1_5_5_5;
 				goto out_word4;
 			}
 			if (desc->channel[0].size == 10 &&
 			    desc->channel[1].size == 10 &&
 			    desc->channel[2].size == 10 &&
 			    desc->channel[3].size == 2) {
-				result = V_0280A0_COLOR_10_10_10_2;
+				result = FMT_10_10_10_2;
 				goto out_word4;
 			}
 			goto out_unknown;
@@ -609,36 +791,36 @@ uint32_t r600_translate_texformat(enum pipe_format format,
 		case 4:
 			switch (desc->nr_channels) {
 			case 2:
-				result = V_0280A0_COLOR_4_4;
+				result = FMT_4_4;
 				goto out_word4;
 			case 4:
-				result = V_0280A0_COLOR_4_4_4_4;
+				result = FMT_4_4_4_4;
 				goto out_word4;
 			}
 			goto out_unknown;
 		case 8:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_0280A0_COLOR_8;
+				result = FMT_8;
 				goto out_word4;
 			case 2:
-				result = V_0280A0_COLOR_8_8;
+				result = FMT_8_8;
 				goto out_word4;
 			case 4:
-				result = V_0280A0_COLOR_8_8_8_8;
+				result = FMT_8_8_8_8;
 				goto out_word4;
 			}
 			goto out_unknown;
 		case 16:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_0280A0_COLOR_16;
+				result = FMT_16;
 				goto out_word4;
 			case 2:
-				result = V_0280A0_COLOR_16_16;
+				result = FMT_16_16;
 				goto out_word4;
 			case 4:
-				result = V_0280A0_COLOR_16_16_16_16;
+				result = FMT_16_16_16_16;
 				goto out_word4;
 			}
 		}
@@ -649,26 +831,26 @@ uint32_t r600_translate_texformat(enum pipe_format format,
 		case 16:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_0280A0_COLOR_16_FLOAT;
+				result = FMT_16_FLOAT;
 				goto out_word4;
 			case 2:
-				result = V_0280A0_COLOR_16_16_FLOAT;
+				result = FMT_16_16_FLOAT;
 				goto out_word4;
 			case 4:
-				result = V_0280A0_COLOR_16_16_16_16_FLOAT;
+				result = FMT_16_16_16_16_FLOAT;
 				goto out_word4;
 			}
 			goto out_unknown;
 		case 32:
 			switch (desc->nr_channels) {
 			case 1:
-				result = V_0280A0_COLOR_32_FLOAT;
+				result = FMT_32_FLOAT;
 				goto out_word4;
 			case 2:
-				result = V_0280A0_COLOR_32_32_FLOAT;
+				result = FMT_32_32_FLOAT;
 				goto out_word4;
 			case 4:
-				result = V_0280A0_COLOR_32_32_32_32_FLOAT;
+				result = FMT_32_32_32_32_FLOAT;
 				goto out_word4;
 			}
 		}
diff --git a/src/gallium/drivers/r600/r600_translate.c b/src/gallium/drivers/r600/r600_translate.c
new file mode 100644
index 00000000000..9a07cf2073f
--- /dev/null
+++ b/src/gallium/drivers/r600/r600_translate.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2010 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Dave Airlie <[email protected]>
+ */
+#include "translate/translate_cache.h"
+#include "translate/translate.h"
+#include <pipebuffer/pb_buffer.h>
+#include <util/u_index_modify.h>
+#include "r600_pipe.h"
+
+void r600_begin_vertex_translate(struct r600_pipe_context *rctx)
+{
+	struct pipe_context *pipe = &rctx->context;
+	struct translate_key key = {0};
+	struct translate_element *te;
+	unsigned tr_elem_index[PIPE_MAX_ATTRIBS] = {0};
+	struct translate *tr;
+	struct r600_vertex_element *ve = rctx->vertex_elements;
+	boolean vb_translated[PIPE_MAX_ATTRIBS] = {0};
+	void *vb_map[PIPE_MAX_ATTRIBS] = {0}, *out_map;
+	struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0}, *out_transfer;
+	struct pipe_resource *out_buffer;
+	unsigned i, num_verts;
+
+	/* Initialize the translate key, i.e. the recipe how vertices should be
+	 * translated. */
+	for (i = 0; i < ve->count; i++) {
+		struct pipe_vertex_buffer *vb =
+			&rctx->vertex_buffer[ve->elements[i].vertex_buffer_index];
+		enum pipe_format output_format = ve->hw_format[i];
+		unsigned output_format_size = ve->hw_format_size[i];
+
+		/* Check for support. */
+		if (ve->elements[i].src_format == ve->hw_format[i] &&
+		    (vb->buffer_offset + ve->elements[i].src_offset) % 4 == 0 &&
+		    vb->stride % 4 == 0) {
+			continue;
+		}
+
+		/* Workaround for translate: output floats instead of halfs. */
+		switch (output_format) {
+		case PIPE_FORMAT_R16_FLOAT:
+			output_format = PIPE_FORMAT_R32_FLOAT;
+			output_format_size = 4;
+			break;
+		case PIPE_FORMAT_R16G16_FLOAT:
+			output_format = PIPE_FORMAT_R32G32_FLOAT;
+			output_format_size = 8;
+			break;
+		case PIPE_FORMAT_R16G16B16_FLOAT:
+			output_format = PIPE_FORMAT_R32G32B32_FLOAT;
+			output_format_size = 12;
+			break;
+		case PIPE_FORMAT_R16G16B16A16_FLOAT:
+			output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+			output_format_size = 16;
+			break;
+		default:;
+		}
+
+		/* Add this vertex element. */
+		te = &key.element[key.nr_elements];
+		/*te->type;
+		  te->instance_divisor;*/
+		te->input_buffer = ve->elements[i].vertex_buffer_index;
+		te->input_format = ve->elements[i].src_format;
+		te->input_offset = vb->buffer_offset + ve->elements[i].src_offset;
+		te->output_format = output_format;
+		te->output_offset = key.output_stride;
+
+		key.output_stride += output_format_size;
+		vb_translated[ve->elements[i].vertex_buffer_index] = TRUE;
+		tr_elem_index[i] = key.nr_elements;
+		key.nr_elements++;
+	}
+
+	/* Get a translate object. */
+	tr = translate_cache_find(rctx->tran.translate_cache, &key);
+
+	/* Map buffers we want to translate. */
+	for (i = 0; i < rctx->nvertex_buffer; i++) {
+		if (vb_translated[i]) {
+			struct pipe_vertex_buffer *vb = &rctx->vertex_buffer[i];
+
+			vb_map[i] = pipe_buffer_map(pipe, vb->buffer,
+						    PIPE_TRANSFER_READ, &vb_transfer[i]);
+
+			tr->set_buffer(tr, i, vb_map[i], vb->stride, vb->max_index);
+		}
+	}
+
+	/* Create and map the output buffer. */
+	num_verts = rctx->vb_max_index + 1;
+
+	out_buffer = pipe_buffer_create(&rctx->screen->screen,
+					PIPE_BIND_VERTEX_BUFFER,
+					key.output_stride * num_verts);
+
+	out_map = pipe_buffer_map(pipe, out_buffer, PIPE_TRANSFER_WRITE,
+				  &out_transfer);
+
+	/* Translate. */
+	tr->run(tr, 0, num_verts, 0, out_map);
+
+	/* Unmap all buffers. */
+	for (i = 0; i < rctx->nvertex_buffer; i++) {
+		if (vb_translated[i]) {
+			pipe_buffer_unmap(pipe, rctx->vertex_buffer[i].buffer,
+					  vb_transfer[i]);
+		}
+	}
+
+	pipe_buffer_unmap(pipe, out_buffer, out_transfer);
+
+	/* Setup the new vertex buffer in the first free slot. */
+	for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
+		struct pipe_vertex_buffer *vb = &rctx->vertex_buffer[i];
+
+		if (!vb->buffer) {
+			pipe_resource_reference(&vb->buffer, out_buffer);
+			vb->buffer_offset = 0;
+			vb->max_index = num_verts - 1;
+			vb->stride = key.output_stride;
+			rctx->tran.vb_slot = i;
+			break;
+		}
+	}
+
+	/* Save and replace vertex elements. */
+	{
+		struct pipe_vertex_element new_velems[PIPE_MAX_ATTRIBS];
+
+		rctx->tran.saved_velems = rctx->vertex_elements;
+
+		for (i = 0; i < ve->count; i++) {
+			if (vb_translated[ve->elements[i].vertex_buffer_index]) {
+				te = &key.element[tr_elem_index[i]];
+				new_velems[i].instance_divisor = ve->elements[i].instance_divisor;
+				new_velems[i].src_format = te->output_format;
+				new_velems[i].src_offset = te->output_offset;
+				new_velems[i].vertex_buffer_index = rctx->tran.vb_slot;
+			} else {
+				memcpy(&new_velems[i], &ve->elements[i],
+				       sizeof(struct pipe_vertex_element));
+			}
+		}
+
+		rctx->tran.new_velems =
+			pipe->create_vertex_elements_state(pipe, ve->count, new_velems);
+		pipe->bind_vertex_elements_state(pipe, rctx->tran.new_velems);
+	}
+
+	pipe_resource_reference(&out_buffer, NULL);
+}
+
+void r600_end_vertex_translate(struct r600_pipe_context *rctx)
+{
+	struct pipe_context *pipe = &rctx->context;
+
+	/* Restore vertex elements. */
+	pipe->bind_vertex_elements_state(pipe, rctx->tran.saved_velems);
+	pipe->delete_vertex_elements_state(pipe, rctx->tran.new_velems);
+
+	/* Delete the now-unused VBO. */
+	pipe_resource_reference(&rctx->vertex_buffer[rctx->tran.vb_slot].buffer,
+				NULL);
+}
+
+void r600_translate_index_buffer(struct r600_pipe_context *r600,
+					struct pipe_resource **index_buffer,
+					unsigned *index_size,
+					unsigned *start, unsigned count)
+{
+	switch (*index_size) {
+	case 1:
+		util_shorten_ubyte_elts(&r600->context, index_buffer, 0, *start, count);
+		*index_size = 2;
+		*start = 0;
+		break;
+
+	case 2:
+		if (*start % 2 != 0) {
+			util_rebuild_ushort_elts(&r600->context, index_buffer, 0, *start, count);
+			*start = 0;
+		}
+		break;
+
+	case 4:
+		break;
+	}
+}
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index a96d2ce26c1..a3cb5b86004 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -667,6 +667,9 @@
 #define   S_02880C_Z_EXPORT_ENABLE(x)                  (((x) & 0x1) << 0)
 #define   G_02880C_Z_EXPORT_ENABLE(x)                  (((x) >> 0) & 0x1)
 #define   C_02880C_Z_EXPORT_ENABLE                     0xFFFFFFFE
+#define   S_02880C_STENCIL_REF_EXPORT_ENABLE(x)        (((x) & 0x1) << 1)
+#define   G_02880C_STENCIL_REF_EXPORT_ENABLE(x)        (((x) >> 1) & 0x1)
+#define   C_02880C_STENCIL_REF_EXPORT_ENABLE           0xFFFFFFFD
 #define   S_02880C_Z_ORDER(x)                          (((x) & 0x3) << 4)
 #define   G_02880C_Z_ORDER(x)                          (((x) >> 4) & 0x3)
 #define   C_02880C_Z_ORDER                             0xFFFFFCFF
@@ -901,6 +904,10 @@
 #define   S_038000_TILE_MODE(x)                        (((x) & 0xF) << 3)
 #define   G_038000_TILE_MODE(x)                        (((x) >> 3) & 0xF)
 #define   C_038000_TILE_MODE                           0xFFFFFF87
+#define     V_038000_ARRAY_LINEAR_GENERAL              0x00000000
+#define     V_038000_ARRAY_LINEAR_ALIGNED              0x00000001
+#define     V_038000_ARRAY_1D_TILED_THIN1              0x00000002
+#define     V_038000_ARRAY_2D_TILED_THIN1              0x00000004
 #define   S_038000_TILE_TYPE(x)                        (((x) & 0x1) << 7)
 #define   G_038000_TILE_TYPE(x)                        (((x) >> 7) & 0x1)
 #define   C_038000_TILE_TYPE                           0xFFFFFF7F
@@ -1025,45 +1032,7 @@
 #define   S_038008_DATA_FORMAT(x)                      (((x) & 0x3F) << 20)
 #define   G_038008_DATA_FORMAT(x)                      (((x) >> 20) & 0x3F)
 #define   C_038008_DATA_FORMAT                         0xFC0FFFFF
-#define     V_038008_FMT_INVALID                     0x00000000
-#define     V_038008_FMT_8                           0x00000001
-#define     V_038008_FMT_4_4                         0x00000002
-#define     V_038008_FMT_3_3_2                       0x00000003
-#define     V_038008_FMT_16                          0x00000005
-#define     V_038008_FMT_16_FLOAT                    0x00000006
-#define     V_038008_FMT_8_8                         0x00000007
-#define     V_038008_FMT_5_6_5                       0x00000008
-#define     V_038008_FMT_6_5_5                       0x00000009
-#define     V_038008_FMT_1_5_5_5                     0x0000000A
-#define     V_038008_FMT_4_4_4_4                     0x0000000B
-#define     V_038008_FMT_5_5_5_1                     0x0000000C
-#define     V_038008_FMT_32                          0x0000000D
-#define     V_038008_FMT_32_FLOAT                    0x0000000E
-#define     V_038008_FMT_16_16                       0x0000000F
-#define     V_038008_FMT_16_16_FLOAT                 0x00000010
-#define     V_038008_FMT_8_24                        0x00000011
-#define     V_038008_FMT_8_24_FLOAT                  0x00000012
-#define     V_038008_FMT_24_8                        0x00000013
-#define     V_038008_FMT_24_8_FLOAT                  0x00000014
-#define     V_038008_FMT_10_11_11                    0x00000015
-#define     V_038008_FMT_10_11_11_FLOAT              0x00000016
-#define     V_038008_FMT_11_11_10                    0x00000017
-#define     V_038008_FMT_11_11_10_FLOAT              0x00000018
-#define     V_038008_FMT_2_10_10_10                  0x00000019
-#define     V_038008_FMT_8_8_8_8                     0x0000001A
-#define     V_038008_FMT_10_10_10_2                  0x0000001B
-#define     V_038008_FMT_X24_8_32_FLOAT              0x0000001C
-#define     V_038008_FMT_32_32                       0x0000001D
-#define     V_038008_FMT_32_32_FLOAT                 0x0000001E
-#define     V_038008_FMT_16_16_16_16                 0x0000001F
-#define     V_038008_FMT_16_16_16_16_FLOAT           0x00000020
-#define     V_038008_FMT_32_32_32_32                 0x00000022
-#define     V_038008_FMT_32_32_32_32_FLOAT           0x00000023
-#define     V_038008_FMT_8_8_8                       0x0000002c
-#define     V_038008_FMT_16_16_16                    0x0000002d
-#define     V_038008_FMT_16_16_16_FLOAT              0x0000002e
-#define     V_038008_FMT_32_32_32                    0x0000002f
-#define     V_038008_FMT_32_32_32_FLOAT              0x00000030
+
 #define   S_038008_NUM_FORMAT_ALL(x)                   (((x) & 0x3) << 26)
 #define   G_038008_NUM_FORMAT_ALL(x)                   (((x) >> 26) & 0x3)
 #define   C_038008_NUM_FORMAT_ALL                      0xF3FFFFFF
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 67e2c8f8bc4..346e1b402ba 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -158,9 +158,17 @@ exec_run( const struct sp_fragment_shader *base,
          case TGSI_SEMANTIC_POSITION:
             {
                uint j;
-               for (j = 0; j < 4; j++) {
+
+               for (j = 0; j < 4; j++)
                   quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
-               }
+            }
+            break;
+         case TGSI_SEMANTIC_STENCIL:
+            {
+               uint j;
+
+               for (j = 0; j < 4; j++)
+                  quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].f[j];
             }
             break;
          }
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index daa158df7c4..5b18cd035e3 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -169,9 +169,15 @@ fs_sse_run( const struct sp_fragment_shader *base,
          case TGSI_SEMANTIC_POSITION:
             {
                uint j;
-               for (j = 0; j < 4; j++) {
-                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j];
-               }
+               for (j = 0; j < 4; j++)
+                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+            }
+            break;
+         case TGSI_SEMANTIC_STENCIL:
+            {
+               uint j;
+               for (j = 0; j < 4; j++)
+                  quad->output.stencil[j] = machine->Outputs[i].xyzw[1].f[j];
             }
             break;
          }
diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h
index a3236bd1169..e745aa80619 100644
--- a/src/gallium/drivers/softpipe/sp_quad.h
+++ b/src/gallium/drivers/softpipe/sp_quad.h
@@ -85,6 +85,7 @@ struct quad_header_output
    /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
    float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
    float depth[QUAD_SIZE];
+   uint8_t stencil[QUAD_SIZE];
 };
 
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index e9b92626176..c8f5f89568a 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -47,6 +47,8 @@ struct depth_data {
    unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
    unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
    ubyte stencilVals[QUAD_SIZE];
+   boolean use_shader_stencil_refs;
+   ubyte shader_stencil_refs[QUAD_SIZE];
    struct softpipe_cached_tile *tile;
 };
 
@@ -186,6 +188,33 @@ convert_quad_depth( struct depth_data *data,
 }
 
 
+/**
+ * Compute the depth_data::shader_stencil_refs[] values from the float fragment stencil values.
+ */
+static void
+convert_quad_stencil( struct depth_data *data, 
+                      const struct quad_header *quad )
+{
+   unsigned j;
+
+   data->use_shader_stencil_refs = TRUE;
+   /* Copy quads stencil values
+    */
+   switch (data->format) {
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+      {
+         for (j = 0; j < QUAD_SIZE; j++) {
+            data->shader_stencil_refs[j] = ((unsigned)(quad->output.stencil[j]));
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
 
 /**
  * Write data->bzzzz[] values and data->stencilVals into the Z/stencil buffer.
@@ -272,8 +301,14 @@ do_stencil_test(struct depth_data *data,
 {
    unsigned passMask = 0x0;
    unsigned j;
+   ubyte refs[QUAD_SIZE];
 
-   ref &= valMask;
+   for (j = 0; j < QUAD_SIZE; j++) {
+      if (data->use_shader_stencil_refs)
+         refs[j] = data->shader_stencil_refs[j] & valMask;
+      else 
+         refs[j] = ref & valMask;
+   }
 
    switch (func) {
    case PIPE_FUNC_NEVER:
@@ -281,42 +316,42 @@ do_stencil_test(struct depth_data *data,
       break;
    case PIPE_FUNC_LESS:
       for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref < (data->stencilVals[j] & valMask)) {
+         if (refs[j] < (data->stencilVals[j] & valMask)) {
             passMask |= (1 << j);
          }
       }
       break;
    case PIPE_FUNC_EQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref == (data->stencilVals[j] & valMask)) {
+         if (refs[j] == (data->stencilVals[j] & valMask)) {
             passMask |= (1 << j);
          }
       }
       break;
    case PIPE_FUNC_LEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref <= (data->stencilVals[j] & valMask)) {
+         if (refs[j] <= (data->stencilVals[j] & valMask)) {
             passMask |= (1 << j);
          }
       }
       break;
    case PIPE_FUNC_GREATER:
       for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref > (data->stencilVals[j] & valMask)) {
+         if (refs[j] > (data->stencilVals[j] & valMask)) {
             passMask |= (1 << j);
          }
       }
       break;
    case PIPE_FUNC_NOTEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref != (data->stencilVals[j] & valMask)) {
+         if (refs[j] != (data->stencilVals[j] & valMask)) {
             passMask |= (1 << j);
          }
       }
       break;
    case PIPE_FUNC_GEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref >= (data->stencilVals[j] & valMask)) {
+         if (refs[j] >= (data->stencilVals[j] & valMask)) {
             passMask |= (1 << j);
          }
       }
@@ -348,9 +383,14 @@ apply_stencil_op(struct depth_data *data,
 {
    unsigned j;
    ubyte newstencil[QUAD_SIZE];
+   ubyte refs[QUAD_SIZE];
 
    for (j = 0; j < QUAD_SIZE; j++) {
       newstencil[j] = data->stencilVals[j];
+      if (data->use_shader_stencil_refs)
+         refs[j] = data->shader_stencil_refs[j];
+      else
+         refs[j] = ref;
    }
 
    switch (op) {
@@ -367,7 +407,7 @@ apply_stencil_op(struct depth_data *data,
    case PIPE_STENCIL_OP_REPLACE:
       for (j = 0; j < QUAD_SIZE; j++) {
          if (mask & (1 << j)) {
-            newstencil[j] = ref;
+            newstencil[j] = refs[j];
          }
       }
       break;
@@ -688,8 +728,10 @@ depth_test_quads_fallback(struct quad_stage *qs,
    unsigned i, pass = 0;
    const struct sp_fragment_shader *fs = qs->softpipe->fs;
    boolean interp_depth = !fs->info.writes_z;
+   boolean shader_stencil_ref = fs->info.writes_stencil;
    struct depth_data data;
 
+   data.use_shader_stencil_refs = FALSE;
 
    if (qs->softpipe->depth_stencil->alpha.enabled) {
       nr = alpha_test_quads(qs, quads, nr);
@@ -716,6 +758,9 @@ depth_test_quads_fallback(struct quad_stage *qs,
          }
 
          if (qs->softpipe->depth_stencil->stencil[0].enabled) {
+            if (shader_stencil_ref)
+               convert_quad_stencil(&data, quads[i]);
+            
             depth_stencil_test_quad(qs, &data, quads[i]);
             write_depth_stencil_values(&data, quads[i]);
          }
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c
index 43b8e88e334..2cfd02a22c6 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.c
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c
@@ -47,7 +47,8 @@ sp_build_quad_pipeline(struct softpipe_context *sp)
       sp->framebuffer.zsbuf &&
       !sp->depth_stencil->alpha.enabled &&
       !sp->fs->info.uses_kill &&
-      !sp->fs->info.writes_z;
+      !sp->fs->info.writes_z &&
+      !sp->fs->info.writes_stencil;
 
    sp->quad.first = sp->quad.blend;
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 25a0a622179..edc2a6dacf2 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -113,8 +113,12 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
       return 1;
    case PIPE_CAP_STREAM_OUTPUT:
       return 1;
+   case PIPE_CAP_PRIMITIVE_RESTART:
+      return 1;
    case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
       return 0;
+   case PIPE_CAP_SHADER_STENCIL_EXPORT:
+      return 1;
    default:
       return 0;
    }
@@ -206,13 +210,6 @@ softpipe_is_format_supported( struct pipe_screen *screen,
       if (format_desc->block.width != 1 ||
           format_desc->block.height != 1)
          return FALSE;
-
-      /*
-       * TODO: Unfortunately we cannot render into anything more than 32 bits
-       * because we encode color clear values into a 32bit word.
-       */
-      if (format_desc->block.bits > 32)
-         return FALSE;
    }
 
    if (bind & PIPE_BIND_DEPTH_STENCIL) {
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 088e48f81fe..2eac4c7a82b 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -44,6 +44,9 @@
 #include "sp_tex_tile_cache.h"
 
 
+/** Set to one to help debug texture sampling */
+#define DEBUG_TEX 0
+
 
 /*
  * Return fractional part of 'f'.  Used for computing interpolation weights.
@@ -774,6 +777,18 @@ pot_level_size(unsigned base_pot, unsigned level)
 }
 
 
+static void
+print_sample(const char *function, float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   debug_printf("%s %g %g %g %g, %g %g %g %g, %g %g %g %g, %g %g %g %g\n",
+                function,
+                rgba[0][0], rgba[1][0], rgba[2][0], rgba[3][0],
+                rgba[0][1], rgba[1][1], rgba[2][1], rgba[3][1],
+                rgba[0][2], rgba[1][2], rgba[2][2], rgba[3][2],
+                rgba[0][3], rgba[1][3], rgba[2][3], rgba[3][3]);
+}
+
+
 /* Some image-filter fastpaths:
  */
 static INLINE void
@@ -832,6 +847,10 @@ img_filter_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
                               tx[2][c], tx[3][c]);
       }
    }
+
+   if (DEBUG_TEX) {
+      print_sample(__FUNCTION__, rgba);
+   }
 }
 
 
@@ -872,6 +891,10 @@ img_filter_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
          rgba[c][j] = out[c];
       }
    }
+
+   if (DEBUG_TEX) {
+      print_sample(__FUNCTION__, rgba);
+   }
 }
 
 
@@ -921,6 +944,10 @@ img_filter_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
          rgba[c][j] = out[c];
       }
    }
+
+   if (DEBUG_TEX) {
+      print_sample(__FUNCTION__, rgba);
+   }
 }
 
 
@@ -957,6 +984,10 @@ img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
          rgba[c][j] = out[c];
       }
    }
+
+   if (DEBUG_TEX) {
+      print_sample(__FUNCTION__, rgba);
+   }
 }
 
 
@@ -997,6 +1028,10 @@ img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
          rgba[c][j] = out[c];
       }
    }
+
+   if (DEBUG_TEX) {
+      print_sample(__FUNCTION__, rgba);
+   }
 }
 
 
@@ -1045,6 +1080,10 @@ img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
          rgba[c][j] = out[c];
       }
    }
+
+   if (DEBUG_TEX) {
+      print_sample(__FUNCTION__, rgba);
+   }
 }
 
 
@@ -1357,6 +1396,10 @@ mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
          }
       }
    }
+
+   if (DEBUG_TEX) {
+      print_sample(__FUNCTION__, rgba);
+   }
 }
 
 
@@ -1402,13 +1445,9 @@ mip_filter_nearest(struct tgsi_sampler *tgsi_sampler,
       samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba);
    }
 
-#if 0
-   printf("RGBA %g %g %g %g, %g %g %g %g, %g %g %g %g, %g %g %g %g\n",
-          rgba[0][0], rgba[1][0], rgba[2][0], rgba[3][0],
-          rgba[0][1], rgba[1][1], rgba[2][1], rgba[3][1],
-          rgba[0][2], rgba[1][2], rgba[2][2], rgba[3][2],
-          rgba[0][3], rgba[1][3], rgba[2][3], rgba[3][3]);
-#endif
+   if (DEBUG_TEX) {
+      print_sample(__FUNCTION__, rgba);
+   }
 }
 
 
@@ -1510,6 +1549,10 @@ mip_filter_linear_2d_linear_repeat_POT(
          }
       }
    }
+
+   if (DEBUG_TEX) {
+      print_sample(__FUNCTION__, rgba);
+   }
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h
index 031c7c1ea5c..4151a47c323 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.h
@@ -86,7 +86,7 @@ struct softpipe_tile_cache
    struct softpipe_cached_tile *entries[NUM_ENTRIES];
    uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];
    float clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
+   uint clear_val;        /**< for z+stencil */
    boolean depth_stencil; /**< Is the surface a depth/stencil format? */
 
    struct softpipe_cached_tile *tile;  /**< scratch tile for clears */
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 50205995911..3d6b5b5c81d 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -122,6 +122,27 @@ typedef unsigned char boolean;
 #  endif
 #endif
 
+/*
+ * Define the C99 restrict keyword.
+ *
+ * See also:
+ * - http://cellperformance.beyond3d.com/articles/2006/05/demystifying-the-restrict-keyword.html
+ */
+#ifndef restrict
+#  if (__STDC_VERSION__ >= 199901L)
+     /* C99 */
+#  elif defined(__SUNPRO_C) && defined(__C99FEATURES__)
+     /* C99 */
+#  elif defined(__GNUC__)
+#    define restrict __restrict__
+#  elif defined(_MSC_VER)
+#    define restrict __restrict
+#  else
+#    define restrict /* */
+#  endif
+#endif
+
+
 /* Function visibility */
 #ifndef PUBLIC
 #  if defined(__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index d2273b334a6..e30b9904fa5 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -452,6 +452,7 @@ enum pipe_cap {
    PIPE_CAP_BLEND_EQUATION_SEPARATE,
    PIPE_CAP_SM3,  /*< Shader Model, supported */
    PIPE_CAP_STREAM_OUTPUT,
+   PIPE_CAP_PRIMITIVE_RESTART,
    /** Maximum texture image units accessible from vertex and fragment shaders
     * combined */
    PIPE_CAP_MAX_COMBINED_SAMPLERS,
@@ -464,7 +465,8 @@ enum pipe_cap {
    PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT,
    PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER,
    PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER,
-   PIPE_CAP_DEPTH_CLAMP
+   PIPE_CAP_DEPTH_CLAMP,
+   PIPE_CAP_SHADER_STENCIL_EXPORT,
 };
 
 /* Shader caps not specific to any single stage */
diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h
index 240e349ba7e..9a59c9f9ea0 100644
--- a/src/gallium/include/pipe/p_format.h
+++ b/src/gallium/include/pipe/p_format.h
@@ -198,6 +198,10 @@ enum pipe_format {
    PIPE_FORMAT_IA44                  = 141,
    PIPE_FORMAT_AI44                  = 142,
 
+   /* some stencil samplers formats */
+   PIPE_FORMAT_X24S8_USCALED           = 136,
+   PIPE_FORMAT_S8X24_USCALED           = 137,
+   PIPE_FORMAT_X32_S8X24_USCALED       = 138,
    PIPE_FORMAT_COUNT
 };
 
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 74488de17eb..ba433b2bd2a 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -143,7 +143,8 @@ struct tgsi_declaration_dimension
 #define TGSI_SEMANTIC_EDGEFLAG   8
 #define TGSI_SEMANTIC_PRIMID     9
 #define TGSI_SEMANTIC_INSTANCEID 10
-#define TGSI_SEMANTIC_COUNT      11 /**< number of semantic values */
+#define TGSI_SEMANTIC_STENCIL    11
+#define TGSI_SEMANTIC_COUNT      12 /**< number of semantic values */
 
 struct tgsi_declaration_semantic
 {
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 9a2b31da50d..fc6dba346da 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -457,6 +457,12 @@ struct pipe_draw_info
    int index_bias; /**< a bias to be added to each index */
    unsigned min_index; /**< the min index */
    unsigned max_index; /**< the max index */
+
+   /**
+    * Primitive restart enable/index (only applies to indexed drawing)
+    */
+   boolean primitive_restart;
+   unsigned restart_index;
 };
 
 
diff --git a/src/gallium/state_trackers/dri/common/dri_context.c b/src/gallium/state_trackers/dri/common/dri_context.c
index 22e1b6dd701..770b37037f5 100644
--- a/src/gallium/state_trackers/dri/common/dri_context.c
+++ b/src/gallium/state_trackers/dri/common/dri_context.c
@@ -49,7 +49,7 @@ dri_init_extensions(struct dri_context *ctx)
 }
 
 GLboolean
-dri_create_context(gl_api api, const __GLcontextModes * visual,
+dri_create_context(gl_api api, const struct gl_config * visual,
 		   __DRIcontext * cPriv, void *sharedContextPrivate)
 {
    __DRIscreen *sPriv = cPriv->driScreenPriv;
diff --git a/src/gallium/state_trackers/dri/common/dri_context.h b/src/gallium/state_trackers/dri/common/dri_context.h
index beb59c6f684..35105e861f9 100644
--- a/src/gallium/state_trackers/dri/common/dri_context.h
+++ b/src/gallium/state_trackers/dri/common/dri_context.h
@@ -88,7 +88,7 @@ dri_get_current(__DRIscreen * driScreenPriv);
 
 boolean
 dri_create_context(gl_api api,
-		   const __GLcontextModes * visual,
+		   const struct gl_config * visual,
 		   __DRIcontext * driContextPriv,
 		   void *sharedContextPrivate);
 
diff --git a/src/gallium/state_trackers/dri/common/dri_drawable.c b/src/gallium/state_trackers/dri/common/dri_drawable.c
index 1bdfdccf439..5fd6e7863c0 100644
--- a/src/gallium/state_trackers/dri/common/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/common/dri_drawable.c
@@ -112,7 +112,7 @@ dri_st_framebuffer_flush_front(struct st_framebuffer_iface *stfbi,
 boolean
 dri_create_buffer(__DRIscreen * sPriv,
 		  __DRIdrawable * dPriv,
-		  const __GLcontextModes * visual, boolean isPixmap)
+		  const struct gl_config * visual, boolean isPixmap)
 {
    struct dri_screen *screen = sPriv->private;
    struct dri_drawable *drawable = NULL;
diff --git a/src/gallium/state_trackers/dri/common/dri_drawable.h b/src/gallium/state_trackers/dri/common/dri_drawable.h
index 74e662d36c4..837d3983748 100644
--- a/src/gallium/state_trackers/dri/common/dri_drawable.h
+++ b/src/gallium/state_trackers/dri/common/dri_drawable.h
@@ -79,7 +79,7 @@ dri_drawable(__DRIdrawable * driDrawPriv)
 boolean
 dri_create_buffer(__DRIscreen * sPriv,
 		  __DRIdrawable * dPriv,
-		  const __GLcontextModes * visual, boolean isPixmap);
+		  const struct gl_config * visual, boolean isPixmap);
 
 void dri_destroy_buffer(__DRIdrawable * dPriv);
 
diff --git a/src/gallium/state_trackers/dri/common/dri_screen.c b/src/gallium/state_trackers/dri/common/dri_screen.c
index b3b09b605fa..252ad1768d8 100644
--- a/src/gallium/state_trackers/dri/common/dri_screen.c
+++ b/src/gallium/state_trackers/dri/common/dri_screen.c
@@ -227,7 +227,7 @@ dri_fill_in_modes(struct dri_screen *screen,
  */
 void
 dri_fill_st_visual(struct st_visual *stvis, struct dri_screen *screen,
-                   const __GLcontextModes *mode)
+                   const struct gl_config *mode)
 {
    memset(stvis, 0, sizeof(*stvis));
 
diff --git a/src/gallium/state_trackers/dri/common/dri_screen.h b/src/gallium/state_trackers/dri/common/dri_screen.h
index d4eb8f454f0..0da9b5510fc 100644
--- a/src/gallium/state_trackers/dri/common/dri_screen.h
+++ b/src/gallium/state_trackers/dri/common/dri_screen.h
@@ -114,7 +114,7 @@ dri_with_format(__DRIscreen * sPriv)
 
 void
 dri_fill_st_visual(struct st_visual *stvis, struct dri_screen *screen,
-                   const __GLcontextModes *mode);
+                   const struct gl_config *mode);
 
 const __DRIconfig **
 dri_init_screen_helper(struct dri_screen *screen,
diff --git a/src/gallium/state_trackers/dri/drm/dri2.c b/src/gallium/state_trackers/dri/drm/dri2.c
index 116afccb194..3c5b0756174 100644
--- a/src/gallium/state_trackers/dri/drm/dri2.c
+++ b/src/gallium/state_trackers/dri/drm/dri2.c
@@ -502,7 +502,7 @@ static const __DRIextension *dri_screen_extensions[] = {
 /**
  * This is the driver specific part of the createNewScreen entry point.
  *
- * Returns the __GLcontextModes supported by this driver.
+ * Returns the struct gl_config supported by this driver.
  */
 static const __DRIconfig **
 dri2_init_screen(__DRIscreen * sPriv)
@@ -548,7 +548,7 @@ fail:
 }
 
 static boolean
-dri2_create_context(gl_api api, const __GLcontextModes * visual,
+dri2_create_context(gl_api api, const struct gl_config * visual,
                     __DRIcontext * cPriv, void *sharedContextPrivate)
 {
    struct dri_context *ctx = NULL;
@@ -564,7 +564,7 @@ dri2_create_context(gl_api api, const __GLcontextModes * visual,
 static boolean
 dri2_create_buffer(__DRIscreen * sPriv,
                    __DRIdrawable * dPriv,
-                   const __GLcontextModes * visual, boolean isPixmap)
+                   const struct gl_config * visual, boolean isPixmap)
 {
    struct dri_drawable *drawable = NULL;
 
diff --git a/src/gallium/state_trackers/dri/sw/drisw.c b/src/gallium/state_trackers/dri/sw/drisw.c
index 04bba631aeb..c48cc440367 100644
--- a/src/gallium/state_trackers/dri/sw/drisw.c
+++ b/src/gallium/state_trackers/dri/sw/drisw.c
@@ -298,7 +298,7 @@ fail:
 static boolean
 drisw_create_buffer(__DRIscreen * sPriv,
                     __DRIdrawable * dPriv,
-                    const __GLcontextModes * visual, boolean isPixmap)
+                    const struct gl_config * visual, boolean isPixmap)
 {
    struct dri_drawable *drawable = NULL;
 
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.c b/src/gallium/state_trackers/egl/common/egl_g3d.c
index bfbb431058b..30ddcd5bc14 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d.c
@@ -126,24 +126,24 @@ egl_g3d_add_screens(_EGLDriver *drv, _EGLDisplay *dpy)
          continue;
       }
 
-      _eglInitScreen(&gscr->base);
-
-      for (j = 0; j < num_modes; j++) {
+      _eglInitScreen(&gscr->base, dpy, num_modes);
+      for (j = 0; j < gscr->base.NumModes; j++) {
          const struct native_mode *nmode = native_modes[j];
-         _EGLMode *mode;
-
-         mode = _eglAddNewMode(&gscr->base, nmode->width, nmode->height,
-               nmode->refresh_rate, nmode->desc);
-         if (!mode)
-            break;
-         /* gscr->native_modes and gscr->base.Modes should be consistent */
-         assert(mode == &gscr->base.Modes[j]);
+         _EGLMode *mode = &gscr->base.Modes[j];
+
+         mode->Width = nmode->width;
+         mode->Height = nmode->height;
+         mode->RefreshRate = nmode->refresh_rate;
+         mode->Optimal = EGL_FALSE;
+         mode->Interlaced = EGL_FALSE;
+         /* no need to strdup() */
+         mode->Name = nmode->desc;
       }
 
       gscr->native = nconn;
       gscr->native_modes = native_modes;
 
-      _eglAddScreen(dpy, &gscr->base);
+      _eglLinkScreen(&gscr->base);
    }
 
    FREE(native_connectors);
@@ -194,53 +194,48 @@ init_config_attributes(_EGLConfig *conf, const struct native_config *nconf,
    if (nconf->buffer_mask & (1 << NATIVE_ATTACHMENT_BACK_LEFT))
       surface_type |= EGL_PBUFFER_BIT;
 
-   SET_CONFIG_ATTRIB(conf, EGL_CONFORMANT, api_mask);
-   SET_CONFIG_ATTRIB(conf, EGL_RENDERABLE_TYPE, api_mask);
+   conf->Conformant = api_mask;
+   conf->RenderableType = api_mask;
 
-   SET_CONFIG_ATTRIB(conf, EGL_RED_SIZE, rgba[0]);
-   SET_CONFIG_ATTRIB(conf, EGL_GREEN_SIZE, rgba[1]);
-   SET_CONFIG_ATTRIB(conf, EGL_BLUE_SIZE, rgba[2]);
-   SET_CONFIG_ATTRIB(conf, EGL_ALPHA_SIZE, rgba[3]);
-   SET_CONFIG_ATTRIB(conf, EGL_BUFFER_SIZE, buffer_size);
+   conf->RedSize = rgba[0];
+   conf->GreenSize = rgba[1];
+   conf->BlueSize = rgba[2];
+   conf->AlphaSize = rgba[3];
+   conf->BufferSize = buffer_size;
 
-   SET_CONFIG_ATTRIB(conf, EGL_DEPTH_SIZE, depth_stencil[0]);
-   SET_CONFIG_ATTRIB(conf, EGL_STENCIL_SIZE, depth_stencil[1]);
+   conf->DepthSize = depth_stencil[0];
+   conf->StencilSize = depth_stencil[1];
 
-   SET_CONFIG_ATTRIB(conf, EGL_SURFACE_TYPE, surface_type);
+   conf->SurfaceType = surface_type;
 
-   SET_CONFIG_ATTRIB(conf, EGL_NATIVE_RENDERABLE, EGL_TRUE);
+   conf->NativeRenderable = EGL_TRUE;
    if (surface_type & EGL_WINDOW_BIT) {
-      SET_CONFIG_ATTRIB(conf, EGL_NATIVE_VISUAL_ID, nconf->native_visual_id);
-      SET_CONFIG_ATTRIB(conf, EGL_NATIVE_VISUAL_TYPE,
-            nconf->native_visual_type);
+      conf->NativeVisualID = nconf->native_visual_id;
+      conf->NativeVisualType = nconf->native_visual_type;
    }
 
    if (surface_type & EGL_PBUFFER_BIT) {
-      SET_CONFIG_ATTRIB(conf, EGL_BIND_TO_TEXTURE_RGB, EGL_TRUE);
+      conf->BindToTextureRGB = EGL_TRUE;
       if (rgba[3])
-         SET_CONFIG_ATTRIB(conf, EGL_BIND_TO_TEXTURE_RGBA, EGL_TRUE);
+         conf->BindToTextureRGBA = EGL_TRUE;
 
-      SET_CONFIG_ATTRIB(conf, EGL_MAX_PBUFFER_WIDTH, 4096);
-      SET_CONFIG_ATTRIB(conf, EGL_MAX_PBUFFER_HEIGHT, 4096);
-      SET_CONFIG_ATTRIB(conf, EGL_MAX_PBUFFER_PIXELS, 4096 * 4096);
+      conf->MaxPbufferWidth = 4096;
+      conf->MaxPbufferHeight = 4096;
+      conf->MaxPbufferPixels = 4096 * 4096;
    }
 
-   SET_CONFIG_ATTRIB(conf, EGL_LEVEL, nconf->level);
-   SET_CONFIG_ATTRIB(conf, EGL_SAMPLES, nconf->samples);
-   SET_CONFIG_ATTRIB(conf, EGL_SAMPLE_BUFFERS, 1);
+   conf->Level = nconf->level;
+   conf->Samples = nconf->samples;
+   conf->SampleBuffers = 0;
 
    if (nconf->slow_config)
-      SET_CONFIG_ATTRIB(conf, EGL_CONFIG_CAVEAT, EGL_SLOW_CONFIG);
+      conf->ConfigCaveat = EGL_SLOW_CONFIG;
 
    if (nconf->transparent_rgb) {
-      rgba[0] = nconf->transparent_rgb_values[0];
-      rgba[1] = nconf->transparent_rgb_values[1];
-      rgba[2] = nconf->transparent_rgb_values[2];
-
-      SET_CONFIG_ATTRIB(conf, EGL_TRANSPARENT_TYPE, EGL_TRANSPARENT_RGB);
-      SET_CONFIG_ATTRIB(conf, EGL_TRANSPARENT_RED_VALUE, rgba[0]);
-      SET_CONFIG_ATTRIB(conf, EGL_TRANSPARENT_GREEN_VALUE, rgba[1]);
-      SET_CONFIG_ATTRIB(conf, EGL_TRANSPARENT_BLUE_VALUE, rgba[2]);
+      conf->TransparentType = EGL_TRANSPARENT_RGB;
+      conf->TransparentRedValue = nconf->transparent_rgb_values[0];
+      conf->TransparentGreenValue = nconf->transparent_rgb_values[1];
+      conf->TransparentBlueValue = nconf->transparent_rgb_values[2];
    }
 
    return _eglValidateConfig(conf, EGL_FALSE);
@@ -378,7 +373,7 @@ egl_g3d_add_configs(_EGLDriver *drv, _EGLDisplay *dpy, EGLint id)
                break;
             }
 
-            _eglAddConfig(dpy, &gconf->base);
+            _eglLinkConfig(&gconf->base);
             id++;
          }
       }
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.h b/src/gallium/state_trackers/egl/common/egl_g3d.h
index be450bbede3..72c14f0ac49 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d.h
+++ b/src/gallium/state_trackers/egl/common/egl_g3d.h
@@ -106,8 +106,6 @@ _EGL_DRIVER_TYPECAST(egl_g3d_image, _EGLImage, obj)
 struct egl_g3d_sync {
    _EGLSync base;
 
-   int refs;
-
    /* the mutex protects only the condvar, not the struct */
    pipe_mutex mutex;
    pipe_condvar condvar;
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_api.c b/src/gallium/state_trackers/egl/common/egl_g3d_api.c
index c0164daf9c1..c10245bb067 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d_api.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_api.c
@@ -160,7 +160,7 @@ destroy_context(_EGLDisplay *dpy, _EGLContext *ctx)
 static EGLBoolean
 egl_g3d_destroy_context(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx)
 {
-   if (!_eglIsContextBound(ctx))
+   if (_eglPutContext(ctx))
       destroy_context(dpy, ctx);
    return EGL_TRUE;
 }
@@ -433,7 +433,7 @@ destroy_surface(_EGLDisplay *dpy, _EGLSurface *surf)
 static EGLBoolean
 egl_g3d_destroy_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf)
 {
-   if (!_eglIsSurfaceBound(surf))
+   if (_eglPutSurface(surf))
       destroy_surface(dpy, surf);
    return EGL_TRUE;
 }
@@ -446,13 +446,15 @@ egl_g3d_make_current(_EGLDriver *drv, _EGLDisplay *dpy,
    struct egl_g3d_surface *gdraw = egl_g3d_surface(draw);
    struct egl_g3d_surface *gread = egl_g3d_surface(read);
    struct egl_g3d_context *old_gctx;
+   _EGLContext *old_ctx;
+   _EGLSurface *old_draw, *old_read;
    EGLBoolean ok = EGL_TRUE;
 
-   /* bind the new context and return the "orphaned" one */
-   if (!_eglBindContext(&ctx, &draw, &read))
+   /* make new bindings */
+   if (!_eglBindContext(ctx, draw, read, &old_ctx, &old_draw, &old_read))
       return EGL_FALSE;
-   old_gctx = egl_g3d_context(ctx);
 
+   old_gctx = egl_g3d_context(old_ctx);
    if (old_gctx) {
       /* flush old context */
       old_gctx->stctxi->flush(old_gctx->stctxi,
@@ -481,15 +483,33 @@ egl_g3d_make_current(_EGLDriver *drv, _EGLDisplay *dpy,
    }
    else if (old_gctx) {
       ok = old_gctx->stapi->make_current(old_gctx->stapi, NULL, NULL, NULL);
-      old_gctx->base.WindowRenderBuffer = EGL_NONE;
+      if (ok)
+         old_gctx->base.WindowRenderBuffer = EGL_NONE;
    }
 
-   if (ctx && !_eglIsContextLinked(ctx))
-      destroy_context(dpy, ctx);
-   if (draw && !_eglIsSurfaceLinked(draw))
-      destroy_surface(dpy, draw);
-   if (read && read != draw && !_eglIsSurfaceLinked(read))
-      destroy_surface(dpy, read);
+   if (ok) {
+      if (_eglPutContext(old_ctx))
+         destroy_context(dpy, old_ctx);
+      if (_eglPutSurface(old_draw))
+         destroy_surface(dpy, old_draw);
+      if (_eglPutSurface(old_read))
+         destroy_surface(dpy, old_read);
+   }
+   else {
+      /* undo the previous _eglBindContext */
+      _eglBindContext(old_ctx, old_draw, old_read, &ctx, &draw, &read);
+      assert(&gctx->base == ctx &&
+             &gdraw->base == draw &&
+             &gread->base == read);
+
+      _eglPutSurface(draw);
+      _eglPutSurface(read);
+      _eglPutContext(ctx);
+
+      _eglPutSurface(old_draw);
+      _eglPutSurface(old_read);
+      _eglPutContext(old_ctx);
+   }
 
    return ok;
 }
@@ -609,8 +629,10 @@ egl_g3d_wait_client(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx)
 
    gctx->stctxi->flush(gctx->stctxi,
          PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_FRAME, &fence);
-   screen->fence_finish(screen, fence, 0);
-   screen->fence_reference(screen, &fence, NULL);
+   if (fence) {
+      screen->fence_finish(screen, fence, 0);
+      screen->fence_reference(screen, &fence, NULL);
+   }
 
    return EGL_TRUE;
 }
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_image.c b/src/gallium/state_trackers/egl/common/egl_g3d_image.c
index 558638e72f0..be9c88e5e47 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d_image.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_image.c
@@ -74,62 +74,40 @@ egl_g3d_reference_native_pixmap(_EGLDisplay *dpy, EGLNativePixmapType pix)
 #ifdef EGL_MESA_drm_image
 
 static struct pipe_resource *
-egl_g3d_create_drm_buffer(_EGLDisplay *dpy, const EGLint *attribs)
+egl_g3d_create_drm_buffer(_EGLDisplay *dpy, _EGLImage *img,
+                          const EGLint *attribs)
 {
    struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
    struct pipe_screen *screen = gdpy->native->screen;
    struct pipe_resource templ;
-   EGLint width = 0, height = 0, format = 0, use = 0;
-   EGLint valid_use;
-   EGLint i, err = EGL_SUCCESS;
-
-   for (i = 0; attribs[i] != EGL_NONE; i++) {
-      EGLint attr = attribs[i++];
-      EGLint val = attribs[i];
-
-      switch (attr) {
-      case EGL_WIDTH:
-	 width = val;
-         break;
-      case EGL_HEIGHT:
-	 height = val;
-         break;
-      case EGL_DRM_BUFFER_FORMAT_MESA:
-	 format = val;
-         break;
-      case EGL_DRM_BUFFER_USE_MESA:
-	 use = val;
-         break;
-      default:
-         err = EGL_BAD_ATTRIBUTE;
-         break;
-      }
+   _EGLImageAttribs attrs;
+   EGLint format, valid_use;
 
-      if (err != EGL_SUCCESS) {
-         _eglLog(_EGL_DEBUG, "bad image attribute 0x%04x", attr);
-         return NULL;
-      }
-   }
+   if (_eglParseImageAttribList(&attrs, dpy, attribs) != EGL_SUCCESS)
+      return NULL;
 
-   if (width <= 0 || height <= 0) {
-      _eglLog(_EGL_DEBUG, "bad width or height (%dx%d)", width, height);
+   if (attrs.Width <= 0 || attrs.Height <= 0) {
+      _eglLog(_EGL_DEBUG, "bad width or height (%dx%d)",
+            attrs.Width, attrs.Height);
       return NULL;
    }
 
-   switch (format) {
+   switch (attrs.DRMBufferFormatMESA) {
    case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA:
       format = PIPE_FORMAT_B8G8R8A8_UNORM;
       break;
    default:
-      _eglLog(_EGL_DEBUG, "bad image format value 0x%04x", format);
+      _eglLog(_EGL_DEBUG, "bad image format value 0x%04x",
+            attrs.DRMBufferFormatMESA);
       return NULL;
       break;
    }
 
    valid_use = EGL_DRM_BUFFER_USE_SCANOUT_MESA |
                EGL_DRM_BUFFER_USE_SHARE_MESA;
-   if (use & ~valid_use) {
-      _eglLog(_EGL_DEBUG, "bad image use bit 0x%04x", use);
+   if (attrs.DRMBufferUseMESA & ~valid_use) {
+      _eglLog(_EGL_DEBUG, "bad image use bit 0x%04x",
+            attrs.DRMBufferUseMESA);
       return NULL;
    }
 
@@ -137,18 +115,18 @@ egl_g3d_create_drm_buffer(_EGLDisplay *dpy, const EGLint *attribs)
    templ.target = PIPE_TEXTURE_2D;
    templ.format = format;
    templ.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
-   templ.width0 = width;
-   templ.height0 = height;
+   templ.width0 = attrs.Width;
+   templ.height0 = attrs.Height;
    templ.depth0 = 1;
 
    /*
     * XXX fix apps (e.g. wayland) and pipe drivers (e.g. i915) and remove the
     * size check
     */
-   if ((use & EGL_DRM_BUFFER_USE_SCANOUT_MESA) &&
-       width >= 640 && height >= 480)
+   if ((attrs.DRMBufferUseMESA & EGL_DRM_BUFFER_USE_SCANOUT_MESA) &&
+       attrs.Width >= 640 && attrs.Height >= 480)
       templ.bind |= PIPE_BIND_SCANOUT;
-   if (use & EGL_DRM_BUFFER_USE_SHARE_MESA)
+   if (attrs.DRMBufferUseMESA & EGL_DRM_BUFFER_USE_SHARE_MESA)
       templ.bind |= PIPE_BIND_SHARED;
 
    return screen->resource_create(screen, &templ);
@@ -156,59 +134,36 @@ egl_g3d_create_drm_buffer(_EGLDisplay *dpy, const EGLint *attribs)
 
 static struct pipe_resource *
 egl_g3d_reference_drm_buffer(_EGLDisplay *dpy, EGLint name,
-                             const EGLint *attribs)
+                             _EGLImage *img, const EGLint *attribs)
 {
    struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
    struct pipe_screen *screen = gdpy->native->screen;
    struct pipe_resource templ;
    struct winsys_handle wsh;
-   EGLint width = 0, height = 0, format = 0, stride = 0;
-   EGLint i, err = EGL_SUCCESS;
+   _EGLImageAttribs attrs;
+   EGLint format;
 
    /* winsys_handle is in theory platform-specific */
    if (dpy->Platform != _EGL_PLATFORM_DRM)
       return NULL;
 
-   for (i = 0; attribs[i] != EGL_NONE; i++) {
-      EGLint attr = attribs[i++];
-      EGLint val = attribs[i];
-
-      switch (attr) {
-      case EGL_WIDTH:
-	 width = val;
-         break;
-      case EGL_HEIGHT:
-	 height = val;
-         break;
-      case EGL_DRM_BUFFER_FORMAT_MESA:
-	 format = val;
-         break;
-      case EGL_DRM_BUFFER_STRIDE_MESA:
-	 stride = val;
-         break;
-      default:
-         err = EGL_BAD_ATTRIBUTE;
-         break;
-      }
-
-      if (err != EGL_SUCCESS) {
-         _eglLog(_EGL_DEBUG, "bad image attribute 0x%04x", attr);
-         return NULL;
-      }
-   }
+   if (_eglParseImageAttribList(&attrs, dpy, attribs) != EGL_SUCCESS)
+      return NULL;
 
-   if (width <= 0 || height <= 0 || stride <= 0) {
+   if (attrs.Width <= 0 || attrs.Height <= 0 ||
+       attrs.DRMBufferStrideMESA <= 0) {
       _eglLog(_EGL_DEBUG, "bad width, height, or stride (%dx%dx%d)",
-            width, height, stride);
+            attrs.Width, attrs.Height, attrs.DRMBufferStrideMESA);
       return NULL;
    }
 
-   switch (format) {
+   switch (attrs.DRMBufferFormatMESA) {
    case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA:
       format = PIPE_FORMAT_B8G8R8A8_UNORM;
       break;
    default:
-      _eglLog(_EGL_DEBUG, "bad image format value 0x%04x", format);
+      _eglLog(_EGL_DEBUG, "bad image format value 0x%04x",
+            attrs.DRMBufferFormatMESA);
       return NULL;
       break;
    }
@@ -217,13 +172,13 @@ egl_g3d_reference_drm_buffer(_EGLDisplay *dpy, EGLint name,
    templ.target = PIPE_TEXTURE_2D;
    templ.format = format;
    templ.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
-   templ.width0 = width;
-   templ.height0 = height;
+   templ.width0 = attrs.Width;
+   templ.height0 = attrs.Height;
    templ.depth0 = 1;
 
    memset(&wsh, 0, sizeof(wsh));
    wsh.handle = (unsigned) name;
-   wsh.stride = stride;
+   wsh.stride = attrs.DRMBufferStrideMESA;
 
    return screen->resource_from_handle(screen, &templ, &wsh);
 }
@@ -245,7 +200,7 @@ egl_g3d_create_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx,
       return NULL;
    }
 
-   if (!_eglInitImage(&gimg->base, dpy, attribs)) {
+   if (!_eglInitImage(&gimg->base, dpy)) {
       FREE(gimg);
       return NULL;
    }
@@ -257,7 +212,8 @@ egl_g3d_create_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx,
       break;
 #ifdef EGL_MESA_drm_image
    case EGL_DRM_BUFFER_MESA:
-      ptex = egl_g3d_reference_drm_buffer(dpy, (EGLint) buffer, attribs);
+      ptex = egl_g3d_reference_drm_buffer(dpy,
+            (EGLint) buffer, &gimg->base, attribs);
       break;
 #endif
    default:
@@ -316,13 +272,13 @@ egl_g3d_create_drm_image(_EGLDriver *drv, _EGLDisplay *dpy,
       return NULL;
    }
 
-   if (!_eglInitImage(&gimg->base, dpy, attribs)) {
+   if (!_eglInitImage(&gimg->base, dpy)) {
       FREE(gimg);
       return NULL;
    }
 
 #ifdef EGL_MESA_drm_image
-   ptex = egl_g3d_create_drm_buffer(dpy, attribs);
+   ptex = egl_g3d_create_drm_buffer(dpy, &gimg->base, attribs);
 #else
    ptex = NULL;
 #endif
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_sync.c b/src/gallium/state_trackers/egl/common/egl_g3d_sync.c
index ec74e9eb94c..4e6d944c151 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d_sync.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_sync.c
@@ -128,13 +128,13 @@ egl_g3d_wait_fence_sync(struct egl_g3d_sync *gsync, EGLTimeKHR timeout)
 static INLINE void
 egl_g3d_ref_sync(struct egl_g3d_sync *gsync)
 {
-   p_atomic_inc(&gsync->refs);
+   _eglGetSync(&gsync->base);
 }
 
 static INLINE void
 egl_g3d_unref_sync(struct egl_g3d_sync *gsync)
 {
-   if (p_atomic_dec_zero(&gsync->refs)) {
+   if (_eglPutSync(&gsync->base)) {
       pipe_condvar_destroy(gsync->condvar);
       pipe_mutex_destroy(gsync->mutex);
 
@@ -194,7 +194,6 @@ egl_g3d_create_sync(_EGLDriver *drv, _EGLDisplay *dpy,
 
    pipe_mutex_init(gsync->mutex);
    pipe_condvar_init(gsync->condvar);
-   p_atomic_set(&gsync->refs, 1);
 
    return &gsync->base;
 }
diff --git a/src/gallium/state_trackers/egl/common/native_modeset.h b/src/gallium/state_trackers/egl/common/native_modeset.h
index dee757b3a88..2598082d687 100644
--- a/src/gallium/state_trackers/egl/common/native_modeset.h
+++ b/src/gallium/state_trackers/egl/common/native_modeset.h
@@ -39,7 +39,7 @@ struct native_connector {
 struct native_mode {
    const char *desc;
    int width, height;
-   int refresh_rate;
+   int refresh_rate; /* HZ * 1000 */
 };
 
 /**
diff --git a/src/gallium/state_trackers/egl/drm/modeset.c b/src/gallium/state_trackers/egl/drm/modeset.c
index 06a60770537..5ed22f7b9d4 100644
--- a/src/gallium/state_trackers/egl/drm/modeset.c
+++ b/src/gallium/state_trackers/egl/drm/modeset.c
@@ -469,8 +469,8 @@ drm_display_get_modes(struct native_display *ndpy,
       drmmode->base.height = drmmode->mode.vdisplay;
       drmmode->base.refresh_rate = drmmode->mode.vrefresh;
       /* not all kernels have vrefresh = refresh_rate * 1000 */
-      if (drmmode->base.refresh_rate > 1000)
-         drmmode->base.refresh_rate = (drmmode->base.refresh_rate + 500) / 1000;
+      if (drmmode->base.refresh_rate < 1000)
+         drmmode->base.refresh_rate *= 1000;
    }
 
    nmodes_return = MALLOC(count * sizeof(*nmodes_return));
diff --git a/src/gallium/state_trackers/egl/x11/glcore.h b/src/gallium/state_trackers/egl/x11/glcore.h
new file mode 100644
index 00000000000..547b1113707
--- /dev/null
+++ b/src/gallium/state_trackers/egl/x11/glcore.h
@@ -0,0 +1,181 @@
+#ifndef __gl_core_h_
+#define __gl_core_h_
+
+/*
+ * SGI FREE SOFTWARE LICENSE B (Version 2.0, Sept. 18, 2008)
+ * Copyright (C) 1991-2000 Silicon Graphics, Inc. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice including the dates of first publication and
+ * either this permission notice or a reference to
+ * http://oss.sgi.com/projects/FreeB/
+ * shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * SILICON GRAPHICS, INC. BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Except as contained in this notice, the name of Silicon Graphics, Inc.
+ * shall not be used in advertising or otherwise to promote the sale, use or
+ * other dealings in this Software without prior written authorization from
+ * Silicon Graphics, Inc.
+ */
+
+#if !defined(_WIN32_WCE)
+#include <sys/types.h>
+#endif
+
+#define GL_CORE_SGI  1
+#define GL_CORE_MESA 2
+#define GL_CORE_APPLE 4
+#define GL_CORE_WINDOWS 8
+
+typedef struct __GLcontextRec __GLcontext;
+
+/*
+** This file defines the interface between the GL core and the surrounding
+** "operating system" that supports it (currently the GLX or WGL extensions).
+**
+** Members (data and function pointers) are documented as imported or
+** exported according to how they are used by the core rendering functions.
+** Imported members are initialized by the "operating system" and used by
+** the core functions.  Exported members are initialized by the core functions
+** and used by the "operating system".
+*/
+
+/**
+ * Mode and limit information for a context.  This information is
+ * kept around in the context so that values can be used during
+ * command execution, and for returning information about the
+ * context to the application.
+ * 
+ * Instances of this structure are shared by the driver and the loader.  To
+ * maintain binary compatability, new fields \b must be added only to the
+ * end of the structure.
+ * 
+ * \sa _gl_context_modes_create
+ */
+typedef struct __GLcontextModesRec {
+    struct __GLcontextModesRec * next;
+
+    GLboolean rgbMode;
+    GLboolean floatMode;
+    GLboolean colorIndexMode;
+    GLuint doubleBufferMode;
+    GLuint stereoMode;
+
+    GLboolean haveAccumBuffer;
+    GLboolean haveDepthBuffer;
+    GLboolean haveStencilBuffer;
+
+    GLint redBits, greenBits, blueBits, alphaBits;	/* bits per comp */
+    GLuint redMask, greenMask, blueMask, alphaMask;
+    GLint rgbBits;		/* total bits for rgb */
+    GLint indexBits;		/* total bits for colorindex */
+
+    GLint accumRedBits, accumGreenBits, accumBlueBits, accumAlphaBits;
+    GLint depthBits;
+    GLint stencilBits;
+
+    GLint numAuxBuffers;
+
+    GLint level;
+
+    GLint pixmapMode;
+
+    /* GLX */
+    GLint visualID;
+    GLint visualType;     /**< One of the GLX X visual types. (i.e., 
+			   * \c GLX_TRUE_COLOR, etc.)
+			   */
+
+    /* EXT_visual_rating / GLX 1.2 */
+    GLint visualRating;
+
+    /* EXT_visual_info / GLX 1.2 */
+    GLint transparentPixel;
+				/*    colors are floats scaled to ints */
+    GLint transparentRed, transparentGreen, transparentBlue, transparentAlpha;
+    GLint transparentIndex;
+
+    /* ARB_multisample / SGIS_multisample */
+    GLint sampleBuffers;
+    GLint samples;
+
+    /* SGIX_fbconfig / GLX 1.3 */
+    GLint drawableType;
+    GLint renderType;
+    GLint xRenderable;
+    GLint fbconfigID;
+
+    /* SGIX_pbuffer / GLX 1.3 */
+    GLint maxPbufferWidth;
+    GLint maxPbufferHeight;
+    GLint maxPbufferPixels;
+    GLint optimalPbufferWidth;   /* Only for SGIX_pbuffer. */
+    GLint optimalPbufferHeight;  /* Only for SGIX_pbuffer. */
+
+    /* SGIX_visual_select_group */
+    GLint visualSelectGroup;
+
+    /* OML_swap_method */
+    GLint swapMethod;
+
+    GLint screen;
+
+    /* EXT_texture_from_pixmap */
+    GLint bindToTextureRgb;
+    GLint bindToTextureRgba;
+    GLint bindToMipmapTexture;
+    GLint bindToTextureTargets;
+    GLint yInverted;
+} __GLcontextModes;
+
+/* Several fields of __GLcontextModes can take these as values.  Since
+ * GLX header files may not be available everywhere they need to be used,
+ * redefine them here.
+ */
+#define GLX_NONE                           0x8000
+#define GLX_SLOW_CONFIG                    0x8001
+#define GLX_TRUE_COLOR                     0x8002
+#define GLX_DIRECT_COLOR                   0x8003
+#define GLX_PSEUDO_COLOR                   0x8004
+#define GLX_STATIC_COLOR                   0x8005
+#define GLX_GRAY_SCALE                     0x8006
+#define GLX_STATIC_GRAY                    0x8007
+#define GLX_TRANSPARENT_RGB                0x8008
+#define GLX_TRANSPARENT_INDEX              0x8009
+#define GLX_NON_CONFORMANT_CONFIG          0x800D
+#define GLX_SWAP_EXCHANGE_OML              0x8061
+#define GLX_SWAP_COPY_OML                  0x8062
+#define GLX_SWAP_UNDEFINED_OML             0x8063
+
+#define GLX_DONT_CARE                      0xFFFFFFFF
+
+#define GLX_RGBA_BIT                       0x00000001
+#define GLX_COLOR_INDEX_BIT                0x00000002
+#define GLX_WINDOW_BIT                     0x00000001
+#define GLX_PIXMAP_BIT                     0x00000002
+#define GLX_PBUFFER_BIT                    0x00000004
+
+#define GLX_BIND_TO_TEXTURE_RGB_EXT        0x20D0
+#define GLX_BIND_TO_TEXTURE_RGBA_EXT       0x20D1
+#define GLX_BIND_TO_MIPMAP_TEXTURE_EXT     0x20D2
+#define GLX_BIND_TO_TEXTURE_TARGETS_EXT    0x20D3
+#define GLX_Y_INVERTED_EXT                 0x20D4
+
+#define GLX_TEXTURE_1D_BIT_EXT             0x00000001
+#define GLX_TEXTURE_2D_BIT_EXT             0x00000002
+#define GLX_TEXTURE_RECTANGLE_BIT_EXT      0x00000004
+
+#endif /* __gl_core_h_ */
diff --git a/src/gallium/state_trackers/egl/x11/glxinit.c b/src/gallium/state_trackers/egl/x11/glxinit.c
index 57c6aaff864..df8370f8d7d 100644
--- a/src/gallium/state_trackers/egl/x11/glxinit.c
+++ b/src/gallium/state_trackers/egl/x11/glxinit.c
@@ -18,7 +18,7 @@
 #include "GL/glxproto.h"
 #include "GL/glxtokens.h"
 #include "GL/gl.h" /* for GL types needed by __GLcontextModes */
-#include "GL/internal/glcore.h"  /* for __GLcontextModes */
+#include "glcore.h"  /* for __GLcontextModes */
 
 #include "glxinit.h"
 
diff --git a/src/gallium/state_trackers/egl/x11/x11_screen.h b/src/gallium/state_trackers/egl/x11/x11_screen.h
index bc0ef69ec66..2e313e0148e 100644
--- a/src/gallium/state_trackers/egl/x11/x11_screen.h
+++ b/src/gallium/state_trackers/egl/x11/x11_screen.h
@@ -30,7 +30,7 @@
 #include <X11/Xutil.h>
 #include <X11/extensions/dri2tokens.h>
 #include "GL/gl.h" /* for GL types needed by __GLcontextModes */
-#include "GL/internal/glcore.h"  /* for __GLcontextModes */
+#include "glcore.h"  /* for __GLcontextModes */
 #include "pipe/p_compiler.h"
 #include "common/native.h"
 
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.c b/src/gallium/state_trackers/glx/xlib/xm_api.c
index f950c8858bc..8332633f01b 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.c
@@ -423,7 +423,7 @@ static XMesaBuffer XMesaBufferList = NULL;
 
 /**
  * Allocate a new XMesaBuffer object which corresponds to the given drawable.
- * Note that XMesaBuffer is derived from GLframebuffer.
+ * Note that XMesaBuffer is derived from struct gl_framebuffer.
  * The new XMesaBuffer will not have any size (Width=Height=0).
  *
  * \param d  the corresponding X drawable (window or pixmap)
@@ -569,7 +569,7 @@ initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b,
       /* RGB WINDOW:
        * We support RGB rendering into almost any kind of visual.
        */
-      const int xclass = v->mesa_visual.visualType;
+      const int xclass = v->visualType;
       if (xclass != GLX_TRUE_COLOR && xclass == !GLX_DIRECT_COLOR) {
 	 _mesa_warning(NULL,
             "XMesa: RGB mode rendering not supported in given visual.\n");
@@ -716,13 +716,13 @@ XMesaVisual XMesaCreateVisual( Display *display,
    v->mesa_visual.redMask = visinfo->red_mask;
    v->mesa_visual.greenMask = visinfo->green_mask;
    v->mesa_visual.blueMask = visinfo->blue_mask;
-   v->mesa_visual.visualID = visinfo->visualid;
-   v->mesa_visual.screen = visinfo->screen;
+   v->visualID = visinfo->visualid;
+   v->screen = visinfo->screen;
 
 #if !(defined(__cplusplus) || defined(c_plusplus))
-   v->mesa_visual.visualType = xmesa_convert_from_x_visual_type(visinfo->class);
+   v->visualType = xmesa_convert_from_x_visual_type(visinfo->class);
 #else
-   v->mesa_visual.visualType = xmesa_convert_from_x_visual_type(visinfo->c_class);
+   v->visualType = xmesa_convert_from_x_visual_type(visinfo->c_class);
 #endif
 
    v->mesa_visual.visualRating = visualCaveat;
@@ -733,7 +733,7 @@ XMesaVisual XMesaCreateVisual( Display *display,
    (void) initialize_visual_and_buffer( v, NULL, rgb_flag, 0, 0 );
 
    {
-      const int xclass = v->mesa_visual.visualType;
+      const int xclass = v->visualType;
       if (xclass == GLX_TRUE_COLOR || xclass == GLX_DIRECT_COLOR) {
          red_bits   = _mesa_bitcount(GET_REDMASK(v));
          green_bits = _mesa_bitcount(GET_GREENMASK(v));
@@ -756,7 +756,7 @@ XMesaVisual XMesaCreateVisual( Display *display,
 
    /* initialize visual */
    {
-      __GLcontextModes *vis = &v->mesa_visual;
+      struct gl_config *vis = &v->mesa_visual;
 
       vis->rgbMode          = GL_TRUE;
       vis->doubleBufferMode = db_flag;
@@ -783,7 +783,6 @@ XMesaVisual XMesaCreateVisual( Display *display,
 
       vis->numAuxBuffers = 0;
       vis->level = 0;
-      vis->pixmapMode = 0;
       vis->sampleBuffers = 0;
       vis->samples = 0;
    }
@@ -855,7 +854,7 @@ XMesaContext XMesaCreateContext( XMesaVisual v, XMesaContext share_list )
    if (!xmdpy)
       return NULL;
 
-   /* Note: the XMesaContext contains a Mesa GLcontext struct (inheritance) */
+   /* Note: the XMesaContext contains a Mesa struct gl_context struct (inheritance) */
    c = (XMesaContext) CALLOC_STRUCT(xmesa_context);
    if (!c)
       return NULL;
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.h b/src/gallium/state_trackers/glx/xlib/xm_api.h
index fedf2b2d5a1..b8ac979edc1 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.h
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.h
@@ -280,7 +280,8 @@ XMesaCopyContext(XMesaContext src, XMesaContext dst, unsigned long mask);
  * Basically corresponds to an XVisualInfo.
  */
 struct xmesa_visual {
-   GLvisual mesa_visual;	/* Device independent visual parameters */
+   struct gl_config mesa_visual;/* Device independent visual parameters */
+   int screen, visualID, visualType;
    Display *display;	/* The X11 display */
    XVisualInfo * visinfo;	/* X's visual info (pointer to private copy) */
    XVisualInfo *vishandle;	/* Only used in fakeglx.c */
diff --git a/src/gallium/state_trackers/glx/xlib/xm_st.c b/src/gallium/state_trackers/glx/xlib/xm_st.c
index 4d0f5e66256..e7466bdbee5 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_st.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_st.c
@@ -196,7 +196,13 @@ xmesa_st_framebuffer_validate_textures(struct st_framebuffer_iface *stfbi,
 
 
 /**
+ * Check that a framebuffer's attachments match the window's size.
+ *
  * Called via st_framebuffer_iface::validate()
+ *
+ * \param statts  array of framebuffer attachments
+ * \param count  number of framebuffer attachments in statts[]
+ * \param out  returns resources for each of the attachments
  */
 static boolean 
 xmesa_st_framebuffer_validate(struct st_framebuffer_iface *stfbi,
@@ -209,9 +215,11 @@ xmesa_st_framebuffer_validate(struct st_framebuffer_iface *stfbi,
    boolean resized;
    boolean ret;
 
+   /* build mask of ST_ATTACHMENT bits */
    statt_mask = 0x0;
    for (i = 0; i < count; i++)
       statt_mask |= 1 << statts[i];
+
    /* record newly allocated textures */
    new_mask = statt_mask & ~xstfb->texture_mask;
 
diff --git a/src/gallium/state_trackers/vega/vg_manager.c b/src/gallium/state_trackers/vega/vg_manager.c
index e7996741d14..232deefa166 100644
--- a/src/gallium/state_trackers/vega/vg_manager.c
+++ b/src/gallium/state_trackers/vega/vg_manager.c
@@ -352,7 +352,7 @@ vg_api_create_context(struct st_api *stapi, struct st_manager *smapi,
       return NULL;
 
    /* only 1.0 is supported */
-   if (attribs->major != 1 || attribs->minor > 0)
+   if (attribs->major > 1 || (attribs->major == 1 && attribs->minor > 0))
       return NULL;
 
    pipe = smapi->screen->context_create(smapi->screen, NULL);
diff --git a/src/gallium/state_trackers/xorg/Makefile b/src/gallium/state_trackers/xorg/Makefile
index cb2c3aea410..7a44d28017b 100644
--- a/src/gallium/state_trackers/xorg/Makefile
+++ b/src/gallium/state_trackers/xorg/Makefile
@@ -10,7 +10,7 @@ LIBRARY_INCLUDES = \
 	$(shell pkg-config libkms --atleast-version=1.0 \
 				&& echo "-DHAVE_LIBKMS") \
 	$(shell pkg-config libkms --silence-errors --cflags-only-I) \
-	$(shell pkg-config --cflags-only-I pixman-1 xorg-server libdrm xproto) \
+	$(shell pkg-config --cflags-only-I pixman-1 xorg-server libdrm xproto dri2proto) \
 	-I$(TOP)/src/gallium/include \
 	-I$(TOP)/src/gallium/auxiliary \
 	-I$(TOP)/include \
diff --git a/src/gallium/state_trackers/xorg/xorg_crtc.c b/src/gallium/state_trackers/xorg/xorg_crtc.c
index 26a907f205e..80af82d97b2 100644
--- a/src/gallium/state_trackers/xorg/xorg_crtc.c
+++ b/src/gallium/state_trackers/xorg/xorg_crtc.c
@@ -234,6 +234,10 @@ crtc_load_cursor_argb_ga3d(xf86CrtcPtr crtc, CARD32 * image)
 		   64, 64, (void*)image, 64 * 4, 0, 0);
     ms->ctx->transfer_unmap(ms->ctx, transfer);
     ms->ctx->transfer_destroy(ms->ctx, transfer);
+
+    if (crtc->cursor_shown)
+	drmModeSetCursor(ms->fd, crtcp->drm_crtc->crtc_id,
+			 crtcp->cursor_handle, 64, 64);
 }
 
 #if HAVE_LIBKMS
@@ -271,6 +275,10 @@ crtc_load_cursor_argb_kms(xf86CrtcPtr crtc, CARD32 * image)
     memcpy(ptr, image, 64*64*4);
     kms_bo_unmap(crtcp->cursor_bo);
 
+    if (crtc->cursor_shown)
+	drmModeSetCursor(ms->fd, crtcp->drm_crtc->crtc_id,
+			 crtcp->cursor_handle, 64, 64);
+
     return;
 
 err_bo_destroy:
@@ -353,7 +361,7 @@ crtc_destroy(xf86CrtcPtr crtc)
 
     drmModeFreeCrtc(crtcp->drm_crtc);
 
-    xfree(crtcp);
+    free(crtcp);
     crtc->driver_private = NULL;
 }
 
@@ -401,7 +409,7 @@ xorg_crtc_init(ScrnInfoPtr pScrn)
 	if (crtc == NULL)
 	    goto out;
 
-	crtcp = xcalloc(1, sizeof(struct crtc_private));
+	crtcp = calloc(1, sizeof(struct crtc_private));
 	if (!crtcp) {
 	    xf86CrtcDestroy(crtc);
 	    goto out;
diff --git a/src/gallium/state_trackers/xorg/xorg_dri2.c b/src/gallium/state_trackers/xorg/xorg_dri2.c
index 704aed6a82c..b723a8e9cb0 100644
--- a/src/gallium/state_trackers/xorg/xorg_dri2.c
+++ b/src/gallium/state_trackers/xorg/xorg_dri2.c
@@ -201,11 +201,11 @@ dri2_create_buffer(DrawablePtr pDraw, unsigned int attachment, unsigned int form
     DRI2Buffer2Ptr buffer;
     BufferPrivatePtr private;
 
-    buffer = xcalloc(1, sizeof *buffer);
+    buffer = calloc(1, sizeof *buffer);
     if (!buffer)
 	return NULL;
 
-    private = xcalloc(1, sizeof *private);
+    private = calloc(1, sizeof *private);
     if (!private) {
 	goto fail;
     }
@@ -217,9 +217,9 @@ dri2_create_buffer(DrawablePtr pDraw, unsigned int attachment, unsigned int form
     if (dri2_do_create_buffer(pDraw, (DRI2BufferPtr)buffer, format))
 	return buffer;
 
-    xfree(private);
+    free(private);
 fail:
-    xfree(buffer);
+    free(buffer);
     return NULL;
 }
 
@@ -229,8 +229,8 @@ dri2_destroy_buffer(DrawablePtr pDraw, DRI2Buffer2Ptr buffer)
     /* So far it is safe to downcast a DRI2Buffer2Ptr to DRI2BufferPtr */
     dri2_do_destroy_buffer(pDraw, (DRI2BufferPtr)buffer);
 
-    xfree(buffer->driverPrivate);
-    xfree(buffer);
+    free(buffer->driverPrivate);
+    free(buffer);
 }
 
 #endif /* DRI2INFOREC_VERSION >= 2 */
@@ -244,11 +244,11 @@ dri2_create_buffers(DrawablePtr pDraw, unsigned int *attachments, int count)
     DRI2BufferPtr buffers;
     int i;
 
-    buffers = xcalloc(count, sizeof *buffers);
+    buffers = calloc(count, sizeof *buffers);
     if (!buffers)
 	goto fail_buffers;
 
-    privates = xcalloc(count, sizeof *privates);
+    privates = calloc(count, sizeof *privates);
     if (!privates)
 	goto fail_privates;
 
@@ -263,9 +263,9 @@ dri2_create_buffers(DrawablePtr pDraw, unsigned int *attachments, int count)
     return buffers;
 
 fail:
-    xfree(privates);
+    free(privates);
 fail_privates:
-    xfree(buffers);
+    free(buffers);
 fail_buffers:
     return NULL;
 }
@@ -280,8 +280,8 @@ dri2_destroy_buffers(DrawablePtr pDraw, DRI2BufferPtr buffers, int count)
     }
 
     if (buffers) {
-	xfree(buffers[0].driverPrivate);
-	xfree(buffers);
+	free(buffers[0].driverPrivate);
+	free(buffers);
     }
 }
 
diff --git a/src/gallium/state_trackers/xorg/xorg_driver.c b/src/gallium/state_trackers/xorg/xorg_driver.c
index e10ff2f9508..1ec772df172 100644
--- a/src/gallium/state_trackers/xorg/xorg_driver.c
+++ b/src/gallium/state_trackers/xorg/xorg_driver.c
@@ -45,6 +45,7 @@
 #include "miscstruct.h"
 #include "dixstruct.h"
 #include "xf86xv.h"
+#include "xorgVersion.h"
 #ifndef XSERVER_LIBPCIACCESS
 #error "libpciaccess needed"
 #endif
@@ -122,7 +123,7 @@ xorg_tracker_set_functions(ScrnInfoPtr scrn)
 Bool
 xorg_tracker_have_modesetting(ScrnInfoPtr pScrn, struct pci_device *device)
 {
-    char *BusID = xalloc(64);
+    char *BusID = malloc(64);
     sprintf(BusID, "pci:%04x:%02x:%02x.%d",
 	    device->domain, device->bus,
 	    device->dev, device->func);
@@ -130,14 +131,14 @@ xorg_tracker_have_modesetting(ScrnInfoPtr pScrn, struct pci_device *device)
     if (drmCheckModesettingSupported(BusID)) {
 	xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, 0,
 		       "Drm modesetting not supported %s\n", BusID);
-	xfree(BusID);
+	free(BusID);
 	return FALSE;
     }
 
     xf86DrvMsgVerb(pScrn->scrnIndex, X_INFO, 0,
 		   "Drm modesetting supported on %s\n", BusID);
 
-    xfree(BusID);
+    free(BusID);
     return TRUE;
 }
 
@@ -174,7 +175,7 @@ drv_free_rec(ScrnInfoPtr pScrn)
     if (!pScrn->driverPrivate)
 	return;
 
-    xfree(pScrn->driverPrivate);
+    free(pScrn->driverPrivate);
 
     pScrn->driverPrivate = NULL;
 }
@@ -274,7 +275,7 @@ drv_init_drm(ScrnInfoPtr pScrn)
     if (ms->fd < 0) {
 	char *BusID;
 
-	BusID = xalloc(64);
+	BusID = malloc(64);
 	sprintf(BusID, "PCI:%d:%d:%d",
 		((ms->PciInfo->domain << 8) | ms->PciInfo->bus),
 		ms->PciInfo->dev, ms->PciInfo->func
@@ -283,7 +284,7 @@ drv_init_drm(ScrnInfoPtr pScrn)
 
 	ms->fd = drmOpen(driver_descriptor.driver_name, BusID);
 	ms->isMaster = TRUE;
-	xfree(BusID);
+	free(BusID);
 
 	if (ms->fd >= 0)
 	    return TRUE;
@@ -369,6 +370,7 @@ drv_pre_init(ScrnInfoPtr pScrn, int flags)
     ms = modesettingPTR(pScrn);
     ms->pEnt = pEnt;
     ms->cust = cust;
+    ms->fb_id = -1;
 
     pScrn->displayWidth = 640;	       /* default it */
 
@@ -402,19 +404,6 @@ drv_pre_init(ScrnInfoPtr pScrn, int flags)
     if (!drv_init_drm(pScrn))
 	return FALSE;
 
-    use3D = cust ? !cust->no_3d : TRUE;
-    ms->from_3D = xf86GetOptValBool(ms->Options, OPTION_3D_ACCEL,
-				    &use3D) ?
-	X_CONFIG : X_PROBED;
-
-    ms->no3D = !use3D;
-
-    if (!drv_init_resource_management(pScrn)) {
-	xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Could not init "
-					       "Gallium3D or libKMS.");
-	return FALSE;
-    }
-
     pScrn->monitor = pScrn->confScreen->monitor;
     pScrn->progClock = TRUE;
     pScrn->rgbBits = 8;
@@ -444,11 +433,24 @@ drv_pre_init(ScrnInfoPtr pScrn, int flags)
 
     /* Process the options */
     xf86CollectOptions(pScrn, NULL);
-    if (!(ms->Options = xalloc(sizeof(drv_options))))
+    if (!(ms->Options = malloc(sizeof(drv_options))))
 	return FALSE;
     memcpy(ms->Options, drv_options, sizeof(drv_options));
     xf86ProcessOptions(pScrn->scrnIndex, pScrn->options, ms->Options);
 
+    use3D = cust ? !cust->no_3d : TRUE;
+    ms->from_3D = xf86GetOptValBool(ms->Options, OPTION_3D_ACCEL,
+				    &use3D) ?
+	X_CONFIG : X_PROBED;
+
+    ms->no3D = !use3D;
+
+    if (!drv_init_resource_management(pScrn)) {
+	xf86DrvMsg(pScrn->scrnIndex, X_ERROR, "Could not init "
+					       "Gallium3D or libKMS.");
+	return FALSE;
+    }
+
     /* Allocate an xf86CrtcConfig */
     xf86CrtcConfigInit(pScrn, &crtc_config_funcs);
     xf86_config = XF86_CRTC_CONFIG_PTR(pScrn);
@@ -791,7 +793,9 @@ drv_screen_init(int scrnIndex, ScreenPtr pScreen, int argc, char **argv)
     if (!ms->SWCursor)
 	xf86_cursors_init(pScreen, 64, 64,
 			  HARDWARE_CURSOR_SOURCE_MASK_INTERLEAVE_64 |
-			  HARDWARE_CURSOR_ARGB);
+			  HARDWARE_CURSOR_ARGB |
+			  ((cust && cust->unhidden_hw_cursor_update) ?
+			   HARDWARE_CURSOR_UPDATE_UNHIDDEN : 0));
 
     /* Must force it before EnterVT, so we are in control of VT and
      * later memory should be bound when allocating, e.g rotate_mem */
@@ -862,8 +866,10 @@ drv_leave_vt(int scrnIndex, int flags)
 	}
     }
 
-    drmModeRmFB(ms->fd, ms->fb_id);
-    ms->fb_id = -1;
+    if (ms->fb_id != -1) {
+	drmModeRmFB(ms->fd, ms->fb_id);
+	ms->fb_id = -1;
+    }
 
     /* idle hardware */
     if (!ms->kms)
@@ -944,7 +950,6 @@ drv_close_screen(int scrnIndex, ScreenPtr pScreen)
     }
 #endif
 
-    drmModeRmFB(ms->fd, ms->fb_id);
     ms->destroy_front_buffer(pScrn);
 
     if (ms->exa)
@@ -1178,6 +1183,8 @@ drv_bind_front_buffer_kms(ScrnInfoPtr pScrn)
 				stride,
 				ptr);
 
+#if (XORG_VERSION_CURRENT < XORG_VERSION_NUMERIC(1, 9, 99, 1, 0))
+
     /* This a hack to work around EnableDisableFBAccess setting the pointer
      * the real fix would be to replace pScrn->EnableDisableFBAccess hook
      * and set the rootPixmap->devPrivate.ptr to something valid before that.
@@ -1187,6 +1194,8 @@ drv_bind_front_buffer_kms(ScrnInfoPtr pScrn)
      */
     pScrn->pixmapPrivate.ptr = ptr;
 
+#endif
+
     return TRUE;
 
 err_destroy:
diff --git a/src/gallium/state_trackers/xorg/xorg_exa.c b/src/gallium/state_trackers/xorg/xorg_exa.c
index 6b2c80fbca6..4b1c02bad42 100644
--- a/src/gallium/state_trackers/xorg/xorg_exa.c
+++ b/src/gallium/state_trackers/xorg/xorg_exa.c
@@ -720,7 +720,7 @@ ExaCreatePixmap(ScreenPtr pScreen, int size, int align)
 {
     struct exa_pixmap_priv *priv;
 
-    priv = xcalloc(1, sizeof(struct exa_pixmap_priv));
+    priv = calloc(1, sizeof(struct exa_pixmap_priv));
     if (!priv)
 	return NULL;
 
@@ -737,7 +737,7 @@ ExaDestroyPixmap(ScreenPtr pScreen, void *dPriv)
 
     pipe_resource_reference(&priv->tex, NULL);
 
-    xfree(priv);
+    free(priv);
 }
 
 static Bool
@@ -975,7 +975,7 @@ xorg_exa_close(ScrnInfoPtr pScrn)
    ms->ctx = NULL;
 
    exaDriverFini(pScrn->pScreen);
-   xfree(exa);
+   free(exa);
    ms->exa = NULL;
 }
 
@@ -987,7 +987,7 @@ xorg_exa_init(ScrnInfoPtr pScrn, Bool accel)
    ExaDriverPtr pExa;
    CustomizerPtr cust = ms->cust;
 
-   exa = xcalloc(1, sizeof(struct exa_context));
+   exa = calloc(1, sizeof(struct exa_context));
    if (!exa)
       return NULL;
 
@@ -1057,6 +1057,7 @@ xorg_exa_init(ScrnInfoPtr pScrn, Bool accel)
 
 out_err:
    xorg_exa_close(pScrn);
+   free(exa);
 
    return NULL;
 }
diff --git a/src/gallium/state_trackers/xorg/xorg_output.c b/src/gallium/state_trackers/xorg/xorg_output.c
index 61206ed751c..5555b51131c 100644
--- a/src/gallium/state_trackers/xorg/xorg_output.c
+++ b/src/gallium/state_trackers/xorg/xorg_output.c
@@ -128,7 +128,7 @@ output_get_modes(xf86OutputPtr output)
     for (i = 0; i < drm_connector->count_modes; i++) {
 	drm_mode = &drm_connector->modes[i];
 	if (drm_mode) {
-	    mode = xcalloc(1, sizeof(DisplayModeRec));
+	    mode = calloc(1, sizeof(DisplayModeRec));
 	    if (!mode)
 		continue;
 	    mode->Clock = drm_mode->clock;
@@ -195,7 +195,7 @@ output_destroy(xf86OutputPtr output)
 {
     struct output_private *priv = output->driver_private;
     drmModeFreeConnector(priv->drm_connector);
-    xfree(priv);
+    free(priv);
     output->driver_private = NULL;
 }
 
@@ -262,14 +262,14 @@ xorg_output_init(ScrnInfoPtr pScrn)
 		 drm_connector->connector_type_id);
 
 
-	priv = xcalloc(sizeof(*priv), 1);
+	priv = calloc(sizeof(*priv), 1);
 	if (!priv) {
 	    continue;
 	}
 
 	output = xf86OutputCreate(pScrn, &output_funcs, name);
 	if (!output) {
-	    xfree(priv);
+	    free(priv);
 	    continue;
 	}
 
diff --git a/src/gallium/state_trackers/xorg/xorg_tracker.h b/src/gallium/state_trackers/xorg/xorg_tracker.h
index be1a9fda48d..a3fb5e5dad0 100644
--- a/src/gallium/state_trackers/xorg/xorg_tracker.h
+++ b/src/gallium/state_trackers/xorg/xorg_tracker.h
@@ -76,6 +76,7 @@ typedef struct _CustomizerRec
     Bool dirty_throttling;
     Bool swap_throttling;
     Bool no_3d;
+    Bool unhidden_hw_cursor_update;
     Bool (*winsys_pre_init) (struct _CustomizerRec *cust, int fd);
     Bool (*winsys_screen_init)(struct _CustomizerRec *cust);
     Bool (*winsys_screen_close)(struct _CustomizerRec *cust);
diff --git a/src/gallium/state_trackers/xorg/xorg_xv.c b/src/gallium/state_trackers/xorg/xorg_xv.c
index f98bd939010..f64959f00e9 100644
--- a/src/gallium/state_trackers/xorg/xorg_xv.c
+++ b/src/gallium/state_trackers/xorg/xorg_xv.c
@@ -536,8 +536,10 @@ display_video(ScrnInfoPtr pScrn, struct xorg_xv_port_priv *pPriv, int id,
    dst_surf = xorg_gpu_surface(pPriv->r->pipe->screen, dst);
    hdtv = ((src_w >= RES_720P_X) && (src_h >= RES_720P_Y));
 
+#ifdef COMPOSITE
    REGION_TRANSLATE(pScrn->pScreen, dstRegion, -pPixmap->screen_x,
                     -pPixmap->screen_y);
+#endif
 
    dxo = dstRegion->extents.x1;
    dyo = dstRegion->extents.y1;
@@ -562,11 +564,16 @@ display_video(ScrnInfoPtr pScrn, struct xorg_xv_port_priv *pPriv, int id,
       int box_y2 = pbox->y2;
       float diff_x = (float)src_w / (float)dst_w;
       float diff_y = (float)src_h / (float)dst_h;
-      float offset_x = box_x1 - dstX + pPixmap->screen_x;
-      float offset_y = box_y1 - dstY + pPixmap->screen_y;
+      float offset_x = box_x1 - dstX;
+      float offset_y = box_y1 - dstY;
       float offset_w;
       float offset_h;
 
+#ifdef COMPOSITE
+      offset_x += pPixmap->screen_x;
+      offset_y += pPixmap->screen_y;
+#endif
+
       x = box_x1;
       y = box_y1;
       w = box_x2 - box_x1;
diff --git a/src/gallium/targets/Makefile.xorg b/src/gallium/targets/Makefile.xorg
index 762c905985e..87eedd7136c 100644
--- a/src/gallium/targets/Makefile.xorg
+++ b/src/gallium/targets/Makefile.xorg
@@ -29,7 +29,7 @@ INCLUDES = \
 LIBNAME_STAGING = $(TOP)/$(LIB_DIR)/gallium/$(TARGET)
 
 ifeq ($(MESA_LLVM),1)
-LD = g++
+LD = $(CXX)
 LDFLAGS += $(LLVM_LDFLAGS)
 USE_CXX=1
 DRIVER_PIPES += $(TOP)/src/gallium/drivers/llvmpipe/libllvmpipe.a
@@ -42,7 +42,7 @@ endif
 default: depend $(TOP)/$(LIB_DIR)/gallium $(LIBNAME) $(LIBNAME_STAGING)
 
 $(LIBNAME): $(OBJECTS) Makefile ../Makefile.xorg $(LIBS) $(DRIVER_PIPES)
-	$(MKLIB) -noprefix -o $@ $(LDFLAGS) $(OBJECTS) $(DRIVER_PIPES) $(GALLIUM_AUXILIARIES) $(DRIVER_LINKS)
+	$(MKLIB) -linker $(CC) -noprefix -o $@ $(LDFLAGS) $(OBJECTS) $(DRIVER_PIPES) $(GALLIUM_AUXILIARIES) $(DRIVER_LINKS)
 
 depend: $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) $(SYMLINKS) $(GENERATED_SOURCES)
 	rm -f depend
diff --git a/src/gallium/targets/dri-i915/target.c b/src/gallium/targets/dri-i915/target.c
index 5ae6ca367d8..a27b7bd6d81 100644
--- a/src/gallium/targets/dri-i915/target.c
+++ b/src/gallium/targets/dri-i915/target.c
@@ -19,8 +19,7 @@ create_screen(int fd)
    if (!screen)
       return NULL;
 
-   if (debug_get_bool_option("I915_SOFTWARE", FALSE))
-      screen = sw_screen_wrap(screen);
+   screen = sw_screen_wrap(screen);
 
    screen = debug_screen_wrap(screen);
 
diff --git a/src/gallium/targets/dri-i965/target.c b/src/gallium/targets/dri-i965/target.c
index ce97f820278..0632b97beaa 100644
--- a/src/gallium/targets/dri-i965/target.c
+++ b/src/gallium/targets/dri-i965/target.c
@@ -19,8 +19,7 @@ create_screen(int fd)
    if (!screen)
       return NULL;
 
-   if (debug_get_bool_option("BRW_SOFTPIPE", FALSE))
-      screen = sw_screen_wrap(screen);
+   screen = sw_screen_wrap(screen);
 
    screen = debug_screen_wrap(screen);
 
diff --git a/src/gallium/targets/egl/Makefile b/src/gallium/targets/egl/Makefile
index 47c24cefe5c..57979c4e9d4 100644
--- a/src/gallium/targets/egl/Makefile
+++ b/src/gallium/targets/egl/Makefile
@@ -24,7 +24,9 @@ common_CPPFLAGS := \
 	-I$(TOP)/src/gallium/auxiliary \
 	-I$(TOP)/src/gallium/drivers \
 	-I$(TOP)/src/gallium/include \
-	-I$(TOP)/src/gallium/winsys
+	-I$(TOP)/src/gallium/winsys \
+	$(LIBDRM_CFLAGS)
+
 common_SYS :=
 common_LIBS := \
 	$(TOP)/src/gallium/drivers/identity/libidentity.a \
@@ -37,15 +39,15 @@ egl_CPPFLAGS := \
 	-I$(TOP)/src/gallium/state_trackers/egl \
 	-I$(TOP)/src/egl/main \
 	-DPIPE_PREFIX=\"$(PIPE_PREFIX)\" -DST_PREFIX=\"$(ST_PREFIX)\"
-egl_SYS := -lm $(DLOPEN_LIBS) -L$(TOP)/$(LIB_DIR) -lEGL
+egl_SYS := -lm $(DLOPEN_LIBS) -lEGL
 egl_LIBS := $(TOP)/src/gallium/state_trackers/egl/libegl.a
 
 ifneq ($(findstring x11, $(EGL_PLATFORMS)),)
-egl_SYS += -lX11 -lXext -lXfixes
+egl_SYS += -lX11 -lXext -lXfixes $(LIBDRM_LIB)
 egl_LIBS += $(TOP)/src/gallium/winsys/sw/xlib/libws_xlib.a
 endif
-ifneq ($(findstring kms, $(EGL_PLATFORMS)),)
-egl_SYS += -ldrm
+ifneq ($(findstring drm, $(EGL_PLATFORMS)),)
+egl_SYS += $(LIBDRM_LIB)
 endif
 ifneq ($(findstring fbdev, $(EGL_PLATFORMS)),)
 egl_LIBS += $(TOP)/src/gallium/winsys/sw/fbdev/libfbdev.a
@@ -132,17 +134,17 @@ GL_LIBS := $(TOP)/src/mesa/libmesagallium.a
 
 # OpenGL ES 1.x state tracker
 GLESv1_CM_CPPFLAGS := -I$(TOP)/src/mesa
-GLESv1_CM_SYS := $(DRI_LIB_DEPS) -L$(TOP)/$(LIB_DIR) -l$(GLESv1_CM_LIB)
+GLESv1_CM_SYS := $(DRI_LIB_DEPS) -l$(GLESv1_CM_LIB)
 GLESv1_CM_LIBS := $(TOP)/src/mesa/libes1gallium.a
 
 # OpenGL ES 2.x state tracker
 GLESv2_CPPFLAGS := -I$(TOP)/src/mesa
-GLESv2_SYS := $(DRI_LIB_DEPS) -L$(TOP)/$(LIB_DIR) -l$(GLESv2_LIB)
+GLESv2_SYS := $(DRI_LIB_DEPS) -l$(GLESv2_LIB)
 GLESv2_LIBS := $(TOP)/src/mesa/libes2gallium.a
 
 # OpenVG state tracker
 OpenVG_CPPFLAGS := -I$(TOP)/src/gallium/state_trackers/vega
-OpenVG_SYS := -lm -L$(TOP)/$(LIB_DIR) -l$(VG_LIB)
+OpenVG_SYS := -lm -l$(VG_LIB)
 OpenVG_LIBS := $(TOP)/src/gallium/state_trackers/vega/libvega.a
 
 
@@ -179,14 +181,16 @@ OUTPUTS := $(addprefix $(OUTPUT_PATH)/, $(OUTPUTS))
 default: $(OUTPUTS)
 
 define mklib
-$(MKLIB) -o $(notdir $@) -noprefix -linker '$(CC)' -ldflags '$(LDFLAGS)' \
+$(MKLIB) -o $(notdir $@) -noprefix -linker '$(CC)' \
+	-L$(TOP)/$(LIB_DIR) -ldflags '$(LDFLAGS)' \
 	-install $(OUTPUT_PATH) $(MKLIB_OPTIONS) $< \
 	-Wl,--start-group $(common_LIBS) $($(1)_LIBS) -Wl,--end-group \
 	$(common_SYS) $($(1)_SYS)
 endef
 
 define mklib-cxx
-$(MKLIB) -o $(notdir $@) -noprefix -linker '$(CXX)' -ldflags '$(LDFLAGS)' \
+$(MKLIB) -o $(notdir $@) -noprefix -linker '$(CXX)' \
+	-L$(TOP)/$(LIB_DIR) -ldflags '$(LDFLAGS)' \
 	-cplusplus -install $(OUTPUT_PATH) $(MKLIB_OPTIONS) $< \
 	-Wl,--start-group $(common_LIBS) $($(1)_LIBS) -Wl,--end-group \
 	$(common_SYS) $($(1)_SYS)
diff --git a/src/gallium/targets/egl/pipe_i915.c b/src/gallium/targets/egl/pipe_i915.c
index 758a921b481..cd74044d8c1 100644
--- a/src/gallium/targets/egl/pipe_i915.c
+++ b/src/gallium/targets/egl/pipe_i915.c
@@ -1,5 +1,4 @@
 
-#include "target-helpers/inline_wrapper_sw_helper.h"
 #include "target-helpers/inline_debug_helper.h"
 #include "state_tracker/drm_driver.h"
 #include "i915/drm/i915_drm_public.h"
diff --git a/src/gallium/targets/egl/pipe_i965.c b/src/gallium/targets/egl/pipe_i965.c
index 43bf646e825..f810ecffb0a 100644
--- a/src/gallium/targets/egl/pipe_i965.c
+++ b/src/gallium/targets/egl/pipe_i965.c
@@ -1,6 +1,6 @@
 
-#include "target-helpers/inline_wrapper_sw_helper.h"
 #include "target-helpers/inline_debug_helper.h"
+#include "target-helpers/inline_wrapper_sw_helper.h"
 #include "state_tracker/drm_driver.h"
 #include "i965/drm/i965_drm_public.h"
 #include "i965/brw_public.h"
@@ -19,8 +19,7 @@ create_screen(int fd)
    if (!screen)
       return NULL;
 
-   if (debug_get_bool_option("BRW_SOFTPIPE", FALSE))
-      screen = sw_screen_wrap(screen);
+   screen = sw_screen_wrap(screen);
 
    screen = debug_screen_wrap(screen);
 
diff --git a/src/gallium/targets/libgl-xlib/Makefile b/src/gallium/targets/libgl-xlib/Makefile
index 79e516a2a7a..076a040a5ab 100644
--- a/src/gallium/targets/libgl-xlib/Makefile
+++ b/src/gallium/targets/libgl-xlib/Makefile
@@ -10,7 +10,7 @@ include $(TOP)/configs/current
 
 GL_MAJOR = 1
 GL_MINOR = 5
-GL_TINY = 0$(MESA_MAJOR)0$(MESA_MINOR)0$(MESA_TINY)
+GL_TINY = 0$(MESA_MAJOR)$(MESA_MINOR)0$(MESA_TINY)
 
 
 INCLUDE_DIRS = \
diff --git a/src/gallium/targets/xorg-i965/intel_target.c b/src/gallium/targets/xorg-i965/intel_target.c
index ce97f820278..0632b97beaa 100644
--- a/src/gallium/targets/xorg-i965/intel_target.c
+++ b/src/gallium/targets/xorg-i965/intel_target.c
@@ -19,8 +19,7 @@ create_screen(int fd)
    if (!screen)
       return NULL;
 
-   if (debug_get_bool_option("BRW_SOFTPIPE", FALSE))
-      screen = sw_screen_wrap(screen);
+   screen = sw_screen_wrap(screen);
 
    screen = debug_screen_wrap(screen);
 
diff --git a/src/gallium/targets/xorg-vmwgfx/vmw_ctrl.c b/src/gallium/targets/xorg-vmwgfx/vmw_ctrl.c
index 237b308ae35..9b422e661bf 100644
--- a/src/gallium/targets/xorg-vmwgfx/vmw_ctrl.c
+++ b/src/gallium/targets/xorg-vmwgfx/vmw_ctrl.c
@@ -32,6 +32,7 @@
  *      allows X clients to communicate with the driver.
  */
 
+#include <xorg-server.h>
 #include "dixstruct.h"
 #include "extnsionst.h"
 #include <X11/X.h>
@@ -211,7 +212,7 @@ VMwareCtrlDoSetTopology(ScrnInfoPtr pScrn,
    struct vmw_customizer *vmw = vmw_customizer(xorg_customizer(pScrn));
    int i;
 
-   rects = xcalloc(number, sizeof(*rects));
+   rects = calloc(number, sizeof(*rects));
    if (!rects)
       return FALSE;
 
@@ -224,7 +225,7 @@ VMwareCtrlDoSetTopology(ScrnInfoPtr pScrn,
 
    vmw_ioctl_update_layout(vmw, number, rects);
 
-   xfree(rects);
+   free(rects);
    return TRUE;
 }
 
diff --git a/src/gallium/targets/xorg-vmwgfx/vmw_ioctl.c b/src/gallium/targets/xorg-vmwgfx/vmw_ioctl.c
index 7c799b58275..7625d2fb8f9 100644
--- a/src/gallium/targets/xorg-vmwgfx/vmw_ioctl.c
+++ b/src/gallium/targets/xorg-vmwgfx/vmw_ioctl.c
@@ -165,7 +165,7 @@ vmw_ioctl_buffer_create(struct vmw_customizer *vmw, uint32_t size, unsigned *han
     struct drm_vmw_dmabuf_rep *rep = &arg.rep;
     int ret;
 
-    buf = xcalloc(1, sizeof(*buf));
+    buf = calloc(1, sizeof(*buf));
     if (!buf)
 	goto err;
 
@@ -192,7 +192,7 @@ vmw_ioctl_buffer_create(struct vmw_customizer *vmw, uint32_t size, unsigned *han
     return buf;
 
 err_free:
-    xfree(buf);
+    free(buf);
 err:
     return NULL;
 }
@@ -211,7 +211,7 @@ vmw_ioctl_buffer_destroy(struct vmw_customizer *vmw, struct vmw_dma_buffer *buf)
     arg.handle = buf->handle; 
     drmCommandWrite(vmw->fd, DRM_VMW_UNREF_DMABUF, &arg, sizeof(arg)); 
 
-    xfree(buf); 
+    free(buf);
 } 
 
 void *
diff --git a/src/gallium/targets/xorg-vmwgfx/vmw_screen.c b/src/gallium/targets/xorg-vmwgfx/vmw_screen.c
index 8173908f551..76622031650 100644
--- a/src/gallium/targets/xorg-vmwgfx/vmw_screen.c
+++ b/src/gallium/targets/xorg-vmwgfx/vmw_screen.c
@@ -245,6 +245,7 @@ vmw_screen_pre_init(ScrnInfoPtr pScrn, int flags)
     cust->winsys_enter_vt = vmw_screen_enter_vt;
     cust->winsys_leave_vt = vmw_screen_leave_vt;
     cust->no_3d = TRUE;
+    cust->unhidden_hw_cursor_update = TRUE;
     vmw->pScrn = pScrn;
 
     pScrn->driverPrivate = cust;
diff --git a/src/gallium/targets/xorg-vmwgfx/vmw_video.c b/src/gallium/targets/xorg-vmwgfx/vmw_video.c
index eced60d0ec1..94465e52043 100644
--- a/src/gallium/targets/xorg-vmwgfx/vmw_video.c
+++ b/src/gallium/targets/xorg-vmwgfx/vmw_video.c
@@ -300,7 +300,7 @@ vmw_video_init(struct vmw_customizer *vmw)
         numAdaptors = 1;
         overlayAdaptors = &newAdaptor;
     } else {
-         newAdaptors = xalloc((numAdaptors + 1) *
+         newAdaptors = malloc((numAdaptors + 1) *
                               sizeof(XF86VideoAdaptorPtr*));
          if (!newAdaptors) {
             xf86XVFreeVideoAdaptorRec(newAdaptor);
@@ -320,7 +320,7 @@ vmw_video_init(struct vmw_customizer *vmw)
     }
 
     if (newAdaptors) {
-        xfree(newAdaptors);
+        free(newAdaptors);
     }
 
     debug_printf("Initialized VMware Xv extension successfully\n");
@@ -438,7 +438,7 @@ vmw_video_init_adaptor(ScrnInfoPtr pScrn, struct vmw_customizer *vmw)
         return NULL;
     }
 
-    video = xcalloc(1, sizeof(*video));
+    video = calloc(1, sizeof(*video));
     if (!video) {
         debug_printf("Not enough memory.\n");
         xf86XVFreeVideoAdaptorRec(adaptor);
@@ -742,7 +742,7 @@ vmw_video_buffer_alloc(struct vmw_customizer *vmw, int size,
     }
 
     out->size = size;
-    out->extra_data = xcalloc(1, size);
+    out->extra_data = calloc(1, size);
 
     debug_printf("\t\t%s: allocated buffer %p of size %i\n", __func__, out, size);
 
@@ -773,7 +773,7 @@ vmw_video_buffer_free(struct vmw_customizer *vmw,
     if (out->size == 0)
 	return Success;
 
-    xfree(out->extra_data);
+    free(out->extra_data);
     vmw_ioctl_buffer_unmap(vmw, out->buf);
     vmw_ioctl_buffer_destroy(vmw, out->buf);
 
diff --git a/src/gallium/tests/python/retrace/interpreter.py b/src/gallium/tests/python/retrace/interpreter.py
index 16132abb06d..954a701a53f 100755
--- a/src/gallium/tests/python/retrace/interpreter.py
+++ b/src/gallium/tests/python/retrace/interpreter.py
@@ -610,6 +610,15 @@ class Context(Object):
             _rgba[i] = rgba[i]
         self.real.clear(buffers, _rgba, depth, stencil)
         
+    def clear_render_target(self, dst, rgba, dstx, dsty, width, height):
+        _rgba = gallium.FloatArray(4)
+        for i in range(4):
+            _rgba[i] = rgba[i]
+        self.real.clear_render_target(dst, _rgba, dstx, dsty, width, height)
+
+    def clear_depth_stencil(self, dst, clear_flags, depth, stencil, dstx, dsty, width, height):
+        self.real.clear_depth_stencil(dst, clear_flags, depth, stencil, dstx, dsty, width, height)
+
     def _present(self):
         self.real.flush()
     
diff --git a/src/gallium/tests/python/tests/regress/fragment-shader/frag-face.sh b/src/gallium/tests/python/tests/regress/fragment-shader/frag-face.sh
new file mode 100644
index 00000000000..5745b6a5aba
--- /dev/null
+++ b/src/gallium/tests/python/tests/regress/fragment-shader/frag-face.sh
@@ -0,0 +1,14 @@
+FRAG
+
+DCL IN[0], COLOR, LINEAR
+DCL IN[1], FACE, CONSTANT
+DCL OUT[0], COLOR
+DCL TEMP[0]
+IMM FLT32 { 0.5, 1.0, 0.0, 0.0 }
+
+MUL TEMP[0], IN[1].xxxx, IMM[0].xxxx
+ADD TEMP[0], TEMP[0], IMM[0].yyyy
+
+MOV OUT[0], TEMP[0]
+
+END
diff --git a/src/gallium/winsys/r600/drm/evergreen_hw_context.c b/src/gallium/winsys/r600/drm/evergreen_hw_context.c
index 1355b079450..7f21b53ace0 100644
--- a/src/gallium/winsys/r600/drm/evergreen_hw_context.c
+++ b/src/gallium/winsys/r600/drm/evergreen_hw_context.c
@@ -613,6 +613,13 @@ int evergreen_context_init(struct r600_context *ctx, struct radeon *radeon)
 		r = -ENOMEM;
 		goto out_err;
 	}
+	/* save 16dwords space for fence mecanism */
+	ctx->pm4_ndwords -= 16;
+
+	r = r600_context_init_fence(ctx);
+	if (r) {
+		goto out_err;
+	}
 
 	/* init dirty list */
 	LIST_INITHEAD(&ctx->dirty);
@@ -633,7 +640,7 @@ static inline void evergreen_context_pipe_state_set_resource(struct r600_context
 		block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY);
 		r600_bo_reference(ctx->radeon, &block->reloc[1].bo, NULL);
 		r600_bo_reference(ctx->radeon , &block->reloc[2].bo, NULL);
-		LIST_DEL(&block->list);
+		LIST_DELINIT(&block->list);
 		return;
 	}
 	block->reg[0] = state->regs[0].value;
@@ -688,7 +695,7 @@ static inline void evergreen_context_pipe_state_set_sampler(struct r600_context
 	block = range->blocks[CTX_BLOCK_ID(ctx, offset)];
 	if (state == NULL) {
 		block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY);
-		LIST_DEL(&block->list);
+		LIST_DELINIT(&block->list);
 		return;
 	}
 	block->reg[0] = state->regs[0].value;
@@ -712,7 +719,7 @@ static inline void evergreen_context_pipe_state_set_sampler_border(struct r600_c
 	block = range->blocks[CTX_BLOCK_ID(ctx, fake_offset)];
 	if (state == NULL) {
 		block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY);
-		LIST_DEL(&block->list);
+		LIST_DELINIT(&block->list);
 		return;
 	}
 	if (state->nregs <= 3) {
@@ -755,7 +762,8 @@ void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *dr
 	struct r600_bo *cb[12];
 	struct r600_bo *db;
 	unsigned ndwords = 9, flush;
-	struct r600_block *dirty_block;
+	struct r600_block *dirty_block = NULL;
+	struct r600_block *next_block;
 
 	if (draw->indices) {
 		ndwords = 13;
@@ -810,10 +818,9 @@ void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *dr
 	}
 
 	/* enough room to copy packet */
-	LIST_FOR_EACH_ENTRY(dirty_block,&ctx->dirty,list) {
+	LIST_FOR_EACH_ENTRY_SAFE(dirty_block, next_block, &ctx->dirty,list) {
 		r600_context_block_emit_dirty(ctx, dirty_block);
 	}
-	LIST_INITHEAD(&ctx->dirty);
 
 	/* draw packet */
 	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_INDEX_TYPE, 0);
diff --git a/src/gallium/winsys/r600/drm/r600.c b/src/gallium/winsys/r600/drm/r600.c
index 496547ca994..0a4d2e791db 100644
--- a/src/gallium/winsys/r600/drm/r600.c
+++ b/src/gallium/winsys/r600/drm/r600.c
@@ -40,6 +40,11 @@ enum chip_class r600_get_family_class(struct radeon *radeon)
 	return radeon->chip_class;
 }
 
+struct r600_tiling_info *r600_get_tiling_info(struct radeon *radeon)
+{
+	return &radeon->tiling_info;
+}
+
 static int r600_get_device(struct radeon *r600)
 {
 	struct drm_radeon_info info;
diff --git a/src/gallium/winsys/r600/drm/r600_bo.c b/src/gallium/winsys/r600/drm/r600_bo.c
index 9498f3a82ea..7d54ff18fc2 100644
--- a/src/gallium/winsys/r600/drm/r600_bo.c
+++ b/src/gallium/winsys/r600/drm/r600_bo.c
@@ -26,7 +26,9 @@
 #include <pipe/p_compiler.h>
 #include <pipe/p_screen.h>
 #include <pipebuffer/pb_bufmgr.h>
+#include "radeon_drm.h"
 #include "r600_priv.h"
+#include "r600d.h"
 
 struct r600_bo *r600_bo(struct radeon *radeon,
 				  unsigned size, unsigned alignment, unsigned usage)
@@ -55,7 +57,7 @@ struct r600_bo *r600_bo(struct radeon *radeon,
 }
 
 struct r600_bo *r600_bo_handle(struct radeon *radeon,
-					 unsigned handle)
+			       unsigned handle, unsigned *array_mode)
 {
 	struct r600_bo *ws_bo = calloc(1, sizeof(struct r600_bo));
 	struct radeon_bo *bo;
@@ -68,6 +70,20 @@ struct r600_bo *r600_bo_handle(struct radeon *radeon,
 	bo = radeon_bo_pb_get_bo(ws_bo->pb);
 	ws_bo->size = bo->size;
 	pipe_reference_init(&ws_bo->reference, 1);
+
+	radeon_bo_get_tiling_flags(radeon, bo, &ws_bo->tiling_flags,
+				   &ws_bo->kernel_pitch);
+	if (array_mode) {
+		if (ws_bo->tiling_flags) {
+			if (ws_bo->tiling_flags & RADEON_TILING_MICRO)
+				*array_mode = V_0280A0_ARRAY_1D_TILED_THIN1;
+			if ((ws_bo->tiling_flags & (RADEON_TILING_MICRO | RADEON_TILING_MACRO)) ==
+			    (RADEON_TILING_MICRO | RADEON_TILING_MACRO))
+				*array_mode = V_0280A0_ARRAY_2D_TILED_THIN1;
+		} else {
+			*array_mode = 0;
+		}
+	}
 	return ws_bo;
 }
 
diff --git a/src/gallium/winsys/r600/drm/r600_drm.c b/src/gallium/winsys/r600/drm/r600_drm.c
index 5f175a4df98..60c2f51fac0 100644
--- a/src/gallium/winsys/r600/drm/r600_drm.c
+++ b/src/gallium/winsys/r600/drm/r600_drm.c
@@ -37,6 +37,9 @@
 #include "xf86drm.h"
 #include "radeon_drm.h"
 
+#ifndef RADEON_INFO_TILING_CONFIG
+#define RADEON_INFO_TILING_CONFIG 0x6
+#endif
 static int radeon_get_device(struct radeon *radeon)
 {
 	struct drm_radeon_info info;
@@ -50,6 +53,61 @@ static int radeon_get_device(struct radeon *radeon)
 	return r;
 }
 
+static int radeon_drm_get_tiling(struct radeon *radeon)
+{
+	struct drm_radeon_info info;
+	int r;
+	uint32_t tiling_config;
+
+	info.request = RADEON_INFO_TILING_CONFIG;
+	info.value = (uintptr_t)&tiling_config;
+	r = drmCommandWriteRead(radeon->fd, DRM_RADEON_INFO, &info,
+				sizeof(struct drm_radeon_info));
+
+	if (r)
+		return 0;
+
+	switch ((tiling_config & 0xe) >> 1) {
+	case 0:
+		radeon->tiling_info.num_channels = 1;
+		break;
+	case 1:
+		radeon->tiling_info.num_channels = 2;
+		break;
+	case 2:
+		radeon->tiling_info.num_channels = 4;
+		break;
+	case 3:
+		radeon->tiling_info.num_channels = 8;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	switch ((tiling_config & 0x30) >> 4) {
+	case 0:
+		radeon->tiling_info.num_banks = 4;
+		break;
+	case 1:
+		radeon->tiling_info.num_banks = 8;
+		break;
+	default:
+		return -EINVAL;
+
+	}
+	switch ((tiling_config & 0xc0) >> 6) {
+	case 0:
+		radeon->tiling_info.group_bytes = 256;
+		break;
+	case 1:
+		radeon->tiling_info.group_bytes = 512;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
 struct radeon *radeon_new(int fd, unsigned device)
 {
 	struct radeon *radeon;
@@ -157,6 +215,10 @@ struct radeon *radeon_new(int fd, unsigned device)
 		break;
 	}
 
+	if (radeon->chip_class == R600 || radeon->chip_class == R700) {
+		if (radeon_drm_get_tiling(radeon))
+			return NULL;
+	}
 	radeon->kman = radeon_bo_pbmgr_create(radeon);
 	if (!radeon->kman)
 		return NULL;
@@ -179,9 +241,15 @@ struct radeon *radeon_decref(struct radeon *radeon)
 		return NULL;
 	}
 
-	radeon->cman->destroy(radeon->cman);
-	radeon->kman->destroy(radeon->kman);
-	drmClose(radeon->fd);
+        if (radeon->cman)
+           radeon->cman->destroy(radeon->cman);
+
+        if (radeon->kman)
+           radeon->kman->destroy(radeon->kman);
+
+        if (radeon->fd >= 0)
+           drmClose(radeon->fd);
+
 	free(radeon);
 	return NULL;
 }
diff --git a/src/gallium/winsys/r600/drm/r600_hw_context.c b/src/gallium/winsys/r600/drm/r600_hw_context.c
index b379499f060..2521ff96473 100644
--- a/src/gallium/winsys/r600/drm/r600_hw_context.c
+++ b/src/gallium/winsys/r600/drm/r600_hw_context.c
@@ -41,6 +41,45 @@
 
 #define GROUP_FORCE_NEW_BLOCK	0
 
+int r600_context_init_fence(struct r600_context *ctx)
+{
+	ctx->fence = 1;
+	ctx->fence_bo = r600_bo(ctx->radeon, 4096, 0, 0);
+	if (ctx->fence_bo == NULL) {
+		return -ENOMEM;
+	}
+	ctx->cfence = r600_bo_map(ctx->radeon, ctx->fence_bo, PB_USAGE_UNSYNCHRONIZED, NULL);
+	*ctx->cfence = 0;
+	LIST_INITHEAD(&ctx->fenced_bo);
+	return 0;
+}
+
+static void INLINE r600_context_update_fenced_list(struct r600_context *ctx)
+{
+	for (int i = 0; i < ctx->creloc; i++) {
+		if (!LIST_IS_EMPTY(&ctx->bo[i]->fencedlist))
+			LIST_DELINIT(&ctx->bo[i]->fencedlist);
+		LIST_ADDTAIL(&ctx->bo[i]->fencedlist, &ctx->fenced_bo);
+		ctx->bo[i]->fence = ctx->fence;
+		ctx->bo[i]->ctx = ctx;
+	}
+}
+
+static void INLINE r600_context_fence_wraparound(struct r600_context *ctx, unsigned fence)
+{
+	struct radeon_bo *bo = NULL;
+	struct radeon_bo *tmp;
+
+	LIST_FOR_EACH_ENTRY_SAFE(bo, tmp, &ctx->fenced_bo, fencedlist) {
+		if (bo->fence <= *ctx->cfence) {
+			LIST_DELINIT(&bo->fencedlist);
+			bo->fence = 0;
+		} else {
+			bo->fence = fence;
+		}
+	}
+}
+
 int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg)
 {
 	struct r600_block *block;
@@ -87,6 +126,8 @@ int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg,
 		block->reg = &block->pm4[block->pm4_ndwords];
 		block->pm4_ndwords += n;
 		block->nreg = n;
+		LIST_INITHEAD(&block->list);
+
 		for (j = 0; j < n; j++) {
 			if (reg[i+j].need_bo) {
 				block->nbo++;
@@ -572,6 +613,9 @@ void r600_context_fini(struct r600_context *ctx)
 	}
 	free(ctx->reloc);
 	free(ctx->pm4);
+	if (ctx->fence_bo) {
+		r600_bo_reference(ctx->radeon, &ctx->fence_bo, NULL);
+	}
 	memset(ctx, 0, sizeof(struct r600_context));
 }
 
@@ -691,6 +735,13 @@ int r600_context_init(struct r600_context *ctx, struct radeon *radeon)
 		r = -ENOMEM;
 		goto out_err;
 	}
+	/* save 16dwords space for fence mecanism */
+	ctx->pm4_ndwords -= 16;
+
+	r = r600_context_init_fence(ctx);
+	if (r) {
+		goto out_err;
+	}
 
 	/* init dirty list */
 	LIST_INITHEAD(&ctx->dirty);
@@ -781,7 +832,7 @@ static inline void r600_context_pipe_state_set_resource(struct r600_context *ctx
 		block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY);
 		r600_bo_reference(ctx->radeon, &block->reloc[1].bo, NULL);
 		r600_bo_reference(ctx->radeon , &block->reloc[2].bo, NULL);
-		LIST_DEL(&block->list);
+		LIST_DELINIT(&block->list);
 		return;
 	}
 	block->reg[0] = state->regs[0].value;
@@ -835,7 +886,7 @@ static inline void r600_context_pipe_state_set_sampler(struct r600_context *ctx,
 	block = range->blocks[CTX_BLOCK_ID(ctx, offset)];
 	if (state == NULL) {
 		block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY);
-		LIST_DEL(&block->list);
+		LIST_DELINIT(&block->list);
 		return;
 	}
 	block->reg[0] = state->regs[0].value;
@@ -858,7 +909,7 @@ static inline void r600_context_pipe_state_set_sampler_border(struct r600_contex
 	block = range->blocks[CTX_BLOCK_ID(ctx, offset)];
 	if (state == NULL) {
 		block->status &= ~(R600_BLOCK_STATUS_ENABLED | R600_BLOCK_STATUS_DIRTY);
-		LIST_DEL(&block->list);
+		LIST_DELINIT(&block->list);
 		return;
 	}
 	if (state->nregs <= 3) {
@@ -917,7 +968,8 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw)
 	struct r600_bo *cb[8];
 	struct r600_bo *db;
 	unsigned ndwords = 9;
-	struct r600_block *dirty_block;
+	struct r600_block *dirty_block = NULL;
+	struct r600_block *next_block;
 
 	if (draw->indices) {
 		ndwords = 13;
@@ -970,10 +1022,9 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw)
 	}
 
 	/* enough room to copy packet */
-	LIST_FOR_EACH_ENTRY(dirty_block,&ctx->dirty,list) {
+	LIST_FOR_EACH_ENTRY_SAFE(dirty_block, next_block, &ctx->dirty,list) {
 		r600_context_block_emit_dirty(ctx, dirty_block);
 	}
-	LIST_INITHEAD(&ctx->dirty);
 
 	/* draw packet */
 	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_INDEX_TYPE, 0);
@@ -1019,6 +1070,7 @@ void r600_context_flush(struct r600_context *ctx)
 	struct drm_radeon_cs drmib;
 	struct drm_radeon_cs_chunk chunks[2];
 	uint64_t chunk_array[2];
+	unsigned fence;
 	int r;
 
 	if (!ctx->pm4_cdwords)
@@ -1028,6 +1080,18 @@ void r600_context_flush(struct r600_context *ctx)
 	r600_context_queries_suspend(ctx);
 
 	radeon_bo_pbmgr_flush_maps(ctx->radeon->kman);
+
+	/* emit fence */
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4);
+	ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT | (5 << 8);
+	ctx->pm4[ctx->pm4_cdwords++] = 0;
+	ctx->pm4[ctx->pm4_cdwords++] = (1 << 29) | (0 << 24);
+	ctx->pm4[ctx->pm4_cdwords++] = ctx->fence;
+	ctx->pm4[ctx->pm4_cdwords++] = 0;
+	ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0);
+	ctx->pm4[ctx->pm4_cdwords++] = 0;
+	r600_context_bo_reloc(ctx, &ctx->pm4[ctx->pm4_cdwords - 1], ctx->fence_bo);
+
 #if 1
 	/* emit cs */
 	drmib.num_chunks = 2;
@@ -1043,8 +1107,18 @@ void r600_context_flush(struct r600_context *ctx)
 	r = drmCommandWriteRead(ctx->radeon->fd, DRM_RADEON_CS, &drmib,
 				sizeof(struct drm_radeon_cs));
 #endif
+
+	r600_context_update_fenced_list(ctx);
+
+	fence = ctx->fence + 1;
+	if (fence < ctx->fence) {
+		/* wrap around */
+		fence = 1;
+		r600_context_fence_wraparound(ctx, fence);
+	}
+	ctx->fence = fence;
+
 	/* restart */
-	radeon_bo_fencelist(ctx->radeon, ctx->bo, ctx->creloc);
 	for (int i = 0; i < ctx->creloc; i++) {
 		ctx->bo[i]->reloc = NULL;
 		ctx->bo[i]->last_flush = 0;
@@ -1062,8 +1136,9 @@ void r600_context_flush(struct r600_context *ctx)
 	 */
 	for (int i = 0; i < ctx->nblocks; i++) {
 		if (ctx->blocks[i]->status & R600_BLOCK_STATUS_ENABLED) {
-			if(!(ctx->blocks[i]->status & R600_BLOCK_STATUS_DIRTY))
+			if(!(ctx->blocks[i]->status & R600_BLOCK_STATUS_DIRTY)) {
 				LIST_ADDTAIL(&ctx->blocks[i]->list,&ctx->dirty);
+			}
 			ctx->pm4_dirty_cdwords += ctx->blocks[i]->pm4_ndwords + ctx->blocks[i]->pm4_flush_ndwords;
 			ctx->blocks[i]->status |= R600_BLOCK_STATUS_DIRTY;
 		}
@@ -1243,7 +1318,7 @@ struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned
 void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query)
 {
 	r600_bo_reference(ctx->radeon, &query->buffer, NULL);
-	LIST_DEL(&query->list);
+	LIST_DELINIT(&query->list);
 	free(query);
 }
 
diff --git a/src/gallium/winsys/r600/drm/r600_priv.h b/src/gallium/winsys/r600/drm/r600_priv.h
index ea2cf347785..b5bd7bd92c1 100644
--- a/src/gallium/winsys/r600/drm/r600_priv.h
+++ b/src/gallium/winsys/r600/drm/r600_priv.h
@@ -42,6 +42,7 @@ struct radeon {
 	enum chip_class			chip_class;
 	struct pb_manager *kman; /* kernel bo manager */
 	struct pb_manager *cman; /* cached bo manager */
+	struct r600_tiling_info tiling_info;
 };
 
 struct radeon *r600_new(int fd, unsigned device);
@@ -64,9 +65,9 @@ struct radeon_bo {
 	unsigned			map_count;
 	void				*data;
 	struct list_head		fencedlist;
+	unsigned			fence;
+	struct r600_context		*ctx;
 	boolean				shared;
-	int64_t				last_busy;
-	boolean				set_busy;
 	struct r600_reloc		*reloc;
 	unsigned			reloc_id;
 	unsigned			last_flush;
@@ -76,6 +77,8 @@ struct r600_bo {
 	struct pipe_reference		reference;
 	struct pb_buffer		*pb;
 	unsigned			size;
+	unsigned			tiling_flags;
+	unsigned                        kernel_pitch;
 };
 
 
@@ -94,7 +97,10 @@ int radeon_bo_wait(struct radeon *radeon, struct radeon_bo *bo);
 int radeon_bo_busy(struct radeon *radeon, struct radeon_bo *bo, uint32_t *domain);
 void radeon_bo_pbmgr_flush_maps(struct pb_manager *_mgr);
 int radeon_bo_fencelist(struct radeon *radeon, struct radeon_bo **bolist, uint32_t num_bo);
-
+int radeon_bo_get_tiling_flags(struct radeon *radeon,
+			       struct radeon_bo *bo,
+			       uint32_t *tiling_flags,
+			       uint32_t *pitch);
 
 /* radeon_bo_pb.c */
 struct radeon_bo *radeon_bo_pb_get_bo(struct pb_buffer *_buf);
@@ -103,6 +109,7 @@ struct pb_buffer *radeon_bo_pb_create_buffer_from_handle(struct pb_manager *_mgr
 							 uint32_t handle);
 
 /* r600_hw_context.c */
+int r600_context_init_fence(struct r600_context *ctx);
 void r600_context_bo_reloc(struct r600_context *ctx, u32 *pm4, struct r600_bo *rbo);
 void r600_context_bo_flush(struct r600_context *ctx, unsigned flush_flags,
 				unsigned flush_mask, struct r600_bo *rbo);
@@ -161,6 +168,7 @@ static inline void r600_context_block_emit_dirty(struct r600_context *ctx, struc
 	memcpy(&ctx->pm4[ctx->pm4_cdwords], block->pm4, block->pm4_ndwords * 4);
 	ctx->pm4_cdwords += block->pm4_ndwords;
 	block->status ^= R600_BLOCK_STATUS_DIRTY;
+	LIST_DELINIT(&block->list);
 }
 
 static inline int radeon_bo_map(struct radeon *radeon, struct radeon_bo *bo)
diff --git a/src/gallium/winsys/r600/drm/r600d.h b/src/gallium/winsys/r600/drm/r600d.h
index ccc9ffaf8e3..d91f7737af3 100644
--- a/src/gallium/winsys/r600/drm/r600d.h
+++ b/src/gallium/winsys/r600/drm/r600d.h
@@ -91,6 +91,7 @@
 #define PKT3_SET_CTL_CONST                     0x6F
 #define PKT3_SURFACE_BASE_UPDATE               0x73
 
+#define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14
 #define EVENT_TYPE_ZPASS_DONE                  0x15
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
 
diff --git a/src/gallium/winsys/r600/drm/radeon_bo.c b/src/gallium/winsys/r600/drm/radeon_bo.c
index 42a23f0ab8f..9d664b7e537 100644
--- a/src/gallium/winsys/r600/drm/radeon_bo.c
+++ b/src/gallium/winsys/r600/drm/radeon_bo.c
@@ -32,7 +32,6 @@
 #include "r600_priv.h"
 #include "xf86drm.h"
 #include "radeon_drm.h"
-#include "util/u_time.h"
 
 static int radeon_bo_fixed_map(struct radeon *radeon, struct radeon_bo *bo)
 {
@@ -128,7 +127,6 @@ struct radeon_bo *radeon_bo(struct radeon *radeon, unsigned handle,
 	return bo;
 }
 
-
 static void radeon_bo_destroy(struct radeon *radeon, struct radeon_bo *bo)
 {
 	struct drm_gem_close args;
@@ -158,8 +156,14 @@ int radeon_bo_wait(struct radeon *radeon, struct radeon_bo *bo)
 	struct drm_radeon_gem_wait_idle args;
 	int ret;
 
-	if (LIST_IS_EMPTY(&bo->fencedlist) && !bo->shared)
+	if (!bo->fence && !bo->shared)
+		return 0;
+
+	if (bo->fence <= *bo->ctx->cfence) {
+		LIST_DELINIT(&bo->fencedlist);
+		bo->fence = 0;
 		return 0;
+	}
 
 	/* Zero out args to make valgrind happy */
 	memset(&args, 0, sizeof(args));
@@ -171,22 +175,20 @@ int radeon_bo_wait(struct radeon *radeon, struct radeon_bo *bo)
 	return ret;
 }
 
-#define BO_BUSY_BACKOFF 10000
-
 int radeon_bo_busy(struct radeon *radeon, struct radeon_bo *bo, uint32_t *domain)
 {
 	struct drm_radeon_gem_busy args;
 	int ret;
-	int64_t now;
-
-	now = os_time_get();
-	if (LIST_IS_EMPTY(&bo->fencedlist) && !bo->shared)
-		return 0;
-
-	if (bo->set_busy && (now - bo->last_busy < BO_BUSY_BACKOFF))
-		return -EBUSY;
 
-	bo->set_busy = FALSE;
+	if (!bo->shared) {
+		if (!bo->fence)
+			return 0;
+		if (bo->fence <= *bo->ctx->cfence) {
+			LIST_DELINIT(&bo->fencedlist);
+			bo->fence = 0;
+			return 0;
+		}
+	}
 
 	memset(&args, 0, sizeof(args));
 	args.handle = bo->handle;
@@ -195,37 +197,25 @@ int radeon_bo_busy(struct radeon *radeon, struct radeon_bo *bo, uint32_t *domain
 	ret = drmCommandWriteRead(radeon->fd, DRM_RADEON_GEM_BUSY,
 			&args, sizeof(args));
 
-	if (ret == 0) {
-		struct radeon_bo *entry, *tent;
-		LIST_FOR_EACH_ENTRY_SAFE(entry, tent, &bo->fencedlist, fencedlist) {
-			LIST_DELINIT(&entry->fencedlist);
-		}
-	} else {
-		bo->set_busy = TRUE;
-		bo->last_busy = now;
-	}
 	*domain = args.domain;
 	return ret;
 }
 
-int radeon_bo_fencelist(struct radeon *radeon, struct radeon_bo **bolist,
-			uint32_t num_bo)
+int radeon_bo_get_tiling_flags(struct radeon *radeon,
+			       struct radeon_bo *bo,
+			       uint32_t *tiling_flags,
+			       uint32_t *pitch)
 {
-	struct radeon_bo *first;
-	int i;
-
-	first = bolist[0];
-
-	if (!LIST_IS_EMPTY(&first->fencedlist))
-		LIST_DELINIT(&first->fencedlist);
-
-	LIST_INITHEAD(&first->fencedlist);
-
-	for (i = 1; i < num_bo; i++) {
-		if (!LIST_IS_EMPTY(&bolist[i]->fencedlist))
-			LIST_DELINIT(&bolist[i]->fencedlist);
-
-		LIST_ADDTAIL(&bolist[i]->fencedlist, &first->fencedlist);
-	}
-	return 0;
+	struct drm_radeon_gem_get_tiling args;
+	int ret;
+	
+	args.handle = bo->handle;
+	ret = drmCommandWriteRead(radeon->fd, DRM_RADEON_GEM_GET_TILING,
+				  &args, sizeof(args));
+	if (ret)
+		return ret;
+	
+	*tiling_flags = args.tiling_flags;
+	*pitch = args.pitch;
+	return ret;
 }
diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
index 3a76098b655..bc2623e7b77 100644
--- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
+++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
@@ -272,7 +272,7 @@ wsw_destroy(struct sw_winsys *ws)
 }
 
 struct sw_winsys *
-wrapper_sw_winsys_warp_pipe_screen(struct pipe_screen *screen)
+wrapper_sw_winsys_wrap_pipe_screen(struct pipe_screen *screen)
 {
    struct wrapper_sw_winsys *wsw = CALLOC_STRUCT(wrapper_sw_winsys);
 
@@ -304,3 +304,16 @@ err_free:
 err:
    return NULL;
 }
+
+struct pipe_screen *
+wrapper_sw_winsys_dewrap_pipe_screen(struct sw_winsys *ws)
+{
+   struct wrapper_sw_winsys *wsw = wrapper_sw_winsys(ws);
+   struct pipe_screen *screen = wsw->screen;
+
+   wsw->pipe->destroy(wsw->pipe);
+   /* don't destroy the screen its needed later on */
+
+   FREE(wsw);
+   return screen;
+}
diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.h b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.h
index b5c25a3c50f..ae0196c432c 100644
--- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.h
+++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.h
@@ -30,6 +30,15 @@
 struct sw_winsys;
 struct pipe_screen;
 
-struct sw_winsys *wrapper_sw_winsys_warp_pipe_screen(struct pipe_screen *screen);
+/*
+ * Wrap a pipe screen.
+ */
+struct sw_winsys *wrapper_sw_winsys_wrap_pipe_screen(struct pipe_screen *screen);
+
+/*
+ * Destroy the sw_winsys and return the wrapped pipe_screen.
+ * Not destroying it as sw_winsys::destroy does.
+ */
+struct pipe_screen *wrapper_sw_winsys_dewrap_pipe_screen(struct sw_winsys *sw_winsys);
 
 #endif
diff --git a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
index b78f537c125..3aef8daa423 100644
--- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
+++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
@@ -54,7 +54,7 @@ DEBUG_GET_ONCE_BOOL_OPTION(xlib_no_shm, "XLIB_NO_SHM", FALSE)
  * Display target for Xlib winsys.
  * Low-level OS/window system memory buffer
  */
-struct xm_displaytarget
+struct xlib_displaytarget
 {
    enum pipe_format format;
    unsigned width;
@@ -75,7 +75,7 @@ struct xm_displaytarget
    Drawable drawable;
 
    XShmSegmentInfo shminfo;
-   int shm;
+   Bool shm;  /** Using shared memory images? */
 };
 
 
@@ -85,19 +85,16 @@ struct xm_displaytarget
 struct xlib_sw_winsys
 {
    struct sw_winsys base;
-
-
-
    Display *display;
 };
 
 
 
 /** Cast wrapper */
-static INLINE struct xm_displaytarget *
-xm_displaytarget( struct sw_displaytarget *dt )
+static INLINE struct xlib_displaytarget *
+xlib_displaytarget(struct sw_displaytarget *dt)
 {
-   return (struct xm_displaytarget *)dt;
+   return (struct xlib_displaytarget *) dt;
 }
 
 
@@ -105,22 +102,23 @@ xm_displaytarget( struct sw_displaytarget *dt )
  * X Shared Memory Image extension code
  */
 
-static volatile int mesaXErrorFlag = 0;
+static volatile int XErrorFlag = 0;
 
 /**
  * Catches potential Xlib errors.
  */
 static int
-mesaHandleXError(Display *dpy, XErrorEvent *event)
+handle_xerror(Display *dpy, XErrorEvent *event)
 {
    (void) dpy;
    (void) event;
-   mesaXErrorFlag = 1;
+   XErrorFlag = 1;
    return 0;
 }
 
 
-static char *alloc_shm(struct xm_displaytarget *buf, unsigned size)
+static char *
+alloc_shm(struct xlib_displaytarget *buf, unsigned size)
 {
    XShmSegmentInfo *const shminfo = & buf->shminfo;
 
@@ -144,10 +142,10 @@ static char *alloc_shm(struct xm_displaytarget *buf, unsigned size)
 
 
 /**
- * Allocate a shared memory XImage back buffer for the given XMesaBuffer.
+ * Allocate a shared memory XImage back buffer for the given display target.
  */
 static void
-alloc_shm_ximage(struct xm_displaytarget *xm_dt,
+alloc_shm_ximage(struct xlib_displaytarget *xlib_dt,
                  struct xlib_drawable *xmb,
                  unsigned width, unsigned height)
 {
@@ -159,51 +157,54 @@ alloc_shm_ximage(struct xm_displaytarget *xm_dt,
     */
    int (*old_handler)(Display *, XErrorEvent *);
 
-   xm_dt->tempImage = XShmCreateImage(xm_dt->display,
+   xlib_dt->tempImage = XShmCreateImage(xlib_dt->display,
                                       xmb->visual,
                                       xmb->depth,
                                       ZPixmap,
                                       NULL,
-                                      &xm_dt->shminfo,
+                                      &xlib_dt->shminfo,
                                       width, height);
-   if (xm_dt->tempImage == NULL) {
-      xm_dt->shm = 0;
+   if (xlib_dt->tempImage == NULL) {
+      xlib_dt->shm = False;
       return;
    }
 
 
-   mesaXErrorFlag = 0;
-   old_handler = XSetErrorHandler(mesaHandleXError);
+   XErrorFlag = 0;
+   old_handler = XSetErrorHandler(handle_xerror);
    /* This may trigger the X protocol error we're ready to catch: */
-   XShmAttach(xm_dt->display, &xm_dt->shminfo);
-   XSync(xm_dt->display, False);
+   XShmAttach(xlib_dt->display, &xlib_dt->shminfo);
+   XSync(xlib_dt->display, False);
 
-   if (mesaXErrorFlag) {
+   if (XErrorFlag) {
       /* we are on a remote display, this error is normal, don't print it */
-      XFlush(xm_dt->display);
-      mesaXErrorFlag = 0;
-      XDestroyImage(xm_dt->tempImage);
-      xm_dt->tempImage = NULL;
-      xm_dt->shm = 0;
+      XFlush(xlib_dt->display);
+      XErrorFlag = 0;
+      XDestroyImage(xlib_dt->tempImage);
+      xlib_dt->tempImage = NULL;
+      xlib_dt->shm = False;
       (void) XSetErrorHandler(old_handler);
       return;
    }
 
-   xm_dt->shm = 1;
+   xlib_dt->shm = True;
 }
 
 
 static void
-alloc_ximage(struct xm_displaytarget *xm_dt,
+alloc_ximage(struct xlib_displaytarget *xlib_dt,
              struct xlib_drawable *xmb,
              unsigned width, unsigned height)
 {
-   if (xm_dt->shm) {
-      alloc_shm_ximage(xm_dt, xmb, width, height);
-      return;
+   /* try allocating a shared memory image first */
+   if (xlib_dt->shm) {
+      alloc_shm_ximage(xlib_dt, xmb, width, height);
+      if (xlib_dt->tempImage)
+         return; /* success */
    }
 
-   xm_dt->tempImage = XCreateImage(xm_dt->display,
+   /* try regular (non-shared memory) image */
+   xlib_dt->tempImage = XCreateImage(xlib_dt->display,
                                    xmb->visual,
                                    xmb->depth,
                                    ZPixmap, 0,
@@ -212,9 +213,9 @@ alloc_ximage(struct xm_displaytarget *xm_dt,
 }
 
 static boolean
-xm_is_displaytarget_format_supported( struct sw_winsys *ws,
-                                      unsigned tex_usage,
-                                      enum pipe_format format )
+xlib_is_displaytarget_format_supported(struct sw_winsys *ws,
+                                       unsigned tex_usage,
+                                       enum pipe_format format)
 {
    /* TODO: check visuals or other sensible thing here */
    return TRUE;
@@ -222,61 +223,67 @@ xm_is_displaytarget_format_supported( struct sw_winsys *ws,
 
 
 static void *
-xm_displaytarget_map(struct sw_winsys *ws,
-                     struct sw_displaytarget *dt,
-                     unsigned flags)
+xlib_displaytarget_map(struct sw_winsys *ws,
+                       struct sw_displaytarget *dt,
+                       unsigned flags)
 {
-   struct xm_displaytarget *xm_dt = xm_displaytarget(dt);
-   xm_dt->mapped = xm_dt->data;
-   return xm_dt->mapped;
+   struct xlib_displaytarget *xlib_dt = xlib_displaytarget(dt);
+   xlib_dt->mapped = xlib_dt->data;
+   return xlib_dt->mapped;
 }
 
+
 static void
-xm_displaytarget_unmap(struct sw_winsys *ws,
-                       struct sw_displaytarget *dt)
+xlib_displaytarget_unmap(struct sw_winsys *ws,
+                         struct sw_displaytarget *dt)
 {
-   struct xm_displaytarget *xm_dt = xm_displaytarget(dt);
-   xm_dt->mapped = NULL;
+   struct xlib_displaytarget *xlib_dt = xlib_displaytarget(dt);
+   xlib_dt->mapped = NULL;
 }
 
+
 static void
-xm_displaytarget_destroy(struct sw_winsys *ws,
-                         struct sw_displaytarget *dt)
+xlib_displaytarget_destroy(struct sw_winsys *ws,
+                           struct sw_displaytarget *dt)
 {
-   struct xm_displaytarget *xm_dt = xm_displaytarget(dt);
+   struct xlib_displaytarget *xlib_dt = xlib_displaytarget(dt);
 
-   if (xm_dt->data) {
-      if (xm_dt->shminfo.shmid >= 0) {
-         shmdt(xm_dt->shminfo.shmaddr);
-         shmctl(xm_dt->shminfo.shmid, IPC_RMID, 0);
+   if (xlib_dt->data) {
+      if (xlib_dt->shminfo.shmid >= 0) {
+         shmdt(xlib_dt->shminfo.shmaddr);
+         shmctl(xlib_dt->shminfo.shmid, IPC_RMID, 0);
          
-         xm_dt->shminfo.shmid = -1;
-         xm_dt->shminfo.shmaddr = (char *) -1;
+         xlib_dt->shminfo.shmid = -1;
+         xlib_dt->shminfo.shmaddr = (char *) -1;
+
+         xlib_dt->data = NULL;
+         if (xlib_dt->tempImage)
+            xlib_dt->tempImage->data = NULL;
       }
       else {
-         FREE(xm_dt->data);
-         if (xm_dt->tempImage && xm_dt->tempImage->data == xm_dt->data) {
-            xm_dt->tempImage->data = NULL;
+         FREE(xlib_dt->data);
+         if (xlib_dt->tempImage && xlib_dt->tempImage->data == xlib_dt->data) {
+            xlib_dt->tempImage->data = NULL;
          }
-         xm_dt->data = NULL;
+         xlib_dt->data = NULL;
       }
    }
 
-   if (xm_dt->tempImage) {
-      XDestroyImage(xm_dt->tempImage);
-      xm_dt->tempImage = NULL;
+   if (xlib_dt->tempImage) {
+      XDestroyImage(xlib_dt->tempImage);
+      xlib_dt->tempImage = NULL;
    }
 
-   if (xm_dt->gc)
-      XFreeGC(xm_dt->display, xm_dt->gc);
+   if (xlib_dt->gc)
+      XFreeGC(xlib_dt->display, xlib_dt->gc);
 
-   FREE(xm_dt);
+   FREE(xlib_dt);
 }
 
 
 /**
  * Display/copy the image in the surface into the X window specified
- * by the XMesaBuffer.
+ * by the display target.
  */
 static void
 xlib_sw_display(struct xlib_drawable *xlib_drawable,
@@ -284,8 +291,8 @@ xlib_sw_display(struct xlib_drawable *xlib_drawable,
 {
    static boolean no_swap = 0;
    static boolean firsttime = 1;
-   struct xm_displaytarget *xm_dt = xm_displaytarget(dt);
-   Display *display = xm_dt->display;
+   struct xlib_displaytarget *xlib_dt = xlib_displaytarget(dt);
+   Display *display = xlib_dt->display;
    XImage *ximage;
 
    if (firsttime) {
@@ -296,74 +303,74 @@ xlib_sw_display(struct xlib_drawable *xlib_drawable,
    if (no_swap)
       return;
 
-   if (xm_dt->drawable != xlib_drawable->drawable) {
-      if (xm_dt->gc) {
-         XFreeGC( display, xm_dt->gc );
-         xm_dt->gc = NULL;
+   if (xlib_dt->drawable != xlib_drawable->drawable) {
+      if (xlib_dt->gc) {
+         XFreeGC(display, xlib_dt->gc);
+         xlib_dt->gc = NULL;
       }
 
-      if (xm_dt->tempImage) {
-         XDestroyImage( xm_dt->tempImage );
-         xm_dt->tempImage = NULL;
+      if (xlib_dt->tempImage) {
+         XDestroyImage(xlib_dt->tempImage);
+         xlib_dt->tempImage = NULL;
       }
 
-      xm_dt->drawable = xlib_drawable->drawable;
+      xlib_dt->drawable = xlib_drawable->drawable;
    }
 
-   if (xm_dt->tempImage == NULL) {
-      assert(util_format_get_blockwidth(xm_dt->format) == 1);
-      assert(util_format_get_blockheight(xm_dt->format) == 1);
-      alloc_ximage(xm_dt, xlib_drawable,
-                   xm_dt->stride / util_format_get_blocksize(xm_dt->format),
-                   xm_dt->height);
-      if (!xm_dt->tempImage)
+   if (xlib_dt->tempImage == NULL) {
+      assert(util_format_get_blockwidth(xlib_dt->format) == 1);
+      assert(util_format_get_blockheight(xlib_dt->format) == 1);
+      alloc_ximage(xlib_dt, xlib_drawable,
+                   xlib_dt->stride / util_format_get_blocksize(xlib_dt->format),
+                   xlib_dt->height);
+      if (!xlib_dt->tempImage)
          return;
    }
 
-   if (xm_dt->gc == NULL) {
-      xm_dt->gc = XCreateGC( display, xlib_drawable->drawable, 0, NULL );
-      XSetFunction( display, xm_dt->gc, GXcopy );
+   if (xlib_dt->gc == NULL) {
+      xlib_dt->gc = XCreateGC(display, xlib_drawable->drawable, 0, NULL);
+      XSetFunction(display, xlib_dt->gc, GXcopy);
    }
 
-   if (xm_dt->shm)
-   {
-      ximage = xm_dt->tempImage;
-      ximage->data = xm_dt->data;
+   if (xlib_dt->shm) {
+      ximage = xlib_dt->tempImage;
+      ximage->data = xlib_dt->data;
 
       /* _debug_printf("XSHM\n"); */
-      XShmPutImage(xm_dt->display, xlib_drawable->drawable, xm_dt->gc,
-                   ximage, 0, 0, 0, 0, xm_dt->width, xm_dt->height, False);
+      XShmPutImage(xlib_dt->display, xlib_drawable->drawable, xlib_dt->gc,
+                   ximage, 0, 0, 0, 0, xlib_dt->width, xlib_dt->height, False);
    }
    else {
       /* display image in Window */
-      ximage = xm_dt->tempImage;
-      ximage->data = xm_dt->data;
+      ximage = xlib_dt->tempImage;
+      ximage->data = xlib_dt->data;
 
       /* check that the XImage has been previously initialized */
       assert(ximage->format);
       assert(ximage->bitmap_unit);
 
       /* update XImage's fields */
-      ximage->width = xm_dt->width;
-      ximage->height = xm_dt->height;
-      ximage->bytes_per_line = xm_dt->stride;
+      ximage->width = xlib_dt->width;
+      ximage->height = xlib_dt->height;
+      ximage->bytes_per_line = xlib_dt->stride;
 
       /* _debug_printf("XPUT\n"); */
-      XPutImage(xm_dt->display, xlib_drawable->drawable, xm_dt->gc,
-                ximage, 0, 0, 0, 0, xm_dt->width, xm_dt->height);
+      XPutImage(xlib_dt->display, xlib_drawable->drawable, xlib_dt->gc,
+                ximage, 0, 0, 0, 0, xlib_dt->width, xlib_dt->height);
    }
 
-   XFlush(xm_dt->display);
+   XFlush(xlib_dt->display);
 }
 
+
 /**
  * Display/copy the image in the surface into the X window specified
- * by the XMesaBuffer.
+ * by the display target.
  */
 static void
-xm_displaytarget_display(struct sw_winsys *ws,
-                         struct sw_displaytarget *dt,
-                         void *context_private)
+xlib_displaytarget_display(struct sw_winsys *ws,
+                           struct sw_displaytarget *dt,
+                           void *context_private)
 {
    struct xlib_drawable *xlib_drawable = (struct xlib_drawable *)context_private;
    xlib_sw_display(xlib_drawable, dt);
@@ -371,57 +378,57 @@ xm_displaytarget_display(struct sw_winsys *ws,
 
 
 static struct sw_displaytarget *
-xm_displaytarget_create(struct sw_winsys *winsys,
-                        unsigned tex_usage,
-                        enum pipe_format format,
-                        unsigned width, unsigned height,
-                        unsigned alignment,
-                        unsigned *stride)
+xlib_displaytarget_create(struct sw_winsys *winsys,
+                          unsigned tex_usage,
+                          enum pipe_format format,
+                          unsigned width, unsigned height,
+                          unsigned alignment,
+                          unsigned *stride)
 {
-   struct xm_displaytarget *xm_dt;
+   struct xlib_displaytarget *xlib_dt;
    unsigned nblocksy, size;
 
-   xm_dt = CALLOC_STRUCT(xm_displaytarget);
-   if(!xm_dt)
-      goto no_xm_dt;
+   xlib_dt = CALLOC_STRUCT(xlib_displaytarget);
+   if (!xlib_dt)
+      goto no_xlib_dt;
 
-   xm_dt->display = ((struct xlib_sw_winsys *)winsys)->display;
-   xm_dt->format = format;
-   xm_dt->width = width;
-   xm_dt->height = height;
+   xlib_dt->display = ((struct xlib_sw_winsys *)winsys)->display;
+   xlib_dt->format = format;
+   xlib_dt->width = width;
+   xlib_dt->height = height;
 
    nblocksy = util_format_get_nblocksy(format, height);
-   xm_dt->stride = align(util_format_get_stride(format, width), alignment);
-   size = xm_dt->stride * nblocksy;
+   xlib_dt->stride = align(util_format_get_stride(format, width), alignment);
+   size = xlib_dt->stride * nblocksy;
 
    if (!debug_get_option_xlib_no_shm()) {
-      xm_dt->data = alloc_shm(xm_dt, size);
-      if (xm_dt->data) {
-         xm_dt->shm = TRUE;
+      xlib_dt->data = alloc_shm(xlib_dt, size);
+      if (xlib_dt->data) {
+         xlib_dt->shm = True;
       }
    }
 
-   if(!xm_dt->data) {
-      xm_dt->data = align_malloc(size, alignment);
-      if(!xm_dt->data)
+   if (!xlib_dt->data) {
+      xlib_dt->data = align_malloc(size, alignment);
+      if (!xlib_dt->data)
          goto no_data;
    }
 
-   *stride = xm_dt->stride;
-   return (struct sw_displaytarget *)xm_dt;
+   *stride = xlib_dt->stride;
+   return (struct sw_displaytarget *)xlib_dt;
 
 no_data:
-   FREE(xm_dt);
-no_xm_dt:
+   FREE(xlib_dt);
+no_xlib_dt:
    return NULL;
 }
 
 
 static struct sw_displaytarget *
-xm_displaytarget_from_handle(struct sw_winsys *winsys,
-                             const struct pipe_resource *templet,
-                             struct winsys_handle *whandle,
-                             unsigned *stride)
+xlib_displaytarget_from_handle(struct sw_winsys *winsys,
+                               const struct pipe_resource *templet,
+                               struct winsys_handle *whandle,
+                               unsigned *stride)
 {
    assert(0);
    return NULL;
@@ -429,9 +436,9 @@ xm_displaytarget_from_handle(struct sw_winsys *winsys,
 
 
 static boolean
-xm_displaytarget_get_handle(struct sw_winsys *winsys,
-                            struct sw_displaytarget *dt,
-                            struct winsys_handle *whandle)
+xlib_displaytarget_get_handle(struct sw_winsys *winsys,
+                              struct sw_displaytarget *dt,
+                              struct winsys_handle *whandle)
 {
    assert(0);
    return FALSE;
@@ -439,14 +446,14 @@ xm_displaytarget_get_handle(struct sw_winsys *winsys,
 
 
 static void
-xm_destroy( struct sw_winsys *ws )
+xlib_destroy(struct sw_winsys *ws)
 {
    FREE(ws);
 }
 
 
 struct sw_winsys *
-xlib_create_sw_winsys( Display *display )
+xlib_create_sw_winsys(Display *display)
 {
    struct xlib_sw_winsys *ws;
 
@@ -455,19 +462,18 @@ xlib_create_sw_winsys( Display *display )
       return NULL;
 
    ws->display = display;
-   ws->base.destroy = xm_destroy;
+   ws->base.destroy = xlib_destroy;
 
-   ws->base.is_displaytarget_format_supported = xm_is_displaytarget_format_supported;
+   ws->base.is_displaytarget_format_supported = xlib_is_displaytarget_format_supported;
 
-   ws->base.displaytarget_create = xm_displaytarget_create;
-   ws->base.displaytarget_from_handle = xm_displaytarget_from_handle;
-   ws->base.displaytarget_get_handle = xm_displaytarget_get_handle;
-   ws->base.displaytarget_map = xm_displaytarget_map;
-   ws->base.displaytarget_unmap = xm_displaytarget_unmap;
-   ws->base.displaytarget_destroy = xm_displaytarget_destroy;
+   ws->base.displaytarget_create = xlib_displaytarget_create;
+   ws->base.displaytarget_from_handle = xlib_displaytarget_from_handle;
+   ws->base.displaytarget_get_handle = xlib_displaytarget_get_handle;
+   ws->base.displaytarget_map = xlib_displaytarget_map;
+   ws->base.displaytarget_unmap = xlib_displaytarget_unmap;
+   ws->base.displaytarget_destroy = xlib_displaytarget_destroy;
 
-   ws->base.displaytarget_display = xm_displaytarget_display;
+   ws->base.displaytarget_display = xlib_displaytarget_display;
 
    return &ws->base;
 }
-
author	Thomas Balling Sørensen <tball@tball-laptop.(none)>	2010-10-26 12:49:41 +0200
committer	Thomas Balling Sørensen <tball@tball-laptop.(none)>	2010-10-26 12:49:41 +0200
commit	dbf3a15313eed930a3d8fdde12e457259c43651b (patch)
tree	78bc54fa866ff1e97e61802f52dabe3f4f3380b1 /src/gallium
parent	1dccc4cfaa423f15ab582d2a0253a84a0ae0b9fa (diff)
parent	547e7619aac74ae13bdaa7fdf403a4ceb5212467 (diff)