49 files changed, 2563 insertions, 1157 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 05096b12a86..49ff1653e0e 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -173,6 +173,7 @@ GALLIVM_SOURCES = \
         gallivm/lp_bld_struct.c \
         gallivm/lp_bld_swizzle.c \
         gallivm/lp_bld_tgsi_aos.c \
+        gallivm/lp_bld_tgsi_info.c \
         gallivm/lp_bld_tgsi_soa.c \
         gallivm/lp_bld_type.c \
         draw/draw_llvm.c \
@@ -207,16 +208,16 @@ include ../Makefile.template
 
 
 indices/u_indices_gen.c: indices/u_indices_gen.py
-	python $< > $@
+	$(PYTHON2) $< > $@
 
 indices/u_unfilled_gen.c: indices/u_unfilled_gen.py
-	python $< > $@
+	$(PYTHON2) $< > $@
 
 util/u_format_srgb.c: util/u_format_srgb.py
-	python $< > $@
+	$(PYTHON2) $< > $@
 
 util/u_format_table.c: util/u_format_table.py util/u_format_pack.py util/u_format_parse.py util/u_format.csv
-	python util/u_format_table.py util/u_format.csv > $@
+	$(PYTHON2) util/u_format_table.py util/u_format.csv > $@
 
 util/u_half.c: util/u_half.py
-	python util/u_half.py > $@
+	$(PYTHON2) util/u_half.py > $@
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index a18f7c0b2a3..f22c8b96123 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -225,6 +225,7 @@ if env['llvm']:
     'gallivm/lp_bld_struct.c',
     'gallivm/lp_bld_swizzle.c',
     'gallivm/lp_bld_tgsi_aos.c',
+    'gallivm/lp_bld_tgsi_info.c',
     'gallivm/lp_bld_tgsi_soa.c',
     'gallivm/lp_bld_type.c',
     'draw/draw_llvm.c',
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 032fcbbc70a..39d82f32892 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -335,6 +335,7 @@ draw_set_mapped_constant_buffer(struct draw_context *draw,
    case PIPE_SHADER_VERTEX:
       draw->pt.user.vs_constants[slot] = buffer;
       draw->pt.user.vs_constants_size[slot] = size;
+      draw->pt.user.planes = (float (*) [12][4]) &(draw->plane[0]);
       draw_vs_set_constants(draw, slot, buffer, size);
       break;
    case PIPE_SHADER_GEOMETRY:
@@ -721,9 +722,9 @@ draw_set_mapped_texture(struct draw_context *draw,
                         unsigned sampler_idx,
                         uint32_t width, uint32_t height, uint32_t depth,
                         uint32_t last_level,
-                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        const void *data[DRAW_MAX_TEXTURE_LEVELS])
+                        uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        const void *data[PIPE_MAX_TEXTURE_LEVELS])
 {
 #ifdef HAVE_LLVM
    if(draw->llvm)
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 1f27cbf488a..ff4f753604f 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -49,7 +49,6 @@ struct draw_geometry_shader;
 struct draw_fragment_shader;
 struct tgsi_sampler;
 
-#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
 
 struct draw_context *draw_create( struct pipe_context *pipe );
 
@@ -120,9 +119,9 @@ draw_set_mapped_texture(struct draw_context *draw,
                         unsigned sampler_idx,
                         uint32_t width, uint32_t height, uint32_t depth,
                         uint32_t last_level,
-                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                        const void *data[DRAW_MAX_TEXTURE_LEVELS]);
+                        uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                        const void *data[PIPE_MAX_TEXTURE_LEVELS]);
 
 
 /*
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 7fb86d7cb27..140e596f994 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -31,6 +31,9 @@
 #include "draw_vs.h"
 
 #include "gallivm/lp_bld_arit.h"
+#include "gallivm/lp_bld_logic.h"
+#include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_swizzle.h"
 #include "gallivm/lp_bld_struct.h"
 #include "gallivm/lp_bld_type.h"
 #include "gallivm/lp_bld_flow.h"
@@ -43,7 +46,6 @@
 #include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_dump.h"
 
-#include "util/u_cpu_detect.h"
 #include "util/u_math.h"
 #include "util/u_pointer.h"
 #include "util/u_string.h"
@@ -72,12 +74,12 @@ init_globals(struct draw_llvm *llvm)
       elem_types[DRAW_JIT_TEXTURE_DEPTH] = LLVMInt32Type();
       elem_types[DRAW_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32Type();
       elem_types[DRAW_JIT_TEXTURE_ROW_STRIDE] =
-         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+         LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS);
       elem_types[DRAW_JIT_TEXTURE_IMG_STRIDE] =
-         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+         LLVMArrayType(LLVMInt32Type(), PIPE_MAX_TEXTURE_LEVELS);
       elem_types[DRAW_JIT_TEXTURE_DATA] =
          LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0),
-                       DRAW_MAX_TEXTURE_LEVELS);
+                       PIPE_MAX_TEXTURE_LEVELS);
       elem_types[DRAW_JIT_TEXTURE_MIN_LOD] = LLVMFloatType();
       elem_types[DRAW_JIT_TEXTURE_MAX_LOD] = LLVMFloatType();
       elem_types[DRAW_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType();
@@ -128,12 +130,14 @@ init_globals(struct draw_llvm *llvm)
 
    /* struct draw_jit_context */
    {
-      LLVMTypeRef elem_types[3];
+      LLVMTypeRef elem_types[5];
       LLVMTypeRef context_type;
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
-      elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
-      elem_types[2] = LLVMArrayType(texture_type,
+      elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* gs_constants */
+      elem_types[2] = LLVMPointerType(LLVMArrayType(LLVMArrayType(LLVMFloatType(), 4), 12), 0); /* planes */
+      elem_types[3] = LLVMPointerType(LLVMFloatType(), 0); /* viewport */
+      elem_types[4] = LLVMArrayType(texture_type,
                                     PIPE_MAX_VERTEX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
@@ -142,6 +146,8 @@ init_globals(struct draw_llvm *llvm)
                              llvm->target, context_type, 0);
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, gs_constants,
                              llvm->target, context_type, 1);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, planes,
+                             llvm->target, context_type, 2);
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, textures,
                              llvm->target, context_type,
                              DRAW_JIT_CTX_TEXTURES);
@@ -267,13 +273,7 @@ draw_llvm_create(struct draw_context *draw)
          LLVMAddConstantPropagationPass(llvm->pass);
       }
 
-      if(util_cpu_caps.has_sse4_1) {
-         /* FIXME: There is a bug in this pass, whereby the combination of fptosi
-          * and sitofp (necessary for trunc/floor/ceil/round implementation)
-          * somehow becomes invalid code.
-          */
-         LLVMAddInstructionCombiningPass(llvm->pass);
-      }
+      LLVMAddInstructionCombiningPass(llvm->pass);
       LLVMAddGVNPass(llvm->pass);
    } else {
       /* We need at least this pass to prevent the backends to fail in
@@ -421,7 +421,7 @@ generate_fetch(LLVMBuilderRef builder,
                             "instance_divisor");
    }
 
-   /* limit index to min(inex, vb_max_index) */
+   /* limit index to min(index, vb_max_index) */
    cond = LLVMBuildICmp(builder, LLVMIntULE, index, vb_max_index, "");
    index = LLVMBuildSelect(builder, cond, index, vb_max_index, "");
 
@@ -550,19 +550,28 @@ static void
 store_aos(LLVMBuilderRef builder,
           LLVMValueRef io_ptr,
           LLVMValueRef index,
-          LLVMValueRef value)
+          LLVMValueRef value,
+          LLVMValueRef clipmask)
 {
    LLVMValueRef id_ptr = draw_jit_header_id(builder, io_ptr);
    LLVMValueRef data_ptr = draw_jit_header_data(builder, io_ptr);
    LLVMValueRef indices[3];
+   LLVMValueRef val, shift;
 
    indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
    indices[1] = index;
    indices[2] = LLVMConstInt(LLVMInt32Type(), 0, 0);
 
-   /* undefined vertex */
-   LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(),
-                                        0xffff, 0), id_ptr);
+   /* initialize vertex id:16 = 0xffff, pad:3 = 0, edgeflag:1 = 1 */
+   val = LLVMConstInt(LLVMInt32Type(), 0xffff1, 0); 
+   shift  = LLVMConstInt(LLVMInt32Type(), 12, 0);          
+   val = LLVMBuildShl(builder, val, shift, "");
+   /* add clipmask:12 */   
+   val = LLVMBuildOr(builder, val, clipmask, "");               
+
+   /* store vertex header */
+   LLVMBuildStore(builder, val, id_ptr);
+
 
 #if DEBUG_STORE
    lp_build_printf(builder, "    ---- %p storing attribute %d (io = %p)\n", data_ptr, index, io_ptr);
@@ -617,7 +626,8 @@ store_aos_array(LLVMBuilderRef builder,
                 LLVMValueRef io_ptr,
                 LLVMValueRef aos[NUM_CHANNELS],
                 int attrib,
-                int num_outputs)
+                int num_outputs,
+                LLVMValueRef clipmask)
 {
    LLVMValueRef attr_index = LLVMConstInt(LLVMInt32Type(), attrib, 0);
    LLVMValueRef ind0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
@@ -625,7 +635,8 @@ store_aos_array(LLVMBuilderRef builder,
    LLVMValueRef ind2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
    LLVMValueRef ind3 = LLVMConstInt(LLVMInt32Type(), 3, 0);
    LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
-
+   LLVMValueRef clipmask0, clipmask1, clipmask2, clipmask3;
+   
    debug_assert(NUM_CHANNELS == 4);
 
    io0_ptr = LLVMBuildGEP(builder, io_ptr,
@@ -637,21 +648,31 @@ store_aos_array(LLVMBuilderRef builder,
    io3_ptr = LLVMBuildGEP(builder, io_ptr,
                           &ind3, 1, "");
 
+   clipmask0 = LLVMBuildExtractElement(builder, clipmask,
+                                       ind0, "");
+   clipmask1 = LLVMBuildExtractElement(builder, clipmask,
+                                       ind1, "");
+   clipmask2 = LLVMBuildExtractElement(builder, clipmask,
+                                       ind2, "");
+   clipmask3 = LLVMBuildExtractElement(builder, clipmask,
+                                       ind3, "");
+
 #if DEBUG_STORE
-   lp_build_printf(builder, "   io = %p, indexes[%d, %d, %d, %d]\n",
-                   io_ptr, ind0, ind1, ind2, ind3);
+   lp_build_printf(builder, "io = %p, indexes[%d, %d, %d, %d]\n, clipmask0 = %x, clipmask1 = %x, clipmask2 = %x, clipmask3 = %x\n",
+                   io_ptr, ind0, ind1, ind2, ind3, clipmask0, clipmask1, clipmask2, clipmask3);
 #endif
-
-   store_aos(builder, io0_ptr, attr_index, aos[0]);
-   store_aos(builder, io1_ptr, attr_index, aos[1]);
-   store_aos(builder, io2_ptr, attr_index, aos[2]);
-   store_aos(builder, io3_ptr, attr_index, aos[3]);
+   /* store for each of the 4 vertices */
+   store_aos(builder, io0_ptr, attr_index, aos[0], clipmask0);
+   store_aos(builder, io1_ptr, attr_index, aos[1], clipmask1);
+   store_aos(builder, io2_ptr, attr_index, aos[2], clipmask2);
+   store_aos(builder, io3_ptr, attr_index, aos[3], clipmask3);
 }
 
 static void
 convert_to_aos(LLVMBuilderRef builder,
                LLVMValueRef io,
                LLVMValueRef (*outputs)[NUM_CHANNELS],
+               LLVMValueRef clipmask,
                int num_outputs,
                int max_vertices)
 {
@@ -680,13 +701,305 @@ convert_to_aos(LLVMBuilderRef builder,
                       io,
                       aos,
                       attrib,
-                      num_outputs);
+                      num_outputs,
+                      clipmask);
    }
 #if DEBUG_STORE
    lp_build_printf(builder, "   # storing end\n");
 #endif
 }
 
+/*
+ * Stores original vertex positions in clip coordinates: the x/y/z/w vectors loaded from
+ * outputs[0] are scattered into the clip field of the four vertex headers io_ptr[0..3].
+ * There is probably a more efficient way to do this, 4 floats at once rather than one by one.
+ */
+static void
+store_clip(LLVMBuilderRef builder,
+           LLVMValueRef io_ptr,           
+           LLVMValueRef (*outputs)[NUM_CHANNELS])
+{
+   LLVMValueRef out[4];
+   LLVMValueRef indices[2]; 
+   LLVMValueRef io0_ptr, io1_ptr, io2_ptr, io3_ptr;
+   LLVMValueRef clip_ptr0, clip_ptr1, clip_ptr2, clip_ptr3;
+   LLVMValueRef clip0_ptr, clip1_ptr, clip2_ptr, clip3_ptr;    
+   LLVMValueRef out0elem, out1elem, out2elem, out3elem;
+   int i;
+
+   LLVMValueRef ind0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   LLVMValueRef ind1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+   LLVMValueRef ind2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+   LLVMValueRef ind3 = LLVMConstInt(LLVMInt32Type(), 3, 0);
+   
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indices[1] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   
+   out[0] = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 x1 x2 x3*/
+   out[1] = LLVMBuildLoad(builder, outputs[0][1], ""); /*y0 y1 y2 y3*/
+   out[2] = LLVMBuildLoad(builder, outputs[0][2], ""); /*z0 z1 z2 z3*/
+   out[3] = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/  
+
+   io0_ptr = LLVMBuildGEP(builder, io_ptr, &ind0, 1, "");
+   io1_ptr = LLVMBuildGEP(builder, io_ptr, &ind1, 1, "");
+   io2_ptr = LLVMBuildGEP(builder, io_ptr, &ind2, 1, "");
+   io3_ptr = LLVMBuildGEP(builder, io_ptr, &ind3, 1, "");
+
+   clip_ptr0 = draw_jit_header_clip(builder, io0_ptr);
+   clip_ptr1 = draw_jit_header_clip(builder, io1_ptr);
+   clip_ptr2 = draw_jit_header_clip(builder, io2_ptr);
+   clip_ptr3 = draw_jit_header_clip(builder, io3_ptr);
+
+   for (i = 0; i<4; i++){
+      clip0_ptr = LLVMBuildGEP(builder, clip_ptr0,
+                               indices, 2, ""); // &clip[i] of vertex 0
+      clip1_ptr = LLVMBuildGEP(builder, clip_ptr1,
+                               indices, 2, ""); // &clip[i] of vertex 1
+      clip2_ptr = LLVMBuildGEP(builder, clip_ptr2,
+                               indices, 2, ""); // &clip[i] of vertex 2
+      clip3_ptr = LLVMBuildGEP(builder, clip_ptr3,
+                               indices, 2, ""); // &clip[i] of vertex 3
+
+      out0elem = LLVMBuildExtractElement(builder, out[i],
+                                         ind0, ""); // lane 0 of out[i] (vertex 0)
+      out1elem = LLVMBuildExtractElement(builder, out[i],
+                                         ind1, ""); // lane 1 of out[i] (vertex 1)
+      out2elem = LLVMBuildExtractElement(builder, out[i],
+                                         ind2, ""); // lane 2 of out[i] (vertex 2)
+      out3elem = LLVMBuildExtractElement(builder, out[i],
+                                         ind3, ""); // lane 3 of out[i] (vertex 3)
+  
+      LLVMBuildStore(builder, out0elem, clip0_ptr);
+      LLVMBuildStore(builder, out1elem, clip1_ptr);
+      LLVMBuildStore(builder, out2elem, clip2_ptr);
+      LLVMBuildStore(builder, out3elem, clip3_ptr);
+
+      indices[1]= LLVMBuildAdd(builder, indices[1], ind1, ""); /* step to the next component */
+   }
+
+}
+
+/* Equivalent of _mm_set1_ps(a): inserts the scalar a into all four lanes of a <4 x float>.
+ * name is given only to the last insertelement, i.e. to the value that is returned. */
+static LLVMValueRef vec4f_from_scalar(LLVMBuilderRef bld,
+				      LLVMValueRef a,
+				      const char *name)
+{
+   LLVMValueRef res = LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4));
+   int i;
+
+   for(i = 0; i < 4; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      res = LLVMBuildInsertElement(bld, res, a, index, i == 3 ? name : "");
+   }
+
+   return res;
+}
+
+/*
+ * Transforms outputs[0] in place for viewport mapping: w becomes 1/w and each of x,y,z becomes (v * 1/w) * vp[i] + vp[i+4]. llvm is unused.
+ */
+static void
+generate_viewport(struct draw_llvm *llvm,
+                  LLVMBuilderRef builder,
+                  LLVMValueRef (*outputs)[NUM_CHANNELS],
+                  LLVMValueRef context_ptr)
+{
+   int i;
+   struct lp_type f32_type = lp_type_float_vec(32);
+   LLVMValueRef out3 = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/   
+   LLVMValueRef const1 = lp_build_const_vec(f32_type, 1.0);       /*1.0 1.0 1.0 1.0*/ 
+   LLVMValueRef vp_ptr = draw_jit_context_viewport(builder, context_ptr);
+
+   /* for 1/w convention: out3 holds 1/w from here on and is written back as the w output */
+   out3 = LLVMBuildFDiv(builder, const1, out3, "");
+   LLVMBuildStore(builder, out3, outputs[0][3]);
+  
+   /* Viewport Mapping: vp_ptr[i] is used as the scale and vp_ptr[i+4] as the translation */
+   for (i=0; i<3; i++){
+      LLVMValueRef out = LLVMBuildLoad(builder, outputs[0][i], ""); /* component i of the 4 vertices */
+      LLVMValueRef scale;
+      LLVMValueRef trans;
+      LLVMValueRef scale_i;
+      LLVMValueRef trans_i;
+      LLVMValueRef index;
+      
+      index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      scale_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, "");
+
+      index = LLVMConstInt(LLVMInt32Type(), i+4, 0);
+      trans_i = LLVMBuildGEP(builder, vp_ptr, &index, 1, "");
+
+      scale = vec4f_from_scalar(builder, LLVMBuildLoad(builder, scale_i, ""), "scale");
+      trans = vec4f_from_scalar(builder, LLVMBuildLoad(builder, trans_i, ""), "trans");
+
+      /* divide by w (implemented as a multiply by the 1/w computed above) */
+      out = LLVMBuildFMul(builder, out, out3, "");
+      /* mult by scale */
+      out = LLVMBuildFMul(builder, out, scale, "");
+      /* add translation */
+      out = LLVMBuildFAdd(builder, out, trans, "");
+
+      /* store transformed outputs */
+      LLVMBuildStore(builder, out, outputs[0][i]);
+   }
+   
+}
+
+
+/*
+ * Returns clipmask as 4xi32 bitmask for the 4 vertices: bits 0-3 (clip_xy), 4-5 (clip_z), 6..nr-1 (clip_user, planes read via context_ptr)
+ */
+static LLVMValueRef 
+generate_clipmask(LLVMBuilderRef builder,
+                  LLVMValueRef (*outputs)[NUM_CHANNELS],
+                  boolean clip_xy,
+                  boolean clip_z,
+                  boolean clip_user,
+                  boolean clip_halfz,
+                  unsigned nr,
+                  LLVMValueRef context_ptr)
+{
+   LLVMValueRef mask; /* stores the <4xi32> clipmasks */     
+   LLVMValueRef test, temp; 
+   LLVMValueRef zero, shift;
+   LLVMValueRef pos_x, pos_y, pos_z, pos_w;
+   LLVMValueRef plane1, planes, plane_ptr, sum;
+
+   unsigned i;
+
+   struct lp_type f32_type = lp_type_float_vec(32); 
+
+   mask = lp_build_const_int_vec(lp_type_int_vec(32), 0);
+   temp = lp_build_const_int_vec(lp_type_int_vec(32), 0);
+   zero = lp_build_const_vec(f32_type, 0);                    /* 0.0f 0.0f 0.0f 0.0f */
+   shift = lp_build_const_int_vec(lp_type_int_vec(32), 1);    /* 1 1 1 1 */
+
+   /* Assuming position stored at output[0] */
+   pos_x = LLVMBuildLoad(builder, outputs[0][0], ""); /*x0 x1 x2 x3*/
+   pos_y = LLVMBuildLoad(builder, outputs[0][1], ""); /*y0 y1 y2 y3*/
+   pos_z = LLVMBuildLoad(builder, outputs[0][2], ""); /*z0 z1 z2 z3*/
+   pos_w = LLVMBuildLoad(builder, outputs[0][3], ""); /*w0 w1 w2 w3*/   
+
+   /* Cliptest, for hardwired planes; temp carries the bit for the current plane */
+   if (clip_xy){
+      /* plane 1: x > w, bit 0 */
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_x , pos_w);
+      temp = shift;
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = test;
+   
+      /* plane 2: 0 > x + w, bit 1 */
+      test = LLVMBuildFAdd(builder, pos_x, pos_w, "");
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test);
+      temp = LLVMBuildShl(builder, temp, shift, "");
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = LLVMBuildOr(builder, mask, test, "");
+   
+      /* plane 3: y > w, bit 2 */
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_y, pos_w);
+      temp = LLVMBuildShl(builder, temp, shift, "");
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = LLVMBuildOr(builder, mask, test, "");
+
+      /* plane 4: 0 > y + w, bit 3 */
+      test = LLVMBuildFAdd(builder, pos_y, pos_w, "");
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test);
+      temp = LLVMBuildShl(builder, temp, shift, "");
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = LLVMBuildOr(builder, mask, test, "");
+   }
+
+   if (clip_z){
+      temp = lp_build_const_int_vec(lp_type_int_vec(32), 16);
+      if (clip_halfz){
+         /* plane 5 (half-z variant): 0 > z, bit 4 */
+         test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, pos_z);
+         test = LLVMBuildAnd(builder, test, temp, ""); 
+         mask = LLVMBuildOr(builder, mask, test, "");
+      }  
+      else{
+         /* plane 5: 0 > z + w, bit 4 */
+         test = LLVMBuildFAdd(builder, pos_z, pos_w, "");
+         test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, test);
+         test = LLVMBuildAnd(builder, test, temp, ""); 
+         mask = LLVMBuildOr(builder, mask, test, "");
+      }
+      /* plane 6: z > w, bit 5 */
+      test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, pos_z, pos_w);
+      temp = LLVMBuildShl(builder, temp, shift, "");
+      test = LLVMBuildAnd(builder, test, temp, ""); 
+      mask = LLVMBuildOr(builder, mask, test, "");
+   }   
+
+   if (clip_user){
+      LLVMValueRef planes_ptr = draw_jit_context_planes(builder, context_ptr);
+      LLVMValueRef indices[3];
+      temp = lp_build_const_int_vec(lp_type_int_vec(32), 32);
+
+      /* userclip planes: temp is shifted before each use, so plane i (from 6) tests 0 > dot(plane, pos) into bit i */
+      for (i = 6; i < nr; i++) {
+         indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+         indices[1] = LLVMConstInt(LLVMInt32Type(), i, 0);
+
+         indices[2] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+         plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
+         plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_x");
+         planes = vec4f_from_scalar(builder, plane1, "plane4_x");
+         sum = LLVMBuildFMul(builder, planes, pos_x, "");
+
+         indices[2] = LLVMConstInt(LLVMInt32Type(), 1, 0);
+         plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
+         plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_y"); 
+         planes = vec4f_from_scalar(builder, plane1, "plane4_y");
+         test = LLVMBuildFMul(builder, planes, pos_y, "");
+         sum = LLVMBuildFAdd(builder, sum, test, "");
+         
+         indices[2] = LLVMConstInt(LLVMInt32Type(), 2, 0);
+         plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
+         plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_z"); 
+         planes = vec4f_from_scalar(builder, plane1, "plane4_z");
+         test = LLVMBuildFMul(builder, planes, pos_z, "");
+         sum = LLVMBuildFAdd(builder, sum, test, "");
+
+         indices[2] = LLVMConstInt(LLVMInt32Type(), 3, 0);
+         plane_ptr = LLVMBuildGEP(builder, planes_ptr, indices, 3, "");
+         plane1 = LLVMBuildLoad(builder, plane_ptr, "plane_w"); 
+         planes = vec4f_from_scalar(builder, plane1, "plane4_w");
+         test = LLVMBuildFMul(builder, planes, pos_w, "");
+         sum = LLVMBuildFAdd(builder, sum, test, "");
+
+         test = lp_build_compare(builder, f32_type, PIPE_FUNC_GREATER, zero, sum);
+         temp = LLVMBuildShl(builder, temp, shift, "");
+         test = LLVMBuildAnd(builder, test, temp, ""); 
+         mask = LLVMBuildOr(builder, mask, test, "");
+      }
+   }
+   return mask;
+}
+
+/*
+ * ORs the four lanes of clipmask into the value stored at ret_ptr, so it turns non-zero once any lane is non-zero.
+ * A zero/non-zero i32 value is used to represent the boolean; the function itself returns nothing.
+ */
+static void
+clipmask_bool(LLVMBuilderRef builder, 
+              LLVMValueRef clipmask,
+              LLVMValueRef ret_ptr)
+{
+   LLVMValueRef ret = LLVMBuildLoad(builder, ret_ptr, "");   
+   LLVMValueRef temp;
+   int i;
+
+   for (i=0; i<4; i++){   
+      temp = LLVMBuildExtractElement(builder, clipmask,
+                                     LLVMConstInt(LLVMInt32Type(), i, 0) , "");
+      ret = LLVMBuildOr(builder, ret, temp, "");
+   }
+   
+   LLVMBuildStore(builder, ret, ret_ptr);
+}
+
 static void
 draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 {
@@ -706,7 +1019,12 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    void *code;
    struct lp_build_sampler_soa *sampler = 0;
-
+   LLVMValueRef ret, ret_ptr;
+   boolean bypass_viewport = variant->key.bypass_viewport;
+   boolean enable_cliptest = variant->key.clip_xy || 
+                             variant->key.clip_z  ||
+                             variant->key.clip_user;
+   
    arg_types[0] = llvm->context_ptr_type;           /* context */
    arg_types[1] = llvm->vertex_header_ptr_type;     /* vertex_header */
    arg_types[2] = llvm->buffer_ptr_type;            /* vbuffers */
@@ -716,7 +1034,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    arg_types[6] = llvm->vb_ptr_type;                /* pipe_vertex_buffer's */
    arg_types[7] = LLVMInt32Type();                  /* instance_id */
 
-   func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
+   func_type = LLVMFunctionType(LLVMInt32Type(), arg_types, Elements(arg_types), 0);
 
    variant->function = LLVMAddFunction(llvm->module, "draw_llvm_shader", func_type);
    LLVMSetFunctionCallConv(variant->function, LLVMCCallConv);
@@ -756,6 +1074,10 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 
    step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
 
+   /* function will return non-zero i32 value if any clipped vertices */     
+   ret_ptr = lp_build_alloca(builder, LLVMInt32Type(), "");   
+   LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), ret_ptr);
+
    /* code generated texture sampling */
    sampler = draw_llvm_sampler_soa_create(
       draw_llvm_variant_key_samplers(&variant->key),
@@ -770,6 +1092,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
       LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
       LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS] = { { 0 } };
       LLVMValueRef io;
+      LLVMValueRef clipmask;   /* holds the clipmask value */
       const LLVMValueRef (*ptr_aos)[NUM_CHANNELS];
 
       io_itr = LLVMBuildSub(builder, lp_loop.counter, start, "");
@@ -806,10 +1129,37 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
                   context_ptr,
                   sampler);
 
-      convert_to_aos(builder, io, outputs,
+      /* store original positions in clip before further manipulation */
+      store_clip(builder, io, outputs);
+
+      /* do cliptest */
+      if (enable_cliptest){
+         /* compute the <4 x i32> clipmask for these 4 vertices */
+         clipmask = generate_clipmask(builder, outputs,
+                                      variant->key.clip_xy,
+                                      variant->key.clip_z, 
+                                      variant->key.clip_user,
+                                      variant->key.clip_halfz,
+                                      variant->key.nr_planes,
+                                      context_ptr);
+         /* return clipping boolean value for function */
+         clipmask_bool(builder, clipmask, ret_ptr);
+      }
+      else{
+         clipmask = lp_build_const_int_vec(lp_type_int_vec(32), 0);    
+      }
+      
+      /* do viewport mapping */
+      if (!bypass_viewport){
+         generate_viewport(llvm, builder, outputs, context_ptr);
+      }
+
+      /* store clipmask in vertex header and positions in data */
+      convert_to_aos(builder, io, outputs, clipmask,
                      draw->vs.vertex_shader->info.num_outputs,
                      max_vertices);
    }
+
    lp_build_loop_end_cond(builder, end, step, LLVMIntUGE, &lp_loop);
 
    sampler->destroy(sampler);
@@ -819,8 +1169,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0);
 #endif
 
-   LLVMBuildRetVoid(builder);
-
+   ret = LLVMBuildLoad(builder, ret_ptr,"");
+   LLVMBuildRet(builder, ret);
+      
    LLVMDisposeBuilder(builder);
 
    /*
@@ -870,7 +1221,12 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    LLVMValueRef fetch_max;
    void *code;
    struct lp_build_sampler_soa *sampler = 0;
-
+   LLVMValueRef ret, ret_ptr;
+   boolean bypass_viewport = variant->key.bypass_viewport;
+   boolean enable_cliptest = variant->key.clip_xy || 
+                             variant->key.clip_z  ||
+                             variant->key.clip_user;
+   
    arg_types[0] = llvm->context_ptr_type;               /* context */
    arg_types[1] = llvm->vertex_header_ptr_type;         /* vertex_header */
    arg_types[2] = llvm->buffer_ptr_type;                /* vbuffers */
@@ -880,10 +1236,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    arg_types[6] = llvm->vb_ptr_type;                    /* pipe_vertex_buffer's */
    arg_types[7] = LLVMInt32Type();                      /* instance_id */
 
-   func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
+   func_type = LLVMFunctionType(LLVMInt32Type(), arg_types, Elements(arg_types), 0);
 
-   variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts",
-                                            func_type);
+   variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", func_type);
    LLVMSetFunctionCallConv(variant->function_elts, LLVMCCallConv);
    for(i = 0; i < Elements(arg_types); ++i)
       if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
@@ -929,11 +1284,16 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
                             LLVMConstInt(LLVMInt32Type(), 1, 0),
                             "fetch_max");
 
+   /* function returns non-zero i32 value if any clipped vertices */
+   ret_ptr = lp_build_alloca(builder, LLVMInt32Type(), ""); 
+   LLVMBuildStore(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), ret_ptr);
+
    lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), &lp_loop);
    {
       LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
       LLVMValueRef aos_attribs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS] = { { 0 } };
       LLVMValueRef io;
+      LLVMValueRef clipmask;   /* holds the clipmask value */
       const LLVMValueRef (*ptr_aos)[NUM_CHANNELS];
 
       io_itr = lp_loop.counter;
@@ -980,10 +1340,40 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
                   context_ptr,
                   sampler);
 
-      convert_to_aos(builder, io, outputs,
+      /* store original positions in clip before further manipulation */
+      store_clip(builder, io, outputs);
+
+      /* do cliptest */
+      if (enable_cliptest){
+         /* compute the <4 x i32> clipmask for these 4 vertices */
+         clipmask = generate_clipmask(builder, outputs,
+                                      variant->key.clip_xy,
+                                      variant->key.clip_z, 
+                                      variant->key.clip_user,
+                                      variant->key.clip_halfz,
+                                      variant->key.nr_planes,
+                                      context_ptr);
+         /* return clipping boolean value for function */
+         clipmask_bool(builder, clipmask, ret_ptr);
+      }
+      else{
+         clipmask = lp_build_const_int_vec(lp_type_int_vec(32), 0);
+      }
+      
+      /* do viewport mapping */
+      if (!bypass_viewport){
+         generate_viewport(llvm, builder, outputs, context_ptr);
+      }
+
+      /* store clipmask in vertex header, 
+       * original positions in clip 
+       * and transformed positions in data 
+       */   
+      convert_to_aos(builder, io, outputs, clipmask,
                      draw->vs.vertex_shader->info.num_outputs,
                      max_vertices);
    }
+
    lp_build_loop_end_cond(builder, fetch_count, step, LLVMIntUGE, &lp_loop);
 
    sampler->destroy(sampler);
@@ -993,8 +1383,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0);
 #endif
 
-   LLVMBuildRetVoid(builder);
-
+   ret = LLVMBuildLoad(builder, ret_ptr,"");   
+   LLVMBuildRet(builder, ret);
+   
    LLVMDisposeBuilder(builder);
 
    /*
@@ -1038,6 +1429,16 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
     */
    key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements;
 
+   /* will have to rig this up properly later */
+   key->clip_xy = llvm->draw->clip_xy;
+   key->clip_z = llvm->draw->clip_z;
+   key->clip_user = llvm->draw->clip_user;
+   key->bypass_viewport = llvm->draw->identity_viewport;
+   key->clip_halfz = !llvm->draw->rasterizer->gl_rasterization_rules;
+   key->need_edgeflags = (llvm->draw->vs.edgeflag_output ? TRUE : FALSE);
+   key->nr_planes = llvm->draw->nr_planes;
+   key->pad = 0;
+
    /* All variants of this shader will have the same value for
     * nr_samplers.  Not yet trying to compact away holes in the
     * sampler array.
@@ -1066,9 +1467,9 @@ draw_llvm_set_mapped_texture(struct draw_context *draw,
                              unsigned sampler_idx,
                              uint32_t width, uint32_t height, uint32_t depth,
                              uint32_t last_level,
-                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             const void *data[DRAW_MAX_TEXTURE_LEVELS])
+                             uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             const void *data[PIPE_MAX_TEXTURE_LEVELS])
 {
    unsigned j;
    struct draw_jit_texture *jit_tex;
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index d0a68ae412d..c3c30c07c64 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -41,7 +41,6 @@
 #include <llvm-c/Target.h>
 #include <llvm-c/ExecutionEngine.h>
 
-#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
 
 struct draw_llvm;
 struct llvm_vertex_shader;
@@ -52,9 +51,9 @@ struct draw_jit_texture
    uint32_t height;
    uint32_t depth;
    uint32_t last_level;
-   uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS];
-   uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS];
-   const void *data[DRAW_MAX_TEXTURE_LEVELS];
+   uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS];
+   uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS];
+   const void *data[PIPE_MAX_TEXTURE_LEVELS];
    float min_lod;
    float max_lod;
    float lod_bias;
@@ -97,7 +96,8 @@ struct draw_jit_context
 {
    const float *vs_constants;
    const float *gs_constants;
-
+   float (*planes) [12][4];
+   float *viewport;
 
    struct draw_jit_texture textures[PIPE_MAX_VERTEX_SAMPLERS];
 };
@@ -109,18 +109,22 @@ struct draw_jit_context
 #define draw_jit_context_gs_constants(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 1, "gs_constants")
 
-#define DRAW_JIT_CTX_TEXTURES 2
+#define draw_jit_context_planes(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 2, "planes")
 
-#define draw_jit_context_textures(_builder, _ptr) \
-   lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CTX_TEXTURES, "textures")
+#define draw_jit_context_viewport(_builder, _ptr) \
+   lp_build_struct_get(_builder, _ptr, 3, "viewport")
 
+#define DRAW_JIT_CTX_TEXTURES 4
 
+#define draw_jit_context_textures(_builder, _ptr) \
+   lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CTX_TEXTURES, "textures")
 
 #define draw_jit_header_id(_builder, _ptr)              \
    lp_build_struct_get_ptr(_builder, _ptr, 0, "id")
 
 #define draw_jit_header_clip(_builder, _ptr) \
-   lp_build_struct_get(_builder, _ptr, 1, "clip")
+   lp_build_struct_get_ptr(_builder, _ptr, 1, "clip")
 
 #define draw_jit_header_data(_builder, _ptr)            \
    lp_build_struct_get_ptr(_builder, _ptr, 2, "data")
@@ -136,7 +140,7 @@ struct draw_jit_context
    lp_build_struct_get(_builder, _ptr, 2, "buffer_offset")
 
 
-typedef void
+typedef int
 (*draw_jit_vert_func)(struct draw_jit_context *context,
                       struct vertex_header *io,
                       const char *vbuffers[PIPE_MAX_ATTRIBS],
@@ -147,7 +151,7 @@ typedef void
                       unsigned instance_id);
 
 
-typedef void
+typedef int
 (*draw_jit_vert_func_elts)(struct draw_jit_context *context,
                            struct vertex_header *io,
                            const char *vbuffers[PIPE_MAX_ATTRIBS],
@@ -159,8 +163,16 @@ typedef void
 
 struct draw_llvm_variant_key
 {
-   unsigned nr_vertex_elements:16;
-   unsigned nr_samplers:16;
+   unsigned nr_vertex_elements:8;
+   unsigned nr_samplers:8;
+   unsigned clip_xy:1;
+   unsigned clip_z:1;
+   unsigned clip_user:1;
+   unsigned clip_halfz:1;
+   unsigned bypass_viewport:1;
+   unsigned need_edgeflags:1;
+   unsigned nr_planes:4;
+   unsigned pad:6;
 
    /* Variable number of vertex elements:
     */
@@ -290,8 +302,8 @@ draw_llvm_set_mapped_texture(struct draw_context *draw,
                              unsigned sampler_idx,
                              uint32_t width, uint32_t height, uint32_t depth,
                              uint32_t last_level,
-                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
-                             const void *data[DRAW_MAX_TEXTURE_LEVELS]);
+                             uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS],
+                             const void *data[PIPE_MAX_TEXTURE_LEVELS]);
 
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index d417f825a0f..54163d7f9eb 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -169,6 +169,9 @@ struct draw_context
          unsigned vs_constants_size[PIPE_MAX_CONSTANT_BUFFERS];
          const void *gs_constants[PIPE_MAX_CONSTANT_BUFFERS];
          unsigned gs_constants_size[PIPE_MAX_CONSTANT_BUFFERS];
+         
+         /* pointer to planes */
+         float (*planes)[12][4]; 
       } user;
 
       boolean test_fse;         /* enable FSE even though its not correct (eg for softpipe) */
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index f44bf2507c6..4078b2a07d0 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -287,6 +287,84 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
 }
 
 
+/** Helper code for below */
+#define PRIM_RESTART_LOOP(elements) \
+   do { \
+      for (i = start; i < end; i++) { \
+         if (elements[i] == info->restart_index) { \
+            if (cur_count > 0) { \
+               /* draw elts up to prev pos */ \
+               draw_pt_arrays(draw, prim, cur_start, cur_count); \
+            } \
+            /* begin new prim at next elt */ \
+            cur_start = i + 1; \
+            cur_count = 0; \
+         } \
+         else { \
+            cur_count++; \
+         } \
+      } \
+      if (cur_count > 0) { \
+         draw_pt_arrays(draw, prim, cur_start, cur_count); \
+      } \
+   } while (0)
+
+
+/**
+ * For drawing prims with primitive restart enabled.
+ * Scan for restart indexes and draw the runs of elements/vertices between
+ * the restarts.
+ */
+static void
+draw_pt_arrays_restart(struct draw_context *draw,
+                       const struct pipe_draw_info *info)
+{
+   const unsigned prim = info->mode;
+   const unsigned start = info->start;
+   const unsigned count = info->count;
+   const unsigned end = start + count;
+   unsigned i, cur_start, cur_count;
+
+   assert(info->primitive_restart);
+
+   if (draw->pt.user.elts) {
+      /* indexed prims (draw_elements) */
+      cur_start = start;
+      cur_count = 0;
+
+      switch (draw->pt.user.eltSize) {
+      case 1:
+         {
+            const ubyte *elt_ub = (const ubyte *) draw->pt.user.elts;
+            PRIM_RESTART_LOOP(elt_ub);
+         }
+         break;
+      case 2:
+         {
+            const ushort *elt_us = (const ushort *) draw->pt.user.elts;
+            PRIM_RESTART_LOOP(elt_us);
+         }
+         break;
+      case 4:
+         {
+            const uint *elt_ui = (const uint *) draw->pt.user.elts;
+            PRIM_RESTART_LOOP(elt_ui);
+         }
+         break;
+      default:
+         assert(0 && "bad eltSize in draw_arrays()");
+      }
+   }
+   else {
+      /* Non-indexed prims (draw_arrays).
+       * Primitive restart should have been handled in the state tracker.
+       */
+      draw_pt_arrays(draw, prim, start, count);
+   }
+}
+
+
+
 /**
  * Non-instanced drawing.
  * \sa draw_arrays_instanced
@@ -395,6 +473,12 @@ draw_vbo(struct draw_context *draw,
 
    for (instance = 0; instance < info->instance_count; instance++) {
       draw->instance_id = instance + info->start_instance;
-      draw_pt_arrays(draw, info->mode, info->start, info->count);
+
+      if (info->primitive_restart) {
+         draw_pt_arrays_restart(draw, info);
+      }
+      else {
+         draw_pt_arrays(draw, info->mode, info->start, info->count);
+      }
    }
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 77291e304e1..a53a768d029 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -175,6 +175,11 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
       draw->pt.user.vs_constants[0];
    fpme->llvm->jit_context.gs_constants =
       draw->pt.user.gs_constants[0];
+   fpme->llvm->jit_context.planes =
+      (float (*) [12][4]) draw->pt.user.planes[0];
+   fpme->llvm->jit_context.viewport =
+      (float *)draw->viewport.scale;
+    
 }
 
 
@@ -217,6 +222,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
    struct draw_vertex_info gs_vert_info;
    struct draw_vertex_info *vert_info;
    unsigned opt = fpme->opt;
+   unsigned clipped = 0;
 
    llvm_vert_info.count = fetch_info->count;
    llvm_vert_info.vertex_size = fpme->vertex_size;
@@ -230,7 +236,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
    }
 
    if (fetch_info->linear)
-      fpme->current_variant->jit_func( &fpme->llvm->jit_context,
+      clipped = fpme->current_variant->jit_func( &fpme->llvm->jit_context,
                                        llvm_vert_info.verts,
                                        (const char **)draw->pt.user.vbuffer,
                                        fetch_info->start,
@@ -239,7 +245,7 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
                                        draw->pt.vertex_buffer,
                                        draw->instance_id);
    else
-      fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context,
+      clipped = fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context,
                                             llvm_vert_info.verts,
                                             (const char **)draw->pt.user.vbuffer,
                                             fetch_info->elts,
@@ -266,6 +272,9 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
       FREE(vert_info->verts);
       vert_info = &gs_vert_info;
       prim_info = &gs_prim_info;
+
+      clipped = draw_pt_post_vs_run( fpme->post_vs, vert_info );
+
    }
 
    /* stream output needs to be done before clipping */
@@ -273,11 +282,11 @@ llvm_pipeline_generic( struct draw_pt_middle_end *middle,
 		    vert_info,
                     prim_info );
 
-   if (draw_pt_post_vs_run( fpme->post_vs, vert_info )) {
+   if (clipped) {
       opt |= PT_PIPELINE;
    }
 
-   /* Do we need to run the pipeline?
+   /* Do we need to run the pipeline? We also get here when clipped is set.
     */
    if (opt & PT_PIPELINE) {
       pipeline( fpme,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 64c468c14d4..f9a12a41a1b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -983,6 +983,12 @@ enum lp_build_round_sse41_mode
 };
 
 
+/**
+ * Helper for SSE4.1's ROUNDxx instructions.
+ *
+ * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
+ * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
+ */
 static INLINE LLVMValueRef
 lp_build_round_sse41(struct lp_build_context *bld,
                      LLVMValueRef a,
@@ -1053,10 +1059,58 @@ lp_build_round_sse41(struct lp_build_context *bld,
 }
 
 
+static INLINE LLVMValueRef
+lp_build_iround_nearest_sse2(struct lp_build_context *bld,
+                             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMTypeRef ret_type = lp_build_int_vec_type(type);
+   const char *intrinsic;
+   LLVMValueRef res;
+
+   assert(type.floating);
+   /* using the double precision conversions is a bit more complicated */
+   assert(type.width == 32);
+
+   assert(lp_check_value(type, a));
+   assert(util_cpu_caps.has_sse2);
+
+   /* This is relying on MXCSR rounding mode, which should always be nearest. */
+   if (type.length == 1) {
+      LLVMTypeRef vec_type;
+      LLVMValueRef undef;
+      LLVMValueRef arg;
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+
+      vec_type = LLVMVectorType(bld->elem_type, 4);
+
+      intrinsic = "llvm.x86.sse.cvtss2si";
+
+      undef = LLVMGetUndef(vec_type);
+
+      arg = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, arg);
+   }
+   else {
+      assert(type.width*type.length == 128);
+
+      intrinsic = "llvm.x86.sse2.cvtps2dq";
+
+      res = lp_build_intrinsic_unary(bld->builder, intrinsic,
+                                     ret_type, a);
+   }
+
+   return res;
+}
+
+
 /**
- * Return the integer part of a float (vector) value.  The returned value is
- * a float (vector).
- * Ex: trunc(-1.5) = 1.0
+ * Return the integer part of a float (vector) value (== round toward zero).
+ * The returned value is a float (vector).
+ * Ex: trunc(-1.5) = -1.0
  */
 LLVMValueRef
 lp_build_trunc(struct lp_build_context *bld,
@@ -1181,9 +1235,9 @@ lp_build_fract(struct lp_build_context *bld,
 
 
 /**
- * Return the integer part of a float (vector) value.  The returned value is
- * an integer (vector).
- * Ex: itrunc(-1.5) = 1
+ * Return the integer part of a float (vector) value (== round toward zero).
+ * The returned value is an integer (vector).
+ * Ex: itrunc(-1.5) = -1
  */
 LLVMValueRef
 lp_build_itrunc(struct lp_build_context *bld,
@@ -1217,7 +1271,11 @@ lp_build_iround(struct lp_build_context *bld,
 
    assert(lp_check_value(type, a));
 
-   if (util_cpu_caps.has_sse4_1 &&
+   if (util_cpu_caps.has_sse2 &&
+       ((type.width == 32) && (type.length == 1 || type.length == 4))) {
+      return lp_build_iround_nearest_sse2(bld, a);
+   }
+   else if (util_cpu_caps.has_sse4_1 &&
        (type.length == 1 || type.width*type.length == 128)) {
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
    }
@@ -1371,8 +1429,6 @@ lp_build_ifloor_fract(struct lp_build_context *bld,
                       LLVMValueRef *out_ipart,
                       LLVMValueRef *out_fpart)
 {
-
-
    const struct lp_type type = bld->type;
    LLVMValueRef ipart;
 
@@ -2210,6 +2266,71 @@ lp_build_exp2(struct lp_build_context *bld,
 
 
 /**
+ * Extract the exponent of a IEEE-754 floating point value.
+ *
+ * Optionally apply an integer bias.
+ *
+ * Result is an integer value with
+ *
+ *   ifloor(log2(x)) + bias
+ */
+LLVMValueRef
+lp_build_extract_exponent(struct lp_build_context *bld,
+                          LLVMValueRef x,
+                          int bias)
+{
+   const struct lp_type type = bld->type;
+   unsigned mantissa = lp_mantissa(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+
+   assert(lp_check_value(bld->type, x));
+
+   x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, "");
+
+   res = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), "");
+   res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(type, 255), "");
+   res = LLVMBuildSub(bld->builder, res, lp_build_const_int_vec(type, 127 - bias), "");
+
+   return res;
+}
+
+
+/**
+ * Extract the mantissa of a floating point value.
+ *
+ * Result is a floating point value with
+ *
+ *   x / 2**floor(log2(x))
+ */
+LLVMValueRef
+lp_build_extract_mantissa(struct lp_build_context *bld,
+                          LLVMValueRef x)
+{
+   const struct lp_type type = bld->type;
+   unsigned mantissa = lp_mantissa(type);
+   LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1);
+   LLVMValueRef one = LLVMConstBitCast(bld->one, bld->int_vec_type);
+   LLVMValueRef res;
+
+   assert(lp_check_value(bld->type, x));
+
+   assert(type.floating);
+
+   x = LLVMBuildBitCast(bld->builder, x, bld->int_vec_type, "");
+
+   /* res = x / 2**ipart */
+   res = LLVMBuildAnd(bld->builder, x, mantmask, "");
+   res = LLVMBuildOr(bld->builder, res, one, "");
+   res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, "");
+
+   return res;
+}
+
+
+
+/**
  * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
  * These coefficients can be generate with
  * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
@@ -2333,7 +2454,10 @@ lp_build_log2(struct lp_build_context *bld,
 /**
  * Faster (and less accurate) log2.
  *
- *    log2(x) = floor(log2(x)) + frac(x)
+ *    log2(x) = floor(log2(x)) - 1 + x / 2**floor(log2(x))
+ *
+ * Piece-wise linear approximation, with exact results when x is a
+ * power of two.
  *
  * See http://www.flipcode.com/archives/Fast_log_Function.shtml
  */
@@ -2341,35 +2465,21 @@ LLVMValueRef
 lp_build_fast_log2(struct lp_build_context *bld,
                    LLVMValueRef x)
 {
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = bld->vec_type;
-   LLVMTypeRef int_vec_type = bld->int_vec_type;
-
-   unsigned mantissa = lp_mantissa(type);
-   LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1);
-   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
-
    LLVMValueRef ipart;
    LLVMValueRef fpart;
 
    assert(lp_check_value(bld->type, x));
 
-   assert(type.floating);
-
-   x = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
+   assert(bld->type.floating);
 
    /* ipart = floor(log2(x)) - 1 */
-   ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), "");
-   ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), "");
-   ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 128), "");
-   ipart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
+   ipart = lp_build_extract_exponent(bld, x, -1);
+   ipart = LLVMBuildSIToFP(bld->builder, ipart, bld->vec_type, "");
 
-   /* fpart = 1.0 + frac(x) */
-   fpart = LLVMBuildAnd(bld->builder, x, mantmask, "");
-   fpart = LLVMBuildOr(bld->builder, fpart, one, "");
-   fpart = LLVMBuildBitCast(bld->builder, fpart, vec_type, "");
+   /* fpart = x / 2**ipart */
+   fpart = lp_build_extract_mantissa(bld, x);
 
-   /* floor(log2(x)) + frac(x) */
+   /* ipart + fpart */
    return LLVMBuildFAdd(bld->builder, ipart, fpart, "");
 }
 
@@ -2383,27 +2493,18 @@ LLVMValueRef
 lp_build_ilog2(struct lp_build_context *bld,
                LLVMValueRef x)
 {
-   const struct lp_type type = bld->type;
-   LLVMTypeRef int_vec_type = bld->int_vec_type;
-
-   unsigned mantissa = lp_mantissa(type);
-   LLVMValueRef sqrt2 = lp_build_const_vec(type, 1.4142135623730951);
-
+   LLVMValueRef sqrt2 = lp_build_const_vec(bld->type, M_SQRT2);
    LLVMValueRef ipart;
 
-   assert(lp_check_value(bld->type, x));
+   assert(bld->type.floating);
 
-   assert(type.floating);
+   assert(lp_check_value(bld->type, x));
 
    /* x * 2^(0.5)   i.e., add 0.5 to the log2(x) */
    x = LLVMBuildFMul(bld->builder, x, sqrt2, "");
 
-   x = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
-
    /* ipart = floor(log2(x) + 0.5)  */
-   ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), "");
-   ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), "");
-   ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
+   ipart = lp_build_extract_exponent(bld, x, 0);
 
    return ipart;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
index 8424384f8f7..c78b61decf0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -215,6 +215,15 @@ lp_build_exp2(struct lp_build_context *bld,
               LLVMValueRef a);
 
 LLVMValueRef
+lp_build_extract_exponent(struct lp_build_context *bld,
+                          LLVMValueRef x,
+                          int bias);
+
+LLVMValueRef
+lp_build_extract_mantissa(struct lp_build_context *bld,
+                          LLVMValueRef x);
+
+LLVMValueRef
 lp_build_log2(struct lp_build_context *bld,
               LLVMValueRef a);
 
@@ -226,7 +235,6 @@ LLVMValueRef
 lp_build_ilog2(struct lp_build_context *bld,
                LLVMValueRef x);
 
-
 void
 lp_build_exp2_approx(struct lp_build_context *bld,
                      LLVMValueRef x,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 127b13bc286..6967dd26225 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -97,58 +97,104 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
    LLVMValueRef res;
    unsigned mantissa;
-   unsigned n;
-   unsigned long long ubound;
-   unsigned long long mask;
-   double scale;
-   double bias;
 
    assert(src_type.floating);
+   assert(dst_width <= src_type.width);
+   src_type.sign = FALSE;
 
    mantissa = lp_mantissa(src_type);
 
-   /* We cannot carry more bits than the mantissa */
-   n = MIN2(mantissa, dst_width);
+   if (dst_width <= mantissa) {
+      /*
+       * Apply magic coefficients that will make the desired result to appear
+       * in the lowest significant bits of the mantissa, with correct rounding.
+       *
+       * This only works if the destination width fits in the mantissa.
+       */
 
-   /* This magic coefficients will make the desired result to appear in the
-    * lowest significant bits of the mantissa.
-    */
-   ubound = ((unsigned long long)1 << n);
-   mask = ubound - 1;
-   scale = (double)mask/ubound;
-   bias = (double)((unsigned long long)1 << (mantissa - n));
+      unsigned long long ubound;
+      unsigned long long mask;
+      double scale;
+      double bias;
 
-   res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
-   res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), "");
-   res = LLVMBuildBitCast(builder, res, int_vec_type, "");
+      ubound = (1ULL << dst_width);
+      mask = ubound - 1;
+      scale = (double)mask/ubound;
+      bias = (double)(1ULL << (mantissa - dst_width));
 
-   if(dst_width > n) {
-      int shift = dst_width - n;
-      res = LLVMBuildShl(builder, res, lp_build_const_int_vec(src_type, shift), "");
+      res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
+      res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), "");
+      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
+      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), "");
+   }
+   else if (dst_width == (mantissa + 1)) {
+      /*
+       * The destination width matches exactly what can be represented in
+       * floating point (i.e., mantissa + 1 bits). So do a straight
+       * multiplication followed by casting. No further rounding is necessary.
+       */
+
+      double scale;
+
+      scale = (double)((1ULL << dst_width) - 1);
 
-      /* TODO: Fill in the empty lower bits for additional precision? */
-      /* YES: this fixes progs/trivial/tri-z-eq.c.
-       * Otherwise vertex Z=1.0 values get converted to something like
-       * 0xfffffb00 and the test for equality with 0xffffffff fails.
+      res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
+      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
+   }
+   else {
+      /*
+       * The destination exceeds what can be represented in floating point.
+       * So multiply by the largest power of two we can get away with, and then
+       * subtract the most significant bit to rescale to normalized values.
+       *
+       * The largest power of two factor we can get away with is
+       * (1 << (src_type.width - 1)), because we need to use a signed
+       * conversion. In theory it should be (1 << (src_type.width - 2)), but
+       * IEEE 754 rules state INT_MIN should be returned in FPToSI, which is
+       * the correct result for values near 1.0!
+       *
+       * This means we get (src_type.width - 1) correct bits for values near 0.0,
+       * and (mantissa + 1) correct bits for values near 1.0. Equally or more
+       * important, we also get exact results for 0.0 and 1.0.
        */
-#if 0
-      {
-         LLVMValueRef msb;
-         msb = LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, dst_width - 1), "");
-         msb = LLVMBuildShl(builder, msb, lp_build_const_int_vec(src_type, shift), "");
-         msb = LLVMBuildSub(builder, msb, lp_build_const_int_vec(src_type, 1), "");
-         res = LLVMBuildOr(builder, res, msb, "");
-      }
-#elif 0
-      while(shift > 0) {
-         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_const_int_vec(src_type, n), ""), "");
-         shift -= n;
-         n *= 2;
+
+      unsigned n = MIN2(src_type.width - 1, dst_width);
+
+      double scale = (double)(1ULL << n);
+      unsigned lshift = dst_width - n;
+      unsigned rshift = n;
+      LLVMValueRef lshifted;
+      LLVMValueRef rshifted;
+
+      res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
+      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
+
+      /*
+       * Align the most significant bit to its final place.
+       *
+       * This will cause 1.0 to overflow to 0, but the later adjustment will
+       * get it right.
+       */
+      if (lshift) {
+         lshifted = LLVMBuildShl(builder, res,
+                                 lp_build_const_int_vec(src_type, lshift), "");
+      } else {
+         lshifted = res;
       }
-#endif
+
+      /* Align the MSB to the right with a logical shift: 1.0 converts to
+       * INT_MIN, and an arithmetic shift would yield -1 instead of 1.
+       */
+      rshifted =  LLVMBuildLShr(builder, res,
+                                lp_build_const_int_vec(src_type, rshift), "");
+
+      /*
+       * Subtract the MSB (now aligned to the LSB), therefore re-scaling from
+       * (1 << dst_width) to ((1 << dst_width) - 1).
+       */
+
+      res = LLVMBuildSub(builder, lshifted, rshifted, "");
    }
-   else
-      res = LLVMBuildAnd(builder, res, lp_build_const_int_vec(src_type, mask), "");
 
    return res;
 }
@@ -178,6 +224,16 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
    assert(dst_type.floating);
 
+   /* Special-case int8->float, though most cases could be handled
+    * this way:
+    */
+   if (src_width == 8) {
+      scale = 1.0/255.0;
+      res = LLVMBuildSIToFP(builder, src, vec_type, "");
+      res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), "");
+      return res;
+   }
+
    mantissa = lp_mantissa(dst_type);
 
    n = MIN2(mantissa, src_width);
@@ -257,7 +313,9 @@ lp_build_conv(LLVMBuilderRef builder,
        dst_type.sign     == 0 &&
        dst_type.norm     == 1 &&
        dst_type.width    == 8 &&
-       dst_type.length   == 16)
+       dst_type.length   == 16 &&
+
+       util_cpu_caps.has_sse2)
    {
       int i;
 
@@ -296,23 +354,7 @@ lp_build_conv(LLVMBuilderRef builder,
          c = LLVMBuildFMul(builder, src[2], const_255f, "");
          d = LLVMBuildFMul(builder, src[3], const_255f, "");
 
-         /* lp_build_round generates excessively general code without
-          * sse4, so do rounding manually.
-          */
-         if (!util_cpu_caps.has_sse4_1) {
-            LLVMValueRef const_half = lp_build_const_vec(src_type, 0.5f);
-
-            a = LLVMBuildFAdd(builder, a, const_half, "");
-            b = LLVMBuildFAdd(builder, b, const_half, "");
-            c = LLVMBuildFAdd(builder, c, const_half, "");
-            d = LLVMBuildFAdd(builder, d, const_half, "");
-            
-            src_int0 = LLVMBuildFPToSI(builder, a, int32_vec_type, "");
-            src_int1 = LLVMBuildFPToSI(builder, b, int32_vec_type, "");
-            src_int2 = LLVMBuildFPToSI(builder, c, int32_vec_type, "");
-            src_int3 = LLVMBuildFPToSI(builder, d, int32_vec_type, "");
-         }
-         else {
+         {
             struct lp_build_context bld;
 
             bld.builder = builder;
@@ -323,13 +365,13 @@ lp_build_conv(LLVMBuilderRef builder,
             bld.undef = lp_build_undef(src_type);
             bld.zero = lp_build_zero(src_type);
             bld.one = lp_build_one(src_type);
-            
+
             src_int0 = lp_build_iround(&bld, a);
             src_int1 = lp_build_iround(&bld, b);
             src_int2 = lp_build_iround(&bld, c);
             src_int3 = lp_build_iround(&bld, d);
          }
-
+         /* relying on clamping behavior of sse2 intrinsics here */
          lo = lp_build_pack2(builder, int32_type, int16_type, src_int0, src_int1);
          hi = lp_build_pack2(builder, int32_type, int16_type, src_int2, src_int3);
          dst[i] = lp_build_pack2(builder, int16_type, dst_type, lo, hi);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
index d3a5afff8c2..93e56553d7b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
@@ -57,6 +57,8 @@ lp_disassemble(const void* func)
 #ifdef HAVE_UDIS86
    ud_t ud_obj;
    uint64_t max_jmp_pc;
+   uint inst_no = 0;   /* instruction counter printed when emit_line_nos is set */
+   boolean emit_addrs = TRUE, emit_line_nos = FALSE;
 
    ud_init(&ud_obj);
 
@@ -76,13 +78,18 @@ lp_disassemble(const void* func)
 
    while (ud_disassemble(&ud_obj)) {
 
+      if (emit_addrs) {
 #ifdef PIPE_ARCH_X86
-      debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj));
+         debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj));
 #endif
 #ifdef PIPE_ARCH_X86_64
-      debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj));
+         debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj));
 #endif
-
+      }
+      else if (emit_line_nos) {
+         debug_printf("%6d:\t", inst_no);
+         inst_no++;
+      }
 #if 0
       debug_printf("%-16s ", ud_insn_hex(&ud_obj));
 #endif
@@ -115,8 +122,10 @@ lp_disassemble(const void* func)
          }
       }
 
-      if ((ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret) ||
-           ud_obj.mnemonic == UD_Iinvalid)
+      if (ud_obj.mnemonic == UD_Iinvalid ||
+          (ud_insn_off(&ud_obj) >= max_jmp_pc &&
+           (ud_obj.mnemonic == UD_Iret ||
+            ud_obj.mnemonic == UD_Ijmp)))
          break;
    }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 369c1bbf09a..eb11dcd4ef4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -36,11 +36,12 @@
 #include "util/u_string.h"
 
 
-#define GALLIVM_DEBUG_TGSI      0x1
-#define GALLIVM_DEBUG_IR        0x2
-#define GALLIVM_DEBUG_ASM       0x4
-#define GALLIVM_DEBUG_NO_OPT    0x8
-#define GALLIVM_DEBUG_PERF      0x10
+#define GALLIVM_DEBUG_TGSI          (1 << 0)
+#define GALLIVM_DEBUG_IR            (1 << 1)
+#define GALLIVM_DEBUG_ASM           (1 << 2)
+#define GALLIVM_DEBUG_NO_OPT        (1 << 3)
+#define GALLIVM_DEBUG_PERF          (1 << 4)
+#define GALLIVM_DEBUG_NO_BRILINEAR  (1 << 5)
 
 
 #ifdef DEBUG
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index 5bc9c741a88..a2cee199a01 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -38,273 +38,15 @@
 #include "lp_bld_flow.h"
 
 
-#define LP_BUILD_FLOW_MAX_VARIABLES 64
-#define LP_BUILD_FLOW_MAX_DEPTH 32
-
-/**
- * Enumeration of all possible flow constructs.
- */
-enum lp_build_flow_construct_kind {
-   LP_BUILD_FLOW_SCOPE,
-   LP_BUILD_FLOW_SKIP,
-   LP_BUILD_FLOW_IF
-};
-
-
-/**
- * Variable declaration scope.
- */
-struct lp_build_flow_scope
-{
-   /** Number of variables declared in this scope */
-   unsigned num_variables;
-};
-
-
-/**
- * Early exit. Useful to skip to the end of a function or block when
- * the execution mask becomes zero or when there is an error condition.
- */
-struct lp_build_flow_skip
-{
-   /** Block to skip to */
-   LLVMBasicBlockRef block;
-
-   /** Number of variables declared at the beginning */
-   unsigned num_variables;
-
-   LLVMValueRef *phi;  /**< array [num_variables] */
-};
-
-
-/**
- * if/else/endif.
- */
-struct lp_build_flow_if
-{
-   unsigned num_variables;
-
-   LLVMValueRef *phi;  /**< array [num_variables] */
-
-   LLVMValueRef condition;
-   LLVMBasicBlockRef entry_block, true_block, false_block, merge_block;
-};
-
-
-/**
- * Union of all possible flow constructs' data
- */
-union lp_build_flow_construct_data
-{
-   struct lp_build_flow_scope scope;
-   struct lp_build_flow_skip skip;
-   struct lp_build_flow_if ifthen;
-};
-
-
-/**
- * Element of the flow construct stack.
- */
-struct lp_build_flow_construct
-{
-   enum lp_build_flow_construct_kind kind;
-   union lp_build_flow_construct_data data;
-};
-
-
 /**
- * All necessary data to generate LLVM control flow constructs.
+ * Insert a new block, right where builder is pointing to.
  *
- * Besides keeping track of the control flow construct themselves we also
- * need to keep track of variables in order to generate SSA Phi values.
- */
-struct lp_build_flow_context
-{
-   LLVMBuilderRef builder;
-
-   /**
-    * Control flow stack.
-    */
-   struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
-   unsigned num_constructs;
-
-   /**
-    * Variable stack
-    */
-   LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES];
-   unsigned num_variables;
-};
-
-
-struct lp_build_flow_context *
-lp_build_flow_create(LLVMBuilderRef builder)
-{
-   struct lp_build_flow_context *flow;
-
-   flow = CALLOC_STRUCT(lp_build_flow_context);
-   if(!flow)
-      return NULL;
-
-   flow->builder = builder;
-
-   return flow;
-}
-
-
-void
-lp_build_flow_destroy(struct lp_build_flow_context *flow)
-{
-   assert(flow->num_constructs == 0);
-   assert(flow->num_variables == 0);
-   FREE(flow);
-}
-
-
-/**
- * Begin/push a new flow control construct, such as a loop, skip block
- * or variable scope.
- */
-static union lp_build_flow_construct_data *
-lp_build_flow_push(struct lp_build_flow_context *flow,
-                   enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH);
-   if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH)
-      return NULL;
-
-   flow->constructs[flow->num_constructs].kind = kind;
-   return &flow->constructs[flow->num_constructs++].data;
-}
-
-
-/**
- * Return the current/top flow control construct on the stack.
- * \param kind  the expected type of the top-most construct
- */
-static union lp_build_flow_construct_data *
-lp_build_flow_peek(struct lp_build_flow_context *flow,
-                   enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs);
-   if(!flow->num_constructs)
-      return NULL;
-
-   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
-   if(flow->constructs[flow->num_constructs - 1].kind != kind)
-      return NULL;
-
-   return &flow->constructs[flow->num_constructs - 1].data;
-}
-
-
-/**
- * End/pop the current/top flow control construct on the stack.
- * \param kind  the expected type of the top-most construct
- */
-static union lp_build_flow_construct_data *
-lp_build_flow_pop(struct lp_build_flow_context *flow,
-                  enum lp_build_flow_construct_kind kind)
-{
-   assert(flow->num_constructs);
-   if(!flow->num_constructs)
-      return NULL;
-
-   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
-   if(flow->constructs[flow->num_constructs - 1].kind != kind)
-      return NULL;
-
-   return &flow->constructs[--flow->num_constructs].data;
-}
-
-
-/**
- * Begin a variable scope.
+ * This is important not only for aesthetic reasons, but also for
+ * performance reasons, as frequently run blocks should be laid out next to
+ * each other and fall-throughs maximized.
  *
+ * See also llvm/lib/Transforms/Scalar/BasicBlockPlacement.cpp.
  *
- */
-void
-lp_build_flow_scope_begin(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_push(flow, LP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   scope->num_variables = 0;
-}
-
-
-/**
- * Declare a variable.
- *
- * A variable is a named entity which can have different LLVMValueRef's at
- * different points of the program. This is relevant for control flow because
- * when there are multiple branches to a same location we need to replace
- * the variable's value with a Phi function as explained in
- * http://en.wikipedia.org/wiki/Static_single_assignment_form .
- *
- * We keep track of variables by keeping around a pointer to where they're
- * current.
- *
- * There are a few cautions to observe:
- *
- * - Variable's value must not be NULL. If there is no initial value then
- *   LLVMGetUndef() should be used.
- *
- * - Variable's value must be kept up-to-date. If the variable is going to be
- *   modified by a function then a pointer should be passed so that its value
- *   is accurate. Failure to do this will cause some of the variables'
- *   transient values to be lost, leading to wrong results.
- *
- * - A program should be written from top to bottom, by always appending
- *   instructions to the bottom with a single LLVMBuilderRef. Inserting and/or
- *   modifying existing statements will most likely lead to wrong results.
- *
- */
-void
-lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
-                            LLVMValueRef *variable)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   assert(*variable);
-   if(!*variable)
-      return;
-
-   assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES);
-   if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES)
-      return;
-
-   flow->variables[flow->num_variables++] = variable;
-   ++scope->num_variables;
-}
-
-
-void
-lp_build_flow_scope_end(struct lp_build_flow_context *flow)
-{
-   struct lp_build_flow_scope *scope;
-
-   scope = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SCOPE)->scope;
-   if(!scope)
-      return;
-
-   assert(flow->num_variables >= scope->num_variables);
-   if(flow->num_variables < scope->num_variables) {
-      flow->num_variables = 0;
-      return;
-   }
-
-   flow->num_variables -= scope->num_variables;
-}
-
-
-/**
  * Note: this function has no dependencies on the flow code and could
  * be used elsewhere.
  */
@@ -334,52 +76,18 @@ lp_build_insert_new_block(LLVMBuilderRef builder, const char *name)
 }
 
 
-static LLVMBasicBlockRef
-lp_build_flow_insert_block(struct lp_build_flow_context *flow)
-{
-   return lp_build_insert_new_block(flow->builder, "");
-}
-
-
 /**
  * Begin a "skip" block.  Inside this block we can test a condition and
  * skip to the end of the block if the condition is false.
  */
 void
-lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
+lp_build_flow_skip_begin(struct lp_build_skip_context *skip,
+                         LLVMBuilderRef builder)
 {
-   struct lp_build_flow_skip *skip;
-   LLVMBuilderRef builder;
-   unsigned i;
-
-   skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
+   skip->builder = builder;
 
    /* create new basic block */
-   skip->block = lp_build_flow_insert_block(flow);
-
-   skip->num_variables = flow->num_variables;
-   if(!skip->num_variables) {
-      skip->phi = NULL;
-      return;
-   }
-
-   /* Allocate a Phi node for each variable in this skip scope */
-   skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi);
-   if(!skip->phi) {
-      skip->num_variables = 0;
-      return;
-   }
-
-   builder = LLVMCreateBuilder();
-   LLVMPositionBuilderAtEnd(builder, skip->block);
-
-   /* create a Phi node for each variable */
-   for(i = 0; i < skip->num_variables; ++i)
-      skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
-
-   LLVMDisposeBuilder(builder);
+   skip->block = lp_build_insert_new_block(skip->builder, "skip");
 }
 
 
@@ -388,83 +96,50 @@ lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
  * skip block if the condition is true.
  */
 void
-lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+lp_build_flow_skip_cond_break(struct lp_build_skip_context *skip,
                               LLVMValueRef cond)
 {
-   struct lp_build_flow_skip *skip;
-   LLVMBasicBlockRef current_block;
    LLVMBasicBlockRef new_block;
-   unsigned i;
-
-   skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
 
-   current_block = LLVMGetInsertBlock(flow->builder);
-
-   new_block = lp_build_flow_insert_block(flow);
-
-   /* for each variable, update the Phi node with a (variable, block) pair */
-   for(i = 0; i < skip->num_variables; ++i) {
-      assert(*flow->variables[i]);
-      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
-      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
-   }
+   new_block = lp_build_insert_new_block(skip->builder, "");
 
    /* if cond is true, goto skip->block, else goto new_block */
-   LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
+   LLVMBuildCondBr(skip->builder, cond, skip->block, new_block);
 
-   LLVMPositionBuilderAtEnd(flow->builder, new_block);
+   LLVMPositionBuilderAtEnd(skip->builder, new_block);
 }
 
 
 void
-lp_build_flow_skip_end(struct lp_build_flow_context *flow)
+lp_build_flow_skip_end(struct lp_build_skip_context *skip)
 {
-   struct lp_build_flow_skip *skip;
-   LLVMBasicBlockRef current_block;
-   unsigned i;
-
-   skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip;
-   if(!skip)
-      return;
-
-   current_block = LLVMGetInsertBlock(flow->builder);
-
-   /* add (variable, block) tuples to the phi nodes */
-   for(i = 0; i < skip->num_variables; ++i) {
-      assert(*flow->variables[i]);
-      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
-      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
-      *flow->variables[i] = skip->phi[i];
-   }
-
    /* goto block */
-   LLVMBuildBr(flow->builder, skip->block);
-   LLVMPositionBuilderAtEnd(flow->builder, skip->block);
-
-   FREE(skip->phi);
+   LLVMBuildBr(skip->builder, skip->block);
+   LLVMPositionBuilderAtEnd(skip->builder, skip->block);
 }
 
 
 /**
  * Check if the mask predicate is zero.  If so, jump to the end of the block.
  */
-static void
+void
 lp_build_mask_check(struct lp_build_mask_context *mask)
 {
-   LLVMBuilderRef builder = mask->flow->builder;
+   LLVMBuilderRef builder = mask->skip.builder;
+   LLVMValueRef value;
    LLVMValueRef cond;
 
+   value = lp_build_mask_value(mask);
+
    /* cond = (mask == 0) */
    cond = LLVMBuildICmp(builder,
                         LLVMIntEQ,
-                        LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""),
+                        LLVMBuildBitCast(builder, value, mask->reg_type, ""),
                         LLVMConstNull(mask->reg_type),
                         "");
 
    /* if cond, goto end of block */
-   lp_build_flow_skip_cond_break(mask->flow, cond);
+   lp_build_flow_skip_cond_break(&mask->skip, cond);
 }
 
 
@@ -477,21 +152,27 @@ lp_build_mask_check(struct lp_build_mask_context *mask)
  */
 void
 lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    struct lp_build_flow_context *flow,
+                    LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef value)
 {
    memset(mask, 0, sizeof *mask);
 
-   mask->flow = flow;
    mask->reg_type = LLVMIntType(type.width * type.length);
-   mask->value = value;
+   mask->var = lp_build_alloca(builder,
+                               lp_build_int_vec_type(type),
+                               "execution_mask");
 
-   lp_build_flow_scope_begin(flow);
-   lp_build_flow_scope_declare(flow, &mask->value);
-   lp_build_flow_skip_begin(flow);
+   LLVMBuildStore(builder, value, mask->var);
 
-   lp_build_mask_check(mask);
+   lp_build_flow_skip_begin(&mask->skip, builder);
+}
+
+
+LLVMValueRef
+lp_build_mask_value(struct lp_build_mask_context *mask)
+{
+   return LLVMBuildLoad(mask->skip.builder, mask->var, "");
 }
 
 
@@ -504,9 +185,10 @@ void
 lp_build_mask_update(struct lp_build_mask_context *mask,
                      LLVMValueRef value)
 {
-   mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, "");
-
-   lp_build_mask_check(mask);
+   value = LLVMBuildAnd(mask->skip.builder,
+                        lp_build_mask_value(mask),
+                        value, "");
+   LLVMBuildStore(mask->skip.builder, value, mask->var);
 }
 
 
@@ -516,9 +198,8 @@ lp_build_mask_update(struct lp_build_mask_context *mask,
 LLVMValueRef
 lp_build_mask_end(struct lp_build_mask_context *mask)
 {
-   lp_build_flow_skip_end(mask->flow);
-   lp_build_flow_scope_end(mask->flow);
-   return mask->value;
+   lp_build_flow_skip_end(&mask->skip);
+   return lp_build_mask_value(mask);
 }
 
 
@@ -528,59 +209,27 @@ lp_build_loop_begin(LLVMBuilderRef builder,
                     LLVMValueRef start,
                     struct lp_build_loop_state *state)
 {
-   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   LLVMValueRef function = LLVMGetBasicBlockParent(block);
+   state->block = lp_build_insert_new_block(builder, "loop_begin");
 
-   state->block = LLVMAppendBasicBlock(function, "loop");
+   state->counter_var = lp_build_alloca(builder, LLVMTypeOf(start), "loop_counter");
+
+   LLVMBuildStore(builder, start, state->counter_var);
 
    LLVMBuildBr(builder, state->block);
 
    LLVMPositionBuilderAtEnd(builder, state->block);
 
-   state->counter = LLVMBuildPhi(builder, LLVMTypeOf(start), "");
-
-   LLVMAddIncoming(state->counter, &start, &block, 1);
-
+   state->counter = LLVMBuildLoad(builder, state->counter_var, "");
 }
 
 
 void
-lp_build_loop_end(LLVMBuilderRef builder,
-                  LLVMValueRef end,
-                  LLVMValueRef step,
-                  struct lp_build_loop_state *state)
-{
-   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   LLVMValueRef function = LLVMGetBasicBlockParent(block);
-   LLVMValueRef next;
-   LLVMValueRef cond;
-   LLVMBasicBlockRef after_block;
-
-   if (!step)
-      step = LLVMConstInt(LLVMTypeOf(end), 1, 0);
-
-   next = LLVMBuildAdd(builder, state->counter, step, "");
-
-   cond = LLVMBuildICmp(builder, LLVMIntNE, next, end, "");
-
-   after_block = LLVMAppendBasicBlock(function, "");
-
-   LLVMBuildCondBr(builder, cond, after_block, state->block);
-
-   LLVMAddIncoming(state->counter, &next, &block, 1);
-
-   LLVMPositionBuilderAtEnd(builder, after_block);
-}
-
-void
 lp_build_loop_end_cond(LLVMBuilderRef builder,
                        LLVMValueRef end,
                        LLVMValueRef step,
-                       int llvm_cond,
+                       LLVMIntPredicate llvm_cond,
                        struct lp_build_loop_state *state)
 {
-   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   LLVMValueRef function = LLVMGetBasicBlockParent(block);
    LLVMValueRef next;
    LLVMValueRef cond;
    LLVMBasicBlockRef after_block;
@@ -590,15 +239,27 @@ lp_build_loop_end_cond(LLVMBuilderRef builder,
 
    next = LLVMBuildAdd(builder, state->counter, step, "");
 
+   LLVMBuildStore(builder, next, state->counter_var);
+
    cond = LLVMBuildICmp(builder, llvm_cond, next, end, "");
 
-   after_block = LLVMAppendBasicBlock(function, "");
+   after_block = lp_build_insert_new_block(builder, "loop_end");
 
    LLVMBuildCondBr(builder, cond, after_block, state->block);
 
-   LLVMAddIncoming(state->counter, &next, &block, 1);
-
    LLVMPositionBuilderAtEnd(builder, after_block);
+
+   state->counter = LLVMBuildLoad(builder, state->counter_var, "");
+}
+
+
+void
+lp_build_loop_end(LLVMBuilderRef builder,
+                  LLVMValueRef end,
+                  LLVMValueRef step,
+                  struct lp_build_loop_state *state)
+{
+   lp_build_loop_end_cond(builder, end, step, LLVMIntNE, state);
 }
 
 
@@ -616,24 +277,16 @@ lp_build_loop_end_cond(LLVMBuilderRef builder,
 
   Is built with:
 
-     LLVMValueRef x = LLVMGetUndef();  // or something else
+     // x needs an alloca variable
+     x = lp_build_alloca(builder, type, "x");
 
-     flow = lp_build_flow_create(builder);
 
-        lp_build_flow_scope_begin(flow);
+     lp_build_if(ctx, builder, cond);
+        LLVMBuildStore(LLVMBuildAdd(1, 2), x);
+     lp_build_else(ctx);
+        LLVMBuildStore(LLVMBuildAdd(2, 3), x);
+     lp_build_endif(ctx);
 
-           // x needs a phi node
-           lp_build_flow_scope_declare(flow, &x);
-
-           lp_build_if(ctx, flow, builder, cond);
-              x = LLVMAdd(1, 2);
-           lp_build_else(ctx);
-              x = LLVMAdd(2, 3);
-           lp_build_endif(ctx);
-
-        lp_build_flow_scope_end(flow);
-
-     lp_build_flow_destroy(flow);
  */
 
 
@@ -642,47 +295,19 @@ lp_build_loop_end_cond(LLVMBuilderRef builder,
  * Begin an if/else/endif construct.
  */
 void
-lp_build_if(struct lp_build_if_state *ctx,
-            struct lp_build_flow_context *flow,
+lp_build_if(struct lp_build_if_state *ifthen,
             LLVMBuilderRef builder,
             LLVMValueRef condition)
 {
    LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
-   struct lp_build_flow_if *ifthen;
-   unsigned i;
-
-   memset(ctx, 0, sizeof(*ctx));
-   ctx->builder = builder;
-   ctx->flow = flow;
 
-   /* push/create new scope */
-   ifthen = &lp_build_flow_push(flow, LP_BUILD_FLOW_IF)->ifthen;
-   assert(ifthen);
-
-   ifthen->num_variables = flow->num_variables;
+   memset(ifthen, 0, sizeof *ifthen);
+   ifthen->builder = builder;
    ifthen->condition = condition;
    ifthen->entry_block = block;
 
-   /* create a Phi node for each variable in this flow scope */
-   ifthen->phi = MALLOC(ifthen->num_variables * sizeof(*ifthen->phi));
-   if (!ifthen->phi) {
-      ifthen->num_variables = 0;
-      return;
-   }
-
    /* create endif/merge basic block for the phi functions */
    ifthen->merge_block = lp_build_insert_new_block(builder, "endif-block");
-   LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
-
-   /* create a phi node for each variable */
-   for (i = 0; i < flow->num_variables; i++) {
-      ifthen->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
-
-      /* add add the initial value of the var from the entry block */
-      if (!LLVMIsUndef(*flow->variables[i]))
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i],
-                         &ifthen->entry_block, 1);
-   }
 
    /* create/insert true_block before merge_block */
    ifthen->true_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-true-block");
@@ -696,27 +321,16 @@ lp_build_if(struct lp_build_if_state *ctx,
  * Begin else-part of a conditional
  */
 void
-lp_build_else(struct lp_build_if_state *ctx)
+lp_build_else(struct lp_build_if_state *ifthen)
 {
-   struct lp_build_flow_context *flow = ctx->flow;
-   struct lp_build_flow_if *ifthen;
-   unsigned i;
-
-   ifthen = &lp_build_flow_peek(flow, LP_BUILD_FLOW_IF)->ifthen;
-   assert(ifthen);
-
-   /* for each variable, update the Phi node with a (variable, block) pair */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
-   for (i = 0; i < flow->num_variables; i++) {
-      assert(*flow->variables[i]);
-      LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1);
-   }
+   /* Append an unconditional Br(anch) instruction on the true_block */
+   LLVMBuildBr(ifthen->builder, ifthen->merge_block);
 
    /* create/insert false_block before the merge block */
    ifthen->false_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-false-block");
 
    /* successive code goes into the else block */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->false_block);
+   LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->false_block);
 }
 
 
@@ -724,75 +338,30 @@ lp_build_else(struct lp_build_if_state *ctx)
  * End a conditional.
  */
 void
-lp_build_endif(struct lp_build_if_state *ctx)
+lp_build_endif(struct lp_build_if_state *ifthen)
 {
-   struct lp_build_flow_context *flow = ctx->flow;
-   struct lp_build_flow_if *ifthen;
-   LLVMBasicBlockRef curBlock = LLVMGetInsertBlock(ctx->builder);
-   unsigned i;
-
-   ifthen = &lp_build_flow_pop(flow, LP_BUILD_FLOW_IF)->ifthen;
-   assert(ifthen);
-
    /* Insert branch to the merge block from current block */
-   LLVMBuildBr(ctx->builder, ifthen->merge_block);
+   LLVMBuildBr(ifthen->builder, ifthen->merge_block);
 
-   if (ifthen->false_block) {
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
-      /* for each variable, update the Phi node with a (variable, block) pair */
-      for (i = 0; i < flow->num_variables; i++) {
-         assert(*flow->variables[i]);
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &curBlock, 1);
-         /* replace the variable ref with the phi function */
-         *flow->variables[i] = ifthen->phi[i];
-      }
-   }
-   else {
-      /* no else clause */
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
-      for (i = 0; i < flow->num_variables; i++) {
-         assert(*flow->variables[i]);
-         LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1);
-
-         /* replace the variable ref with the phi function */
-         *flow->variables[i] = ifthen->phi[i];
-      }
-   }
-
-   FREE(ifthen->phi);
-
-   /***
-    *** Now patch in the various branch instructions.
-    ***/
+   /*
+    * Now patch in the various branch instructions.
+    */
 
    /* Insert the conditional branch instruction at the end of entry_block */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->entry_block);
+   LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->entry_block);
    if (ifthen->false_block) {
       /* we have an else clause */
-      LLVMBuildCondBr(ctx->builder, ifthen->condition,
+      LLVMBuildCondBr(ifthen->builder, ifthen->condition,
                       ifthen->true_block, ifthen->false_block);
    }
    else {
       /* no else clause */
-      LLVMBuildCondBr(ctx->builder, ifthen->condition,
+      LLVMBuildCondBr(ifthen->builder, ifthen->condition,
                       ifthen->true_block, ifthen->merge_block);
    }
 
-   /* Insert branch from end of true_block to merge_block */
-   if (ifthen->false_block) {
-      /* Append an unconditional Br(anch) instruction on the true_block */
-      LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
-      LLVMBuildBr(ctx->builder, ifthen->merge_block);
-   }
-   else {
-      /* No else clause.
-       * Note that we've already inserted the branch at the end of
-       * true_block.  See the very first LLVMBuildBr() call in this function.
-       */
-   }
-
    /* Resume building code at end of the ifthen->merge_block */
-   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
+   LLVMPositionBuilderAtEnd(ifthen->builder, ifthen->merge_block);
 }
 
 
@@ -830,6 +399,7 @@ lp_build_alloca(LLVMBuilderRef builder,
    }
 
    res = LLVMBuildAlloca(first_builder, type, name);
+   LLVMBuildStore(builder, LLVMConstNull(type), res);
 
    LLVMDisposeBuilder(first_builder);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
index fffb493a93b..e729ee6eaac 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -41,52 +41,49 @@
 struct lp_type;
 
 
-struct lp_build_flow_context;
-
-
-struct lp_build_flow_context *
-lp_build_flow_create(LLVMBuilderRef builder);
-
-void
-lp_build_flow_destroy(struct lp_build_flow_context *flow);
-
-void
-lp_build_flow_scope_begin(struct lp_build_flow_context *flow);
-
-void
-lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
-                            LLVMValueRef *variable);
+/**
+ * Early exit. Useful to skip to the end of a function or block when
+ * the execution mask becomes zero or when there is an error condition.
+ */
+struct lp_build_skip_context
+{
+   LLVMBuilderRef builder;
 
-void
-lp_build_flow_scope_end(struct lp_build_flow_context *flow);
+   /** Block to skip to */
+   LLVMBasicBlockRef block;
+};
 
 void
-lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
+lp_build_flow_skip_begin(struct lp_build_skip_context *ctx,
+                         LLVMBuilderRef builder);
 
 void
-lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+lp_build_flow_skip_cond_break(struct lp_build_skip_context *ctx,
                               LLVMValueRef cond);
 
 void
-lp_build_flow_skip_end(struct lp_build_flow_context *flow);
+lp_build_flow_skip_end(struct lp_build_skip_context *ctx);
 
 
 struct lp_build_mask_context
 {
-   struct lp_build_flow_context *flow;
+   struct lp_build_skip_context skip;
 
    LLVMTypeRef reg_type;
 
-   LLVMValueRef value;
+   LLVMValueRef var;
 };
 
 
 void
 lp_build_mask_begin(struct lp_build_mask_context *mask,
-                    struct lp_build_flow_context *flow,
+                    LLVMBuilderRef builder,
                     struct lp_type type,
                     LLVMValueRef value);
 
+LLVMValueRef
+lp_build_mask_value(struct lp_build_mask_context *mask);
+
 /**
  * Bitwise AND the mask with the given value, if a previous mask was set.
  */
@@ -94,6 +91,9 @@ void
 lp_build_mask_update(struct lp_build_mask_context *mask,
                      LLVMValueRef value);
 
+void
+lp_build_mask_check(struct lp_build_mask_context *mask);
+
 LLVMValueRef
 lp_build_mask_end(struct lp_build_mask_context *mask);
 
@@ -108,6 +108,7 @@ lp_build_mask_end(struct lp_build_mask_context *mask);
 struct lp_build_loop_state
 {
   LLVMBasicBlockRef block;
+  LLVMValueRef counter_var;
   LLVMValueRef counter;
 };
 
@@ -128,22 +129,28 @@ void
 lp_build_loop_end_cond(LLVMBuilderRef builder,
                        LLVMValueRef end,
                        LLVMValueRef step,
-                       int cond, /* LLVM condition */
+                       LLVMIntPredicate cond,
                        struct lp_build_loop_state *state);
 
 
 
 
+/**
+ * if/else/endif.
+ */
 struct lp_build_if_state
 {
    LLVMBuilderRef builder;
-   struct lp_build_flow_context *flow;
+   LLVMValueRef condition;
+   LLVMBasicBlockRef entry_block;
+   LLVMBasicBlockRef true_block;
+   LLVMBasicBlockRef false_block;
+   LLVMBasicBlockRef merge_block;
 };
 
 
 void
 lp_build_if(struct lp_build_if_state *ctx,
-            struct lp_build_flow_context *flow,
             LLVMBuilderRef builder,
             LLVMValueRef condition);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 761f33b578d..5598ca5c489 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -44,6 +44,7 @@ static const struct debug_named_value lp_bld_debug_flags[] = {
    { "asm",    GALLIVM_DEBUG_ASM, NULL },
    { "nopt",   GALLIVM_DEBUG_NO_OPT, NULL },
    { "perf",   GALLIVM_DEBUG_PERF, NULL },
+   { "no_brilinear", GALLIVM_DEBUG_NO_BRILINEAR, NULL },
    DEBUG_NAMED_VALUE_END
 };
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index f26fdac4663..0b4b1ca7d11 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -47,4 +47,10 @@ lp_build_init(void);
 extern void
 lp_func_delete_body(LLVMValueRef func);
 
+
+extern LLVMValueRef
+lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
+                       const char *Name);
+
+
 #endif /* !LP_BLD_INIT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index ce5d0214b43..026b60ac36e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -462,10 +462,12 @@ lp_build_select(struct lp_build_context *bld,
       LLVMTypeRef arg_type;
       LLVMValueRef args[3];
 
-      if (type.width == 64) {
+      if (type.floating &&
+          type.width == 64) {
          intrinsic = "llvm.x86.sse41.blendvpd";
          arg_type = LLVMVectorType(LLVMDoubleType(), 2);
-      } else if (type.width == 32) {
+      } else if (type.floating &&
+                 type.width == 32) {
          intrinsic = "llvm.x86.sse41.blendvps";
          arg_type = LLVMVectorType(LLVMFloatType(), 4);
       } else {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 48baf7c425c..f56ddee7fd7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -178,3 +178,13 @@ lp_func_delete_body(LLVMValueRef FF)
    llvm::Function *func = llvm::unwrap<llvm::Function>(FF);
    func->deleteBody();
 }
+
+
+extern "C"
+LLVMValueRef
+lp_build_load_volatile(LLVMBuilderRef B, LLVMValueRef PointerVal,
+                       const char *Name)
+{
+   return llvm::wrap(llvm::unwrap(B)->CreateLoad(llvm::unwrap(PointerVal), true, Name));
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.c b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
index 153ba5b15b1..f418e96aff4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_printf.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.c
@@ -29,6 +29,8 @@
 
 #include "util/u_debug.h"
 #include "util/u_memory.h"
+#include "util/u_string.h"
+#include "lp_bld_const.h"
 #include "lp_bld_printf.h"
 
 
@@ -119,3 +121,22 @@ lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...)
    return LLVMBuildCall(builder, func_printf, params, argcount + 1, "");
 }
 
+
+
+/**
+ * Print a float[4] vector.
+ */
+LLVMValueRef
+lp_build_print_vec4(LLVMBuilderRef builder, const char *msg, LLVMValueRef vec)
+{
+   char format[1000];
+   LLVMValueRef x, y, z, w;
+
+   x = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(0), "");
+   y = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(1), "");
+   z = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(2), "");
+   w = LLVMBuildExtractElement(builder, vec, lp_build_const_int32(3), "");
+
+   util_snprintf(format, sizeof(format), "%s %%f %%f %%f %%f\n", msg);
+   return lp_build_printf(builder, format, x, y, z, w);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_printf.h b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
index 83bd8f1d557..b6222c62ebe 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_printf.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_printf.h
@@ -35,5 +35,9 @@
 LLVMValueRef lp_build_const_string_variable(LLVMModuleRef module, const char *str, int len);
 LLVMValueRef lp_build_printf(LLVMBuilderRef builder, const char *fmt, ...);
 
+LLVMValueRef
+lp_build_print_vec4(LLVMBuilderRef builder, const char *msg, LLVMValueRef vec);
+
+
 #endif
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index 7b1088939b9..c18c8b47100 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -81,11 +81,15 @@ LLVMValueRef
 lp_build_scalar_ddx(struct lp_build_context *bld,
                     LLVMValueRef a)
 {
-   LLVMValueRef idx_left  = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0);
-   LLVMValueRef idx_right = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_RIGHT, 0);
-   LLVMValueRef a_left  = LLVMBuildExtractElement(bld->builder, a, idx_left, "");
-   LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, "");
-   return lp_build_sub(bld, a_right, a_left);
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMValueRef idx_left  = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_LEFT, 0);
+   LLVMValueRef idx_right = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_RIGHT, 0);
+   LLVMValueRef a_left  = LLVMBuildExtractElement(bld->builder, a, idx_left, "left");
+   LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, "right");
+   if (bld->type.floating)
+      return LLVMBuildFSub(bld->builder, a_right, a_left, "ddx");
+   else
+      return LLVMBuildSub(bld->builder, a_right, a_left, "ddx");
 }
 
 
@@ -93,9 +97,13 @@ LLVMValueRef
 lp_build_scalar_ddy(struct lp_build_context *bld,
                     LLVMValueRef a)
 {
-   LLVMValueRef idx_top    = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0);
-   LLVMValueRef idx_bottom = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_BOTTOM_LEFT, 0);
-   LLVMValueRef a_top    = LLVMBuildExtractElement(bld->builder, a, idx_top, "");
-   LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, "");
-   return lp_build_sub(bld, a_bottom, a_top);
+   LLVMTypeRef i32t = LLVMInt32Type();
+   LLVMValueRef idx_top    = LLVMConstInt(i32t, LP_BLD_QUAD_TOP_LEFT, 0);
+   LLVMValueRef idx_bottom = LLVMConstInt(i32t, LP_BLD_QUAD_BOTTOM_LEFT, 0);
+   LLVMValueRef a_top    = LLVMBuildExtractElement(bld->builder, a, idx_top, "top");
+   LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, "bottom");
+   if (bld->type.floating)
+      return LLVMBuildFSub(bld->builder, a_bottom, a_top, "ddy");
+   else
+      return LLVMBuildSub(bld->builder, a_bottom, a_top, "ddy");
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 7a64392d3c1..844d1d935b5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -47,8 +47,7 @@
 
 
 /*
- * Bri-linear factor. Use zero or any other number less than one to force
- * tri-linear filtering.
+ * Bri-linear factor. Should be greater than one.
  */
 #define BRILINEAR_FACTOR 2
 
@@ -201,8 +200,8 @@ lp_build_rho(struct lp_build_sample_context *bld,
    LLVMValueRef float_size;
    LLVMValueRef rho;
 
-   dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx");
-   dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy");
+   dsdx = ddx[0];
+   dsdy = ddy[0];
 
    if (dims <= 1) {
       rho_x = dsdx;
@@ -215,15 +214,15 @@ lp_build_rho(struct lp_build_sample_context *bld,
       rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dsdx, index0, "");
       rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dsdy, index0, "");
 
-      dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx");
-      dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy");
+      dtdx = ddx[1];
+      dtdy = ddy[1];
 
       rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dtdx, index1, "");
       rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dtdy, index1, "");
 
       if (dims >= 3) {
-         drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx");
-         drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy");
+         drdx = ddx[2];
+         drdy = ddy[2];
 
          rho_x = LLVMBuildInsertElement(bld->builder, rho_x, drdx, index2, "");
          rho_y = LLVMBuildInsertElement(bld->builder, rho_y, drdy, index2, "");
@@ -294,31 +293,30 @@ lp_build_rho(struct lp_build_sample_context *bld,
  * TODO: This could be done in fixed point, where applicable.
  */
 static void
-lp_build_brilinear_lod(struct lp_build_sample_context *bld,
+lp_build_brilinear_lod(struct lp_build_context *bld,
                        LLVMValueRef lod,
                        double factor,
                        LLVMValueRef *out_lod_ipart,
                        LLVMValueRef *out_lod_fpart)
 {
-   struct lp_build_context *float_bld = &bld->float_bld;
    LLVMValueRef lod_fpart;
-   float pre_offset = (factor - 0.5)/factor - 0.5;
-   float post_offset = 1 - factor;
+   double pre_offset = (factor - 0.5)/factor - 0.5;
+   double post_offset = 1 - factor;
 
    if (0) {
       lp_build_printf(bld->builder, "lod = %f\n", lod);
    }
 
-   lod = lp_build_add(float_bld, lod,
-                      lp_build_const_vec(float_bld->type, pre_offset));
+   lod = lp_build_add(bld, lod,
+                      lp_build_const_vec(bld->type, pre_offset));
 
-   lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, &lod_fpart);
+   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);
 
-   lod_fpart = lp_build_mul(float_bld, lod_fpart,
-                            lp_build_const_vec(float_bld->type, factor));
+   lod_fpart = lp_build_mul(bld, lod_fpart,
+                            lp_build_const_vec(bld->type, factor));
 
-   lod_fpart = lp_build_add(float_bld, lod_fpart,
-                            lp_build_const_vec(float_bld->type, post_offset));
+   lod_fpart = lp_build_add(bld, lod_fpart,
+                            lp_build_const_vec(bld->type, post_offset));
 
    /*
     * It's not necessary to clamp lod_fpart since:
@@ -335,6 +333,61 @@ lp_build_brilinear_lod(struct lp_build_sample_context *bld,
 }
 
 
+/*
+ * Combined log2 and brilinear lod computation.
+ *
+ * It's in all respects identical to calling lp_build_fast_log2() and
+ * lp_build_brilinear_lod() above, but by combining we can compute the integer
+ * and fractional part independently.
+ */
+static void
+lp_build_brilinear_rho(struct lp_build_context *bld,
+                       LLVMValueRef rho,
+                       double factor,
+                       LLVMValueRef *out_lod_ipart,
+                       LLVMValueRef *out_lod_fpart)
+{
+   LLVMValueRef lod_ipart;
+   LLVMValueRef lod_fpart;
+
+   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
+   const double post_offset = 1 - 2*factor;
+
+   assert(bld->type.floating);
+
+   assert(lp_check_value(bld->type, rho));
+
+   /*
+    * The pre factor will make the intersections with the exact powers of two
+    * happen precisely where we want them to be, which means that the integer
+    * part will not need any post adjustments.
+    */
+   rho = lp_build_mul(bld, rho,
+                      lp_build_const_vec(bld->type, pre_factor));
+
+   /* ipart = ifloor(log2(rho)) */
+   lod_ipart = lp_build_extract_exponent(bld, rho, 0);
+
+   /* fpart = rho / 2**ipart */
+   lod_fpart = lp_build_extract_mantissa(bld, rho);
+
+   lod_fpart = lp_build_mul(bld, lod_fpart,
+                            lp_build_const_vec(bld->type, factor));
+
+   lod_fpart = lp_build_add(bld, lod_fpart,
+                            lp_build_const_vec(bld->type, post_offset));
+
+   /*
+    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
+    * - the above expression will never produce numbers greater than one.
+    * - the mip filtering branch is only taken if lod_fpart is positive
+    */
+
+   *out_lod_ipart = lod_ipart;
+   *out_lod_fpart = lod_fpart;
+}
+
+
 /**
  * Generate code to compute texture level of detail (lambda).
  * \param ddx  partial derivatives of (s, t, r, q) with respect to X
@@ -389,16 +442,32 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
 
          rho = lp_build_rho(bld, ddx, ddy);
 
-         /* compute lod = log2(rho) */
-         if ((mip_filter == PIPE_TEX_MIPFILTER_NONE ||
-              mip_filter == PIPE_TEX_MIPFILTER_NEAREST) &&
-             !lod_bias &&
+         /*
+          * Compute lod = log2(rho)
+          */
+
+         if (!lod_bias &&
              !bld->static_state->lod_bias_non_zero &&
              !bld->static_state->apply_max_lod &&
              !bld->static_state->apply_min_lod) {
-            *out_lod_ipart = lp_build_ilog2(float_bld, rho);
-            *out_lod_fpart = bld->float_bld.zero;
-            return;
+            /*
+             * Special case when there are no post-log2 adjustments, which
+             * saves instructions by keeping the integer and fractional lod
+             * computations separate from the start.
+             */
+
+            if (mip_filter == PIPE_TEX_MIPFILTER_NONE ||
+                mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
+               *out_lod_ipart = lp_build_ilog2(float_bld, rho);
+               *out_lod_fpart = bld->float_bld.zero;
+               return;
+            }
+            if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR &&
+                !(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
+               lp_build_brilinear_rho(float_bld, rho, BRILINEAR_FACTOR,
+                                      out_lod_ipart, out_lod_fpart);
+               return;
+            }
          }
 
          if (0) {
@@ -437,21 +506,22 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
    }
 
    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-      if (BRILINEAR_FACTOR > 1.0) {
-         lp_build_brilinear_lod(bld, lod, BRILINEAR_FACTOR,
+      if (!(gallivm_debug & GALLIVM_DEBUG_NO_BRILINEAR)) {
+         lp_build_brilinear_lod(float_bld, lod, BRILINEAR_FACTOR,
                                 out_lod_ipart, out_lod_fpart);
       }
       else {
          lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart);
       }
 
-      lp_build_name(*out_lod_ipart, "lod_ipart");
       lp_build_name(*out_lod_fpart, "lod_fpart");
    }
    else {
       *out_lod_ipart = lp_build_iround(float_bld, lod);
    }
 
+   lp_build_name(*out_lod_ipart, "lod_ipart");
+
    return;
 }
 
@@ -630,37 +700,21 @@ lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
 void
 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                             LLVMValueRef ilevel,
-                            LLVMValueRef *out_width_vec,
-                            LLVMValueRef *out_height_vec,
-                            LLVMValueRef *out_depth_vec,
+                            LLVMValueRef *out_size,
                             LLVMValueRef *row_stride_vec,
                             LLVMValueRef *img_stride_vec)
 {
    const unsigned dims = bld->dims;
    LLVMValueRef ilevel_vec;
-   LLVMValueRef size_vec;
-   LLVMTypeRef i32t = LLVMInt32Type();
 
    ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel);
 
    /*
     * Compute width, height, depth at mipmap level 'ilevel'
     */
-   size_vec = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
+   *out_size = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec);
 
-   *out_width_vec = lp_build_extract_broadcast(bld->builder,
-                                               bld->int_size_type,
-                                               bld->int_coord_type,
-                                               size_vec,
-                                               LLVMConstInt(i32t, 0, 0));
    if (dims >= 2) {
-
-      *out_height_vec = lp_build_extract_broadcast(bld->builder,
-                                                   bld->int_size_type,
-                                                   bld->int_coord_type,
-                                                   size_vec,
-                                                   LLVMConstInt(i32t, 1, 0));
-
       *row_stride_vec = lp_build_get_level_stride_vec(bld,
                                                       bld->row_stride_array,
                                                       ilevel);
@@ -668,18 +722,90 @@ lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
          *img_stride_vec = lp_build_get_level_stride_vec(bld,
                                                          bld->img_stride_array,
                                                          ilevel);
-         if (dims == 3) {
-            *out_depth_vec = lp_build_extract_broadcast(bld->builder,
-                                                        bld->int_size_type,
-                                                        bld->int_coord_type,
-                                                        size_vec,
-                                                        LLVMConstInt(i32t, 2, 0));
-         }
       }
    }
 }
 
 
+/**
+ * Extract and broadcast texture size.
+ *
+ * @param size_type   type of the texture size vector (either
+ *                    bld->int_size_type or bld->float_size_type)
+ * @param coord_type  type of the texture coordinate vector (either
+ *                    bld->int_coord_type or bld->coord_type)
+ * @param size        vector with the texture size (width, height,
+ *                    depth)
+ */
+void
+lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
+                             struct lp_type size_type,
+                             struct lp_type coord_type,
+                             LLVMValueRef size,
+                             LLVMValueRef *out_width,
+                             LLVMValueRef *out_height,
+                             LLVMValueRef *out_depth)
+{
+   const unsigned dims = bld->dims;
+   LLVMTypeRef i32t = LLVMInt32Type();
+
+   *out_width = lp_build_extract_broadcast(bld->builder,
+                                           size_type,
+                                           coord_type,
+                                           size,
+                                           LLVMConstInt(i32t, 0, 0));
+   if (dims >= 2) {
+      *out_height = lp_build_extract_broadcast(bld->builder,
+                                               size_type,
+                                               coord_type,
+                                               size,
+                                               LLVMConstInt(i32t, 1, 0));
+      if (dims == 3) {
+         *out_depth = lp_build_extract_broadcast(bld->builder,
+                                                 size_type,
+                                                 coord_type,
+                                                 size,
+                                                 LLVMConstInt(i32t, 2, 0));
+      }
+   }
+}
+
+
+/**
+ * Unnormalize coords.
+ *
+ * @param flt_size  vector with the float texture size (width, height, depth)
+ */
+void
+lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
+                             LLVMValueRef flt_size,
+                             LLVMValueRef *s,
+                             LLVMValueRef *t,
+                             LLVMValueRef *r)
+{
+   const unsigned dims = bld->dims;
+   LLVMValueRef width;
+   LLVMValueRef height;
+   LLVMValueRef depth;
+
+   lp_build_extract_image_sizes(bld,
+                                bld->float_size_type,
+                                bld->coord_type,
+                                flt_size,
+                                &width,
+                                &height,
+                                &depth);
+
+   /* s = s * width, t = t * height */
+   *s = lp_build_mul(&bld->coord_bld, *s, width);
+   if (dims >= 2) {
+      *t = lp_build_mul(&bld->coord_bld, *t, height);
+      if (dims >= 3) {
+         *r = lp_build_mul(&bld->coord_bld, *r, depth);
+      }
+   }
+}
+
 
 /** Helper used by lp_build_cube_lookup() */
 static LLVMValueRef
@@ -798,25 +924,16 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
    rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, "");
 
    {
-      struct lp_build_flow_context *flow_ctx;
       struct lp_build_if_state if_ctx;
+      LLVMValueRef face_s_var;
+      LLVMValueRef face_t_var;
+      LLVMValueRef face_var;
 
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
-
-      *face_s = bld->coord_bld.undef;
-      *face_t = bld->coord_bld.undef;
-      *face = bld->int_bld.undef;
-
-      lp_build_name(*face_s, "face_s");
-      lp_build_name(*face_t, "face_t");
-      lp_build_name(*face, "face");
+      face_s_var = lp_build_alloca(bld->builder, bld->coord_bld.vec_type, "face_s_var");
+      face_t_var = lp_build_alloca(bld->builder, bld->coord_bld.vec_type, "face_t_var");
+      face_var = lp_build_alloca(bld->builder, bld->int_bld.vec_type, "face_var");
 
-      lp_build_flow_scope_declare(flow_ctx, face_s);
-      lp_build_flow_scope_declare(flow_ctx, face_t);
-      lp_build_flow_scope_declare(flow_ctx, face);
-
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz);
+      lp_build_if(&if_ctx, bld->builder, arx_ge_ary_arz);
       {
          /* +/- X face */
          LLVMValueRef sign = lp_build_sgn(float_bld, rx);
@@ -826,57 +943,52 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
          *face = lp_build_cube_face(bld, rx,
                                     PIPE_TEX_FACE_POS_X,
                                     PIPE_TEX_FACE_NEG_X);
+         LLVMBuildStore(bld->builder, *face_s, face_s_var);
+         LLVMBuildStore(bld->builder, *face_t, face_t_var);
+         LLVMBuildStore(bld->builder, *face, face_var);
       }
       lp_build_else(&if_ctx);
       {
-         struct lp_build_flow_context *flow_ctx2;
          struct lp_build_if_state if_ctx2;
 
-         LLVMValueRef face_s2 = bld->coord_bld.undef;
-         LLVMValueRef face_t2 = bld->coord_bld.undef;
-         LLVMValueRef face2 = bld->int_bld.undef;
-
-         flow_ctx2 = lp_build_flow_create(bld->builder);
-         lp_build_flow_scope_begin(flow_ctx2);
-         lp_build_flow_scope_declare(flow_ctx2, &face_s2);
-         lp_build_flow_scope_declare(flow_ctx2, &face_t2);
-         lp_build_flow_scope_declare(flow_ctx2, &face2);
-
          ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
 
-         lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz);
+         lp_build_if(&if_ctx2, bld->builder, ary_ge_arx_arz);
          {
             /* +/- Y face */
             LLVMValueRef sign = lp_build_sgn(float_bld, ry);
             LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
-            face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
-            face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
-            face2 = lp_build_cube_face(bld, ry,
+            *face_s = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
+            *face_t = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
+            *face = lp_build_cube_face(bld, ry,
                                        PIPE_TEX_FACE_POS_Y,
                                        PIPE_TEX_FACE_NEG_Y);
+            LLVMBuildStore(bld->builder, *face_s, face_s_var);
+            LLVMBuildStore(bld->builder, *face_t, face_t_var);
+            LLVMBuildStore(bld->builder, *face, face_var);
          }
          lp_build_else(&if_ctx2);
          {
             /* +/- Z face */
             LLVMValueRef sign = lp_build_sgn(float_bld, rz);
             LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
-            face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
-            face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
-            face2 = lp_build_cube_face(bld, rz,
+            *face_s = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
+            *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+            *face = lp_build_cube_face(bld, rz,
                                        PIPE_TEX_FACE_POS_Z,
                                        PIPE_TEX_FACE_NEG_Z);
+            LLVMBuildStore(bld->builder, *face_s, face_s_var);
+            LLVMBuildStore(bld->builder, *face_t, face_t_var);
+            LLVMBuildStore(bld->builder, *face, face_var);
          }
          lp_build_endif(&if_ctx2);
-         lp_build_flow_scope_end(flow_ctx2);
-         lp_build_flow_destroy(flow_ctx2);
-         *face_s = face_s2;
-         *face_t = face_t2;
-         *face = face2;
       }
 
       lp_build_endif(&if_ctx);
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
+
+      *face_s = LLVMBuildLoad(bld->builder, face_s_var, "face_s");
+      *face_t = LLVMBuildLoad(bld->builder, face_t_var, "face_t");
+      *face   = LLVMBuildLoad(bld->builder, face_var, "face");
    }
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index d1a1aa143d8..ffed27cee83 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -197,10 +197,6 @@ struct lp_build_sample_context
    struct lp_type coord_type;
    struct lp_build_context coord_bld;
 
-   /** Unsigned integer coordinates */
-   struct lp_type uint_coord_type;
-   struct lp_build_context uint_coord_bld;
-
    /** Signed integer coordinates */
    struct lp_type int_coord_type;
    struct lp_build_context int_coord_bld;
@@ -333,14 +329,30 @@ lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
 void
 lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
                             LLVMValueRef ilevel,
-                            LLVMValueRef *out_width_vec,
-                            LLVMValueRef *out_height_vec,
-                            LLVMValueRef *out_depth_vec,
+                            LLVMValueRef *out_size_vec,
                             LLVMValueRef *row_stride_vec,
                             LLVMValueRef *img_stride_vec);
 
 
 void
+lp_build_extract_image_sizes(struct lp_build_sample_context *bld,
+                             struct lp_type size_type,
+                             struct lp_type coord_type,
+                             LLVMValueRef size,
+                             LLVMValueRef *out_width,
+                             LLVMValueRef *out_height,
+                             LLVMValueRef *out_depth);
+
+
+void
+lp_build_unnormalized_coords(struct lp_build_sample_context *bld,
+                             LLVMValueRef flt_size,
+                             LLVMValueRef *s,
+                             LLVMValueRef *t,
+                             LLVMValueRef *r);
+
+
+void
 lp_build_cube_lookup(struct lp_build_sample_context *bld,
                      LLVMValueRef s,
                      LLVMValueRef t,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index e7410448c04..d6831a580b3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -45,6 +45,7 @@
 #include "lp_bld_const.h"
 #include "lp_bld_conv.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_bitarit.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_pack.h"
@@ -80,11 +81,10 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
                                  LLVMValueRef *out_offset,
                                  LLVMValueRef *out_i)
 {
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef length_minus_one;
 
-   length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
 
    switch(wrap_mode) {
    case PIPE_TEX_WRAP_REPEAT:
@@ -92,7 +92,7 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
          coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
       else {
          /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
          coord = LLVMBuildAdd(bld->builder, coord, bias, "");
          coord = LLVMBuildURem(bld->builder, coord, length, "");
       }
@@ -113,7 +113,7 @@ lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
       assert(0);
    }
 
-   lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride,
+   lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
                                   out_offset, out_i);
 }
 
@@ -146,7 +146,6 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                 LLVMValueRef *i0,
                                 LLVMValueRef *i1)
 {
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef length_minus_one;
    LLVMValueRef lmask, umask, mask;
@@ -188,8 +187,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
     * multiplication.
     */
 
-   *i0 = uint_coord_bld->zero;
-   *i1 = uint_coord_bld->zero;
+   *i0 = int_coord_bld->zero;
+   *i1 = int_coord_bld->zero;
 
    length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
 
@@ -200,7 +199,7 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
       }
       else {
          /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
          coord0 = LLVMBuildAdd(bld->builder, coord0, bias, "");
          coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
       }
@@ -208,9 +207,9 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
       mask = lp_build_compare(bld->builder, int_coord_bld->type,
                               PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
 
-      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
       *offset1 = LLVMBuildAnd(bld->builder,
-                              lp_build_add(uint_coord_bld, *offset0, stride),
+                              lp_build_add(int_coord_bld, *offset0, stride),
                               mask, "");
       break;
 
@@ -225,8 +224,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
 
       mask = LLVMBuildAnd(bld->builder, lmask, umask, "");
 
-      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
-      *offset1 = lp_build_add(uint_coord_bld,
+      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
+      *offset1 = lp_build_add(int_coord_bld,
                               *offset0,
                               LLVMBuildAnd(bld->builder, stride, mask, ""));
       break;
@@ -239,8 +238,8 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
    default:
       assert(0);
-      *offset0 = uint_coord_bld->zero;
-      *offset1 = uint_coord_bld->zero;
+      *offset0 = int_coord_bld->zero;
+      *offset1 = int_coord_bld->zero;
       break;
    }
 }
@@ -253,9 +252,7 @@ lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
-                              LLVMValueRef width_vec,
-                              LLVMValueRef height_vec,
-                              LLVMValueRef depth_vec,
+                              LLVMValueRef int_size,
                               LLVMValueRef row_stride_vec,
                               LLVMValueRef img_stride_vec,
                               LLVMValueRef data_ptr,
@@ -270,7 +267,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
    struct lp_build_context i32, h16, u8n;
    LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
    LLVMValueRef i32_c8;
-   LLVMValueRef s_ipart, t_ipart, r_ipart;
+   LLVMValueRef width_vec, height_vec, depth_vec;
+   LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
    LLVMValueRef x_stride;
    LLVMValueRef x_offset, offset;
    LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
@@ -283,30 +281,33 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
    h16_vec_type = lp_build_vec_type(h16.type);
    u8n_vec_type = lp_build_vec_type(u8n.type);
 
+   lp_build_extract_image_sizes(bld,
+                                bld->int_size_type,
+                                bld->int_coord_type,
+                                int_size,
+                                &width_vec,
+                                &height_vec,
+                                &depth_vec);
+
    if (bld->static_state->normalized_coords) {
-      /* s = s * width, t = t * height */
-      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
-      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
-                                              coord_vec_type, "");
-      s = lp_build_mul(&bld->coord_bld, s, fp_width);
-      if (dims >= 2) {
-         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
-                                                  coord_vec_type, "");
-         t = lp_build_mul(&bld->coord_bld, t, fp_height);
-         if (dims >= 3) {
-            LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
-                                                    coord_vec_type, "");
-            r = lp_build_mul(&bld->coord_bld, r, fp_depth);
-         }
-      }
-   }
+      LLVMValueRef scaled_size;
+      LLVMValueRef flt_size;
 
-   /* scale coords by 256 (8 fractional bits) */
-   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
-   if (dims >= 2)
-      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
-   if (dims >= 3)
-      r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+      /* scale size by 256 (8 fractional bits) */
+      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);
+
+      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);
+
+      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
+   }
+   else {
+      /* scale coords by 256 (8 fractional bits) */
+      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+      if (dims >= 2)
+         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+      if (dims >= 3)
+         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+   }
 
    /* convert float to int */
    s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
@@ -324,7 +325,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
       r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
 
    /* get pixel, row, image strides */
-   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+   x_stride = lp_build_const_vec(bld->int_coord_bld.type,
                                  bld->format_desc->block.bits/8);
 
    /* Do texcoord wrapping, compute texel offset */
@@ -343,7 +344,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                                        bld->static_state->pot_height,
                                        bld->static_state->wrap_t,
                                        &y_offset, &y_subcoord);
-      offset = lp_build_add(&bld->uint_coord_bld, offset, y_offset);
+      offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
       if (dims >= 3) {
          LLVMValueRef z_offset;
          lp_build_sample_wrap_nearest_int(bld,
@@ -352,13 +353,13 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                                           bld->static_state->pot_height,
                                           bld->static_state->wrap_r,
                                           &z_offset, &z_subcoord);
-         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
+         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
       }
       else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
          LLVMValueRef z_offset;
          /* The r coord is the cube face in [0,5] */
-         z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
-         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
+         z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
+         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
       }
    }
 
@@ -417,9 +418,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
-                             LLVMValueRef width_vec,
-                             LLVMValueRef height_vec,
-                             LLVMValueRef depth_vec,
+                             LLVMValueRef int_size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
@@ -434,9 +433,10 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    struct lp_build_context i32, h16, u8n;
    LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
    LLVMValueRef i32_c8, i32_c128, i32_c255;
+   LLVMValueRef width_vec, height_vec, depth_vec;
    LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
-   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
-   LLVMValueRef r_ipart, r_fpart, r_fpart_lo, r_fpart_hi;
+   LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL;
+   LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL;
    LLVMValueRef x_stride, y_stride, z_stride;
    LLVMValueRef x_offset0, x_offset1;
    LLVMValueRef y_offset0, y_offset1;
@@ -458,30 +458,33 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    h16_vec_type = lp_build_vec_type(h16.type);
    u8n_vec_type = lp_build_vec_type(u8n.type);
 
+   lp_build_extract_image_sizes(bld,
+                                bld->int_size_type,
+                                bld->int_coord_type,
+                                int_size,
+                                &width_vec,
+                                &height_vec,
+                                &depth_vec);
+
    if (bld->static_state->normalized_coords) {
-      /* s = s * width, t = t * height */
-      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
-      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
-                                              coord_vec_type, "");
-      s = lp_build_mul(&bld->coord_bld, s, fp_width);
-      if (dims >= 2) {
-         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
-                                                  coord_vec_type, "");
-         t = lp_build_mul(&bld->coord_bld, t, fp_height);
-      }
-      if (dims >= 3) {
-         LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
-                                                 coord_vec_type, "");
-         r = lp_build_mul(&bld->coord_bld, r, fp_depth);
-      }
-   }
+      LLVMValueRef scaled_size;
+      LLVMValueRef flt_size;
 
-   /* scale coords by 256 (8 fractional bits) */
-   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
-   if (dims >= 2)
-      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
-   if (dims >= 3)
-      r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+      /* scale size by 256 (8 fractional bits) */
+      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);
+
+      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);
+
+      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
+   }
+   else {
+      /* scale coords by 256 (8 fractional bits) */
+      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+      if (dims >= 2)
+         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+      if (dims >= 3)
+         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+   }
 
    /* convert float to int */
    s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
@@ -517,7 +520,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");
 
    /* get pixel, row and image strides */
-   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+   x_stride = lp_build_const_vec(bld->int_coord_bld.type,
                                  bld->format_desc->block.bits/8);
    y_stride = row_stride_vec;
    z_stride = img_stride_vec;
@@ -548,9 +551,9 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
 
       for (z = 0; z < 2; z++) {
          for (x = 0; x < 2; x++) {
-            offset[z][0][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[z][0][x], y_offset0);
-            offset[z][1][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[z][1][x], y_offset1);
          }
       }
@@ -566,20 +569,20 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                                       &z_subcoord[0], &z_subcoord[1]);
       for (y = 0; y < 2; y++) {
          for (x = 0; x < 2; x++) {
-            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[0][y][x], z_offset0);
-            offset[1][y][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[1][y][x], z_offset1);
          }
       }
    }
    else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
       LLVMValueRef z_offset;
-      z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
+      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
       for (y = 0; y < 2; y++) {
          for (x = 0; x < 2; x++) {
             /* The r coord is the cube face in [0,5] */
-            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
+            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                            offset[0][y][x], z_offset);
          }
       }
@@ -788,12 +791,8 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                        LLVMValueRef colors_hi_var)
 {
    LLVMBuilderRef builder = bld->builder;
-   LLVMValueRef width0_vec;
-   LLVMValueRef width1_vec;
-   LLVMValueRef height0_vec;
-   LLVMValueRef height1_vec;
-   LLVMValueRef depth0_vec;
-   LLVMValueRef depth1_vec;
+   LLVMValueRef size0;
+   LLVMValueRef size1;
    LLVMValueRef row_stride0_vec;
    LLVMValueRef row_stride1_vec;
    LLVMValueRef img_stride0_vec;
@@ -806,12 +805,12 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
 
    /* sample the first mipmap level */
    lp_build_mipmap_level_sizes(bld, ilevel0,
-                               &width0_vec, &height0_vec, &depth0_vec,
+                               &size0,
                                &row_stride0_vec, &img_stride0_vec);
    data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
       lp_build_sample_image_nearest(bld,
-                                    width0_vec, height0_vec, depth0_vec,
+                                    size0,
                                     row_stride0_vec, img_stride0_vec,
                                     data_ptr0, s, t, r,
                                     &colors0_lo, &colors0_hi);
@@ -819,7 +818,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    else {
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
       lp_build_sample_image_linear(bld,
-                                   width0_vec, height0_vec, depth0_vec,
+                                   size0,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r,
                                    &colors0_lo, &colors0_hi);
@@ -832,12 +831,9 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
       LLVMValueRef h16_scale = LLVMConstReal(LLVMFloatType(), 256.0);
       LLVMTypeRef i32_type = LLVMIntType(32);
-      struct lp_build_flow_context *flow_ctx;
       struct lp_build_if_state if_ctx;
       LLVMValueRef need_lerp;
 
-      flow_ctx = lp_build_flow_create(builder);
-
       lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, "");
       lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16");
 
@@ -846,7 +842,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                                 lod_fpart, LLVMConstNull(i32_type),
                                 "need_lerp");
 
-      lp_build_if(&if_ctx, flow_ctx, builder, need_lerp);
+      lp_build_if(&if_ctx, builder, need_lerp);
       {
          struct lp_build_context h16_bld;
 
@@ -854,19 +850,19 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
 
          /* sample the second mipmap level */
          lp_build_mipmap_level_sizes(bld, ilevel1,
-                                     &width1_vec, &height1_vec, &depth1_vec,
+                                     &size1,
                                      &row_stride1_vec, &img_stride1_vec);
          data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
          if (img_filter == PIPE_TEX_FILTER_NEAREST) {
             lp_build_sample_image_nearest(bld,
-                                          width1_vec, height1_vec, depth1_vec,
+                                          size1,
                                           row_stride1_vec, img_stride1_vec,
                                           data_ptr1, s, t, r,
                                           &colors1_lo, &colors1_hi);
          }
          else {
             lp_build_sample_image_linear(bld,
-                                         width1_vec, height1_vec, depth1_vec,
+                                         size1,
                                          row_stride1_vec, img_stride1_vec,
                                          data_ptr1, s, t, r,
                                          &colors1_lo, &colors1_hi);
@@ -877,6 +873,26 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
          lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, "");
          lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart);
 
+#if HAVE_LLVM == 0x208
+         /* This is a work-around for a bug in LLVM 2.8.
+          * Evidently, something goes wrong in the construction of the
+          * lod_fpart short[8] vector.  Adding this no-effect shuffle seems
+          * to force the vector to be properly constructed.
+          * Tested with mesa-demos/src/tests/mipmap_limits.c (press t, f).
+          */
+         {
+            LLVMValueRef shuffles[8], shuffle;
+            int i;
+            assert(h16_bld.type.length <= Elements(shuffles));
+            for (i = 0; i < h16_bld.type.length; i++)
+               shuffles[i] = lp_build_const_int32(2 * (i & 1));
+            shuffle = LLVMConstVector(shuffles, h16_bld.type.length);
+            lod_fpart = LLVMBuildShuffleVector(builder,
+                                               lod_fpart, lod_fpart,
+                                               shuffle, "");
+         }
+#endif
+
          colors0_lo = lp_build_lerp(&h16_bld, lod_fpart,
                                     colors0_lo, colors1_lo);
          colors0_hi = lp_build_lerp(&h16_bld, lod_fpart,
@@ -886,8 +902,6 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
          LLVMBuildStore(builder, colors0_hi, colors_hi_var);
       }
       lp_build_endif(&if_ctx);
-
-      lp_build_flow_destroy(flow_ctx);
    }
 }
 
@@ -946,12 +960,12 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
       r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
 
       /* recompute ddx, ddy using the new (s,t) face texcoords */
-      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
-      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
       face_ddx[2] = NULL;
       face_ddx[3] = NULL;
-      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
-      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
       face_ddy[2] = NULL;
       face_ddy[3] = NULL;
       ddx = face_ddx;
@@ -1027,17 +1041,14 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
       /* Emit conditional to choose min image filter or mag image filter
        * depending on the lod being > 0 or <= 0, respectively.
        */
-      struct lp_build_flow_context *flow_ctx;
       struct lp_build_if_state if_ctx;
       LLVMValueRef minify;
 
-      flow_ctx = lp_build_flow_create(builder);
-
       /* minify = lod >= 0.0 */
       minify = LLVMBuildICmp(builder, LLVMIntSGE,
                              lod_ipart, int_bld->zero, "");
 
-      lp_build_if(&if_ctx, flow_ctx, builder, minify);
+      lp_build_if(&if_ctx, builder, minify);
       {
          /* Use the minification filter */
          lp_build_sample_mipmap(bld,
@@ -1056,8 +1067,6 @@ lp_build_sample_aos(struct lp_build_sample_context *bld,
                                 packed_lo, packed_hi);
       }
       lp_build_endif(&if_ctx);
-
-      lp_build_flow_destroy(flow_ctx);
    }
 
    /*
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index b8cf938acfe..53cc0c5f345 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -131,7 +131,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
    }
 
    /* convert x,y,z coords to linear offset from start of texture, in bytes */
-   lp_build_sample_offset(&bld->uint_coord_bld,
+   lp_build_sample_offset(&bld->int_coord_bld,
                           bld->format_desc,
                           x, y, z, y_stride, z_stride,
                           &offset, &i, &j);
@@ -145,7 +145,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
        * coords which are out of bounds to become zero.  Zero's guaranteed
        * to be inside the texture image.
        */
-      offset = lp_build_andnot(&bld->uint_coord_bld, offset, use_border);
+      offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
    }
 
    lp_build_fetch_rgba_soa(bld->builder,
@@ -202,11 +202,7 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef fract, flr, isOdd;
 
-   /* fract = coord - floor(coord) */
-   fract = lp_build_sub(coord_bld, coord, lp_build_floor(coord_bld, coord));
-
-   /* flr = ifloor(coord); */
-   flr = lp_build_ifloor(coord_bld, coord);
+   lp_build_ifloor_fract(coord_bld, coord, &flr, &fract);
 
    /* isOdd = flr & 1 */
    isOdd = LLVMBuildAnd(bld->builder, flr, int_coord_bld->one, "");
@@ -234,6 +230,7 @@ static void
 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
                             LLVMValueRef coord,
                             LLVMValueRef length,
+                            LLVMValueRef length_f,
                             boolean is_pot,
                             unsigned wrap_mode,
                             LLVMValueRef *x0_out,
@@ -242,10 +239,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 {
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
    LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
-   LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
-   LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
    LLVMValueRef coord0, coord1, weight;
 
    switch(wrap_mode) {
@@ -255,19 +250,23 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       coord = lp_build_sub(coord_bld, coord, half);
       /* convert to int, compute lerp weight */
       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
-      coord1 = lp_build_add(uint_coord_bld, coord0, uint_coord_bld->one);
       /* repeat wrap */
       if (is_pot) {
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
          coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
          coord1 = LLVMBuildAnd(bld->builder, coord1, length_minus_one, "");
       }
       else {
          /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
+         LLVMValueRef mask;
          coord0 = LLVMBuildAdd(bld->builder, coord0, bias, "");
-         coord1 = LLVMBuildAdd(bld->builder, coord1, bias, "");
          coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
-         coord1 = LLVMBuildURem(bld->builder, coord1, length, "");
+         mask = lp_build_compare(bld->builder, int_coord_bld->type,
+                                 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+         coord1 = LLVMBuildAnd(bld->builder,
+                              lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
+                              mask, "");
       }
       break;
 
@@ -288,41 +287,39 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       break;
 
    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      if (bld->static_state->normalized_coords) {
-         /* clamp to [0,1] */
-         coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, coord_bld->one);
-         /* mul by tex size and subtract 0.5 */
-         coord = lp_build_mul(coord_bld, coord, length_f);
+      {
+         struct lp_build_context abs_coord_bld = bld->coord_bld;
+         abs_coord_bld.type.sign = FALSE;
+
+         if (bld->static_state->normalized_coords) {
+            /* mul by tex size */
+            coord = lp_build_mul(coord_bld, coord, length_f);
+         }
+         /* clamp to length max */
+         coord = lp_build_min(coord_bld, coord, length_f);
+         /* subtract 0.5 */
          coord = lp_build_sub(coord_bld, coord, half);
+         /* clamp to [0, length - 0.5] */
+         coord = lp_build_max(coord_bld, coord, coord_bld->zero);
+         /* convert to int, compute lerp weight */
+         lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+         /* coord1 = min(coord1, length-1) */
+         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+         break;
       }
-      else {
-         LLVMValueRef min, max;
-         /* clamp to [0.5, length - 0.5] */
-         min = half;
-         max = lp_build_sub(coord_bld, length_f, min);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-      }
-      /* convert to int, compute lerp weight */
-      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
-      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
-      /* coord0 = max(coord0, 0) */
-      coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
-      /* coord1 = min(coord1, length-1) */
-      coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
-      break;
 
    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
       {
-         LLVMValueRef min, max;
+         LLVMValueRef min;
          if (bld->static_state->normalized_coords) {
             /* scale coord to length */
             coord = lp_build_mul(coord_bld, coord, length_f);
          }
-         /* clamp to [-0.5, length + 0.5] */
-         min = lp_build_const_vec(coord_bld->type, -0.5F);
-         max = lp_build_sub(coord_bld, length_f, min);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
+         /* subtract 0.5 first, then clamp to [-1, length] */
          coord = lp_build_sub(coord_bld, coord, half);
+         min = lp_build_const_vec(coord_bld->type, -1.0F);
+         coord = lp_build_clamp(coord_bld, coord, min, length_f);
          /* convert to int, compute lerp weight */
          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
@@ -368,7 +365,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
       {
          LLVMValueRef min, max;
-
+         struct lp_build_context abs_coord_bld = bld->coord_bld;
+         abs_coord_bld.type.sign = FALSE;
          coord = lp_build_abs(coord_bld, coord);
 
          if (bld->static_state->normalized_coords) {
@@ -384,15 +382,13 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          coord = lp_build_sub(coord_bld, coord, half);
 
          /* convert to int, compute lerp weight */
-         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
+         lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       }
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
       {
-         LLVMValueRef min, max;
-
          coord = lp_build_abs(coord_bld, coord);
 
          if (bld->static_state->normalized_coords) {
@@ -400,12 +396,10 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
             coord = lp_build_mul(coord_bld, coord, length_f);
          }
 
-         /* clamp to [-0.5, length + 0.5] */
-         min = lp_build_negate(coord_bld, half);
-         max = lp_build_sub(coord_bld, length_f, min);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-
+         /* coord is non-negative after abs(), so no lower clamp is needed: */
+         /* subtract 0.5 first, then clamp the upper end to length */
          coord = lp_build_sub(coord_bld, coord, half);
+         coord = lp_build_min(coord_bld, coord, length_f);
 
          /* convert to int, compute lerp weight */
          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
@@ -437,14 +431,13 @@ static LLVMValueRef
 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef coord,
                              LLVMValueRef length,
+                             LLVMValueRef length_f,
                              boolean is_pot,
                              unsigned wrap_mode)
 {
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
-   LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
    LLVMValueRef icoord;
    
    switch(wrap_mode) {
@@ -455,7 +448,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          icoord = LLVMBuildAnd(bld->builder, icoord, length_minus_one, "");
       else {
          /* Add a bias to the texcoord to handle negative coords */
-         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
          icoord = LLVMBuildAdd(bld->builder, icoord, bias, "");
          icoord = LLVMBuildURem(bld->builder, icoord, length, "");
       }
@@ -469,7 +462,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
       }
 
       /* floor */
-      icoord = lp_build_ifloor(coord_bld, coord);
+      /* use itrunc instead since we clamp to 0 anyway */
+      icoord = lp_build_itrunc(coord_bld, coord);
 
       /* clamp to [0, length - 1]. */
       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
@@ -503,7 +497,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
       assert(bld->static_state->normalized_coords);
       coord = lp_build_mul(coord_bld, coord, length_f);
 
-      icoord = lp_build_ifloor(coord_bld, coord);
+      /* itrunc == ifloor here */
+      icoord = lp_build_itrunc(coord_bld, coord);
 
       /* clamp to [0, length - 1] */
       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
@@ -518,7 +513,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
 
-      icoord = lp_build_ifloor(coord_bld, coord);
+      /* itrunc == ifloor here */
+      icoord = lp_build_itrunc(coord_bld, coord);
 
       /* clamp to [0, length - 1] */
       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
@@ -532,7 +528,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
 
-      icoord = lp_build_ifloor(coord_bld, coord);
+      /* itrunc == ifloor here */
+      icoord = lp_build_itrunc(coord_bld, coord);
 
       /* clamp to [0, length] */
       icoord = lp_build_min(int_coord_bld, icoord, length);
@@ -554,9 +551,7 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
 static void
 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                               unsigned unit,
-                              LLVMValueRef width_vec,
-                              LLVMValueRef height_vec,
-                              LLVMValueRef depth_vec,
+                              LLVMValueRef size,
                               LLVMValueRef row_stride_vec,
                               LLVMValueRef img_stride_vec,
                               LLVMValueRef data_ptr,
@@ -566,24 +561,45 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                               LLVMValueRef colors_out[4])
 {
    const unsigned dims = bld->dims;
+   LLVMValueRef width_vec;
+   LLVMValueRef height_vec;
+   LLVMValueRef depth_vec;
+   LLVMValueRef flt_size;
+   LLVMValueRef flt_width_vec;
+   LLVMValueRef flt_height_vec;
+   LLVMValueRef flt_depth_vec;
    LLVMValueRef x, y, z;
 
+   lp_build_extract_image_sizes(bld,
+                                bld->int_size_type,
+                                bld->int_coord_type,
+                                size,
+                                &width_vec, &height_vec, &depth_vec);
+
+   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
+
+   lp_build_extract_image_sizes(bld,
+                                bld->float_size_type,
+                                bld->coord_type,
+                                flt_size,
+                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);
+
    /*
     * Compute integer texcoords.
     */
-   x = lp_build_sample_wrap_nearest(bld, s, width_vec,
+   x = lp_build_sample_wrap_nearest(bld, s, width_vec, flt_width_vec,
                                     bld->static_state->pot_width,
                                     bld->static_state->wrap_s);
    lp_build_name(x, "tex.x.wrapped");
 
    if (dims >= 2) {
-      y = lp_build_sample_wrap_nearest(bld, t, height_vec,
+      y = lp_build_sample_wrap_nearest(bld, t, height_vec, flt_height_vec,
                                        bld->static_state->pot_height,
                                        bld->static_state->wrap_t);
       lp_build_name(y, "tex.y.wrapped");
 
       if (dims == 3) {
-         z = lp_build_sample_wrap_nearest(bld, r, depth_vec,
+         z = lp_build_sample_wrap_nearest(bld, r, depth_vec, flt_depth_vec,
                                           bld->static_state->pot_depth,
                                           bld->static_state->wrap_r);
          lp_build_name(z, "tex.z.wrapped");
@@ -617,9 +633,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
 static void
 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                              unsigned unit,
-                             LLVMValueRef width_vec,
-                             LLVMValueRef height_vec,
-                             LLVMValueRef depth_vec,
+                             LLVMValueRef size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
@@ -629,15 +643,36 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                              LLVMValueRef colors_out[4])
 {
    const unsigned dims = bld->dims;
+   LLVMValueRef width_vec;
+   LLVMValueRef height_vec;
+   LLVMValueRef depth_vec;
+   LLVMValueRef flt_size;
+   LLVMValueRef flt_width_vec;
+   LLVMValueRef flt_height_vec;
+   LLVMValueRef flt_depth_vec;
    LLVMValueRef x0, y0, z0, x1, y1, z1;
    LLVMValueRef s_fpart, t_fpart, r_fpart;
    LLVMValueRef neighbors[2][2][4];
    int chan;
 
+   lp_build_extract_image_sizes(bld,
+                                bld->int_size_type,
+                                bld->int_coord_type,
+                                size,
+                                &width_vec, &height_vec, &depth_vec);
+
+   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
+
+   lp_build_extract_image_sizes(bld,
+                                bld->float_size_type,
+                                bld->coord_type,
+                                flt_size,
+                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);
+
    /*
     * Compute integer texcoords.
     */
-   lp_build_sample_wrap_linear(bld, s, width_vec,
+   lp_build_sample_wrap_linear(bld, s, width_vec, flt_width_vec,
                                bld->static_state->pot_width,
                                bld->static_state->wrap_s,
                                &x0, &x1, &s_fpart);
@@ -645,7 +680,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
    lp_build_name(x1, "tex.x1.wrapped");
 
    if (dims >= 2) {
-      lp_build_sample_wrap_linear(bld, t, height_vec,
+      lp_build_sample_wrap_linear(bld, t, height_vec, flt_height_vec,
                                   bld->static_state->pot_height,
                                   bld->static_state->wrap_t,
                                   &y0, &y1, &t_fpart);
@@ -653,7 +688,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       lp_build_name(y1, "tex.y1.wrapped");
 
       if (dims == 3) {
-         lp_build_sample_wrap_linear(bld, r, depth_vec,
+         lp_build_sample_wrap_linear(bld, r, depth_vec, flt_depth_vec,
                                      bld->static_state->pot_depth,
                                      bld->static_state->wrap_r,
                                      &z0, &z1, &r_fpart);
@@ -796,12 +831,8 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                        LLVMValueRef *colors_out)
 {
    LLVMBuilderRef builder = bld->builder;
-   LLVMValueRef width0_vec;
-   LLVMValueRef width1_vec;
-   LLVMValueRef height0_vec;
-   LLVMValueRef height1_vec;
-   LLVMValueRef depth0_vec;
-   LLVMValueRef depth1_vec;
+   LLVMValueRef size0;
+   LLVMValueRef size1;
    LLVMValueRef row_stride0_vec;
    LLVMValueRef row_stride1_vec;
    LLVMValueRef img_stride0_vec;
@@ -813,12 +844,12 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
 
    /* sample the first mipmap level */
    lp_build_mipmap_level_sizes(bld, ilevel0,
-                               &width0_vec, &height0_vec, &depth0_vec,
+                               &size0,
                                &row_stride0_vec, &img_stride0_vec);
    data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
       lp_build_sample_image_nearest(bld, unit,
-                                    width0_vec, height0_vec, depth0_vec,
+                                    size0,
                                     row_stride0_vec, img_stride0_vec,
                                     data_ptr0, s, t, r,
                                     colors0);
@@ -826,7 +857,7 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    else {
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
       lp_build_sample_image_linear(bld, unit,
-                                   width0_vec, height0_vec, depth0_vec,
+                                   size0,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r,
                                    colors0);
@@ -838,35 +869,32 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    }
 
    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-      struct lp_build_flow_context *flow_ctx;
       struct lp_build_if_state if_ctx;
       LLVMValueRef need_lerp;
 
-      flow_ctx = lp_build_flow_create(builder);
-
       /* need_lerp = lod_fpart > 0 */
       need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
                                 lod_fpart,
                                 bld->float_bld.zero,
                                 "need_lerp");
 
-      lp_build_if(&if_ctx, flow_ctx, builder, need_lerp);
+      lp_build_if(&if_ctx, builder, need_lerp);
       {
          /* sample the second mipmap level */
          lp_build_mipmap_level_sizes(bld, ilevel1,
-                                     &width1_vec, &height1_vec, &depth1_vec,
+                                     &size1,
                                      &row_stride1_vec, &img_stride1_vec);
          data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
          if (img_filter == PIPE_TEX_FILTER_NEAREST) {
             lp_build_sample_image_nearest(bld, unit,
-                                          width1_vec, height1_vec, depth1_vec,
+                                          size1,
                                           row_stride1_vec, img_stride1_vec,
                                           data_ptr1, s, t, r,
                                           colors1);
          }
          else {
             lp_build_sample_image_linear(bld, unit,
-                                         width1_vec, height1_vec, depth1_vec,
+                                         size1,
                                          row_stride1_vec, img_stride1_vec,
                                          data_ptr1, s, t, r,
                                          colors1);
@@ -883,8 +911,6 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
          }
       }
       lp_build_endif(&if_ctx);
-
-      lp_build_flow_destroy(flow_ctx);
    }
 }
 
@@ -937,12 +963,12 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
 
       /* recompute ddx, ddy using the new (s,t) face texcoords */
-      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
-      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[0] = lp_build_scalar_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_scalar_ddx(&bld->coord_bld, t);
       face_ddx[2] = NULL;
       face_ddx[3] = NULL;
-      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
-      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[0] = lp_build_scalar_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_scalar_ddy(&bld->coord_bld, t);
       face_ddy[2] = NULL;
       face_ddy[3] = NULL;
       ddx = face_ddx;
@@ -1020,17 +1046,14 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       /* Emit conditional to choose min image filter or mag image filter
        * depending on the lod being > 0 or <= 0, respectively.
        */
-      struct lp_build_flow_context *flow_ctx;
       struct lp_build_if_state if_ctx;
       LLVMValueRef minify;
 
-      flow_ctx = lp_build_flow_create(builder);
-
       /* minify = lod >= 0.0 */
       minify = LLVMBuildICmp(builder, LLVMIntSGE,
                              lod_ipart, int_bld->zero, "");
 
-      lp_build_if(&if_ctx, flow_ctx, builder, minify);
+      lp_build_if(&if_ctx, builder, minify);
       {
          /* Use the minification filter */
          lp_build_sample_mipmap(bld, unit,
@@ -1049,8 +1072,6 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
                                 texels);
       }
       lp_build_endif(&if_ctx);
-
-      lp_build_flow_destroy(flow_ctx);
    }
 
    for (chan = 0; chan < 4; ++chan) {
@@ -1166,7 +1187,6 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    bld.float_type = lp_type_float(32);
    bld.int_type = lp_type_int(32);
    bld.coord_type = type;
-   bld.uint_coord_type = lp_uint_type(type);
    bld.int_coord_type = lp_int_type(type);
    bld.float_size_type = lp_type_float(32);
    bld.float_size_type.length = dims > 1 ? 4 : 1;
@@ -1179,7 +1199,6 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    lp_build_context_init(&bld.float_vec_bld, builder, float_vec_type);
    lp_build_context_init(&bld.int_bld, builder, bld.int_type);
    lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
-   lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type);
    lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
    lp_build_context_init(&bld.int_size_bld, builder, bld.int_size_type);
    lp_build_context_init(&bld.float_size_bld, builder, bld.float_size_type);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 97318b3456c..a4d3b750c3c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -36,6 +36,9 @@
 #define LP_BLD_TGSI_H
 
 #include "gallivm/lp_bld.h"
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
 
 
 struct tgsi_token;
@@ -55,6 +58,75 @@ enum lp_build_tex_modifier {
 
 
 /**
+ * Describe a channel of a register.
+ *
+ * The value can be a:
+ * - immediate value (i.e. derived from an IMM register)
+ * - CONST[n].x/y/z/w
+ * - IN[n].x/y/z/w
+ * - undetermined (when .file == TGSI_FILE_NULL)
+ *
+ * This is one of the analysis results, and is used to describe
+ * the output color in terms of inputs.
+ */
+struct lp_tgsi_channel_info
+{
+   unsigned file:4; /* TGSI_FILE_* */
+   unsigned swizzle:3; /* PIPE_SWIZZLE_x */
+   union {
+      uint32_t index; /* register index, for files other than IMMEDIATE */
+      float value; /* for TGSI_FILE_IMMEDIATE */
+   } u;
+};
+
+
+/**
+ * Describe a texture sampler interpolator.
+ *
+ * The interpolation is described in terms of regular inputs.
+ */
+struct lp_tgsi_texture_info
+{
+   struct lp_tgsi_channel_info coord[4]; /* origin of each coord channel */
+   unsigned target:8; /* TGSI_TEXTURE_* */
+   unsigned unit:8;  /* Sampler unit */
+   unsigned modifier:8; /* LP_BLD_TEX_MODIFIER_* */
+};
+
+
+struct lp_tgsi_info
+{
+   struct tgsi_shader_info base; /* filled in by tgsi_scan_shader() */
+
+   /*
+    * Whether any of the texture opcodes access a register file other than
+    * TGSI_FILE_INPUT.
+    *
+    * We could also handle TGSI_FILE_CONST/IMMEDIATE here, but there is little
+    * benefit.
+    */
+   unsigned indirect_textures:1;
+
+   /*
+    * Texture opcode description. Aimed at detecting and describing direct
+    * texture opcodes.
+    */
+   unsigned num_texs; /* number of entries of tex[] in use */
+   struct lp_tgsi_texture_info tex[PIPE_MAX_SAMPLERS];
+
+   /*
+    * Output description. Aimed at detecting and describing simple blit
+    * shaders.
+    */
+   struct lp_tgsi_channel_info output[PIPE_MAX_SHADER_OUTPUTS][4];
+
+   /*
+    * Shortcut pointers into the above (for fragment shaders).
+    */
+   const struct lp_tgsi_channel_info *cbuf[PIPE_MAX_COLOR_BUFS];
+};
+
+/**
  * Sampler code generation interface.
  *
  * Although texture sampling is a requirement for TGSI translation, it is
@@ -97,6 +169,11 @@ struct lp_build_sampler_aos
 
 
 void
+lp_build_tgsi_info(const struct tgsi_token *tokens,
+                   struct lp_tgsi_info *info);
+
+
+void
 lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const struct tgsi_token *tokens,
                   struct lp_type type,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index d5f963be58d..c3c082b2b95 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -513,7 +513,7 @@ emit_instruction(
 {
    LLVMValueRef src0, src1, src2;
    LLVMValueRef tmp0, tmp1;
-   LLVMValueRef dst0;
+   LLVMValueRef dst0 = NULL;
 
    /*
     * Stores and write masks are handled in a general fashion after the long
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
new file mode 100644
index 00000000000..ad514463de0
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_info.c
@@ -0,0 +1,479 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_dump.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_tgsi.h"
+
+
+/**
+ * Analysis context.
+ *
+ * This is where we store the value of each channel of the IMM/TEMP/OUT
+ * register values, as we walk the shader.
+ */
+struct analysis_context
+{
+   struct lp_tgsi_info *info; /* results; also holds the OUT channel values */
+
+   unsigned num_imms; /* rows of imm[] filled in so far */
+   float imm[32][4];
+
+   struct lp_tgsi_channel_info temp[32][4];
+};
+
+
+/**
+ * Describe channel chan of src in *chan_info (TGSI_FILE_NULL if unknown).
+ */
+static void
+analyse_src(struct analysis_context *ctx,
+            struct lp_tgsi_channel_info *chan_info,
+            const struct tgsi_src_register *src,
+            unsigned chan)
+{
+   chan_info->file = TGSI_FILE_NULL; /* default: undetermined */
+   if (!src->Indirect && !src->Absolute && !src->Negate) {
+      unsigned swizzle = tgsi_util_get_src_register_swizzle(src, chan);
+      if (src->File == TGSI_FILE_TEMPORARY) {
+         if (src->Index < Elements(ctx->temp)) {
+            *chan_info = ctx->temp[src->Index][swizzle]; /* tracked value */
+         }
+      } else {
+         chan_info->file = src->File;
+         if (src->File == TGSI_FILE_IMMEDIATE) {
+            assert(src->Index < Elements(ctx->imm));
+            if (src->Index < Elements(ctx->imm)) {
+               chan_info->u.value = ctx->imm[src->Index][swizzle];
+            }
+         } else {
+            chan_info->u.index = src->Index;
+            chan_info->swizzle = swizzle;
+         }
+      }
+   }
+}
+
+
+/**
+ * Whether this register channel is known to hold exactly the immediate value.
+ */
+static boolean
+is_immediate(const struct lp_tgsi_channel_info *chan_info, float value)
+{
+   return chan_info->file == TGSI_FILE_IMMEDIATE &&
+          chan_info->u.value == value; /* exact float comparison */
+}
+
+/** Record one texture opcode in info->tex[] and flag non-direct fetches. */
+static void
+analyse_tex(struct analysis_context *ctx,
+            const struct tgsi_full_instruction *inst,
+            enum lp_build_tex_modifier modifier)
+{
+   struct lp_tgsi_info *info = ctx->info;
+   unsigned chan;
+
+   if (info->num_texs < Elements(info->tex)) {
+      struct lp_tgsi_texture_info *tex_info = &info->tex[info->num_texs];
+      boolean indirect = FALSE; /* boolean, like the rest of this file */
+      unsigned readmask = 0; /* coordinate channels this opcode reads */
+
+      tex_info->target = inst->Texture.Texture;
+      switch (inst->Texture.Texture) {
+      case TGSI_TEXTURE_1D:
+         readmask = TGSI_WRITEMASK_X;
+         break;
+      case TGSI_TEXTURE_2D:
+      case TGSI_TEXTURE_RECT:
+         readmask = TGSI_WRITEMASK_XY;
+         break;
+      case TGSI_TEXTURE_SHADOW1D:
+      case TGSI_TEXTURE_SHADOW2D:
+      case TGSI_TEXTURE_SHADOWRECT:
+      case TGSI_TEXTURE_3D:
+      case TGSI_TEXTURE_CUBE:
+         readmask = TGSI_WRITEMASK_XYZ;
+         break;
+      default:
+         assert(0);
+         return; /* entry is not counted: num_texs is left unchanged */
+      }
+
+      if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+         /* We don't track explicit derivatives, although we could */
+         indirect = TRUE;
+         tex_info->unit = inst->Src[3].Register.Index;
+      }  else {
+         if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED ||
+             modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS ||
+             modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
+            readmask |= TGSI_WRITEMASK_W;
+         }
+         tex_info->unit = inst->Src[1].Register.Index;
+      }
+
+      for (chan = 0; chan < 4; ++chan) {
+         struct lp_tgsi_channel_info *chan_info = &tex_info->coord[chan];
+         if (readmask & (1 << chan)) {
+            analyse_src(ctx, chan_info, &inst->Src[0].Register, chan);
+            if (chan_info->file != TGSI_FILE_INPUT) {
+               indirect = TRUE; /* coord does not come straight from IN[] */
+            }
+         } else {
+            memset(chan_info, 0, sizeof *chan_info);
+         }
+      }
+
+      if (indirect) {
+         info->indirect_textures = TRUE;
+      }
+
+      ++info->num_texs;
+   } else {
+      info->indirect_textures = TRUE; /* info->tex[] is full */
+   }
+}
+
+
+/**
+ * Process an instruction: record texture opcodes, update TEMP/OUT values.
+ */
+static void
+analyse_instruction(struct analysis_context *ctx,
+                    struct tgsi_full_instruction *inst)
+{
+   struct lp_tgsi_info *info = ctx->info;
+   struct lp_tgsi_channel_info (*regs)[4]; /* rows of the destination file */
+   unsigned max_regs; /* number of rows in regs */
+   unsigned i;
+   unsigned index;
+   unsigned chan;
+
+   for (i = 0; i < inst->Instruction.NumDstRegs; ++i) {
+      const struct tgsi_dst_register *dst = &inst->Dst[i].Register;
+
+      /*
+       * Get the lp_tgsi_channel_info array corresponding to the destination
+       * register file.
+       */
+
+      if (dst->File == TGSI_FILE_TEMPORARY) {
+         regs = ctx->temp;
+         max_regs = Elements(ctx->temp);
+      } else if (dst->File == TGSI_FILE_OUTPUT) {
+         regs = info->output;
+         max_regs = Elements(info->output);
+      } else if (dst->File == TGSI_FILE_ADDRESS ||
+                 dst->File == TGSI_FILE_PREDICATE) {
+         continue; /* these files are not tracked */
+      } else {
+         assert(0); /* unexpected destination file */
+         continue;
+      }
+
+      /*
+       * Detect direct TEX instructions
+       */
+
+      switch (inst->Instruction.Opcode) {
+      case TGSI_OPCODE_TEX:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_NONE);
+         break;
+      case TGSI_OPCODE_TXD:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV);
+         break;
+      case TGSI_OPCODE_TXB:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS);
+         break;
+      case TGSI_OPCODE_TXL:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD);
+         break;
+      case TGSI_OPCODE_TXP:
+         analyse_tex(ctx, inst, LP_BLD_TEX_MODIFIER_PROJECTED);
+         break;
+      default:
+         break;
+      }
+
+      /*
+       * Keep track of assignments and writes
+       */
+
+      if (dst->Indirect) {
+         /*
+          * It could be any register index so clear all register indices.
+          */
+
+         for (chan = 0; chan < 4; ++chan) {
+            if (dst->WriteMask & (1 << chan)) {
+               for (index = 0; index < max_regs; ++index) {
+                  regs[index][chan].file = TGSI_FILE_NULL;
+               }
+            }
+         }
+      } else if (dst->Index < max_regs) {
+         /*
+          * Update this destination register value.
+          */
+
+         struct lp_tgsi_channel_info res[4];
+
+         memset(res, 0, sizeof res); /* channels we cannot describe stay zeroed */
+
+         if (!inst->Instruction.Predicate &&
+             !inst->Instruction.Saturate) { /* otherwise res stays zeroed */
+            for (chan = 0; chan < 4; ++chan) {
+               if (dst->WriteMask & (1 << chan)) {
+                  if (inst->Instruction.Opcode == TGSI_OPCODE_MOV) {
+                     analyse_src(ctx, &res[chan],
+                                 &inst->Src[0].Register, chan);
+                  } else if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) {
+                     /*
+                      * Propagate values across 1.0 and 0.0 multiplications.
+                      */
+
+                     struct lp_tgsi_channel_info src0;
+                     struct lp_tgsi_channel_info src1;
+
+                     analyse_src(ctx, &src0, &inst->Src[0].Register, chan);
+                     analyse_src(ctx, &src1, &inst->Src[1].Register, chan);
+
+                     if (is_immediate(&src0, 0.0f)) {
+                        res[chan] = src0;
+                     } else if (is_immediate(&src1, 0.0f)) {
+                        res[chan] = src1;
+                     } else if (is_immediate(&src0, 1.0f)) {
+                        res[chan] = src1;
+                     } else if (is_immediate(&src1, 1.0f)) {
+                        res[chan] = src0;
+                     }
+                  }
+               }
+            }
+         }
+
+         for (chan = 0; chan < 4; ++chan) {
+            if (dst->WriteMask & (1 << chan)) {
+               regs[dst->Index][chan] = res[chan];
+            }
+         }
+      }
+   }
+
+   /*
+    * Forget all temporary and output values at any control flow opcode.
+    */
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_IFC:
+   case TGSI_OPCODE_ELSE:
+   case TGSI_OPCODE_ENDIF:
+   case TGSI_OPCODE_BGNLOOP:
+   case TGSI_OPCODE_BRK:
+   case TGSI_OPCODE_BREAKC:
+   case TGSI_OPCODE_CONT:
+   case TGSI_OPCODE_ENDLOOP:
+   case TGSI_OPCODE_CALLNZ:
+   case TGSI_OPCODE_CAL:
+   case TGSI_OPCODE_BGNSUB:
+   case TGSI_OPCODE_ENDSUB:
+   case TGSI_OPCODE_SWITCH:
+   case TGSI_OPCODE_CASE:
+   case TGSI_OPCODE_DEFAULT:
+   case TGSI_OPCODE_ENDSWITCH:
+   case TGSI_OPCODE_RET:
+   case TGSI_OPCODE_END:
+      /* XXX: Are there more cases? */
+      memset(&ctx->temp, 0, sizeof ctx->temp);
+      memset(&info->output, 0, sizeof info->output); /* fallthrough */
+   default:
+      break;
+   }
+}
+
+/** Debug helper: dump the shader followed by the texture/output analysis. */
+static INLINE void
+dump_info(const struct tgsi_token *tokens,
+          struct lp_tgsi_info *info)
+{
+   unsigned index;
+   unsigned chan;
+
+   tgsi_dump(tokens, 0);
+
+   for (index = 0; index < info->num_texs; ++index) {
+      const struct lp_tgsi_texture_info *tex_info = &info->tex[index];
+      debug_printf("TEX[%u] =", index);
+      for (chan = 0; chan < 4; ++chan) {
+         const struct lp_tgsi_channel_info *chan_info =
+               &tex_info->coord[chan];
+         if (chan_info->file != TGSI_FILE_NULL) {
+            debug_printf(" %s[%u].%c",
+                         tgsi_file_names[chan_info->file],
+                         chan_info->u.index,
+                         "xyzw01"[chan_info->swizzle]);
+         } else {
+            debug_printf(" _"); /* undetermined or unused coordinate */
+         }
+      }
+      debug_printf(", SAMP[%u], %s\n",
+                   tex_info->unit,
+                   tgsi_texture_names[tex_info->target]);
+   }
+
+   for (index = 0; index < PIPE_MAX_SHADER_OUTPUTS; ++index) {
+      for (chan = 0; chan < 4; ++chan) {
+         const struct lp_tgsi_channel_info *chan_info =
+               &info->output[index][chan];
+         if (chan_info->file != TGSI_FILE_NULL) { /* only print known channels */
+            debug_printf("OUT[%u].%c = ", index, "xyzw"[chan]);
+            if (chan_info->file == TGSI_FILE_IMMEDIATE) {
+               debug_printf("%f", chan_info->u.value);
+            } else {
+               const char *file_name;
+               switch (chan_info->file) {
+               case TGSI_FILE_CONSTANT:
+                  file_name = "CONST";
+                  break;
+               case TGSI_FILE_INPUT:
+                  file_name = "IN";
+                  break;
+               default:
+                  file_name = "???";
+                  break;
+               }
+               debug_printf("%s[%u].%c",
+                            file_name,
+                            chan_info->u.index,
+                            "xyzw01"[chan_info->swizzle]);
+            }
+            debug_printf("\n");
+         }
+      }
+   }
+}
+
+
+/**
+ * Scan tokens and describe its texture fetches and outputs in *info.
+ */
+void
+lp_build_tgsi_info(const struct tgsi_token *tokens,
+                   struct lp_tgsi_info *info)
+{
+   struct tgsi_parse_context parse;
+   struct analysis_context ctx;
+   unsigned index;
+   unsigned chan;
+
+   memset(info, 0, sizeof *info);
+
+   tgsi_scan_shader(tokens, &info->base);
+
+   memset(&ctx, 0, sizeof ctx);
+   ctx.info = info;
+
+   tgsi_parse_init(&parse, tokens);
+
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         {
+            struct tgsi_full_instruction *inst =
+                  &parse.FullToken.FullInstruction;
+
+            if (inst->Instruction.Opcode == TGSI_OPCODE_END ||
+                inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) {
+               /* We reached the end of main function body. */
+               goto finished;
+            }
+
+            analyse_instruction(&ctx, inst);
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         {
+            const unsigned size =
+                  parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+            assert(size <= 4);
+            if (ctx.num_imms < Elements(ctx.imm)) { /* extra immediates are dropped */
+               for (chan = 0; chan < size; ++chan) {
+                  ctx.imm[ctx.num_imms][chan] =
+                        parse.FullToken.FullImmediate.u[chan].Float;
+               }
+               ++ctx.num_imms;
+            }
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+finished:
+
+   tgsi_parse_free(&parse);
+
+   /*
+    * Link the output color values.
+    */
+
+   for (index = 0; index < PIPE_MAX_COLOR_BUFS; ++index) {
+      /* static: info->cbuf[] still points here after we return */
+      static const struct lp_tgsi_channel_info null_output[4];
+      info->cbuf[index] = null_output;
+   }
+
+   for (index = 0; index < info->base.num_outputs; ++index) {
+      unsigned semantic_name = info->base.output_semantic_name[index];
+      unsigned semantic_index = info->base.output_semantic_index[index];
+      if (semantic_name == TGSI_SEMANTIC_COLOR &&
+          semantic_index < PIPE_MAX_COLOR_BUFS) {
+         info->cbuf[semantic_index] = info->output[index];
+      }
+   }
+
+   if (gallivm_debug & GALLIVM_DEBUG_TGSI) {
+      dump_info(tokens, info);
+   }
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 441aebae298..3c318cc8c80 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -887,21 +887,25 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    }
 
    if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+      LLVMTypeRef i32t = LLVMInt32Type();
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
       for (i = 0; i < num_coords; i++) {
-         ddx[i] = emit_fetch( bld, inst, 1, i );
-         ddy[i] = emit_fetch( bld, inst, 2, i );
+         LLVMValueRef src1 = emit_fetch( bld, inst, 1, i );
+         LLVMValueRef src2 = emit_fetch( bld, inst, 2, i );
+         ddx[i] = LLVMBuildExtractElement(bld->base.builder, src1, index0, "");
+         ddy[i] = LLVMBuildExtractElement(bld->base.builder, src2, index0, "");
       }
       unit = inst->Src[3].Register.Index;
    }  else {
       for (i = 0; i < num_coords; i++) {
-         ddx[i] = lp_build_ddx( &bld->base, coords[i] );
-         ddy[i] = lp_build_ddy( &bld->base, coords[i] );
+         ddx[i] = lp_build_scalar_ddx( &bld->base, coords[i] );
+         ddy[i] = lp_build_scalar_ddy( &bld->base, coords[i] );
       }
       unit = inst->Src[1].Register.Index;
    }
    for (i = num_coords; i < 3; i++) {
-      ddx[i] = bld->base.undef;
-      ddy[i] = bld->base.undef;
+      ddx[i] = LLVMGetUndef(bld->base.elem_type);
+      ddy[i] = LLVMGetUndef(bld->base.elem_type);
    }
 
    bld->sampler->emit_fetch_texel(bld->sampler,
@@ -913,6 +917,43 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
                                   texel);
 }
 
+static boolean
+near_end_of_shader(struct lp_build_tgsi_soa_context *bld,
+		   int pc)
+{
+   int i;
+
+   for (i = 0; i < 5; i++) { /* look at the instruction at pc and the 4 after */
+      unsigned opcode;
+
+      if (pc + i >= bld->info->num_instructions)
+	 return TRUE;
+
+      opcode = bld->instructions[pc + i].Instruction.Opcode;
+
+      if (opcode == TGSI_OPCODE_END)
+	 return TRUE;
+
+      if (opcode == TGSI_OPCODE_TEX || /* texture, call or flow-control opener */
+	  opcode == TGSI_OPCODE_TXP ||
+	  opcode == TGSI_OPCODE_TXD ||
+	  opcode == TGSI_OPCODE_TXB ||
+	  opcode == TGSI_OPCODE_TXL ||
+	  opcode == TGSI_OPCODE_TXF ||
+	  opcode == TGSI_OPCODE_TXQ ||
+	  opcode == TGSI_OPCODE_CAL ||
+	  opcode == TGSI_OPCODE_CALLNZ ||
+	  opcode == TGSI_OPCODE_IF ||
+	  opcode == TGSI_OPCODE_IFC ||
+	  opcode == TGSI_OPCODE_BGNLOOP ||
+	  opcode == TGSI_OPCODE_SWITCH)
+	 return FALSE;
+   }
+
+   return TRUE; /* none of the above opcodes within the window */
+}
+
+
 
 /**
  * Kill fragment if any of the src register values are negative.
@@ -920,7 +961,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
 static void
 emit_kil(
    struct lp_build_tgsi_soa_context *bld,
-   const struct tgsi_full_instruction *inst )
+   const struct tgsi_full_instruction *inst,
+   int pc)
 {
    const struct tgsi_full_src_register *reg = &inst->Src[0];
    LLVMValueRef terms[NUM_CHANNELS];
@@ -959,8 +1001,12 @@ emit_kil(
       }
    }
 
-   if(mask)
+   if(mask) {
       lp_build_mask_update(bld->mask, mask);
+
+      if (!near_end_of_shader(bld, pc))
+	 lp_build_mask_check(bld->mask);
+   }
 }
 
 
@@ -972,7 +1018,8 @@ emit_kil(
  */
 static void
 emit_kilp(struct lp_build_tgsi_soa_context *bld,
-          const struct tgsi_full_instruction *inst)
+          const struct tgsi_full_instruction *inst,
+	  int pc)
 {
    LLVMValueRef mask;
 
@@ -983,10 +1030,14 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld,
       mask = LLVMBuildNot(bld->base.builder, bld->exec_mask.exec_mask, "kilp");
    }
    else {
-      mask = bld->base.zero;
+      LLVMValueRef zero = LLVMConstNull(bld->base.int_vec_type);
+      mask = zero;
    }
 
    lp_build_mask_update(bld->mask, mask);
+
+   if (!near_end_of_shader(bld, pc))
+      lp_build_mask_check(bld->mask);
 }
 
 static void
@@ -1535,12 +1586,12 @@ emit_instruction(
 
    case TGSI_OPCODE_KILP:
       /* predicated kill */
-      emit_kilp( bld, inst );
+      emit_kilp( bld, inst, (*pc)-1 );
       break;
 
    case TGSI_OPCODE_KIL:
       /* conditional kill */
-      emit_kil( bld, inst );
+      emit_kil( bld, inst, (*pc)-1 );
       break;
 
    case TGSI_OPCODE_PK2H:
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
index ef4b306cb67..330838d23cf 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -97,7 +97,7 @@ void (*ppc_get_func(struct ppc_function *p))(void)
       return (void (*)(void)) NULL;
    else
 #endif
-      return (void (*)(void)) p->store;
+      return (void (*)(void)) pointer_to_func(p->store);
 }
 
 
diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
index 036c1ee48a8..34bfa527db0 100644
--- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
@@ -23,26 +23,13 @@
 #include "cell/ppu/cell_public.h"
 #endif
 
+
 static INLINE struct pipe_screen *
-sw_screen_create(struct sw_winsys *winsys)
+sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
 {
-   const char *default_driver;
-   const char *driver;
    struct pipe_screen *screen = NULL;
 
 #if defined(GALLIUM_CELL)
-   default_driver = "cell";
-#elif defined(GALLIUM_LLVMPIPE)
-   default_driver = "llvmpipe";
-#elif defined(GALLIUM_SOFTPIPE)
-   default_driver = "softpipe";
-#else
-   default_driver = "";
-#endif
-
-   driver = debug_get_option("GALLIUM_DRIVER", default_driver);
-
-#if defined(GALLIUM_CELL)
    if (screen == NULL && strcmp(driver, "cell") == 0)
       screen = cell_create_screen(winsys);
 #endif
@@ -60,4 +47,26 @@ sw_screen_create(struct sw_winsys *winsys)
    return screen;
 }
 
+/** Create the screen named by GALLIUM_DRIVER, or the build-time default. */
+static INLINE struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys)
+{
+   const char *default_driver;
+   const char *driver;
+
+#if defined(GALLIUM_CELL)
+   default_driver = "cell";
+#elif defined(GALLIUM_LLVMPIPE)
+   default_driver = "llvmpipe";
+#elif defined(GALLIUM_SOFTPIPE)
+   default_driver = "softpipe";
+#else
+   default_driver = ""; /* no software driver compiled in */
+#endif
+
+   driver = debug_get_option("GALLIUM_DRIVER", default_driver);
+   return sw_screen_create_named(winsys, driver);
+}
+
+
 #endif
diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
index 0b4e7404034..e4effa713e9 100644
--- a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
@@ -13,22 +13,28 @@ static INLINE struct pipe_screen *
 sw_screen_wrap(struct pipe_screen *screen)
 {
    struct sw_winsys *sws;
-   struct pipe_screen *sw_screen;
+   struct pipe_screen *sw_screen = NULL;
+   const char *driver;
 
-   sws = wrapper_sw_winsys_warp_pipe_screen(screen);
+   driver = debug_get_option("GALLIUM_DRIVER", "native");
+   if (strcmp(driver, "native") == 0)
+      return screen;
+
+   sws = wrapper_sw_winsys_wrap_pipe_screen(screen);
    if (!sws)
       goto err;
 
-   sw_screen = sw_screen_create(sws);
-   if (sw_screen == screen)
+   sw_screen = sw_screen_create_named(sws, driver);
+
+   if (!sw_screen)
       goto err_winsys;
 
    return sw_screen;
 
 err_winsys:
-   sws->destroy(sws);
+   return wrapper_sw_winsys_dewrap_pipe_screen(sws);
 err:
-  return screen;
+   return screen;
 }
 
 #endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index f71ffb70308..77bde86684e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -90,7 +90,8 @@ static const char *processor_type_names[] =
    "GEOM"
 };
 
-static const char *file_names[TGSI_FILE_COUNT] =
+const char *
+tgsi_file_names[TGSI_FILE_COUNT] =
 {
    "NULL",
    "CONST",
@@ -125,7 +126,8 @@ static const char *semantic_names[] =
    "FACE",
    "EDGEFLAG",
    "PRIM_ID",
-   "INSTANCEID"
+   "INSTANCEID",
+   "STENCIL"
 };
 
 static const char *immediate_type_names[] =
@@ -135,7 +137,8 @@ static const char *immediate_type_names[] =
    "INT32"
 };
 
-static const char *swizzle_names[] =
+const char *
+tgsi_swizzle_names[] =
 {
    "x",
    "y",
@@ -143,7 +146,8 @@ static const char *swizzle_names[] =
    "w"
 };
 
-static const char *texture_names[] =
+const char *
+tgsi_texture_names[] =
 {
    "UNKNOWN",
    "1D",
@@ -201,15 +205,15 @@ _dump_register_src(
    struct dump_ctx *ctx,
    const struct tgsi_full_src_register *src )
 {
-   ENM(src->Register.File, file_names);
+   ENM(src->Register.File, tgsi_file_names);
    if (src->Register.Dimension) {
       if (src->Dimension.Indirect) {
          CHR( '[' );
-         ENM( src->DimIndirect.File, file_names );
+         ENM( src->DimIndirect.File, tgsi_file_names );
          CHR( '[' );
          SID( src->DimIndirect.Index );
          TXT( "]." );
-         ENM( src->DimIndirect.SwizzleX, swizzle_names );
+         ENM( src->DimIndirect.SwizzleX, tgsi_swizzle_names );
          if (src->Dimension.Index != 0) {
             if (src->Dimension.Index > 0)
                CHR( '+' );
@@ -224,11 +228,11 @@ _dump_register_src(
    }
    if (src->Register.Indirect) {
       CHR( '[' );
-      ENM( src->Indirect.File, file_names );
+      ENM( src->Indirect.File, tgsi_file_names );
       CHR( '[' );
       SID( src->Indirect.Index );
       TXT( "]." );
-      ENM( src->Indirect.SwizzleX, swizzle_names );
+      ENM( src->Indirect.SwizzleX, tgsi_swizzle_names );
       if (src->Register.Index != 0) {
          if (src->Register.Index > 0)
             CHR( '+' );
@@ -248,15 +252,15 @@ _dump_register_dst(
    struct dump_ctx *ctx,
    const struct tgsi_full_dst_register *dst )
 {
-   ENM(dst->Register.File, file_names);
+   ENM(dst->Register.File, tgsi_file_names);
    if (dst->Register.Dimension) {
       if (dst->Dimension.Indirect) {
          CHR( '[' );
-         ENM( dst->DimIndirect.File, file_names );
+         ENM( dst->DimIndirect.File, tgsi_file_names );
          CHR( '[' );
          SID( dst->DimIndirect.Index );
          TXT( "]." );
-         ENM( dst->DimIndirect.SwizzleX, swizzle_names );
+         ENM( dst->DimIndirect.SwizzleX, tgsi_swizzle_names );
          if (dst->Dimension.Index != 0) {
             if (dst->Dimension.Index > 0)
                CHR( '+' );
@@ -271,11 +275,11 @@ _dump_register_dst(
    }
    if (dst->Register.Indirect) {
       CHR( '[' );
-      ENM( dst->Indirect.File, file_names );
+      ENM( dst->Indirect.File, tgsi_file_names );
       CHR( '[' );
       SID( dst->Indirect.Index );
       TXT( "]." );
-      ENM( dst->Indirect.SwizzleX, swizzle_names );
+      ENM( dst->Indirect.SwizzleX, tgsi_swizzle_names );
       if (dst->Register.Index != 0) {
          if (dst->Register.Index > 0)
             CHR( '+' );
@@ -351,7 +355,7 @@ iter_declaration(
 
    TXT( "DCL " );
 
-   ENM(decl->Declaration.File, file_names);
+   ENM(decl->Declaration.File, tgsi_file_names);
 
    /* all geometry shader inputs are two dimensional */
    if (decl->Declaration.File == TGSI_FILE_INPUT &&
@@ -585,10 +589,10 @@ iter_instruction(
           inst->Predicate.SwizzleZ != TGSI_SWIZZLE_Z ||
           inst->Predicate.SwizzleW != TGSI_SWIZZLE_W) {
          CHR( '.' );
-         ENM( inst->Predicate.SwizzleX, swizzle_names );
-         ENM( inst->Predicate.SwizzleY, swizzle_names );
-         ENM( inst->Predicate.SwizzleZ, swizzle_names );
-         ENM( inst->Predicate.SwizzleW, swizzle_names );
+         ENM( inst->Predicate.SwizzleX, tgsi_swizzle_names );
+         ENM( inst->Predicate.SwizzleY, tgsi_swizzle_names );
+         ENM( inst->Predicate.SwizzleZ, tgsi_swizzle_names );
+         ENM( inst->Predicate.SwizzleW, tgsi_swizzle_names );
       }
 
       TXT( ") " );
@@ -641,10 +645,10 @@ iter_instruction(
           src->Register.SwizzleZ != TGSI_SWIZZLE_Z ||
           src->Register.SwizzleW != TGSI_SWIZZLE_W) {
          CHR( '.' );
-         ENM( src->Register.SwizzleX, swizzle_names );
-         ENM( src->Register.SwizzleY, swizzle_names );
-         ENM( src->Register.SwizzleZ, swizzle_names );
-         ENM( src->Register.SwizzleW, swizzle_names );
+         ENM( src->Register.SwizzleX, tgsi_swizzle_names );
+         ENM( src->Register.SwizzleY, tgsi_swizzle_names );
+         ENM( src->Register.SwizzleZ, tgsi_swizzle_names );
+         ENM( src->Register.SwizzleW, tgsi_swizzle_names );
       }
 
       if (src->Register.Absolute)
@@ -655,7 +659,7 @@ iter_instruction(
 
    if (inst->Instruction.Texture) {
       TXT( ", " );
-      ENM( inst->Texture.Texture, texture_names );
+      ENM( inst->Texture.Texture, tgsi_texture_names );
    }
 
    switch (inst->Instruction.Opcode) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h
index dd78b361007..fc0429ad8d9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h
@@ -35,6 +35,15 @@
 extern "C" {
 #endif
 
+extern const char *
+tgsi_file_names[TGSI_FILE_COUNT];
+
+extern const char *
+tgsi_swizzle_names[];
+
+extern const char *
+tgsi_texture_names[];
+
 void
 tgsi_dump_str(
    const struct tgsi_token *tokens,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 90198a4f604..6585da3e838 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -147,6 +147,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                   info->input_semantic_name[reg] = (ubyte)fulldecl->Semantic.Name;
                   info->input_semantic_index[reg] = (ubyte)fulldecl->Semantic.Index;
                   info->input_interpolate[reg] = (ubyte)fulldecl->Declaration.Interpolate;
+                  info->input_centroid[reg] = (ubyte)fulldecl->Declaration.Centroid;
                   info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Declaration.CylindricalWrap;
                   info->num_inputs++;
                }
@@ -157,9 +158,11 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
 
                   /* extra info for special outputs */
                   if (procType == TGSI_PROCESSOR_FRAGMENT &&
-                      fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
-                     info->writes_z = TRUE;
-                  }
+                      fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION)
+                        info->writes_z = TRUE;
+                  if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                      fulldecl->Semantic.Name == TGSI_SEMANTIC_STENCIL)
+                        info->writes_stencil = TRUE;
                   if (procType == TGSI_PROCESSOR_VERTEX &&
                       fulldecl->Semantic.Name == TGSI_SEMANTIC_EDGEFLAG) {
                      info->writes_edgeflag = TRUE;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index f8aa90cf065..104097fbc03 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -45,6 +45,7 @@ struct tgsi_shader_info
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
    ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_centroid[PIPE_MAX_SHADER_INPUTS];
    ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
    ubyte input_cylindrical_wrap[PIPE_MAX_SHADER_INPUTS];
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
@@ -60,6 +61,7 @@ struct tgsi_shader_info
    uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
 
    boolean writes_z;  /**< does fragment shader write Z value? */
+   boolean writes_stencil; /**< does fragment shader write stencil value? */
    boolean writes_edgeflag; /**< vertex shader outputs edgeflag */
    boolean uses_kill;  /**< KIL or KILP instruction used? */
 
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index a75380228b1..850ef39ef21 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -68,6 +68,33 @@ struct translate_key {
 };
 
 
+struct translate;
+
+
+typedef void (PIPE_CDECL *run_elts_func)(struct translate *,
+                                         const unsigned *elts,
+                                         unsigned count,
+                                         unsigned instance_id,
+                                         void *output_buffer);
+
+typedef void (PIPE_CDECL *run_elts16_func)(struct translate *,
+                                           const uint16_t *elts,
+                                           unsigned count,
+                                           unsigned instance_id,
+                                           void *output_buffer);
+
+typedef void (PIPE_CDECL *run_elts8_func)(struct translate *,
+                                          const uint8_t *elts,
+                                          unsigned count,
+                                          unsigned instance_id,
+                                          void *output_buffer);
+
+typedef void (PIPE_CDECL *run_func)(struct translate *,
+                                    unsigned start,
+                                    unsigned count,
+                                    unsigned instance_id,
+                                    void *output_buffer);
+
 struct translate {
    struct translate_key key;
 
@@ -79,42 +106,14 @@ struct translate {
 		       unsigned stride,
 		       unsigned max_index );
 
-   void (PIPE_CDECL *run_elts)( struct translate *,
-                                const unsigned *elts,
-                                unsigned count,
-                                unsigned instance_id,
-                                void *output_buffer);
-
-   void (PIPE_CDECL *run_elts16)( struct translate *,
-                                const uint16_t *elts,
-                                unsigned count,
-                                unsigned instance_id,
-                                void *output_buffer);
-
-   void (PIPE_CDECL *run_elts8)( struct translate *,
-                                const uint8_t *elts,
-                                unsigned count,
-                                unsigned instance_id,
-                                void *output_buffer);
-
-   void (PIPE_CDECL *run)( struct translate *,
-                           unsigned start,
-                           unsigned count,
-                           unsigned instance_id,
-                           void *output_buffer);
+   run_elts_func run_elts;
+   run_elts16_func run_elts16;
+   run_elts8_func run_elts8;
+   run_func run;
 };
 
 
 
-#if 0
-struct translate_context *translate_context_create( void );
-void translate_context_destroy( struct translate_context * );
-
-struct translate *translate_lookup_or_create( struct translate_context *tctx,
-					      const struct translate_key *key );
-#endif
-
-
 struct translate *translate_create( const struct translate_key *key );
 
 boolean translate_is_output_format_supported(enum pipe_format format);
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index f8bf5b46692..ef7f4be4c3e 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -1495,19 +1495,19 @@ struct translate *translate_sse2_create( const struct translate_key *key )
    if (!build_vertex_emit(p, &p->elt8_func, 1))
       goto fail;
 
-   p->translate.run = (void*)x86_get_func(&p->linear_func);
+   p->translate.run = (run_func) x86_get_func(&p->linear_func);
    if (p->translate.run == NULL)
       goto fail;
 
-   p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
+   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
    if (p->translate.run_elts == NULL)
       goto fail;
 
-   p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
+   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
    if (p->translate.run_elts16 == NULL)
       goto fail;
 
-   p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
+   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
    if (p->translate.run_elts8 == NULL)
       goto fail;
 
diff --git a/src/gallium/auxiliary/util/u_dl.c b/src/gallium/auxiliary/util/u_dl.c
index 220860ebf4b..aca435d6cad 100644
--- a/src/gallium/auxiliary/util/u_dl.c
+++ b/src/gallium/auxiliary/util/u_dl.c
@@ -38,6 +38,7 @@
 #endif
 
 #include "u_dl.h"
+#include "u_pointer.h"
 
 
 struct util_dl_library *
@@ -58,7 +59,7 @@ util_dl_get_proc_address(struct util_dl_library *library,
                          const char *procname)
 {
 #if defined(PIPE_OS_UNIX)
-   return (util_dl_proc)dlsym((void *)library, procname);
+   return (util_dl_proc) pointer_to_func(dlsym((void *)library, procname));
 #elif defined(PIPE_OS_WINDOWS)
    return (util_dl_proc)GetProcAddress((HMODULE)library, procname);
 #else
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index 0811280b97b..8e5d4487a67 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -109,9 +109,12 @@ PIPE_FORMAT_Z32_UNORM               , plain, 1, 1, un32,     ,     ,     , x___,
 PIPE_FORMAT_Z32_FLOAT               , plain, 1, 1, f32 ,     ,     ,     , x___, zs
 PIPE_FORMAT_Z24_UNORM_S8_USCALED    , plain, 1, 1, un24, u8  ,     ,     , xy__, zs
 PIPE_FORMAT_S8_USCALED_Z24_UNORM    , plain, 1, 1, u8 ,  un24,     ,     , yx__, zs
+PIPE_FORMAT_X24S8_USCALED           , plain, 1, 1, x24,  u8  ,     ,     , _y__, zs
+PIPE_FORMAT_S8X24_USCALED           , plain, 1, 1, u8  , x24 ,     ,     , _x__, zs
 PIPE_FORMAT_Z24X8_UNORM             , plain, 1, 1, un24, x8  ,     ,     , x___, zs
 PIPE_FORMAT_X8Z24_UNORM             , plain, 1, 1, x8  , un24,     ,     , y___, zs
 PIPE_FORMAT_Z32_FLOAT_S8X24_USCALED , plain, 1, 1, f32,  u8  , x24 ,     , xy__, zs
+PIPE_FORMAT_X32_S8X24_USCALED       , plain, 1, 1, x32,  u8  , x24 ,     , _y__, zs
 
 # YUV formats
 # http://www.fourcc.org/yuv.php#UYVY
diff --git a/src/gallium/auxiliary/util/u_format_srgb.py b/src/gallium/auxiliary/util/u_format_srgb.py
index 3e8000f3687..cd63ae78919 100644
--- a/src/gallium/auxiliary/util/u_format_srgb.py
+++ b/src/gallium/auxiliary/util/u_format_srgb.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-'''
+CopyRight = '''
 /**************************************************************************
  *
  * Copyright 2010 VMware, Inc.
@@ -89,7 +89,7 @@ def main():
     print '/* This file is autogenerated by u_format_srgb.py. Do not edit directly. */'
     print
     # This will print the copyright message on the top of this file
-    print __doc__.strip()
+    print CopyRight.strip()
     print
     print '#include "u_format_srgb.h"'
     print
diff --git a/src/gallium/auxiliary/util/u_format_table.py b/src/gallium/auxiliary/util/u_format_table.py
index f0b407b8b8e..8cc22a56371 100755
--- a/src/gallium/auxiliary/util/u_format_table.py
+++ b/src/gallium/auxiliary/util/u_format_table.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-'''
+CopyRight = '''
 /**************************************************************************
  *
  * Copyright 2010 VMware, Inc.
@@ -83,7 +83,7 @@ def write_format_table(formats):
     print '/* This file is autogenerated by u_format_table.py from u_format.csv. Do not edit directly. */'
     print
     # This will print the copyright message on the top of this file
-    print __doc__.strip()
+    print CopyRight.strip()
     print
     print '#include "u_format.h"'
     print '#include "u_format_s3tc.h"'
diff --git a/src/gallium/auxiliary/util/u_format_zs.c b/src/gallium/auxiliary/util/u_format_zs.c
index 792d69c214c..80081e22f7c 100644
--- a/src/gallium/auxiliary/util/u_format_zs.c
+++ b/src/gallium/auxiliary/util/u_format_zs.c
@@ -918,3 +918,56 @@ util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned d
    }
 }
 
+
+void
+util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_z24_unorm_s8_uscaled_unpack_s_8uscaled(dst_row, dst_stride,
+						      src_row, src_stride,
+						      width, height);
+}
+
+void
+util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_z24_unorm_s8_uscaled_pack_s_8uscaled(dst_row, dst_stride,
+						    src_row, src_stride,
+						    width, height);
+}
+
+void
+util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_s8_uscaled_z24_unorm_unpack_s_8uscaled(dst_row, dst_stride,
+						      src_row, src_stride,
+						      width, height);
+}
+
+void
+util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)
+{
+   util_format_s8_uscaled_z24_unorm_pack_s_8uscaled(dst_row, dst_stride,
+						      src_row, src_stride,
+						      width, height);
+}
+
+void
+util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+						const uint8_t *src_row, unsigned src_stride,
+						unsigned width, unsigned height)
+{
+   util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(dst_row, dst_stride,
+							 src_row, src_stride,
+							 width, height);
+
+}
+
+void
+util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride,
+					      const uint8_t *src_row, unsigned src_stride,
+					      unsigned width, unsigned height)
+{
+   util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(dst_row, dst_stride,
+                                                       src_row, src_stride,
+						       width, height);
+}
diff --git a/src/gallium/auxiliary/util/u_format_zs.h b/src/gallium/auxiliary/util/u_format_zs.h
index 650db4b95fd..1604cc3eee2 100644
--- a/src/gallium/auxiliary/util/u_format_zs.h
+++ b/src/gallium/auxiliary/util/u_format_zs.h
@@ -192,5 +192,21 @@ util_format_z32_float_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned
 void
 util_format_z32_float_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
 
+void
+util_format_x24s8_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_x24s8_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
 
+void
+util_format_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_x32_s8x24_uscaled_unpack_s_8uscaled(uint8_t *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
+
+void
+util_format_x32_s8x24_uscaled_pack_s_8uscaled(uint8_t *dst_row, unsigned dst_sride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height);
 #endif /* U_FORMAT_ZS_H_ */
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 69a76814945..37294b7203f 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -118,6 +118,11 @@ __inline double __cdecl atan2(double val)
 #endif
 
 
+#ifndef M_SQRT2
+#define M_SQRT2 1.41421356237309504880
+#endif
+
+
 #if defined(_MSC_VER) 
 
 #if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 03198c91da4..1df6c872677 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -71,6 +71,96 @@ _mm_castps_si128(__m128 a)
 
 #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
 
+union m128i {
+   __m128i m;
+   ubyte ub[16];
+   ushort us[8];
+   uint ui[4];
+};
+
+static INLINE void u_print_epi8(const char *name, __m128i r)
+{
+   union { __m128i m; ubyte ub[16]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x/"
+                "%02x\n",
+                name,
+                u.ub[0],  u.ub[1],  u.ub[2],  u.ub[3],
+                u.ub[4],  u.ub[5],  u.ub[6],  u.ub[7],
+                u.ub[8],  u.ub[9],  u.ub[10], u.ub[11],
+                u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
+}
+
+static INLINE void u_print_epi16(const char *name, __m128i r)
+{
+   union { __m128i m; ushort us[8]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x/"
+                "%04x\n",
+                name,
+                u.us[0],  u.us[1],  u.us[2],  u.us[3],
+                u.us[4],  u.us[5],  u.us[6],  u.us[7]);
+}
+
+static INLINE void u_print_epi32(const char *name, __m128i r)
+{
+   union { __m128i m; uint ui[4]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%08x/"
+                "%08x/"
+                "%08x/"
+                "%08x\n",
+                name,
+                u.ui[0],  u.ui[1],  u.ui[2],  u.ui[3]);
+}
+
+static INLINE void u_print_ps(const char *name, __m128 r)
+{
+   union { __m128 m; float f[4]; } u;
+   u.m = r;
+
+   debug_printf("%s: "
+                "%f/"
+                "%f/"
+                "%f/"
+                "%f\n",
+                name,
+                u.f[0],  u.f[1],  u.f[2],  u.f[3]);
+}
+
+
+#define U_DUMP_EPI32(a) u_print_epi32(#a, a)
+#define U_DUMP_EPI16(a) u_print_epi16(#a, a)
+#define U_DUMP_EPI8(a)  u_print_epi8(#a, a)
+#define U_DUMP_PS(a)    u_print_ps(#a, a)
+
+
 
 #if defined(PIPE_ARCH_SSSE3)
 
@@ -98,6 +188,68 @@ _mm_shuffle_epi8(__m128i a, __m128i mask)
 #endif /* !PIPE_ARCH_SSSE3 */
 
 
-#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
+
+
+/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
+ * _mm_mul_epu32().
+ *
+ * Only the low 32 bits of each 64-bit product are kept, and those
+ * bits are the same for signed and unsigned operands, so this also
+ * works for general signed integer multiplication.
+ *
+ * This seems close enough to the speed of SSE4 and the real
+ * _mm_mullo_epi32() intrinsic as to not justify adding an sse4
+ * dependency at this point.
+ */
+static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
+{
+   __m128i a4   = _mm_srli_epi64(a, 32);  /* shift by one dword */
+   __m128i b4   = _mm_srli_epi64(b, 32);  /* shift by one dword */
+   __m128i ba   = _mm_mul_epu32(b, a);   /* multiply dwords 0, 2 */
+   __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */
+
+   /* Interleave the results, either with shuffles or (slightly
+    * faster) direct bit operations:
+    */
+#if 0
+   __m128i ba8             = _mm_shuffle_epi32(ba, 8);
+   __m128i b4a48           = _mm_shuffle_epi32(b4a4, 8);
+   __m128i result          = _mm_unpacklo_epi32(ba8, b4a48);
+#else
+   __m128i mask            = _mm_setr_epi32(~0,0,~0,0);
+   __m128i ba_mask         = _mm_and_si128(ba, mask);
+   __m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32);
+   __m128i result          = _mm_or_si128(ba_mask, b4a4_mask_shift);
+#endif
+
+   return result;
+}
+
+
+static INLINE void
+transpose4_epi32(const __m128i * restrict a,
+                 const __m128i * restrict b,
+                 const __m128i * restrict c,
+                 const __m128i * restrict d,
+                 __m128i * restrict o,
+                 __m128i * restrict p,
+                 __m128i * restrict q,
+                 __m128i * restrict r)
+{
+  __m128i t0 = _mm_unpacklo_epi32(*a, *b);
+  __m128i t1 = _mm_unpacklo_epi32(*c, *d);
+  __m128i t2 = _mm_unpackhi_epi32(*a, *b);
+  __m128i t3 = _mm_unpackhi_epi32(*c, *d);
+
+  *o = _mm_unpacklo_epi64(t0, t1);
+  *p = _mm_unpackhi_epi64(t0, t1);
+  *q = _mm_unpacklo_epi64(t2, t3);
+  *r = _mm_unpackhi_epi64(t2, t3);
+}
+
+#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
+
+
+#endif /* PIPE_ARCH_SSE */
 
 #endif /* U_SSE_H_ */
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index f7aa1403d08..44cadbfcdd0 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -217,6 +217,81 @@ z24s8_get_tile_rgba(const unsigned *src,
    }
 }
 
+/*** PIPE_FORMAT_X24S8_USCALED ***/
+
+/**
+ * Return S component as four floats in [0..255].  Unused X24 bits ignored.
+ */
+static void
+s8x24_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float)((*src++ >> 24) & 0xff);
+      }
+
+      p += dst_stride;
+   }
+}
+
+/*** PIPE_FORMAT_S8X24_USCALED ***/
+
+/**
+ * Return S component as four floats in [0..255].  Unused X24 bits ignored.
+ */
+static void
+x24s8_get_tile_rgba(const unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float)(*src++ & 0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/**
+ * Return S component as four floats in [0..255].
+ */
+static void
+s8_get_tile_rgba(const unsigned char *src,
+		 unsigned w, unsigned h,
+		 float *p,
+		 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float)(*src++ & 0xff);
+      }
+      p += dst_stride;
+   }
+}
 
 /*** PIPE_FORMAT_Z32_FLOAT ***/
 
@@ -261,10 +336,19 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_Z24X8_UNORM:
       s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
+   case PIPE_FORMAT_S8_USCALED:
+      s8_get_tile_rgba((unsigned char *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_X24S8_USCALED:
+      s8x24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
    case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
    case PIPE_FORMAT_X8Z24_UNORM:
       z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
+   case PIPE_FORMAT_S8X24_USCALED:
+      x24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
    case PIPE_FORMAT_Z32_FLOAT:
       z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
       break;