32 files changed, 680 insertions, 1468 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 1abf12e95bd..3b202b5bc77 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -158,6 +158,7 @@ GALLIVM_SOURCES = \
         gallivm/lp_bld_logic.c \
         gallivm/lp_bld_pack.c \
         gallivm/lp_bld_printf.c \
+        gallivm/lp_bld_quad.c \
         gallivm/lp_bld_sample.c \
         gallivm/lp_bld_sample_soa.c \
         gallivm/lp_bld_struct.c \
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index 7039d5f0c55..af4d5edcf86 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -208,6 +208,7 @@ if env['llvm']:
     'gallivm/lp_bld_misc.cpp',
     'gallivm/lp_bld_pack.c',
     'gallivm/lp_bld_printf.c',
+    'gallivm/lp_bld_quad.c',
     'gallivm/lp_bld_sample.c',
     'gallivm/lp_bld_sample_soa.c',
     'gallivm/lp_bld_struct.c',
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 7ea51621f72..bd5d8853cf8 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -16,40 +16,13 @@
 
 #include "util/u_cpu_detect.h"
 #include "util/u_string.h"
+#include "util/u_pointer.h"
 
 #include <llvm-c/Transforms/Scalar.h>
 
 #define DEBUG_STORE 0
 
 
-/** cast wrapper */
-static INLINE draw_jit_vert_func_elts
-voidptr_to_draw_vert_func_elts(void *v)
-{
-   union {
-      void *v;
-      draw_jit_vert_func_elts f;
-   } u;
-   assert(sizeof(u.v) == sizeof(u.f));
-   u.v = v;
-   return u.f;
-}
-
-
-/** cast wrapper */
-static INLINE draw_jit_vert_func
-voidptr_to_draw_jit_vert_func(void *v)
-{
-   union {
-      void *v;
-      draw_jit_vert_func f;
-   } u;
-   assert(sizeof(u.v) == sizeof(u.f));
-   u.v = v;
-   return u.f;
-}
-
-
 /* generates the draw jit function */
 static void
 draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var);
@@ -744,7 +717,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    }
 
    code = LLVMGetPointerToGlobal(llvm->draw->engine, variant->function);
-   variant->jit_func = voidptr_to_draw_jit_vert_func(code);
+   variant->jit_func = (draw_jit_vert_func)pointer_to_func(code);
 
    if (gallivm_debug & GALLIVM_DEBUG_ASM) {
       lp_disassemble(code);
@@ -899,7 +872,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    }
 
    code = LLVMGetPointerToGlobal(llvm->draw->engine, variant->function_elts);
-   variant->jit_func_elts = voidptr_to_draw_vert_func_elts(code);
+   variant->jit_func_elts = (draw_jit_vert_func_elts)pointer_to_func(code);
 
    if (gallivm_debug & GALLIVM_DEBUG_ASM) {
       lp_disassemble(code);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 47aa9aa3621..d926b2de189 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -54,12 +54,10 @@
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
-#include "lp_bld_init.h" /* for lp_build_engine */
 #include "lp_bld_logic.h"
 #include "lp_bld_pack.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_arit.h"
-#include "lp_bld_printf.h"
 
 
 /**
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index 031ce9d1a37..e42ff31ac7a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -280,34 +280,45 @@ lp_build_one(struct lp_type type)
                
 
 /**
- * Build constant-valued vector from a scalar value.
+ * Build constant-valued element from a scalar value.
  */
 LLVMValueRef
-lp_build_const_vec(struct lp_type type,
-                   double val)
+lp_build_const_elem(struct lp_type type,
+                    double val)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   LLVMValueRef elem;
 
    if(type.floating) {
-      elems[0] = LLVMConstReal(elem_type, val);
+      elem = LLVMConstReal(elem_type, val);
    }
    else {
       double dscale = lp_const_scale(type);
 
-      elems[0] = LLVMConstInt(elem_type, val*dscale + 0.5, 0);
+      elem = LLVMConstInt(elem_type, val*dscale + 0.5, 0);
    }
 
-   if (type.length == 1)
-      return elems[0];
+   return elem;
+}
 
-   for(i = 1; i < type.length; ++i)
-      elems[i] = elems[0];
 
-   return LLVMConstVector(elems, type.length);
+/**
+ * Build constant-valued vector from a scalar value.
+ * Length-1 types yield the bare element; length must fit LP_MAX_VECTOR_LENGTH.
+ */
+LLVMValueRef
+lp_build_const_vec(struct lp_type type, double val)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   elems[0] = lp_build_const_elem(type, val);
+   if (type.length == 1)
+      return elems[0];
+   for(i = 1; i < type.length; ++i)
+      elems[i] = elems[0];
+   return LLVMConstVector(elems, type.length);
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index 9ca2f0664eb..d46b9f882b0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -85,6 +85,10 @@ lp_build_one(struct lp_type type);
 
 
 LLVMValueRef
+lp_build_const_elem(struct lp_type type,
+                    double val);
+
+LLVMValueRef
 lp_build_const_vec(struct lp_type type, double val);
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index 823a8ec7b70..5bc9c741a88 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -38,7 +38,7 @@
 #include "lp_bld_flow.h"
 
 
-#define LP_BUILD_FLOW_MAX_VARIABLES 32
+#define LP_BUILD_FLOW_MAX_VARIABLES 64
 #define LP_BUILD_FLOW_MAX_DEPTH 32
 
 /**
@@ -407,6 +407,7 @@ lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
    /* for each variable, update the Phi node with a (variable, block) pair */
    for(i = 0; i < skip->num_variables; ++i) {
       assert(*flow->variables[i]);
+      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
       LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
    }
 
@@ -433,6 +434,7 @@ lp_build_flow_skip_end(struct lp_build_flow_context *flow)
    /* add (variable, block) tuples to the phi nodes */
    for(i = 0; i < skip->num_variables; ++i) {
       assert(*flow->variables[i]);
+      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
       LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
       *flow->variables[i] = skip->phi[i];
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index e02a45114b0..0a690ea7476 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -37,11 +37,11 @@
 unsigned gallivm_debug = 0;
 
 static const struct debug_named_value lp_bld_debug_flags[] = {
-   { "tgsi",   GALLIVM_DEBUG_TGSI },
-   { "ir",     GALLIVM_DEBUG_IR },
-   { "asm",    GALLIVM_DEBUG_ASM },
-   { "nopt",   GALLIVM_DEBUG_NO_OPT },
-   {NULL, 0}
+   { "tgsi",   GALLIVM_DEBUG_TGSI, NULL },
+   { "ir",     GALLIVM_DEBUG_IR, NULL },
+   { "asm",    GALLIVM_DEBUG_ASM, NULL },
+   { "nopt",   GALLIVM_DEBUG_NO_OPT, NULL },
+   DEBUG_NAMED_VALUE_END
 };
 #endif
 
@@ -75,6 +75,10 @@ enum LLVM_CodeGenOpt_Level {
 };
 
 
+extern void
+lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
+
+
 void
 lp_build_init(void)
 {
@@ -109,6 +113,10 @@ lp_build_init(void)
          LLVMDisposeMessage(error);
          assert(0);
       }
+
+#if defined(DEBUG) || defined(PROFILE)
+      lp_register_oprofile_jit_event_listener(lp_build_engine);
+#endif
    }
 
    if (!lp_build_target)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index db0ca606e58..f004c0ae451 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -34,7 +34,10 @@
 #define __STDC_CONSTANT_MACROS
 #endif
 
-#include "llvm-c/Core.h"
+#include <llvm-c/Core.h>
+#include <llvm-c/ExecutionEngine.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+#include <llvm/ExecutionEngine/JITEventListener.h>
 
 #include "pipe/p_config.h"
 #include "util/u_debug.h"
@@ -98,3 +101,21 @@ lp_debug_dump_value(LLVMValueRef value)
 
 
 #endif
+
+
+/**
+ * Register an OProfile JIT event listener with the given execution engine.
+ *
+ * This makes the LLVM IR function names visible in oprofile output.
+ *
+ * For this to actually work, LLVM needs to be built with the --with-oprofile
+ * configure option.
+ *
+ * An oprofile:oprofile user:group is also necessary, which is not created by
+ * default on some distributions.
+ */
+extern "C" void
+lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE)
+{
+   llvm::unwrap(EE)->RegisterJITEventListener(llvm::createOProfileJITEventListener());
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
new file mode 100644
index 00000000000..38fd5a39efa
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -0,0 +1,101 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "lp_bld_type.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_quad.h"
+
+
+/* Maps each quad element to the left-column element of its own row. */
+static const unsigned char swizzle_left[4] = {
+   LP_BLD_QUAD_TOP_LEFT,     LP_BLD_QUAD_TOP_LEFT,
+   LP_BLD_QUAD_BOTTOM_LEFT,  LP_BLD_QUAD_BOTTOM_LEFT
+};
+
+/* Maps each quad element to the right-column element of its own row. */
+static const unsigned char swizzle_right[4] = {
+   LP_BLD_QUAD_TOP_RIGHT,    LP_BLD_QUAD_TOP_RIGHT,
+   LP_BLD_QUAD_BOTTOM_RIGHT, LP_BLD_QUAD_BOTTOM_RIGHT
+};
+
+/* Maps each quad element to the top-row element of its own column. */
+static const unsigned char swizzle_top[4] = {
+   LP_BLD_QUAD_TOP_LEFT,     LP_BLD_QUAD_TOP_RIGHT,
+   LP_BLD_QUAD_TOP_LEFT,     LP_BLD_QUAD_TOP_RIGHT
+};
+
+/* Maps each quad element to the bottom-row element of its own column. */
+static const unsigned char swizzle_bottom[4] = {
+   LP_BLD_QUAD_BOTTOM_LEFT,  LP_BLD_QUAD_BOTTOM_RIGHT,
+   LP_BLD_QUAD_BOTTOM_LEFT,  LP_BLD_QUAD_BOTTOM_RIGHT
+};
+
+
+/* Horizontal derivative: right-column value minus left-column value, per row. */
+LLVMValueRef
+lp_build_ddx(struct lp_build_context *bld, LLVMValueRef a)
+{
+   LLVMValueRef left  = lp_build_swizzle1_aos(bld, a, swizzle_left);
+   LLVMValueRef right = lp_build_swizzle1_aos(bld, a, swizzle_right);
+   return lp_build_sub(bld, right, left);
+}
+
+
+/* Vertical derivative: bottom-row value minus top-row value, per column. */
+LLVMValueRef
+lp_build_ddy(struct lp_build_context *bld, LLVMValueRef a)
+{
+   LLVMValueRef top    = lp_build_swizzle1_aos(bld, a, swizzle_top);
+   LLVMValueRef bottom = lp_build_swizzle1_aos(bld, a, swizzle_bottom);
+   return lp_build_sub(bld, bottom, top);
+}
+
+
+/* Scalar d/dx: element TOP_RIGHT minus element TOP_LEFT of the vector a. */
+LLVMValueRef
+lp_build_scalar_ddx(struct lp_build_context *bld, LLVMValueRef a)
+{
+   LLVMValueRef left_idx  = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0);
+   LLVMValueRef right_idx = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_RIGHT, 0);
+   LLVMValueRef left  = LLVMBuildExtractElement(bld->builder, a, left_idx, "");
+   LLVMValueRef right = LLVMBuildExtractElement(bld->builder, a, right_idx, "");
+   return LLVMBuildSub(bld->builder, right, left, "");
+}
+
+
+/* Scalar d/dy: element BOTTOM_LEFT minus element TOP_LEFT of the vector a. */
+LLVMValueRef
+lp_build_scalar_ddy(struct lp_build_context *bld, LLVMValueRef a)
+{
+   LLVMValueRef top_idx    = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0);
+   LLVMValueRef bottom_idx = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_BOTTOM_LEFT, 0);
+   LLVMValueRef top    = LLVMBuildExtractElement(bld->builder, a, top_idx, "");
+   LLVMValueRef bottom = LLVMBuildExtractElement(bld->builder, a, bottom_idx, "");
+   return LLVMBuildSub(bld->builder, bottom, top, "");
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
new file mode 100644
index 00000000000..b7992912927
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
@@ -0,0 +1,96 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_QUAD_H_
+#define LP_BLD_QUAD_H_
+
+
+#include "gallivm/lp_bld.h"
+
+
+struct lp_build_context;
+
+
+/*
+ * Each quad is composed of four elements.
+ *
+ * #########
+ * # 0 | 1 #
+ * #---+---#
+ * # 2 | 3 #
+ * #########
+ */
+
+enum lp_bld_quad {
+   LP_BLD_QUAD_TOP_LEFT     = 0,
+   LP_BLD_QUAD_TOP_RIGHT    = 1,
+   LP_BLD_QUAD_BOTTOM_LEFT  = 2,
+   LP_BLD_QUAD_BOTTOM_RIGHT = 3
+};
+
+
+/*
+ * (Vector) derivatives.
+ *
+ * More than one quad is supported. The only requirement is that the vector
+ * contains a whole number of quads:
+ *
+ * ######### ######### ...
+ * # 0 | 1 # # 4 | 5 #
+ * #---+---# #---+---# ...
+ * # 2 | 3 # # 6 | 7 #
+ * ######### ######### ...
+ */
+
+LLVMValueRef
+lp_build_ddx(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+
+LLVMValueRef
+lp_build_ddy(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+
+/*
+ * Scalar derivatives.
+ *
+ * Same as taking the first element of the vector derivatives above.
+ */
+
+LLVMValueRef
+lp_build_scalar_ddx(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
+
+LLVMValueRef
+lp_build_scalar_ddy(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
+
+#endif /* LP_BLD_QUAD_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index c7f9b1083b1..946c23e317a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -92,7 +92,7 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->wrap_r            = sampler->wrap_r;
    state->min_img_filter    = sampler->min_img_filter;
    state->mag_img_filter    = sampler->mag_img_filter;
-   if (texture->last_level) {
+   if (view->last_level) {
       state->min_mip_filter = sampler->min_mip_filter;
    } else {
       state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
@@ -105,8 +105,14 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
    state->normalized_coords = sampler->normalized_coords;
    state->lod_bias          = sampler->lod_bias;
-   state->min_lod           = sampler->min_lod;
-   state->max_lod           = sampler->max_lod;
+   if (!view->last_level &&
+       sampler->min_img_filter == sampler->mag_img_filter) {
+      state->min_lod        = 0.0f;
+      state->max_lod        = 0.0f;
+   } else {
+      state->min_lod        = MAX2(sampler->min_lod, 0.0f);
+      state->max_lod        = sampler->max_lod;
+   }
    state->border_color[0]   = sampler->border_color[0];
    state->border_color[1]   = sampler->border_color[1];
    state->border_color[2]   = sampler->border_color[2];
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index bc7213db8b3..84c04fe272f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -415,10 +415,8 @@ is_simple_wrap_mode(unsigned mode)
 {
    switch (mode) {
    case PIPE_TEX_WRAP_REPEAT:
-   case PIPE_TEX_WRAP_CLAMP:
    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
       return TRUE;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
    default:
       return FALSE;
    }
@@ -455,24 +453,17 @@ lp_build_sample_wrap_int(struct lp_build_sample_context *bld,
          coord = LLVMBuildURem(bld->builder, coord, length, "");
       break;
 
-   case PIPE_TEX_WRAP_CLAMP:
    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
       coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
       coord = lp_build_min(int_coord_bld, coord, length_minus_one);
       break;
 
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
    case PIPE_TEX_WRAP_MIRROR_REPEAT:
    case PIPE_TEX_WRAP_MIRROR_CLAMP:
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      /* FIXME */
-      _debug_printf("llvmpipe: failed to translate texture wrap mode %s\n",
-                    util_dump_tex_wrap(wrap_mode, TRUE));
-      coord = lp_build_max(uint_coord_bld, coord, uint_coord_bld->zero);
-      coord = lp_build_min(uint_coord_bld, coord, length_minus_one);
-      break;
-
    default:
       assert(0);
    }
@@ -500,11 +491,9 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
    LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
-   LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
    LLVMValueRef coord0, coord1, weight;
 
    switch(wrap_mode) {
@@ -532,16 +521,18 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 
    case PIPE_TEX_WRAP_CLAMP:
       if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
+
+      /* clamp to [0, length] */
+      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
+
+      coord = lp_build_sub(coord_bld, coord, half);
+
       weight = lp_build_fract(coord_bld, coord);
-      coord0 = lp_build_clamp(coord_bld, coord, coord_bld->zero,
-                              length_f_minus_one);
-      coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
-      coord1 = lp_build_clamp(coord_bld, coord1, coord_bld->zero,
-                              length_f_minus_one);
-      coord0 = lp_build_ifloor(coord_bld, coord0);
-      coord1 = lp_build_ifloor(coord_bld, coord1);
+      coord0 = lp_build_ifloor(coord_bld, coord);
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       break;
 
    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
@@ -555,7 +546,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       else {
          LLVMValueRef min, max;
          /* clamp to [0.5, length - 0.5] */
-         min = lp_build_const_vec(coord_bld->type, 0.5F);
+         min = half;
          max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
       }
@@ -574,25 +565,14 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       {
          LLVMValueRef min, max;
          if (bld->static_state->normalized_coords) {
-            /* min = -1.0 / (2 * length) = -0.5 / length */
-            min = lp_build_mul(coord_bld,
-                               lp_build_const_vec(coord_bld->type, -0.5F),
-                               lp_build_rcp(coord_bld, length_f));
-            /* max = 1.0 - min */
-            max = lp_build_sub(coord_bld, coord_bld->one, min);
-            /* coord = clamp(coord, min, max) */
-            coord = lp_build_clamp(coord_bld, coord, min, max);
-            /* scale coord to length (and sub 0.5?) */
+            /* scale coord to length */
             coord = lp_build_mul(coord_bld, coord, length_f);
-            coord = lp_build_sub(coord_bld, coord, half);
-         }
-         else {
-            /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_vec(coord_bld->type, -0.5F);
-            max = lp_build_sub(coord_bld, length_f, min);
-            coord = lp_build_clamp(coord_bld, coord, min, max);
-            coord = lp_build_sub(coord_bld, coord, half);
          }
+         /* clamp to [-0.5, length + 0.5] */
+         min = lp_build_const_vec(coord_bld->type, -0.5F);
+         max = lp_build_sub(coord_bld, length_f, min);
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         coord = lp_build_sub(coord_bld, coord, half);
          /* compute lerp weight */
          weight = lp_build_fract(coord_bld, coord);
          /* convert to int */
@@ -623,35 +603,41 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = 1.0 - min */
-         max = lp_build_sub(coord_bld, coord_bld->one, min);
+      coord = lp_build_abs(coord_bld, coord);
 
-         coord = lp_build_abs(coord_bld, coord);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
+      if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
-         if(0)coord = lp_build_sub(coord_bld, coord, half);
-         weight = lp_build_fract(coord_bld, coord);
-         coord0 = lp_build_ifloor(coord_bld, coord);
-         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       }
+
+      /* clamp to [0, length] */
+      coord = lp_build_min(coord_bld, coord, length_f);
+
+      coord = lp_build_sub(coord_bld, coord, half);
+
+      weight = lp_build_fract(coord_bld, coord);
+      coord0 = lp_build_ifloor(coord_bld, coord);
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
       {
          LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = 1.0 - min */
-         max = lp_build_sub(coord_bld, coord_bld->one, min);
 
          coord = lp_build_abs(coord_bld, coord);
+
+         if (bld->static_state->normalized_coords) {
+            /* scale coord to length */
+            coord = lp_build_mul(coord_bld, coord, length_f);
+         }
+
+         /* clamp to [0.5, length - 0.5] */
+         min = half;
+         max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
-         coord = lp_build_mul(coord_bld, coord, length_f);
+
          coord = lp_build_sub(coord_bld, coord, half);
+
          weight = lp_build_fract(coord_bld, coord);
          coord0 = lp_build_ifloor(coord_bld, coord);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
@@ -661,17 +647,21 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
       {
          LLVMValueRef min, max;
-         /* min = -1.0 / (2 * length) = -0.5 / length */
-         min = lp_build_mul(coord_bld,
-                            lp_build_const_vec(coord_bld->type, -0.5F),
-                            lp_build_rcp(coord_bld, length_f));
-         /* max = 1.0 - min */
-         max = lp_build_sub(coord_bld, coord_bld->one, min);
 
          coord = lp_build_abs(coord_bld, coord);
+
+         if (bld->static_state->normalized_coords) {
+            /* scale coord to length */
+            coord = lp_build_mul(coord_bld, coord, length_f);
+         }
+
+         /* clamp to [-0.5, length + 0.5] */
+         min = lp_build_negate(coord_bld, half);
+         max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
-         coord = lp_build_mul(coord_bld, coord, length_f);
+
          coord = lp_build_sub(coord_bld, coord, half);
+
          weight = lp_build_fract(coord_bld, coord);
          coord0 = lp_build_ifloor(coord_bld, coord);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
@@ -708,10 +698,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
-   LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
    LLVMValueRef icoord;
    
    switch(wrap_mode) {
@@ -727,120 +715,80 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
       break;
 
    case PIPE_TEX_WRAP_CLAMP:
-      /* mul by size */
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
       if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
+
       /* floor */
       icoord = lp_build_ifloor(coord_bld, coord);
-      /* clamp to [0, size-1].  Note: int coord builder type */
+
+      /* clamp to [0, length - 1]. */
       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
                               length_minus_one);
       break;
 
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      {
-         LLVMValueRef min, max;
-         if (bld->static_state->normalized_coords) {
-            /* min = 1.0 / (2 * length) */
-            min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-            /* max = length - min */
-            max = lp_build_sub(coord_bld, length_f, min);
-            /* scale coord to length */
-            coord = lp_build_mul(coord_bld, coord, length_f);
-         }
-         else {
-            /* clamp to [0.5, length - 0.5] */
-            min = lp_build_const_vec(coord_bld->type, 0.5F);
-            max = lp_build_sub(coord_bld, length_f, min);
-         }
-         /* coord = clamp(coord, min, max) */
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
-      }
-      break;
-
    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
       /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */
       {
          LLVMValueRef min, max;
+
          if (bld->static_state->normalized_coords) {
-            /* min = -1.0 / (2 * length) = -0.5 / length */
-            min = lp_build_mul(coord_bld,
-                               lp_build_const_vec(coord_bld->type, -0.5F),
-                               lp_build_rcp(coord_bld, length_f));
-            /* max = length - min */
-            max = lp_build_sub(coord_bld, length_f, min);
             /* scale coord to length */
             coord = lp_build_mul(coord_bld, coord, length_f);
          }
-         else {
-            /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_vec(coord_bld->type, -0.5F);
-            max = lp_build_sub(coord_bld, length_f, min);
-         }
-         /* coord = clamp(coord, min, max) */
-         coord = lp_build_clamp(coord_bld, coord, min, max);
+
          icoord = lp_build_ifloor(coord_bld, coord);
+
+         /* clamp to [-1, length] */
+         min = lp_build_negate(int_coord_bld, int_coord_bld->one);
+         max = length;
+         icoord = lp_build_clamp(int_coord_bld, icoord, min, max);
       }
       break;
 
    case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = length - min */
-         max = lp_build_sub(coord_bld, length_f, min);
+      /* compute mirror function */
+      coord = lp_build_coord_mirror(bld, coord);
 
-         /* compute mirror function */
-         coord = lp_build_coord_mirror(bld, coord);
+      /* scale coord to length */
+      assert(bld->static_state->normalized_coords);
+      coord = lp_build_mul(coord_bld, coord, length_f);
 
-         /* scale coord to length */
-         coord = lp_build_mul(coord_bld, coord, length_f);
+      icoord = lp_build_ifloor(coord_bld, coord);
 
-         /* coord = clamp(coord, min, max) */
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
-      }
+      /* clamp to [0, length - 1] */
+      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      coord = lp_build_abs(coord_bld, coord);
-      coord = lp_build_mul(coord_bld, coord, length_f);
-      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f_minus_one);
-      icoord = lp_build_ifloor(coord_bld, coord);
-      break;
-
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = length - min */
-         max = lp_build_sub(coord_bld, length_f, min);
+      coord = lp_build_abs(coord_bld, coord);
 
-         coord = lp_build_abs(coord_bld, coord);
+      if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
       }
+
+      icoord = lp_build_ifloor(coord_bld, coord);
+
+      /* clamp to [0, length - 1] */
+      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         min = lp_build_negate(coord_bld, min);
-         /* max = length - min */
-         max = lp_build_sub(coord_bld, length_f, min);
+      coord = lp_build_abs(coord_bld, coord);
 
-         coord = lp_build_abs(coord_bld, coord);
+      if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
       }
+
+      icoord = lp_build_ifloor(coord_bld, coord);
+
+      /* clamp to [0, length] */
+      icoord = lp_build_min(int_coord_bld, icoord, length);
       break;
 
    default:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 40ea94c4935..ea949a13631 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -53,6 +53,7 @@
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_quad.h"
 #include "lp_bld_tgsi.h"
 #include "lp_bld_limits.h"
 #include "lp_bld_debug.h"
@@ -76,11 +77,6 @@
 #define CHAN_Z 2
 #define CHAN_W 3
 
-#define QUAD_TOP_LEFT     0
-#define QUAD_TOP_RIGHT    1
-#define QUAD_BOTTOM_LEFT  2
-#define QUAD_BOTTOM_RIGHT 3
-
 #define LP_MAX_INSTRUCTIONS 256
 
 
@@ -148,30 +144,6 @@ struct lp_build_tgsi_soa_context
    uint max_instructions;
 };
 
-static const unsigned char
-swizzle_left[4] = {
-   QUAD_TOP_LEFT,     QUAD_TOP_LEFT,
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_LEFT
-};
-
-static const unsigned char
-swizzle_right[4] = {
-   QUAD_TOP_RIGHT,    QUAD_TOP_RIGHT,
-   QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT
-};
-
-static const unsigned char
-swizzle_top[4] = {
-   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT,
-   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT
-};
-
-static const unsigned char
-swizzle_bottom[4] = {
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT,
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT
-};
-
 static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
 {
    mask->bld = bld;
@@ -433,25 +405,6 @@ static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
 }
 
 static LLVMValueRef
-emit_ddx(struct lp_build_tgsi_soa_context *bld,
-         LLVMValueRef src)
-{
-   LLVMValueRef src_left  = lp_build_swizzle1_aos(&bld->base, src, swizzle_left);
-   LLVMValueRef src_right = lp_build_swizzle1_aos(&bld->base, src, swizzle_right);
-   return lp_build_sub(&bld->base, src_right, src_left);
-}
-
-
-static LLVMValueRef
-emit_ddy(struct lp_build_tgsi_soa_context *bld,
-         LLVMValueRef src)
-{
-   LLVMValueRef src_top    = lp_build_swizzle1_aos(&bld->base, src, swizzle_top);
-   LLVMValueRef src_bottom = lp_build_swizzle1_aos(&bld->base, src, swizzle_bottom);
-   return lp_build_sub(&bld->base, src_top, src_bottom);
-}
-
-static LLVMValueRef
 get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
              unsigned index,
              unsigned chan,
@@ -599,10 +552,10 @@ emit_fetch_deriv(
    /* TODO: use interpolation coeffs for inputs */
 
    if(ddx)
-      *ddx = emit_ddx(bld, src);
+      *ddx = lp_build_ddx(&bld->base, src);
 
    if(ddy)
-      *ddy = emit_ddy(bld, src);
+      *ddy = lp_build_ddy(&bld->base, src);
 }
 
 
@@ -842,8 +795,8 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       unit = inst->Src[3].Register.Index;
    }  else {
       for (i = 0; i < num_coords; i++) {
-         ddx[i] = emit_ddx( bld, coords[i] );
-         ddy[i] = emit_ddy( bld, coords[i] );
+         ddx[i] = lp_build_ddx( &bld->base, coords[i] );
+         ddy[i] = lp_build_ddy( &bld->base, coords[i] );
       }
       unit = inst->Src[1].Register.Index;
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
index aac3a57bc73..06f1aae6dcc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -372,7 +372,23 @@ lp_build_context_init(struct lp_build_context *bld,
 {
    bld->builder = builder;
    bld->type = type;
-   bld->undef = lp_build_undef(type);
-   bld->zero = lp_build_zero(type);
+
+   bld->int_elem_type = lp_build_int_elem_type(type);
+   if (type.floating)
+      bld->elem_type = lp_build_elem_type(type);
+   else
+      bld->elem_type = bld->int_elem_type;
+
+   if (type.length == 1) {
+      bld->int_vec_type = bld->int_elem_type;
+      bld->vec_type = bld->elem_type;
+   }
+   else {
+      bld->int_vec_type = LLVMVectorType(bld->int_elem_type, type.length);
+      bld->vec_type = LLVMVectorType(bld->elem_type, type.length);
+   }
+
+   bld->undef = LLVMGetUndef(bld->vec_type);
+   bld->zero = LLVMConstNull(bld->vec_type);
    bld->one = lp_build_one(type);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index 17819d4d32a..df77ef21551 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -129,6 +129,18 @@ struct lp_build_context
    struct lp_type type;
 
    /** Same as lp_build_undef(type) */
+   LLVMTypeRef elem_type;
+
+   /** Vector of elem_type, type.length wide (elem_type itself if length is 1) */
+   LLVMTypeRef vec_type;
+
+   /** Same as lp_build_int_elem_type(type) */
+   LLVMTypeRef int_elem_type;
+
+   /** Vector of int_elem_type, type.length wide (int_elem_type itself if length is 1) */
+   LLVMTypeRef int_vec_type;
+
+   /** Same as lp_build_undef(type) */
    LLVMValueRef undef;
 
    /** Same as lp_build_zero(type) */
diff --git a/src/gallium/auxiliary/tgsi/tgsi-instruction-set.txt b/src/gallium/auxiliary/tgsi/tgsi-instruction-set.txt
deleted file mode 100644
index 5d9eed92580..00000000000
--- a/src/gallium/auxiliary/tgsi/tgsi-instruction-set.txt
+++ /dev/null
@@ -1,1127 +0,0 @@
-TGSI Instruction Specification
-==============================
-==============================
-
-
-1  Instruction Set Operations
-=============================
-
-
-1.1  GL_NV_vertex_program
--------------------------
-
-
-1.1.1  ARL - Address Register Load
-
-  dst.x = floor(src.x)
-  dst.y = floor(src.y)
-  dst.z = floor(src.z)
-  dst.w = floor(src.w)
-
-
-1.1.2  MOV - Move
-
-  dst.x = src.x
-  dst.y = src.y
-  dst.z = src.z
-  dst.w = src.w
-
-
-1.1.3  LIT - Light Coefficients
-
-  dst.x = 1.0
-  dst.y = max(src.x, 0.0)
-  dst.z = (src.x > 0.0) ? pow(max(src.y, 0.0), clamp(src.w, -128.0, 128.0)) : 0.0
-  dst.w = 1.0
-
-
-1.1.4  RCP - Reciprocal
-
-  dst.x = 1.0 / src.x
-  dst.y = 1.0 / src.x
-  dst.z = 1.0 / src.x
-  dst.w = 1.0 / src.x
-
-
-1.1.5  RSQ - Reciprocal Square Root
-
-  dst.x = 1.0 / sqrt(abs(src.x))
-  dst.y = 1.0 / sqrt(abs(src.x))
-  dst.z = 1.0 / sqrt(abs(src.x))
-  dst.w = 1.0 / sqrt(abs(src.x))
-
-
-1.1.6  EXP - Approximate Exponential Base 2
-
-  dst.x = pow(2.0, floor(src.x))
-  dst.y = src.x - floor(src.x)
-  dst.z = pow(2.0, src.x)
-  dst.w = 1.0
-
-
-1.1.7  LOG - Approximate Logarithm Base 2
-
-  dst.x = floor(lg2(abs(src.x)))
-  dst.y = abs(src.x) / pow(2.0, floor(lg2(abs(src.x))))
-  dst.z = lg2(abs(src.x))
-  dst.w = 1.0
-
-
-1.1.8  MUL - Multiply
-
-  dst.x = src0.x * src1.x
-  dst.y = src0.y * src1.y
-  dst.z = src0.z * src1.z
-  dst.w = src0.w * src1.w
-
-
-1.1.9  ADD - Add
-
-  dst.x = src0.x + src1.x
-  dst.y = src0.y + src1.y
-  dst.z = src0.z + src1.z
-  dst.w = src0.w + src1.w
-
-
-1.1.10  DP3 - 3-component Dot Product
-
-  dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
-  dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
-  dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
-  dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
-
-
-1.1.11  DP4 - 4-component Dot Product
-
-  dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
-  dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
-  dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
-  dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
-
-
-1.1.12  DST - Distance Vector
-
-  dst.x = 1.0
-  dst.y = src0.y * src1.y
-  dst.z = src0.z
-  dst.w = src1.w
-
-
-1.1.13  MIN - Minimum
-
-  dst.x = min(src0.x, src1.x)
-  dst.y = min(src0.y, src1.y)
-  dst.z = min(src0.z, src1.z)
-  dst.w = min(src0.w, src1.w)
-
-
-1.1.14  MAX - Maximum
-
-  dst.x = max(src0.x, src1.x)
-  dst.y = max(src0.y, src1.y)
-  dst.z = max(src0.z, src1.z)
-  dst.w = max(src0.w, src1.w)
-
-
-1.1.15  SLT - Set On Less Than
-
-  dst.x = (src0.x < src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y < src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z < src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w < src1.w) ? 1.0 : 0.0
-
-
-1.1.16  SGE - Set On Greater Equal Than
-
-  dst.x = (src0.x >= src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y >= src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z >= src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w >= src1.w) ? 1.0 : 0.0
-
-
-1.1.17  MAD - Multiply And Add
-
-  dst.x = src0.x * src1.x + src2.x
-  dst.y = src0.y * src1.y + src2.y
-  dst.z = src0.z * src1.z + src2.z
-  dst.w = src0.w * src1.w + src2.w
-
-
-1.2  GL_ATI_fragment_shader
----------------------------
-
-
-1.2.1  SUB - Subtract
-
-  dst.x = src0.x - src1.x
-  dst.y = src0.y - src1.y
-  dst.z = src0.z - src1.z
-  dst.w = src0.w - src1.w
-
-
-1.2.2  DOT3 - 3-component Dot Product
-
-  Alias for DP3.
-
-
-1.2.3  DOT4 - 4-component Dot Product
-
-  Alias for DP4.
-
-
-1.2.4  LERP - Linear Interpolate
-
-  dst.x = src0.x * (src1.x - src2.x) + src2.x
-  dst.y = src0.y * (src1.y - src2.y) + src2.y
-  dst.z = src0.z * (src1.z - src2.z) + src2.z
-  dst.w = src0.w * (src1.w - src2.w) + src2.w
-
-
-1.2.5  CND - Condition
-
-  dst.x = (src2.x > 0.5) ? src0.x : src1.x
-  dst.y = (src2.y > 0.5) ? src0.y : src1.y
-  dst.z = (src2.z > 0.5) ? src0.z : src1.z
-  dst.w = (src2.w > 0.5) ? src0.w : src1.w
-
-
-1.2.6  CND0 - Condition Zero
-
-       Removed.  Use (CMP src2, src1, src0) instead.
-
-1.2.7  DOT2ADD - 2-component Dot Product And Add
-
-  dst.x = src0.x * src1.x + src0.y * src1.y + src2.x
-  dst.y = src0.x * src1.x + src0.y * src1.y + src2.x
-  dst.z = src0.x * src1.x + src0.y * src1.y + src2.x
-  dst.w = src0.x * src1.x + src0.y * src1.y + src2.x
-
-
-1.3  GL_EXT_vertex_shader
--------------------------
-
-
-1.3.1  INDEX - Array Lookup
-
-  Considered for removal from language.
-
-
-1.3.2  NEGATE - Negate
-
-  Considered for removal from language.
-
-
-1.3.3  MADD - Multiply And Add
-
-  Alias for MAD.
-
-
-1.3.4  FRAC - Fraction
-
-  dst.x = src.x - floor(src.x)
-  dst.y = src.y - floor(src.y)
-  dst.z = src.z - floor(src.z)
-  dst.w = src.w - floor(src.w)
-
-
-1.3.5  SETGE - Set On Greater Equal
-
-  Alias for SGE.
-
-
-1.3.6  SETLT - Set On Less Than
-
-  Alias for SLT.
-
-
-1.3.7  CLAMP - Clamp
-
-  dst.x = clamp(src0.x, src1.x, src2.x)
-  dst.y = clamp(src0.y, src1.y, src2.y)
-  dst.z = clamp(src0.z, src1.z, src2.z)
-  dst.w = clamp(src0.w, src1.w, src2.w)
-
-
-1.3.8  FLOOR - Floor
-
-  dst.x = floor(src.x)
-  dst.y = floor(src.y)
-  dst.z = floor(src.z)
-  dst.w = floor(src.w)
-
-
-1.3.9  ROUND - Round
-
-  dst.x = round(src.x)
-  dst.y = round(src.y)
-  dst.z = round(src.z)
-  dst.w = round(src.w)
-
-
-1.3.10  EXPBASE2 - Exponential Base 2
-
-  dst.x = pow(2.0, src.x)
-  dst.y = pow(2.0, src.x)
-  dst.z = pow(2.0, src.x)
-  dst.w = pow(2.0, src.x)
-
-
-1.3.11  LOGBASE2 - Logarithm Base 2
-
-  dst.x = lg2(src.x)
-  dst.y = lg2(src.x)
-  dst.z = lg2(src.x)
-  dst.w = lg2(src.x)
-
-
-1.3.12  POWER - Power
-
-  dst.x = pow(src0.x, src1.x)
-  dst.y = pow(src0.x, src1.x)
-  dst.z = pow(src0.x, src1.x)
-  dst.w = pow(src0.x, src1.x)
-
-
-1.3.13  RECIP - Reciprocal
-
-  Alias for RCP.
-
-
-1.3.14  RECIPSQRT - Reciprocal Square Root
-
-  Alias for RSQ.
-
-
-1.3.15  CROSSPRODUCT - Cross Product
-
-  dst.x = src0.y * src1.z - src1.y * src0.z
-  dst.y = src0.z * src1.x - src1.z * src0.x
-  dst.z = src0.x * src1.y - src1.x * src0.y
-  dst.w = 1.0
-
-
-1.3.16  MULTIPLYMATRIX - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.4  GL_NV_vertex_program1_1
-----------------------------
-
-
-1.4.1  ABS - Absolute
-
-  dst.x = abs(src.x)
-  dst.y = abs(src.y)
-  dst.z = abs(src.z)
-  dst.w = abs(src.w)
-
-
-1.4.2  RCC - Reciprocal Clamped
-
-  dst.x = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
-  dst.y = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
-  dst.z = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
-  dst.w = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
-
-
-1.4.3  DPH - Homogeneous Dot Product
-
-  dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
-  dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
-  dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
-  dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
-
-
-1.5  GL_NV_fragment_program
----------------------------
-
-
-1.5.1  COS - Cosine
-
-  dst.x = cos(src.x)
-  dst.y = cos(src.x)
-  dst.z = cos(src.x)
-  dst.w = cos(src.w)
-
-
-1.5.2  DDX - Derivative Relative To X
-
-  dst.x = partialx(src.x)
-  dst.y = partialx(src.y)
-  dst.z = partialx(src.z)
-  dst.w = partialx(src.w)
-
-
-1.5.3  DDY - Derivative Relative To Y
-
-  dst.x = partialy(src.x)
-  dst.y = partialy(src.y)
-  dst.z = partialy(src.z)
-  dst.w = partialy(src.w)
-
-
-1.5.4  EX2 - Exponential Base 2
-
-  Alias for EXPBASE2.
-
-
-1.5.5  FLR - Floor
-
-  Alias for FLOOR.
-
-
-1.5.6  FRC - Fraction
-
-  Alias for FRAC.
-
-
-1.5.7  KILP - Predicated Discard
-
-  discard
-
-
-1.5.8  LG2 - Logarithm Base 2
-
-  Alias for LOGBASE2.
-
-
-1.5.9  LRP - Linear Interpolate
-
-  Alias for LERP.
-
-
-1.5.10  PK2H - Pack Two 16-bit Floats
-
-  TBD
-
-
-1.5.11  PK2US - Pack Two Unsigned 16-bit Scalars
-
-  TBD
-
-
-1.5.12  PK4B - Pack Four Signed 8-bit Scalars
-
-  TBD
-
-
-1.5.13  PK4UB - Pack Four Unsigned 8-bit Scalars
-
-  TBD
-
-
-1.5.14  POW - Power
-
-  Alias for POWER.
-
-
-1.5.15  RFL - Reflection Vector
-
-  dst.x = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.x - src1.x
-  dst.y = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.y - src1.y
-  dst.z = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.z - src1.z
-  dst.w = 1.0
-
-
-1.5.16  SEQ - Set On Equal
-
-  dst.x = (src0.x == src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y == src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z == src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w == src1.w) ? 1.0 : 0.0
-
-
-1.5.17  SFL - Set On False
-
-  dst.x = 0.0
-  dst.y = 0.0
-  dst.z = 0.0
-  dst.w = 0.0
-
-
-1.5.18  SGT - Set On Greater Than
-
-  dst.x = (src0.x > src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y > src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z > src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w > src1.w) ? 1.0 : 0.0
-
-
-1.5.19  SIN - Sine
-
-  dst.x = sin(src.x)
-  dst.y = sin(src.x)
-  dst.z = sin(src.x)
-  dst.w = sin(src.w)
-
-
-1.5.20  SLE - Set On Less Equal Than
-
-  dst.x = (src0.x <= src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y <= src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z <= src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w <= src1.w) ? 1.0 : 0.0
-
-
-1.5.21  SNE - Set On Not Equal
-
-  dst.x = (src0.x != src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y != src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z != src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w != src1.w) ? 1.0 : 0.0
-
-
-1.5.22  STR - Set On True
-
-  dst.x = 1.0
-  dst.y = 1.0
-  dst.z = 1.0
-  dst.w = 1.0
-
-
-1.5.23  TEX - Texture Lookup
-
-  TBD
-
-
-1.5.24  TXD - Texture Lookup with Derivatives
-
-  TBD
-
-
-1.5.25  TXP - Projective Texture Lookup
-
-  TBD
-
-
-1.5.26  UP2H - Unpack Two 16-Bit Floats
-
-  TBD
-
-
-1.5.27  UP2US - Unpack Two Unsigned 16-Bit Scalars
-
-  TBD
-
-
-1.5.28  UP4B - Unpack Four Signed 8-Bit Values
-
-  TBD
-
-
-1.5.29  UP4UB - Unpack Four Unsigned 8-Bit Scalars
-
-  TBD
-
-
-1.5.30  X2D - 2D Coordinate Transformation
-
-  dst.x = src0.x + src1.x * src2.x + src1.y * src2.y
-  dst.y = src0.y + src1.x * src2.z + src1.y * src2.w
-  dst.z = src0.x + src1.x * src2.x + src1.y * src2.y
-  dst.w = src0.y + src1.x * src2.z + src1.y * src2.w
-
-
-1.6  GL_NV_vertex_program2
---------------------------
-
-
-1.6.1  ARA - Address Register Add
-
-  TBD
-
-
-1.6.2  ARR - Address Register Load With Round
-
-  dst.x = round(src.x)
-  dst.y = round(src.y)
-  dst.z = round(src.z)
-  dst.w = round(src.w)
-
-
-1.6.3  BRA - Branch
-
-  pc = target
-
-
-1.6.4  CAL - Subroutine Call
-
-  push(pc)
-  pc = target
-
-
-1.6.5  RET - Subroutine Call Return
-
-  pc = pop()
-
-
-1.6.6  SSG - Set Sign
-
-  dst.x = (src.x > 0.0) ? 1.0 : (src.x < 0.0) ? -1.0 : 0.0
-  dst.y = (src.y > 0.0) ? 1.0 : (src.y < 0.0) ? -1.0 : 0.0
-  dst.z = (src.z > 0.0) ? 1.0 : (src.z < 0.0) ? -1.0 : 0.0
-  dst.w = (src.w > 0.0) ? 1.0 : (src.w < 0.0) ? -1.0 : 0.0
-
-
-1.7  GL_ARB_vertex_program
---------------------------
-
-
-1.7.1  SWZ - Extended Swizzle
-
-  dst.x = src.x
-  dst.y = src.y
-  dst.z = src.z
-  dst.w = src.w
-
-
-1.7.2  XPD - Cross Product
-
-  Alias for CROSSPRODUCT.
-
-
-1.8  GL_ARB_fragment_program
-----------------------------
-
-
-1.8.1  CMP - Compare
-
-  dst.x = (src0.x < 0.0) ? src1.x : src2.x
-  dst.y = (src0.y < 0.0) ? src1.y : src2.y
-  dst.z = (src0.z < 0.0) ? src1.z : src2.z
-  dst.w = (src0.w < 0.0) ? src1.w : src2.w
-
-
-1.8.2  KIL - Conditional Discard
-
-  if (src.x < 0.0 || src.y < 0.0 || src.z < 0.0 || src.w < 0.0)
-    discard
-  endif
-
-
-1.8.3  SCS - Sine Cosine
-
-  dst.x = cos(src.x)
-  dst.y = sin(src.x)
-  dst.z = 0.0
-  dst.y = 1.0
-
-
-1.8.4  TXB - Texture Lookup With Bias
-
-  TBD
-
-
-1.9  GL_NV_fragment_program2
-----------------------------
-
-
-1.9.1  NRM - 3-component Vector Normalise
-
-  dst.x = src.x / (src.x * src.x + src.y * src.y + src.z * src.z)
-  dst.y = src.y / (src.x * src.x + src.y * src.y + src.z * src.z)
-  dst.z = src.z / (src.x * src.x + src.y * src.y + src.z * src.z)
-  dst.w = 1.0
-
-
-1.9.2  DIV - Divide
-
-  dst.x = src0.x / src1.x
-  dst.y = src0.y / src1.y
-  dst.z = src0.z / src1.z
-  dst.w = src0.w / src1.w
-
-
-1.9.3  DP2 - 2-component Dot Product
-
-  dst.x = src0.x * src1.x + src0.y * src1.y
-  dst.y = src0.x * src1.x + src0.y * src1.y
-  dst.z = src0.x * src1.x + src0.y * src1.y
-  dst.w = src0.x * src1.x + src0.y * src1.y
-
-
-1.9.4  DP2A - 2-component Dot Product And Add
-
-  Alias for DOT2ADD.
-
-
-1.9.5  TXL - Texture Lookup With LOD
-
-  TBD
-
-
-1.9.6  BRK - Break
-
-  TBD
-
-
-1.9.7  IF - If
-
-  TBD
-
-
-1.9.10  ELSE - Else
-
-  TBD
-
-
-1.9.11  ENDIF - End If
-
-  TBD
-
-
-1.10  GL_NV_vertex_program3
----------------------------
-
-
-1.10.1  PUSHA - Push Address Register On Stack
-
-  push(src.x)
-  push(src.y)
-  push(src.z)
-  push(src.w)
-
-
-1.10.2  POPA - Pop Address Register From Stack
-
-  dst.w = pop()
-  dst.z = pop()
-  dst.y = pop()
-  dst.x = pop()
-
-
-1.11  GL_NV_gpu_program4
-------------------------
-
-
-1.11.1  CEIL - Ceiling
-
-  dst.x = ceil(src.x)
-  dst.y = ceil(src.y)
-  dst.z = ceil(src.z)
-  dst.w = ceil(src.w)
-
-
-1.11.2  I2F - Integer To Float
-
-  dst.x = (float) src.x
-  dst.y = (float) src.y
-  dst.z = (float) src.z
-  dst.w = (float) src.w
-
-
-1.11.3  NOT - Bitwise Not
-
-  dst.x = ~src.x
-  dst.y = ~src.y
-  dst.z = ~src.z
-  dst.w = ~src.w
-
-
-1.11.4  TRUNC - Truncate
-
-  dst.x = trunc(src.x)
-  dst.y = trunc(src.y)
-  dst.z = trunc(src.z)
-  dst.w = trunc(src.w)
-
-
-1.11.5  SHL - Shift Left
-
-  dst.x = src0.x << src1.x
-  dst.y = src0.y << src1.x
-  dst.z = src0.z << src1.x
-  dst.w = src0.w << src1.x
-
-
-1.11.6  SHR - Shift Right
-
-  dst.x = src0.x >> src1.x
-  dst.y = src0.y >> src1.x
-  dst.z = src0.z >> src1.x
-  dst.w = src0.w >> src1.x
-
-
-1.11.7  AND - Bitwise And
-
-  dst.x = src0.x & src1.x
-  dst.y = src0.y & src1.y
-  dst.z = src0.z & src1.z
-  dst.w = src0.w & src1.w
-
-
-1.11.8  OR - Bitwise Or
-
-  dst.x = src0.x | src1.x
-  dst.y = src0.y | src1.y
-  dst.z = src0.z | src1.z
-  dst.w = src0.w | src1.w
-
-
-1.11.9  MOD - Modulus
-
-  dst.x = src0.x % src1.x
-  dst.y = src0.y % src1.y
-  dst.z = src0.z % src1.z
-  dst.w = src0.w % src1.w
-
-
-1.11.10  XOR - Bitwise Xor
-
-  dst.x = src0.x ^ src1.x
-  dst.y = src0.y ^ src1.y
-  dst.z = src0.z ^ src1.z
-  dst.w = src0.w ^ src1.w
-
-
-1.11.11  SAD - Sum Of Absolute Differences
-
-  dst.x = abs(src0.x - src1.x) + src2.x
-  dst.y = abs(src0.y - src1.y) + src2.y
-  dst.z = abs(src0.z - src1.z) + src2.z
-  dst.w = abs(src0.w - src1.w) + src2.w
-
-
-1.11.12  TXF - Texel Fetch
-
-  TBD
-
-
-1.11.13  TXQ - Texture Size Query
-
-  TBD
-
-
-1.11.14  CONT - Continue
-
-  TBD
-
-
-1.12  GL_NV_geometry_program4
------------------------------
-
-
-1.12.1  EMIT - Emit
-
-  TBD
-
-
-1.12.2  ENDPRIM - End Primitive
-
-  TBD
-
-
-1.13  GLSL
-----------
-
-
-1.13.1  BGNLOOP - Begin a Loop
-
-  TBD
-
-
-1.13.2  BGNSUB - Begin Subroutine
-
-  TBD
-
-
-1.13.3  ENDLOOP - End a Loop
-
-  TBD
-
-
-1.13.4  ENDSUB - End Subroutine
-
-  TBD
-
-
-1.13.5  INT - Truncate
-
-  Alias for TRUNC.
-
-
-1.13.6  NOISE1 - 1D Noise
-
-  TBD
-
-
-1.13.7  NOISE2 - 2D Noise
-
-  TBD
-
-
-1.13.8  NOISE3 - 3D Noise
-
-  TBD
-
-
-1.13.9  NOISE4 - 4D Noise
-
-  TBD
-
-
-1.13.10  NOP - No Operation
-
-  Do nothing.
-
-
-1.14  ps_1_1
-------------
-
-
-1.14.1  TEXKILL - Conditional Discard
-
-  Alias for KIL.
-
-
-1.15  ps_1_4
-------------
-
-
-1.15.1  TEXLD - Texture Lookup
-
-  Alias for TEX.
-
-
-1.16  ps_2_0
-------------
-
-
-1.16.1  M4X4 - Multiply Matrix
-
-  Alias for MULTIPLYMATRIX.
-
-
-1.16.2  M4X3 - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.16.3  M3X4 - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.16.4  M3X3 - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.16.5  M3X2 - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.16.6  CRS - Cross Product
-
-  Alias for XPD.
-
-
-1.16.7  NRM4 - 4-component Vector Normalise
-
-  dst.x = src.x / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
-  dst.y = src.y / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
-  dst.z = src.z / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
-  dst.w = src.w / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
-
-
-1.16.8  SINCOS - Sine Cosine
-
-  Alias for SCS.
-
-
-1.16.9  TEXLDB - Texture Lookup With Bias
-
-  Alias for TXB.
-
-
-1.16.10  DP2ADD - 2-component Dot Product And Add
-
-  Alias for DP2A.
-
-
-1.17  ps_2_x
-------------
-
-
-1.17.1  CALL - Subroutine Call
-
-  Alias for CAL.
-
-
-1.17.2  CALLNZ - Subroutine Call If Not Zero
-
-  TBD
-
-
-1.17.3  IFC - If
-
-  TBD
-
-
-1.17.4  BREAK - Break
-
-  Alias for BRK.
-
-
-1.17.5  BREAKC - Break Conditional
-
-  TBD
-
-
-1.17.6  DSX - Derivative Relative To X
-
-  Alias for DDX.
-
-
-1.17.7  DSY - Derivative Relative To Y
-
-  Alias for DDY.
-
-
-1.17.8  TEXLDD - Texture Lookup with Derivatives
-
-  Alias for TXD.
-
-
-1.18  vs_1_1
-------------
-
-
-1.18.1  EXPP - Approximate Exponential Base 2
-
-  Use EXP. See also 1.19.3.
-
-
-1.18.2  LOGP - Logarithm Base 2
-
-  Use LOG. See also 1.19.4.
-
-
-1.19  vs_2_0
-------------
-
-
-1.19.1  SGN - Set Sign
-
-  Alias for SSG.
-
-
-1.19.2  MOVA - Move Address Register
-
-  Alias for ARR.
-
-
-1.19.3  EXPP - Approximate Exponential Base 2
-
-  Use EX2.
-
-
-1.19.4  LOGP - Logarithm Base 2
-
-  Use LG2.
-
-
-2  Explanation of symbols used
-==============================
-
-
-2.1  Functions
---------------
-
-
-  abs(x)            Absolute value of x.
-                    |x|
-                    (x < 0.0) ? -x : x
-
-  ceil(x)           Ceiling of x.
-
-  clamp(x,y,z)      Clamp x between y and z.
-                    (x < y) ? y : (x > z) ? z : x
-
-  cos(x)            Cosine of x.
-
-  floor(x)          Floor of x.
-
-  lg2(x)            Logarithm base 2 of x.
-
-  max(x,y)          Maximum of x and y.
-                    (x > y) ? x : y
-
-  min(x,y)          Minimum of x and y.
-                    (x < y) ? x : y
-
-  partialx(x)       Derivative of x relative to fragment's X.
-
-  partialy(x)       Derivative of x relative to fragment's Y.
-
-  pop()             Pop from stack.
-
-  pow(x,y)          Raise x to power of y.
-
-  push(x)           Push x on stack.
-
-  round(x)          Round x.
-
-  sin(x)            Sine of x.
-
-  sqrt(x)           Square root of x.
-
-  trunc(x)          Truncate x.
-
-
-2.2  Keywords
--------------
-
-
-  discard           Discard fragment.
-
-  dst               First destination register.
-
-  dst0              First destination register.
-
-  pc                Program counter.
-
-  src               First source register.
-
-  src0              First source register.
-
-  src1              Second source register.
-
-  src2              Third source register.
-
-  target            Label of target instruction.
-
-
-3  Other tokens
-===============
-
-
-3.1  Declaration Semantic
--------------------------
-
-
-  Follows Declaration token if Semantic bit is set.
-
-  Since its purpose is to link a shader with other stages of the pipeline,
-  it is valid to follow only those Declaration tokens that declare a register
-  either in INPUT or OUTPUT file.
-
-  SemanticName field contains the semantic name of the register being declared.
-  There is no default value.
-
-  SemanticIndex is an optional subscript that can be used to distinguish
-  different register declarations with the same semantic name. The default value
-  is 0.
-
-  The meanings of the individual semantic names are explained in the following
-  sections.
-
-
-3.1.1  FACE
-
-  Valid only in a fragment shader INPUT declaration.
-
-  FACE.x is negative when the primitive is back facing. FACE.x is positive
-  when the primitive is front facing.
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index e0c5d3d3d61..ced9c94f468 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -36,6 +36,7 @@
 
 #include "util/u_math.h"
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_scan.h"
 
 
@@ -84,25 +85,28 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
          {
             const struct tgsi_full_instruction *fullinst
                = &parse.FullToken.FullInstruction;
+            uint i;
 
             assert(fullinst->Instruction.Opcode < TGSI_OPCODE_LAST);
             info->opcode_count[fullinst->Instruction.Opcode]++;
 
-            /* check if we read the frag shader FOG or FACE inputs */
-            if (procType == TGSI_PROCESSOR_FRAGMENT) {
-               uint i;
-               for (i = 0; i < fullinst->Instruction.NumSrcRegs; i++) {
-                  const struct tgsi_full_src_register *src =
-                     &fullinst->Src[i];
-                  if (src->Register.File == TGSI_FILE_INPUT ||
-                      src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
-                     const int ind = src->Register.Index;
-                     if (info->input_semantic_name[ind] == TGSI_SEMANTIC_FOG) {
-                        info->uses_fogcoord = TRUE;
-                     }
-                     else if (info->input_semantic_name[ind] == TGSI_SEMANTIC_FACE) {
-                        info->uses_frontfacing = TRUE;
+            for (i = 0; i < fullinst->Instruction.NumSrcRegs; i++) {
+               const struct tgsi_full_src_register *src =
+                  &fullinst->Src[i];
+               int ind = src->Register.Index;
+
+               /* Mark which inputs are effectively used */
+               if (src->Register.File == TGSI_FILE_INPUT) {
+                  unsigned usage_mask;
+                  usage_mask = tgsi_util_get_inst_usage_mask(fullinst, i);
+                  if (src->Register.Indirect) {
+                     for (ind = 0; ind < info->num_inputs; ++ind) {
+                        info->input_usage_mask[ind] |= usage_mask;
                      }
+                  } else {
+                     assert(ind >= 0);
+                     assert(ind < PIPE_MAX_SHADER_INPUTS);
+                     info->input_usage_mask[ind] |= usage_mask;
                   }
                }
             }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 27de33f7990..e75280336f0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -45,6 +45,7 @@ struct tgsi_shader_info
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
    ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
    ubyte input_cylindrical_wrap[PIPE_MAX_SHADER_INPUTS];
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
@@ -61,8 +62,6 @@ struct tgsi_shader_info
    boolean writes_z;  /**< does fragment shader write Z value? */
    boolean writes_edgeflag; /**< vertex shader outputs edgeflag */
    boolean uses_kill;  /**< KIL or KILP instruction used? */
-   boolean uses_fogcoord; /**< fragment shader uses fog coord? */
-   boolean uses_frontfacing; /**< fragment shader uses front/back-face flag? */
 
    struct {
       unsigned name;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index d5061f8b511..785a9fb0356 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -1244,16 +1244,9 @@ emit_sub(
       make_xmm( xmm_src ) );
 }
 
-
-
-
-
-
-
 /**
  * Register fetch.
  */
-
 static void
 emit_fetch(
    struct x86_function *func,
@@ -1338,7 +1331,6 @@ emit_fetch(
 /**
  * Register store.
  */
-
 static void
 emit_store(
    struct x86_function *func,
@@ -1455,7 +1447,6 @@ fetch_texel( struct tgsi_sampler **sampler,
 /**
  * High-level instruction translators.
  */
-
 static void
 emit_tex( struct x86_function *func,
           const struct tgsi_full_instruction *inst,
@@ -1507,7 +1498,6 @@ emit_tex( struct x86_function *func,
                get_temp( TEMP_R0, 3 ),
                make_xmm( 3 ) );
 
-   
    if (projected) {
       FETCH( func, *inst, 3, 0, 3 );
 
@@ -1535,7 +1525,6 @@ emit_tex( struct x86_function *func,
    args[0] = get_temp( TEMP_R0, 0 );
    args[1] = get_sampler_ptr( unit );
 
-
    emit_func_call( func,
                    0,
                    args,
@@ -1569,7 +1558,8 @@ emit_kil(
 
    /* This mask stores component bits that were already tested. Note that
     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
+    * tested.
+    */
    uniquemask = 0;
 
    FOR_EACH_CHANNEL( chan_index ) {
@@ -1715,22 +1705,26 @@ emit_cmp(
 
 
 /**
- * Check if inst src/dest regs use indirect addressing into temporary
- * register file.
+ * Check if inst src/dest regs use indirect addressing into temporary,
+ * input or output register files.
  */
 static boolean
-indirect_temp_reference(const struct tgsi_full_instruction *inst)
+indirect_reg_reference(const struct tgsi_full_instruction *inst)
 {
    uint i;
    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
       const struct tgsi_full_src_register *reg = &inst->Src[i];
-      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
+      if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
+           reg->Register.File == TGSI_FILE_INPUT ||
+           reg->Register.File == TGSI_FILE_OUTPUT) &&
           reg->Register.Indirect)
          return TRUE;
    }
    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
       const struct tgsi_full_dst_register *reg = &inst->Dst[i];
-      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
+      if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
+           reg->Register.File == TGSI_FILE_INPUT ||
+           reg->Register.File == TGSI_FILE_OUTPUT) &&
           reg->Register.Indirect)
          return TRUE;
    }
@@ -1746,7 +1740,7 @@ emit_instruction(
    unsigned chan_index;
 
    /* we can't handle indirect addressing into temp register file yet */
-   if (indirect_temp_reference(inst))
+   if (indirect_reg_reference(inst))
       return FALSE;
 
    switch (inst->Instruction.Opcode) {
@@ -2931,7 +2925,6 @@ tgsi_emit_sse2(
       x86_make_disp( get_machine_base(),
                      Offset( struct tgsi_exec_machine, Samplers ) ) );
 
-
    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
       tgsi_parse_token( &parse );
 
@@ -3015,4 +3008,3 @@ tgsi_emit_sse2(
 }
 
 #endif /* PIPE_ARCH_X86 */
-
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 0a7e4105a80..3ec54964169 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -163,3 +163,149 @@ tgsi_util_set_full_src_register_sign_mode(
       assert( 0 );
    }
 }
+
+/**
+ * Determine which channels of the specified src register are effectively
+ * used by this instruction.
+ */
+unsigned
+tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst,
+                              unsigned src_idx)
+{
+   const struct tgsi_full_src_register *src = &inst->Src[src_idx];
+   unsigned write_mask = inst->Dst[0].Register.WriteMask;
+   unsigned read_mask;
+   unsigned usage_mask;
+   unsigned chan;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ARL:
+   case TGSI_OPCODE_ARR:
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_DIV:
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGE:
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_LRP:
+   case TGSI_OPCODE_CND:
+   case TGSI_OPCODE_FRC:
+   case TGSI_OPCODE_CEIL:
+   case TGSI_OPCODE_CLAMP:
+   case TGSI_OPCODE_FLR:
+   case TGSI_OPCODE_ROUND:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_SIN:
+   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDY:
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SSG:
+   case TGSI_OPCODE_CMP:
+   case TGSI_OPCODE_TRUNC:
+   case TGSI_OPCODE_NOT:
+   case TGSI_OPCODE_AND:
+   case TGSI_OPCODE_OR:
+   case TGSI_OPCODE_XOR:
+   case TGSI_OPCODE_SAD:
+      /* Channel-wise operations */
+      read_mask = write_mask;
+      break;
+
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_RCC:
+      read_mask = TGSI_WRITEMASK_X;
+      break;
+
+   case TGSI_OPCODE_SCS:
+      read_mask = write_mask & TGSI_WRITEMASK_XY ? TGSI_WRITEMASK_X : 0;
+      break;
+
+   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_LOG:
+      read_mask = write_mask & TGSI_WRITEMASK_XYZ ? TGSI_WRITEMASK_X : 0;
+      break;
+
+   case TGSI_OPCODE_DP2A:
+      read_mask = src_idx == 2 ? TGSI_WRITEMASK_X : TGSI_WRITEMASK_XY;
+      break;
+
+   case TGSI_OPCODE_DP2:
+      read_mask = TGSI_WRITEMASK_XY;
+      break;
+
+   case TGSI_OPCODE_DP3:
+      read_mask = TGSI_WRITEMASK_XYZ;
+      break;
+
+   case TGSI_OPCODE_DP4:
+      read_mask = TGSI_WRITEMASK_XYZW;
+      break;
+
+   case TGSI_OPCODE_DPH:
+      read_mask = src_idx == 0 ? TGSI_WRITEMASK_XYZ : TGSI_WRITEMASK_XYZW;
+      break;
+
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXD:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXP:
+      if (src_idx == 0) {
+         switch (inst->Texture.Texture) {
+         case TGSI_TEXTURE_1D:
+         case TGSI_TEXTURE_SHADOW1D:
+            read_mask = TGSI_WRITEMASK_X;
+            break;
+
+         case TGSI_TEXTURE_2D:
+         case TGSI_TEXTURE_RECT:
+         case TGSI_TEXTURE_SHADOW2D:
+         case TGSI_TEXTURE_SHADOWRECT:
+            read_mask = TGSI_WRITEMASK_XY;
+            break;
+
+         case TGSI_TEXTURE_3D:
+         case TGSI_TEXTURE_CUBE:
+            read_mask = TGSI_WRITEMASK_XYZ;
+            break;
+
+         default:
+            assert(0);
+            read_mask = 0;
+         }
+
+         if (inst->Instruction.Opcode != TGSI_OPCODE_TEX) {
+            read_mask |= TGSI_WRITEMASK_W;
+         }
+      } else {
+         /* A safe approximation */
+         read_mask = TGSI_WRITEMASK_XYZW;
+      }
+      break;
+
+   default:
+      /* Assume all channels are read */
+      read_mask = TGSI_WRITEMASK_XYZW;
+      break;
+   }
+
+   usage_mask = 0;
+   for (chan = 0; chan < 4; ++chan) {
+      if (read_mask & (1 << chan)) {
+         usage_mask |= 1 << tgsi_util_get_full_src_register_swizzle(src, chan);
+      }
+   }
+
+   return usage_mask;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h
index 19ee2e7cf2a..04702ba9826 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -34,6 +34,7 @@ extern "C" {
 
 struct tgsi_src_register;
 struct tgsi_full_src_register;
+struct tgsi_full_instruction;
 
 void *
 tgsi_align_128bit(
@@ -71,6 +72,10 @@ tgsi_util_set_full_src_register_sign_mode(
    struct tgsi_full_src_register *reg,
    unsigned sign_mode );
 
+unsigned
+tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst,
+                              unsigned src_idx);
+
 #if defined __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index d4fbd658f40..97fa99ec65d 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -42,12 +42,10 @@
 
 #include "util/u_blit.h"
 #include "util/u_draw_quad.h"
-#include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_sampler.h"
 #include "util/u_simple_shaders.h"
-#include "util/u_surface.h"
 
 #include "cso_cache/cso_context.h"
 
@@ -136,7 +134,8 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
 
    /* fragment shader */
    ctx->fs[TGSI_WRITEMASK_XYZW] =
-      util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D);
+      util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D,
+                                    TGSI_INTERPOLATE_LINEAR);
    ctx->vbuf = NULL;
 
    /* init vertex data that doesn't change */
@@ -476,6 +475,7 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    if (ctx->fs[writemask] == NULL)
       ctx->fs[writemask] =
          util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D,
+                                                 TGSI_INTERPOLATE_LINEAR,
                                                  writemask);
 
    /* shaders */
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 9ae7b38e6e9..183ffe5670f 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -522,6 +522,26 @@ void *blitter_get_fs_col(struct blitter_context_priv *ctx, unsigned num_cbufs)
    return ctx->fs_col[num_cbufs];
 }
 
+/** Convert PIPE_TEXTURE_x to TGSI_TEXTURE_x */
+static unsigned
+pipe_tex_to_tgsi_tex(unsigned pipe_tex_target)
+{
+   switch (pipe_tex_target) {
+   case PIPE_TEXTURE_1D:
+      return TGSI_TEXTURE_1D;
+   case PIPE_TEXTURE_2D:
+      return TGSI_TEXTURE_2D;
+   case PIPE_TEXTURE_3D:
+      return TGSI_TEXTURE_3D;
+   case PIPE_TEXTURE_CUBE:
+      return TGSI_TEXTURE_CUBE;
+   default:
+      assert(0 && "unexpected texture target");
+      return TGSI_TEXTURE_UNKNOWN;
+   }
+}
+
+
 static INLINE
 void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
                                   unsigned tex_target)
@@ -532,25 +552,10 @@ void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
 
    /* Create the fragment shader on-demand. */
    if (!ctx->fs_texfetch_col[tex_target]) {
-      switch (tex_target) {
-         case PIPE_TEXTURE_2D:
-            ctx->fs_texfetch_col[PIPE_TEXTURE_2D] =
-               util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D);
-            break;
-         case PIPE_TEXTURE_3D:
-            ctx->fs_texfetch_col[PIPE_TEXTURE_3D] =
-               util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_3D);
-            break;
-         case PIPE_TEXTURE_CUBE:
-            ctx->fs_texfetch_col[PIPE_TEXTURE_CUBE] =
-               util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE);
-            break;
-         case PIPE_TEXTURE_1D:
-         default:
-            ctx->fs_texfetch_col[PIPE_TEXTURE_1D] =
-               util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_1D);
-            tex_target = PIPE_TEXTURE_1D; /* for the default case */
-      }
+      unsigned tgsi_tex = pipe_tex_to_tgsi_tex(tex_target);
+
+      ctx->fs_texfetch_col[tex_target] =
+        util_make_fragment_tex_shader(pipe, tgsi_tex, TGSI_INTERPOLATE_LINEAR);
    }
 
    return ctx->fs_texfetch_col[tex_target];
@@ -566,25 +571,11 @@ void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
 
    /* Create the fragment shader on-demand. */
    if (!ctx->fs_texfetch_depth[tex_target]) {
-      switch (tex_target) {
-         case PIPE_TEXTURE_2D:
-            ctx->fs_texfetch_depth[PIPE_TEXTURE_2D] =
-               util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_2D);
-            break;
-         case PIPE_TEXTURE_3D:
-            ctx->fs_texfetch_depth[PIPE_TEXTURE_3D] =
-               util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_3D);
-            break;
-         case PIPE_TEXTURE_CUBE:
-            ctx->fs_texfetch_depth[PIPE_TEXTURE_CUBE] =
-               util_make_fragment_tex_shader_writedepth(pipe,TGSI_TEXTURE_CUBE);
-            break;
-         case PIPE_TEXTURE_1D:
-         default:
-            ctx->fs_texfetch_depth[PIPE_TEXTURE_1D] =
-               util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_1D);
-            tex_target = PIPE_TEXTURE_1D; /* for the default case */
-      }
+      unsigned tgsi_tex = pipe_tex_to_tgsi_tex(tex_target);
+
+      ctx->fs_texfetch_depth[tex_target] =
+         util_make_fragment_tex_shader_writedepth(pipe, tgsi_tex,
+                                                  TGSI_INTERPOLATE_LINEAR);
    }
 
    return ctx->fs_texfetch_depth[tex_target];
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 86db2c2e4b4..954f5706ef3 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -42,6 +42,7 @@
 #include "util/u_tile.h" 
 #include "util/u_prim.h" 
 
+#include <limits.h> /* CHAR_BIT */
 
 void _debug_vprintf(const char *format, va_list ap)
 {
@@ -181,16 +182,21 @@ debug_get_flags_option(const char *name,
 {
    unsigned long result;
    const char *str;
+   const struct debug_named_value *orig = flags;
+   int namealign = 0;
    
    str = os_get_option(name);
    if(!str)
       result = dfault;
    else if (!util_strcmp(str, "help")) {
       result = dfault;
-      while (flags->name) {
-         debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value);
-         flags++;
-      }
+      debug_printf("%s: help for %s:\n", __FUNCTION__, name);
+      for (; flags->name; ++flags)
+         namealign = MAX2(namealign, (int)strlen(flags->name));
+      for (flags = orig; flags->name; ++flags) /* '*' widths must be int */
+         debug_printf("| %*s [0x%0*lx]%s%s\n", namealign, flags->name,
+                      (int)(sizeof(unsigned long)*CHAR_BIT/4), flags->value,
+                      flags->desc ? " " : "", flags->desc ? flags->desc : "");
    }
    else {
       result = 0;
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index e8ff2773e69..1c9624ea3ed 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -230,6 +230,7 @@ struct debug_named_value
 {
    const char *name;
    unsigned long value;
+   const char *desc;
 };
 
 
@@ -252,8 +253,9 @@ struct debug_named_value
  *    ...
  * @endcode
  */
-#define DEBUG_NAMED_VALUE(__symbol) {#__symbol, (unsigned long)__symbol} 
-#define DEBUG_NAMED_VALUE_END {NULL, 0} 
+#define DEBUG_NAMED_VALUE(__symbol) DEBUG_NAMED_VALUE_WITH_DESCRIPTION(__symbol, NULL)
+#define DEBUG_NAMED_VALUE_WITH_DESCRIPTION(__symbol, __desc) {#__symbol, (unsigned long)__symbol, __desc}
+#define DEBUG_NAMED_VALUE_END {NULL, 0, NULL}
 
 
 /**
diff --git a/src/gallium/auxiliary/util/u_dl.h b/src/gallium/auxiliary/util/u_dl.h
index 2853b447c61..80a00ed6796 100644
--- a/src/gallium/auxiliary/util/u_dl.h
+++ b/src/gallium/auxiliary/util/u_dl.h
@@ -35,10 +35,13 @@
 
 #if defined(PIPE_OS_WINDOWS)
 #  define UTIL_DL_EXT ".dll"
+#  define UTIL_DL_PREFIX ""
 #elif defined(PIPE_OS_APPLE)
 #  define UTIL_DL_EXT ".dylib"
+#  define UTIL_DL_PREFIX "lib"
 #else
 #  define UTIL_DL_EXT ".so"
+#  define UTIL_DL_PREFIX "lib"
 #endif
 
 
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index d19267be72f..b7fe2d3003a 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -1327,8 +1327,10 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    }
 
    /* fragment shader */
-   ctx->fs2d = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D);
-   ctx->fsCube = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE);
+   ctx->fs2d = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D,
+                                             TGSI_INTERPOLATE_LINEAR);
+   ctx->fsCube = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE,
+                                               TGSI_INTERPOLATE_LINEAR);
 
    /* vertex data that doesn't change */
    for (i = 0; i < 4; i++) {
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index d1ec13def30..6370e779865 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -168,6 +168,9 @@ static INLINE float logf( float f )
 #undef logf
 #define logf(x) ((float)log((double)(x)))
 #endif /* logf */
+
+#define isfinite(x) _finite((double)(x))
+#define isnan(x) _isnan((double)(x))
 #endif
 
 static INLINE double log2( double x )
@@ -335,6 +338,15 @@ util_iround(float f)
 }
 
 
+/**
+ * Approximate floating point comparison
+ */
+static INLINE boolean
+util_is_approx(float a, float b, float tol)
+{
+   return fabs(b - a) <= tol;
+}
+
 
 /**
  * Test if x is NaN or +/- infinity.
diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h
index e1af9f11cb9..ae6f43bff87 100644
--- a/src/gallium/auxiliary/util/u_pointer.h
+++ b/src/gallium/auxiliary/util/u_pointer.h
@@ -98,6 +98,18 @@ align16( void *unaligned )
    return align_pointer( unaligned, 16 );
 }
 
+typedef void (*func_pointer)(void);
+
+static INLINE func_pointer
+pointer_to_func( void *p )
+{
+   union {
+      void *p;
+      func_pointer f;
+   } pf;
+   pf.p = p;
+   return pf.f;
+}
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 019dda767d0..5b682f496cb 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -87,10 +87,15 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
  *  MOV OUT[0], IMM[0]                    // (if writemask != 0xf)
  *  TEX OUT[0].writemask, IN[0], SAMP[0], 2D;
  *  END;
+ *
+ * \param tex_target  one of TGSI_TEXTURE_x
+ * \param interp_mode  either TGSI_INTERPOLATE_LINEAR or PERSPECTIVE
+ * \param writemask  mask of TGSI_WRITEMASK_x
  */
 void *
 util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
                                         unsigned tex_target,
+                                        unsigned interp_mode,
                                         unsigned writemask )
 {
    struct ureg_program *ureg;
@@ -98,6 +103,9 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
    struct ureg_src tex;
    struct ureg_dst out;
 
+   assert(interp_mode == TGSI_INTERPOLATE_LINEAR ||
+          interp_mode == TGSI_INTERPOLATE_PERSPECTIVE);
+
    ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
    if (ureg == NULL)
       return NULL;
@@ -106,7 +114,7 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
 
    tex = ureg_DECL_fs_input( ureg, 
                              TGSI_SEMANTIC_GENERIC, 0, 
-                             TGSI_INTERPOLATE_PERSPECTIVE );
+                             interp_mode );
 
    out = ureg_DECL_output( ureg, 
                            TGSI_SEMANTIC_COLOR,
@@ -133,10 +141,12 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
  * \param tex_target  one of PIPE_TEXTURE_x
  */
 void *
-util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target )
+util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target,
+                              unsigned interp_mode)
 {
    return util_make_fragment_tex_shader_writemask( pipe,
                                                    tex_target,
+                                                   interp_mode,
                                                    TGSI_WRITEMASK_XYZW );
 }
 
@@ -147,7 +157,8 @@ util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target )
  */
 void *
 util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
-                                         unsigned tex_target)
+                                         unsigned tex_target,
+                                         unsigned interp_mode)
 {
    struct ureg_program *ureg;
    struct ureg_src sampler;
@@ -163,7 +174,7 @@ util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
 
    tex = ureg_DECL_fs_input( ureg,
                              TGSI_SEMANTIC_GENERIC, 0,
-                             TGSI_INTERPOLATE_PERSPECTIVE );
+                             interp_mode );
 
    out = ureg_DECL_output( ureg,
                            TGSI_SEMANTIC_COLOR,
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.h b/src/gallium/auxiliary/util/u_simple_shaders.h
index 6e760942e25..4aa34bc4757 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.h
+++ b/src/gallium/auxiliary/util/u_simple_shaders.h
@@ -52,15 +52,18 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
 extern void *
 util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, 
                                         unsigned tex_target,
+                                        unsigned interp_mode,
                                         unsigned writemask);
 
 extern void *
-util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target);
+util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target,
+                              unsigned interp_mode);
 
 
 extern void *
 util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
-                                         unsigned tex_target);
+                                         unsigned tex_target,
+                                         unsigned interp_mode);
 
 
 extern void *