Merge branch 'master' into pipe-video

Conflicts:
	configs/linux-dri
	configure.ac
	src/gallium/drivers/nvfx/Makefile
	src/gallium/include/pipe/p_defines.h
	src/gallium/include/pipe/p_screen.h
	src/gallium/include/state_tracker/dri1_api.h
	src/gallium/include/state_tracker/drm_api.h
	src/gallium/tests/python/samples/tri.py
	src/gallium/tests/trivial/Makefile
	src/gallium/tests/unit/Makefile
	src/gallium/tests/unit/SConscript
	src/gallium/tests/unit/u_format_test.c
	src/gallium/winsys/nouveau/drm/nouveau_drm_api.c
author: Thomas Balling Sørensen <tball@tball-laptop.(none)> 2010-10-05 12:04:08 +0200
committer: Thomas Balling Sørensen <tball@tball-laptop.(none)> 2010-10-05 12:04:08 +0200
commit: 1218430e1200a08cd64b6555d3fd1fd0274ad9e5 (patch)
tree: e060fb27b8388a4bd237ca39fc20f1675c5e367c /src/gallium/auxiliary
parent: 63b1525cf0a50e3d31328c3b56355a86056e4c05 (diff)
parent: bf21b7006c63c3dc47045c22d4f372dfe6c7ce67 (diff)
244 files changed, 20799 insertions, 8881 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 16bb50fe168..05096b12a86 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -4,10 +4,11 @@ include $(TOP)/configs/current
 LIBNAME = gallium
 
 C_SOURCES = \
-	cso_cache/cso_context.c \
 	cso_cache/cso_cache.c \
+	cso_cache/cso_context.c \
 	cso_cache/cso_hash.c \
 	draw/draw_context.c \
+	draw/draw_fs.c \
 	draw/draw_gs.c \
 	draw/draw_pipe.c \
 	draw/draw_pipe_aaline.c \
@@ -26,32 +27,32 @@ C_SOURCES = \
 	draw/draw_pipe_wide_line.c \
 	draw/draw_pipe_wide_point.c \
 	draw/draw_pt.c \
-	draw/draw_pt_elts.c \
 	draw/draw_pt_emit.c \
 	draw/draw_pt_fetch.c \
 	draw/draw_pt_fetch_emit.c \
 	draw/draw_pt_fetch_shade_emit.c \
 	draw/draw_pt_fetch_shade_pipeline.c \
 	draw/draw_pt_post_vs.c \
+	draw/draw_pt_so_emit.c \
 	draw/draw_pt_util.c \
-	draw/draw_pt_varray.c \
-	draw/draw_pt_vcache.c \
+	draw/draw_pt_vsplit.c \
 	draw/draw_vertex.c \
 	draw/draw_vs.c \
-	draw/draw_vs_varient.c \
 	draw/draw_vs_aos.c \
 	draw/draw_vs_aos_io.c \
 	draw/draw_vs_aos_machine.c \
 	draw/draw_vs_exec.c \
 	draw/draw_vs_ppc.c \
 	draw/draw_vs_sse.c \
+	draw/draw_vs_varient.c \
 	indices/u_indices_gen.c \
 	indices/u_unfilled_gen.c \
 	os/os_misc.c \
+	os/os_stream.c \
 	os/os_stream_log.c \
+	os/os_stream_null.c \
 	os/os_stream_stdc.c \
 	os/os_stream_str.c \
-	os/os_stream_null.c \
 	os/os_time.c \
 	pipebuffer/pb_buffer_fenced.c \
 	pipebuffer/pb_buffer_malloc.c \
@@ -64,17 +65,16 @@ C_SOURCES = \
 	pipebuffer/pb_bufmgr_slab.c \
 	pipebuffer/pb_validate.c \
 	rbug/rbug_connection.c \
+	rbug/rbug_context.c \
 	rbug/rbug_core.c \
+	rbug/rbug_demarshal.c \
 	rbug/rbug_texture.c \
-	rbug/rbug_context.c \
 	rbug/rbug_shader.c \
-	rbug/rbug_demarshal.c \
 	rtasm/rtasm_cpu.c \
 	rtasm/rtasm_execmem.c \
-	rtasm/rtasm_x86sse.c \
 	rtasm/rtasm_ppc.c \
 	rtasm/rtasm_ppc_spe.c \
-	tgsi/tgsi_sanity.c \
+	rtasm/rtasm_x86sse.c \
 	tgsi/tgsi_build.c \
 	tgsi/tgsi_dump.c \
 	tgsi/tgsi_exec.c \
@@ -82,25 +82,29 @@ C_SOURCES = \
 	tgsi/tgsi_iterate.c \
 	tgsi/tgsi_parse.c \
 	tgsi/tgsi_ppc.c \
+	tgsi/tgsi_sanity.c \
 	tgsi/tgsi_scan.c \
 	tgsi/tgsi_sse2.c \
 	tgsi/tgsi_text.c \
 	tgsi/tgsi_transform.c \
 	tgsi/tgsi_ureg.c \
 	tgsi/tgsi_util.c \
-	translate/translate_generic.c \
-	translate/translate_sse.c \
 	translate/translate.c \
 	translate/translate_cache.c \
+	translate/translate_generic.c \
+	translate/translate_sse.c \
 	util/u_debug.c \
-	util/u_debug_symbol.c \
+	util/u_debug_describe.c \
+	util/u_debug_refcnt.c \
 	util/u_debug_stack.c \
+	util/u_debug_symbol.c \
 	util/u_dump_defines.c \
 	util/u_dump_state.c \
 	util/u_bitmask.c \
 	util/u_blit.c \
 	util/u_blitter.c \
 	util/u_cache.c \
+	util/u_caps.c \
 	util/u_cpu_detect.c \
 	util/u_dl.c \
 	util/u_draw_quad.c \
@@ -112,21 +116,26 @@ C_SOURCES = \
 	util/u_format_tests.c \
 	util/u_format_yuv.c \
 	util/u_format_zs.c \
+	util/u_framebuffer.c \
 	util/u_gen_mipmap.c \
 	util/u_half.c \
 	util/u_handle_table.c \
-	util/u_hash_table.c \
 	util/u_hash.c \
+	util/u_hash_table.c \
+	util/u_index_modify.c \
 	util/u_keymap.c \
 	util/u_linear.c \
+	util/u_linkage.c \
 	util/u_network.c \
 	util/u_math.c \
+	util/u_mempool.c \
 	util/u_mm.c \
 	util/u_rect.c \
 	util/u_ringbuffer.c \
 	util/u_sampler.c \
 	util/u_simple_shaders.c \
 	util/u_snprintf.c \
+	util/u_staging.c \
 	util/u_surface.c \
 	util/u_surfaces.c \
 	util/u_texture.c \
@@ -142,28 +151,38 @@ C_SOURCES = \
 
 GALLIVM_SOURCES = \
         gallivm/lp_bld_arit.c \
+        gallivm/lp_bld_assert.c \
+        gallivm/lp_bld_bitarit.c \
         gallivm/lp_bld_const.c \
         gallivm/lp_bld_conv.c \
         gallivm/lp_bld_debug.c \
         gallivm/lp_bld_flow.c \
         gallivm/lp_bld_format_aos.c \
         gallivm/lp_bld_format_soa.c \
+        gallivm/lp_bld_format_yuv.c \
+        gallivm/lp_bld_gather.c \
         gallivm/lp_bld_init.c \
         gallivm/lp_bld_intr.c \
         gallivm/lp_bld_logic.c \
         gallivm/lp_bld_pack.c \
         gallivm/lp_bld_printf.c \
+        gallivm/lp_bld_quad.c \
         gallivm/lp_bld_sample.c \
+        gallivm/lp_bld_sample_aos.c \
         gallivm/lp_bld_sample_soa.c \
         gallivm/lp_bld_struct.c \
         gallivm/lp_bld_swizzle.c \
+        gallivm/lp_bld_tgsi_aos.c \
         gallivm/lp_bld_tgsi_soa.c \
         gallivm/lp_bld_type.c \
         draw/draw_llvm.c \
-        draw/draw_pt_fetch_shade_pipeline_llvm.c \
-        draw/draw_llvm_translate.c
+        draw/draw_llvm_sample.c \
+        draw/draw_llvm_translate.c \
+        draw/draw_vs_llvm.c \
+        draw/draw_pt_fetch_shade_pipeline_llvm.c
 
-GALLIVM_CPP_SOURCES =
+GALLIVM_CPP_SOURCES = \
+    gallivm/lp_bld_misc.cpp
 
 GENERATED_SOURCES = \
 	indices/u_indices_gen.c \
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index 6cea83060a9..a18f7c0b2a3 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -32,28 +32,30 @@ env.CodeGenerate(
 
 env.CodeGenerate(
     target = 'util/u_format_table.c',
-    script = 'util/u_format_table.py',
-    source = ['util/u_format.csv'],
-    command = 'python $SCRIPT $SOURCE > $TARGET'
+    script = '#src/gallium/auxiliary/util/u_format_table.py',
+    source = ['#src/gallium/auxiliary/util/u_format.csv'],
+    command = python_cmd + ' $SCRIPT $SOURCE > $TARGET'
 )
 
 env.CodeGenerate(
     target = 'util/u_half.c',
     script = 'util/u_half.py',
     source = [],
-    command = 'python $SCRIPT > $TARGET'
+    command = python_cmd + ' $SCRIPT > $TARGET'
 )
 
 env.Depends('util/u_format_table.c', [
-    'util/u_format_parse.py', 
+    '#src/gallium/auxiliary/util/u_format_parse.py',
     'util/u_format_pack.py', 
 ])
 
 source = [
-    'cso_cache/cso_context.c',
     'cso_cache/cso_cache.c',
+    'cso_cache/cso_context.c',
     'cso_cache/cso_hash.c',
     'draw/draw_context.c',
+    'draw/draw_fs.c',
+    'draw/draw_gs.c',
     'draw/draw_pipe.c',
     'draw/draw_pipe_aaline.c',
     'draw/draw_pipe_aapoint.c',
@@ -71,16 +73,15 @@ source = [
     'draw/draw_pipe_wide_line.c',
     'draw/draw_pipe_wide_point.c',
     'draw/draw_pt.c',
-    'draw/draw_pt_elts.c',
     'draw/draw_pt_emit.c',
     'draw/draw_pt_fetch.c',
     'draw/draw_pt_fetch_emit.c',
     'draw/draw_pt_fetch_shade_emit.c',
     'draw/draw_pt_fetch_shade_pipeline.c',
     'draw/draw_pt_post_vs.c',
+    'draw/draw_pt_so_emit.c',
     'draw/draw_pt_util.c',
-    'draw/draw_pt_varray.c',
-    'draw/draw_pt_vcache.c',
+    'draw/draw_pt_vsplit.c',
     'draw/draw_vertex.c',
     'draw/draw_vs.c',
     'draw/draw_vs_aos.c',
@@ -90,16 +91,16 @@ source = [
     'draw/draw_vs_ppc.c',
     'draw/draw_vs_sse.c',
     'draw/draw_vs_varient.c',
-    'draw/draw_gs.c',
     #'indices/u_indices.c',
     #'indices/u_unfilled_indices.c',
     'indices/u_indices_gen.c',
     'indices/u_unfilled_gen.c',
     'os/os_misc.c',
+    'os/os_stream.c',
     'os/os_stream_log.c',
+    'os/os_stream_null.c',
     'os/os_stream_stdc.c',
     'os/os_stream_str.c',
-    'os/os_stream_null.c',
     'os/os_time.c',
     'pipebuffer/pb_buffer_fenced.c',
     'pipebuffer/pb_buffer_malloc.c',
@@ -111,42 +112,45 @@ source = [
     'pipebuffer/pb_bufmgr_pool.c',
     'pipebuffer/pb_bufmgr_slab.c',
     'pipebuffer/pb_validate.c',
+    'rbug/rbug_connection.c',
+    'rbug/rbug_context.c',
     'rbug/rbug_core.c',
+    'rbug/rbug_demarshal.c',
     'rbug/rbug_shader.c',
-    'rbug/rbug_context.c',
     'rbug/rbug_texture.c',
-    'rbug/rbug_demarshal.c',
-    'rbug/rbug_connection.c',
     'rtasm/rtasm_cpu.c',
     'rtasm/rtasm_execmem.c',
-    'rtasm/rtasm_x86sse.c',
     'rtasm/rtasm_ppc.c',
     'rtasm/rtasm_ppc_spe.c',
+    'rtasm/rtasm_x86sse.c',
     'tgsi/tgsi_build.c',
     'tgsi/tgsi_dump.c',
     'tgsi/tgsi_exec.c',
     'tgsi/tgsi_info.c',
     'tgsi/tgsi_iterate.c',
     'tgsi/tgsi_parse.c',
+    'tgsi/tgsi_ppc.c',
     'tgsi/tgsi_sanity.c',
     'tgsi/tgsi_scan.c',
-    'tgsi/tgsi_ppc.c',
     'tgsi/tgsi_sse2.c',
     'tgsi/tgsi_text.c',
     'tgsi/tgsi_transform.c',
     'tgsi/tgsi_ureg.c',
     'tgsi/tgsi_util.c',
-    'translate/translate_generic.c',
-    'translate/translate_sse.c',
     'translate/translate.c',
     'translate/translate_cache.c',
+    'translate/translate_generic.c',
+    'translate/translate_sse.c',
     'util/u_bitmask.c',
     'util/u_blit.c',
     'util/u_blitter.c',
     'util/u_cache.c',
+    'util/u_caps.c',
     'util/u_cpu_detect.c',
     'util/u_debug.c',
+    'util/u_debug_describe.c',
     'util/u_debug_memory.c',
+    'util/u_debug_refcnt.c',
     'util/u_debug_stack.c',
     'util/u_debug_symbol.c',
     'util/u_dump_defines.c',
@@ -161,14 +165,19 @@ source = [
     'util/u_format_tests.c',
     'util/u_format_yuv.c',
     'util/u_format_zs.c',
+    'util/u_framebuffer.c',
     'util/u_gen_mipmap.c',
     'util/u_half.c',
     'util/u_handle_table.c',
     'util/u_hash.c',
     'util/u_hash_table.c',
+    'util/u_index_modify.c',
     'util/u_keymap.c',
+    'util/u_linear.c',
+    'util/u_linkage.c',
     'util/u_network.c',
     'util/u_math.c',
+    'util/u_mempool.c',
     'util/u_mm.c',
     'util/u_rect.c',
     'util/u_resource.c',
@@ -176,6 +185,7 @@ source = [
     'util/u_sampler.c',
     'util/u_simple_shaders.c',
     'util/u_snprintf.c',
+    'util/u_staging.c',
     'util/u_surface.c',
     'util/u_surfaces.c',
     'util/u_texture.c',
@@ -192,26 +202,36 @@ source = [
 if env['llvm']:
     source += [
     'gallivm/lp_bld_arit.c',
+    'gallivm/lp_bld_assert.c',
+    'gallivm/lp_bld_bitarit.c',
     'gallivm/lp_bld_const.c',
     'gallivm/lp_bld_conv.c',
     'gallivm/lp_bld_debug.c',
     'gallivm/lp_bld_flow.c',
     'gallivm/lp_bld_format_aos.c',
     'gallivm/lp_bld_format_soa.c',
+    'gallivm/lp_bld_format_yuv.c',
+    'gallivm/lp_bld_gather.c',
+    'gallivm/lp_bld_init.c',
     'gallivm/lp_bld_intr.c',
     'gallivm/lp_bld_logic.c',
-    'gallivm/lp_bld_init.c',
+    'gallivm/lp_bld_misc.cpp',
     'gallivm/lp_bld_pack.c',
     'gallivm/lp_bld_printf.c',
+    'gallivm/lp_bld_quad.c',
     'gallivm/lp_bld_sample.c',
+    'gallivm/lp_bld_sample_aos.c',
     'gallivm/lp_bld_sample_soa.c',
     'gallivm/lp_bld_struct.c',
     'gallivm/lp_bld_swizzle.c',
+    'gallivm/lp_bld_tgsi_aos.c',
     'gallivm/lp_bld_tgsi_soa.c',
     'gallivm/lp_bld_type.c',
     'draw/draw_llvm.c',
+    'draw/draw_llvm_sample.c',
+    'draw/draw_llvm_translate.c',
     'draw/draw_pt_fetch_shade_pipeline_llvm.c',
-    'draw/draw_llvm_translate.c'
+    'draw/draw_vs_llvm.c'
     ]
 
 gallium = env.ConvenienceLibrary(
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 6d0b4207986..58b022d531d 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -36,6 +36,7 @@
   */
 
 #include "pipe/p_state.h"
+#include "util/u_framebuffer.h"
 #include "util/u_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
@@ -98,14 +99,11 @@ struct cso_context {
    struct pipe_framebuffer_state fb, fb_saved;
    struct pipe_viewport_state vp, vp_saved;
    struct pipe_blend_color blend_color;
+   unsigned sample_mask;
    struct pipe_stencil_ref stencil_ref, stencil_ref_saved;
 };
 
 
-static void
-free_framebuffer_state(struct pipe_framebuffer_state *fb);
-
-
 static boolean delete_blend_state(struct cso_context *ctx, void *state)
 {
    struct cso_blend *cso = (struct cso_blend *)state;
@@ -291,6 +289,9 @@ void cso_release_all( struct cso_context *ctx )
       ctx->pipe->bind_fs_state( ctx->pipe, NULL );
       ctx->pipe->bind_vs_state( ctx->pipe, NULL );
       ctx->pipe->bind_vertex_elements_state( ctx->pipe, NULL );
+      ctx->pipe->set_fragment_sampler_views(ctx->pipe, 0, NULL);
+      if (ctx->pipe->set_vertex_sampler_views)
+         ctx->pipe->set_vertex_sampler_views(ctx->pipe, 0, NULL);
    }
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
@@ -303,8 +304,8 @@ void cso_release_all( struct cso_context *ctx )
       pipe_sampler_view_reference(&ctx->vertex_sampler_views_saved[i], NULL);
    }
 
-   free_framebuffer_state(&ctx->fb);
-   free_framebuffer_state(&ctx->fb_saved);
+   util_unreference_framebuffer_state(&ctx->fb);
+   util_unreference_framebuffer_state(&ctx->fb_saved);
 
    if (ctx->cache) {
       cso_cache_delete( ctx->cache );
@@ -313,10 +314,13 @@ void cso_release_all( struct cso_context *ctx )
 }
 
 
+/**
+ * Free the CSO context.  NOTE: the state tracker should have previously called
+ * cso_release_all().
+ */
 void cso_destroy_context( struct cso_context *ctx )
 {
    if (ctx) {
-      /*cso_release_all( ctx );*/
       FREE( ctx );
    }
 }
@@ -893,42 +897,11 @@ void cso_restore_vertex_shader(struct cso_context *ctx)
 }
 
 
-/**
- * Copy framebuffer state from src to dst with refcounting of surfaces.
- */
-static void
-copy_framebuffer_state(struct pipe_framebuffer_state *dst,
-                       const struct pipe_framebuffer_state *src)
-{
-   uint i;
-
-   dst->width = src->width;
-   dst->height = src->height;
-   dst->nr_cbufs = src->nr_cbufs;
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
-      pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]);
-   }
-   pipe_surface_reference(&dst->zsbuf, src->zsbuf);
-}
-
-
-static void
-free_framebuffer_state(struct pipe_framebuffer_state *fb)
-{
-   uint i;
-
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
-      pipe_surface_reference(&fb->cbufs[i], NULL);
-   }
-   pipe_surface_reference(&fb->zsbuf, NULL);
-}
-
-
 enum pipe_error cso_set_framebuffer(struct cso_context *ctx,
                                     const struct pipe_framebuffer_state *fb)
 {
    if (memcmp(&ctx->fb, fb, sizeof(*fb)) != 0) {
-      copy_framebuffer_state(&ctx->fb, fb);
+      util_copy_framebuffer_state(&ctx->fb, fb);
       ctx->pipe->set_framebuffer_state(ctx->pipe, fb);
    }
    return PIPE_OK;
@@ -936,15 +909,15 @@ enum pipe_error cso_set_framebuffer(struct cso_context *ctx,
 
 void cso_save_framebuffer(struct cso_context *ctx)
 {
-   copy_framebuffer_state(&ctx->fb_saved, &ctx->fb);
+   util_copy_framebuffer_state(&ctx->fb_saved, &ctx->fb);
 }
 
 void cso_restore_framebuffer(struct cso_context *ctx)
 {
    if (memcmp(&ctx->fb, &ctx->fb_saved, sizeof(ctx->fb))) {
-      copy_framebuffer_state(&ctx->fb, &ctx->fb_saved);
+      util_copy_framebuffer_state(&ctx->fb, &ctx->fb_saved);
       ctx->pipe->set_framebuffer_state(ctx->pipe, &ctx->fb);
-      free_framebuffer_state(&ctx->fb_saved);
+      util_unreference_framebuffer_state(&ctx->fb_saved);
    }
 }
 
@@ -984,6 +957,16 @@ enum pipe_error cso_set_blend_color(struct cso_context *ctx,
    return PIPE_OK;
 }
 
+enum pipe_error cso_set_sample_mask(struct cso_context *ctx,
+                                    unsigned sample_mask)
+{
+   if (ctx->sample_mask != sample_mask) {   /* only call the driver when the cached mask changes */
+      ctx->sample_mask = sample_mask;
+      ctx->pipe->set_sample_mask(ctx->pipe, sample_mask);
+   }
+   return PIPE_OK;   /* always PIPE_OK; set_sample_mask() reports nothing back here */
+}
+
 enum pipe_error cso_set_stencil_ref(struct cso_context *ctx,
                                     const struct pipe_stencil_ref *sr)
 {
@@ -1049,6 +1032,7 @@ static INLINE void
 clip_state_cpy(struct pipe_clip_state *dst,
                const struct pipe_clip_state *src)
 {
+   dst->depth_clamp = src->depth_clamp;
    dst->nr = src->nr;
    if (src->nr) {
       memcpy(dst->ucp, src->ucp, src->nr * sizeof(src->ucp[0]));
@@ -1059,6 +1043,9 @@ static INLINE int
 clip_state_cmp(const struct pipe_clip_state *a,
                const struct pipe_clip_state *b)
 {
+   if (a->depth_clamp != b->depth_clamp) {
+      return 1;
+   }
    if (a->nr != b->nr) {
       return 1;
    }
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index d6bcb1fe8f7..f0b07f73765 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -159,6 +159,8 @@ void cso_restore_viewport(struct cso_context *cso);
 enum pipe_error cso_set_blend_color(struct cso_context *cso,
                                     const struct pipe_blend_color *bc);
 
+enum pipe_error cso_set_sample_mask(struct cso_context *cso,
+                                    unsigned sample_mask);
 
 enum pipe_error cso_set_stencil_ref(struct cso_context *cso,
                                     const struct pipe_stencil_ref *sr);
diff --git a/src/gallium/auxiliary/draw/draw_cliptest_tmp.h b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
new file mode 100644
index 00000000000..958ed20dc84
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
@@ -0,0 +1,114 @@
+/**************************************************************************
+ * 
+ * Copyright 2010, VMware, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Template: the includer is expected to #define TAG() and FLAGS (both are
+ * #undef'd at the end).  Returns TRUE if any vertex is clipped or has a cleared edgeflag. */
+static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
+                                 struct draw_vertex_info *info )
+{
+   struct vertex_header *out = info->verts;
+   const float *scale = pvs->draw->viewport.scale;
+   const float *trans = pvs->draw->viewport.translate;
+   /* const */ float (*plane)[4] = pvs->draw->plane;
+   const unsigned pos = draw_current_shader_position_output(pvs->draw);
+   const unsigned ef = pvs->draw->vs.edgeflag_output;
+   const unsigned nr = pvs->draw->nr_planes;
+   const unsigned flags = (FLAGS);   /* fixed per instantiation of this template */
+   unsigned need_pipeline = 0;   /* accumulates clipmask bits and cleared edgeflags */
+   unsigned j;
+
+   for (j = 0; j < info->count; j++) {
+      float *position = out->data[pos];
+      unsigned mask = 0x0;
+  
+      initialize_vertex_header(out);
+
+      if (flags & (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_HALF_Z | DO_CLIP_USER)) {
+         out->clip[0] = position[0];   /* keep the clip-space position before any viewport mapping */
+         out->clip[1] = position[1];
+         out->clip[2] = position[2];
+         out->clip[3] = position[3];
+
+         /* Do the hardwired planes first:
+          */
+         if (flags & DO_CLIP_XY) {
+            if (-position[0] + position[3] < 0) mask |= (1<<0);
+            if ( position[0] + position[3] < 0) mask |= (1<<1);
+            if (-position[1] + position[3] < 0) mask |= (1<<2);
+            if ( position[1] + position[3] < 0) mask |= (1<<3);
+         }
+
+         /* Clip Z planes according to full cube, half cube or none.
+          */
+         if (flags & DO_CLIP_FULL_Z) {
+            if ( position[2] + position[3] < 0) mask |= (1<<4);
+            if (-position[2] + position[3] < 0) mask |= (1<<5);
+         }
+         else if (flags & DO_CLIP_HALF_Z) {
+            if ( position[2]               < 0) mask |= (1<<4);
+            if (-position[2] + position[3] < 0) mask |= (1<<5);
+         }
+
+         if (flags & DO_CLIP_USER) {
+            unsigned i;
+            for (i = 6; i < nr; i++) {   /* bits 0..5 belong to the hardwired planes tested above */
+               if (dot4(position, plane[i]) < 0) 
+                  mask |= (1<<i);
+            }
+         }
+
+         out->clipmask = mask;
+         need_pipeline |= out->clipmask;
+      }
+
+      if ((flags & DO_VIEWPORT) && mask == 0)   /* vertices with any clip bit set are left untransformed */
+      {
+	 /* divide by w */
+	 float w = 1.0f / position[3];
+
+	 /* Viewport mapping */
+	 position[0] = position[0] * w * scale[0] + trans[0];
+	 position[1] = position[1] * w * scale[1] + trans[1];
+	 position[2] = position[2] * w * scale[2] + trans[2];
+	 position[3] = w;
+      }
+
+      if ((flags & DO_EDGEFLAG) && ef) {   /* ef == 0 is treated as "no edgeflag output" */
+         const float *edgeflag = out->data[ef];
+         out->edgeflag = !(edgeflag[0] != 1.0f);   /* set only when the output is exactly 1.0f */
+         need_pipeline |= !out->edgeflag;
+      }
+
+      out = (struct vertex_header *)( (char *)out + info->stride );   /* stride is in bytes */
+   }
+
+   return need_pipeline != 0;
+}
+
+
+#undef FLAGS
+#undef TAG
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 02abddf1491..032fcbbc70a 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -34,12 +34,33 @@
 #include "pipe/p_context.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 #include "draw_context.h"
 #include "draw_vs.h"
 #include "draw_gs.h"
 
 #if HAVE_LLVM
 #include "gallivm/lp_bld_init.h"
+#include "draw_llvm.h"
+
+static boolean
+draw_get_option_use_llvm(void)   /* cached answer to "should draw use its LLVM path?" */
+{
+   static boolean first = TRUE;   /* the option is evaluated on the first call only */
+   static boolean value;
+   if (first) {
+      first = FALSE;
+      value = debug_get_bool_option("DRAW_USE_LLVM", TRUE);   /* presumably TRUE is the default when unset */
+
+#ifdef PIPE_ARCH_X86
+      util_cpu_detect();
+      /* require SSE2 due to LLVM PR6960. */
+      if (!util_cpu_caps.has_sse2)
+         value = FALSE;   /* overrides the DRAW_USE_LLVM setting */
+#endif
+   }
+   return value;
+}
 #endif
 
 struct draw_context *draw_create( struct pipe_context *pipe )
@@ -49,9 +70,13 @@ struct draw_context *draw_create( struct pipe_context *pipe )
       goto fail;
 
 #if HAVE_LLVM
-   lp_build_init();
-   assert(lp_build_engine);
-   draw->engine = lp_build_engine;
+   if(draw_get_option_use_llvm())
+   {
+      lp_build_init();
+      assert(lp_build_engine);
+      draw->engine = lp_build_engine;
+      draw->llvm = draw_llvm_create(draw);
+   }
 #endif
 
    if (!draw_init(draw))
@@ -81,6 +106,8 @@ boolean draw_init(struct draw_context *draw)
    ASSIGN_4V( draw->plane[4],  0,  0,  1, 1 ); /* yes these are correct */
    ASSIGN_4V( draw->plane[5],  0,  0, -1, 1 ); /* mesa's a bit wonky */
    draw->nr_planes = 6;
+   draw->clip_xy = 1;
+   draw->clip_z = 1;
 
 
    draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */
@@ -132,6 +159,10 @@ void draw_destroy( struct draw_context *draw )
    draw_pt_destroy( draw );
    draw_vs_destroy( draw );
    draw_gs_destroy( draw );
+#ifdef HAVE_LLVM
+   if(draw->llvm)
+      draw_llvm_destroy( draw->llvm );
+#endif
 
    FREE( draw );
 }
@@ -157,6 +188,14 @@ void draw_set_mrd(struct draw_context *draw, double mrd)
 }
 
 
+static void update_clip_flags( struct draw_context *draw )   /* recompute clip_xy / clip_z / clip_user */
+{
+   draw->clip_xy = !draw->driver.bypass_clip_xy;
+   draw->clip_z = (!draw->driver.bypass_clip_z &&
+                   !draw->depth_clamp);   /* depth clamp also turns z clipping off */
+   draw->clip_user = (draw->nr_planes > 6);   /* planes past the six fixed ones are user clip planes */
+}
+
 /**
  * Register new primitive rasterization/rendering state.
  * This causes the drawing pipeline to be rebuilt.
@@ -171,18 +210,25 @@ void draw_set_rasterizer_state( struct draw_context *draw,
       draw->rasterizer = raster;
       draw->rast_handle = rast_handle;
 
-      draw->bypass_clipping = draw->driver.bypass_clipping;
-   }
+  }
 }
 
-
+/* With a little more work, llvmpipe will be able to turn this off and
+ * do its own x/y clipping.  
+ *
+ * Some hardware can turn off clipping altogether - in particular any
+ * hardware with a TNL unit can do its own clipping, even if it is
+ * relying on the draw module for some other reason.
+ */
 void draw_set_driver_clipping( struct draw_context *draw,
-                               boolean bypass_clipping )
+                               boolean bypass_clip_xy,
+                               boolean bypass_clip_z )
 {
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
 
-   draw->driver.bypass_clipping = bypass_clipping;
-   draw->bypass_clipping = draw->driver.bypass_clipping;
+   draw->driver.bypass_clip_xy = bypass_clip_xy;
+   draw->driver.bypass_clip_z = bypass_clip_z;
+   update_clip_flags(draw);
 }
 
 
@@ -211,6 +257,9 @@ void draw_set_clip_state( struct draw_context *draw,
    assert(clip->nr <= PIPE_MAX_CLIP_PLANES);
    memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0]));
    draw->nr_planes = 6 + clip->nr;
+   draw->depth_clamp = clip->depth_clamp;
+
+   update_clip_flags(draw);
 }
 
 
@@ -282,12 +331,19 @@ draw_set_mapped_constant_buffer(struct draw_context *draw,
                 shader_type == PIPE_SHADER_GEOMETRY);
    debug_assert(slot < PIPE_MAX_CONSTANT_BUFFERS);
 
-   if (shader_type == PIPE_SHADER_VERTEX) {
+   switch (shader_type) {
+   case PIPE_SHADER_VERTEX:
       draw->pt.user.vs_constants[slot] = buffer;
+      draw->pt.user.vs_constants_size[slot] = size;
       draw_vs_set_constants(draw, slot, buffer, size);
-   } else if (shader_type == PIPE_SHADER_GEOMETRY) {
+      break;
+   case PIPE_SHADER_GEOMETRY:
       draw->pt.user.gs_constants[slot] = buffer;
+      draw->pt.user.gs_constants_size[slot] = size;
       draw_gs_set_constants(draw, slot, buffer, size);
+      break;
+   default:
+      assert(0 && "invalid shader type in draw_set_mapped_constant_buffer");
    }
 }
 
@@ -357,6 +413,42 @@ draw_set_force_passthrough( struct draw_context *draw, boolean enable )
 }
 
 
+
+/**
+ * Allocate an extra vertex/geometry shader vertex attribute.
+ * This is used by some of the optional draw module stages such
+ * as wide_point which may need to allocate additional generic/texcoord
+ * attributes.
+ */
+int
+draw_alloc_extra_vertex_attrib(struct draw_context *draw,
+                               uint semantic_name, uint semantic_index)
+{
+   const int num_outputs = draw_current_shader_outputs(draw);
+   const int n = draw->extra_shader_outputs.num;   /* index of the next free extra-attribute entry */
+
+   assert(n < Elements(draw->extra_shader_outputs.semantic_name));   /* NOTE(review): assert-only bound check */
+
+   draw->extra_shader_outputs.semantic_name[n] = semantic_name;
+   draw->extra_shader_outputs.semantic_index[n] = semantic_index;
+   draw->extra_shader_outputs.slot[n] = num_outputs + n;   /* extras are numbered after num_outputs */
+   draw->extra_shader_outputs.num++;
+
+   return draw->extra_shader_outputs.slot[n];   /* the slot just assigned */
+}
+
+
+/**
+ * Remove all extra vertex attributes that were allocated with
+ * draw_alloc_extra_vertex_attrib().
+ */
+void
+draw_remove_extra_vertex_attribs(struct draw_context *draw)
+{
+   draw->extra_shader_outputs.num = 0;
+}
+
+
 /**
  * Ask the draw module for the location/slot of the given vertex attribute in
  * a post-transformed vertex.
@@ -390,12 +482,12 @@ draw_find_shader_output(const struct draw_context *draw,
          return i;
    }
 
-   /* XXX there may be more than one extra vertex attrib.
-    * For example, simulated gl_FragCoord and gl_PointCoord.
-    */
-   if (draw->extra_shader_outputs.semantic_name == semantic_name &&
-       draw->extra_shader_outputs.semantic_index == semantic_index) {
-      return draw->extra_shader_outputs.slot;
+   /* Search the extra vertex attributes */
+   for (i = 0; i < draw->extra_shader_outputs.num; i++) {
+      if (draw->extra_shader_outputs.semantic_name[i] == semantic_name &&
+          draw->extra_shader_outputs.semantic_index[i] == semantic_index) {
+         return draw->extra_shader_outputs.slot[i];
+      }
    }
 
    return 0;
@@ -414,16 +506,18 @@ draw_find_shader_output(const struct draw_context *draw,
 uint
 draw_num_shader_outputs(const struct draw_context *draw)
 {
-   uint count = draw->vs.vertex_shader->info.num_outputs;
+   uint count;
 
    /* If a geometry shader is present, its outputs go to the
     * driver, else the vertex shader's outputs.
     */
    if (draw->gs.geometry_shader)
       count = draw->gs.geometry_shader->info.num_outputs;
+   else
+      count = draw->vs.vertex_shader->info.num_outputs;
+
+   count += draw->extra_shader_outputs.num;
 
-   if (draw->extra_shader_outputs.slot > 0)
-      count++;
    return count;
 }
 
@@ -435,13 +529,18 @@ draw_num_shader_outputs(const struct draw_context *draw)
  */
 void
 draw_texture_samplers(struct draw_context *draw,
+                      uint shader,
                       uint num_samplers,
                       struct tgsi_sampler **samplers)
 {
-   draw->vs.num_samplers = num_samplers;
-   draw->vs.samplers = samplers;
-   draw->gs.num_samplers = num_samplers;
-   draw->gs.samplers = samplers;
+   if (shader == PIPE_SHADER_VERTEX) {
+      draw->vs.num_samplers = num_samplers;
+      draw->vs.samplers = samplers;
+   } else {
+      debug_assert(shader == PIPE_SHADER_GEOMETRY);
+      draw->gs.num_samplers = num_samplers;
+      draw->gs.samplers = samplers;
+   }
 }
 
 
@@ -454,47 +553,28 @@ void draw_set_render( struct draw_context *draw,
 }
 
 
-
-/**
- * Tell the drawing context about the index/element buffer to use
- * (ala glDrawElements)
- * If no element buffer is to be used (i.e. glDrawArrays) then this
- * should be called with eltSize=0 and elements=NULL.
- *
- * \param draw  the drawing context
- * \param eltSize  size of each element (1, 2 or 4 bytes)
- * \param elements  the element buffer ptr
- */
 void
-draw_set_mapped_element_buffer_range( struct draw_context *draw,
-                                      unsigned eltSize,
-                                      int eltBias,
-                                      unsigned min_index,
-                                      unsigned max_index,
-                                      const void *elements )
+draw_set_index_buffer(struct draw_context *draw,
+                      const struct pipe_index_buffer *ib)
 {
-   draw->pt.user.elts = elements;
-   draw->pt.user.eltSize = eltSize;
-   draw->pt.user.eltBias = eltBias;
-   draw->pt.user.min_index = min_index;
-   draw->pt.user.max_index = max_index;
+   if (ib)
+      memcpy(&draw->pt.index_buffer, ib, sizeof(draw->pt.index_buffer));
+   else
+      memset(&draw->pt.index_buffer, 0, sizeof(draw->pt.index_buffer));
 }
 
 
+/**
+ * Tell drawing context where to find mapped index/element buffer.
+ */
 void
-draw_set_mapped_element_buffer( struct draw_context *draw,
-                                unsigned eltSize,
-                                int eltBias,
-                                const void *elements )
+draw_set_mapped_index_buffer(struct draw_context *draw,
+                             const void *elements)
 {
-   draw->pt.user.elts = elements;
-   draw->pt.user.eltSize = eltSize;
-   draw->pt.user.eltBias = eltBias;
-   draw->pt.user.min_index = 0;
-   draw->pt.user.max_index = 0xffffffff;
+    draw->pt.user.elts = elements;
 }
 
- 
+
 /* Revamp me please:
  */
 void draw_do_flush( struct draw_context *draw, unsigned flags )
@@ -566,7 +646,7 @@ draw_get_rasterizer_no_cull( struct draw_context *draw,
       memset(&rast, 0, sizeof(rast));
       rast.scissor = scissor;
       rast.flatshade = flatshade;
-      rast.front_winding = PIPE_WINDING_CCW;
+      rast.front_ccw = 1;
       rast.gl_rasterization_rules = draw->rasterizer->gl_rasterization_rules;
 
       draw->rasterizer_no_cull[scissor][flatshade] =
@@ -574,3 +654,82 @@ draw_get_rasterizer_no_cull( struct draw_context *draw,
    }
    return draw->rasterizer_no_cull[scissor][flatshade];
 }
+
+void
+draw_set_mapped_so_buffers(struct draw_context *draw,
+                           void *buffers[PIPE_MAX_SO_BUFFERS],
+                           unsigned num_buffers)
+{
+   int i;
+
+   for (i = 0; i < num_buffers; ++i) {
+      draw->so.buffers[i] = buffers[i];
+   }
+   draw->so.num_buffers = num_buffers;
+}
+
+void
+draw_set_so_state(struct draw_context *draw,
+                  struct pipe_stream_output_state *state)
+{
+   memcpy(&draw->so.state,
+          state,
+          sizeof(struct pipe_stream_output_state));
+}
+
+void
+draw_set_sampler_views(struct draw_context *draw,
+                       struct pipe_sampler_view **views,
+                       unsigned num)
+{
+   unsigned i;
+
+   debug_assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   for (i = 0; i < num; ++i)
+      draw->sampler_views[i] = views[i];
+   for (i = num; i < PIPE_MAX_VERTEX_SAMPLERS; ++i)
+      draw->sampler_views[i] = NULL;
+
+   draw->num_sampler_views = num;
+}
+
+void
+draw_set_samplers(struct draw_context *draw,
+                  struct pipe_sampler_state **samplers,
+                  unsigned num)
+{
+   unsigned i;
+
+   debug_assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
+
+   for (i = 0; i < num; ++i)
+      draw->samplers[i] = samplers[i];
+   for (i = num; i < PIPE_MAX_VERTEX_SAMPLERS; ++i)
+      draw->samplers[i] = NULL;
+
+   draw->num_samplers = num;
+
+#ifdef HAVE_LLVM
+   if (draw->llvm)
+      draw_llvm_set_sampler_state(draw);
+#endif
+}
+
+void
+draw_set_mapped_texture(struct draw_context *draw,
+                        unsigned sampler_idx,
+                        uint32_t width, uint32_t height, uint32_t depth,
+                        uint32_t last_level,
+                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
+                        const void *data[DRAW_MAX_TEXTURE_LEVELS])
+{
+#ifdef HAVE_LLVM
+   if(draw->llvm)
+      draw_llvm_set_mapped_texture(draw,
+                                sampler_idx,
+                                width, height, depth, last_level,
+                                row_stride, img_stride, data);
+#endif
+}
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index b905c2f2da6..1f27cbf488a 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -39,19 +39,24 @@
 
 
 #include "pipe/p_state.h"
+#include "tgsi/tgsi_exec.h"
 
 struct pipe_context;
 struct draw_context;
 struct draw_stage;
 struct draw_vertex_shader;
 struct draw_geometry_shader;
+struct draw_fragment_shader;
 struct tgsi_sampler;
 
+#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
 
 struct draw_context *draw_create( struct pipe_context *pipe );
 
 void draw_destroy( struct draw_context *draw );
 
+void draw_flush(struct draw_context *draw);
+
 void draw_set_viewport_state( struct draw_context *draw,
                               const struct pipe_viewport_state *viewport );
 
@@ -97,9 +102,27 @@ draw_num_shader_outputs(const struct draw_context *draw);
 
 void
 draw_texture_samplers(struct draw_context *draw,
+                      uint shader_type,
                       uint num_samplers,
                       struct tgsi_sampler **samplers);
 
+void
+draw_set_sampler_views(struct draw_context *draw,
+                       struct pipe_sampler_view **views,
+                       unsigned num);
+void
+draw_set_samplers(struct draw_context *draw,
+                  struct pipe_sampler_state **samplers,
+                  unsigned num);
+
+void
+draw_set_mapped_texture(struct draw_context *draw,
+                        unsigned sampler_idx,
+                        uint32_t width, uint32_t height, uint32_t depth,
+                        uint32_t last_level,
+                        uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
+                        uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
+                        const void *data[DRAW_MAX_TEXTURE_LEVELS]);
 
 
 /*
@@ -116,6 +139,17 @@ void draw_delete_vertex_shader(struct draw_context *draw,
 
 
 /*
+ * Fragment shader functions
+ */
+struct draw_fragment_shader *
+draw_create_fragment_shader(struct draw_context *draw,
+                            const struct pipe_shader_state *shader);
+void draw_bind_fragment_shader(struct draw_context *draw,
+                               struct draw_fragment_shader *dvs);
+void draw_delete_fragment_shader(struct draw_context *draw,
+                                 struct draw_fragment_shader *dvs);
+
+/*
  * Geometry shader functions
  */
 struct draw_geometry_shader *
@@ -139,18 +173,11 @@ void draw_set_vertex_elements(struct draw_context *draw,
 			      unsigned count,
                               const struct pipe_vertex_element *elements);
 
-void
-draw_set_mapped_element_buffer_range( struct draw_context *draw,
-                                      unsigned eltSize,
-                                      int eltBias,
-                                      unsigned min_index,
-                                      unsigned max_index,
-                                      const void *elements );
-
-void draw_set_mapped_element_buffer( struct draw_context *draw,
-                                     unsigned eltSize, 
-                                     int eltBias,
-                                     const void *elements );
+void draw_set_index_buffer(struct draw_context *draw,
+                           const struct pipe_index_buffer *ib);
+
+void draw_set_mapped_index_buffer(struct draw_context *draw,
+                                  const void *elements);
 
 void draw_set_mapped_vertex_buffer(struct draw_context *draw,
                                    unsigned attr, const void *buffer);
@@ -162,11 +189,22 @@ draw_set_mapped_constant_buffer(struct draw_context *draw,
                                 const void *buffer,
                                 unsigned size);
 
+void
+draw_set_mapped_so_buffers(struct draw_context *draw,
+                           void *buffers[PIPE_MAX_SO_BUFFERS],
+                           unsigned num_buffers);
+void
+draw_set_so_state(struct draw_context *draw,
+                  struct pipe_stream_output_state *state);
+
 
 /***********************************************************************
- * draw_prim.c 
+ * draw_pt.c 
  */
 
+void draw_vbo(struct draw_context *draw,
+              const struct pipe_draw_info *info);
+
 void draw_arrays(struct draw_context *draw, unsigned prim,
 		 unsigned start, unsigned count);
 
@@ -178,8 +216,6 @@ draw_arrays_instanced(struct draw_context *draw,
                       unsigned startInstance,
                       unsigned instanceCount);
 
-void draw_flush(struct draw_context *draw);
-
 
 /*******************************************************************************
  * Driver backend interface 
@@ -189,7 +225,8 @@ void draw_set_render( struct draw_context *draw,
 		      struct vbuf_render *render );
 
 void draw_set_driver_clipping( struct draw_context *draw,
-                               boolean bypass_clipping );
+                               boolean bypass_clip_xy,
+                               boolean bypass_clip_z );
 
 void draw_set_force_passthrough( struct draw_context *draw, 
                                  boolean enable );
@@ -201,4 +238,16 @@ boolean draw_need_pipeline(const struct draw_context *draw,
                            const struct pipe_rasterizer_state *rasterizer,
                            unsigned prim );
 
+static INLINE int
+draw_get_shader_param(unsigned shader, enum pipe_cap param)
+{
+   switch(shader) {
+   case PIPE_SHADER_VERTEX:
+   case PIPE_SHADER_GEOMETRY:
+      return tgsi_exec_get_shader_param(param);
+   default:
+      return 0;
+   }
+}
+
 #endif /* DRAW_CONTEXT_H */
diff --git a/src/gallium/auxiliary/draw/draw_decompose_tmp.h b/src/gallium/auxiliary/draw/draw_decompose_tmp.h
new file mode 100644
index 00000000000..a142563af97
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_decompose_tmp.h
@@ -0,0 +1,431 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+/* these macros are optional */
+#ifndef LOCAL_VARS
+#define LOCAL_VARS
+#endif
+#ifndef FUNC_ENTER
+#define FUNC_ENTER do {} while (0)
+#endif
+#ifndef FUNC_EXIT
+#define FUNC_EXIT do {} while (0)
+#endif
+#ifndef LINE_ADJ
+#define LINE_ADJ(flags, a0, i0, i1, a1) LINE(flags, i0, i1)
+#endif
+#ifndef TRIANGLE_ADJ
+#define TRIANGLE_ADJ(flags, i0, a0, i1, a1, i2, a2) TRIANGLE(flags, i0, i1, i2)
+#endif
+
+static void
+FUNC(FUNC_VARS)
+{
+   unsigned idx[6], i;
+   ushort flags;
+   LOCAL_VARS
+
+   FUNC_ENTER;
+
+   /* prim, prim_flags, count, and last_vertex_last should have been defined */
+   if (0) {
+      debug_printf("%s: prim 0x%x, prim_flags 0x%x, count %d, last_vertex_last %d\n",
+            __FUNCTION__, prim, prim_flags, count, last_vertex_last);
+   }
+
+   switch (prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < count; i++) {
+         idx[0] = GET_ELT(i);
+         POINT(idx[0]);
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      flags = DRAW_PIPE_RESET_STIPPLE;
+      for (i = 0; i + 1 < count; i += 2) {
+         idx[0] = GET_ELT(i);
+         idx[1] = GET_ELT(i + 1);
+         LINE(flags, idx[0], idx[1]);
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+   case PIPE_PRIM_LINE_STRIP:
+      if (count >= 2) {
+         flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE;
+         idx[1] = GET_ELT(0);
+         idx[2] = idx[1];
+
+         for (i = 1; i < count; i++, flags = 0) {
+            idx[0] = idx[1];
+            idx[1] = GET_ELT(i);
+            LINE(flags, idx[0], idx[1]);
+         }
+         /* close the loop */
+         if (prim == PIPE_PRIM_LINE_LOOP && !prim_flags)
+            LINE(flags, idx[1], idx[2]);
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
+      for (i = 0; i + 2 < count; i += 3) {
+         idx[0] = GET_ELT(i);
+         idx[1] = GET_ELT(i + 1);
+         idx[2] = GET_ELT(i + 2);
+         TRIANGLE(flags, idx[0], idx[1], idx[2]);
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      if (count >= 3) {
+         flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
+         idx[1] = GET_ELT(0);
+         idx[2] = GET_ELT(1);
+
+         if (last_vertex_last) {
+            for (i = 0; i + 2 < count; i++) {
+               idx[0] = idx[1];
+               idx[1] = idx[2];
+               idx[2] = GET_ELT(i + 2);
+               /* always emit idx[2] last */
+               if (i & 1)
+                  TRIANGLE(flags, idx[1], idx[0], idx[2]);
+               else
+                  TRIANGLE(flags, idx[0], idx[1], idx[2]);
+            }
+         }
+         else {
+            for (i = 0; i + 2 < count; i++) {
+               idx[0] = idx[1];
+               idx[1] = idx[2];
+               idx[2] = GET_ELT(i + 2);
+               /* always emit idx[0] first */
+               if (i & 1)
+                  TRIANGLE(flags, idx[0], idx[2], idx[1]);
+               else
+                  TRIANGLE(flags, idx[0], idx[1], idx[2]);
+            }
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      if (count >= 3) {
+         flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
+         idx[0] = GET_ELT(0);
+         idx[2] = GET_ELT(1);
+
+         /* idx[0] is neither the first nor the last vertex */
+         if (last_vertex_last) {
+            for (i = 0; i + 2 < count; i++) {
+               idx[1] = idx[2];
+               idx[2] = GET_ELT(i + 2);
+               /* always emit idx[2] last */
+               TRIANGLE(flags, idx[0], idx[1], idx[2]);
+            }
+         }
+         else {
+            for (i = 0; i + 2 < count; i++) {
+               idx[1] = idx[2];
+               idx[2] = GET_ELT(i + 2);
+               /* always emit idx[1] first */
+               TRIANGLE(flags, idx[1], idx[2], idx[0]);
+            }
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUADS:
+      if (last_vertex_last) {
+         for (i = 0; i + 3 < count; i += 4) {
+            idx[0] = GET_ELT(i);
+            idx[1] = GET_ELT(i + 1);
+            idx[2] = GET_ELT(i + 2);
+            idx[3] = GET_ELT(i + 3);
+
+            flags = DRAW_PIPE_RESET_STIPPLE |
+                    DRAW_PIPE_EDGE_FLAG_0 |
+                    DRAW_PIPE_EDGE_FLAG_2;
+            /* always emit idx[3] last */
+            TRIANGLE(flags, idx[0], idx[1], idx[3]);
+
+            flags = DRAW_PIPE_EDGE_FLAG_0 |
+                    DRAW_PIPE_EDGE_FLAG_1;
+            TRIANGLE(flags, idx[1], idx[2], idx[3]);
+         }
+      }
+      else {
+         for (i = 0; i + 3 < count; i += 4) {
+            idx[0] = GET_ELT(i);
+            idx[1] = GET_ELT(i + 1);
+            idx[2] = GET_ELT(i + 2);
+            idx[3] = GET_ELT(i + 3);
+
+            flags = DRAW_PIPE_RESET_STIPPLE |
+                    DRAW_PIPE_EDGE_FLAG_0 |
+                    DRAW_PIPE_EDGE_FLAG_1;
+            /* XXX should always emit idx[0] first */
+            /* always emit idx[3] first */
+            TRIANGLE(flags, idx[3], idx[0], idx[1]);
+
+            flags = DRAW_PIPE_EDGE_FLAG_1 |
+                    DRAW_PIPE_EDGE_FLAG_2;
+            TRIANGLE(flags, idx[3], idx[1], idx[2]);
+         }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      if (count >= 4) {
+         idx[2] = GET_ELT(0);
+         idx[3] = GET_ELT(1);
+
+         if (last_vertex_last) {
+            for (i = 0; i + 3 < count; i += 2) {
+               idx[0] = idx[2];
+               idx[1] = idx[3];
+               idx[2] = GET_ELT(i + 2);
+               idx[3] = GET_ELT(i + 3);
+
+               /* always emit idx[3] last */
+               flags = DRAW_PIPE_RESET_STIPPLE |
+                       DRAW_PIPE_EDGE_FLAG_0 |
+                       DRAW_PIPE_EDGE_FLAG_2;
+               TRIANGLE(flags, idx[2], idx[0], idx[3]);
+
+               flags = DRAW_PIPE_EDGE_FLAG_0 |
+                       DRAW_PIPE_EDGE_FLAG_1;
+               TRIANGLE(flags, idx[0], idx[1], idx[3]);
+            }
+         }
+         else {
+            for (i = 0; i + 3 < count; i += 2) {
+               idx[0] = idx[2];
+               idx[1] = idx[3];
+               idx[2] = GET_ELT(i + 2);
+               idx[3] = GET_ELT(i + 3);
+
+               flags = DRAW_PIPE_RESET_STIPPLE |
+                       DRAW_PIPE_EDGE_FLAG_0 |
+                       DRAW_PIPE_EDGE_FLAG_1;
+               /* XXX should always emit idx[0] first */
+               /* always emit idx[3] first */
+               TRIANGLE(flags, idx[3], idx[2], idx[0]);
+
+               flags = DRAW_PIPE_EDGE_FLAG_1 |
+                       DRAW_PIPE_EDGE_FLAG_2;
+               TRIANGLE(flags, idx[3], idx[0], idx[1]);
+            }
+         }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      if (count >= 3) {
+         ushort edge_next, edge_finish;
+
+         if (last_vertex_last) {
+            flags = (DRAW_PIPE_RESET_STIPPLE |
+                     DRAW_PIPE_EDGE_FLAG_0);
+            if (!(prim_flags & DRAW_SPLIT_BEFORE))
+               flags |= DRAW_PIPE_EDGE_FLAG_2;
+
+            edge_next = DRAW_PIPE_EDGE_FLAG_0;
+            edge_finish =
+               (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_1;
+         }
+         else {
+            flags = (DRAW_PIPE_RESET_STIPPLE |
+                     DRAW_PIPE_EDGE_FLAG_1);
+            if (!(prim_flags & DRAW_SPLIT_BEFORE))
+               flags |= DRAW_PIPE_EDGE_FLAG_0;
+
+            edge_next = DRAW_PIPE_EDGE_FLAG_1;
+            edge_finish =
+               (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_2;
+         }
+
+         idx[0] = GET_ELT(0);
+         idx[2] = GET_ELT(1);
+
+         for (i = 0; i + 2 < count; i++, flags = edge_next) {
+            idx[1] = idx[2];
+            idx[2] = GET_ELT(i + 2);
+
+            if (i + 3 == count)
+               flags |= edge_finish;
+
+            /* idx[0] is both the first and the last vertex */
+            if (last_vertex_last)
+               TRIANGLE(flags, idx[1], idx[2], idx[0]);
+            else
+               TRIANGLE(flags, idx[0], idx[1], idx[2]);
+         }
+      }
+      break;
+
+   case PIPE_PRIM_LINES_ADJACENCY:
+      flags = DRAW_PIPE_RESET_STIPPLE;
+      for (i = 0; i + 3 < count; i += 4) {
+         idx[0] = GET_ELT(i);
+         idx[1] = GET_ELT(i + 1);
+         idx[2] = GET_ELT(i + 2);
+         idx[3] = GET_ELT(i + 3);
+         LINE_ADJ(flags, idx[0], idx[1], idx[2], idx[3]);
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      if (count >= 4) {
+         flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE;
+         idx[1] = GET_ELT(0);
+         idx[2] = GET_ELT(1);
+         idx[3] = GET_ELT(2);
+
+         for (i = 1; i + 2 < count; i++, flags = 0) {
+            idx[0] = idx[1];
+            idx[1] = idx[2];
+            idx[2] = idx[3];
+            idx[3] = GET_ELT(i + 2);
+            LINE_ADJ(flags, idx[0], idx[1], idx[2], idx[3]);
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES_ADJACENCY:
+      flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
+      for (i = 0; i + 5 < count; i += 6) {
+         idx[0] = GET_ELT(i);
+         idx[1] = GET_ELT(i + 1);
+         idx[2] = GET_ELT(i + 2);
+         idx[3] = GET_ELT(i + 3);
+         idx[4] = GET_ELT(i + 4);
+         idx[5] = GET_ELT(i + 5);
+         TRIANGLE_ADJ(flags, idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]);
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      if (count >= 6) {
+         flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
+         idx[0] = GET_ELT(1);
+         idx[2] = GET_ELT(0);
+         idx[4] = GET_ELT(2);
+         idx[3] = GET_ELT(4);
+
+         /*
+          * The vertices of the i-th triangle are stored in
+          * idx[0,2,4] = { 2*i, 2*i+2, 2*i+4 };
+          *
+          * The adjacent vertices are stored in
+          * idx[1,3,5] = { 2*i-2, 2*i+6, 2*i+3 }.
+          *
+          * However, there are two exceptions:
+          *
+          * For the first triangle, idx[1] = 1;
+          * For the  last triangle, idx[3] = 2*i+5.
+          */
+         if (last_vertex_last) {
+            for (i = 0; i + 5 < count; i += 2) {
+               idx[1] = idx[0];
+
+               idx[0] = idx[2];
+               idx[2] = idx[4];
+               idx[4] = idx[3];
+
+               idx[3] = GET_ELT(i + ((i + 7 < count) ? 6 : 5));
+               idx[5] = GET_ELT(i + 3);
+
+               /*
+                * alternate the first two vertices (idx[0] and idx[2]) and the
+                * corresponding adjacent vertices (idx[3] and idx[5]) to have
+                * the correct orientation
+                */
+               if (i & 2) {
+                  TRIANGLE_ADJ(flags,
+                        idx[2], idx[1], idx[0], idx[5], idx[4], idx[3]);
+               }
+               else {
+                  TRIANGLE_ADJ(flags,
+                        idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]);
+               }
+            }
+         }
+         else {
+            for (i = 0; i + 5 < count; i += 2) {
+               idx[1] = idx[0];
+
+               idx[0] = idx[2];
+               idx[2] = idx[4];
+               idx[4] = idx[3];
+
+               idx[3] = GET_ELT(i + ((i + 7 < count) ? 6 : 5));
+               idx[5] = GET_ELT(i + 3);
+
+               /*
+                * alternate the last two vertices (idx[2] and idx[4]) and the
+                * corresponding adjacent vertices (idx[1] and idx[5]) to have
+                * the correct orientation
+                */
+               if (i & 2) {
+                  TRIANGLE_ADJ(flags,
+                        idx[0], idx[5], idx[4], idx[3], idx[2], idx[1]);
+               }
+               else {
+                  TRIANGLE_ADJ(flags,
+                        idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]);
+               }
+            }
+         }
+      }
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   FUNC_EXIT;
+}
+
+#undef LOCAL_VARS
+#undef FUNC_ENTER
+#undef FUNC_EXIT
+#undef LINE_ADJ
+#undef TRIANGLE_ADJ
+
+#undef FUNC
+#undef FUNC_VARS
+#undef GET_ELT
+#undef POINT
+#undef LINE
+#undef TRIANGLE
diff --git a/src/gallium/auxiliary/draw/draw_fs.c b/src/gallium/auxiliary/draw/draw_fs.c
new file mode 100644
index 00000000000..1543bd86f17
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_fs.c
@@ -0,0 +1,73 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_shader_tokens.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_prim.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "draw_fs.h"
+#include "draw_private.h"
+#include "draw_context.h"
+
+
+struct draw_fragment_shader *
+draw_create_fragment_shader(struct draw_context *draw,
+                            const struct pipe_shader_state *shader)
+{
+   struct draw_fragment_shader *dfs;
+
+   dfs = CALLOC_STRUCT(draw_fragment_shader);
+   if (dfs) {
+      dfs->base = *shader;
+      tgsi_scan_shader(shader->tokens, &dfs->info);
+   }
+
+   return dfs;
+}
+
+
+void
+draw_bind_fragment_shader(struct draw_context *draw,
+                          struct draw_fragment_shader *dfs)
+{
+   draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
+
+   draw->fs.fragment_shader = dfs;
+}
+
+
+void
+draw_delete_fragment_shader(struct draw_context *draw,
+                            struct draw_fragment_shader *dfs)
+{
+   FREE(dfs);
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_fs.h b/src/gallium/auxiliary/draw/draw_fs.h
new file mode 100644
index 00000000000..44995b8277f
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_fs.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef DRAW_FS_H
+#define DRAW_FS_H
+
+
+#include "tgsi/tgsi_scan.h"
+
+
+struct draw_fragment_shader
+{
+   struct pipe_shader_state base;
+   struct tgsi_shader_info info;
+};
+
+
+#endif /* DRAW_FS_H */
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 131deed43e4..50a03ac95a5 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMWare Inc.
+ * Copyright 2009 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -37,8 +37,8 @@
 
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 
-#define MAX_PRIM_VERTICES 6
 /* fixme: move it from here */
 #define MAX_PRIMITIVES 64
 
@@ -75,6 +75,10 @@ draw_gs_set_constants(struct draw_context *draw,
                       const void *constants,
                       unsigned size)
 {
+   /* noop. added here for symmetry with the VS
+    * code and in case we'll ever want to align
+    * the constants, e.g. when we'll change to a
+    * different interpreter */
 }
 
 
@@ -90,6 +94,7 @@ draw_create_geometry_shader(struct draw_context *draw,
    if (!gs)
       return NULL;
 
+   gs->draw = draw;
    gs->state = *state;
    gs->state.tokens = tgsi_dup_tokens(state->tokens);
    if (!gs->state.tokens) {
@@ -112,7 +117,7 @@ draw_create_geometry_shader(struct draw_context *draw,
                TGSI_PROPERTY_GS_OUTPUT_PRIM)
          gs->output_primitive = gs->info.properties[i].data[0];
       else if (gs->info.properties[i].name ==
-               TGSI_PROPERTY_GS_MAX_VERTICES)
+               TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES)
          gs->max_output_vertices = gs->info.properties[i].data[0];
    }
 
@@ -154,128 +159,37 @@ void draw_delete_geometry_shader(struct draw_context *draw,
    FREE(dgs);
 }
 
-static INLINE int num_vertices_for_prim(int prim)
-{
-   switch(prim) {
-   case PIPE_PRIM_POINTS:
-      return 1;
-   case PIPE_PRIM_LINES:
-      return 2;
-   case PIPE_PRIM_LINE_LOOP:
-      return 2;
-   case PIPE_PRIM_LINE_STRIP:
-      return 2;
-   case PIPE_PRIM_TRIANGLES:
-      return 3;
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      return 3;
-   case PIPE_PRIM_TRIANGLE_FAN:
-      return 3;
-   case PIPE_PRIM_LINES_ADJACENCY:
-   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
-      return 4;
-   case PIPE_PRIM_TRIANGLES_ADJACENCY:
-   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
-      return 6;
-   default:
-      assert(!"Bad geometry shader input");
-      return 0;
-   }
-}
-
-static void draw_fetch_geometry_input(struct draw_geometry_shader *shader,
-                                      int start_primitive,
-                                      int num_primitives,
-                                      const float (*input_ptr)[4],
-                                      unsigned input_vertex_stride,
-                                      unsigned inputs_from_vs)
-{
-   struct tgsi_exec_machine *machine = shader->machine;
-   unsigned slot, vs_slot, k, j;
-   unsigned num_vertices = num_vertices_for_prim(shader->input_primitive);
-   int idx = 0;
-
-   for (slot = 0, vs_slot = 0; slot < shader->info.num_inputs; slot++) {
-      /*debug_printf("Slot = %d (semantic = %d)\n", slot,
-        shader->info.input_semantic_name[slot]);*/
-      if (shader->info.input_semantic_name[slot] ==
-          TGSI_SEMANTIC_PRIMID) {
-         for (j = 0; j < num_primitives; ++j) {
-            machine->Inputs[idx].xyzw[0].f[j] = (float)start_primitive + j;
-            machine->Inputs[idx].xyzw[1].f[j] = (float)start_primitive + j;
-            machine->Inputs[idx].xyzw[2].f[j] = (float)start_primitive + j;
-            machine->Inputs[idx].xyzw[3].f[j] = (float)start_primitive + j;
-         }
-         ++idx;
-      } else {
-         for (j = 0; j < num_primitives; ++j) {
-            int vidx = idx;
-            const float (*prim_ptr)[4];
-            /*debug_printf("    %d) Prim (num_verts = %d)\n", start_primitive + j,
-              num_vertices);*/
-            prim_ptr = (const float (*)[4])(
-               (const char *)input_ptr +
-               (j * num_vertices * input_vertex_stride));
-
-            for (k = 0; k < num_vertices; ++k, ++vidx) {
-               const float (*input)[4];
-               input = (const float (*)[4])(
-                  (const char *)prim_ptr + (k * input_vertex_stride));
-               vidx = k * TGSI_EXEC_MAX_INPUT_ATTRIBS + slot;
-               /*debug_printf("\t%d)(%d) Input vert:\n", vidx, k);*/
-#if 1
-               assert(!util_is_inf_or_nan(input[vs_slot][0]));
-               assert(!util_is_inf_or_nan(input[vs_slot][1]));
-               assert(!util_is_inf_or_nan(input[vs_slot][2]));
-               assert(!util_is_inf_or_nan(input[vs_slot][3]));
-#endif
-               machine->Inputs[vidx].xyzw[0].f[j] = input[vs_slot][0];
-               machine->Inputs[vidx].xyzw[1].f[j] = input[vs_slot][1];
-               machine->Inputs[vidx].xyzw[2].f[j] = input[vs_slot][2];
-               machine->Inputs[vidx].xyzw[3].f[j] = input[vs_slot][3];
-#if 0
-               debug_printf("\t\t%d %f %f %f %f\n", slot,
-                            machine->Inputs[vidx].xyzw[0].f[j],
-                            machine->Inputs[vidx].xyzw[1].f[j],
-                            machine->Inputs[vidx].xyzw[2].f[j],
-                            machine->Inputs[vidx].xyzw[3].f[j]);
-#endif
-            }
-         }
-         ++vs_slot;
-         idx += num_vertices;
-      }
-   }
-}
-
+/*#define DEBUG_OUTPUTS 1*/
 static INLINE void
 draw_geometry_fetch_outputs(struct draw_geometry_shader *shader,
                             int num_primitives,
-                            float (*output)[4],
-                            unsigned vertex_size)
+                            float (**p_output)[4])
 {
    struct tgsi_exec_machine *machine = shader->machine;
    unsigned prim_idx, j, slot;
+   float (*output)[4];
+
+   output = *p_output;
 
    /* Unswizzle all output results.
     */
-   /* FIXME: handle all the primitives produced by the gs, not just
-    * the first one
-    unsigned prim_count =
-    mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];*/
+
    for (prim_idx = 0; prim_idx < num_primitives; ++prim_idx) {
-      unsigned num_verts_per_prim = machine->Primitives[0];
+      unsigned num_verts_per_prim = machine->Primitives[prim_idx];
+      shader->primitive_lengths[prim_idx +   shader->emitted_primitives] =
+         machine->Primitives[prim_idx];
+      shader->emitted_vertices += num_verts_per_prim;
       for (j = 0; j < num_verts_per_prim; j++) {
          int idx = (prim_idx * num_verts_per_prim + j) *
                    shader->info.num_outputs;
 #ifdef DEBUG_OUTPUTS
-         debug_printf("%d) Output vert:\n", idx);
+         debug_printf("%d) Output vert:\n", idx / shader->info.num_outputs);
 #endif
          for (slot = 0; slot < shader->info.num_outputs; slot++) {
-            output[slot][0] = machine->Outputs[idx + slot].xyzw[0].f[prim_idx];
-            output[slot][1] = machine->Outputs[idx + slot].xyzw[1].f[prim_idx];
-            output[slot][2] = machine->Outputs[idx + slot].xyzw[2].f[prim_idx];
-            output[slot][3] = machine->Outputs[idx + slot].xyzw[3].f[prim_idx];
+            output[slot][0] = machine->Outputs[idx + slot].xyzw[0].f[0];
+            output[slot][1] = machine->Outputs[idx + slot].xyzw[1].f[0];
+            output[slot][2] = machine->Outputs[idx + slot].xyzw[2].f[0];
+            output[slot][3] = machine->Outputs[idx + slot].xyzw[3].f[0];
 #ifdef DEBUG_OUTPUTS
             debug_printf("\t%d: %f %f %f %f\n", slot,
                          output[slot][0],
@@ -285,52 +199,276 @@ draw_geometry_fetch_outputs(struct draw_geometry_shader *shader,
 #endif
             debug_assert(!util_is_inf_or_nan(output[slot][0]));
          }
-         output = (float (*)[4])((char *)output + vertex_size);
+         output = (float (*)[4])((char *)output + shader->vertex_size);
       }
    }
+   *p_output = output;
+         shader->emitted_primitives += num_primitives;
 }
 
-void draw_geometry_shader_run(struct draw_geometry_shader *shader,
-                              const float (*input)[4],
-                              float (*output)[4],
-                              const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
-                              unsigned count,
-                              unsigned input_stride,
-                              unsigned vertex_size)
+/*#define DEBUG_INPUTS 1*/
+static void draw_fetch_gs_input(struct draw_geometry_shader *shader,
+                                unsigned *indices,
+                                unsigned num_vertices,
+                                unsigned prim_idx)
 {
    struct tgsi_exec_machine *machine = shader->machine;
-   unsigned int i;
-   unsigned num_vertices = num_vertices_for_prim(shader->input_primitive);
-   unsigned num_primitives = count/num_vertices;
-   unsigned inputs_from_vs = 0;
+   unsigned slot, vs_slot, i;
+   unsigned input_vertex_stride = shader->input_vertex_stride;
+   const float (*input_ptr)[4];
 
-   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
-      machine->Consts[i] = constants[i];
-   }
+   input_ptr = shader->input;
 
-   for (i = 0; i < shader->info.num_inputs; ++i) {
-      if (shader->info.input_semantic_name[i] != TGSI_SEMANTIC_PRIMID)
-         ++inputs_from_vs;
+   for (i = 0; i < num_vertices; ++i) {
+      const float (*input)[4];
+#if DEBUG_INPUTS
+      debug_printf("%d) vertex index = %d (prim idx = %d)\n",
+                   i, indices[i], prim_idx);
+#endif
+      input = (const float (*)[4])(
+         (const char *)input_ptr + (indices[i] * input_vertex_stride));
+      for (slot = 0, vs_slot = 0; slot < shader->info.num_inputs; ++slot) {
+         unsigned idx = i * TGSI_EXEC_MAX_INPUT_ATTRIBS + slot;
+         if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_PRIMID) {
+            machine->Inputs[idx].xyzw[0].f[prim_idx] =
+               (float)shader->in_prim_idx;
+            machine->Inputs[idx].xyzw[1].f[prim_idx] =
+               (float)shader->in_prim_idx;
+            machine->Inputs[idx].xyzw[2].f[prim_idx] =
+               (float)shader->in_prim_idx;
+            machine->Inputs[idx].xyzw[3].f[prim_idx] =
+               (float)shader->in_prim_idx;
+         } else {
+#if DEBUG_INPUTS
+            debug_printf("\tSlot = %d, vs_slot = %d, idx = %d:\n",
+                         slot, vs_slot, idx);
+#endif
+#if 1
+            assert(!util_is_inf_or_nan(input[vs_slot][0]));
+            assert(!util_is_inf_or_nan(input[vs_slot][1]));
+            assert(!util_is_inf_or_nan(input[vs_slot][2]));
+            assert(!util_is_inf_or_nan(input[vs_slot][3]));
+#endif
+            machine->Inputs[idx].xyzw[0].f[prim_idx] = input[vs_slot][0];
+            machine->Inputs[idx].xyzw[1].f[prim_idx] = input[vs_slot][1];
+            machine->Inputs[idx].xyzw[2].f[prim_idx] = input[vs_slot][2];
+            machine->Inputs[idx].xyzw[3].f[prim_idx] = input[vs_slot][3];
+#if DEBUG_INPUTS
+            debug_printf("\t\t%f %f %f %f\n",
+                         machine->Inputs[idx].xyzw[0].f[prim_idx],
+                         machine->Inputs[idx].xyzw[1].f[prim_idx],
+                         machine->Inputs[idx].xyzw[2].f[prim_idx],
+                         machine->Inputs[idx].xyzw[3].f[prim_idx]);
+#endif
+            ++vs_slot;
+         }
+      }
    }
+}
 
-   for (i = 0; i < num_primitives; ++i) {
-      unsigned int max_primitives = 1;
+static void gs_flush(struct draw_geometry_shader *shader,
+                     unsigned input_primitives)
+{
+   unsigned out_prim_count;
+   struct tgsi_exec_machine *machine = shader->machine;
+
+   debug_assert(input_primitives > 0 &&
+                input_primitives < 4);
 
-      draw_fetch_geometry_input(shader, i, max_primitives, input,
-                                input_stride, inputs_from_vs);
+   tgsi_set_exec_mask(machine,
+                      1,
+                      input_primitives > 1,
+                      input_primitives > 2,
+                      input_primitives > 3);
 
-      tgsi_set_exec_mask(machine,
-                         1,
-                         max_primitives > 1,
-                         max_primitives > 2,
-                         max_primitives > 3);
+   /* run interpreter */
+   tgsi_exec_machine_run(machine);
 
-      /* run interpreter */
-      tgsi_exec_machine_run(machine);
+   out_prim_count =
+      machine->Temps[TGSI_EXEC_TEMP_PRIMITIVE_I].xyzw[TGSI_EXEC_TEMP_PRIMITIVE_C].u[0];
+
+#if 0
+   debug_printf("PRIM emitted prims = %d (verts=%d), cur prim count = %d\n",
+                shader->emitted_primitives, shader->emitted_vertices,
+                out_prim_count);
+#endif
+   draw_geometry_fetch_outputs(shader, out_prim_count,
+                               &shader->tmp_output);
+}
+
+static void gs_point(struct draw_geometry_shader *shader,
+                     int idx)
+{
+   unsigned indices[1];
+
+   indices[0] = idx;
+
+   draw_fetch_gs_input(shader, indices, 1, 0);
+   ++shader->in_prim_idx;
+
+   gs_flush(shader, 1);
+}
+
+static void gs_line(struct draw_geometry_shader *shader,
+                    int i0, int i1)
+{
+   unsigned indices[2];
 
-      draw_geometry_fetch_outputs(shader, max_primitives,
-                                  output, vertex_size);
+   indices[0] = i0;
+   indices[1] = i1;
+
+   draw_fetch_gs_input(shader, indices, 2, 0);
+   ++shader->in_prim_idx;
+
+   gs_flush(shader, 1);
+}
+
+static void gs_line_adj(struct draw_geometry_shader *shader,
+                        int i0, int i1, int i2, int i3)
+{
+   unsigned indices[4];
+
+   indices[0] = i0;
+   indices[1] = i1;
+   indices[2] = i2;
+   indices[3] = i3;
+
+   draw_fetch_gs_input(shader, indices, 4, 0);
+   ++shader->in_prim_idx;
+
+   gs_flush(shader, 1);
+}
+
+static void gs_tri(struct draw_geometry_shader *shader,
+                   int i0, int i1, int i2)
+{
+   unsigned indices[3];
+
+   indices[0] = i0;
+   indices[1] = i1;
+   indices[2] = i2;
+
+   draw_fetch_gs_input(shader, indices, 3, 0);
+   ++shader->in_prim_idx;
+
+   gs_flush(shader, 1);
+}
+
+static void gs_tri_adj(struct draw_geometry_shader *shader,
+                       int i0, int i1, int i2,
+                       int i3, int i4, int i5)
+{
+   unsigned indices[6];
+
+   indices[0] = i0;
+   indices[1] = i1;
+   indices[2] = i2;
+   indices[3] = i3;
+   indices[4] = i4;
+   indices[5] = i5;
+
+   draw_fetch_gs_input(shader, indices, 6, 0);
+   ++shader->in_prim_idx;
+
+   gs_flush(shader, 1);
+}
+
+#define FUNC         gs_run
+#define GET_ELT(idx) (idx)
+#include "draw_gs_tmp.h"
+
+
+#define FUNC         gs_run_elts
+#define LOCAL_VARS   const ushort *elts = input_prims->elts;
+#define GET_ELT(idx) (elts[idx])
+#include "draw_gs_tmp.h"
+
+
+/**
+ * Execute geometry shader using TGSI interpreter.
+ */
+int draw_geometry_shader_run(struct draw_geometry_shader *shader,
+                             const void *constants[PIPE_MAX_CONSTANT_BUFFERS], 
+                             const unsigned constants_size[PIPE_MAX_CONSTANT_BUFFERS], 
+                             const struct draw_vertex_info *input_verts,
+                             const struct draw_prim_info *input_prim,
+                             struct draw_vertex_info *output_verts,
+                             struct draw_prim_info *output_prims )
+{
+   const float (*input)[4] = (const float (*)[4])input_verts->verts->data;
+   unsigned input_stride = input_verts->vertex_size;
+   unsigned vertex_size = input_verts->vertex_size;
+   struct tgsi_exec_machine *machine = shader->machine;
+   unsigned num_input_verts = input_prim->linear ?
+                              input_verts->count :
+                              input_prim->count;
+   unsigned num_in_primitives =
+      MAX2(u_gs_prims_for_vertices(input_prim->prim, num_input_verts),
+           u_gs_prims_for_vertices(shader->input_primitive, num_input_verts));
+   unsigned max_out_prims = u_gs_prims_for_vertices(shader->output_primitive,
+                                                    shader->max_output_vertices)
+                            * num_in_primitives;
+
+   output_verts->vertex_size = input_verts->vertex_size;
+   output_verts->stride = input_verts->vertex_size;
+   output_verts->verts =
+      (struct vertex_header *)MALLOC(input_verts->vertex_size *
+                                     num_in_primitives *
+                                     shader->max_output_vertices);
+
+
+#if 0
+   debug_printf("%s count = %d (in prims # = %d)\n",
+                __FUNCTION__, num_input_verts, num_in_primitives);
+   debug_printf("\tlinear = %d, prim_info->count = %d\n",
+                input_prim->linear, input_prim->count);
+   debug_printf("\tprimt pipe = %d, shader in = %d, shader out = %d, max out = %d\n",
+                input_prim->prim, shader->input_primitive,
+                shader->output_primitive,
+                shader->max_output_vertices);
+#endif
+
+   shader->emitted_vertices = 0;
+   shader->emitted_primitives = 0;
+   shader->vertex_size = vertex_size;
+   shader->tmp_output = (float (*)[4])output_verts->verts->data;
+   shader->in_prim_idx = 0;
+   shader->input_vertex_stride = input_stride;
+   shader->input = input;
+   if (shader->primitive_lengths) {
+      FREE(shader->primitive_lengths);
    }
+   shader->primitive_lengths = MALLOC(max_out_prims * sizeof(unsigned));
+
+   tgsi_exec_set_constant_buffers(machine, PIPE_MAX_CONSTANT_BUFFERS,
+                                  constants, constants_size);
+
+   if (input_prim->linear)
+      gs_run(shader, input_prim, input_verts,
+             output_prims, output_verts);
+   else
+      gs_run_elts(shader, input_prim, input_verts,
+                  output_prims, output_verts);
+
+   /* Update prim_info:
+    */
+   output_prims->linear = TRUE;
+   output_prims->elts = NULL;
+   output_prims->start = 0;
+   output_prims->count = shader->emitted_vertices;
+   output_prims->prim = shader->output_primitive;
+   output_prims->flags = 0x0;
+   output_prims->primitive_lengths = shader->primitive_lengths;
+   output_prims->primitive_count = shader->emitted_primitives;
+   output_verts->count = shader->emitted_vertices;
+
+#if 0
+   debug_printf("GS finished, prims = %d, verts = %d\n",
+                output_prims->primitive_count,
+                output_verts->count);
+#endif
+
+   return shader->emitted_vertices;
 }
 
 void draw_geometry_shader_delete(struct draw_geometry_shader *shader)
diff --git a/src/gallium/auxiliary/draw/draw_gs.h b/src/gallium/auxiliary/draw/draw_gs.h
index d8eb2103433..67bc1aa73ff 100644
--- a/src/gallium/auxiliary/draw/draw_gs.h
+++ b/src/gallium/auxiliary/draw/draw_gs.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2009 VMWare Inc.
+ * Copyright 2009 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -54,23 +54,37 @@ struct draw_geometry_shader {
    unsigned input_primitive;
    unsigned output_primitive;
 
-   /* Extracted from shader:
-    */
-   const float (*immediates)[4];
+   unsigned *primitive_lengths;
+   unsigned emitted_vertices;
+   unsigned emitted_primitives;
+
+   float (*tmp_output)[4];
+   unsigned vertex_size;
+
+   unsigned in_prim_idx;
+   unsigned input_vertex_stride;
+   const float (*input)[4];
 };
 
-void draw_geometry_shader_run(struct draw_geometry_shader *shader,
-                              const float (*input)[4],
-                              float (*output)[4],
-                              const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
-                              unsigned count,
-                              unsigned input_stride,
-                              unsigned output_stride);
+/*
+ * Returns the number of vertices emitted.
+ * The geometry shader can emit any number of vertices as long as it's
+ * smaller than the GS_MAX_OUTPUT_VERTICES shader property.
+ */
+int draw_geometry_shader_run(struct draw_geometry_shader *shader,
+                             const void *constants[PIPE_MAX_CONSTANT_BUFFERS], 
+                             const unsigned constants_size[PIPE_MAX_CONSTANT_BUFFERS], 
+                             const struct draw_vertex_info *input_verts,
+                             const struct draw_prim_info *input_prim,
+                             struct draw_vertex_info *output_verts,
+                             struct draw_prim_info *output_prims );
 
 void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
                                   struct draw_context *draw);
 
 void draw_geometry_shader_delete(struct draw_geometry_shader *shader);
 
+int draw_gs_max_output_vertices(struct draw_geometry_shader *shader,
+                                unsigned pipe_prim);
 
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_gs_tmp.h b/src/gallium/auxiliary/draw/draw_gs_tmp.h
new file mode 100644
index 00000000000..de7b02655a5
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_gs_tmp.h
@@ -0,0 +1,32 @@
+#define FUNC_VARS struct draw_geometry_shader *gs,             \
+                  const struct draw_prim_info *input_prims,    \
+                  const struct draw_vertex_info *input_verts,  \
+                  struct draw_prim_info *output_prims,         \
+                  struct draw_vertex_info *output_verts
+
+#define FUNC_ENTER                                                \
+   /* declare more local vars */                                  \
+   const unsigned prim = input_prims->prim;                       \
+   const unsigned prim_flags = input_prims->flags;                \
+   const unsigned count = input_prims->count;                     \
+   const boolean last_vertex_last = TRUE;                         \
+   do {                                                           \
+      debug_assert(input_prims->primitive_count == 1);            \
+      switch (prim) {                                             \
+      case PIPE_PRIM_QUADS:                                       \
+      case PIPE_PRIM_QUAD_STRIP:                                  \
+      case PIPE_PRIM_POLYGON:                                     \
+         debug_assert(!"unexpected primitive type in GS");        \
+         return;                                                  \
+      default:                                                    \
+         break;                                                   \
+      }                                                           \
+   } while (0)                                                    \
+
+#define POINT(i0)                             gs_point(gs,i0)
+#define LINE(flags,i0,i1)                     gs_line(gs,i0,i1)
+#define TRIANGLE(flags,i0,i1,i2)              gs_tri(gs,i0,i1,i2)
+#define LINE_ADJ(flags,i0,i1,i2,i3)           gs_line_adj(gs,i0,i1,i2,i3)
+#define TRIANGLE_ADJ(flags,i0,i1,i2,i3,i4,i5) gs_tri_adj(gs,i0,i1,i2,i3,i4,i5)
+
+#include "draw_decompose_tmp.h"
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index 27383221b9b..7fb86d7cb27 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -1,3 +1,30 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #include "draw_llvm.h"
 
 #include "draw_context.h"
@@ -10,17 +37,21 @@
 #include "gallivm/lp_bld_debug.h"
 #include "gallivm/lp_bld_tgsi.h"
 #include "gallivm/lp_bld_printf.h"
+#include "gallivm/lp_bld_intr.h"
+#include "gallivm/lp_bld_init.h"
 
 #include "tgsi/tgsi_exec.h"
+#include "tgsi/tgsi_dump.h"
 
 #include "util/u_cpu_detect.h"
+#include "util/u_math.h"
+#include "util/u_pointer.h"
 #include "util/u_string.h"
 
 #include <llvm-c/Transforms/Scalar.h>
 
 #define DEBUG_STORE 0
 
-
 /* generates the draw jit function */
 static void
 draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var);
@@ -34,12 +65,24 @@ init_globals(struct draw_llvm *llvm)
 
    /* struct draw_jit_texture */
    {
-      LLVMTypeRef elem_types[4];
+      LLVMTypeRef elem_types[DRAW_JIT_TEXTURE_NUM_FIELDS];
 
       elem_types[DRAW_JIT_TEXTURE_WIDTH]  = LLVMInt32Type();
       elem_types[DRAW_JIT_TEXTURE_HEIGHT] = LLVMInt32Type();
-      elem_types[DRAW_JIT_TEXTURE_STRIDE] = LLVMInt32Type();
-      elem_types[DRAW_JIT_TEXTURE_DATA]   = LLVMPointerType(LLVMInt8Type(), 0);
+      elem_types[DRAW_JIT_TEXTURE_DEPTH] = LLVMInt32Type();
+      elem_types[DRAW_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32Type();
+      elem_types[DRAW_JIT_TEXTURE_ROW_STRIDE] =
+         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+      elem_types[DRAW_JIT_TEXTURE_IMG_STRIDE] =
+         LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS);
+      elem_types[DRAW_JIT_TEXTURE_DATA] =
+         LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0),
+                       DRAW_MAX_TEXTURE_LEVELS);
+      elem_types[DRAW_JIT_TEXTURE_MIN_LOD] = LLVMFloatType();
+      elem_types[DRAW_JIT_TEXTURE_MAX_LOD] = LLVMFloatType();
+      elem_types[DRAW_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType();
+      elem_types[DRAW_JIT_TEXTURE_BORDER_COLOR] = 
+         LLVMArrayType(LLVMFloatType(), 4);
 
       texture_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
@@ -49,12 +92,33 @@ init_globals(struct draw_llvm *llvm)
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, height,
                              llvm->target, texture_type,
                              DRAW_JIT_TEXTURE_HEIGHT);
-      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, stride,
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, depth,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_DEPTH);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, last_level,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_LAST_LEVEL);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, row_stride,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_ROW_STRIDE);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, img_stride,
                              llvm->target, texture_type,
-                             DRAW_JIT_TEXTURE_STRIDE);
+                             DRAW_JIT_TEXTURE_IMG_STRIDE);
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, data,
                              llvm->target, texture_type,
                              DRAW_JIT_TEXTURE_DATA);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, min_lod,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_MIN_LOD);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, max_lod,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_MAX_LOD);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, lod_bias,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_LOD_BIAS);
+      LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, border_color,
+                             llvm->target, texture_type,
+                             DRAW_JIT_TEXTURE_BORDER_COLOR);
       LP_CHECK_STRUCT_SIZE(struct draw_jit_texture,
                            llvm->target, texture_type);
 
@@ -69,7 +133,8 @@ init_globals(struct draw_llvm *llvm)
 
       elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
       elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */
-      elem_types[2] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */
+      elem_types[2] = LLVMArrayType(texture_type,
+                                    PIPE_MAX_VERTEX_SAMPLERS); /* textures */
 
       context_type = LLVMStructType(elem_types, Elements(elem_types), 0);
 
@@ -79,7 +144,7 @@ init_globals(struct draw_llvm *llvm)
                              llvm->target, context_type, 1);
       LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, textures,
                              llvm->target, context_type,
-                             DRAW_JIT_CONTEXT_TEXTURES_INDEX);
+                             DRAW_JIT_CTX_TEXTURES);
       LP_CHECK_STRUCT_SIZE(struct draw_jit_context,
                            llvm->target, context_type);
 
@@ -161,9 +226,11 @@ create_vertex_header(struct draw_llvm *llvm, int data_elems)
 struct draw_llvm *
 draw_llvm_create(struct draw_context *draw)
 {
-   struct draw_llvm *llvm = CALLOC_STRUCT( draw_llvm );
+   struct draw_llvm *llvm;
 
-   util_cpu_detect();
+   llvm = CALLOC_STRUCT( draw_llvm );
+   if (!llvm)
+      return NULL;
 
    llvm->draw = draw;
    llvm->engine = draw->engine;
@@ -179,27 +246,50 @@ draw_llvm_create(struct draw_context *draw)
 
    llvm->pass = LLVMCreateFunctionPassManager(llvm->provider);
    LLVMAddTargetData(llvm->target, llvm->pass);
-   /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
-    * but there are more on SVN. */
-   /* TODO: Add more passes */
-   LLVMAddConstantPropagationPass(llvm->pass);
-   if(util_cpu_caps.has_sse4_1) {
-      /* FIXME: There is a bug in this pass, whereby the combination of fptosi
-       * and sitofp (necessary for trunc/floor/ceil/round implementation)
-       * somehow becomes invalid code.
+
+   if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) {
+      /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+       * but there are more on SVN. */
+      /* TODO: Add more passes */
+
+      LLVMAddCFGSimplificationPass(llvm->pass);
+
+      if (HAVE_LLVM >= 0x207 && sizeof(void*) == 4) {
+         /* For LLVM >= 2.7 and 32-bit build, use this order of passes to
+          * avoid generating bad code.
+          * Test with piglit glsl-vs-sqrt-zero test.
+          */
+         LLVMAddConstantPropagationPass(llvm->pass);
+         LLVMAddPromoteMemoryToRegisterPass(llvm->pass);
+      }
+      else {
+         LLVMAddPromoteMemoryToRegisterPass(llvm->pass);
+         LLVMAddConstantPropagationPass(llvm->pass);
+      }
+
+      if(util_cpu_caps.has_sse4_1) {
+         /* FIXME: There is a bug in this pass, whereby the combination of fptosi
+          * and sitofp (necessary for trunc/floor/ceil/round implementation)
+          * somehow becomes invalid code.
+          */
+         LLVMAddInstructionCombiningPass(llvm->pass);
+      }
+      LLVMAddGVNPass(llvm->pass);
+   } else {
+      /* We need at least this pass to prevent the backends from failing in
+       * unexpected ways.
        */
-      LLVMAddInstructionCombiningPass(llvm->pass);
+      LLVMAddPromoteMemoryToRegisterPass(llvm->pass);
    }
-   LLVMAddPromoteMemoryToRegisterPass(llvm->pass);
-   LLVMAddGVNPass(llvm->pass);
-   LLVMAddCFGSimplificationPass(llvm->pass);
 
    init_globals(llvm);
 
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      LLVMDumpModule(llvm->module);
+   }
 
-#if 0
-   LLVMDumpModule(llvm->module);
-#endif
+   llvm->nr_variants = 0;
+   make_empty_list(&llvm->vs_variants_list);
 
    return llvm;
 }
@@ -207,21 +297,41 @@ draw_llvm_create(struct draw_context *draw)
 void
 draw_llvm_destroy(struct draw_llvm *llvm)
 {
+   LLVMDisposePassManager(llvm->pass);
+
    FREE(llvm);
 }
 
 struct draw_llvm_variant *
-draw_llvm_prepare(struct draw_llvm *llvm, int num_inputs)
+draw_llvm_create_variant(struct draw_llvm *llvm,
+			 unsigned num_inputs,
+			 const struct draw_llvm_variant_key *key)
 {
-   struct draw_llvm_variant *variant = MALLOC(sizeof(struct draw_llvm_variant));
+   struct draw_llvm_variant *variant;
+   struct llvm_vertex_shader *shader =
+      llvm_vertex_shader(llvm->draw->vs.vertex_shader);
 
-   draw_llvm_make_variant_key(llvm, &variant->key);
+   variant = MALLOC(sizeof *variant +
+		    shader->variant_key_size -
+		    sizeof variant->key);
+   if (variant == NULL)
+      return NULL;
+
+   variant->llvm = llvm;
+
+   memcpy(&variant->key, key, shader->variant_key_size);
 
    llvm->vertex_header_ptr_type = create_vertex_header(llvm, num_inputs);
 
    draw_llvm_generate(llvm, variant);
    draw_llvm_generate_elts(llvm, variant);
 
+   variant->shader = shader;
+   variant->list_item_global.base = variant;
+   variant->list_item_local.base = variant;
+   /*variant->no = */shader->variants_created++;
+   variant->list_item_global.base = variant;
+
    return variant;
 }
 
@@ -230,11 +340,13 @@ generate_vs(struct draw_llvm *llvm,
             LLVMBuilderRef builder,
             LLVMValueRef (*outputs)[NUM_CHANNELS],
             const LLVMValueRef (*inputs)[NUM_CHANNELS],
-            LLVMValueRef context_ptr)
+            LLVMValueRef context_ptr,
+            struct lp_build_sampler_soa *draw_sampler)
 {
    const struct tgsi_token *tokens = llvm->draw->vs.vertex_shader->state.tokens;
    struct lp_type vs_type;
    LLVMValueRef consts_ptr = draw_jit_context_vs_constants(builder, context_ptr);
+   struct lp_build_sampler_soa *sampler = 0;
 
    memset(&vs_type, 0, sizeof vs_type);
    vs_type.floating = TRUE; /* floating point values */
@@ -246,7 +358,14 @@ generate_vs(struct draw_llvm *llvm,
    num_vs = 4;              /* number of vertices per block */
 #endif
 
-   /*tgsi_dump(tokens, 0);*/
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      tgsi_dump(tokens, 0);
+   }
+
+   if (llvm->draw->num_sampler_views &&
+       llvm->draw->num_samplers)
+      sampler = draw_sampler;
+
    lp_build_tgsi_soa(builder,
                      tokens,
                      vs_type,
@@ -255,7 +374,7 @@ generate_vs(struct draw_llvm *llvm,
                      NULL /*pos*/,
                      inputs,
                      outputs,
-                     NULL/*sampler*/,
+                     sampler,
                      &llvm->draw->vs.vertex_shader->info);
 }
 
@@ -283,7 +402,8 @@ generate_fetch(LLVMBuilderRef builder,
                LLVMValueRef *res,
                struct pipe_vertex_element *velem,
                LLVMValueRef vbuf,
-               LLVMValueRef index)
+               LLVMValueRef index,
+               LLVMValueRef instance_id)
 {
    LLVMValueRef indices = LLVMConstInt(LLVMInt64Type(), velem->vertex_buffer_index, 0);
    LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr,
@@ -294,8 +414,15 @@ generate_fetch(LLVMBuilderRef builder,
    LLVMValueRef cond;
    LLVMValueRef stride;
 
-   cond = LLVMBuildICmp(builder, LLVMIntULE, index, vb_max_index, "");
+   if (velem->instance_divisor) {
+      /* array index = instance_id / instance_divisor */
+      index = LLVMBuildUDiv(builder, instance_id,
+                            LLVMConstInt(LLVMInt32Type(), velem->instance_divisor, 0),
+                            "instance_divisor");
+   }
 
+   /* limit index to min(index, vb_max_index) */
+   cond = LLVMBuildICmp(builder, LLVMIntULE, index, vb_max_index, "");
    index = LLVMBuildSelect(builder, cond, index, vb_max_index, "");
 
    stride = LLVMBuildMul(builder, vb_stride, index, "");
@@ -563,20 +690,22 @@ convert_to_aos(LLVMBuilderRef builder,
 static void
 draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 {
-   LLVMTypeRef arg_types[7];
+   LLVMTypeRef arg_types[8];
    LLVMTypeRef func_type;
    LLVMValueRef context_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef start, end, count, stride, step, io_itr;
    LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
+   LLVMValueRef instance_id;
    struct draw_context *draw = llvm->draw;
    unsigned i, j;
    struct lp_build_context bld;
    struct lp_build_loop_state lp_loop;
-   struct lp_type vs_type = lp_type_float_vec(32);
    const int max_vertices = 4;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
+   void *code;
+   struct lp_build_sampler_soa *sampler = 0;
 
    arg_types[0] = llvm->context_ptr_type;           /* context */
    arg_types[1] = llvm->vertex_header_ptr_type;     /* vertex_header */
@@ -585,6 +714,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    arg_types[4] = LLVMInt32Type();                  /* count */
    arg_types[5] = LLVMInt32Type();                  /* stride */
    arg_types[6] = llvm->vb_ptr_type;                /* pipe_vertex_buffer's */
+   arg_types[7] = LLVMInt32Type();                  /* instance_id */
 
    func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
 
@@ -601,6 +731,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    count        = LLVMGetParam(variant->function, 4);
    stride       = LLVMGetParam(variant->function, 5);
    vb_ptr       = LLVMGetParam(variant->function, 6);
+   instance_id  = LLVMGetParam(variant->function, 7);
 
    lp_build_name(context_ptr, "context");
    lp_build_name(io_ptr, "io");
@@ -609,6 +740,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    lp_build_name(count, "count");
    lp_build_name(stride, "stride");
    lp_build_name(vb_ptr, "vb");
+   lp_build_name(instance_id, "instance_id");
 
    /*
     * Function body
@@ -618,12 +750,17 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
 
-   lp_build_context_init(&bld, builder, vs_type);
+   lp_build_context_init(&bld, builder, lp_type_int(32));
 
    end = lp_build_add(&bld, start, count);
 
    step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
 
+   /* code generated texture sampling */
+   sampler = draw_llvm_sampler_soa_create(
+      draw_llvm_variant_key_samplers(&variant->key),
+      context_ptr);
+
 #if DEBUG_STORE
    lp_build_printf(builder, "start = %d, end = %d, step = %d\n",
                    start, end, step);
@@ -654,7 +791,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
             LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr,
                                            &vb_index, 1, "");
             generate_fetch(builder, vbuffers_ptr,
-                           &aos_attribs[j][i], velem, vb, true_index);
+                           &aos_attribs[j][i], velem, vb, true_index,
+                           instance_id);
          }
       }
       convert_to_soa(builder, aos_attribs, inputs,
@@ -665,7 +803,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
                   builder,
                   outputs,
                   ptr_aos,
-                  context_ptr);
+                  context_ptr,
+                  sampler);
 
       convert_to_aos(builder, io, outputs,
                      draw->vs.vertex_shader->info.num_outputs,
@@ -673,6 +812,13 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    }
    lp_build_loop_end_cond(builder, end, step, LLVMIntUGE, &lp_loop);
 
+   sampler->destroy(sampler);
+
+#ifdef PIPE_ARCH_X86
+   /* Avoid corrupting the FPU stack on 32bit OSes. */
+   lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0);
+#endif
+
    LLVMBuildRetVoid(builder);
 
    LLVMDisposeBuilder(builder);
@@ -682,41 +828,48 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
     */
 #ifdef DEBUG
    if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
-      LLVMDumpValue(variant->function);
+      lp_debug_dump_value(variant->function);
       assert(0);
    }
 #endif
 
    LLVMRunFunctionPassManager(llvm->pass, variant->function);
 
-   if (0) {
-      LLVMDumpValue(variant->function);
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      lp_debug_dump_value(variant->function);
       debug_printf("\n");
    }
-   variant->jit_func = (draw_jit_vert_func)LLVMGetPointerToGlobal(llvm->draw->engine, variant->function);
 
-   if (0)
-      lp_disassemble(variant->jit_func);
+   code = LLVMGetPointerToGlobal(llvm->draw->engine, variant->function);
+   variant->jit_func = (draw_jit_vert_func)pointer_to_func(code);
+
+   if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+      lp_disassemble(code);
+   }
+   lp_func_delete_body(variant->function);
 }
 
 
 static void
 draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
 {
-   LLVMTypeRef arg_types[7];
+   LLVMTypeRef arg_types[8];
    LLVMTypeRef func_type;
    LLVMValueRef context_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
    LLVMValueRef fetch_elts, fetch_count, stride, step, io_itr;
    LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr;
+   LLVMValueRef instance_id;
    struct draw_context *draw = llvm->draw;
    unsigned i, j;
    struct lp_build_context bld;
    struct lp_build_loop_state lp_loop;
-   struct lp_type vs_type = lp_type_float_vec(32);
    const int max_vertices = 4;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
+   LLVMValueRef fetch_max;
+   void *code;
+   struct lp_build_sampler_soa *sampler = 0;
 
    arg_types[0] = llvm->context_ptr_type;               /* context */
    arg_types[1] = llvm->vertex_header_ptr_type;         /* vertex_header */
@@ -725,14 +878,17 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    arg_types[4] = LLVMInt32Type();                      /* fetch_count */
    arg_types[5] = LLVMInt32Type();                      /* stride */
    arg_types[6] = llvm->vb_ptr_type;                    /* pipe_vertex_buffer's */
+   arg_types[7] = LLVMInt32Type();                      /* instance_id */
 
    func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
 
-   variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", func_type);
+   variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts",
+                                            func_type);
    LLVMSetFunctionCallConv(variant->function_elts, LLVMCCallConv);
    for(i = 0; i < Elements(arg_types); ++i)
       if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
-         LLVMAddAttribute(LLVMGetParam(variant->function_elts, i), LLVMNoAliasAttribute);
+         LLVMAddAttribute(LLVMGetParam(variant->function_elts, i),
+                          LLVMNoAliasAttribute);
 
    context_ptr  = LLVMGetParam(variant->function_elts, 0);
    io_ptr       = LLVMGetParam(variant->function_elts, 1);
@@ -741,6 +897,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    fetch_count  = LLVMGetParam(variant->function_elts, 4);
    stride       = LLVMGetParam(variant->function_elts, 5);
    vb_ptr       = LLVMGetParam(variant->function_elts, 6);
+   instance_id  = LLVMGetParam(variant->function_elts, 7);
 
    lp_build_name(context_ptr, "context");
    lp_build_name(io_ptr, "io");
@@ -749,6 +906,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    lp_build_name(fetch_count, "fetch_count");
    lp_build_name(stride, "stride");
    lp_build_name(vb_ptr, "vb");
+   lp_build_name(instance_id, "instance_id");
 
    /*
     * Function body
@@ -758,10 +916,19 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
 
-   lp_build_context_init(&bld, builder, vs_type);
+   lp_build_context_init(&bld, builder, lp_type_int(32));
 
    step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
 
+   /* code generated texture sampling */
+   sampler = draw_llvm_sampler_soa_create(
+      draw_llvm_variant_key_samplers(&variant->key),
+      context_ptr);
+
+   fetch_max = LLVMBuildSub(builder, fetch_count,
+                            LLVMConstInt(LLVMInt32Type(), 1, 0),
+                            "fetch_max");
+
    lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), &lp_loop);
    {
       LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
@@ -780,8 +947,15 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
             builder,
             lp_loop.counter,
             LLVMConstInt(LLVMInt32Type(), i, 0), "");
-         LLVMValueRef fetch_ptr = LLVMBuildGEP(builder, fetch_elts,
-                                               &true_index, 1, "");
+         LLVMValueRef fetch_ptr;
+
+         /* make sure we're not out of bounds which can happen
+          * if fetch_count % 4 != 0, because on the last iteration
+          * a few of the 4 vertex fetches will be out of bounds */
+         true_index = lp_build_min(&bld, true_index, fetch_max);
+
+         fetch_ptr = LLVMBuildGEP(builder, fetch_elts,
+                                  &true_index, 1, "");
          true_index = LLVMBuildLoad(builder, fetch_ptr, "fetch_elt");
          for (j = 0; j < draw->pt.nr_vertex_elements; ++j) {
             struct pipe_vertex_element *velem = &draw->pt.vertex_element[j];
@@ -791,7 +965,8 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
             LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr,
                                            &vb_index, 1, "");
             generate_fetch(builder, vbuffers_ptr,
-                           &aos_attribs[j][i], velem, vb, true_index);
+                           &aos_attribs[j][i], velem, vb, true_index,
+                           instance_id);
          }
       }
       convert_to_soa(builder, aos_attribs, inputs,
@@ -802,7 +977,8 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
                   builder,
                   outputs,
                   ptr_aos,
-                  context_ptr);
+                  context_ptr,
+                  sampler);
 
       convert_to_aos(builder, io, outputs,
                      draw->vs.vertex_shader->info.num_outputs,
@@ -810,6 +986,13 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    }
    lp_build_loop_end_cond(builder, fetch_count, step, LLVMIntUGE, &lp_loop);
 
+   sampler->destroy(sampler);
+
+#ifdef PIPE_ARCH_X86
+   /* Avoid corrupting the FPU stack on 32bit OSes. */
+   lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0);
+#endif
+
    LLVMBuildRetVoid(builder);
 
    LLVMDisposeBuilder(builder);
@@ -819,37 +1002,136 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
     */
 #ifdef DEBUG
    if(LLVMVerifyFunction(variant->function_elts, LLVMPrintMessageAction)) {
-      LLVMDumpValue(variant->function_elts);
+      lp_debug_dump_value(variant->function_elts);
       assert(0);
    }
 #endif
 
    LLVMRunFunctionPassManager(llvm->pass, variant->function_elts);
 
-   if (0) {
-      LLVMDumpValue(variant->function_elts);
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      lp_debug_dump_value(variant->function_elts);
       debug_printf("\n");
    }
-   variant->jit_func_elts = (draw_jit_vert_func_elts)LLVMGetPointerToGlobal(
-      llvm->draw->engine, variant->function_elts);
 
-   if (0)
-      lp_disassemble(variant->jit_func_elts);
+   code = LLVMGetPointerToGlobal(llvm->draw->engine, variant->function_elts);
+   variant->jit_func_elts = (draw_jit_vert_func_elts)pointer_to_func(code);
+
+   if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+      lp_disassemble(code);
+   }
+   lp_func_delete_body(variant->function_elts);
 }
 
-void
-draw_llvm_make_variant_key(struct draw_llvm *llvm,
-                           struct draw_llvm_variant_key *key)
+
+struct draw_llvm_variant_key *
+draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
 {
-   memset(key, 0, sizeof(struct draw_llvm_variant_key));
+   unsigned i;
+   struct draw_llvm_variant_key *key;
+   struct lp_sampler_static_state *sampler;
+
+   key = (struct draw_llvm_variant_key *)store;
 
+   /* Presumably all variants of the shader should have the same
+    * number of vertex elements - ie the number of shader inputs.
+    */
    key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements;
 
+   /* All variants of this shader will have the same value for
+    * nr_samplers.  Not yet trying to compact away holes in the
+    * sampler array.
+    */
+   key->nr_samplers = llvm->draw->vs.vertex_shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+
+   sampler = draw_llvm_variant_key_samplers(key);
+
    memcpy(key->vertex_element,
           llvm->draw->pt.vertex_element,
           sizeof(struct pipe_vertex_element) * key->nr_vertex_elements);
+   
+   memset(sampler, 0, key->nr_samplers * sizeof *sampler);
+
+   for (i = 0 ; i < key->nr_samplers; i++) {
+      lp_sampler_static_state(&sampler[i],
+			      llvm->draw->sampler_views[i],
+			      llvm->draw->samplers[i]);
+   }
+
+   return key;
+}
+
+void
+draw_llvm_set_mapped_texture(struct draw_context *draw,
+                             unsigned sampler_idx,
+                             uint32_t width, uint32_t height, uint32_t depth,
+                             uint32_t last_level,
+                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
+                             const void *data[DRAW_MAX_TEXTURE_LEVELS])
+{
+   unsigned j;
+   struct draw_jit_texture *jit_tex;
+
+   assert(sampler_idx < PIPE_MAX_VERTEX_SAMPLERS);
+
+
+   jit_tex = &draw->llvm->jit_context.textures[sampler_idx];
+
+   jit_tex->width = width;
+   jit_tex->height = height;
+   jit_tex->depth = depth;
+   jit_tex->last_level = last_level;
+
+   for (j = 0; j <= last_level; j++) {
+      jit_tex->data[j] = data[j];
+      jit_tex->row_stride[j] = row_stride[j];
+      jit_tex->img_stride[j] = img_stride[j];
+   }
+}
+
+
+void
+draw_llvm_set_sampler_state(struct draw_context *draw)
+{
+   unsigned i;
+
+   for (i = 0; i < draw->num_samplers; i++) {
+      struct draw_jit_texture *jit_tex = &draw->llvm->jit_context.textures[i];
+
+      if (draw->samplers[i]) {
+         jit_tex->min_lod = draw->samplers[i]->min_lod;
+         jit_tex->max_lod = draw->samplers[i]->max_lod;
+         jit_tex->lod_bias = draw->samplers[i]->lod_bias;
+         COPY_4V(jit_tex->border_color, draw->samplers[i]->border_color);
+      }
+   }
+}
+
+
+void
+draw_llvm_destroy_variant(struct draw_llvm_variant *variant)
+{
+   struct draw_llvm *llvm = variant->llvm;
+   struct draw_context *draw = llvm->draw;
+
+   if (variant->function_elts) {
+      if (variant->function_elts)
+         LLVMFreeMachineCodeForFunction(draw->engine,
+                                        variant->function_elts);
+      LLVMDeleteFunction(variant->function_elts);
+   }
+
+   if (variant->function) {
+      if (variant->function)
+         LLVMFreeMachineCodeForFunction(draw->engine,
+                                        variant->function);
+      LLVMDeleteFunction(variant->function);
+   }
 
-   memcpy(&key->vs,
-          &llvm->draw->vs.vertex_shader->state,
-          sizeof(struct pipe_shader_state));
+   remove_from_list(&variant->list_item_local);
+   variant->shader->variants_cached--;
+   remove_from_list(&variant->list_item_global);
+   llvm->nr_variants--;
+   FREE(variant);
 }
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index 58fee7f9d60..d0a68ae412d 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -1,28 +1,79 @@
-#ifndef HAVE_LLVM_H
-#define HAVE_LLVM_H
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef DRAW_LLVM_H
+#define DRAW_LLVM_H
 
 #include "draw/draw_private.h"
 
+#include "draw/draw_vs.h"
+#include "gallivm/lp_bld_sample.h"
+
 #include "pipe/p_context.h"
+#include "util/u_simple_list.h"
 
 #include <llvm-c/Core.h>
 #include <llvm-c/Analysis.h>
 #include <llvm-c/Target.h>
 #include <llvm-c/ExecutionEngine.h>
 
+#define DRAW_MAX_TEXTURE_LEVELS 13  /* 4K x 4K for now */
+
+struct draw_llvm;
+struct llvm_vertex_shader;
+
 struct draw_jit_texture
 {
    uint32_t width;
    uint32_t height;
-   uint32_t stride;
-   const void *data;
+   uint32_t depth;
+   uint32_t last_level;
+   uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS];
+   uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS];
+   const void *data[DRAW_MAX_TEXTURE_LEVELS];
+   float min_lod;
+   float max_lod;
+   float lod_bias;
+   float border_color[4];
 };
 
 enum {
    DRAW_JIT_TEXTURE_WIDTH = 0,
    DRAW_JIT_TEXTURE_HEIGHT,
-   DRAW_JIT_TEXTURE_STRIDE,
-   DRAW_JIT_TEXTURE_DATA
+   DRAW_JIT_TEXTURE_DEPTH,
+   DRAW_JIT_TEXTURE_LAST_LEVEL,
+   DRAW_JIT_TEXTURE_ROW_STRIDE,
+   DRAW_JIT_TEXTURE_IMG_STRIDE,
+   DRAW_JIT_TEXTURE_DATA,
+   DRAW_JIT_TEXTURE_MIN_LOD,
+   DRAW_JIT_TEXTURE_MAX_LOD,
+   DRAW_JIT_TEXTURE_LOD_BIAS,
+   DRAW_JIT_TEXTURE_BORDER_COLOR,
+   DRAW_JIT_TEXTURE_NUM_FIELDS  /* number of fields above */
 };
 
 enum {
@@ -48,7 +99,7 @@ struct draw_jit_context
    const float *gs_constants;
 
 
-   struct draw_jit_texture textures[PIPE_MAX_SAMPLERS];
+   struct draw_jit_texture textures[PIPE_MAX_VERTEX_SAMPLERS];
 };
 
 
@@ -58,10 +109,10 @@ struct draw_jit_context
 #define draw_jit_context_gs_constants(_builder, _ptr) \
    lp_build_struct_get(_builder, _ptr, 1, "gs_constants")
 
-#define DRAW_JIT_CONTEXT_TEXTURES_INDEX 2
+#define DRAW_JIT_CTX_TEXTURES 2
 
 #define draw_jit_context_textures(_builder, _ptr) \
-   lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CONTEXT_TEXTURES_INDEX, "textures")
+   lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CTX_TEXTURES, "textures")
 
 
 
@@ -92,7 +143,8 @@ typedef void
                       unsigned start,
                       unsigned count,
                       unsigned stride,
-                      struct pipe_vertex_buffer *vertex_buffers);
+                      struct pipe_vertex_buffer *vertex_buffers,
+                      unsigned instance_id);
 
 
 typedef void
@@ -102,13 +154,89 @@ typedef void
                            const unsigned *fetch_elts,
                            unsigned fetch_count,
                            unsigned stride,
-                           struct pipe_vertex_buffer *vertex_buffers);
+                           struct pipe_vertex_buffer *vertex_buffers,
+                           unsigned instance_id);
+
+struct draw_llvm_variant_key
+{
+   unsigned nr_vertex_elements:16;
+   unsigned nr_samplers:16;
+
+   /* Variable number of vertex elements:
+    */
+   struct pipe_vertex_element vertex_element[1];
+
+   /* Followed by variable number of samplers:
+    */
+/*   struct lp_sampler_static_state sampler; */
+};
+
+#define DRAW_LLVM_MAX_VARIANT_KEY_SIZE \
+   (sizeof(struct draw_llvm_variant_key) +	\
+    PIPE_MAX_VERTEX_SAMPLERS * sizeof(struct lp_sampler_static_state) +	\
+    (PIPE_MAX_ATTRIBS-1) * sizeof(struct pipe_vertex_element))
+
+
+static INLINE size_t
+draw_llvm_variant_key_size(unsigned nr_vertex_elements,
+			   unsigned nr_samplers)
+{
+   return (sizeof(struct draw_llvm_variant_key) +
+	   nr_samplers * sizeof(struct lp_sampler_static_state) +
+	   (nr_vertex_elements - 1) * sizeof(struct pipe_vertex_element));
+}
+
+
+static INLINE struct lp_sampler_static_state *
+draw_llvm_variant_key_samplers(struct draw_llvm_variant_key *key)
+{
+   return (struct lp_sampler_static_state *)
+      &key->vertex_element[key->nr_vertex_elements];
+}
+
+
+
+struct draw_llvm_variant_list_item
+{
+   struct draw_llvm_variant *base;
+   struct draw_llvm_variant_list_item *next, *prev;
+};
+
+struct draw_llvm_variant
+{
+   LLVMValueRef function;
+   LLVMValueRef function_elts;
+   draw_jit_vert_func jit_func;
+   draw_jit_vert_func_elts jit_func_elts;
+
+   struct llvm_vertex_shader *shader;
+
+   struct draw_llvm *llvm;
+   struct draw_llvm_variant_list_item list_item_global;
+   struct draw_llvm_variant_list_item list_item_local;
+
+   /* key is variable-sized, must be last */
+   struct draw_llvm_variant_key key;
+   /* its vertex elements and sampler states are stored past this struct */
+};
+
+struct llvm_vertex_shader {
+   struct draw_vertex_shader base;
+
+   unsigned variant_key_size;
+   struct draw_llvm_variant_list_item variants;
+   unsigned variants_created;
+   unsigned variants_cached;
+};
 
 struct draw_llvm {
    struct draw_context *draw;
 
    struct draw_jit_context jit_context;
 
+   struct draw_llvm_variant_list_item vs_variants_list;
+   int nr_variants;
+
    LLVMModuleRef module;
    LLVMExecutionEngineRef engine;
    LLVMModuleProviderRef provider;
@@ -121,23 +249,12 @@ struct draw_llvm {
    LLVMTypeRef vb_ptr_type;
 };
 
-struct draw_llvm_variant_key
-{
-   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
-   unsigned                   nr_vertex_elements;
-   struct pipe_shader_state   vs;
-};
-
-struct draw_llvm_variant
+static INLINE struct llvm_vertex_shader *
+llvm_vertex_shader(struct draw_vertex_shader *vs)
 {
-   struct draw_llvm_variant_key key;
-   LLVMValueRef function;
-   LLVMValueRef function_elts;
-   draw_jit_vert_func jit_func;
-   draw_jit_vert_func_elts jit_func_elts;
+   return (struct llvm_vertex_shader *)vs;
+}
 
-   struct draw_llvm_variant *next;
-};
 
 struct draw_llvm *
 draw_llvm_create(struct draw_context *draw);
@@ -146,14 +263,35 @@ void
 draw_llvm_destroy(struct draw_llvm *llvm);
 
 struct draw_llvm_variant *
-draw_llvm_prepare(struct draw_llvm *llvm, int num_inputs);
+draw_llvm_create_variant(struct draw_llvm *llvm,
+			 unsigned num_vertex_header_attribs,
+			 const struct draw_llvm_variant_key *key);
 
 void
-draw_llvm_make_variant_key(struct draw_llvm *llvm,
-                           struct draw_llvm_variant_key *key);
+draw_llvm_destroy_variant(struct draw_llvm_variant *variant);
+
+struct draw_llvm_variant_key *
+draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store);
 
 LLVMValueRef
 draw_llvm_translate_from(LLVMBuilderRef builder,
                          LLVMValueRef vbuffer,
                          enum pipe_format from_format);
+
+struct lp_build_sampler_soa *
+draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                             LLVMValueRef context_ptr);
+
+void
+draw_llvm_set_sampler_state(struct draw_context *draw);
+
+void
+draw_llvm_set_mapped_texture(struct draw_context *draw,
+                             unsigned sampler_idx,
+                             uint32_t width, uint32_t height, uint32_t depth,
+                             uint32_t last_level,
+                             uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS],
+                             uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS],
+                             const void *data[DRAW_MAX_TEXTURE_LEVELS]);
+
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c
new file mode 100644
index 00000000000..ac1fbb179c6
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c
@@ -0,0 +1,223 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Texture sampling code generation
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_sample.h"
+#include "gallivm/lp_bld_tgsi.h"
+
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_pointer.h"
+#include "util/u_string.h"
+
+#include "draw_llvm.h"
+
+
+/**
+ * This provides the bridge between the sampler state stored in
+ * draw_jit_context and draw_jit_texture and the sampler code
+ * generator. It provides the texture layout information required by
+ * the texture sampler code generator in terms of the state stored in
+ * draw_jit_context and draw_jit_texture at runtime.
+ */
+struct draw_llvm_sampler_dynamic_state
+{
+   struct lp_sampler_dynamic_state base;
+
+   const struct lp_sampler_static_state *static_state;
+
+   LLVMValueRef context_ptr;
+};
+
+
+/**
+ * This is the bridge between our sampler and the TGSI translator.
+ */
+struct draw_llvm_sampler_soa
+{
+   struct lp_build_sampler_soa base;
+
+   struct draw_llvm_sampler_dynamic_state dynamic_state;
+};
+
+
+/**
+ * Fetch the specified member of the draw_jit_texture structure.
+ * \param emit_load  if TRUE, emit the LLVM load instruction to actually
+ *                   fetch the field's value.  Otherwise, just emit the
+ *                   GEP code to address the field.
+ *
+ * @sa http://llvm.org/docs/GetElementPtr.html
+ */
+static LLVMValueRef
+draw_llvm_texture_member(const struct lp_sampler_dynamic_state *base,
+                         LLVMBuilderRef builder,
+                         unsigned unit,
+                         unsigned member_index,
+                         const char *member_name,
+                         boolean emit_load)
+{
+   struct draw_llvm_sampler_dynamic_state *state =
+      (struct draw_llvm_sampler_dynamic_state *)base;
+   LLVMValueRef indices[4];
+   LLVMValueRef ptr;
+   LLVMValueRef res;
+
+   debug_assert(unit < PIPE_MAX_VERTEX_SAMPLERS);
+
+   /* context[0] */
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   /* context[0].textures */
+   indices[1] = LLVMConstInt(LLVMInt32Type(), DRAW_JIT_CTX_TEXTURES, 0);
+   /* context[0].textures[unit] */
+   indices[2] = LLVMConstInt(LLVMInt32Type(), unit, 0);
+   /* context[0].textures[unit].member */
+   indices[3] = LLVMConstInt(LLVMInt32Type(), member_index, 0);
+
+   ptr = LLVMBuildGEP(builder, state->context_ptr, indices, Elements(indices), "");
+
+   if (emit_load)
+      res = LLVMBuildLoad(builder, ptr, "");
+   else
+      res = ptr;
+
+   lp_build_name(res, "context.texture%u.%s", unit, member_name);
+
+   return res;
+}
+
+
+/**
+ * Helper macro to instantiate the functions that generate the code to
+ * fetch the members of draw_jit_texture to fulfill the sampler code
+ * generator requests.
+ *
+ * This complexity is the price we have to pay to keep the texture
+ * sampler code generator a reusable module without dependencies on
+ * draw module internals.
+ */
+#define DRAW_LLVM_TEXTURE_MEMBER(_name, _index, _emit_load)  \
+   static LLVMValueRef \
+   draw_llvm_texture_##_name( const struct lp_sampler_dynamic_state *base, \
+                              LLVMBuilderRef builder,                   \
+                              unsigned unit)                            \
+   { \
+      return draw_llvm_texture_member(base, builder, unit, _index, #_name, _emit_load ); \
+   }
+
+
+DRAW_LLVM_TEXTURE_MEMBER(width,      DRAW_JIT_TEXTURE_WIDTH, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(height,     DRAW_JIT_TEXTURE_HEIGHT, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(depth,      DRAW_JIT_TEXTURE_DEPTH, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(last_level, DRAW_JIT_TEXTURE_LAST_LEVEL, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(row_stride, DRAW_JIT_TEXTURE_ROW_STRIDE, FALSE)
+DRAW_LLVM_TEXTURE_MEMBER(img_stride, DRAW_JIT_TEXTURE_IMG_STRIDE, FALSE)
+DRAW_LLVM_TEXTURE_MEMBER(data_ptr,   DRAW_JIT_TEXTURE_DATA, FALSE)
+DRAW_LLVM_TEXTURE_MEMBER(min_lod,    DRAW_JIT_TEXTURE_MIN_LOD, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(max_lod,    DRAW_JIT_TEXTURE_MAX_LOD, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(lod_bias,   DRAW_JIT_TEXTURE_LOD_BIAS, TRUE)
+DRAW_LLVM_TEXTURE_MEMBER(border_color, DRAW_JIT_TEXTURE_BORDER_COLOR, FALSE)
+
+
+static void
+draw_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
+{
+   FREE(sampler);
+}
+
+
+/**
+ * Fetch filtered values from texture.
+ * The 'texel' parameter returns four vectors corresponding to R, G, B, A.
+ */
+static void
+draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base,
+                                       LLVMBuilderRef builder,
+                                       struct lp_type type,
+                                       unsigned unit,
+                                       unsigned num_coords,
+                                       const LLVMValueRef *coords,
+                                       const LLVMValueRef *ddx,
+                                       const LLVMValueRef *ddy,
+                                       LLVMValueRef lod_bias, /* optional */
+                                       LLVMValueRef explicit_lod, /* optional */
+                                       LLVMValueRef *texel)
+{
+   struct draw_llvm_sampler_soa *sampler = (struct draw_llvm_sampler_soa *)base;
+
+   assert(unit < PIPE_MAX_VERTEX_SAMPLERS);
+
+   lp_build_sample_soa(builder,
+                       &sampler->dynamic_state.static_state[unit],
+                       &sampler->dynamic_state.base,
+                       type,
+                       unit,
+                       num_coords, coords,
+                       ddx, ddy,
+                       lod_bias, explicit_lod,
+                       texel);
+}
+
+
+struct lp_build_sampler_soa *
+draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state,
+                             LLVMValueRef context_ptr)
+{
+   struct draw_llvm_sampler_soa *sampler;
+
+   sampler = CALLOC_STRUCT(draw_llvm_sampler_soa);
+   if(!sampler)
+      return NULL;
+
+   sampler->base.destroy = draw_llvm_sampler_soa_destroy;
+   sampler->base.emit_fetch_texel = draw_llvm_sampler_soa_emit_fetch_texel;
+   sampler->dynamic_state.base.width = draw_llvm_texture_width;
+   sampler->dynamic_state.base.height = draw_llvm_texture_height;
+   sampler->dynamic_state.base.depth = draw_llvm_texture_depth;
+   sampler->dynamic_state.base.last_level = draw_llvm_texture_last_level;
+   sampler->dynamic_state.base.row_stride = draw_llvm_texture_row_stride;
+   sampler->dynamic_state.base.img_stride = draw_llvm_texture_img_stride;
+   sampler->dynamic_state.base.data_ptr = draw_llvm_texture_data_ptr;
+   sampler->dynamic_state.base.min_lod = draw_llvm_texture_min_lod;
+   sampler->dynamic_state.base.max_lod = draw_llvm_texture_max_lod;
+   sampler->dynamic_state.base.lod_bias = draw_llvm_texture_lod_bias;
+   sampler->dynamic_state.base.border_color = draw_llvm_texture_border_color;
+   sampler->dynamic_state.static_state = static_state;
+   sampler->dynamic_state.context_ptr = context_ptr;
+
+   return &sampler->base;
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_llvm_translate.c b/src/gallium/auxiliary/draw/draw_llvm_translate.c
index d7da7ed357d..5171327ce2d 100644
--- a/src/gallium/auxiliary/draw/draw_llvm_translate.c
+++ b/src/gallium/auxiliary/draw/draw_llvm_translate.c
@@ -3,10 +3,10 @@
 
 #include "draw_llvm.h"
 
-#include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_struct.h"
 #include "gallivm/lp_bld_format.h"
 #include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_type.h"
 
 #include "util/u_memory.h"
 #include "util/u_format.h"
@@ -466,6 +466,7 @@ draw_llvm_translate_from(LLVMBuilderRef builder,
    const struct util_format_description *format_desc;
    LLVMValueRef zero;
    int i;
+   struct lp_type type = lp_float32_vec4_type();
 
    /*
     * The above can only cope with straight arrays: no bitfields,
@@ -493,5 +494,5 @@ draw_llvm_translate_from(LLVMBuilderRef builder,
 
    format_desc = util_format_description(from_format);
    zero = LLVMConstNull(LLVMInt32Type());
-   return lp_build_fetch_rgba_aos(builder, format_desc, vbuffer, zero, zero);
+   return lp_build_fetch_rgba_aos(builder, format_desc, type, vbuffer, zero, zero, zero);
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 64c35025081..6206197dae9 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -170,58 +170,42 @@ static void do_triangle( struct draw_context *draw,
  * Set up macros for draw_pt_decompose.h template code.
  * This code uses vertex indexes / elements.
  */
-#define QUAD(i0,i1,i2,i3)                       \
-   do_triangle( draw,                           \
-                ( DRAW_PIPE_RESET_STIPPLE |     \
-                  DRAW_PIPE_EDGE_FLAG_0 |       \
-                  DRAW_PIPE_EDGE_FLAG_2 ),      \
-                verts + stride * elts[i0],      \
-                verts + stride * elts[i1],      \
-                verts + stride * elts[i3]);     \
-   do_triangle( draw,                           \
-                ( DRAW_PIPE_EDGE_FLAG_0 |       \
-                  DRAW_PIPE_EDGE_FLAG_1 ),      \
-                verts + stride * elts[i1],      \
-                verts + stride * elts[i2],      \
-                verts + stride * elts[i3])
-
-#define TRIANGLE(flags,i0,i1,i2)                                        \
-   do_triangle( draw,                                                   \
-                elts[i0],  /* flags */                                  \
-                verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK),     \
-                verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK),     \
-                verts + stride * (elts[i2] & ~DRAW_PIPE_FLAG_MASK) );
-
-#define LINE(flags,i0,i1)                                       \
-   do_line( draw,                                               \
-            elts[i0],                                           \
-            verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK), \
-            verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK) );
+
+#define TRIANGLE(flags,i0,i1,i2)                                  \
+   do {                                                           \
+      do_triangle( draw,                                          \
+                   flags,                                         \
+                   verts + stride * (i0),                         \
+                   verts + stride * (i1),                         \
+                   verts + stride * (i2) );                       \
+   } while (0)
+
+#define LINE(flags,i0,i1)                                         \
+   do {                                                           \
+      do_line( draw,                                              \
+               flags,                                             \
+               verts + stride * (i0),                             \
+               verts + stride * (i1) );                           \
+   } while (0)
 
 #define POINT(i0)                               \
-   do_point( draw,                              \
-             verts + stride * elts[i0] )
+   do {                                         \
+      do_point( draw, verts + stride * (i0) );  \
+   } while (0)
+
+#define GET_ELT(idx) (elts[idx])
 
-#define FUNC pipe_run
-#define ARGS                                    \
+#define FUNC pipe_run_elts
+#define FUNC_VARS                               \
     struct draw_context *draw,                  \
     unsigned prim,                              \
+    unsigned prim_flags,                        \
     struct vertex_header *vertices,             \
     unsigned stride,                            \
-    const ushort *elts
-
-#define LOCAL_VARS                                           \
-   char *verts = (char *)vertices;                           \
-   boolean flatfirst = (draw->rasterizer->flatshade &&       \
-                        draw->rasterizer->flatshade_first);  \
-   unsigned i;                                               \
-   ushort flags
-
-#define FLUSH
+    const ushort *elts,                         \
+    unsigned count
 
 #include "draw_pt_decompose.h"
-#undef ARGS
-#undef LOCAL_VARS
 
 
 
@@ -238,78 +222,84 @@ static void do_triangle( struct draw_context *draw,
  * draw_vbuf.c code uses when it has to perform a flush.
  */
 void draw_pipeline_run( struct draw_context *draw,
-                        unsigned prim,
-                        struct vertex_header *vertices,
-                        unsigned vertex_count,
-                        unsigned stride,
-                        const ushort *elts,
-                        unsigned count )
+                        const struct draw_vertex_info *vert_info,
+                        const struct draw_prim_info *prim_info)
 {
-   char *verts = (char *)vertices;
+   unsigned i, start;
+
+   draw->pipeline.verts = (char *)vert_info->verts;
+   draw->pipeline.vertex_stride = vert_info->stride;
+   draw->pipeline.vertex_count = vert_info->count;
+
+   for (start = i = 0;
+        i < prim_info->primitive_count;
+        start += prim_info->primitive_lengths[i], i++)
+   {
+      const unsigned count = prim_info->primitive_lengths[i];
+
+#if DEBUG
+      /* Warn if one of the element indexes go outside the vertex buffer */
+      {
+         unsigned max_index = 0x0, i;
+         /* find the largest element index */
+         for (i = 0; i < count; i++) {
+            unsigned int index = prim_info->elts[start + i];
+            if (index > max_index)
+               max_index = index;
+         }
+         if (max_index >= vert_info->count) {
+            debug_printf("%s: max_index (%u) outside vertex buffer (%u)\n",
+                         __FUNCTION__,
+                         max_index,
+                         vert_info->count);
+         }
+      }
+#endif
+
+      pipe_run_elts(draw,
+                    prim_info->prim,
+                    prim_info->flags,
+                    vert_info->verts,
+                    vert_info->stride,
+                    prim_info->elts + start,
+                    count);
+   }
 
-   draw->pipeline.verts = verts;
-   draw->pipeline.vertex_stride = stride;
-   draw->pipeline.vertex_count = vertex_count;
-   
-   pipe_run(draw, prim, vertices, stride, elts, count);
-   
    draw->pipeline.verts = NULL;
    draw->pipeline.vertex_count = 0;
 }
 
 
-
 /*
  * Set up macros for draw_pt_decompose.h template code.
- * This code is for non-indexed rendering (no elts).
+ * This code is for non-indexed (aka linear) rendering (no elts).
  */
-#define QUAD(i0,i1,i2,i3)                                        \
-   do_triangle( draw,                                            \
-                ( DRAW_PIPE_RESET_STIPPLE |                      \
-                  DRAW_PIPE_EDGE_FLAG_0 |                        \
-                  DRAW_PIPE_EDGE_FLAG_2 ),                       \
-                verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i1),                           \
-                verts + stride * (i3));                          \
-   do_triangle( draw,                                            \
-                ( DRAW_PIPE_EDGE_FLAG_0 |                        \
-                  DRAW_PIPE_EDGE_FLAG_1 ),                       \
-                verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i2),                           \
-                verts + stride * (i3))
-
-#define TRIANGLE(flags,i0,i1,i2)                                 \
-   do_triangle( draw,                                            \
-                flags,  /* flags */                              \
-                verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i1),                           \
-                verts + stride * (i2))
-
-#define LINE(flags,i0,i1)                                   \
-   do_line( draw,                                           \
-            flags,                                          \
-            verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \
-            verts + stride * (i1))
 
-#define POINT(i0)                               \
-   do_point( draw,                              \
-             verts + stride * i0 )
+#define TRIANGLE(flags,i0,i1,i2)       \
+   do_triangle( draw, flags,           \
+                verts + stride * (i0), \
+                verts + stride * (i1), \
+                verts + stride * (i2) )
 
-#define FUNC pipe_run_linear
-#define ARGS                                    \
-    struct draw_context *draw,                  \
-    unsigned prim,                              \
-    struct vertex_header *vertices,             \
-    unsigned stride
+#define LINE(flags,i0,i1)              \
+   do_line( draw, flags,               \
+            verts + stride * (i0),     \
+            verts + stride * (i1) )
 
-#define LOCAL_VARS                                           \
-   char *verts = (char *)vertices;                           \
-   boolean flatfirst = (draw->rasterizer->flatshade &&       \
-                        draw->rasterizer->flatshade_first);  \
-   unsigned i;                                               \
-   ushort flags
+#define POINT(i0)                      \
+   do_point( draw, verts + stride * (i0) )
 
-#define FLUSH
+
+#define GET_ELT(idx) (idx)
+
+#define FUNC pipe_run_linear
+#define FUNC_VARS                      \
+    struct draw_context *draw,         \
+    unsigned prim,                     \
+    unsigned prim_flags,               \
+    struct vertex_header *vertices,    \
+    unsigned stride,                   \
+    unsigned count
 
 #include "draw_pt_decompose.h"
 
@@ -318,17 +308,32 @@ void draw_pipeline_run( struct draw_context *draw,
  * For drawing non-indexed primitives.
  */
 void draw_pipeline_run_linear( struct draw_context *draw,
-                               unsigned prim,
-                               struct vertex_header *vertices,
-                               unsigned count,
-                               unsigned stride )
+                               const struct draw_vertex_info *vert_info,
+                               const struct draw_prim_info *prim_info)
 {
-   char *verts = (char *)vertices;
-   draw->pipeline.verts = verts;
-   draw->pipeline.vertex_stride = stride;
-   draw->pipeline.vertex_count = count;
-
-   pipe_run_linear(draw, prim, vertices, stride, count);
+   unsigned i, start;
+
+   for (start = i = 0;
+        i < prim_info->primitive_count;
+        start += prim_info->primitive_lengths[i], i++)
+   {
+      unsigned count = prim_info->primitive_lengths[i];
+      char *verts = ((char*)vert_info->verts) +
+                    (start * vert_info->stride);
+
+      draw->pipeline.verts = verts;
+      draw->pipeline.vertex_stride = vert_info->stride;
+      draw->pipeline.vertex_count = count;
+
+      assert(count <= vert_info->count);
+
+      pipe_run_linear(draw,
+                      prim_info->prim,
+                      prim_info->flags,
+                      (struct vertex_header*)verts,
+                      vert_info->stride,
+                      count);
+   }
 
    draw->pipeline.verts = NULL;
    draw->pipeline.vertex_count = 0;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 4faf0a779ca..d1aba763098 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -373,8 +373,7 @@ generate_aaline_fs(struct aaline_stage *aaline)
 
    aaline->fs->sampler_unit = transform.freeSampler;
 
-   aaline->fs->aaline_fs
-      = aaline->driver_create_fs_state(pipe, &aaline_fs);
+   aaline->fs->aaline_fs = aaline->driver_create_fs_state(pipe, &aaline_fs);
    if (aaline->fs->aaline_fs == NULL)
       goto fail;
 
@@ -425,7 +424,8 @@ aaline_create_texture(struct aaline_stage *aaline)
 
    /* Fill in mipmap images.
     * Basically each level is solid opaque, except for the outermost
-    * texels which are zero.  Special case the 1x1 and 2x2 levels.
+    * texels which are zero.  Special case the 1x1 and 2x2 levels
+    * (though, those levels shouldn't be used - see the max_lod setting).
     */
    for (level = 0; level <= MAX_TEXTURE_LEVEL; level++) {
       struct pipe_transfer *transfer;
@@ -497,7 +497,8 @@ aaline_create_sampler(struct aaline_stage *aaline)
    sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
    sampler.normalized_coords = 1;
    sampler.min_lod = 0.0f;
-   sampler.max_lod = MAX_TEXTURE_LEVEL;
+   /* avoid using the 1x1 and 2x2 mipmap levels */
+   sampler.max_lod = MAX_TEXTURE_LEVEL - 2;
 
    aaline->sampler_cso = pipe->create_sampler_state(pipe, &sampler);
    if (aaline->sampler_cso == NULL)
@@ -669,8 +670,8 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
 
    assert(draw->rasterizer->line_smooth);
 
-   if (draw->rasterizer->line_width <= 3.0)
-      aaline->half_line_width = 1.5f;
+   if (draw->rasterizer->line_width <= 2.2)
+      aaline->half_line_width = 1.1f;
    else
       aaline->half_line_width = 0.5f * draw->rasterizer->line_width;
 
@@ -687,10 +688,9 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
    aaline->tex_slot = draw_current_shader_outputs(draw);
    aaline->pos_slot = draw_current_shader_position_output(draw);;
 
-   /* advertise the extra post-transformed vertex attribute */
-   draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
-   draw->extra_shader_outputs.semantic_index = aaline->fs->generic_attrib;
-   draw->extra_shader_outputs.slot = aaline->tex_slot;
+   /* allocate the extra post-transformed vertex attribute */
+   (void) draw_alloc_extra_vertex_attrib(draw, TGSI_SEMANTIC_GENERIC,
+                                         aaline->fs->generic_attrib);
 
    /* how many samplers? */
    /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
@@ -743,7 +743,7 @@ aaline_flush(struct draw_stage *stage, unsigned flags)
 
    draw->suspend_flushing = FALSE;
 
-   draw->extra_shader_outputs.slot = 0;
+   draw_remove_extra_vertex_attribs(draw);
 }
 
 
@@ -788,9 +788,6 @@ draw_aaline_stage(struct draw_context *draw)
    if (aaline == NULL)
       return NULL;
 
-   if (!draw_alloc_temp_verts( &aaline->stage, 8 ))
-      goto fail;
-
    aaline->stage.draw = draw;
    aaline->stage.name = "aaline";
    aaline->stage.next = NULL;
@@ -801,11 +798,14 @@ draw_aaline_stage(struct draw_context *draw)
    aaline->stage.reset_stipple_counter = aaline_reset_stipple_counter;
    aaline->stage.destroy = aaline_destroy;
 
+   if (!draw_alloc_temp_verts( &aaline->stage, 8 ))
+      goto fail;
+
    return aaline;
 
  fail:
    if (aaline)
-      aaline_destroy(&aaline->stage);
+      aaline->stage.destroy(&aaline->stage);
 
    return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index bba6f50c020..5ea552f51c1 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -701,9 +701,9 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header)
 
    aapoint->pos_slot = draw_current_shader_position_output(draw);
 
-   draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
-   draw->extra_shader_outputs.semantic_index = aapoint->fs->generic_attrib;
-   draw->extra_shader_outputs.slot = aapoint->tex_slot;
+   /* allocate the extra post-transformed vertex attribute */
+   (void) draw_alloc_extra_vertex_attrib(draw, TGSI_SEMANTIC_GENERIC,
+                                         aapoint->fs->generic_attrib);
 
    /* find psize slot in post-transform vertex */
    aapoint->psize_slot = -1;
@@ -754,7 +754,7 @@ aapoint_flush(struct draw_stage *stage, unsigned flags)
 
    draw->suspend_flushing = FALSE;
 
-   draw->extra_shader_outputs.slot = 0;
+   draw_remove_extra_vertex_attribs(draw);
 }
 
 
@@ -780,9 +780,6 @@ draw_aapoint_stage(struct draw_context *draw)
    if (aapoint == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &aapoint->stage, 4 ))
-      goto fail;
-
    aapoint->stage.draw = draw;
    aapoint->stage.name = "aapoint";
    aapoint->stage.next = NULL;
@@ -793,11 +790,14 @@ draw_aapoint_stage(struct draw_context *draw)
    aapoint->stage.reset_stipple_counter = aapoint_reset_stipple_counter;
    aapoint->stage.destroy = aapoint_destroy;
 
+   if (!draw_alloc_temp_verts( &aapoint->stage, 4 ))
+      goto fail;
+
    return aapoint;
 
  fail:
    if (aapoint)
-      aapoint_destroy(&aapoint->stage);
+      aapoint->stage.destroy(&aapoint->stage);
 
    return NULL;
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 51a6115ebf5..a10d8e9edc0 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -68,8 +68,7 @@ struct clip_stage {
 };
 
 
-/* This is a bit confusing:
- */
+/** Cast wrapper */
 static INLINE struct clip_stage *clip_stage( struct draw_stage *stage )
 {
    return (struct clip_stage *)stage;
@@ -81,18 +80,22 @@ static INLINE struct clip_stage *clip_stage( struct draw_stage *stage )
 
 /* All attributes are float[4], so this is easy:
  */
-static void interp_attr( float *fdst,
+static void interp_attr( float dst[4],
 			 float t,
-			 const float *fin,
-			 const float *fout )
+			 const float in[4],
+			 const float out[4] )
 {  
-   fdst[0] = LINTERP( t, fout[0], fin[0] );
-   fdst[1] = LINTERP( t, fout[1], fin[1] );
-   fdst[2] = LINTERP( t, fout[2], fin[2] );
-   fdst[3] = LINTERP( t, fout[3], fin[3] );
+   dst[0] = LINTERP( t, out[0], in[0] );
+   dst[1] = LINTERP( t, out[1], in[1] );
+   dst[2] = LINTERP( t, out[2], in[2] );
+   dst[3] = LINTERP( t, out[3], in[3] );
 }
 
 
+/**
+ * Copy front/back, primary/secondary colors from src vertex to dst vertex.
+ * Used when flat shading.
+ */
 static void copy_colors( struct draw_stage *stage,
 			 struct vertex_header *dst,
 			 const struct vertex_header *src )
@@ -121,20 +124,17 @@ static void interp( const struct clip_stage *clip,
 
    /* Vertex header.
     */
-   {
-      dst->clipmask = 0;
-      dst->edgeflag = 0;        /* will get overwritten later */
-      dst->pad = 0;
-      dst->vertex_id = UNDEFINED_VERTEX_ID;
-   }
+   dst->clipmask = 0;
+   dst->edgeflag = 0;        /* will get overwritten later */
+   dst->pad = 0;
+   dst->vertex_id = UNDEFINED_VERTEX_ID;
 
-   /* Clip coordinates:  interpolate normally
+   /* Interpolate the clip-space coords.
     */
-   {
-      interp_attr(dst->clip, t, in->clip, out->clip);
-   }
+   interp_attr(dst->clip, t, in->clip, out->clip);
 
-   /* Do the projective divide and insert window coordinates:
+   /* Do the projective divide and viewport transformation to get
+    * new window coordinates:
     */
    {
       const float *pos = dst->clip;
@@ -157,6 +157,10 @@ static void interp( const struct clip_stage *clip,
 }
 
 
+/**
+ * Emit a post-clip polygon to the next pipeline stage.  The polygon
+ * will be convex and the provoking vertex will always be vertex[0].
+ */
 static void emit_poly( struct draw_stage *stage,
 		       struct vertex_header **inlist,
 		       unsigned n,
@@ -164,10 +168,18 @@ static void emit_poly( struct draw_stage *stage,
 {
    struct prim_header header;
    unsigned i;
+   ushort edge_first, edge_middle, edge_last;
 
-   const ushort edge_first  = DRAW_PIPE_EDGE_FLAG_2;
-   const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0;
-   const ushort edge_last   = DRAW_PIPE_EDGE_FLAG_1;
+   if (stage->draw->rasterizer->flatshade_first) {
+      edge_first  = DRAW_PIPE_EDGE_FLAG_0;
+      edge_middle = DRAW_PIPE_EDGE_FLAG_1;
+      edge_last   = DRAW_PIPE_EDGE_FLAG_2;
+   }
+   else {
+      edge_first  = DRAW_PIPE_EDGE_FLAG_2;
+      edge_middle = DRAW_PIPE_EDGE_FLAG_0;
+      edge_last   = DRAW_PIPE_EDGE_FLAG_1;
+   }
 
    /* later stages may need the determinant, but only the sign matters */
    header.det = origPrim->det;
@@ -175,9 +187,17 @@ static void emit_poly( struct draw_stage *stage,
    header.pad = 0;
 
    for (i = 2; i < n; i++, header.flags = edge_middle) {
-      header.v[0] = inlist[i-1];
-      header.v[1] = inlist[i];
-      header.v[2] = inlist[0];	/* keep in v[2] for flatshading */
+      /* order the triangle verts to respect the provoking vertex mode */
+      if (stage->draw->rasterizer->flatshade_first) {
+         header.v[0] = inlist[0];  /* the provoking vertex */
+         header.v[1] = inlist[i-1];
+         header.v[2] = inlist[i];
+      }
+      else {
+         header.v[0] = inlist[i-1];
+         header.v[1] = inlist[i];
+         header.v[2] = inlist[0];  /* the provoking vertex */
+      }
 
       if (i == n-1)
          header.flags |= edge_last;
@@ -185,7 +205,8 @@ static void emit_poly( struct draw_stage *stage,
       if (0) {
          const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;
          uint j, k;
-         debug_printf("Clipped tri:\n");
+         debug_printf("Clipped tri: (flat-shade-first = %d)\n",
+                      stage->draw->rasterizer->flatshade_first);
          for (j = 0; j < 3; j++) {
             for (k = 0; k < vs->info.num_outputs; k++) {
                debug_printf("  Vert %d: Attr %d:  %f %f %f %f\n", j, k,
@@ -241,6 +262,9 @@ do_clip_tri( struct draw_stage *stage,
 
       clipmask &= ~(1<<plane_idx);
 
+      assert(n < MAX_CLIPPED_VERTICES);
+      if (n >= MAX_CLIPPED_VERTICES)
+         return;
       inlist[n] = inlist[0]; /* prevent rotation of vertices */
 
       for (i = 1; i <= n; i++) {
@@ -249,11 +273,23 @@ do_clip_tri( struct draw_stage *stage,
 	 float dp = dot4( vert->clip, plane );
 
 	 if (!IS_NEGATIVE(dp_prev)) {
+            assert(outcount < MAX_CLIPPED_VERTICES);
+            if (outcount >= MAX_CLIPPED_VERTICES)
+               return;
 	    outlist[outcount++] = vert_prev;
 	 }
 
 	 if (DIFFERENT_SIGNS(dp, dp_prev)) {
-	    struct vertex_header *new_vert = clipper->stage.tmp[tmpnr++];
+	    struct vertex_header *new_vert;
+
+            assert(tmpnr < MAX_CLIPPED_VERTICES + 1);
+            if (tmpnr >= MAX_CLIPPED_VERTICES + 1)
+               return;
+            new_vert = clipper->stage.tmp[tmpnr++];
+
+            assert(outcount < MAX_CLIPPED_VERTICES);
+            if (outcount >= MAX_CLIPPED_VERTICES)
+               return;
 	    outlist[outcount++] = new_vert;
 
 	    if (IS_NEGATIVE(dp)) {
@@ -291,18 +327,34 @@ do_clip_tri( struct draw_stage *stage,
       }
    }
 
-   /* If flat-shading, copy color to new provoking vertex.
+   /* If flat-shading, copy provoking vertex color to polygon vertex[0]
     */
-   if (clipper->flat && inlist[0] != header->v[2]) {
-      inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
-
-      copy_colors(stage, inlist[0], header->v[2]);
-   }
-
-   /* Emit the polygon as triangles to the setup stage:
-    */
-   if (n >= 3)
+   if (n >= 3) {
+      if (clipper->flat) {
+         if (stage->draw->rasterizer->flatshade_first) {
+            if (inlist[0] != header->v[0]) {
+               assert(tmpnr < MAX_CLIPPED_VERTICES + 1);
+               if (tmpnr >= MAX_CLIPPED_VERTICES + 1)
+                  return;
+               inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
+               copy_colors(stage, inlist[0], header->v[0]);
+            }
+         }
+         else {
+            if (inlist[0] != header->v[2]) {
+               assert(tmpnr < MAX_CLIPPED_VERTICES + 1);
+               if (tmpnr >= MAX_CLIPPED_VERTICES + 1)
+                  return;
+               inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
+               copy_colors(stage, inlist[0], header->v[2]);
+            }
+         }
+      }
+      
+      /* Emit the polygon as triangles to the setup stage:
+       */
       emit_poly( stage, inlist, n, header );
+   }
 }
 
 
@@ -492,9 +544,6 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw )
    if (clipper == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 ))
-      goto fail;
-
    clipper->stage.draw = draw;
    clipper->stage.name = "clipper";
    clipper->stage.point = clip_point;
@@ -506,6 +555,9 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw )
 
    clipper->plane = draw->plane;
 
+   if (!draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 ))
+      goto fail;
+
    return &clipper->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c
index dc66c65a56c..2f4d01d23ab 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -40,7 +40,8 @@
 
 struct cull_stage {
    struct draw_stage stage;
-   unsigned winding;  /**< which winding(s) to cull (one of PIPE_WINDING_x) */
+   unsigned cull_face;  /**< which face(s) to cull (one of PIPE_FACE_x) */
+   unsigned front_ccw;
 };
 
 
@@ -73,9 +74,12 @@ static void cull_tri( struct draw_stage *stage,
       /* if det < 0 then Z points toward the camera and the triangle is 
        * counter-clockwise winding.
        */
-      unsigned winding = (header->det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW;
+      unsigned ccw = (header->det < 0);
+      unsigned face = ((ccw == cull_stage(stage)->front_ccw) ?
+                       PIPE_FACE_FRONT :
+                       PIPE_FACE_BACK);
 
-      if ((winding & cull_stage(stage)->winding) == 0) {
+      if ((face & cull_stage(stage)->cull_face) == 0) {
          /* triangle is not culled, pass to next stage */
 	 stage->next->tri( stage->next, header );
       }
@@ -88,7 +92,8 @@ static void cull_first_tri( struct draw_stage *stage,
 {
    struct cull_stage *cull = cull_stage(stage);
 
-   cull->winding = stage->draw->rasterizer->cull_mode;
+   cull->cull_face = stage->draw->rasterizer->cull_face;
+   cull->front_ccw = stage->draw->rasterizer->front_ccw;
 
    stage->tri = cull_tri;
    stage->tri( stage, header );
@@ -124,9 +129,6 @@ struct draw_stage *draw_cull_stage( struct draw_context *draw )
    if (cull == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &cull->stage, 0 ))
-      goto fail;
-
    cull->stage.draw = draw;
    cull->stage.name = "cull";
    cull->stage.next = NULL;
@@ -137,6 +139,9 @@ struct draw_stage *draw_cull_stage( struct draw_context *draw )
    cull->stage.reset_stipple_counter = cull_reset_stipple_counter;
    cull->stage.destroy = cull_destroy;
 
+   if (!draw_alloc_temp_verts( &cull->stage, 0 ))
+      goto fail;
+
    return &cull->stage;
 
 fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
index 34afb1a0b60..693f2895aad 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
@@ -257,9 +257,6 @@ struct draw_stage *draw_flatshade_stage( struct draw_context *draw )
    if (flatshade == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &flatshade->stage, 2 ))
-      goto fail;
-
    flatshade->stage.draw = draw;
    flatshade->stage.name = "flatshade";
    flatshade->stage.next = NULL;
@@ -270,6 +267,9 @@ struct draw_stage *draw_flatshade_stage( struct draw_context *draw )
    flatshade->stage.reset_stipple_counter = flatshade_reset_stipple_counter;
    flatshade->stage.destroy = flatshade_destroy;
 
+   if (!draw_alloc_temp_verts( &flatshade->stage, 2 ))
+      goto fail;
+
    return &flatshade->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c
index 8e321946ced..8afbbfa1569 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_offset.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -161,9 +161,7 @@ struct draw_stage *draw_offset_stage( struct draw_context *draw )
 {
    struct offset_stage *offset = CALLOC_STRUCT(offset_stage);
    if (offset == NULL)
-      return NULL;
-
-   draw_alloc_temp_verts( &offset->stage, 3 );
+      goto fail;
 
    offset->stage.draw = draw;
    offset->stage.name = "offset";
@@ -175,5 +173,14 @@ struct draw_stage *draw_offset_stage( struct draw_context *draw )
    offset->stage.reset_stipple_counter = offset_reset_stipple_counter;
    offset->stage.destroy = offset_destroy;
 
+   if (!draw_alloc_temp_verts( &offset->stage, 3 ))
+      goto fail;
+
    return &offset->stage;
+
+fail:
+   if (offset)
+      offset->stage.destroy( &offset->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index ef30db094fe..ed9a53e154d 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -363,8 +363,12 @@ generate_pstip_fs(struct pstip_stage *pstip)
    assert(pstip->fs->sampler_unit < PIPE_MAX_SAMPLERS);
 
    pstip->fs->pstip_fs = pstip->driver_create_fs_state(pstip->pipe, &pstip_fs);
-
+   
    FREE((void *)pstip_fs.tokens);
+
+   if (!pstip->fs->pstip_fs)
+      return FALSE;
+
    return TRUE;
 }
 
@@ -603,12 +607,15 @@ pstip_destroy(struct draw_stage *stage)
 }
 
 
+/** Create a new polygon stipple drawing stage object */
 static struct pstip_stage *
-draw_pstip_stage(struct draw_context *draw)
+draw_pstip_stage(struct draw_context *draw, struct pipe_context *pipe)
 {
    struct pstip_stage *pstip = CALLOC_STRUCT(pstip_stage);
+   if (pstip == NULL)
+      goto fail;
 
-   draw_alloc_temp_verts( &pstip->stage, 8 );
+   pstip->pipe = pipe;
 
    pstip->stage.draw = draw;
    pstip->stage.name = "pstip";
@@ -620,7 +627,16 @@ draw_pstip_stage(struct draw_context *draw)
    pstip->stage.reset_stipple_counter = pstip_reset_stipple_counter;
    pstip->stage.destroy = pstip_destroy;
 
+   if (!draw_alloc_temp_verts( &pstip->stage, 8 ))
+      goto fail;
+
    return pstip;
+
+fail:
+   if (pstip)
+      pstip->stage.destroy( &pstip->stage );
+
+   return NULL;
 }
 
 
@@ -756,14 +772,12 @@ draw_install_pstipple_stage(struct draw_context *draw,
    /*
     * Create / install pgon stipple drawing / prim stage
     */
-   pstip = draw_pstip_stage( draw );
+   pstip = draw_pstip_stage( draw, pipe );
    if (pstip == NULL)
       goto fail;
 
    draw->pipeline.pstipple = &pstip->stage;
 
-   pstip->pipe = pipe;
-
    /* create special texture, sampler state */
    if (!pstip_create_texture(pstip))
       goto fail;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 70fbab9ea76..4b3f4e7ae11 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -235,8 +235,8 @@ stipple_destroy( struct draw_stage *stage )
 struct draw_stage *draw_stipple_stage( struct draw_context *draw )
 {
    struct stipple_stage *stipple = CALLOC_STRUCT(stipple_stage);
-
-   draw_alloc_temp_verts( &stipple->stage, 2 );
+   if (stipple == NULL)
+      goto fail;
 
    stipple->stage.draw = draw;
    stipple->stage.name = "stipple";
@@ -248,5 +248,14 @@ struct draw_stage *draw_stipple_stage( struct draw_context *draw )
    stipple->stage.flush = stipple_flush;
    stipple->stage.destroy = stipple_destroy;
 
+   if (!draw_alloc_temp_verts( &stipple->stage, 2 ))
+      goto fail;
+
    return &stipple->stage;
+
+fail:
+   if (stipple)
+      stipple->stage.destroy( &stipple->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
index eef0238b157..9a3f3fee625 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
@@ -141,7 +141,7 @@ static void twoside_first_tri( struct draw_stage *stage,
     * if the triangle is back-facing (negative).
     * sign = -1 for CCW, +1 for CW
     */
-   twoside->sign = (stage->draw->rasterizer->front_winding == PIPE_WINDING_CCW) ? -1.0f : 1.0f;
+   twoside->sign = stage->draw->rasterizer->front_ccw ? -1.0f : 1.0f;
 
    stage->tri = twoside_tri;
    stage->tri( stage, header );
@@ -177,9 +177,6 @@ struct draw_stage *draw_twoside_stage( struct draw_context *draw )
    if (twoside == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &twoside->stage, 3 ))
-      goto fail;
-
    twoside->stage.draw = draw;
    twoside->stage.name = "twoside";
    twoside->stage.next = NULL;
@@ -190,6 +187,9 @@ struct draw_stage *draw_twoside_stage( struct draw_context *draw )
    twoside->stage.reset_stipple_counter = twoside_reset_stipple_counter;
    twoside->stage.destroy = twoside_destroy;
 
+   if (!draw_alloc_temp_verts( &twoside->stage, 3 ))
+      goto fail;
+
    return &twoside->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index 03bb842e20a..d87741b91e7 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -105,6 +105,23 @@ static void lines( struct draw_stage *stage,
 }
 
 
+/** For debugging: print the name of each DRAW_PIPE_* flag bit that is set in 'flags'. */
+static void
+print_header_flags(unsigned flags)
+{
+   debug_printf("header->flags = ");   /* prefix; the flag names follow on the same output line */
+   if (flags & DRAW_PIPE_RESET_STIPPLE)
+      debug_printf("RESET_STIPPLE ");
+   if (flags & DRAW_PIPE_EDGE_FLAG_0)
+      debug_printf("EDGE_FLAG_0 ");
+   if (flags & DRAW_PIPE_EDGE_FLAG_1)
+      debug_printf("EDGE_FLAG_1 ");
+   if (flags & DRAW_PIPE_EDGE_FLAG_2)
+      debug_printf("EDGE_FLAG_2 ");
+   debug_printf("\n");
+}
+
+
 /* Unfilled tri:  
  *
  * Note edgeflags in the vertex struct is not sufficient as we will
@@ -117,8 +134,12 @@ static void unfilled_tri( struct draw_stage *stage,
 			  struct prim_header *header )
 {
    struct unfilled_stage *unfilled = unfilled_stage(stage);
-   unsigned mode = unfilled->mode[header->det >= 0.0];
+   unsigned cw = header->det >= 0.0;
+   unsigned mode = unfilled->mode[cw];
   
+   if (0)
+      print_header_flags(header->flags);
+
    switch (mode) {
    case PIPE_POLYGON_MODE_FILL:
       stage->next->tri( stage->next, header );
@@ -139,9 +160,10 @@ static void unfilled_first_tri( struct draw_stage *stage,
 				struct prim_header *header )
 {
    struct unfilled_stage *unfilled = unfilled_stage(stage);
+   const struct pipe_rasterizer_state *rast = stage->draw->rasterizer;
 
-   unfilled->mode[0] = stage->draw->rasterizer->fill_ccw; /* front */
-   unfilled->mode[1] = stage->draw->rasterizer->fill_cw;  /* back */
+   unfilled->mode[0] = rast->front_ccw ? rast->fill_front : rast->fill_back;
+   unfilled->mode[1] = rast->front_ccw ? rast->fill_back : rast->fill_front;
 
    stage->tri = unfilled_tri;
    stage->tri( stage, header );
@@ -180,9 +202,6 @@ struct draw_stage *draw_unfilled_stage( struct draw_context *draw )
    if (unfilled == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &unfilled->stage, 0 ))
-      goto fail;
-
    unfilled->stage.draw = draw;
    unfilled->stage.name = "unfilled";
    unfilled->stage.next = NULL;
@@ -194,6 +213,9 @@ struct draw_stage *draw_unfilled_stage( struct draw_context *draw )
    unfilled->stage.reset_stipple_counter = unfilled_reset_stipple_counter;
    unfilled->stage.destroy = unfilled_destroy;
 
+   if (!draw_alloc_temp_verts( &unfilled->stage, 0 ))
+      goto fail;
+
    return &unfilled->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c
index 2a50af7a414..c575a8ac7ca 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_validate.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c
@@ -122,12 +122,14 @@ draw_need_pipeline(const struct draw_context *draw,
          return TRUE;
 
       /* unfilled polygons */
-      if (rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
-          rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL)
+      if (rasterizer->fill_front != PIPE_POLYGON_MODE_FILL ||
+          rasterizer->fill_back != PIPE_POLYGON_MODE_FILL)
          return TRUE;
       
       /* polygon offset */
-      if (rasterizer->offset_cw || rasterizer->offset_ccw)
+      if (rasterizer->offset_point ||
+          rasterizer->offset_line ||
+          rasterizer->offset_tri)
          return TRUE;
 
       /* two-side lighting */
@@ -170,7 +172,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    wide_lines = (rast->line_width > draw->pipeline.wide_line_threshold
                  && !rast->line_smooth);
 
-   /* drawing large points? */
+   /* drawing large/sprite points (but not AA points)? */
    if (rast->sprite_coord_enable && draw->pipeline.point_sprite)
       wide_points = TRUE;
    else if (rast->point_smooth && draw->pipeline.aapoint)
@@ -205,7 +207,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
       precalc_flat = TRUE;
    }
 
-   if (wide_points || rast->sprite_coord_enable) {
+   if (wide_points) {
       draw->pipeline.wide_point->next = next;
       next = draw->pipeline.wide_point;
    }
@@ -222,8 +224,8 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
       next = draw->pipeline.pstipple;
    }
 
-   if (rast->fill_cw != PIPE_POLYGON_MODE_FILL ||
-       rast->fill_ccw != PIPE_POLYGON_MODE_FILL) {
+   if (rast->fill_front != PIPE_POLYGON_MODE_FILL ||
+       rast->fill_back != PIPE_POLYGON_MODE_FILL) {
       draw->pipeline.unfilled->next = next;
       next = draw->pipeline.unfilled;
       precalc_flat = TRUE;		/* only needed for triangles really */
@@ -235,8 +237,9 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
       next = draw->pipeline.flatshade;
    }
 	 
-   if (rast->offset_cw ||
-       rast->offset_ccw) {
+   if (rast->offset_point ||
+       rast->offset_line ||
+       rast->offset_tri) {
       draw->pipeline.offset->next = next;
       next = draw->pipeline.offset;
       need_det = TRUE;
@@ -255,14 +258,14 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
     * to less work emitting vertices, smaller vertex buffers, etc.
     * It's difficult to say whether this will be true in general.
     */
-   if (need_det || rast->cull_mode) {
+   if (need_det || rast->cull_face != PIPE_FACE_NONE) {
       draw->pipeline.cull->next = next;
       next = draw->pipeline.cull;
    }
 
    /* Clip stage
     */
-   if (!draw->bypass_clipping)
+   if (draw->clip_xy || draw->clip_z || draw->clip_user)
    {
       draw->pipeline.clip->next = next;
       next = draw->pipeline.clip;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index abbf6247ab8..58c5858734a 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -159,19 +159,8 @@ vbuf_tri( struct draw_stage *stage,
 
    check_space( vbuf, 3 );
 
-   if (vbuf->stage.draw->rasterizer->flatshade_first) {
-      /* Put provoking vertex in position expected by the driver.
-       * Emit last provoking vertex in first pos.
-       * Swap verts 0 & 1 to preserve polygon winding.
-       */
-      vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[2] );
-      vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[0] );
-      vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[1] );
-   }
-   else {
-      for (i = 0; i < 3; i++) {
-         vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] );
-      }
+   for (i = 0; i < 3; i++) {
+      vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] );
    }
 }
 
@@ -335,9 +324,9 @@ vbuf_flush_vertices( struct vbuf_stage *vbuf )
 
       if (vbuf->nr_indices) 
       {
-         vbuf->render->draw(vbuf->render, 
-                            vbuf->indices, 
-                            vbuf->nr_indices );
+         vbuf->render->draw_elements(vbuf->render, 
+                                     vbuf->indices, 
+                                     vbuf->nr_indices );
    
          vbuf->nr_indices = 0;
       }
@@ -364,9 +353,6 @@ vbuf_alloc_vertices( struct vbuf_stage *vbuf )
    /* Allocate a new vertex buffer */
    vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size;
 
-   /* even number */
-   vbuf->max_vertices = vbuf->max_vertices & ~1;
-
    if(vbuf->max_vertices >= UNDEFINED_VERTEX_ID)
       vbuf->max_vertices = UNDEFINED_VERTEX_ID - 1;
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
index ab167065815..98da9cfb999 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
@@ -77,8 +77,11 @@ static void wideline_line( struct draw_stage *stage,
    const float dx = fabsf(pos0[0] - pos2[0]);
    const float dy = fabsf(pos0[1] - pos2[1]);
 
+   const boolean gl_rasterization_rules =
+      stage->draw->rasterizer->gl_rasterization_rules;
+
    /* small tweak to meet GL specification */
-   const float bias = 0.125f;
+   const float bias = gl_rasterization_rules ? 0.125f : 0.0f;
 
    /*
     * Draw wide line as a quad (two tris) by "stretching" the line along
@@ -92,19 +95,21 @@ static void wideline_line( struct draw_stage *stage,
       pos1[1] = pos1[1] + half_width - bias;
       pos2[1] = pos2[1] - half_width - bias;
       pos3[1] = pos3[1] + half_width - bias;
-      if (pos0[0] < pos2[0]) {
-         /* left to right line */
-         pos0[0] -= 0.5f;
-         pos1[0] -= 0.5f;
-         pos2[0] -= 0.5f;
-         pos3[0] -= 0.5f;
-      }
-      else {
-         /* right to left line */
-         pos0[0] += 0.5f;
-         pos1[0] += 0.5f;
-         pos2[0] += 0.5f;
-         pos3[0] += 0.5f;
+      if (gl_rasterization_rules) {
+         if (pos0[0] < pos2[0]) {
+            /* left to right line */
+            pos0[0] -= 0.5f;
+            pos1[0] -= 0.5f;
+            pos2[0] -= 0.5f;
+            pos3[0] -= 0.5f;
+         }
+         else {
+            /* right to left line */
+            pos0[0] += 0.5f;
+            pos1[0] += 0.5f;
+            pos2[0] += 0.5f;
+            pos3[0] += 0.5f;
+         }
       }
    }
    else {
@@ -113,19 +118,21 @@ static void wideline_line( struct draw_stage *stage,
       pos1[0] = pos1[0] + half_width + bias;
       pos2[0] = pos2[0] - half_width + bias;
       pos3[0] = pos3[0] + half_width + bias;
-      if (pos0[1] < pos2[1]) {
-         /* top to bottom line */
-         pos0[1] -= 0.5f;
-         pos1[1] -= 0.5f;
-         pos2[1] -= 0.5f;
-         pos3[1] -= 0.5f;
-      }
-      else {
-         /* bottom to top line */
-         pos0[1] += 0.5f;
-         pos1[1] += 0.5f;
-         pos2[1] += 0.5f;
-         pos3[1] += 0.5f;
+      if (gl_rasterization_rules) {
+         if (pos0[1] < pos2[1]) {
+            /* top to bottom line */
+            pos0[1] -= 0.5f;
+            pos1[1] -= 0.5f;
+            pos2[1] -= 0.5f;
+            pos3[1] -= 0.5f;
+         }
+         else {
+            /* bottom to top line */
+            pos0[1] += 0.5f;
+            pos1[1] += 0.5f;
+            pos2[1] += 0.5f;
+            pos3[1] += 0.5f;
+         }
       }
    }
 
@@ -195,8 +202,8 @@ static void wideline_destroy( struct draw_stage *stage )
 struct draw_stage *draw_wide_line_stage( struct draw_context *draw )
 {
    struct wideline_stage *wide = CALLOC_STRUCT(wideline_stage);
-
-   draw_alloc_temp_verts( &wide->stage, 4 );
+   if (wide == NULL)
+      goto fail;
 
    wide->stage.draw = draw;
    wide->stage.name = "wide-line";
@@ -208,5 +215,14 @@ struct draw_stage *draw_wide_line_stage( struct draw_context *draw )
    wide->stage.reset_stipple_counter = wideline_reset_stipple_counter;
    wide->stage.destroy = wideline_destroy;
 
+   if (!draw_alloc_temp_verts( &wide->stage, 4 ))
+      goto fail;
+
    return &wide->stage;
+
+fail:
+   if (wide)
+      wide->stage.destroy( &wide->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index a86fe19586c..3646c6a7145 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -57,26 +57,24 @@
 #include "util/u_memory.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "draw_fs.h"
 #include "draw_vs.h"
 #include "draw_pipe.h"
 
 
 struct widepoint_stage {
-   struct draw_stage stage;
+   struct draw_stage stage;  /**< base class */
 
    float half_point_size;
 
    float xbias;
    float ybias;
 
-   uint texcoord_slot[PIPE_MAX_SHADER_OUTPUTS];
-   uint texcoord_enable[PIPE_MAX_SHADER_OUTPUTS];
-   uint num_texcoords;
-   uint texcoord_mode;
+   /** for automatic texcoord generation/replacement: the vertex slots that set_texcoords() overwrites */
+   uint num_texcoord_gen;   /**< number of valid entries in texcoord_gen_slot[] */
+   uint texcoord_gen_slot[PIPE_MAX_SHADER_OUTPUTS];   /**< each entry is used as the first index into vertex_header::data[] */
 
    int psize_slot;
-
-   int point_coord_fs_input;  /**< input for pointcoord */
 };
 
 
@@ -96,30 +94,20 @@ widepoint_stage( struct draw_stage *stage )
 static void set_texcoords(const struct widepoint_stage *wide,
                           struct vertex_header *v, const float tc[4])
 {
+   const struct draw_context *draw = wide->stage.draw;
+   const struct pipe_rasterizer_state *rast = draw->rasterizer;
+   const uint texcoord_mode = rast->sprite_coord_mode;   /* taken from draw->rasterizer on every call, no longer cached in the stage */
    uint i;
-   for (i = 0; i < wide->num_texcoords; i++) {
-      if (wide->texcoord_enable[i]) {
-         uint j = wide->texcoord_slot[i];
-         v->data[j][0] = tc[0];
-         if (wide->texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
-            v->data[j][1] = 1.0f - tc[1];
-         else
-            v->data[j][1] = tc[1];
-         v->data[j][2] = tc[2];
-         v->data[j][3] = tc[3];
-      }
-   }
 
-   if (wide->point_coord_fs_input >= 0) {
-      /* put gl_PointCoord into the extra vertex slot */
-      uint slot = wide->stage.draw->extra_shader_outputs.slot;
+   for (i = 0; i < wide->num_texcoord_gen; i++) {   /* overwrite every slot recorded in texcoord_gen_slot[] with tc */
+      const uint slot = wide->texcoord_gen_slot[i];
       v->data[slot][0] = tc[0];
-      if (wide->texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
+      if (texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)   /* lower-left mode stores 1 - tc[1] instead of tc[1] */
          v->data[slot][1] = 1.0f - tc[1];
       else
          v->data[slot][1] = tc[1];
-      v->data[slot][2] = 0.0F;
-      v->data[slot][3] = 1.0F;
+      v->data[slot][2] = tc[2];   /* components 2 and 3 now come from tc rather than the fixed 0/1 */
+      v->data[slot][3] = tc[3];
    }
 }
 
@@ -201,18 +189,9 @@ static void widepoint_point( struct draw_stage *stage,
 }
 
 
-static int
-find_pntc_input_attrib(struct draw_context *draw)
-{
-   /* Scan the fragment program's input decls to find the pointcoord
-    * attribute.  The xy components will store the point coord.
-    */
-   return 0; /* XXX fix this */
-}
-
-
-static void widepoint_first_point( struct draw_stage *stage, 
-			      struct prim_header *header )
+static void
+widepoint_first_point(struct draw_stage *stage, 
+                      struct prim_header *header)
 {
    struct widepoint_stage *wide = widepoint_stage(stage);
    struct draw_context *draw = stage->draw;
@@ -226,6 +205,7 @@ static void widepoint_first_point( struct draw_stage *stage,
 
    if (rast->gl_rasterization_rules) {
       wide->xbias = 0.125;
+      wide->ybias = -0.125;
    }
 
    /* Disable triangle culling, stippling, unfilled mode etc. */
@@ -243,31 +223,49 @@ static void widepoint_first_point( struct draw_stage *stage,
       stage->point = draw_pipe_passthrough_point;
    }
 
+   draw_remove_extra_vertex_attribs(draw);
+
    if (rast->point_quad_rasterization) {
-      /* find vertex shader texcoord outputs */
-      const struct draw_vertex_shader *vs = draw->vs.vertex_shader;
-      uint i, j = 0;
-      wide->texcoord_mode = rast->sprite_coord_mode;
-      for (i = 0; i < vs->info.num_outputs; i++) {
-         if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) {
-            wide->texcoord_slot[j] = i;
-            wide->texcoord_enable[j] = (rast->sprite_coord_enable >> j) & 1;
-            j++;
+      const struct draw_fragment_shader *fs = draw->fs.fragment_shader;
+      uint i;
+
+      wide->num_texcoord_gen = 0;
+
+      /* Loop over fragment shader inputs looking for generic inputs
+       * for which bit 'k' in sprite_coord_enable is set.
+       */
+      for (i = 0; i < fs->info.num_inputs; i++) {
+         if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_GENERIC) {
+            const int generic_index = fs->info.input_semantic_index[i];
+            /* Note that sprite_coord enable is a bitfield of
+             * PIPE_MAX_SHADER_OUTPUTS bits.
+             */
+            if (generic_index < PIPE_MAX_SHADER_OUTPUTS &&
+                (rast->sprite_coord_enable & (1 << generic_index))) {
+               /* OK, this generic attribute needs to be replaced with a
+                * texcoord (see above).
+                */
+               int slot = draw_find_shader_output(draw,
+                                                  TGSI_SEMANTIC_GENERIC,
+                                                  generic_index);
+
+               if (slot > 0) {
+                  /* there's already a post-vertex shader attribute
+                   * for this fragment shader input attribute.
+                   */
+               }
+               else {
+                  /* need to allocate a new post-vertex shader attribute */
+                  slot = draw_alloc_extra_vertex_attrib(draw,
+                                                        TGSI_SEMANTIC_GENERIC,
+                                                        generic_index);
+               }
+
+               /* add this slot to the texcoord-gen list */
+               wide->texcoord_gen_slot[wide->num_texcoord_gen++] = slot;
+            }
          }
       }
-      wide->num_texcoords = j;
-
-      /* find fragment shader PointCoord input */
-      wide->point_coord_fs_input = find_pntc_input_attrib(draw);
-
-      /* setup extra vp output (point coord implemented as a texcoord) */
-      draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
-      draw->extra_shader_outputs.semantic_index = 0;
-      draw->extra_shader_outputs.slot = draw_current_shader_outputs(draw);
-   }
-   else {
-      wide->point_coord_fs_input = -1;
-      draw->extra_shader_outputs.slot = 0;
    }
 
    wide->psize_slot = -1;
@@ -294,7 +292,8 @@ static void widepoint_flush( struct draw_stage *stage, unsigned flags )
 
    stage->point = widepoint_first_point;
    stage->next->flush( stage->next, flags );
-   stage->draw->extra_shader_outputs.slot = 0;
+
+   draw_remove_extra_vertex_attribs(draw);
 
    /* restore original rasterizer state */
    if (draw->rast_handle) {
@@ -324,9 +323,6 @@ struct draw_stage *draw_wide_point_stage( struct draw_context *draw )
    if (wide == NULL)
       goto fail;
 
-   if (!draw_alloc_temp_verts( &wide->stage, 4 ))
-      goto fail;
-
    wide->stage.draw = draw;
    wide->stage.name = "wide-point";
    wide->stage.next = NULL;
@@ -337,6 +333,9 @@ struct draw_stage *draw_wide_point_stage( struct draw_context *draw )
    wide->stage.reset_stipple_counter = widepoint_reset_stipple_counter;
    wide->stage.destroy = widepoint_destroy;
 
+   if (!draw_alloc_temp_verts( &wide->stage, 4 ))
+      goto fail;
+
    return &wide->stage;
 
  fail:
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index a2bfb693c09..d417f825a0f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -48,6 +48,7 @@
 
 #ifdef HAVE_LLVM
 #include <llvm-c/ExecutionEngine.h>
+struct draw_llvm;
 #endif
 
 
@@ -81,6 +82,9 @@ struct vertex_header {
 #define UNDEFINED_VERTEX_ID 0xffff
 
 
+/* maximum number of shader variants we can cache */
+#define DRAW_MAX_SHADER_VARIANTS 1024
+
 /**
  * Private context for the drawing module.
  */
@@ -136,8 +140,7 @@ struct draw_context
       } middle;
 
       struct {
-         struct draw_pt_front_end *vcache;
-         struct draw_pt_front_end *varray;
+         struct draw_pt_front_end *vsplit;
       } front;
 
       struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
@@ -146,6 +149,8 @@ struct draw_context
       struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
       unsigned nr_vertex_elements;
 
+      struct pipe_index_buffer index_buffer;
+
       /* user-space vertex data, buffers */
       struct {
          /** vertex element/index buffer (ex: glDrawElements) */
@@ -159,9 +164,11 @@ struct draw_context
          /** vertex arrays */
          const void *vbuffer[PIPE_MAX_ATTRIBS];
          
-         /** constant buffer (for vertex/geometry shader) */
+         /** constant buffers (for vertex/geometry shader) */
          const void *vs_constants[PIPE_MAX_CONSTANT_BUFFERS];
+         unsigned vs_constants_size[PIPE_MAX_CONSTANT_BUFFERS];
          const void *gs_constants[PIPE_MAX_CONSTANT_BUFFERS];
+         unsigned gs_constants_size[PIPE_MAX_CONSTANT_BUFFERS];
       } user;
 
       boolean test_fse;         /* enable FSE even though its not correct (eg for softpipe) */
@@ -169,13 +176,19 @@ struct draw_context
    } pt;
 
    struct {
-      boolean bypass_clipping;
-      boolean bypass_vs;
+      boolean bypass_clip_xy;
+      boolean bypass_clip_z;
    } driver;
 
    boolean flushing;         /**< debugging/sanity */
    boolean suspend_flushing; /**< internally set */
-   boolean bypass_clipping;  /**< set if either api or driver bypass_clipping true */
+
+   /* Flags set if API requires clipping in these planes and the
+    * driver doesn't indicate that it can do it for us.
+    */
+   boolean clip_xy;
+   boolean clip_z;
+   boolean clip_user;
 
    boolean force_passthrough; /**< never clip or shade */
 
@@ -194,6 +207,7 @@ struct draw_context
    struct pipe_viewport_state viewport;
    boolean identity_viewport;
 
+   /** Vertex shader state */
    struct {
       struct draw_vertex_shader *vertex_shader;
       uint num_vs_outputs;  /**< convenience, from vertex_shader */
@@ -223,6 +237,7 @@ struct draw_context
       struct translate_cache *emit_cache;
    } vs;
 
+   /** Geometry shader state */
    struct {
       struct draw_geometry_shader *geometry_shader;
       uint num_gs_outputs;  /**< convenience, from geometry_shader */
@@ -235,17 +250,31 @@ struct draw_context
       struct tgsi_sampler **samplers;
    } gs;
 
+   /** Fragment shader state */
+   struct {
+      struct draw_fragment_shader *fragment_shader;
+   } fs;
+
+   /** Stream output (vertex feedback) state */
+   struct {
+      struct pipe_stream_output_state state;
+      void *buffers[PIPE_MAX_SO_BUFFERS];
+      uint num_buffers;
+   } so;
+
    /* Clip derived state:
     */
    float plane[12][4];
    unsigned nr_planes;
+   boolean depth_clamp;
 
    /* If a prim stage introduces new vertex attributes, they'll be stored here
     */
    struct {
-      uint semantic_name;
-      uint semantic_index;
-      int slot;
+      uint num;
+      uint semantic_name[10];
+      uint semantic_index[10];
+      uint slot[10];
    } extra_shader_outputs;
 
    unsigned reduced_prim;
@@ -253,12 +282,51 @@ struct draw_context
    unsigned instance_id;
 
 #ifdef HAVE_LLVM
+   struct draw_llvm *llvm;
    LLVMExecutionEngineRef engine;
 #endif
 
+   struct pipe_sampler_view *sampler_views[PIPE_MAX_VERTEX_SAMPLERS];
+   unsigned num_sampler_views;
+   const struct pipe_sampler_state *samplers[PIPE_MAX_VERTEX_SAMPLERS];
+   unsigned num_samplers;
+
    void *driver_private;
 };
 
+
+struct draw_fetch_info {   /* NOTE(review): no user of this struct is visible in this part of the change; field notes below are assumptions to confirm */
+   boolean linear;   /* presumably selects between a start/count range and the elts[] list -- TODO confirm */
+   unsigned start;   /* presumably only meaningful when linear is set -- TODO confirm */
+   const unsigned *elts;   /* note: 32-bit entries here, whereas draw_prim_info::elts is ushort */
+   unsigned count;
+};
+
+struct draw_vertex_info {   /* passed as 'vert' to draw_pipeline_run() and draw_pipeline_run_linear() in place of separate vertex arguments */
+   struct vertex_header *verts;
+   unsigned vertex_size;   /* NOTE(review): relationship to 'stride' is not shown here -- confirm with the callers */
+   unsigned stride;
+   unsigned count;   /* presumably the number of vertices in verts -- TODO confirm */
+};
+
+/* these flags are set if the primitive is a segment of a larger one */
+#define DRAW_SPLIT_BEFORE 0x1
+#define DRAW_SPLIT_AFTER  0x2
+
+struct draw_prim_info {   /* passed as 'prim' to draw_pipeline_run() and draw_pipeline_run_linear() */
+   boolean linear;
+   unsigned start;
+
+   const ushort *elts;   /* 16-bit entries, same element type as the elts argument of the old draw_pipeline_run() */
+   unsigned count;
+
+   unsigned prim;
+   unsigned flags;   /* presumably a mask of DRAW_SPLIT_BEFORE / DRAW_SPLIT_AFTER -- TODO confirm */
+   unsigned *primitive_lengths;   /* NOTE(review): presumably primitive_count entries long -- confirm */
+   unsigned primitive_count;
+};
+
+
 /*******************************************************************************
  * Draw common initialization code
  */
@@ -300,6 +368,11 @@ void draw_gs_destroy( struct draw_context *draw );
 uint draw_current_shader_outputs(const struct draw_context *draw);
 uint draw_current_shader_position_output(const struct draw_context *draw);
 
+int draw_alloc_extra_vertex_attrib(struct draw_context *draw,
+                                   uint semantic_name, uint semantic_index);
+void draw_remove_extra_vertex_attribs(struct draw_context *draw);
+
+
 /*******************************************************************************
  * Vertex processing (was passthrough) code:
  */
@@ -319,35 +392,24 @@ void draw_pipeline_destroy( struct draw_context *draw );
 
 
 
-/* We use the top few bits in the elts[] parameter to convey a little
- * API information.  This limits the number of vertices we can address
- * to only 4096 -- if that becomes a problem, we can switch to 32-bit
- * draw indices.
- *
- * These flags expected at first vertex of lines & triangles when
- * unfilled and/or line stipple modes are operational.
+/*
+ * These flags are used by the pipeline when unfilled and/or line stipple modes
+ * are operational.
  */
-#define DRAW_PIPE_MAX_VERTICES  (0x1<<12)
-#define DRAW_PIPE_EDGE_FLAG_0   (0x1<<12)
-#define DRAW_PIPE_EDGE_FLAG_1   (0x2<<12)
-#define DRAW_PIPE_EDGE_FLAG_2   (0x4<<12)
-#define DRAW_PIPE_EDGE_FLAG_ALL (0x7<<12)
-#define DRAW_PIPE_RESET_STIPPLE (0x8<<12)
-#define DRAW_PIPE_FLAG_MASK     (0xf<<12)
+#define DRAW_PIPE_EDGE_FLAG_0   0x1
+#define DRAW_PIPE_EDGE_FLAG_1   0x2
+#define DRAW_PIPE_EDGE_FLAG_2   0x4
+#define DRAW_PIPE_EDGE_FLAG_ALL 0x7
+#define DRAW_PIPE_RESET_STIPPLE 0x8
 
 void draw_pipeline_run( struct draw_context *draw,
-                        unsigned prim,
-                        struct vertex_header *vertices,
-                        unsigned vertex_count,
-                        unsigned stride,
-                        const ushort *elts,
-                        unsigned count );
+                        const struct draw_vertex_info *vert,
+                        const struct draw_prim_info *prim);
 
 void draw_pipeline_run_linear( struct draw_context *draw,
-                               unsigned prim,
-                               struct vertex_header *vertices,
-                               unsigned count,
-                               unsigned stride );
+                               const struct draw_vertex_info *vert,
+                               const struct draw_prim_info *prim);
+
 
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index b853f3a89f8..f44bf2507c6 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -31,30 +31,22 @@
   */
 
 #include "draw/draw_context.h"
+#include "draw/draw_gs.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
+#include "draw/draw_vs.h"
 #include "tgsi/tgsi_dump.h"
 #include "util/u_math.h"
 #include "util/u_prim.h"
+#include "util/u_format.h"
+#include "util/u_draw.h"
 
 
 DEBUG_GET_ONCE_BOOL_OPTION(draw_fse, "DRAW_FSE", FALSE)
 DEBUG_GET_ONCE_BOOL_OPTION(draw_no_fse, "DRAW_NO_FSE", FALSE)
-#ifdef HAVE_LLVM
-DEBUG_GET_ONCE_BOOL_OPTION(draw_use_llvm, "DRAW_USE_LLVM", TRUE)
-#endif
-
-static unsigned trim( unsigned count, unsigned first, unsigned incr )
-{
-   if (count < first)
-      return 0;
-   return count - (count - first) % incr; 
-}
-
-
 
 /* Overall we split things into:
- *     - frontend -- prepare fetch_elts, draw_elts - eg vcache
+ *     - frontend -- prepare fetch_elts, draw_elts - eg vsplit
  *     - middle   -- fetch, shade, cliptest, viewport
  *     - pipeline -- the prim pipeline: clipping, wide lines, etc 
  *     - backend  -- the vbuf_render provided by the driver.
@@ -74,29 +66,35 @@ draw_pt_arrays(struct draw_context *draw,
    {
       unsigned first, incr;
       draw_pt_split_prim(prim, &first, &incr);
-      count = trim(count, first, incr); 
+      count = draw_pt_trim_count(count, first, incr);
       if (count < first)
          return TRUE;
    }
 
    if (!draw->force_passthrough) {
+      unsigned gs_out_prim = (draw->gs.geometry_shader ? 
+                              draw->gs.geometry_shader->output_primitive :
+                              prim);
+
       if (!draw->render) {
          opt |= PT_PIPELINE;
       }
-      
+
       if (draw_need_pipeline(draw,
                              draw->rasterizer,
-                             prim)) {
+                             gs_out_prim)) {
          opt |= PT_PIPELINE;
       }
 
-      if (!draw->bypass_clipping && !draw->pt.test_fse) {
+      if ((draw->clip_xy ||
+           draw->clip_z ||
+           draw->clip_user) && !draw->pt.test_fse) {
          opt |= PT_CLIPTEST;
       }
-      
+
       opt |= PT_SHADE;
    }
-      
+
    if (draw->pt.middle.llvm) {
       middle = draw->pt.middle.llvm;
    } else {
@@ -108,22 +106,11 @@ draw_pt_arrays(struct draw_context *draw,
          middle = draw->pt.middle.general;
    }
 
-
-   /* Pick the right frontend
-    */
-   if (draw->pt.user.elts || (opt & PT_PIPELINE)) {
-      frontend = draw->pt.front.vcache;
-   } else {
-      frontend = draw->pt.front.varray;
-   }
+   frontend = draw->pt.front.vsplit;
 
    frontend->prepare( frontend, prim, middle, opt );
 
-   frontend->run(frontend, 
-                 draw_pt_elt_func(draw),
-                 draw_pt_elt_ptr(draw, start),
-                 draw->pt.user.eltBias,
-                 count);
+   frontend->run(frontend, start, count);
 
    frontend->finish( frontend );
 
@@ -136,12 +123,8 @@ boolean draw_pt_init( struct draw_context *draw )
    draw->pt.test_fse = debug_get_option_draw_fse();
    draw->pt.no_fse = debug_get_option_draw_no_fse();
 
-   draw->pt.front.vcache = draw_pt_vcache( draw );
-   if (!draw->pt.front.vcache)
-      return FALSE;
-
-   draw->pt.front.varray = draw_pt_varray(draw);
-   if (!draw->pt.front.varray)
+   draw->pt.front.vsplit = draw_pt_vsplit(draw);
+   if (!draw->pt.front.vsplit)
       return FALSE;
 
    draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw );
@@ -157,7 +140,7 @@ boolean draw_pt_init( struct draw_context *draw )
       return FALSE;
 
 #if HAVE_LLVM
-   if (debug_get_option_draw_use_llvm())
+   if (draw->llvm)
       draw->pt.middle.llvm = draw_pt_fetch_pipeline_or_emit_llvm( draw );
 #endif
 
@@ -187,14 +170,9 @@ void draw_pt_destroy( struct draw_context *draw )
       draw->pt.middle.fetch_shade_emit = NULL;
    }
 
-   if (draw->pt.front.vcache) {
-      draw->pt.front.vcache->destroy( draw->pt.front.vcache );
-      draw->pt.front.vcache = NULL;
-   }
-
-   if (draw->pt.front.varray) {
-      draw->pt.front.varray->destroy( draw->pt.front.varray );
-      draw->pt.front.varray = NULL;
+   if (draw->pt.front.vsplit) {
+      draw->pt.front.vsplit->destroy( draw->pt.front.vsplit );
+      draw->pt.front.vsplit = NULL;
    }
 }
 
@@ -214,24 +192,29 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
       uint ii = 0;
       uint j;
 
-      if (draw->pt.user.elts) {
+      if (draw->pt.user.eltSize) {
+         const char *elts;
+
          /* indexed arrays */
+         elts = (const char *) draw->pt.user.elts;
+         elts += draw->pt.index_buffer.offset;
+
          switch (draw->pt.user.eltSize) {
          case 1:
             {
-               const ubyte *elem = (const ubyte *) draw->pt.user.elts;
+               const ubyte *elem = (const ubyte *) elts;
                ii = elem[start + i];
             }
             break;
          case 2:
             {
-               const ushort *elem = (const ushort *) draw->pt.user.elts;
+               const ushort *elem = (const ushort *) elts;
                ii = elem[start + i];
             }
             break;
          case 4:
             {
-               const uint *elem = (const uint *) draw->pt.user.elts;
+               const uint *elem = (const uint *) elts;
                ii = elem[start + i];
             }
             break;
@@ -252,6 +235,12 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
       for (j = 0; j < draw->pt.nr_vertex_elements; j++) {
          uint buf = draw->pt.vertex_element[j].vertex_buffer_index;
          ubyte *ptr = (ubyte *) draw->pt.user.vbuffer[buf];
+
+         if (draw->pt.vertex_element[j].instance_divisor) {
+            ii = draw->instance_id / draw->pt.vertex_element[j].instance_divisor;
+         }
+
+         ptr += draw->pt.vertex_buffer[buf].buffer_offset;
          ptr += draw->pt.vertex_buffer[buf].stride * ii;
          ptr += draw->pt.vertex_element[j].src_offset;
 
@@ -260,31 +249,38 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
          case PIPE_FORMAT_R32_FLOAT:
             {
                float *v = (float *) ptr;
-               debug_printf("%f  @ %p\n", v[0], (void *) v);
+               debug_printf("R %f  @ %p\n", v[0], (void *) v);
             }
             break;
          case PIPE_FORMAT_R32G32_FLOAT:
             {
                float *v = (float *) ptr;
-               debug_printf("%f %f  @ %p\n", v[0], v[1], (void *) v);
+               debug_printf("RG %f %f  @ %p\n", v[0], v[1], (void *) v);
             }
             break;
          case PIPE_FORMAT_R32G32B32_FLOAT:
             {
                float *v = (float *) ptr;
-               debug_printf("%f %f %f  @ %p\n", v[0], v[1], v[2], (void *) v);
+               debug_printf("RGB %f %f %f  @ %p\n", v[0], v[1], v[2], (void *) v);
             }
             break;
          case PIPE_FORMAT_R32G32B32A32_FLOAT:
             {
                float *v = (float *) ptr;
-               debug_printf("%f %f %f %f  @ %p\n", v[0], v[1], v[2], v[3],
+               debug_printf("RGBA %f %f %f %f  @ %p\n", v[0], v[1], v[2], v[3],
                             (void *) v);
             }
             break;
+         case PIPE_FORMAT_B8G8R8A8_UNORM:
+            {
+               ubyte *u = (ubyte *) ptr;
+               debug_printf("BGRA %d %d %d %d  @ %p\n", u[0], u[1], u[2], u[3],
+                            (void *) u);
+            }
+            break;
          default:
-            debug_printf("other format (fix me)\n");
-            ;
+            debug_printf("other format %s (fix me)\n",
+                     util_format_name(draw->pt.vertex_element[j].src_format));
          }
       }
    }
@@ -292,11 +288,8 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count)
 
 
 /**
- * Draw vertex arrays
- * This is the main entrypoint into the drawing module.
- * \param prim  one of PIPE_PRIM_x
- * \param start  index of first vertex to draw
- * \param count  number of vertices to draw
+ * Non-instanced drawing.
+ * \sa draw_arrays_instanced
  */
 void
 draw_arrays(struct draw_context *draw, unsigned prim,
@@ -305,6 +298,11 @@ draw_arrays(struct draw_context *draw, unsigned prim,
    draw_arrays_instanced(draw, prim, start, count, 0, 1);
 }
 
+
+/**
+ * Instanced drawing.
+ * \sa draw_vbo
+ */
 void
 draw_arrays_instanced(struct draw_context *draw,
                       unsigned mode,
@@ -313,40 +311,90 @@ draw_arrays_instanced(struct draw_context *draw,
                       unsigned startInstance,
                       unsigned instanceCount)
 {
-   unsigned reduced_prim = u_reduced_prim(mode);
+   struct pipe_draw_info info;
+
+   util_draw_init_info(&info);
+
+   info.mode = mode;
+   info.start = start;
+   info.count = count;
+   info.start_instance = startInstance;
+   info.instance_count = instanceCount;
+
+   info.indexed = (draw->pt.user.elts != NULL);
+   if (!info.indexed) {
+      info.min_index = start;
+      info.max_index = start + count - 1;
+   }
+
+   draw_vbo(draw, &info);
+}
+
+
+/**
+ * Draw vertex arrays.
+ * This is the main entrypoint into the drawing module.  If drawing an indexed
+ * primitive, the draw_set_index_buffer() and draw_set_mapped_index_buffer()
+ * functions should have already been called to specify the element/index
+ * buffer information.
+ */
+void
+draw_vbo(struct draw_context *draw,
+         const struct pipe_draw_info *info)
+{
+   unsigned reduced_prim = u_reduced_prim(info->mode);
    unsigned instance;
 
+   assert(info->instance_count > 0);
+   if (info->indexed)
+      assert(draw->pt.user.elts);
+
+   draw->pt.user.eltSize =
+      (info->indexed) ? draw->pt.index_buffer.index_size : 0;
+
+   draw->pt.user.eltBias = info->index_bias;
+   draw->pt.user.min_index = info->min_index;
+   draw->pt.user.max_index = info->max_index;
+
    if (reduced_prim != draw->reduced_prim) {
       draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
       draw->reduced_prim = reduced_prim;
    }
 
    if (0)
-      draw_print_arrays(draw, mode, start, MIN2(count, 20));
+      debug_printf("draw_vbo(mode=%u start=%u count=%u):\n",
+                   info->mode, info->start, info->count);
 
-#if 0
-   {
-      int i;
-      debug_printf("draw_arrays(mode=%u start=%u count=%u):\n",
-                   mode, start, count);
+   if (0)
       tgsi_dump(draw->vs.vertex_shader->state.tokens, 0);
+
+   if (0) {
+      unsigned int i;
       debug_printf("Elements:\n");
       for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
-         debug_printf("  format=%s\n",
+         debug_printf("  %u: src_offset=%u  inst_div=%u   vbuf=%u  format=%s\n",
+                      i,
+                      draw->pt.vertex_element[i].src_offset,
+                      draw->pt.vertex_element[i].instance_divisor,
+                      draw->pt.vertex_element[i].vertex_buffer_index,
                       util_format_name(draw->pt.vertex_element[i].src_format));
       }
       debug_printf("Buffers:\n");
       for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
-         debug_printf("  stride=%u offset=%u ptr=%p\n",
+         debug_printf("  %u: stride=%u maxindex=%u offset=%u ptr=%p\n",
+                      i,
                       draw->pt.vertex_buffer[i].stride,
+                      draw->pt.vertex_buffer[i].max_index,
                       draw->pt.vertex_buffer[i].buffer_offset,
                       draw->pt.user.vbuffer[i]);
       }
    }
-#endif
 
-   for (instance = 0; instance < instanceCount; instance++) {
-      draw->instance_id = instance + startInstance;
-      draw_pt_arrays(draw, mode, start, count);
+   if (0)
+      draw_print_arrays(draw, info->mode, info->start, MIN2(info->count, 20));
+
+   for (instance = 0; instance < info->instance_count; instance++) {
+      draw->instance_id = instance + info->start_instance;
+      draw_pt_arrays(draw, info->mode, info->start, info->count);
    }
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index 3e3ea320cc0..5fbb4242915 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -35,10 +35,10 @@
 
 #include "pipe/p_compiler.h"
 
-typedef unsigned (*pt_elt_func)( const void *elts, unsigned idx );
-
 struct draw_pt_middle_end;
 struct draw_context;
+struct draw_prim_info;
+struct draw_vertex_info;
 
 
 #define PT_SHADE      0x1
@@ -50,13 +50,18 @@ struct draw_context;
 /* The "front end" - prepare sets of fetch, draw elements for the
  * middle end.
  *
- * Currenly one version of this:
- *    - vcache - catchall implementation, decomposes to TRI/LINE/POINT prims
- * Later:
- *    - varray, varray_split
- *    - velement, velement_split
+ * The fetch elements are indices to the vertices.  The draw elements are
+ * indices to the fetched vertices.  When both arrays of elements are
+ * linear, middle->run_linear is called;  When only the fetch elements are
+ * linear, middle->run_linear_elts is called;  Otherwise, middle->run is
+ * called.
+ *
+ * When the number of the draw elements exceeds max_vertex of the middle end,
+ * the draw elements (as well as the fetch elements) are split and the
+ * middle end is called multiple times.
  *
- * Currenly only using the vcache version.
+ * Currently there is:
+ *    - vsplit - catchall implementation, splits big prims
  */
 struct draw_pt_front_end {
    void (*prepare)( struct draw_pt_front_end *,
@@ -65,9 +70,7 @@ struct draw_pt_front_end {
 		    unsigned opt );
 
    void (*run)( struct draw_pt_front_end *,
-                pt_elt_func elt_func,
-                const void *elt_ptr,
-                int elt_bias,
+                unsigned start,
                 unsigned count );
 
    void (*finish)( struct draw_pt_front_end * );
@@ -78,6 +81,8 @@ struct draw_pt_front_end {
 /* The "middle end" - prepares actual hardware vertices for the
  * hardware backend.
  *
+ * prim_flags is as defined by pipe_draw_info::flags.
+ *
  * Currently two versions of this:
  *     - fetch, vertex shade, cliptest, prim-pipeline
  *     - fetch, emit (ie passthrough)
@@ -92,11 +97,13 @@ struct draw_pt_middle_end {
                 const unsigned *fetch_elts,
                 unsigned fetch_count,
                 const ushort *draw_elts,
-                unsigned draw_count );
+                unsigned draw_count,
+                unsigned prim_flags );
 
    void (*run_linear)(struct draw_pt_middle_end *,
                       unsigned start,
-                      unsigned count);
+                      unsigned count,
+                      unsigned prim_flags );
 
    /* Transform all vertices in a linear range and then draw them with
     * the supplied element list.  May fail and return FALSE.
@@ -105,7 +112,8 @@ struct draw_pt_middle_end {
                             unsigned fetch_start,
                             unsigned fetch_count,
                             const ushort *draw_elts,
-                            unsigned draw_count );
+                            unsigned draw_count,
+                            unsigned prim_flags );
 
    int (*get_max_vertex_count)( struct draw_pt_middle_end * );
 
@@ -120,19 +128,11 @@ struct vbuf_render;
 struct vertex_header;
 
 
-/* Helper functions.
- */
-pt_elt_func draw_pt_elt_func( struct draw_context *draw );
-const void *draw_pt_elt_ptr( struct draw_context *draw,
-                             unsigned start );
-
 /* Frontends: 
  *
- * Currently only the general-purpose vcache implementation, could add
- * a special case for tiny vertex buffers.
+ * Currently only the general-purpose vsplit implementation.
  */
-struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw );
-struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw);
+struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw);
 
 
 /* Middle-ends:
@@ -162,21 +162,31 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
                            unsigned *max_vertices );
 
 void draw_pt_emit( struct pt_emit *emit,
-		   const float (*vertex_data)[4],
-		   unsigned vertex_count,
-		   unsigned stride,
-		   const ushort *elts,
-		   unsigned count );
+                   const struct draw_vertex_info *vert_info,
+                   const struct draw_prim_info *prim_info);
 
 void draw_pt_emit_linear( struct pt_emit *emit,
-                          const float (*vertex_data)[4],
-                          unsigned stride,
-                          unsigned count );
+                          const struct draw_vertex_info *vert_info,
+                          const struct draw_prim_info *prim_info);
 
 void draw_pt_emit_destroy( struct pt_emit *emit );
 
 struct pt_emit *draw_pt_emit_create( struct draw_context *draw );
 
+/*******************************************************************************
+ * HW stream output emit:
+ */
+struct pt_so_emit;
+
+void draw_pt_so_emit_prepare( struct pt_so_emit *emit );
+
+void draw_pt_so_emit( struct pt_so_emit *emit,
+                      const struct draw_vertex_info *vert_info,
+                      const struct draw_prim_info *prim_info );
+
+void draw_pt_so_emit_destroy( struct pt_so_emit *emit );
+
+struct pt_so_emit *draw_pt_so_emit_create( struct draw_context *draw );
 
 /*******************************************************************************
  * API vertex fetch:
@@ -208,12 +218,12 @@ struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw );
 struct pt_post_vs;
 
 boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
-			     struct vertex_header *pipeline_verts,
-			     unsigned stride,
-			     unsigned count );
+			     struct draw_vertex_info *info );
 
 void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
-			      boolean bypass_clipping,
+			      boolean clip_xy,
+			      boolean clip_z,
+			      boolean clip_user,
 			      boolean bypass_viewport,
 			      boolean opengl,
 			      boolean need_edgeflags );
@@ -227,6 +237,7 @@ void draw_pt_post_vs_destroy( struct pt_post_vs *pvs );
  * Utils: 
  */
 void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr);
+unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr);
 
 
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_pt_decompose.h b/src/gallium/auxiliary/draw/draw_pt_decompose.h
index 3c44f7c11ee..3127aad7310 100644
--- a/src/gallium/auxiliary/draw/draw_pt_decompose.h
+++ b/src/gallium/auxiliary/draw/draw_pt_decompose.h
@@ -1,162 +1,7 @@
+#define LOCAL_VARS                           \
+   char *verts = (char *) vertices;          \
+   const boolean last_vertex_last =          \
+      !(draw->rasterizer->flatshade &&       \
+        draw->rasterizer->flatshade_first);
 
-
-static void FUNC( ARGS,
-                  unsigned count )
-{
-   LOCAL_VARS;
-
-   switch (prim) {
-   case PIPE_PRIM_POINTS:
-      for (i = 0; i < count; i ++) {
-	 POINT( (i + 0) );
-      }
-      break;
-
-   case PIPE_PRIM_LINES:
-      for (i = 0; i+1 < count; i += 2) {
-         LINE( DRAW_PIPE_RESET_STIPPLE,
-               (i + 0),
-               (i + 1));
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:
-      if (count >= 2) {
-         flags = DRAW_PIPE_RESET_STIPPLE;
-
-         for (i = 1; i < count; i++, flags = 0) {
-            LINE( flags,
-                  (i - 1),
-                  (i ));
-         }
-
-	 LINE( flags,
-               (i - 1),
-               (0 ));
-      }
-      break;
-
-   case PIPE_PRIM_LINE_STRIP:
-      flags = DRAW_PIPE_RESET_STIPPLE;
-      for (i = 1; i < count; i++, flags = 0) {
-         LINE( flags,
-               (i - 1),
-               (i ));
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLES:
-      for (i = 0; i+2 < count; i += 3) {
-         if (flatfirst) {
-            /* put provoking vertex in last pos for clipper */
-            TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                      (i + 1),
-                      (i + 2),
-                      (i + 0 ));
-         }
-         else {
-            TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                      (i + 0),
-                      (i + 1),
-                      (i + 2 ));
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      if (flatfirst) {
-         for (i = 0; i+2 < count; i++) {
-            TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                      (i + 1 + (i&1)),
-                      (i + 2 - (i&1)),
-                      (i + 0) );
-         }
-      }
-      else {
-         for (i = 0; i+2 < count; i++) {
-            TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                      (i + 0 + (i&1)),
-                      (i + 1 - (i&1)),
-                      (i + 2 ));
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_FAN:
-      if (count >= 3) {
-         if (flatfirst) {
-            for (i = 0; i+2 < count; i++) {
-               TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                         (i + 2),
-                         0,
-                         (i + 1) );
-            }
-         }
-         else {
-            for (i = 0; i+2 < count; i++) {
-               TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                         (0),
-                         (i + 1),
-                         (i + 2 ));
-            }
-         }
-      }
-      break;
-
-
-   case PIPE_PRIM_QUADS:
-      for (i = 0; i+3 < count; i += 4) {
-         QUAD( (i + 0),
-               (i + 1),
-               (i + 2),
-               (i + 3));
-      }
-      break;
-
-   case PIPE_PRIM_QUAD_STRIP:
-      for (i = 0; i+3 < count; i += 2) {
-         QUAD( (i + 2),
-               (i + 0),
-               (i + 1),
-               (i + 3));
-      }
-      break;
-
-   case PIPE_PRIM_POLYGON:
-      {
-         /* These bitflags look a little odd because we submit the
-          * vertices as (1,2,0) to satisfy flatshade requirements.
-          */
-         const ushort edge_first  = DRAW_PIPE_EDGE_FLAG_2;
-         const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0;
-         const ushort edge_last   = DRAW_PIPE_EDGE_FLAG_1;
-
-         flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle;
-
-	 for (i = 0; i+2 < count; i++, flags = edge_middle) {
-
-            if (i + 3 == count)
-               flags |= edge_last;
-
-	    TRIANGLE( flags,
-                      (i + 1),
-                      (i + 2),
-                      (0));
-	 }
-      }
-      break;
-
-   default:
-      assert(0);
-      break;
-   }
-
-   FLUSH;
-}
-
-
-#undef TRIANGLE
-#undef QUAD
-#undef POINT
-#undef LINE
-#undef FUNC
+#include "draw_decompose_tmp.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c
deleted file mode 100644
index 88f4d9f495a..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_elts.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <[email protected]>
-  */
-
-#include "draw/draw_pt.h"
-#include "draw/draw_private.h"
-
-/* Neat get_elt func that also works for varrays drawing by encoding
- * the start value into a pointer.  
- */
-
-static unsigned elt_uint( const void *elts, unsigned idx )
-{
-   return *(((const uint *)elts) + idx);
-}
-
-static unsigned elt_ushort( const void *elts, unsigned idx )
-{
-   return *(((const ushort *)elts) + idx);
-}
-
-static unsigned elt_ubyte( const void *elts, unsigned idx )
-{
-   return *(((const ubyte *)elts) + idx);
-}
-
-static unsigned elt_vert( const void *elts, unsigned idx )
-{
-   /* unsigned index is packed in the pointer */
-   return (unsigned)(uintptr_t)elts + idx;
-}
-
-pt_elt_func draw_pt_elt_func( struct draw_context *draw )
-{
-   switch (draw->pt.user.eltSize) {
-   case 0: return &elt_vert;
-   case 1: return &elt_ubyte;
-   case 2: return &elt_ushort; 
-   case 4: return &elt_uint;
-   default: return NULL;
-   }
-}     
-
-const void *draw_pt_elt_ptr( struct draw_context *draw,
-                             unsigned start )
-{
-   const char *elts = draw->pt.user.elts;
-
-   switch (draw->pt.user.eltSize) {
-   case 0: 
-      return (const void *)(((const ubyte *)NULL) + start);
-   case 1: 
-      return (const void *)(((const ubyte *)elts) + start);
-   case 2: 
-      return (const void *)(((const ushort *)elts) + start);
-   case 4: 
-      return (const void *)(((const uint *)elts) + start);
-   default:
-      return NULL;
-   }
-}
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index ad48fa39a4f..c8dfc16911e 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -120,22 +120,21 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
                     (vinfo->size * 4));
-
-   /* even number */
-   *max_vertices = *max_vertices & ~1;
 }
 
 
 void draw_pt_emit( struct pt_emit *emit,
-		   const float (*vertex_data)[4],
-		   unsigned vertex_count,
-		   unsigned stride,
-		   const ushort *elts,
-		   unsigned count )
+                         const struct draw_vertex_info *vert_info,
+                         const struct draw_prim_info *prim_info)
 {
+   const float (*vertex_data)[4] = (const float (*)[4])vert_info->verts->data;
+   unsigned vertex_count = vert_info->count;
+   unsigned stride = vert_info->stride;
+   const ushort *elts = prim_info->elts;
    struct draw_context *draw = emit->draw;
    struct translate *translate = emit->translate;
    struct vbuf_render *render = draw->render;
+   unsigned start, i;
    void *hw_verts;
 
    /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
@@ -145,11 +144,6 @@ void draw_pt_emit( struct pt_emit *emit,
    if (vertex_count == 0)
       return;
 
-   if (vertex_count >= UNDEFINED_VERTEX_ID) {
-      assert(0);
-      return;
-   }
-
    /* XXX: and work out some way to coordinate the render primitive
     * between vbuf.c and here...
     */
@@ -180,6 +174,7 @@ void draw_pt_emit( struct pt_emit *emit,
 			 0,
 			 ~0);
 
+   /* fetch/translate vertex attribs to fill hw_verts[] */
    translate->run( translate,
 		   0, 
 		   vertex_count,
@@ -190,23 +185,31 @@ void draw_pt_emit( struct pt_emit *emit,
                            0, 
                            vertex_count - 1 );
 
-   render->draw(render,
-		elts,
-		count);
+   for (start = i = 0;
+        i < prim_info->primitive_count;
+        start += prim_info->primitive_lengths[i], i++)
+   {
+      render->draw_elements(render,
+                            elts + start,
+                            prim_info->primitive_lengths[i]);
+   }
 
    render->release_vertices(render);
 }
 
 
 void draw_pt_emit_linear(struct pt_emit *emit,
-                         const float (*vertex_data)[4],
-                         unsigned stride,
-                         unsigned count)
+                         const struct draw_vertex_info *vert_info,
+                         const struct draw_prim_info *prim_info)
 {
+   const float (*vertex_data)[4] = (const float (*)[4])vert_info->verts->data;
+   unsigned stride = vert_info->stride;
+   unsigned count = vert_info->count;
    struct draw_context *draw = emit->draw;
    struct translate *translate = emit->translate;
    struct vbuf_render *render = draw->render;
    void *hw_verts;
+   unsigned start, i;
 
 #if 0
    debug_printf("Linear emit\n");
@@ -215,9 +218,6 @@ void draw_pt_emit_linear(struct pt_emit *emit,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID)
-      goto fail;
-
    /* XXX: and work out some way to coordinate the render primitive
     * between vbuf.c and here...
     */
@@ -258,7 +258,14 @@ void draw_pt_emit_linear(struct pt_emit *emit,
 
    render->unmap_vertices( render, 0, count - 1 );
 
-   render->draw_arrays(render, 0, count);
+   for (start = i = 0;
+        i < prim_info->primitive_count;
+        start += prim_info->primitive_lengths[i], i++)
+   {
+      render->draw_arrays(render,
+                          start,
+                          prim_info->primitive_lengths[i]);
+   }
 
    render->release_vertices(render);
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index a1347221b5d..ae12ee24bdc 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -29,7 +29,6 @@
 #include "util/u_math.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
-#include "draw/draw_vbuf.h"
 #include "draw/draw_pt.h"
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
@@ -69,31 +68,12 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
 
    fetch->vertex_size = vertex_size;
 
-   /* Always emit/leave space for a vertex header.
-    *
-    * It's worth considering whether the vertex headers should contain
-    * a pointer to the 'data', rather than having it inline.
-    * Something to look at after we've fully switched over to the pt
-    * paths.
+   /* Leave the clipmask/edgeflags/pad/vertex_id untouched
     */
-   {
-      /* Need to set header->vertex_id = 0xffff somehow.
-       */
-      key.element[nr].type = TRANSLATE_ELEMENT_NORMAL;
-      key.element[nr].input_format = PIPE_FORMAT_R32_FLOAT;
-      key.element[nr].input_buffer = draw->pt.nr_vertex_buffers;
-      key.element[nr].input_offset = 0;
-      key.element[nr].instance_divisor = 0;
-      key.element[nr].output_format = PIPE_FORMAT_R32_FLOAT;
-      key.element[nr].output_offset = dst_offset;
-      dst_offset += 1 * sizeof(float);
-      nr++;
-
-
-      /* Just leave the clip[] array untouched.
-       */
-      dst_offset += 4 * sizeof(float);
-   }
+   dst_offset += 1 * sizeof(float);
+   /* Just leave the clip[] array untouched.
+    */
+   dst_offset += 4 * sizeof(float);
 
    if (instance_id_index != ~0) {
       num_extra_inputs++;
@@ -132,26 +112,11 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
    key.nr_elements = nr;
    key.output_stride = vertex_size;
 
-
    if (!fetch->translate ||
        translate_key_compare(&fetch->translate->key, &key) != 0)
    {
       translate_key_sanitize(&key);
       fetch->translate = translate_cache_find(fetch->cache, &key);
-
-      {
-         static struct vertex_header vh = { 0,
-                                            1,
-                                            0,
-                                            UNDEFINED_VERTEX_ID,
-                                            { .0f, .0f, .0f, .0f } };
-
-	 fetch->translate->set_buffer(fetch->translate,
-				      draw->pt.nr_vertex_buffers,
-				      &vh,
-				      0,
-				      ~0);
-      }
    }
 
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index d7735bf1ac9..e706b7796f8 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -36,6 +36,7 @@
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
+#include "draw/draw_gs.h"
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
 
@@ -100,9 +101,14 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
    boolean ok;
    struct translate_key key;
 
+   unsigned gs_out_prim = (draw->gs.geometry_shader ? 
+                           draw->gs.geometry_shader->output_primitive :
+                           prim);
+
+
 
    ok = draw->render->set_primitive( draw->render, 
-                                     prim );
+                                     gs_out_prim );
    if (!ok) {
       assert(0);
       return;
@@ -185,15 +191,6 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
                     (vinfo->size * 4));
-
-   /* Return an even number of verts.
-    * This prevents "parity" errors when splitting long triangle strips which
-    * can lead to front/back culling mix-ups.
-    * Every other triangle in a strip has an alternate front/back orientation
-    * so splitting at an odd position can cause the orientation of subsequent
-    * triangles to get reversed.
-    */
-   *max_vertices = *max_vertices & ~1;
 }
 
 
@@ -204,7 +201,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
                             const unsigned *fetch_elts,
                             unsigned fetch_count,
                             const ushort *draw_elts,
-                            unsigned draw_count )
+                            unsigned draw_count,
+                            unsigned prim_flags )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
@@ -214,11 +212,6 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (fetch_count >= UNDEFINED_VERTEX_ID) {
-      assert(0);
-      return;
-   }
-
    draw->render->allocate_vertices( draw->render,
                                     (ushort)feme->translate->key.output_stride,
                                     (ushort)fetch_count );
@@ -254,9 +247,9 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
    /* XXX: Draw arrays path to avoid re-emitting index list again and
     * again.
     */
-   draw->render->draw( draw->render, 
-                       draw_elts, 
-                       draw_count );
+   draw->render->draw_elements( draw->render, 
+                                draw_elts, 
+                                draw_count );
 
    /* Done -- that was easy, wasn't it: 
     */
@@ -267,7 +260,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
 
 static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
                                    unsigned start,
-                                   unsigned count )
+                                   unsigned count,
+                                   unsigned prim_flags )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
@@ -277,9 +271,6 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID) 
-      goto fail;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)feme->translate->key.output_stride,
                                          (ushort)count )) 
@@ -328,7 +319,8 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
                                         unsigned start,
                                         unsigned count,
                                         const ushort *draw_elts,
-                                        unsigned draw_count )
+                                        unsigned draw_count,
+                                        unsigned prim_flags )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
@@ -338,9 +330,6 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID)
-      return FALSE;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)feme->translate->key.output_stride,
                                          (ushort)count ))
@@ -363,9 +352,9 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
    /* XXX: Draw arrays path to avoid re-emitting index list again and
     * again.
     */
-   draw->render->draw( draw->render, 
-                       draw_elts, 
-                       draw_count );
+   draw->render->draw_elements( draw->render, 
+                                draw_elts, 
+                                draw_count );
 
    /* Done -- that was easy, wasn't it:
     */
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index cbb5b6c9605..7c198c6026d 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -67,9 +67,8 @@ struct fetch_shade_emit {
 
 
 
-			       
 static void fse_prepare( struct draw_pt_middle_end *middle,
-                         unsigned prim, 
+                         unsigned prim,
                          unsigned opt,
                          unsigned *max_vertices )
 {
@@ -79,9 +78,12 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    const struct vertex_info *vinfo;
    unsigned i;
    unsigned nr_vbs = 0;
-   
 
-   if (!draw->render->set_primitive( draw->render, 
+   /* Can't support geometry shader on this path.
+    */
+   assert(!draw->gs.geometry_shader);
+
+   if (!draw->render->set_primitive( draw->render,
                                      prim )) {
       assert(0);
       return;
@@ -90,7 +92,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    /* Must do this after set_primitive() above:
     */
    fse->vinfo = vinfo = draw->render->get_vertex_info(draw->render);
-   
 
 
    fse->key.output_stride = vinfo->size * 4;
@@ -101,7 +102,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
                                fse->key.nr_inputs);     /* inputs - fetch from api format */
 
    fse->key.viewport = !draw->identity_viewport;
-   fse->key.clip = !draw->bypass_clipping;
+   fse->key.clip = draw->clip_xy || draw->clip_z || draw->clip_user;
    fse->key.const_vbuffers = 0;
 
    memset(fse->key.element, 0, 
@@ -174,15 +175,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
                     (vinfo->size * 4));
 
-   /* Return an even number of verts.
-    * This prevents "parity" errors when splitting long triangle strips which
-    * can lead to front/back culling mix-ups.
-    * Every other triangle in a strip has an alternate front/back orientation
-    * so splitting at an odd position can cause the orientation of subsequent
-    * triangles to get reversed.
-    */
-   *max_vertices = *max_vertices & ~1;
-
    /* Probably need to do this somewhere (or fix exec shader not to
     * need it):
     */
@@ -196,7 +188,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
 
 static void fse_run_linear( struct draw_pt_middle_end *middle, 
                             unsigned start, 
-                            unsigned count )
+                            unsigned count,
+                            unsigned prim_flags )
 {
    struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
    struct draw_context *draw = fse->draw;
@@ -206,9 +199,6 @@ static void fse_run_linear( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID) 
-      goto fail;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)fse->key.output_stride,
                                          (ushort)count ))
@@ -264,7 +254,8 @@ fse_run(struct draw_pt_middle_end *middle,
         const unsigned *fetch_elts,
         unsigned fetch_count,
         const ushort *draw_elts,
-        unsigned draw_count )
+        unsigned draw_count,
+        unsigned prim_flags )
 {
    struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
    struct draw_context *draw = fse->draw;
@@ -274,9 +265,6 @@ fse_run(struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (fetch_count >= UNDEFINED_VERTEX_ID) 
-      goto fail;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)fse->key.output_stride,
                                          (ushort)fetch_count ))
@@ -307,9 +295,9 @@ fse_run(struct draw_pt_middle_end *middle,
 
    draw->render->unmap_vertices( draw->render, 0, (ushort)(fetch_count - 1) );
    
-   draw->render->draw( draw->render, 
-                       draw_elts, 
-                       draw_count );
+   draw->render->draw_elements( draw->render, 
+                                draw_elts, 
+                                draw_count );
 
 
    draw->render->release_vertices( draw->render );
@@ -326,7 +314,8 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle,
                                  unsigned start, 
                                  unsigned count,
                                  const ushort *draw_elts,
-                                 unsigned draw_count )
+                                 unsigned draw_count,
+                                 unsigned prim_flags )
 {
    struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
    struct draw_context *draw = fse->draw;
@@ -336,9 +325,6 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
-   if (count >= UNDEFINED_VERTEX_ID)
-      return FALSE;
-
    if (!draw->render->allocate_vertices( draw->render,
                                          (ushort)fse->key.output_stride,
                                          (ushort)count ))
@@ -357,9 +343,9 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle,
                             hw_verts );
 
 
-   draw->render->draw( draw->render, 
-                       draw_elts, 
-                       draw_count );
+   draw->render->draw_elements( draw->render, 
+                                draw_elts, 
+                                draw_count );
    
 
    draw->render->unmap_vertices( draw->render, 0, (ushort)(count - 1) );
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index da5106463a7..b72fd612451 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -27,6 +27,7 @@
 
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
@@ -40,16 +41,16 @@ struct fetch_pipeline_middle_end {
    struct draw_context *draw;
 
    struct pt_emit *emit;
+   struct pt_so_emit *so_emit;
    struct pt_fetch *fetch;
    struct pt_post_vs *post_vs;
 
    unsigned vertex_data_offset;
    unsigned vertex_size;
-   unsigned prim;
+   unsigned input_prim;
    unsigned opt;
 };
 
-
 static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
                                     unsigned prim,
 				    unsigned opt,
@@ -61,6 +62,10 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
    unsigned i;
    unsigned instance_id_index = ~0;
 
+   unsigned gs_out_prim = (draw->gs.geometry_shader ? 
+                           draw->gs.geometry_shader->output_primitive :
+                           prim);
+
    /* Add one to num_outputs because the pipeline occasionally tags on
     * an additional texcoord, eg for AA lines.
     */
@@ -76,7 +81,7 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
       }
    }
 
-   fpme->prim = prim;
+   fpme->input_prim = prim;
    fpme->opt = opt;
 
    /* Always leave room for the vertex header whether we need it or
@@ -95,173 +100,177 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
     * but gl vs dx9 clip spaces.
     */
    draw_pt_post_vs_prepare( fpme->post_vs,
-			    (boolean)draw->bypass_clipping,
-			    (boolean)draw->identity_viewport,
+			    draw->clip_xy,
+			    draw->clip_z,
+			    draw->clip_user,
+			    draw->identity_viewport,
 			    (boolean)draw->rasterizer->gl_rasterization_rules,
-			    (draw->vs.edgeflag_output ? true : false) );    
+			    (draw->vs.edgeflag_output ? TRUE : FALSE) );
+
+   draw_pt_so_emit_prepare( fpme->so_emit );
 
    if (!(opt & PT_PIPELINE)) {
-      draw_pt_emit_prepare( fpme->emit, 
-			    prim,
+      draw_pt_emit_prepare( fpme->emit,
+			    gs_out_prim,
                             max_vertices );
 
-      *max_vertices = MAX2( *max_vertices,
-                            DRAW_PIPE_MAX_VERTICES );
+      *max_vertices = MAX2( *max_vertices, 4096 );
    }
    else {
-      *max_vertices = DRAW_PIPE_MAX_VERTICES; 
+      /* limit max fetches by limiting max_vertices */
+      *max_vertices = 4096;
    }
 
-   /* return even number */
-   *max_vertices = *max_vertices & ~1;
-
    /* No need to prepare the shader.
     */
    vs->prepare(vs, draw);
 }
 
 
-
-static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
-                                const unsigned *fetch_elts,
-                                unsigned fetch_count,
-                                const ushort *draw_elts,
-                                unsigned draw_count )
+static void fetch( struct pt_fetch *fetch,
+                   const struct draw_fetch_info *fetch_info,
+                   char *output)
 {
-   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *vshader = draw->vs.vertex_shader;
-   struct draw_geometry_shader *gshader = draw->gs.geometry_shader;
-   unsigned opt = fpme->opt;
-   unsigned alloc_count = align( fetch_count, 4 );
-
-   struct vertex_header *pipeline_verts = 
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
-
-   if (!pipeline_verts) {
-      /* Not much we can do here - just skip the rendering.
-       */
-      assert(0);
-      return;
+   if (fetch_info->linear) {
+      draw_pt_fetch_run_linear( fetch,
+                                fetch_info->start,
+                                fetch_info->count,
+                                output );
    }
-
-   /* Fetch into our vertex buffer
-    */
-   draw_pt_fetch_run( fpme->fetch,
-		      fetch_elts, 
-		      fetch_count,
-		      (char *)pipeline_verts );
-
-   /* Run the shader, note that this overwrites the data[] parts of
-    * the pipeline verts.
-    */
-   if (opt & PT_SHADE)
-   {
-      vshader->run_linear(vshader,
-                          (const float (*)[4])pipeline_verts->data,
-                          (      float (*)[4])pipeline_verts->data,
-                          draw->pt.user.vs_constants,
-                          fetch_count,
-                          fpme->vertex_size,
-                          fpme->vertex_size);
-      if (gshader)
-         draw_geometry_shader_run(gshader,
-                                  (const float (*)[4])pipeline_verts->data,
-                                  (      float (*)[4])pipeline_verts->data,
-                                  draw->pt.user.gs_constants,
-                                  fetch_count,
-                                  fpme->vertex_size,
-                                  fpme->vertex_size);
+   else {
+      draw_pt_fetch_run( fetch,
+                         fetch_info->elts,
+                         fetch_info->count,
+                         output );
    }
+}
 
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    fetch_count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
 
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
+static void pipeline(struct fetch_pipeline_middle_end *fpme,
+                     const struct draw_vertex_info *vert_info,
+                     const struct draw_prim_info *prim_info)
+{
+   if (prim_info->linear)
+      draw_pipeline_run_linear( fpme->draw,
+                                vert_info,
+                                prim_info);
+   else
       draw_pipeline_run( fpme->draw,
-                         fpme->prim,
-                         pipeline_verts,
-                         fetch_count,
-                         fpme->vertex_size,
-                         draw_elts,
-                         draw_count );
+                         vert_info,
+                         prim_info );
+}
+
+static void emit(struct pt_emit *emit,
+                 const struct draw_vertex_info *vert_info,
+                 const struct draw_prim_info *prim_info)
+{
+   if (prim_info->linear) {
+      draw_pt_emit_linear(emit, vert_info, prim_info);
    }
    else {
-      draw_pt_emit( fpme->emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    fetch_count,
-		    fpme->vertex_size,
-		    draw_elts,
-		    draw_count );
+      draw_pt_emit(emit, vert_info, prim_info);
    }
+}
 
 
-   FREE(pipeline_verts);
+static void draw_vertex_shader_run(struct draw_vertex_shader *vshader,
+                                   const void *constants[PIPE_MAX_CONSTANT_BUFFERS], 
+                                   unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS],
+                                   const struct draw_vertex_info *input_verts,
+                                   struct draw_vertex_info *output_verts )
+{
+   output_verts->vertex_size = input_verts->vertex_size;
+   output_verts->stride = input_verts->vertex_size;
+   output_verts->count = input_verts->count;
+   output_verts->verts =
+      (struct vertex_header *)MALLOC(output_verts->vertex_size *
+                                     align(output_verts->count, 4));
+
+   vshader->run_linear(vshader,
+                       (const float (*)[4])input_verts->verts->data,
+                       (      float (*)[4])output_verts->verts->data,
+                       constants,
+                       const_size,
+                       input_verts->count,
+                       input_verts->vertex_size,
+                       input_verts->vertex_size);
 }
 
-
-static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
-                                       unsigned start,
-                                       unsigned count)
+static void fetch_pipeline_generic( struct draw_pt_middle_end *middle,
+                                    const struct draw_fetch_info *fetch_info,
+                                    const struct draw_prim_info *prim_info )
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *shader = draw->vs.vertex_shader;
-   struct draw_geometry_shader *geometry_shader = draw->gs.geometry_shader;
+   struct draw_vertex_shader *vshader = draw->vs.vertex_shader;
+   struct draw_geometry_shader *gshader = draw->gs.geometry_shader;
+   struct draw_prim_info gs_prim_info;
+   struct draw_vertex_info fetched_vert_info;
+   struct draw_vertex_info vs_vert_info;
+   struct draw_vertex_info gs_vert_info;
+   struct draw_vertex_info *vert_info;
    unsigned opt = fpme->opt;
-   unsigned alloc_count = align( count, 4 );
 
-   struct vertex_header *pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
-
-   if (!pipeline_verts) {
-      /* Not much we can do here - just skip the rendering.
-       */
+   fetched_vert_info.count = fetch_info->count;
+   fetched_vert_info.vertex_size = fpme->vertex_size;
+   fetched_vert_info.stride = fpme->vertex_size;
+   fetched_vert_info.verts =
+      (struct vertex_header *)MALLOC(fpme->vertex_size *
+                                     align(fetch_info->count,  4));
+   if (!fetched_vert_info.verts) {
       assert(0);
       return;
    }
 
-   /* Fetch into our vertex buffer
+   /* Fetch into our vertex buffer.
+    */
+   fetch( fpme->fetch, fetch_info, (char *)fetched_vert_info.verts );
+
+   /* Finished with fetch:
     */
-   draw_pt_fetch_run_linear( fpme->fetch,
-                             start,
-                             count,
-                             (char *)pipeline_verts );
+   fetch_info = NULL;
+   vert_info = &fetched_vert_info;
 
    /* Run the shader, note that this overwrites the data[] parts of
     * the pipeline verts.
     */
-   if (opt & PT_SHADE)
-   {
-      shader->run_linear(shader,
-			 (const float (*)[4])pipeline_verts->data,
-			 (      float (*)[4])pipeline_verts->data,
-                         draw->pt.user.vs_constants,
-			 count,
-			 fpme->vertex_size,
-			 fpme->vertex_size);
-
-      if (geometry_shader)
-         draw_geometry_shader_run(geometry_shader,
-                                  (const float (*)[4])pipeline_verts->data,
-                                  (      float (*)[4])pipeline_verts->data,
-                                  draw->pt.user.gs_constants,
-                                  count,
-                                  fpme->vertex_size,
-                                  fpme->vertex_size);
+   if (fpme->opt & PT_SHADE) {
+      draw_vertex_shader_run(vshader,
+                             draw->pt.user.vs_constants,
+                             draw->pt.user.vs_constants_size,
+                             vert_info,
+                             &vs_vert_info);
+
+      FREE(vert_info->verts);
+      vert_info = &vs_vert_info;
+   }
+
+   if ((fpme->opt & PT_SHADE) && gshader) {
+      draw_geometry_shader_run(gshader,
+                               draw->pt.user.gs_constants,
+                               draw->pt.user.gs_constants_size,
+                               vert_info,
+                               prim_info,
+                               &gs_vert_info,
+                               &gs_prim_info);
+
+      FREE(vert_info->verts);
+      vert_info = &gs_vert_info;
+      prim_info = &gs_prim_info;
    }
 
+
+   /* Stream output needs to be done before clipping.
+    *
+    * XXX: Stream output surely needs to respect the prim_info->elts
+    *      list.
+    */
+   draw_pt_so_emit( fpme->so_emit,
+                    vert_info,
+                    prim_info );
+
    if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    count,
-			    fpme->vertex_size ))
+                            vert_info ))
    {
       opt |= PT_PIPELINE;
    }
@@ -269,102 +278,102 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
    /* Do we need to run the pipeline?
     */
    if (opt & PT_PIPELINE) {
-      draw_pipeline_run_linear( fpme->draw,
-                                fpme->prim,
-                                pipeline_verts,
-                                count,
-                                fpme->vertex_size);
+      pipeline( fpme,
+                vert_info,
+                prim_info );
    }
    else {
-      draw_pt_emit_linear( fpme->emit,
-                           (const float (*)[4])pipeline_verts->data,
-                           fpme->vertex_size,
-                           count );
+      emit( fpme->emit,
+            vert_info,
+            prim_info );
    }
-
-   FREE(pipeline_verts);
+   FREE(vert_info->verts);
 }
 
+static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
+                                const unsigned *fetch_elts,
+                                unsigned fetch_count,
+                                const ushort *draw_elts,
+                                unsigned draw_count,
+                                unsigned prim_flags )
+{
+   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
+
+   fetch_info.linear = FALSE;
+   fetch_info.start = 0;
+   fetch_info.elts = fetch_elts;
+   fetch_info.count = fetch_count;
+
+   prim_info.linear = FALSE;
+   prim_info.start = 0;
+   prim_info.count = draw_count;
+   prim_info.elts = draw_elts;
+   prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &draw_count;
+
+   fetch_pipeline_generic( middle, &fetch_info, &prim_info );
+}
 
 
-static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle,
-                                            unsigned start,
-                                            unsigned count,
-                                            const ushort *draw_elts,
-                                            unsigned draw_count )
+static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
+                                       unsigned start,
+                                       unsigned count,
+                                       unsigned prim_flags)
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *shader = draw->vs.vertex_shader;
-   struct draw_geometry_shader *geometry_shader = draw->gs.geometry_shader;
-   unsigned opt = fpme->opt;
-   unsigned alloc_count = align( count, 4 );
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
+
+   fetch_info.linear = TRUE;
+   fetch_info.start = start;
+   fetch_info.count = count;
+   fetch_info.elts = NULL;
+
+   prim_info.linear = TRUE;
+   prim_info.start = 0;
+   prim_info.count = count;
+   prim_info.elts = NULL;
+   prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &count;
+
+   fetch_pipeline_generic( middle, &fetch_info, &prim_info );
+}
 
-   struct vertex_header *pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
 
-   if (!pipeline_verts) 
-      return FALSE;
 
-   /* Fetch into our vertex buffer
-    */
-   draw_pt_fetch_run_linear( fpme->fetch,
-                             start,
-                             count,
-                             (char *)pipeline_verts );
+static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle,
+                                               unsigned start,
+                                               unsigned count,
+                                               const ushort *draw_elts,
+                                               unsigned draw_count,
+                                               unsigned prim_flags )
+{
+   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
 
-   /* Run the shader, note that this overwrites the data[] parts of
-    * the pipeline verts.
-    */
-   if (opt & PT_SHADE)
-   {
-      shader->run_linear(shader,
-			 (const float (*)[4])pipeline_verts->data,
-			 (      float (*)[4])pipeline_verts->data,
-                         draw->pt.user.vs_constants,
-			 count,
-			 fpme->vertex_size,
-			 fpme->vertex_size);
-
-      if (geometry_shader)
-         draw_geometry_shader_run(geometry_shader,
-                                  (const float (*)[4])pipeline_verts->data,
-                                  (      float (*)[4])pipeline_verts->data,
-                                  draw->pt.user.gs_constants,
-                                  count,
-                                  fpme->vertex_size,
-                                  fpme->vertex_size);
-   }
+   fetch_info.linear = TRUE;
+   fetch_info.start = start;
+   fetch_info.count = count;
+   fetch_info.elts = NULL;
 
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
+   prim_info.linear = FALSE;
+   prim_info.start = 0;
+   prim_info.count = draw_count;
+   prim_info.elts = draw_elts;
+   prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &draw_count;
 
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
-      draw_pipeline_run( fpme->draw,
-                         fpme->prim,
-                         pipeline_verts,
-                         count,
-                         fpme->vertex_size,
-                         draw_elts,
-                         draw_count );
-   }
-   else {
-      draw_pt_emit( fpme->emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    count,
-		    fpme->vertex_size,
-		    draw_elts,
-		    draw_count );
-   }
+   fetch_pipeline_generic( middle, &fetch_info, &prim_info );
 
-   FREE(pipeline_verts);
    return TRUE;
 }
 
@@ -385,6 +394,9 @@ static void fetch_pipeline_destroy( struct draw_pt_middle_end *middle )
    if (fpme->emit)
       draw_pt_emit_destroy( fpme->emit );
 
+   if (fpme->so_emit)
+      draw_pt_so_emit_destroy( fpme->so_emit );
+
    if (fpme->post_vs)
       draw_pt_post_vs_destroy( fpme->post_vs );
 
@@ -416,7 +428,11 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit( struct draw_context *
       goto fail;
 
    fpme->emit = draw_pt_emit_create( draw );
-   if (!fpme->emit) 
+   if (!fpme->emit)
+      goto fail;
+
+   fpme->so_emit = draw_pt_so_emit_create( draw );
+   if (!fpme->so_emit)
       goto fail;
 
    return &fpme->base;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index d2a492f2b4c..77291e304e1 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2010 VMWare, Inc.
+ * Copyright 2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -28,11 +28,11 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "draw/draw_context.h"
+#include "draw/draw_gs.h"
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
 #include "draw/draw_vs.h"
-#include "draw/draw_gs.h"
 #include "draw/draw_llvm.h"
 
 
@@ -41,53 +41,59 @@ struct llvm_middle_end {
    struct draw_context *draw;
 
    struct pt_emit *emit;
+   struct pt_so_emit *so_emit;
    struct pt_fetch *fetch;
    struct pt_post_vs *post_vs;
 
 
    unsigned vertex_data_offset;
    unsigned vertex_size;
-   unsigned prim;
+   unsigned input_prim;
    unsigned opt;
 
    struct draw_llvm *llvm;
-   struct draw_llvm_variant *variants;
    struct draw_llvm_variant *current_variant;
-   int nr_variants;
 };
 
 
 static void
 llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
-                         unsigned prim,
+                         unsigned in_prim,
                          unsigned opt,
                          unsigned *max_vertices )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *vs = draw->vs.vertex_shader;
-   struct draw_geometry_shader *gs = draw->gs.geometry_shader;
-   struct draw_llvm_variant_key key;
+   struct llvm_vertex_shader *shader =
+      llvm_vertex_shader(draw->vs.vertex_shader);
+   char store[DRAW_LLVM_MAX_VARIANT_KEY_SIZE];
+   struct draw_llvm_variant_key *key;
    struct draw_llvm_variant *variant = NULL;
+   struct draw_llvm_variant_list_item *li;
    unsigned i;
    unsigned instance_id_index = ~0;
 
+
+   unsigned out_prim = (draw->gs.geometry_shader ? 
+                        draw->gs.geometry_shader->output_primitive :
+                        in_prim);
+
    /* Add one to num_outputs because the pipeline occasionally tags on
     * an additional texcoord, eg for AA lines.
     */
-   unsigned nr = MAX2( vs->info.num_inputs,
-		       vs->info.num_outputs + 1 );
+   unsigned nr = MAX2( shader->base.info.num_inputs,
+		       shader->base.info.num_outputs + 1 );
 
    /* Scan for instanceID system value.
     */
-   for (i = 0; i < vs->info.num_inputs; i++) {
-      if (vs->info.input_semantic_name[i] == TGSI_SEMANTIC_INSTANCEID) {
+   for (i = 0; i < shader->base.info.num_inputs; i++) {
+      if (shader->base.info.input_semantic_name[i] == TGSI_SEMANTIC_INSTANCEID) {
          instance_id_index = i;
          break;
       }
    }
 
-   fpme->prim = prim;
+   fpme->input_prim = in_prim;
    fpme->opt = opt;
 
    /* Always leave room for the vertex header whether we need it or
@@ -97,57 +103,71 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
    fpme->vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float);
 
 
-
-   draw_pt_fetch_prepare( fpme->fetch,
-                          vs->info.num_inputs,
-                          fpme->vertex_size,
-                          instance_id_index );
-   if (opt & PT_SHADE) {
-      vs->prepare(vs, draw);
-      draw_geometry_shader_prepare(gs, draw);
-   }
-
-
    /* XXX: it's not really gl rasterization rules we care about here,
     * but gl vs dx9 clip spaces.
     */
    draw_pt_post_vs_prepare( fpme->post_vs,
-			    (boolean)draw->bypass_clipping,
-			    (boolean)(draw->identity_viewport),
+			    draw->clip_xy,
+			    draw->clip_z,
+			    draw->clip_user,
+			    draw->identity_viewport,
 			    (boolean)draw->rasterizer->gl_rasterization_rules,
-			    (draw->vs.edgeflag_output ? true : false) );
+			    (draw->vs.edgeflag_output ? TRUE : FALSE) );
+
+   draw_pt_so_emit_prepare( fpme->so_emit );
 
    if (!(opt & PT_PIPELINE)) {
       draw_pt_emit_prepare( fpme->emit,
-			    prim,
+			    out_prim,
                             max_vertices );
 
-      *max_vertices = MAX2( *max_vertices,
-                            DRAW_PIPE_MAX_VERTICES );
+      *max_vertices = MAX2( *max_vertices, 4096 );
    }
    else {
-      *max_vertices = DRAW_PIPE_MAX_VERTICES;
+      /* limit max fetches by limiting max_vertices */
+      *max_vertices = 4096;
    }
 
    /* return even number */
    *max_vertices = *max_vertices & ~1;
+   
+   key = draw_llvm_make_variant_key(fpme->llvm, store);
 
-   draw_llvm_make_variant_key(fpme->llvm, &key);
-
-   variant = fpme->variants;
-   while(variant) {
-      if(memcmp(&variant->key, &key, sizeof key) == 0)
+   li = first_elem(&shader->variants);
+   while(!at_end(&shader->variants, li)) {
+      if(memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
+         variant = li->base;
          break;
+      }
+      li = next_elem(li);
+   }
 
-      variant = variant->next;
+   if (variant) {
+      move_to_head(&fpme->llvm->vs_variants_list, &variant->list_item_global);
    }
+   else {
+      unsigned i;
+      if (fpme->llvm->nr_variants >= DRAW_MAX_SHADER_VARIANTS) {
+         /*
+          * XXX: should we flush here?
+          */
+         for (i = 0; i < DRAW_MAX_SHADER_VARIANTS / 4; i++) {
+            struct draw_llvm_variant_list_item *item =
+               last_elem(&fpme->llvm->vs_variants_list);
+            draw_llvm_destroy_variant(item->base);
+         }
+      }
+
+      variant = draw_llvm_create_variant(fpme->llvm, nr, key);
 
-   if (!variant) {
-      variant = draw_llvm_prepare(fpme->llvm, nr);
-      variant->next = fpme->variants;
-      fpme->variants = variant;
-      ++fpme->nr_variants;
+      if (variant) {
+         insert_at_head(&shader->variants, &variant->list_item_local);
+         insert_at_head(&fpme->llvm->vs_variants_list, &variant->list_item_global);
+         fpme->llvm->nr_variants++;
+         shader->variants_cached++;
+      }
    }
+
    fpme->current_variant = variant;
 
    /*XXX we only support one constant buffer */
@@ -158,125 +178,174 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
 }
 
 
+/**
+ * Feed the given vertices/primitives to the draw primitive pipeline.
+ *
+ * prim_info->linear selects between the linear entry point and the
+ * element-indexed one.
+ */
+static void pipeline(struct llvm_middle_end *llvm,
+                     const struct draw_vertex_info *vert_info,
+                     const struct draw_prim_info *prim_info)
+{
+   if (!prim_info->linear) {
+      draw_pipeline_run( llvm->draw, vert_info, prim_info );
+      return;
+   }
+
+   draw_pipeline_run_linear( llvm->draw, vert_info, prim_info );
+}
+
+/**
+ * Pass the given vertices/primitives to the emit stage, picking the
+ * linear or element-indexed variant according to prim_info->linear.
+ */
+static void emit(struct pt_emit *emit,
+                 const struct draw_vertex_info *vert_info,
+                 const struct draw_prim_info *prim_info)
+{
+   if (!prim_info->linear) {
+      draw_pt_emit(emit, vert_info, prim_info);
+      return;
+   }
+
+   draw_pt_emit_linear(emit, vert_info, prim_info);
+}
 
-static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
-                                 const unsigned *fetch_elts,
-                                 unsigned fetch_count,
-                                 const ushort *draw_elts,
-                                 unsigned draw_count )
+static void
+llvm_pipeline_generic( struct draw_pt_middle_end *middle,
+                       const struct draw_fetch_info *fetch_info,
+                       const struct draw_prim_info *prim_info )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
+   struct draw_geometry_shader *gshader = draw->gs.geometry_shader;
+   struct draw_prim_info gs_prim_info;
+   struct draw_vertex_info llvm_vert_info;
+   struct draw_vertex_info gs_vert_info;
+   struct draw_vertex_info *vert_info;
    unsigned opt = fpme->opt;
-   unsigned alloc_count = align( fetch_count, 4 );
-
-   struct vertex_header *pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
 
-   if (!pipeline_verts) {
-      /* Not much we can do here - just skip the rendering.
-       */
+   llvm_vert_info.count = fetch_info->count;
+   llvm_vert_info.vertex_size = fpme->vertex_size;
+   llvm_vert_info.stride = fpme->vertex_size;
+   llvm_vert_info.verts =
+      (struct vertex_header *)MALLOC(fpme->vertex_size *
+                                     align(fetch_info->count,  4));
+   if (!llvm_vert_info.verts) {
       assert(0);
       return;
    }
 
-   fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context,
-                                         pipeline_verts,
-                                         (const char **)draw->pt.user.vbuffer,
-                                         fetch_elts,
-                                         fetch_count,
-                                         fpme->vertex_size,
-                                         draw->pt.vertex_buffer );
-
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    fetch_count,
-			    fpme->vertex_size ))
-   {
+   if (fetch_info->linear)
+      fpme->current_variant->jit_func( &fpme->llvm->jit_context,
+                                       llvm_vert_info.verts,
+                                       (const char **)draw->pt.user.vbuffer,
+                                       fetch_info->start,
+                                       fetch_info->count,
+                                       fpme->vertex_size,
+                                       draw->pt.vertex_buffer,
+                                       draw->instance_id);
+   else
+      fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context,
+                                            llvm_vert_info.verts,
+                                            (const char **)draw->pt.user.vbuffer,
+                                            fetch_info->elts,
+                                            fetch_info->count,
+                                            fpme->vertex_size,
+                                            draw->pt.vertex_buffer,
+                                            draw->instance_id);
+
+   /* Finished with fetch and vs:
+    */
+   fetch_info = NULL;
+   vert_info = &llvm_vert_info;
+
+
+   if ((opt & PT_SHADE) && gshader) {
+      draw_geometry_shader_run(gshader,
+                               draw->pt.user.gs_constants,
+                               draw->pt.user.gs_constants_size,
+                               vert_info,
+                               prim_info,
+                               &gs_vert_info,
+                               &gs_prim_info);
+
+      FREE(vert_info->verts);
+      vert_info = &gs_vert_info;
+      prim_info = &gs_prim_info;
+   }
+
+   /* stream output needs to be done before clipping */
+   draw_pt_so_emit( fpme->so_emit,
+		    vert_info,
+                    prim_info );
+
+   if (draw_pt_post_vs_run( fpme->post_vs, vert_info )) {
       opt |= PT_PIPELINE;
    }
 
    /* Do we need to run the pipeline?
     */
    if (opt & PT_PIPELINE) {
-      draw_pipeline_run( fpme->draw,
-                         fpme->prim,
-                         pipeline_verts,
-                         fetch_count,
-                         fpme->vertex_size,
-                         draw_elts,
-                         draw_count );
+      pipeline( fpme,
+                vert_info,
+                prim_info );
    }
    else {
-      draw_pt_emit( fpme->emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    fetch_count,
-		    fpme->vertex_size,
-		    draw_elts,
-		    draw_count );
+      emit( fpme->emit,
+            vert_info,
+            prim_info );
    }
+   FREE(vert_info->verts);
+}
 
 
-   FREE(pipeline_verts);
+static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
+                                 const unsigned *fetch_elts,
+                                 unsigned fetch_count,
+                                 const ushort *draw_elts,
+                                 unsigned draw_count,
+                                 unsigned prim_flags )
+{
+   struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
+
+   fetch_info.linear = FALSE;
+   fetch_info.start = 0;
+   fetch_info.elts = fetch_elts;
+   fetch_info.count = fetch_count;
+
+   prim_info.linear = FALSE;
+   prim_info.start = 0;
+   prim_info.count = draw_count;
+   prim_info.elts = draw_elts;
+   prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &draw_count;
+
+   llvm_pipeline_generic( middle, &fetch_info, &prim_info );
 }
 
 
 static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle,
                                        unsigned start,
-                                       unsigned count)
+                                       unsigned count,
+                                       unsigned prim_flags)
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   unsigned opt = fpme->opt;
-   unsigned alloc_count = align( count, 4 );
-
-   struct vertex_header *pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
-
-   if (!pipeline_verts) {
-      /* Not much we can do here - just skip the rendering.
-       */
-      assert(0);
-      return;
-   }
-
-#if 0
-   debug_printf("#### Pipeline = %p (data = %p)\n",
-                pipeline_verts, pipeline_verts->data);
-#endif
-   fpme->current_variant->jit_func( &fpme->llvm->jit_context,
-                                    pipeline_verts,
-                                    (const char **)draw->pt.user.vbuffer,
-                                    start,
-                                    count,
-                                    fpme->vertex_size,
-                                    draw->pt.vertex_buffer );
-
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
-
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
-      draw_pipeline_run_linear( fpme->draw,
-                                fpme->prim,
-                                pipeline_verts,
-                                count,
-                                fpme->vertex_size);
-   }
-   else {
-      draw_pt_emit_linear( fpme->emit,
-                           (const float (*)[4])pipeline_verts->data,
-                           fpme->vertex_size,
-                           count );
-   }
-
-   FREE(pipeline_verts);
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
+
+   fetch_info.linear = TRUE;
+   fetch_info.start = start;
+   fetch_info.count = count;
+   fetch_info.elts = NULL;
+
+   prim_info.linear = TRUE;
+   prim_info.start = 0;
+   prim_info.count = count;
+   prim_info.elts = NULL;
+   prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &count;
+
+   llvm_pipeline_generic( middle, &fetch_info, &prim_info );
 }
 
 
@@ -286,56 +355,29 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle,
                                  unsigned start,
                                  unsigned count,
                                  const ushort *draw_elts,
-                                 unsigned draw_count )
+                                 unsigned draw_count,
+                                 unsigned prim_flags )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   unsigned opt = fpme->opt;
-   unsigned alloc_count = align( count, 4 );
-
-   struct vertex_header *pipeline_verts =
-      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
-
-   if (!pipeline_verts)
-      return FALSE;
-
-   fpme->current_variant->jit_func( &fpme->llvm->jit_context,
-                                    pipeline_verts,
-                                    (const char **)draw->pt.user.vbuffer,
-                                    start,
-                                    count,
-                                    fpme->vertex_size,
-                                    draw->pt.vertex_buffer );
-
-   if (draw_pt_post_vs_run( fpme->post_vs,
-			    pipeline_verts,
-			    count,
-			    fpme->vertex_size ))
-   {
-      opt |= PT_PIPELINE;
-   }
+   struct draw_fetch_info fetch_info;
+   struct draw_prim_info prim_info;
 
-   /* Do we need to run the pipeline?
-    */
-   if (opt & PT_PIPELINE) {
-      draw_pipeline_run( fpme->draw,
-                         fpme->prim,
-                         pipeline_verts,
-                         count,
-                         fpme->vertex_size,
-                         draw_elts,
-                         draw_count );
-   }
-   else {
-      draw_pt_emit( fpme->emit,
-		    (const float (*)[4])pipeline_verts->data,
-		    count,
-		    fpme->vertex_size,
-		    draw_elts,
-		    draw_count );
-   }
+   fetch_info.linear = TRUE;
+   fetch_info.start = start;
+   fetch_info.count = count;
+   fetch_info.elts = NULL;
+
+   prim_info.linear = FALSE;
+   prim_info.start = 0;
+   prim_info.count = draw_count;
+   prim_info.elts = draw_elts;
+   prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
+   prim_info.primitive_count = 1;
+   prim_info.primitive_lengths = &draw_count;
+
+   llvm_pipeline_generic( middle, &fetch_info, &prim_info );
 
-   FREE(pipeline_verts);
    return TRUE;
 }
 
@@ -356,17 +398,18 @@ static void llvm_middle_end_destroy( struct draw_pt_middle_end *middle )
    if (fpme->emit)
       draw_pt_emit_destroy( fpme->emit );
 
+   if (fpme->so_emit)
+      draw_pt_so_emit_destroy( fpme->so_emit );
+
    if (fpme->post_vs)
       draw_pt_post_vs_destroy( fpme->post_vs );
 
-   if (fpme->llvm)
-      draw_llvm_destroy( fpme->llvm );
-
    FREE(middle);
 }
 
 
-struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit_llvm( struct draw_context *draw )
+struct draw_pt_middle_end *
+draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw)
 {
    struct llvm_middle_end *fpme = 0;
 
@@ -398,13 +441,15 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit_llvm( struct draw_cont
    if (!fpme->emit)
       goto fail;
 
-   fpme->llvm = draw_llvm_create(draw);
+   fpme->so_emit = draw_pt_so_emit_create( draw );
+   if (!fpme->so_emit)
+      goto fail;
+
+   fpme->llvm = draw->llvm;
    if (!fpme->llvm)
       goto fail;
 
-   fpme->variants = NULL;
    fpme->current_variant = NULL;
-   fpme->nr_variants = 0;
 
    return &fpme->base;
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 5525dfc748d..769409cfd67 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -26,22 +26,38 @@
  **************************************************************************/
 
 #include "util/u_memory.h"
+#include "util/u_math.h"
 #include "pipe/p_context.h"
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
-#include "draw/draw_vbuf.h"
 #include "draw/draw_pt.h"
 
+
+#define DO_CLIP_XY           0x1
+#define DO_CLIP_FULL_Z       0x2
+#define DO_CLIP_HALF_Z       0x4
+#define DO_CLIP_USER         0x8
+#define DO_VIEWPORT          0x10
+#define DO_EDGEFLAG          0x20
+
+
 struct pt_post_vs {
    struct draw_context *draw;
 
+   unsigned flags;
+
    boolean (*run)( struct pt_post_vs *pvs,
-		struct vertex_header *vertices,
-		unsigned count,
-		unsigned stride );
+                   struct draw_vertex_info *info );
 };
 
-
+/**
+ * Put the bookkeeping fields of a vertex header into their default state:
+ * empty clip mask, edge flag set, padding zeroed and no vertex id assigned.
+ */
+static INLINE void
+initialize_vertex_header(struct vertex_header *header)
+{
+   header->vertex_id = UNDEFINED_VERTEX_ID;
+   header->pad = 0;
+   header->edgeflag = 1;
+   header->clipmask = 0;
+}
 
 static INLINE float
 dot4(const float *a, const float *b)
@@ -52,215 +68,121 @@ dot4(const float *a, const float *b)
            a[3]*b[3]);
 }
 
+#define FLAGS (0)
+#define TAG(x) x##_none
+#include "draw_cliptest_tmp.h"
 
+#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT)
+#define TAG(x) x##_xy_fullz_viewport
+#include "draw_cliptest_tmp.h"
 
-static INLINE unsigned
-compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr)
-{
-   unsigned mask = 0x0;
-   unsigned i;
-
-#if 0
-   debug_printf("compute clipmask %f %f %f %f\n",
-                clip[0], clip[1], clip[2], clip[3]);
-   assert(clip[3] != 0.0);
-#endif
-
-   /* Do the hardwired planes first:
-    */
-   if (-clip[0] + clip[3] < 0) mask |= (1<<0);
-   if ( clip[0] + clip[3] < 0) mask |= (1<<1);
-   if (-clip[1] + clip[3] < 0) mask |= (1<<2);
-   if ( clip[1] + clip[3] < 0) mask |= (1<<3);
-   if ( clip[2] + clip[3] < 0) mask |= (1<<4); /* match mesa clipplane numbering - for now */
-   if (-clip[2] + clip[3] < 0) mask |= (1<<5); /* match mesa clipplane numbering - for now */
-
-   /* Followed by any remaining ones:
-    */
-   for (i = 6; i < nr; i++) {
-      if (dot4(clip, plane[i]) < 0) 
-         mask |= (1<<i);
-   }
-
-   return mask;
-}
-
-
-/* The normal case - cliptest, rhw divide, viewport transform.
- *
- * Also handle identity viewport here at the expense of a few wasted
- * instructions
- */
-static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
-					  struct vertex_header *vertices,
-					  unsigned count,
-					  unsigned stride )
-{
-   struct vertex_header *out = vertices;
-   const float *scale = pvs->draw->viewport.scale;
-   const float *trans = pvs->draw->viewport.translate;
-   const unsigned pos = draw_current_shader_position_output(pvs->draw);
-   unsigned clipped = 0;
-   unsigned j;
-
-   if (0) debug_printf("%s\n", __FUNCTION__);
-
-   for (j = 0; j < count; j++) {
-      float *position = out->data[pos];
-
-#if 0
-      debug_printf("%d) io = %p, data = %p = [%f, %f, %f, %f]\n",
-                   j, out, position, position[0], position[1], position[2], position[3]);
-#endif
-
-      out->clip[0] = position[0];
-      out->clip[1] = position[1];
-      out->clip[2] = position[2];
-      out->clip[3] = position[3];
-
-      out->vertex_id = 0xffff;
-      out->clipmask = compute_clipmask_gl(out->clip, 
-					  pvs->draw->plane,
-					  pvs->draw->nr_planes);
-      clipped += out->clipmask;
-
-      if (out->clipmask == 0)
-      {
-	 /* divide by w */
-	 float w = 1.0f / position[3];
-
-	 /* Viewport mapping */
-	 position[0] = position[0] * w * scale[0] + trans[0];
-	 position[1] = position[1] * w * scale[1] + trans[1];
-	 position[2] = position[2] * w * scale[2] + trans[2];
-	 position[3] = w;
-#if 0
-         debug_printf("post viewport: %f %f %f %f\n",
-                      position[0],
-                      position[1],
-                      position[2],
-                      position[3]);
-#endif
-      }
-
-      out = (struct vertex_header *)( (char *)out + stride );
-   }
-
-   return clipped != 0;
-}
-
-
+#define FLAGS (DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT)
+#define TAG(x) x##_xy_halfz_viewport
+#include "draw_cliptest_tmp.h"
 
-/* As above plus edgeflags
- */
-static boolean 
-post_vs_cliptest_viewport_gl_edgeflag(struct pt_post_vs *pvs,
-                                      struct vertex_header *vertices,
-                                      unsigned count,
-                                      unsigned stride )
-{
-   unsigned j;
-   boolean needpipe;
+#define FLAGS (DO_CLIP_FULL_Z | DO_VIEWPORT)
+#define TAG(x) x##_fullz_viewport
+#include "draw_cliptest_tmp.h"
 
-   needpipe = post_vs_cliptest_viewport_gl( pvs, vertices, count, stride);
+#define FLAGS (DO_CLIP_HALF_Z | DO_VIEWPORT)
+#define TAG(x) x##_halfz_viewport
+#include "draw_cliptest_tmp.h"
 
-   /* If present, copy edgeflag VS output into vertex header.
-    * Otherwise, leave header as is.
-    */
-   if (pvs->draw->vs.edgeflag_output) {
-      struct vertex_header *out = vertices;
-      int ef = pvs->draw->vs.edgeflag_output;
-
-      for (j = 0; j < count; j++) {
-         const float *edgeflag = out->data[ef];
-         out->edgeflag = !(edgeflag[0] != 1.0f);
-         needpipe |= !out->edgeflag;
-         out = (struct vertex_header *)( (char *)out + stride );
-      }
-   }
-   return needpipe;
-}
+#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT)
+#define TAG(x) x##_xy_fullz_user_viewport
+#include "draw_cliptest_tmp.h"
 
+#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT | DO_EDGEFLAG)
+#define TAG(x) x##_xy_fullz_user_viewport_edgeflag
+#include "draw_cliptest_tmp.h"
 
 
 
-/* If bypass_clipping is set, skip cliptest and rhw divide.
+/* Don't want to create 64 versions of this function, so catch the
+ * less common ones here.  This is looking like something which should
+ * be code-generated, perhaps appended to the end of the vertex
+ * shader.
  */
-static boolean post_vs_viewport( struct pt_post_vs *pvs,
-			      struct vertex_header *vertices,
-			      unsigned count,
-			      unsigned stride )
-{
-   struct vertex_header *out = vertices;
-   const float *scale = pvs->draw->viewport.scale;
-   const float *trans = pvs->draw->viewport.translate;
-   const unsigned pos = draw_current_shader_position_output(pvs->draw);
-   unsigned j;
-
-   if (0) debug_printf("%s\n", __FUNCTION__);
-   for (j = 0; j < count; j++) {
-      float *position = out->data[pos];
-
-      /* Viewport mapping only, no cliptest/rhw divide
-       */
-      position[0] = position[0] * scale[0] + trans[0];
-      position[1] = position[1] * scale[1] + trans[1];
-      position[2] = position[2] * scale[2] + trans[2];
-
-      out = (struct vertex_header *)((char *)out + stride);
-   }
-   
-   return FALSE;
-}
+#define FLAGS (pvs->flags)
+#define TAG(x) x##_generic
+#include "draw_cliptest_tmp.h"
 
 
-/* If bypass_clipping is set and we have an identity viewport, nothing
- * to do.
- */
-static boolean post_vs_none( struct pt_post_vs *pvs,
-			     struct vertex_header *vertices,
-			     unsigned count,
-			     unsigned stride )
-{
-   if (0) debug_printf("%s\n", __FUNCTION__);
-   return FALSE;
-}
 
 boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
-			     struct vertex_header *pipeline_verts,
-			     unsigned count,
-			     unsigned stride )
+			     struct draw_vertex_info *info )
 {
-   return pvs->run( pvs, pipeline_verts, count, stride );
+   return pvs->run( pvs, info );
 }
 
 
 void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
-			      boolean bypass_clipping,
+			      boolean clip_xy,
+			      boolean clip_z,
+                              boolean clip_user,
 			      boolean bypass_viewport,
 			      boolean opengl,
 			      boolean need_edgeflags )
 {
-   if (!need_edgeflags) {
-      if (bypass_clipping) {
-         if (bypass_viewport)
-            pvs->run = post_vs_none;
-         else
-            pvs->run = post_vs_viewport;
-      }
-      else {
-         /* if (opengl) */
-         pvs->run = post_vs_cliptest_viewport_gl;
-      }
+   pvs->flags = 0;
+
+   if (clip_xy)
+      pvs->flags |= DO_CLIP_XY;
+   
+   if (clip_z && opengl) {
+      pvs->flags |= DO_CLIP_FULL_Z;
+      ASSIGN_4V( pvs->draw->plane[4],  0,  0,  1, 1 );
    }
-   else {
-      /* If we need to copy edgeflags to the vertex header, it should
-       * mean we're running the primitive pipeline.  Hence the bypass
-       * flags should be false.
-       */
-      assert(!bypass_clipping);
-      assert(!bypass_viewport);
-      pvs->run = post_vs_cliptest_viewport_gl_edgeflag;
+
+   if (clip_z && !opengl) {
+      pvs->flags |= DO_CLIP_HALF_Z;
+      ASSIGN_4V( pvs->draw->plane[4],  0,  0,  1, 0 );
+   }
+
+   if (clip_user)
+      pvs->flags |= DO_CLIP_USER;
+
+   if (!bypass_viewport)
+      pvs->flags |= DO_VIEWPORT;
+
+   if (need_edgeflags)
+      pvs->flags |= DO_EDGEFLAG;
+
+   /* Now select the relevant function:
+    */
+   switch (pvs->flags) {
+   case 0:
+      pvs->run = do_cliptest_none;
+      break;
+
+   case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT:
+      pvs->run = do_cliptest_xy_fullz_viewport;
+      break;
+
+   case DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT:
+      pvs->run = do_cliptest_xy_halfz_viewport;
+      break;
+
+   case DO_CLIP_FULL_Z | DO_VIEWPORT:
+      pvs->run = do_cliptest_fullz_viewport;
+      break;
+
+   case DO_CLIP_HALF_Z | DO_VIEWPORT:
+      pvs->run = do_cliptest_halfz_viewport;
+      break;
+
+   case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT:
+      pvs->run = do_cliptest_xy_fullz_user_viewport;
+      break;
+
+   case (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER |
+         DO_VIEWPORT | DO_EDGEFLAG):
+      pvs->run = do_cliptest_xy_fullz_user_viewport_edgeflag;
+      break;
+      
+   default:
+      pvs->run = do_cliptest_generic;
+      break;
    }
 }
 
@@ -272,7 +194,7 @@ struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw )
       return NULL;
 
    pvs->draw = draw;
-   
+
    return pvs;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
new file mode 100644
index 00000000000..c86bdd99a33
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -0,0 +1,293 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+/**
+ * State of the stream-output ("so") emit stage.
+ *
+ * Apart from 'draw' and 'has_so', every field is re-initialised at the
+ * start of each draw_pt_so_emit() call.
+ */
+struct pt_so_emit {
+   /* Draw context this stage belongs to (set in draw_pt_so_emit_create). */
+   struct draw_context *draw;
+
+   /* Per-buffer write cursors: copied from draw->so.buffers by
+    * draw_pt_so_emit() and advanced by so_emit_prim() as data is written.
+    */
+   void *buffers[PIPE_MAX_SO_BUFFERS];
+
+   /* Byte distance between consecutive input vertices. */
+   unsigned input_vertex_stride;
+   /* Attribute data of the first input vertex. */
+   const float (*inputs)[4];
+
+   /* TRUE when the bound state has outputs and at least one non-NULL
+    * target buffer; computed by draw_pt_so_emit_prepare().
+    */
+   boolean has_so;
+
+   /* TRUE when every output of the current state targets buffer 0. */
+   boolean single_buffer;
+
+   /* Counters reported through render->set_stream_output_info(). */
+   unsigned emitted_primitives;
+   unsigned emitted_vertices;
+};
+
+
+/**
+ * Decide whether stream output is active for the upcoming draws.
+ *
+ * Stream output is enabled (emit->has_so) only when the bound state
+ * declares at least one output AND at least one of the bound target
+ * buffers is non-NULL.  When it is enabled the backend is flushed.
+ */
+void draw_pt_so_emit_prepare(struct pt_so_emit *emit)
+{
+   struct draw_context *draw = emit->draw;
+   unsigned buf;
+
+   emit->has_so = FALSE;
+
+   /* a state with outputs is only useful if there is somewhere to
+    * write them to */
+   if (draw->so.state.num_outputs > 0) {
+      for (buf = 0; buf < draw->so.num_buffers; ++buf) {
+         if (draw->so.buffers[buf]) {
+            emit->has_so = TRUE;
+            break;
+         }
+      }
+   }
+
+   if (emit->has_so) {
+      /* XXX: need to flush to get prim_vbuf.c to release its allocation??
+       */
+      draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+   }
+}
+
+/**
+ * Test whether one vector component is enabled in a TGSI writemask.
+ *
+ * \param mask   a TGSI_WRITEMASK_x value (any combination of X, Y, Z, W)
+ * \param compo  component index: 0 = X, 1 = Y, 2 = Z, 3 = W
+ * \return TRUE if the component is selected by the mask, FALSE otherwise
+ *         (including for compo >= 4).
+ *
+ * A mask with bits outside TGSI_WRITEMASK_XYZW is unexpected: it trips a
+ * debug assertion and is then treated as "all four components writable".
+ *
+ * Testing the component's own bit replaces a 16-way switch whose XZW case
+ * checked Y instead of Z and whose YZW case checked index 4 instead of W.
+ */
+static boolean
+is_component_writable(unsigned mask,
+                      unsigned compo)
+{
+   static const unsigned component_bit[4] = {
+      TGSI_WRITEMASK_X,
+      TGSI_WRITEMASK_Y,
+      TGSI_WRITEMASK_Z,
+      TGSI_WRITEMASK_W
+   };
+
+   if (mask & ~TGSI_WRITEMASK_XYZW) {
+      debug_assert(!"Unknown writemask in stream out");
+      return compo < 4;
+   }
+
+   if (compo >= 4)
+      return FALSE;
+
+   return (mask & component_bit[compo]) != 0;
+}
+
+/**
+ * Write one primitive's vertices to the stream-output buffers.
+ *
+ * \param so            emit state; so->inputs / so->input_vertex_stride
+ *                      locate the vertex data and so->buffers holds the
+ *                      current write cursors, which are advanced here
+ * \param indices       vertex indices of the primitive (into so->inputs)
+ * \param num_vertices  number of entries in 'indices'
+ *
+ * For each vertex, every output slot of draw->so.state copies the
+ * components enabled by its register mask to the slot's target buffer.
+ * Updates so->emitted_vertices and so->emitted_primitives.
+ */
+static void so_emit_prim(struct pt_so_emit *so,
+                         unsigned *indices,
+                         unsigned num_vertices)
+{
+   unsigned slot, i;
+   unsigned input_vertex_stride = so->input_vertex_stride;
+   struct draw_context *draw = so->draw;
+   const float (*input_ptr)[4];
+   const struct pipe_stream_output_state *state =
+      &draw->so.state;
+   /* Address of the write cursor used by the most recent slot. */
+   float **buffer = 0;
+
+   input_ptr = so->inputs;
+
+   for (i = 0; i < num_vertices; ++i) {
+      const float (*input)[4];
+      unsigned total_written_compos = 0;
+      /*debug_printf("%d) vertex index = %d (prim idx = %d)\n", i, indices[i], prim_idx);*/
+      /* Locate the vertex: the stride is in bytes, hence the char cast. */
+      input = (const float (*)[4])(
+         (const char *)input_ptr + (indices[i] * input_vertex_stride));
+      for (slot = 0; slot < state->num_outputs; ++slot) {
+         unsigned idx = state->register_index[slot];
+         unsigned writemask = state->register_mask[slot];
+         unsigned written_compos = 0;
+         unsigned compo;
+
+         /* Point at the cursor itself so the advance below persists in
+          * so->buffers across vertices and primitives.
+          */
+         buffer = (float**)&so->buffers[state->output_buffer[slot]];
+
+         /*debug_printf("\tSlot = %d, vs_slot = %d, idx = %d:\n",
+           slot, vs_slot, idx);*/
+#if 1
+         /* NOTE(review): all four components are checked, including ones
+          * the writemask excludes.
+          */
+         assert(!util_is_inf_or_nan(input[idx][0]));
+         assert(!util_is_inf_or_nan(input[idx][1]));
+         assert(!util_is_inf_or_nan(input[idx][2]));
+         assert(!util_is_inf_or_nan(input[idx][3]));
+#endif
+         /* Pack the enabled components contiguously at the cursor. */
+         for (compo = 0; compo < 4; ++compo) {
+            if (is_component_writable(writemask, compo)) {
+               float *buf = *buffer;
+               buf[written_compos++] = input[idx][compo];
+            }
+         }
+#if 0
+         debug_printf("\t\t(writemask = %d)%f %f %f %f\n",
+                      writemask,
+                      input[idx][0],
+                      input[idx][1],
+                      input[idx][2],
+                      input[idx][3]);
+#endif
+         *buffer += written_compos;
+         total_written_compos += written_compos;
+      }
+      if (so->single_buffer) {
+         /* All slots wrote to buffer 0, so 'buffer' still addresses that
+          * cursor: skip the rest of the per-vertex stride (in bytes).
+          * Relies on num_outputs > 0, otherwise 'buffer' is still NULL;
+          * draw_pt_so_emit() only runs the emit when has_so is set.
+          */
+         int stride = (int)state->stride -
+                      sizeof(float) * total_written_compos;
+
+         debug_assert(stride >= 0);
+         *buffer = (float*) (((char*)*buffer) + stride);
+      }
+   }
+   so->emitted_vertices += num_vertices;
+   ++so->emitted_primitives;
+}
+
+/** Stream out a point primitive made of the single vertex 'idx'. */
+static void so_point(struct pt_so_emit *so, int idx)
+{
+   unsigned vertex = (unsigned)idx;
+
+   so_emit_prim(so, &vertex, 1);
+}
+
+/** Stream out a line primitive made of vertices 'i0' and 'i1', in order. */
+static void so_line(struct pt_so_emit *so, int i0, int i1)
+{
+   unsigned line[2];
+
+   line[1] = (unsigned)i1;
+   line[0] = (unsigned)i0;
+
+   so_emit_prim(so, line, 2);
+}
+
+/** Stream out a triangle made of vertices 'i0', 'i1', 'i2', in order. */
+static void so_tri(struct pt_so_emit *so, int i0, int i1, int i2)
+{
+   unsigned tri[3];
+
+   tri[2] = (unsigned)i2;
+   tri[1] = (unsigned)i1;
+   tri[0] = (unsigned)i0;
+
+   so_emit_prim(so, tri, 3);
+}
+
+
+/* The template header is included twice with different FUNC / GET_ELT
+ * settings; presumably each inclusion defines the function named by FUNC
+ * (so_run_linear and so_run_elts, both called from draw_pt_so_emit()) and
+ * cleans up these macros afterwards -- confirm in draw_so_emit_tmp.h.
+ *
+ * Linear variant: vertex indices are simply start + idx.
+ */
+#define FUNC         so_run_linear
+#define GET_ELT(idx) (start + (idx))
+#include "draw_so_emit_tmp.h"
+
+
+/* Indexed variant: vertex indices are looked up in input_prims->elts. */
+#define FUNC         so_run_elts
+#define LOCAL_VARS   const ushort *elts = input_prims->elts;
+#define GET_ELT(idx) (elts[start + (idx)])
+#include "draw_so_emit_tmp.h"
+
+
+/**
+ * Run stream output for a batch of primitives.
+ *
+ * Does nothing unless draw_pt_so_emit_prepare() found stream output to be
+ * active.  Otherwise the write cursors are reloaded from draw->so.buffers,
+ * each primitive listed in input_prims->primitive_lengths is emitted, and
+ * the resulting primitive / vertex counts are handed to the renderer.
+ *
+ * \param emit         stream-output stage
+ * \param input_verts  vertices to read from
+ * \param input_prims  primitives to emit (linear or element-indexed)
+ */
+void draw_pt_so_emit( struct pt_so_emit *emit,
+                      const struct draw_vertex_info *input_verts,
+                      const struct draw_prim_info *input_prims )
+{
+   struct draw_context *draw = emit->draw;
+   struct vbuf_render *render = draw->render;
+   unsigned first_vert = 0;
+   unsigned n;
+
+   if (!emit->has_so)
+      return;
+
+   emit->emitted_primitives = 0;
+   emit->emitted_vertices = 0;
+
+   emit->inputs = (const float (*)[4])input_verts->verts->data;
+   emit->input_vertex_stride = input_verts->stride;
+
+   for (n = 0; n < draw->so.num_buffers; ++n)
+      emit->buffers[n] = draw->so.buffers[n];
+
+   /* single-buffer mode: every output goes to buffer 0 */
+   emit->single_buffer = TRUE;
+   for (n = 0; n < draw->so.state.num_outputs; ++n) {
+      if (draw->so.state.output_buffer[n] != 0) {
+         emit->single_buffer = FALSE;
+         break;
+      }
+   }
+
+   /* XXX: need to flush to get prim_vbuf.c to release its allocation??*/
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+
+   /* each primitive starts where the previous one ended */
+   for (n = 0; n < input_prims->primitive_count; ++n) {
+      const unsigned len = input_prims->primitive_lengths[n];
+
+      if (input_prims->linear)
+         so_run_linear(emit, input_prims, input_verts, first_vert, len);
+      else
+         so_run_elts(emit, input_prims, input_verts, first_vert, len);
+
+      first_vert += len;
+   }
+
+   render->set_stream_output_info(render,
+                                  emit->emitted_primitives,
+                                  emit->emitted_vertices);
+}
+
+
+/**
+ * Allocate a zero-initialised stream-output stage bound to 'draw'.
+ *
+ * \return the new stage, or NULL if the allocation failed.
+ */
+struct pt_so_emit *draw_pt_so_emit_create( struct draw_context *draw )
+{
+   struct pt_so_emit *so = CALLOC_STRUCT(pt_so_emit);
+
+   if (so)
+      so->draw = draw;
+
+   return so;
+}
+
+/**
+ * Release a stage created by draw_pt_so_emit_create().
+ *
+ * Only the pt_so_emit structure itself is freed; the buffer pointers it
+ * holds are copies of draw->so.buffers and are not released here.
+ */
+void draw_pt_so_emit_destroy( struct pt_so_emit *emit )
+{
+   FREE(emit);
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c
index 3236d38e6ab..513bbbed216 100644
--- a/src/gallium/auxiliary/draw/draw_pt_util.c
+++ b/src/gallium/auxiliary/draw/draw_pt_util.c
@@ -53,7 +53,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       break;
    case PIPE_PRIM_LINES_ADJACENCY:
       *first = 4;
-      *incr = 2;
+      *incr = 4;
       break;
    case PIPE_PRIM_LINE_STRIP_ADJACENCY:
       *first = 4;
@@ -65,7 +65,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       break;
    case PIPE_PRIM_TRIANGLES_ADJACENCY:
       *first = 6;
-      *incr = 3;
+      *incr = 6;
       break;
    case PIPE_PRIM_TRIANGLE_STRIP:
    case PIPE_PRIM_TRIANGLE_FAN:
@@ -75,7 +75,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       break;
    case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
       *first = 6;
-      *incr = 1;
+      *incr = 2;
       break;
    case PIPE_PRIM_QUADS:
       *first = 4;
@@ -92,3 +92,10 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       break;
    }
 }
+
+unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr)
+{
+   if (count < first)
+      return 0;
+   return count - (count - first) % incr;
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c
deleted file mode 100644
index d0e16c9bc3c..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_varray.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-#include "draw/draw_context.h"
-#include "draw/draw_private.h"
-#include "draw/draw_pt.h"
-
-#define FETCH_MAX 256
-#define DRAW_MAX (FETCH_MAX+8)
-
-struct varray_frontend {
-   struct draw_pt_front_end base;
-   struct draw_context *draw;
-
-   ushort draw_elts[DRAW_MAX];
-   unsigned fetch_elts[FETCH_MAX];
-
-   unsigned driver_fetch_max;
-   unsigned fetch_max;
-
-   struct draw_pt_middle_end *middle;
-
-   unsigned input_prim;
-   unsigned output_prim;
-};
-
-
-static void varray_flush_linear(struct varray_frontend *varray,
-                                unsigned start, unsigned count)
-{
-   if (count) {
-      assert(varray->middle->run_linear);
-      varray->middle->run_linear(varray->middle, start, count);
-   }
-}
-
-static void varray_line_loop_segment(struct varray_frontend *varray,
-                                     unsigned start,
-                                     unsigned segment_start,
-                                     unsigned segment_count,
-                                     boolean end )
-{
-   assert(segment_count < varray->fetch_max);
-   if (segment_count >= 1) {
-      unsigned nr = 0, i;
-
-      for (i = 0; i < segment_count; i++) 
-         varray->fetch_elts[nr++] = start + segment_start + i;
-
-      if (end) 
-         varray->fetch_elts[nr++] = start;
-
-      assert(nr <= FETCH_MAX);
-
-      varray->middle->run(varray->middle, 
-                          varray->fetch_elts,
-                          nr,
-                          varray->draw_elts, /* ie. linear */
-                          nr);
-   }
-}
-
-
-
-static void varray_fan_segment(struct varray_frontend *varray,
-                               unsigned start, 
-                               unsigned segment_start,
-                               unsigned segment_count )
-{
-   assert(segment_count < varray->fetch_max);
-   if (segment_count >= 2) {
-      unsigned nr = 0, i;
-
-      if (segment_start != 0)
-         varray->fetch_elts[nr++] = start;
-
-      for (i = 0 ; i < segment_count; i++) 
-         varray->fetch_elts[nr++] = start + segment_start + i;
-
-      assert(nr <= FETCH_MAX);
-
-      varray->middle->run(varray->middle, 
-                          varray->fetch_elts,
-                          nr,
-                          varray->draw_elts, /* ie. linear */
-                          nr);
-   }
-}
-
-
-
-
-#define FUNC varray_run
-#include "draw_pt_varray_tmp_linear.h"
-
-static unsigned decompose_prim[PIPE_PRIM_POLYGON + 1] = {
-   PIPE_PRIM_POINTS,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_LINE_STRIP,        /* decomposed LINELOOP */
-   PIPE_PRIM_LINE_STRIP,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLE_STRIP,
-   PIPE_PRIM_TRIANGLE_FAN, 
-   PIPE_PRIM_QUADS,
-   PIPE_PRIM_QUAD_STRIP,
-   PIPE_PRIM_POLYGON
-};
-
-
-
-static void varray_prepare(struct draw_pt_front_end *frontend,
-                           unsigned prim,
-                           struct draw_pt_middle_end *middle,
-                           unsigned opt)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-
-   varray->base.run = varray_run;
-
-   varray->input_prim = prim;
-   varray->output_prim = decompose_prim[prim];
-
-   varray->middle = middle;
-   middle->prepare(middle, varray->output_prim, opt, &varray->driver_fetch_max );
-
-   /* check that the max is even */
-   assert((varray->driver_fetch_max & 1) == 0);
-
-   varray->fetch_max = MIN2(FETCH_MAX, varray->driver_fetch_max);
-}
-
-
-
-
-static void varray_finish(struct draw_pt_front_end *frontend)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-   varray->middle->finish(varray->middle);
-   varray->middle = NULL;
-}
-
-static void varray_destroy(struct draw_pt_front_end *frontend)
-{
-   FREE(frontend);
-}
-
-
-struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw)
-{
-   ushort i;
-   struct varray_frontend *varray = CALLOC_STRUCT(varray_frontend);
-   if (varray == NULL)
-      return NULL;
-
-   varray->base.prepare = varray_prepare;
-   varray->base.run     = NULL;
-   varray->base.finish  = varray_finish;
-   varray->base.destroy = varray_destroy;
-   varray->draw = draw;
-
-   for (i = 0; i < DRAW_MAX; i++) {
-      varray->draw_elts[i] = i;
-   }
-
-   return &varray->base;
-}
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h
deleted file mode 100644
index 7c722457c3c..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h
+++ /dev/null
@@ -1,238 +0,0 @@
-
-static void FUNC(struct draw_pt_front_end *frontend,
-                 pt_elt_func get_elt,
-                 const void *elts,
-                 unsigned count)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-   struct draw_context *draw = varray->draw;
-   unsigned start = (unsigned)elts;
-
-   boolean flatfirst = (draw->rasterizer->flatshade &&
-                        draw->rasterizer->flatshade_first);
-   unsigned i, j;
-   ushort flags;
-   unsigned first, incr;
-
-   varray->fetch_start = start;
-
-   draw_pt_split_prim(varray->input_prim, &first, &incr);
-
-#if 0
-   debug_printf("%s (%d) %d/%d\n", __FUNCTION__,
-                varray->input_prim,
-                start, count);
-#endif
-
-   switch (varray->input_prim) {
-   case PIPE_PRIM_POINTS:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i < end; i++) {
-            POINT(varray, i + 0);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_LINES:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+1 < end; i += 2) {
-            LINE(varray, DRAW_PIPE_RESET_STIPPLE,
-                 i + 0, i + 1);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:
-      if (count >= 2) {
-         flags = DRAW_PIPE_RESET_STIPPLE;
-
-         for (j = 0; j + first <= count; j += i) {
-            unsigned end = MIN2(FETCH_MAX, count - j);
-            end -= (end % incr);
-            for (i = 1; i < end; i++, flags = 0) {
-               LINE(varray, flags, i - 1, i);
-            }
-            LINE(varray, flags, i - 1, 0);
-            i = end;
-            fetch_init(varray, end);
-            varray_flush(varray);
-         }
-      }
-      break;
-
-   case PIPE_PRIM_LINE_STRIP:
-      flags = DRAW_PIPE_RESET_STIPPLE;
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 1; i < end; i++, flags = 0) {
-            LINE(varray, flags, i - 1, i);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLES:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+2 < end; i += 3) {
-            TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                     i + 0, i + 1, i + 2);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      if (flatfirst) {
-         for (j = 0; j + first <= count; j += i) {
-            unsigned end = MIN2(FETCH_MAX, count - j);
-            end -= (end % incr);
-            for (i = 0; i+2 < end; i++) {
-               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                        i + 0, i + 1 + (i&1), i + 2 - (i&1));
-            }
-            i = end;
-            fetch_init(varray, end);
-            varray_flush(varray);
-            if (j + first + i <= count) {
-               varray->fetch_start -= 2;
-               i -= 2;
-            }
-         }
-      }
-      else {
-         for (j = 0; j + first <= count; j += i) {
-            unsigned end = MIN2(FETCH_MAX, count - j);
-            end -= (end  % incr);
-            for (i = 0; i + 2 < end; i++) {
-               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                        i + 0 + (i&1), i + 1 - (i&1), i + 2);
-            }
-            i = end;
-            fetch_init(varray, end);
-            varray_flush(varray);
-            if (j + first + i <= count) {
-               varray->fetch_start -= 2;
-               i -= 2;
-            }
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_FAN:
-      if (count >= 3) {
-         if (flatfirst) {
-            flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
-            for (j = 0; j + first <= count; j += i) {
-               unsigned end = MIN2(FETCH_MAX, count - j);
-               end -= (end % incr);
-               for (i = 0; i+2 < end; i++) {
-                  TRIANGLE(varray, flags, i + 1, i + 2, 0);
-               }
-               i = end;
-               fetch_init(varray, end);
-               varray_flush(varray);
-            }
-         }
-         else {
-            flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
-            for (j = 0; j + first <= count; j += i) {
-               unsigned end = MIN2(FETCH_MAX, count - j);
-               end -= (end % incr);
-               for (i = 0; i+2 < end; i++) {
-                  TRIANGLE(varray, flags, 0, i + 1, i + 2);
-               }
-               i = end;
-               fetch_init(varray, end);
-               varray_flush(varray);
-            }
-         }
-      }
-      break;
-
-   case PIPE_PRIM_QUADS:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+3 < end; i += 4) {
-            QUAD(varray, i + 0, i + 1, i + 2, i + 3);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_QUAD_STRIP:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+3 < end; i += 2) {
-            QUAD(varray, i + 2, i + 0, i + 1, i + 3);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-         if (j + first + i <= count) {
-            varray->fetch_start -= 2;
-            i -= 2;
-         }
-      }
-      break;
-
-   case PIPE_PRIM_POLYGON:
-   {
-      /* These bitflags look a little odd because we submit the
-       * vertices as (1,2,0) to satisfy flatshade requirements.
-       */
-      const ushort edge_first  = DRAW_PIPE_EDGE_FLAG_2;
-      const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0;
-      const ushort edge_last   = DRAW_PIPE_EDGE_FLAG_1;
-
-      flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle;
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+2 < end; i++, flags = edge_middle) {
-
-            if (i + 3 == count)
-               flags |= edge_last;
-
-            TRIANGLE(varray, flags, i + 1, i + 2, 0);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-   }
-   break;
-
-   default:
-      assert(0);
-      break;
-   }
-
-   varray_flush(varray);
-}
-
-#undef TRIANGLE
-#undef QUAD
-#undef POINT
-#undef LINE
-#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
deleted file mode 100644
index a292346be95..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
+++ /dev/null
@@ -1,98 +0,0 @@
-static unsigned trim( unsigned count, unsigned first, unsigned incr )
-{
-   return count - (count - first) % incr; 
-}
-
-static void FUNC(struct draw_pt_front_end *frontend,
-                 pt_elt_func get_elt,
-                 const void *elts,
-                 int elt_bias,
-                 unsigned count)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-   unsigned start = (unsigned) ((char *) elts - (char *) NULL);
-
-   unsigned j;
-   unsigned first, incr;
-
-   assert(elt_bias == 0);
-
-   draw_pt_split_prim(varray->input_prim, &first, &incr);
-   
-   /* Sanitize primitive length:
-    */
-   count = trim(count, first, incr); 
-   if (count < first)
-      return;
-
-#if 0
-   debug_printf("%s (%d) %d/%d\n", __FUNCTION__,
-                varray->input_prim,
-                start, count);
-#endif
-
-   switch (varray->input_prim) {
-   case PIPE_PRIM_POINTS:
-   case PIPE_PRIM_LINES:
-   case PIPE_PRIM_TRIANGLES:
-   case PIPE_PRIM_LINE_STRIP:
-   case PIPE_PRIM_TRIANGLE_STRIP:
-   case PIPE_PRIM_QUADS:
-   case PIPE_PRIM_QUAD_STRIP:
-   case PIPE_PRIM_LINES_ADJACENCY:
-   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
-   case PIPE_PRIM_TRIANGLES_ADJACENCY:
-   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
-      for (j = 0; j < count;) {
-         unsigned remaining = count - j;
-         unsigned nr = trim( MIN2(varray->driver_fetch_max, remaining), first, incr );
-         varray_flush_linear(varray, start + j, nr);
-         j += nr;
-         if (nr != remaining) 
-            j -= (first - incr);
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:
-      /* Always have to decompose as we've stated that this will be
-       * emitted as a line-strip.
-       */
-      for (j = 0; j < count;) {
-         unsigned remaining = count - j;
-         unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr );
-         varray_line_loop_segment(varray, start, j, nr, nr == remaining);
-         j += nr;
-         if (nr != remaining) 
-            j -= (first - incr);
-      }
-      break;
-
-
-   case PIPE_PRIM_POLYGON:
-   case PIPE_PRIM_TRIANGLE_FAN: 
-      if (count < varray->driver_fetch_max) {
-         varray_flush_linear(varray, start, count);
-      }
-      else {
-         for ( j = 0; j < count;) {
-            unsigned remaining = count - j;
-            unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr );
-            varray_fan_segment(varray, start, j, nr);
-            j += nr;
-            if (nr != remaining) 
-               j -= (first - incr);
-         }
-      }
-      break;
-
-   default:
-      assert(0);
-      break;
-   }
-}
-
-#undef TRIANGLE
-#undef QUAD
-#undef POINT
-#undef LINE
-#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
deleted file mode 100644
index 37ffbac4f92..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ /dev/null
@@ -1,523 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "util/u_memory.h"
-#include "util/u_prim.h"
-#include "draw/draw_context.h"
-#include "draw/draw_private.h"
-#include "draw/draw_pt.h"
-
-
-#define CACHE_MAX 256
-#define FETCH_MAX 256
-#define DRAW_MAX (16*1024)
-
-struct vcache_frontend {
-   struct draw_pt_front_end base;
-   struct draw_context *draw;
-
-   unsigned in[CACHE_MAX];
-   ushort out[CACHE_MAX];
-
-   ushort draw_elts[DRAW_MAX];
-   unsigned fetch_elts[FETCH_MAX];
-
-   unsigned draw_count;
-   unsigned fetch_count;
-   unsigned fetch_max;
-   
-   struct draw_pt_middle_end *middle;
-
-   unsigned input_prim;
-   unsigned output_prim;
-
-   unsigned middle_prim;
-   unsigned opt;
-};
-
-static INLINE void 
-vcache_flush( struct vcache_frontend *vcache )
-{
-   if (vcache->middle_prim != vcache->output_prim) {
-      vcache->middle_prim = vcache->output_prim;
-      vcache->middle->prepare( vcache->middle, 
-                               vcache->middle_prim, 
-                               vcache->opt, 
-                               &vcache->fetch_max );
-   }
-
-   if (vcache->draw_count) {
-      vcache->middle->run( vcache->middle,
-                           vcache->fetch_elts,
-                           vcache->fetch_count,
-                           vcache->draw_elts,
-                           vcache->draw_count );
-   }
-
-   memset(vcache->in, ~0, sizeof(vcache->in));
-   vcache->fetch_count = 0;
-   vcache->draw_count = 0;
-}
-
-static INLINE void 
-vcache_check_flush( struct vcache_frontend *vcache )
-{
-   if ( vcache->draw_count + 6 >= DRAW_MAX ||
-        vcache->fetch_count + 4 >= FETCH_MAX )
-   {
-      vcache_flush( vcache );
-   }
-}
-
-
-static INLINE void 
-vcache_elt( struct vcache_frontend *vcache,
-            unsigned felt,
-            ushort flags )
-{
-   unsigned idx = felt % CACHE_MAX;
-
-   if (vcache->in[idx] != felt) {
-      assert(vcache->fetch_count < FETCH_MAX);
-
-      vcache->in[idx] = felt;
-      vcache->out[idx] = (ushort)vcache->fetch_count;
-      vcache->fetch_elts[vcache->fetch_count++] = felt;
-   }
-
-   vcache->draw_elts[vcache->draw_count++] = vcache->out[idx] | flags;
-}
-
-
-                   
-static INLINE void 
-vcache_triangle( struct vcache_frontend *vcache,
-                 unsigned i0,
-                 unsigned i1,
-                 unsigned i2 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_check_flush(vcache);
-}
-
-			  
-static INLINE void 
-vcache_triangle_flags( struct vcache_frontend *vcache,
-                       ushort flags,
-                       unsigned i0,
-                       unsigned i1,
-                       unsigned i2 )
-{
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_check_flush(vcache);
-}
-
-static INLINE void 
-vcache_line( struct vcache_frontend *vcache,
-             unsigned i0,
-             unsigned i1 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void 
-vcache_line_flags( struct vcache_frontend *vcache,
-                   ushort flags,
-                   unsigned i0,
-                   unsigned i1 )
-{
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, i1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void 
-vcache_point( struct vcache_frontend *vcache,
-              unsigned i0 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_check_flush(vcache);
-}
-
-static INLINE void 
-vcache_quad( struct vcache_frontend *vcache,
-             unsigned i0,
-             unsigned i1,
-             unsigned i2,
-             unsigned i3 )
-{
-   vcache_triangle( vcache, i0, i1, i3 );
-   vcache_triangle( vcache, i1, i2, i3 );
-}
-
-static INLINE void 
-vcache_ef_quad( struct vcache_frontend *vcache,
-                unsigned i0,
-                unsigned i1,
-                unsigned i2,
-                unsigned i3 )
-{
-   if (vcache->draw->rasterizer->flatshade_first) {
-      vcache_triangle_flags( vcache,
-                             ( DRAW_PIPE_RESET_STIPPLE |
-                               DRAW_PIPE_EDGE_FLAG_0 |
-                               DRAW_PIPE_EDGE_FLAG_1 ),
-                             i0, i1, i2 );
-
-      vcache_triangle_flags( vcache,
-                             ( DRAW_PIPE_EDGE_FLAG_2 |
-                               DRAW_PIPE_EDGE_FLAG_1 ),
-                             i0, i2, i3 );
-   }
-   else {
-      vcache_triangle_flags( vcache,
-                             ( DRAW_PIPE_RESET_STIPPLE |
-                               DRAW_PIPE_EDGE_FLAG_0 |
-                               DRAW_PIPE_EDGE_FLAG_2 ),
-                             i0, i1, i3 );
-
-      vcache_triangle_flags( vcache,
-                             ( DRAW_PIPE_EDGE_FLAG_0 |
-                               DRAW_PIPE_EDGE_FLAG_1 ),
-                             i1, i2, i3 );
-   }
-}
-
-/* At least for now, we're back to using a template include file for
- * this.  The two paths aren't too different though - it may be
- * possible to reunify them.
- */
-#define TRIANGLE(vc,flags,i0,i1,i2) vcache_triangle_flags(vc,flags,i0,i1,i2)
-#define QUAD(vc,i0,i1,i2,i3)        vcache_ef_quad(vc,i0,i1,i2,i3)
-#define LINE(vc,flags,i0,i1)        vcache_line_flags(vc,flags,i0,i1)
-#define POINT(vc,i0)                vcache_point(vc,i0)
-#define FUNC vcache_run_extras
-#include "draw_pt_vcache_tmp.h"
-
-#define TRIANGLE(vc,flags,i0,i1,i2) vcache_triangle(vc,i0,i1,i2)
-#define QUAD(vc,i0,i1,i2,i3)        vcache_quad(vc,i0,i1,i2,i3)
-#define LINE(vc,flags,i0,i1)        vcache_line(vc,i0,i1)
-#define POINT(vc,i0)                vcache_point(vc,i0)
-#define FUNC vcache_run
-#include "draw_pt_vcache_tmp.h"
-
-static INLINE void 
-rebase_uint_elts( const unsigned *src,
-                  unsigned count,
-                  int delta,
-                  ushort *dest )
-{
-   unsigned i;
-
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i] + delta);
-}
-
-static INLINE void 
-rebase_ushort_elts( const ushort *src,
-                    unsigned count,
-                    int delta,
-                                ushort *dest )
-{
-   unsigned i;
-
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i] + delta);
-}
-
-static INLINE void 
-rebase_ubyte_elts( const ubyte *src,
-                   unsigned count,
-                   int delta,
-                   ushort *dest )
-{
-   unsigned i;
-
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i] + delta);
-}
-
-
-
-static INLINE void 
-translate_uint_elts( const unsigned *src,
-                     unsigned count,
-                     ushort *dest )
-{
-   unsigned i;
-
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i]);
-}
-
-static INLINE void 
-translate_ushort_elts( const ushort *src,
-                       unsigned count,
-                       ushort *dest )
-{
-   unsigned i;
-
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i]);
-}
-
-static INLINE void 
-translate_ubyte_elts( const ubyte *src,
-                      unsigned count,
-                      ushort *dest )
-{
-   unsigned i;
-
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i]);
-}
-
-
-
-
-#if 0
-static INLINE enum pipe_format 
-format_from_get_elt( pt_elt_func get_elt )
-{
-   switch (draw->pt.user.eltSize) {
-   case 1: return PIPE_FORMAT_R8_UNORM;
-   case 2: return PIPE_FORMAT_R16_UNORM;
-   case 4: return PIPE_FORMAT_R32_UNORM;
-   default: return PIPE_FORMAT_NONE;
-   }
-}
-#endif
-
-static INLINE void 
-vcache_check_run( struct draw_pt_front_end *frontend, 
-                  pt_elt_func get_elt,
-                  const void *elts,
-                  int elt_bias,
-                  unsigned draw_count )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; 
-   struct draw_context *draw = vcache->draw;
-   unsigned min_index = draw->pt.user.min_index;
-   unsigned max_index = draw->pt.user.max_index;
-   unsigned index_size = draw->pt.user.eltSize;
-   unsigned fetch_count = max_index + 1 - min_index;
-   const ushort *transformed_elts;
-   ushort *storage = NULL;
-   boolean ok = FALSE;
-
-
-   if (0) debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, 
-                       vcache->fetch_max,
-                       draw_count);
-      
-   if (elt_bias + max_index >= DRAW_PIPE_MAX_VERTICES ||
-       fetch_count >= UNDEFINED_VERTEX_ID ||
-       fetch_count > draw_count) {
-      if (0) debug_printf("fail\n");
-      goto fail;
-   }
-      
-   if (vcache->middle_prim != vcache->input_prim) {
-      vcache->middle_prim = vcache->input_prim;
-      vcache->middle->prepare( vcache->middle, 
-                               vcache->middle_prim, 
-                               vcache->opt, 
-                               &vcache->fetch_max );
-   }
-
-
-   assert((elt_bias >= 0 && min_index + elt_bias >= min_index) ||
-          (elt_bias <  0 && min_index + elt_bias <  min_index));
-
-   if (min_index == 0 &&
-       index_size == 2)
-   {
-      transformed_elts = (const ushort *)elts;
-   }
-   else 
-   {
-      storage = MALLOC( draw_count * sizeof(ushort) );
-      if (!storage)
-         goto fail;
-      
-      if (min_index == 0) {
-         switch(index_size) {
-         case 1:
-            translate_ubyte_elts( (const ubyte *)elts,
-                                  draw_count,
-                                  storage );
-            break;
-
-         case 2:
-            translate_ushort_elts( (const ushort *)elts,
-                                   draw_count,
-                                   storage );
-            break;
-
-         case 4:
-            translate_uint_elts( (const uint *)elts,
-                                 draw_count,
-                                 storage );
-            break;
-
-         default:
-            assert(0);
-            FREE(storage);
-            return;
-         }
-      }
-      else {
-         switch(index_size) {
-         case 1:
-            rebase_ubyte_elts( (const ubyte *)elts,
-                                  draw_count,
-                                  0 - (int)min_index,
-                                  storage );
-            break;
-
-         case 2:
-            rebase_ushort_elts( (const ushort *)elts,
-                                   draw_count,
-                                   0 - (int)min_index,
-                                   storage );
-            break;
-
-         case 4:
-            rebase_uint_elts( (const uint *)elts,
-                                 draw_count,
-                                 0 - (int)min_index,
-                                 storage );
-            break;
-
-         default:
-            assert(0);
-            FREE(storage);
-            return;
-         }
-      }
-      transformed_elts = storage;
-   }
-
-   if (fetch_count < UNDEFINED_VERTEX_ID)
-      ok = vcache->middle->run_linear_elts( vcache->middle,
-                                            min_index + elt_bias, /* start */
-                                            fetch_count,
-                                            transformed_elts,
-                                            draw_count );
-   
-   FREE(storage);
-
-   if (ok)
-      return;
-
-   debug_printf("failed to execute atomic draw elts for %d/%d, splitting up\n",
-                fetch_count, draw_count);
-
- fail:
-   vcache_run( frontend, get_elt, elts, elt_bias, draw_count );
-}
-
-
-
-
-static void 
-vcache_prepare( struct draw_pt_front_end *frontend,
-                unsigned prim,
-                struct draw_pt_middle_end *middle,
-                unsigned opt )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
-
-   if (opt & PT_PIPELINE)
-   {
-      vcache->base.run = vcache_run_extras;
-   }
-   else 
-   {
-      vcache->base.run = vcache_check_run;
-   }
-
-   vcache->input_prim = prim;
-   vcache->output_prim = u_reduced_prim(prim);
-
-   vcache->middle = middle;
-   vcache->opt = opt;
-
-   /* Have to run prepare here, but try and guess a good prim for
-    * doing so:
-    */
-   vcache->middle_prim = (opt & PT_PIPELINE) ? vcache->output_prim : vcache->input_prim;
-   middle->prepare( middle, vcache->middle_prim, opt, &vcache->fetch_max );
-}
-
-
-
-
-static void 
-vcache_finish( struct draw_pt_front_end *frontend )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
-   vcache->middle->finish( vcache->middle );
-   vcache->middle = NULL;
-}
-
-static void 
-vcache_destroy( struct draw_pt_front_end *frontend )
-{
-   FREE(frontend);
-}
-
-
-struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw )
-{
-   struct vcache_frontend *vcache = CALLOC_STRUCT( vcache_frontend );
-   if (vcache == NULL)
-      return NULL;
- 
-   vcache->base.prepare = vcache_prepare;
-   vcache->base.run     = NULL;
-   vcache->base.finish  = vcache_finish;
-   vcache->base.destroy = vcache_destroy;
-   vcache->draw = draw;
-   
-   memset(vcache->in, ~0, sizeof(vcache->in));
-  
-   return &vcache->base;
-}
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h
deleted file mode 100644
index f7a63de3ba9..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h
+++ /dev/null
@@ -1,197 +0,0 @@
-
-
-static void FUNC( struct draw_pt_front_end *frontend, 
-                  pt_elt_func get_elt,
-                  const void *elts,
-                  int elt_bias,
-                  unsigned count )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
-   struct draw_context *draw = vcache->draw;
-
-   boolean flatfirst = (draw->rasterizer->flatshade && 
-                        draw->rasterizer->flatshade_first);
-   unsigned i;
-   ushort flags;
-
-   if (0) debug_printf("%s %d\n", __FUNCTION__, count);
-
-
-   switch (vcache->input_prim) {
-   case PIPE_PRIM_POINTS:
-      for (i = 0; i < count; i ++) {
-	 POINT( vcache,
-                get_elt(elts, i + 0) + elt_bias );
-      }
-      break;
-
-   case PIPE_PRIM_LINES:
-      for (i = 0; i+1 < count; i += 2) {
-         LINE( vcache, 
-               DRAW_PIPE_RESET_STIPPLE,
-               get_elt(elts, i + 0) + elt_bias,
-               get_elt(elts, i + 1) + elt_bias);
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:  
-      if (count >= 2) {
-         flags = DRAW_PIPE_RESET_STIPPLE;
-
-         for (i = 1; i < count; i++, flags = 0) {
-            LINE( vcache, 
-                  flags,
-                  get_elt(elts, i - 1) + elt_bias,
-                  get_elt(elts, i ) + elt_bias);
-         }
-
-	 LINE( vcache, 
-               flags,
-               get_elt(elts, i - 1) + elt_bias,
-               get_elt(elts, 0 ) + elt_bias);
-      }
-      break;
-
-   case PIPE_PRIM_LINE_STRIP:
-      flags = DRAW_PIPE_RESET_STIPPLE;
-      for (i = 1; i < count; i++, flags = 0) {
-         LINE( vcache, 
-               flags,
-               get_elt(elts, i - 1) + elt_bias,
-               get_elt(elts, i ) + elt_bias);
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLES:
-      for (i = 0; i+2 < count; i += 3) {
-         TRIANGLE( vcache,
-                   DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, 
-                   get_elt(elts, i + 0) + elt_bias,
-                   get_elt(elts, i + 1) + elt_bias,
-                   get_elt(elts, i + 2 ) + elt_bias);
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      if (flatfirst) {
-         for (i = 0; i+2 < count; i++) {
-            TRIANGLE( vcache,
-                      DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, 
-                      get_elt(elts, i + 0) + elt_bias,
-                      get_elt(elts, i + 1 + (i&1)) + elt_bias,
-                      get_elt(elts, i + 2 - (i&1)) + elt_bias);
-         }
-      }
-      else {
-         for (i = 0; i+2 < count; i++) {
-            TRIANGLE( vcache,
-                      DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, 
-                      get_elt(elts, i + 0 + (i&1)) + elt_bias,
-                      get_elt(elts, i + 1 - (i&1)) + elt_bias,
-                      get_elt(elts, i + 2 ) + elt_bias);
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_FAN:
-      if (count >= 3) {
-         if (flatfirst) {
-            for (i = 0; i+2 < count; i++) {
-               TRIANGLE( vcache,
-                         DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, 
-                         get_elt(elts, i + 1) + elt_bias,
-                         get_elt(elts, i + 2) + elt_bias,
-                         get_elt(elts, 0 ) + elt_bias);
-            }
-         }
-         else {
-            for (i = 0; i+2 < count; i++) {
-               TRIANGLE( vcache,
-                         DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, 
-                         get_elt(elts, 0) + elt_bias,
-                         get_elt(elts, i + 1) + elt_bias,
-                         get_elt(elts, i + 2 ) + elt_bias);
-            }
-         }
-      }
-      break;
-
-
-   case PIPE_PRIM_QUADS:
-      for (i = 0; i+3 < count; i += 4) {
-         QUAD( vcache,
-               get_elt(elts, i + 0) + elt_bias,
-               get_elt(elts, i + 1) + elt_bias,
-               get_elt(elts, i + 2) + elt_bias,
-               get_elt(elts, i + 3) + elt_bias );
-      }
-      break;
-
-   case PIPE_PRIM_QUAD_STRIP:
-      for (i = 0; i+3 < count; i += 2) {
-         QUAD( vcache,
-               get_elt(elts, i + 2) + elt_bias,
-               get_elt(elts, i + 0) + elt_bias,
-               get_elt(elts, i + 1) + elt_bias,
-               get_elt(elts, i + 3) + elt_bias );
-      }
-      break;
-
-   case PIPE_PRIM_POLYGON:
-      {
-         /* These bitflags look a little odd because we submit the
-          * vertices as (1,2,0) to satisfy flatshade requirements.  
-          */
-         const ushort edge_first  = DRAW_PIPE_EDGE_FLAG_2;
-         const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0;
-         const ushort edge_last   = DRAW_PIPE_EDGE_FLAG_1;
-         ushort edge_next, edge_finish;
-
-         if (flatfirst) {
-            flags = DRAW_PIPE_RESET_STIPPLE | edge_middle | edge_last;
-            edge_next = edge_last;
-            edge_finish = edge_first;
-         }
-         else {
-            flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle;
-            edge_next = edge_middle;
-            edge_finish = edge_last;
-         }
-
-	 for (i = 0; i+2 < count; i++, flags = edge_next) {
-
-            if (i + 3 == count)
-               flags |= edge_finish;
-
-            if (flatfirst) {
-               TRIANGLE( vcache,
-                         flags,
-                         get_elt(elts, 0) + elt_bias,
-                         get_elt(elts, i + 1) + elt_bias,
-                         get_elt(elts, i + 2) + elt_bias );
-            }
-            else {
-               TRIANGLE( vcache,
-                         flags,
-                         get_elt(elts, i + 1) + elt_bias,
-                         get_elt(elts, i + 2) + elt_bias,
-                         get_elt(elts, 0) + elt_bias);
-            }
-	 }
-      }
-      break;
-
-   default:
-      assert(0);
-      break;
-   }
-   
-   vcache_flush( vcache );
-}
-
-
-#undef TRIANGLE
-#undef QUAD
-#undef POINT
-#undef LINE
-#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
new file mode 100644
index 00000000000..a6875253094
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -0,0 +1,208 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_pt.h"
+
+#define SEGMENT_SIZE 1024
+#define MAP_SIZE     256
+
+struct vsplit_frontend {
+   struct draw_pt_front_end base;   /* first member: frontend pointers are cast to this struct */
+   struct draw_context *draw;
+
+   unsigned prim;   /* primitive type handed to prepare() */
+
+   struct draw_pt_middle_end *middle;
+
+   unsigned max_vertices;   /* filled in by middle->prepare() */
+   ushort segment_size;     /* MIN2(SEGMENT_SIZE, max_vertices) */
+
+   /* buffers for splitting */
+   unsigned fetch_elts[SEGMENT_SIZE];
+   ushort draw_elts[SEGMENT_SIZE];
+   ushort identity_draw_elts[SEGMENT_SIZE];   /* identity_draw_elts[i] == i */
+
+   struct {
+      /* map a fetch element to a draw element */
+      unsigned fetches[MAP_SIZE];   /* indexed by (fetch % MAP_SIZE) */
+      ushort draws[MAP_SIZE];       /* position of fetches[slot] in fetch_elts */
+      boolean has_max_fetch;        /* TRUE once 0xffffffff has been added */
+
+      ushort num_fetch_elts;
+      ushort num_draw_elts;
+   } cache;
+};
+
+/* Empty the fetch-to-draw map and reset both element counters. */
+static void
+vsplit_clear_cache(struct vsplit_frontend *vsplit)
+{
+   vsplit->cache.num_draw_elts = 0;
+   vsplit->cache.num_fetch_elts = 0;
+   vsplit->cache.has_max_fetch = FALSE;
+   memset(vsplit->cache.fetches, 0xff, sizeof(vsplit->cache.fetches));   /* all-ones slots */
+}
+
+static void
+vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags)
+{
+   struct draw_pt_middle_end *middle = vsplit->middle;   /* run the accumulated lists */
+   middle->run(middle, vsplit->fetch_elts, vsplit->cache.num_fetch_elts,
+               vsplit->draw_elts, vsplit->cache.num_draw_elts, flags);
+}
+
+/**
+ * Append one draw element for fetch; fetch it first unless its map slot hits.
+ */
+static INLINE void
+vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch)
+{
+   const unsigned slot = fetch % MAP_SIZE;   /* one map entry per slot */
+
+   if (fetch != vsplit->cache.fetches[slot]) {
+      /* miss: append to the fetch list and remember its position */
+      const ushort fetch_pos = vsplit->cache.num_fetch_elts;
+      assert(fetch_pos < vsplit->segment_size);
+      vsplit->fetch_elts[fetch_pos] = fetch;
+      vsplit->cache.num_fetch_elts = fetch_pos + 1;
+      vsplit->cache.fetches[slot] = fetch;
+      vsplit->cache.draws[slot] = fetch_pos;
+   }
+
+   vsplit->draw_elts[vsplit->cache.num_draw_elts++] = vsplit->cache.draws[slot];
+}
+
+
+/**
+ * Full-range (uint) variant of vsplit_add_cache().  Cleared map slots hold
+ * all-ones, so the first real 0xffffffff must not be mistaken for a hit.
+ */
+static INLINE void
+vsplit_add_cache_uint(struct vsplit_frontend *vsplit, unsigned fetch)
+{
+   if (!vsplit->cache.has_max_fetch && fetch == 0xffffffff) {
+      /* first occurrence since the map was cleared: store a different value
+       * in the slot so that the lookup below misses */
+      vsplit->cache.has_max_fetch = TRUE;
+      vsplit->cache.fetches[fetch % MAP_SIZE] = fetch - 1;
+   }
+
+   vsplit_add_cache(vsplit, fetch);
+}
+
+/* One run function per index type; the templates #undef these macros again. */
+#define FUNC vsplit_run_linear
+#include "draw_pt_vsplit_tmp.h"   /* ELT_TYPE undefined: takes the template's #else branch */
+
+#define FUNC vsplit_run_ubyte
+#define ELT_TYPE ubyte
+#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch)
+#include "draw_pt_vsplit_tmp.h"
+
+#define FUNC vsplit_run_ushort
+#define ELT_TYPE ushort
+#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch)
+#include "draw_pt_vsplit_tmp.h"
+
+#define FUNC vsplit_run_uint
+#define ELT_TYPE uint
+#define ADD_CACHE(vsplit, fetch) vsplit_add_cache_uint(vsplit, fetch)
+#include "draw_pt_vsplit_tmp.h"
+
+
+static void vsplit_prepare(struct draw_pt_front_end *frontend,
+                           unsigned in_prim,
+                           struct draw_pt_middle_end *middle,
+                           unsigned opt)
+{
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;
+   const unsigned index_size = vsplit->draw->pt.user.eltSize;
+
+   /* select the run function instantiated for this pt.user.eltSize */
+   if (index_size == 0) {
+      vsplit->base.run = vsplit_run_linear;
+   }
+   else if (index_size == 1) {
+      vsplit->base.run = vsplit_run_ubyte;
+   }
+   else if (index_size == 2) {
+      vsplit->base.run = vsplit_run_ushort;
+   }
+   else if (index_size == 4) {
+      vsplit->base.run = vsplit_run_uint;
+   }
+   else {
+      assert(0);   /* unexpected size: base.run keeps its previous value */
+   }
+
+   /* split only */
+   vsplit->middle = middle;
+   vsplit->prim = in_prim;
+   middle->prepare(middle, in_prim, opt, &vsplit->max_vertices);
+
+   /* a segment may not exceed the split buffers nor the middle end's limit */
+   vsplit->segment_size = MIN2(SEGMENT_SIZE, vsplit->max_vertices);
+}
+
+static void vsplit_finish(struct draw_pt_front_end *frontend)
+{
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;
+   struct draw_pt_middle_end *middle = vsplit->middle;
+   middle->finish(middle);   /* finish the middle end bound by prepare() ... */
+   vsplit->middle = NULL;    /* ... and drop the reference to it */
+}
+
+/* Release the frontend allocated by draw_pt_vsplit(). */
+static void vsplit_destroy(struct draw_pt_front_end *frontend)
+{
+   FREE(frontend);
+}
+
+/* Allocate a vsplit frontend for draw; returns NULL if allocation fails. */
+struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw)
+{
+   struct vsplit_frontend *vsplit = CALLOC_STRUCT(vsplit_frontend);
+   unsigned n;
+
+   if (vsplit == NULL)
+      return NULL;
+
+   vsplit->draw = draw;
+   vsplit->base.prepare = vsplit_prepare;
+   vsplit->base.finish  = vsplit_finish;
+   vsplit->base.destroy = vsplit_destroy;
+   vsplit->base.run     = NULL;   /* assigned by vsplit_prepare() */
+
+   for (n = 0; n < SEGMENT_SIZE; n++)
+      vsplit->identity_draw_elts[n] = (ushort) n;
+
+   return &vsplit->base;
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
new file mode 100644
index 00000000000..3f66f962e11
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
@@ -0,0 +1,309 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#define CONCAT2(name, elt_type) name ## elt_type
+#define CONCAT(name, elt_type) CONCAT2(name, elt_type)
+
+#ifdef ELT_TYPE
+
+/**
+ * Try to draw the whole primitive at once: fetch [min_index, max_index] (plus
+ * bias) linearly and draw with the rebased ib[istart .. istart + icount - 1].
+ * Returns FALSE if the path is rejected, else middle->run_linear_elts' result. */
+static boolean
+CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                    unsigned istart, unsigned icount)
+{
+   struct draw_context *draw = vsplit->draw;
+   const ELT_TYPE *ib = (const ELT_TYPE *)
+      ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset);
+   const unsigned min_index = draw->pt.user.min_index;
+   const unsigned max_index = draw->pt.user.max_index;
+   const int elt_bias = draw->pt.user.eltBias;
+   unsigned fetch_start, fetch_count;
+   const ushort *draw_elts = NULL;
+   unsigned i;
+
+   /* use the ib directly: no rebasing needed and elements are ushort-sized */
+   if (min_index == 0 && sizeof(ib[0]) == sizeof(draw_elts[0])) {
+      if (icount > vsplit->max_vertices)
+         return FALSE;
+
+      for (i = 0; i < icount; i++) {
+         ELT_TYPE idx = ib[istart + i];
+         assert(idx >= min_index && idx <= max_index);
+      }
+      draw_elts = (const ushort *) (ib + istart);   /* begin at istart, like the copy paths */
+   }
+   else {
+      /* have to go through vsplit->draw_elts, which holds one segment at most */
+      if (icount > vsplit->segment_size)
+         return FALSE;
+   }
+
+   /* only worthwhile when the fetched range is no larger than the index count */
+   if (max_index - min_index > icount - 1)
+      return FALSE;
+
+   if (elt_bias < 0 && min_index < -elt_bias)   /* biased start would be negative */
+      return FALSE;
+
+   /* why this check?  NOTE(review): instanced attributes are rejected here */
+   for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
+      if (draw->pt.vertex_element[i].instance_divisor)
+         return FALSE;
+   }
+
+   fetch_start = min_index + elt_bias;
+   fetch_count = max_index - min_index + 1;
+
+   if (!draw_elts) {
+      if (min_index == 0) {
+         for (i = 0; i < icount; i++) {
+            ELT_TYPE idx = ib[istart + i];
+
+            assert(idx >= min_index && idx <= max_index);
+            vsplit->draw_elts[i] = (ushort) idx;   /* narrow/widen to ushort */
+         }
+      }
+      else {
+         for (i = 0; i < icount; i++) {
+            ELT_TYPE idx = ib[istart + i];
+
+            assert(idx >= min_index && idx <= max_index);
+            vsplit->draw_elts[i] = (ushort) (idx - min_index);   /* rebase to fetch_start */
+         }
+      }
+
+      draw_elts = vsplit->draw_elts;
+   }
+
+   return vsplit->middle->run_linear_elts(vsplit->middle,
+                                          fetch_start, fetch_count,
+                                          draw_elts, icount, 0x0);
+}
+
+/**
+ * Build the fetch and draw element lists for ib[istart .. istart + icount - 1]
+ * through the cache, then flush them to the middle end.
+ * When spoken is TRUE, ib[ispoken] is used in place of ib[istart]; when close
+ * is TRUE, ib[iclose] is appended.  eltBias is added to every index.
+ */
+static INLINE void
+CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                        unsigned flags,
+                                        unsigned istart, unsigned icount,
+                                        boolean spoken, unsigned ispoken,
+                                        boolean close, unsigned iclose)
+{
+   struct draw_context *draw = vsplit->draw;
+   const ELT_TYPE *ib = (const ELT_TYPE *)
+      ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset);
+   const int ibias = draw->pt.user.eltBias;
+   unsigned i;
+
+   assert(icount + !!close <= vsplit->segment_size);
+
+   vsplit_clear_cache(vsplit);
+
+   spoken = !!spoken;   /* normalized to 0/1: also used as the loop start below */
+   if (ibias == 0) {
+      if (spoken)
+         ADD_CACHE(vsplit, ib[ispoken]);
+
+      for (i = spoken; i < icount; i++)
+         ADD_CACHE(vsplit, ib[istart + i]);
+
+      if (close)
+         ADD_CACHE(vsplit, ib[iclose]);
+   }
+   else if (ibias > 0) {
+      if (spoken)
+         ADD_CACHE(vsplit, (uint) ib[ispoken] + ibias);
+
+      for (i = spoken; i < icount; i++)
+         ADD_CACHE(vsplit, (uint) ib[istart + i] + ibias);
+
+      if (close)
+         ADD_CACHE(vsplit, (uint) ib[iclose] + ibias);
+   }
+   else {
+      if (spoken) {
+         if (ib[ispoken] < -ibias)
+            return;   /* biased index would be negative: return without flushing */
+         ADD_CACHE(vsplit, ib[ispoken] + ibias);
+      }
+
+      for (i = spoken; i < icount; i++) {
+         if (ib[istart + i] < -ibias)
+            return;   /* same: nothing of this segment is flushed */
+         ADD_CACHE(vsplit, ib[istart + i] + ibias);
+      }
+
+      if (close) {
+         if (ib[iclose] < -ibias)
+            return;   /* same: nothing of this segment is flushed */
+         ADD_CACHE(vsplit, ib[iclose] + ibias);
+      }
+   }
+
+   vsplit_flush_cache(vsplit, flags);
+}
+
+static void
+CONCAT(vsplit_segment_simple_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                         unsigned flags,
+                                         unsigned istart, unsigned icount)
+{
+   /* plain segment: no first-element substitution and no closing element */
+   CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, flags, istart, icount,
+                                           FALSE, 0, FALSE, 0);
+}
+
+static void
+CONCAT(vsplit_segment_loop_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                       unsigned flags,
+                                       unsigned istart, unsigned icount,
+                                       unsigned i0)
+{
+   /* ib[i0] is appended only when flags is exactly DRAW_SPLIT_BEFORE, i.e.
+    * for the final segment of a primitive that has been split */
+   const boolean append_i0 = (flags == DRAW_SPLIT_BEFORE);
+   CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, flags, istart, icount,
+                                           FALSE, 0, append_i0, i0);
+}
+
+static void
+CONCAT(vsplit_segment_fan_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                      unsigned flags,
+                                      unsigned istart, unsigned icount,
+                                      unsigned i0)
+{
+   /* once DRAW_SPLIT_BEFORE is set, the segment's first element is replaced
+    * by ib[i0] */
+   const boolean replace_first = ((flags & DRAW_SPLIT_BEFORE) != 0);
+   CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, flags, istart, icount,
+                                           replace_first, i0, FALSE, 0);
+}
+
+#define LOCAL_VARS   /* locals expected by draw_split_tmp.h (indexed case) */ \
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;   \
+   const unsigned prim = vsplit->prim;                                     \
+   const unsigned max_count_simple = vsplit->segment_size;                 \
+   const unsigned max_count_loop = vsplit->segment_size - 1; /* room for the closing element */ \
+   const unsigned max_count_fan = vsplit->segment_size;
+
+#define PRIMITIVE(istart, icount)   /* whole-primitive attempt */ \
+   CONCAT(vsplit_primitive_, ELT_TYPE)(vsplit, istart, icount)
+
+#else /* ELT_TYPE */
+
+static void
+vsplit_segment_simple_linear(struct vsplit_frontend *vsplit, unsigned flags,
+                             unsigned istart, unsigned icount)
+{
+   assert(vsplit->max_vertices >= icount);   /* must fit the middle end's limit */
+   (vsplit->middle)->run_linear(vsplit->middle, istart, icount, flags);
+}
+
+static void
+vsplit_segment_loop_linear(struct vsplit_frontend *vsplit, unsigned flags,
+                           unsigned istart, unsigned icount, unsigned i0)
+{
+   const boolean close_loop = (flags == DRAW_SPLIT_BEFORE);
+   unsigned n;
+
+   assert(icount + !!close_loop <= vsplit->segment_size);
+
+   if (!close_loop) {
+      vsplit->middle->run_linear(vsplit->middle, istart, icount, flags);   /* contiguous range */
+      return;
+   }
+
+   /* closing segment: list the range explicitly so that i0 can be appended */
+   for (n = 0; n < icount; n++)
+      vsplit->fetch_elts[n] = istart + n;
+   vsplit->fetch_elts[icount] = i0;
+   vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, icount + 1,
+                       vsplit->identity_draw_elts, icount + 1, flags);
+}
+
+static void
+vsplit_segment_fan_linear(struct vsplit_frontend *vsplit, unsigned flags,
+                          unsigned istart, unsigned icount, unsigned i0)
+{
+   const boolean use_spoken = ((flags & DRAW_SPLIT_BEFORE) != 0);
+   unsigned n;
+
+   assert(icount + !!use_spoken <= vsplit->segment_size);
+
+   if (!use_spoken) {
+      vsplit->middle->run_linear(vsplit->middle, istart, icount, flags);   /* contiguous range */
+      return;
+   }
+
+   /* later segment: i0 takes the place of the segment's first vertex */
+   vsplit->fetch_elts[0] = i0;
+   for (n = 1; n < icount; n++)
+      vsplit->fetch_elts[n] = istart + n;
+   vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, n,
+                       vsplit->identity_draw_elts, n, flags);
+}
+
+#define LOCAL_VARS   /* locals expected by draw_split_tmp.h (non-indexed case) */ \
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;   \
+   const unsigned prim = vsplit->prim;                                     \
+   const unsigned max_count_simple = vsplit->max_vertices; /* run_linear needs no split buffer */ \
+   const unsigned max_count_loop = vsplit->segment_size - 1;               \
+   const unsigned max_count_fan = vsplit->segment_size;
+
+#define PRIMITIVE(istart, icount) FALSE   /* the whole-primitive attempt always "fails" here */
+
+#define ELT_TYPE linear   /* name suffix for the SEGMENT_* helpers below */
+
+#endif /* ELT_TYPE */
+
+#define FUNC_VARS   /* parameter list of the generated FUNC */ \
+   struct draw_pt_front_end *frontend, \
+   unsigned start,                     \
+   unsigned count
+
+#define SEGMENT_SIMPLE(flags, istart, icount)   \
+   CONCAT(vsplit_segment_simple_, ELT_TYPE)(vsplit, flags, istart, icount)
+
+#define SEGMENT_LOOP(flags, istart, icount, i0) \
+   CONCAT(vsplit_segment_loop_, ELT_TYPE)(vsplit, flags, istart, icount, i0)
+
+#define SEGMENT_FAN(flags, istart, icount, i0)  \
+   CONCAT(vsplit_segment_fan_, ELT_TYPE)(vsplit, flags, istart, icount, i0)
+
+#include "draw_split_tmp.h"   /* defines FUNC, then #undefs FUNC, FUNC_VARS, LOCAL_VARS, PRIMITIVE, SEGMENT_* */
+
+#undef CONCAT2
+#undef CONCAT
+
+#undef ELT_TYPE
+#undef ADD_CACHE
diff --git a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
new file mode 100644
index 00000000000..7fafde9d5e6
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
@@ -0,0 +1,31 @@
+#define FUNC_VARS                               \
+   struct pt_so_emit *so,                       \
+   const struct draw_prim_info *input_prims,    \
+   const struct draw_vertex_info *input_verts,  \
+   unsigned start,                              \
+   unsigned count
+
+#define FUNC_ENTER   /* locals, then rejection of adjacency primitives */ \
+   /* declare more local vars */                                  \
+   const unsigned prim = input_prims->prim;                       \
+   const unsigned prim_flags = input_prims->flags;                \
+   const boolean last_vertex_last = TRUE;                         \
+   do {                                                           \
+      debug_assert(input_prims->primitive_count == 1);            \
+      switch (prim) {                                             \
+      case PIPE_PRIM_LINES_ADJACENCY:                             \
+      case PIPE_PRIM_LINE_STRIP_ADJACENCY:                        \
+      case PIPE_PRIM_TRIANGLES_ADJACENCY:                         \
+      case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:                    \
+         debug_assert(!"unexpected primitive type in stream output"); \
+         return;                                                  \
+      default:                                                    \
+         break;                                                   \
+      }                                                           \
+   } while (0)
+#define POINT(i0)                so_point(so,i0)
+#define LINE(flags,i0,i1)        so_line(so,i0,i1)
+#define TRIANGLE(flags,i0,i1,i2) so_tri(so,i0,i1,i2)
+
+#include "draw_decompose_tmp.h"
diff --git a/src/gallium/auxiliary/draw/draw_split_tmp.h b/src/gallium/auxiliary/draw/draw_split_tmp.h
new file mode 100644
index 00000000000..47defc62b96
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_split_tmp.h
@@ -0,0 +1,176 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+static void
+FUNC(FUNC_VARS)
+{
+   unsigned first, incr;
+   LOCAL_VARS
+
+   /*
+    * prim, start, count and max_count_{simple,loop,fan} must have been defined
+    * by FUNC_VARS/LOCAL_VARS; PRIMITIVE and SEGMENT_* by the including file.
+    */
+   if (0) {
+      debug_printf("%s: prim 0x%x, start %d, count %d, max_count_simple %d, "
+                   "max_count_loop %d, max_count_fan %d\n",
+                   __FUNCTION__, prim, start, count, max_count_simple,
+                   max_count_loop, max_count_fan);
+   }
+
+   draw_pt_split_prim(prim, &first, &incr);   /* writes first and incr for this prim */
+   /* sanitize primitive length */
+   count = draw_pt_trim_count(count, first, incr);
+   if (count < first)
+      return;
+
+   /* try flushing the entire primitive */
+   if (PRIMITIVE(start, count))
+      return;
+
+   /* must be able to at least flush two complete primitives */
+   assert(max_count_simple >= first + incr &&
+          max_count_loop >= first + incr &&
+          max_count_fan >= first + incr);
+
+   /* no splitting required */
+   if (count <= max_count_simple) {
+      SEGMENT_SIMPLE(0x0, start, count);
+   }
+   else {
+      const unsigned rollback = first - incr;   /* vertices re-read by the next segment */
+      unsigned flags = DRAW_SPLIT_AFTER, seg_start = 0, seg_max;
+
+      /*
+       * Both count and seg_max below are explicitly trimmed.  Because
+       *
+       *   seg_start = N * (seg_max - rollback) = N' * incr,
+       *
+       * we have
+       *
+       *   remaining = count - seg_start = first + N'' * incr.
+       *
+       * That is, remaining is implicitly trimmed.
+       */
+      switch (prim) {
+      case PIPE_PRIM_POINTS:
+      case PIPE_PRIM_LINES:
+      case PIPE_PRIM_LINE_STRIP:
+      case PIPE_PRIM_TRIANGLES:
+      case PIPE_PRIM_TRIANGLE_STRIP:
+      case PIPE_PRIM_QUADS:
+      case PIPE_PRIM_QUAD_STRIP:
+      case PIPE_PRIM_LINES_ADJACENCY:
+      case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      case PIPE_PRIM_TRIANGLES_ADJACENCY:
+      case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+         seg_max =
+            draw_pt_trim_count(MIN2(max_count_simple, count), first, incr);
+         if (prim == PIPE_PRIM_TRIANGLE_STRIP ||
+             prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) {
+            /* make sure we flush even number of triangles at a time */
+            if (seg_max < count && !(((seg_max - first) / incr) & 1))
+               seg_max -= incr;
+         }
+
+         do {
+            const unsigned remaining = count - seg_start;
+
+            if (remaining > seg_max) {
+               SEGMENT_SIMPLE(flags, start + seg_start, seg_max);
+               seg_start += seg_max - rollback;
+
+               flags |= DRAW_SPLIT_BEFORE;   /* every later segment follows a split */
+            }
+            else {
+               flags &= ~DRAW_SPLIT_AFTER;   /* last segment: no split follows */
+
+               SEGMENT_SIMPLE(flags, start + seg_start, remaining);
+               seg_start += remaining;
+            }
+         } while (seg_start < count);
+         break;
+
+      case PIPE_PRIM_LINE_LOOP:
+         seg_max =
+            draw_pt_trim_count(MIN2(max_count_loop, count), first, incr);
+
+         do {
+            const unsigned remaining = count - seg_start;
+
+            if (remaining > seg_max) {
+               SEGMENT_LOOP(flags, start + seg_start, seg_max, start);
+               seg_start += seg_max - rollback;
+
+               flags |= DRAW_SPLIT_BEFORE;
+            }
+            else {
+               flags &= ~DRAW_SPLIT_AFTER;
+
+               SEGMENT_LOOP(flags, start + seg_start, remaining, start);   /* i0 = start closes the loop */
+               seg_start += remaining;
+            }
+         } while (seg_start < count);
+         break;
+
+      case PIPE_PRIM_TRIANGLE_FAN:
+      case PIPE_PRIM_POLYGON:
+         seg_max =
+            draw_pt_trim_count(MIN2(max_count_fan, count), first, incr);
+
+         do {
+            const unsigned remaining = count - seg_start;
+
+            if (remaining > seg_max) {
+               SEGMENT_FAN(flags, start + seg_start, seg_max, start);
+               seg_start += seg_max - rollback;
+
+               flags |= DRAW_SPLIT_BEFORE;
+            }
+            else {
+               flags &= ~DRAW_SPLIT_AFTER;
+
+               SEGMENT_FAN(flags, start + seg_start, remaining, start);   /* i0 = start: the shared first vertex */
+               seg_start += remaining;
+            }
+         } while (seg_start < count);
+         break;
+
+      default:
+         assert(0);
+         break;
+      }
+   }
+}
+
+#undef FUNC
+#undef FUNC_VARS
+#undef LOCAL_VARS
+
+#undef PRIMITIVE
+#undef SEGMENT_SIMPLE
+#undef SEGMENT_LOOP
+#undef SEGMENT_FAN
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h
index cccd3bf4358..e32803c0720 100644
--- a/src/gallium/auxiliary/draw/draw_vbuf.h
+++ b/src/gallium/auxiliary/draw/draw_vbuf.h
@@ -98,14 +98,14 @@ struct vbuf_render {
    boolean (*set_primitive)( struct vbuf_render *, unsigned prim );
 
    /**
-    * DrawElements, note indices are ushort.  The driver must complete
-    * this call, if necessary splitting the index list itself.
+    * Draw indexed primitives.  Note that indices are ushort.  The driver
+    * must complete this call, if necessary splitting the index list itself.
     */
-   void (*draw)( struct vbuf_render *,
-		 const ushort *indices,
-		 uint nr_indices );
+   void (*draw_elements)( struct vbuf_render *,
+                          const ushort *indices,
+                          uint nr_indices );
 
-   /* Draw Arrays path too.
+   /* Draw non-indexed primitives.
     */
    void (*draw_arrays)( struct vbuf_render *,
 			unsigned start,
@@ -117,6 +117,14 @@ struct vbuf_render {
    void (*release_vertices)( struct vbuf_render * );
 
    void (*destroy)( struct vbuf_render * );
+
+
+   /**
+    * Called after writing data to the stream out buffers
+    */
+   void (*set_stream_output_info)( struct vbuf_render *vbufr,
+                                   unsigned primitive_count,
+                                   unsigned vertices_count );
 };
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index 3af31ffe126..e63cf5f4f98 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -166,7 +166,7 @@ static INLINE enum pipe_format draw_translate_vinfo_format(enum attrib_emit emit
    }
 }
 
-static INLINE enum attrib_emit draw_translate_vinfo_size(enum attrib_emit emit)
+static INLINE unsigned draw_translate_vinfo_size(enum attrib_emit emit)
 {
    switch (emit) {
    case EMIT_OMIT:
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index c2832eefa2a..fb665b08fff 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -48,18 +48,30 @@
 
 DEBUG_GET_ONCE_BOOL_OPTION(gallium_dump_vs, "GALLIUM_DUMP_VS", FALSE)
 
+
+/**
+ * Set a vertex shader constant buffer.
+ * \param slot  which constant buffer in [0, PIPE_MAX_CONSTANT_BUFFERS-1]
+ * \param constants  the mapped buffer
+ * \param size  size of buffer in bytes
+ */
 void
 draw_vs_set_constants(struct draw_context *draw,
                       unsigned slot,
                       const void *constants,
                       unsigned size)
 {
-   if (((uintptr_t)constants) & 0xf) {
+   const int alignment = 16;
+
+   /* check if buffer is 16-byte aligned */
+   if (((uintptr_t)constants) & (alignment - 1)) {
+      /* if not, copy the constants into a new, 16-byte aligned buffer */
       if (size > draw->vs.const_storage_size[slot]) {
          if (draw->vs.aligned_constant_storage[slot]) {
             align_free((void *)draw->vs.aligned_constant_storage[slot]);
          }
-         draw->vs.aligned_constant_storage[slot] = align_malloc(size, 16);
+         draw->vs.aligned_constant_storage[slot] =
+            align_malloc(size, alignment);
       }
       assert(constants);
       memcpy((void *)draw->vs.aligned_constant_storage[slot],
@@ -85,18 +97,27 @@ struct draw_vertex_shader *
 draw_create_vertex_shader(struct draw_context *draw,
                           const struct pipe_shader_state *shader)
 {
-   struct draw_vertex_shader *vs;
+   struct draw_vertex_shader *vs = NULL;
 
    if (draw->dump_vs) {
       tgsi_dump(shader->tokens, 0);
    }
 
-   vs = draw_create_vs_sse( draw, shader );
-   if (!vs) {
+   if (!draw->pt.middle.llvm) {
+#if defined(PIPE_ARCH_X86)
+      vs = draw_create_vs_sse( draw, shader );
+#elif defined(PIPE_ARCH_PPC)
       vs = draw_create_vs_ppc( draw, shader );
-      if (!vs) {
-         vs = draw_create_vs_exec( draw, shader );
-      }
+#endif
+   }
+#if HAVE_LLVM
+   else {
+      vs = draw_create_vs_llvm(draw, shader);
+   }
+#endif
+
+   if (!vs) {
+      vs = draw_create_vs_exec( draw, shader );
    }
 
    if (vs)
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 6c7e94db433..f9a038788fb 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -133,7 +133,8 @@ struct draw_vertex_shader {
    void (*run_linear)( struct draw_vertex_shader *shader,
 		       const float (*input)[4],
 		       float (*output)[4],
-                      const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+                       const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+                       const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS],
 		       unsigned count,
 		       unsigned input_stride,
 		       unsigned output_stride );
@@ -165,7 +166,6 @@ draw_create_vs_ppc(struct draw_context *draw,
 		   const struct pipe_shader_state *templ);
 
 
-
 struct draw_vs_varient_key;
 struct draw_vertex_shader;
 
@@ -173,6 +173,11 @@ struct draw_vs_varient *
 draw_vs_create_varient_aos_sse( struct draw_vertex_shader *vs,
                                 const struct draw_vs_varient_key *key );
 
+#if HAVE_LLVM
+struct draw_vertex_shader *
+draw_create_vs_llvm(struct draw_context *draw,
+		    const struct pipe_shader_state *state);
+#endif
 
 
 /********************************************************************************
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 1911242f825..68e8295b5e1 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -32,6 +32,8 @@
 #define DRAW_VS_AOS_H
 
 #include "pipe/p_config.h"
+#include "tgsi/tgsi_exec.h"
+#include "draw_vs.h"
 
 #ifdef PIPE_ARCH_X86
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index bc34d390dae..dab3eb1ca8e 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -85,7 +85,8 @@ static void
 vs_exec_run_linear( struct draw_vertex_shader *shader,
 		    const float (*input)[4],
 		    float (*output)[4],
-                   const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+                    const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+                    const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS],
 		    unsigned count,
 		    unsigned input_stride,
 		    unsigned output_stride )
@@ -95,9 +96,8 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
    unsigned int i, j;
    unsigned slot;
 
-   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
-      machine->Consts[i] = constants[i];
-   }
+   tgsi_exec_set_constant_buffers(machine, PIPE_MAX_CONSTANT_BUFFERS,
+                                  constants, const_size);
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
       unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
new file mode 100644
index 00000000000..fa9992db783
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -0,0 +1,127 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_screen.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+#include "draw_vs.h"
+#include "draw_llvm.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+
+static void
+vs_llvm_prepare(struct draw_vertex_shader *shader,
+                struct draw_context *draw)
+{
+   /*struct llvm_vertex_shader *evs = llvm_vertex_shader(shader);*/
+}
+
+static void
+vs_llvm_run_linear( struct draw_vertex_shader *shader,
+		    const float (*input)[4],
+		    float (*output)[4],
+                    const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+                    const unsigned constants_size[PIPE_MAX_CONSTANT_BUFFERS],
+		    unsigned count,
+		    unsigned input_stride,
+		    unsigned output_stride )
+{
+   /* we should never get here since the entire pipeline is
+    * generated in draw_pt_fetch_shade_pipeline_llvm.c */
+   debug_assert(0);
+}
+
+
+static void
+vs_llvm_delete( struct draw_vertex_shader *dvs )
+{
+   struct llvm_vertex_shader *shader = llvm_vertex_shader(dvs);
+   struct pipe_fence_handle *fence = NULL;
+   struct draw_llvm_variant_list_item *li;
+   struct pipe_context *pipe = dvs->draw->pipe;
+
+   /*
+    * XXX: This might not be necessary at all.
+    */
+   pipe->flush(pipe, 0, &fence);
+   if (fence) {
+      pipe->screen->fence_finish(pipe->screen, fence, 0);
+      pipe->screen->fence_reference(pipe->screen, &fence, NULL);
+   }
+
+
+   li = first_elem(&shader->variants);
+   while(!at_end(&shader->variants, li)) {
+      struct draw_llvm_variant_list_item *next = next_elem(li);
+      draw_llvm_destroy_variant(li->base);
+      li = next;
+   }
+
+   assert(shader->variants_cached == 0);
+   FREE((void*) dvs->state.tokens);
+   FREE( dvs );
+}
+
+
+struct draw_vertex_shader *
+draw_create_vs_llvm(struct draw_context *draw,
+		    const struct pipe_shader_state *state)
+{
+   struct llvm_vertex_shader *vs = CALLOC_STRUCT( llvm_vertex_shader );
+
+   if (vs == NULL)
+      return NULL;
+
+   /* we make a private copy of the tokens */
+   vs->base.state.tokens = tgsi_dup_tokens(state->tokens);
+   if (!vs->base.state.tokens) {
+      FREE(vs);
+      return NULL;
+   }
+
+   tgsi_scan_shader(state->tokens, &vs->base.info);
+
+   vs->variant_key_size = 
+      draw_llvm_variant_key_size(
+	 vs->base.info.file_max[TGSI_FILE_INPUT]+1,
+	 vs->base.info.file_max[TGSI_FILE_SAMPLER]+1);
+
+   vs->base.draw = draw;
+   vs->base.prepare = vs_llvm_prepare;
+   vs->base.run_linear = vs_llvm_run_linear;
+   vs->base.delete = vs_llvm_delete;
+   vs->base.create_varient = draw_vs_create_varient_generic;
+
+   make_empty_list(&vs->variants);
+
+   return &vs->base;
+}
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index d869eecec5e..5df84916c51 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -125,7 +125,7 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
        */
       shader->func(inputs_soa, outputs_soa, temps_soa,
 		   (float (*)[4]) shader->base.immediates,
-                   (const float (*)[4])constants[0],
+                   (float (*)[4])constants[0],
                    ppc_builtin_constants);
 
       /* convert (up to) four output verts from SoA back to AoS format */
@@ -190,7 +190,7 @@ draw_create_vs_ppc(struct draw_context *draw,
       vs->base.create_varient = draw_vs_varient_aos_ppc;
    else
 #endif
-      vs->base.create_varient = draw_vs_varient_generic;
+      vs->base.create_varient = draw_vs_create_varient_generic;
    vs->base.prepare = vs_ppc_prepare;
    vs->base.run_linear = vs_ppc_run_linear;
    vs->base.delete = vs_ppc_delete;
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 14c95082a9d..0b0c6077c6f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -84,6 +84,7 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
 		   const float (*input)[4],
 		   float (*output)[4],
                   const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+		  const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS],
 		   unsigned count,
 		   unsigned input_stride,
 		   unsigned output_stride )
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index 6eb26927f27..eacd1601877 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -149,7 +149,8 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_varient *varient,
    vsvg->base.vs->run_linear( vsvg->base.vs, 
                               temp_buffer,
                               temp_buffer,
-                             vsvg->base.vs->draw->pt.user.vs_constants,
+                              vsvg->base.vs->draw->pt.user.vs_constants,
+                              vsvg->base.vs->draw->pt.user.vs_constants_size,
                               count,
                               temp_vertex_stride, 
                               temp_vertex_stride);
@@ -214,7 +215,8 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_varient *varient,
    vsvg->base.vs->run_linear( vsvg->base.vs, 
                               temp_buffer,
                               temp_buffer,
-                             vsvg->base.vs->draw->pt.user.vs_constants,
+                              vsvg->base.vs->draw->pt.user.vs_constants,
+                              vsvg->base.vs->draw->pt.user.vs_constants_size,
                               count,
                               temp_vertex_stride, 
                               temp_vertex_stride);
diff --git a/src/gallium/auxiliary/gallivm/f.cpp b/src/gallium/auxiliary/gallivm/f.cpp
new file mode 100644
index 00000000000..5eb09c01ab3
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/f.cpp
@@ -0,0 +1,85 @@
+/**************************************************************************
+ *
+ * (C) Copyright VMware, Inc 2010.
+ * (C) Copyright John Maddock 2006.
+ * Use, modification and distribution are subject to the
+ * Boost Software License, Version 1.0. (See accompanying file
+ * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+ *
+ **************************************************************************/
+
+
+/*
+ * This file is used to compute the minimax polynomial coefficients we use
+ * for fast exp2/log2.
+ *
+ * How to use this source:
+ *
+ * - Download and build the NTL library from
+ *   http://shoup.net/ntl/download.html
+ *
+ * - Download the Boost source code matching your distro.
+ *
+ * - Go to libs/math/minimax and replace f.cpp with this file.
+ *
+ * - Build as
+ *
+ *   g++ -o minimax -I /path/to/ntl/include main.cpp f.cpp /path/to/ntl/src/ntl.a -lboost_math_tr1
+ *
+ * - Run as 
+ *
+ *    ./minimax
+ *
+ * - For example, to compute exp2 5th order polynomial between [0, 1] do:
+ *
+ *    variant 1
+ *    range 0 1
+ *    order 5 0
+ *    steps 200
+ *    info
+ *
+ * - For more info see
+ * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+ */
+
+#define L22
+#include <boost/math/bindings/rr.hpp>
+#include <boost/math/tools/polynomial.hpp>
+
+#include <cmath>
+
+
+boost::math::ntl::RR f(const boost::math::ntl::RR& x, int variant)
+{
+   static const boost::math::ntl::RR tiny = boost::math::tools::min_value<boost::math::ntl::RR>() * 64;
+   
+   switch(variant)
+   {
+   case 0:
+      // log2(x)/(x - 1)
+      return log(x)/log(2.0)/(x - 1.0);
+
+   case 1:
+      // exp2(x)
+      return exp(x*log(2.0));
+   }
+
+   return 0;
+}
+
+
+void show_extra(
+   const boost::math::tools::polynomial<boost::math::ntl::RR>& n, 
+   const boost::math::tools::polynomial<boost::math::ntl::RR>& d, 
+   const boost::math::ntl::RR& x_offset, 
+   const boost::math::ntl::RR& y_offset, 
+   int variant)
+{
+   switch(variant)
+   {
+   default:
+      // do nothing here...
+      ;
+   }
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld.h b/src/gallium/auxiliary/gallivm/lp_bld.h
index 2fa682f4009..8103bc917fc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld.h
@@ -35,6 +35,17 @@
 #define LP_BLD_H
 
 
+/**
+ * @file
+ * LLVM IR building helpers interfaces.
+ *
+ * We use LLVM-C bindings for now. They are not documented, but follow the C++
+ * interfaces very closely, and appear to be complete enough for code
+ * generation. See
+ * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
+ * for a standalone example.
+ */
+
 #include <llvm-c/Core.h>  
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 20ae958714b..e65c13e64b5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -60,6 +60,11 @@
 #include "lp_bld_arit.h"
 
 
+#define EXP_POLY_DEGREE 3
+
+#define LOG_POLY_DEGREE 5
+
+
 /**
  * Generate min(a, b)
  * No checks for special case values of a or b = 1 or 0 are done.
@@ -73,6 +78,9 @@ lp_build_min_simple(struct lp_build_context *bld,
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    /* TODO: optimize the constant case */
 
    if(type.width * type.length == 128) {
@@ -119,6 +127,9 @@ lp_build_max_simple(struct lp_build_context *bld,
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    /* TODO: optimize the constant case */
 
    if(type.width * type.length == 128) {
@@ -161,6 +172,8 @@ lp_build_comp(struct lp_build_context *bld,
 {
    const struct lp_type type = bld->type;
 
+   assert(lp_check_value(type, a));
+
    if(a == bld->one)
       return bld->zero;
    if(a == bld->zero)
@@ -174,9 +187,15 @@ lp_build_comp(struct lp_build_context *bld,
    }
 
    if(LLVMIsConstant(a))
-      return LLVMConstSub(bld->one, a);
+      if (type.floating)
+          return LLVMConstFSub(bld->one, a);
+      else
+          return LLVMConstSub(bld->one, a);
    else
-      return LLVMBuildSub(bld->builder, bld->one, a, "");
+      if (type.floating)
+         return LLVMBuildFSub(bld->builder, bld->one, a, "");
+      else
+         return LLVMBuildSub(bld->builder, bld->one, a, "");
 }
 
 
@@ -191,6 +210,9 @@ lp_build_add(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == bld->zero)
       return b;
    if(b == bld->zero)
@@ -218,9 +240,15 @@ lp_build_add(struct lp_build_context *bld,
    }
 
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      res = LLVMConstAdd(a, b);
+      if (type.floating)
+         res = LLVMConstFAdd(a, b);
+      else
+         res = LLVMConstAdd(a, b);
    else
-      res = LLVMBuildAdd(bld->builder, a, b, "");
+      if (type.floating)
+         res = LLVMBuildFAdd(bld->builder, a, b, "");
+      else
+         res = LLVMBuildAdd(bld->builder, a, b, "");
 
    /* clamp to ceiling of 1.0 */
    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
@@ -232,20 +260,20 @@ lp_build_add(struct lp_build_context *bld,
 }
 
 
-/** Return the sum of the elements of a */
+/** Return the scalar sum of the elements of a */
 LLVMValueRef
 lp_build_sum_vector(struct lp_build_context *bld,
                     LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
    LLVMValueRef index, res;
-   int i;
+   unsigned i;
 
-   if (a == bld->zero)
-      return bld->zero;
-   if (a == bld->undef)
-      return bld->undef;
-   assert(type.length > 1);
+   assert(lp_check_value(type, a));
+
+   if (type.length == 1) {
+      return a;
+   }
 
    assert(!bld->type.norm);
 
@@ -254,9 +282,16 @@ lp_build_sum_vector(struct lp_build_context *bld,
 
    for (i = 1; i < type.length; i++) {
       index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      res = LLVMBuildAdd(bld->builder, res,
-                         LLVMBuildExtractElement(bld->builder, a, index, ""),
-                         "");
+      if (type.floating)
+         res = LLVMBuildFAdd(bld->builder, res,
+                            LLVMBuildExtractElement(bld->builder,
+                                                    a, index, ""),
+                            "");
+      else
+         res = LLVMBuildAdd(bld->builder, res,
+                            LLVMBuildExtractElement(bld->builder,
+                                                    a, index, ""),
+                            "");
    }
 
    return res;
@@ -274,6 +309,9 @@ lp_build_sub(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(b == bld->zero)
       return a;
    if(a == bld->undef || b == bld->undef)
@@ -301,9 +339,15 @@ lp_build_sub(struct lp_build_context *bld,
    }
 
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      res = LLVMConstSub(a, b);
+      if (type.floating)
+         res = LLVMConstFSub(a, b);
+      else
+         res = LLVMConstSub(a, b);
    else
-      res = LLVMBuildSub(bld->builder, a, b, "");
+      if (type.floating)
+         res = LLVMBuildFSub(bld->builder, a, b, "");
+      else
+         res = LLVMBuildSub(bld->builder, a, b, "");
 
    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
       res = lp_build_max_simple(bld, res, bld->zero);
@@ -361,6 +405,10 @@ lp_build_mul_u8n(LLVMBuilderRef builder,
    LLVMValueRef c8;
    LLVMValueRef ab;
 
+   assert(!i16_type.floating);
+   assert(lp_check_value(i16_type, a));
+   assert(lp_check_value(i16_type, b));
+
    c8 = lp_build_const_int_vec(i16_type, 8);
    
 #if 0
@@ -396,6 +444,9 @@ lp_build_mul(struct lp_build_context *bld,
    LLVMValueRef shift;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == bld->zero)
       return bld->zero;
    if(a == bld->one)
@@ -434,7 +485,10 @@ lp_build_mul(struct lp_build_context *bld,
       shift = NULL;
 
    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
-      res =  LLVMConstMul(a, b);
+      if (type.floating)
+         res = LLVMConstFMul(a, b);
+      else
+         res = LLVMConstMul(a, b);
       if(shift) {
          if(type.sign)
             res = LLVMConstAShr(res, shift);
@@ -443,7 +497,10 @@ lp_build_mul(struct lp_build_context *bld,
       }
    }
    else {
-      res = LLVMBuildMul(bld->builder, a, b, "");
+      if (type.floating)
+         res = LLVMBuildFMul(bld->builder, a, b, "");
+      else
+         res = LLVMBuildMul(bld->builder, a, b, "");
       if(shift) {
          if(type.sign)
             res = LLVMBuildAShr(bld->builder, res, shift, "");
@@ -466,6 +523,8 @@ lp_build_mul_imm(struct lp_build_context *bld,
 {
    LLVMValueRef factor;
 
+   assert(lp_check_value(bld->type, a));
+
    if(b == 0)
       return bld->zero;
 
@@ -473,12 +532,12 @@ lp_build_mul_imm(struct lp_build_context *bld,
       return a;
 
    if(b == -1)
-      return LLVMBuildNeg(bld->builder, a, "");
+      return lp_build_negate(bld, a);
 
    if(b == 2 && bld->type.floating)
       return lp_build_add(bld, a, a);
 
-   if(util_is_pot(b)) {
+   if(util_is_power_of_two(b)) {
       unsigned shift = ffs(b) - 1;
 
       if(bld->type.floating) {
@@ -519,6 +578,9 @@ lp_build_div(struct lp_build_context *bld,
 {
    const struct lp_type type = bld->type;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == bld->zero)
       return bld->zero;
    if(a == bld->one)
@@ -530,44 +592,125 @@ lp_build_div(struct lp_build_context *bld,
    if(a == bld->undef || b == bld->undef)
       return bld->undef;
 
-   if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      return LLVMConstFDiv(a, b);
+   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
+      if (type.floating)
+         return LLVMConstFDiv(a, b);
+      else if (type.sign)
+         return LLVMConstSDiv(a, b);
+      else
+         return LLVMConstUDiv(a, b);
+   }
 
    if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
 
-   return LLVMBuildFDiv(bld->builder, a, b, "");
+   if (type.floating)
+      return LLVMBuildFDiv(bld->builder, a, b, "");
+   else if (type.sign)
+      return LLVMBuildSDiv(bld->builder, a, b, "");
+   else
+      return LLVMBuildUDiv(bld->builder, a, b, "");
 }
 
 
 /**
- * Linear interpolation.
- *
- * This also works for integer values with a few caveats.
+ * Linear interpolation -- without any checks.
  *
  * @sa http://www.stereopsis.com/doubleblend.html
  */
-LLVMValueRef
-lp_build_lerp(struct lp_build_context *bld,
-              LLVMValueRef x,
-              LLVMValueRef v0,
-              LLVMValueRef v1)
+static INLINE LLVMValueRef
+lp_build_lerp_simple(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef v0,
+                     LLVMValueRef v1)
 {
    LLVMValueRef delta;
    LLVMValueRef res;
 
+   assert(lp_check_value(bld->type, x));
+   assert(lp_check_value(bld->type, v0));
+   assert(lp_check_value(bld->type, v1));
+
    delta = lp_build_sub(bld, v1, v0);
 
    res = lp_build_mul(bld, x, delta);
 
    res = lp_build_add(bld, v0, res);
 
-   if(bld->type.fixed)
+   if (bld->type.fixed) {
       /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
        * but it will be wrong for other uses. Basically we need a more
        * powerful lp_type, capable of further distinguishing the values
        * interpretation from the value storage. */
       res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), "");
+   }
+
+   return res;
+}
+
+
+/**
+ * Linear interpolation.
+ */
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   assert(lp_check_value(type, x));
+   assert(lp_check_value(type, v0));
+   assert(lp_check_value(type, v1));
+
+   if (type.norm) {
+      struct lp_type wide_type;
+      struct lp_build_context wide_bld;
+      LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh;
+      LLVMValueRef shift;
+
+      assert(type.length >= 2);
+      assert(!type.sign);
+
+      /*
+       * Create a wider type, enough to hold the intermediate result of the
+       * multiplication.
+       */
+      memset(&wide_type, 0, sizeof wide_type);
+      wide_type.fixed  = TRUE;
+      wide_type.width  = type.width*2;
+      wide_type.length = type.length/2;
+
+      lp_build_context_init(&wide_bld, bld->builder, wide_type);
+
+      lp_build_unpack2(bld->builder, type, wide_type, x,  &xl,  &xh);
+      lp_build_unpack2(bld->builder, type, wide_type, v0, &v0l, &v0h);
+      lp_build_unpack2(bld->builder, type, wide_type, v1, &v1l, &v1h);
+
+      /*
+       * Scale x from [0, 255] to [0, 256]
+       */
+
+      shift = lp_build_const_int_vec(wide_type, type.width - 1);
+
+      xl = lp_build_add(&wide_bld, xl,
+                        LLVMBuildAShr(bld->builder, xl, shift, ""));
+      xh = lp_build_add(&wide_bld, xh,
+                        LLVMBuildAShr(bld->builder, xh, shift, ""));
+
+      /*
+       * Lerp both halves.
+       */
+
+      resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l);
+      resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h);
+
+      res = lp_build_pack2(bld->builder, wide_type, type, resl, resh);
+   } else {
+      res = lp_build_lerp_simple(bld, x, v0, v1);
+   }
 
    return res;
 }
@@ -597,6 +740,9 @@ lp_build_min(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
+   assert(lp_check_value(bld->type, a));
+   assert(lp_check_value(bld->type, b));
+
    if(a == bld->undef || b == bld->undef)
       return bld->undef;
 
@@ -625,6 +771,9 @@ lp_build_max(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
+   assert(lp_check_value(bld->type, a));
+   assert(lp_check_value(bld->type, b));
+
    if(a == bld->undef || b == bld->undef)
       return bld->undef;
 
@@ -654,6 +803,10 @@ lp_build_clamp(struct lp_build_context *bld,
                LLVMValueRef min,
                LLVMValueRef max)
 {
+   assert(lp_check_value(bld->type, a));
+   assert(lp_check_value(bld->type, min));
+   assert(lp_check_value(bld->type, max));
+
    a = lp_build_min(bld, a, max);
    a = lp_build_max(bld, a, min);
    return a;
@@ -670,31 +823,20 @@ lp_build_abs(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
 
+   assert(lp_check_value(type, a));
+
    if(!type.sign)
       return a;
 
    if(type.floating) {
       /* Mask out the sign bit */
-      if (type.length == 1) {
-         LLVMTypeRef int_type = LLVMIntType(type.width);
-         LLVMTypeRef float_type = LLVMFloatType();
-         unsigned long long absMask = ~(1ULL << (type.width - 1));
-         LLVMValueRef mask = LLVMConstInt(int_type, absMask, 0);
-         a = LLVMBuildBitCast(bld->builder, a, int_type, "");
-         a = LLVMBuildAnd(bld->builder, a, mask, "");
-         a = LLVMBuildBitCast(bld->builder, a, float_type, "");
-         return a;
-      }
-      else {
-         /* vector of floats */
-         LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-         unsigned long long absMask = ~(1ULL << (type.width - 1));
-         LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
-         a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-         a = LLVMBuildAnd(bld->builder, a, mask, "");
-         a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
-         return a;
-      }
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      unsigned long long absMask = ~(1ULL << (type.width - 1));
+      LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask));
+      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      a = LLVMBuildAnd(bld->builder, a, mask, "");
+      a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
+      return a;
    }
 
    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
@@ -716,7 +858,16 @@ LLVMValueRef
 lp_build_negate(struct lp_build_context *bld,
                 LLVMValueRef a)
 {
-   return LLVMBuildNeg(bld->builder, a, "");
+   assert(lp_check_value(bld->type, a));
+
+#if HAVE_LLVM >= 0x0207
+   if (bld->type.floating)
+      a = LLVMBuildFNeg(bld->builder, a, "");
+   else
+#endif
+      a = LLVMBuildNeg(bld->builder, a, "");
+
+   return a;
 }
 
 
@@ -729,6 +880,8 @@ lp_build_sgn(struct lp_build_context *bld,
    LLVMValueRef cond;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+
    /* Handle non-zero case */
    if(!type.sign) {
       /* if not zero then sign must be positive */
@@ -742,17 +895,9 @@ lp_build_sgn(struct lp_build_context *bld,
       LLVMValueRef one;
       unsigned long long maskBit = (unsigned long long)1 << (type.width - 1);
 
-      if (type.length == 1) {
-         int_type = lp_build_int_elem_type(type);
-         vec_type = lp_build_elem_type(type);
-         mask = LLVMConstInt(int_type, maskBit, 0);
-      }
-      else {
-         /* vector */
-         int_type = lp_build_int_vec_type(type);
-         vec_type = lp_build_vec_type(type);
-         mask = lp_build_const_int_vec(type, maskBit);
-      }
+      int_type = lp_build_int_vec_type(type);
+      vec_type = lp_build_vec_type(type);
+      mask = lp_build_const_int_vec(type, maskBit);
 
       /* Take the sign bit and add it to 1 constant */
       sign = LLVMBuildBitCast(bld->builder, a, int_type, "");
@@ -795,6 +940,7 @@ lp_build_set_sign(struct lp_build_context *bld,
    LLVMValueRef val, res;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
 
    /* val = reinterpret_cast<int>(a) */
    val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
@@ -819,21 +965,11 @@ lp_build_int_to_float(struct lp_build_context *bld,
                       LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
 
    assert(type.floating);
-   /*assert(lp_check_value(type, a));*/
 
-   if (type.length == 1) {
-      LLVMTypeRef float_type = LLVMFloatType();
-      return LLVMBuildSIToFP(bld->builder, a, float_type, "");
-   }
-   else {
-      LLVMTypeRef vec_type = lp_build_vec_type(type);
-      /*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/
-      LLVMValueRef res;
-      res = LLVMBuildSIToFP(bld->builder, a, vec_type, "");
-      return res;
-   }
+   return LLVMBuildSIToFP(bld->builder, a, vec_type, "");
 }
 
 
@@ -853,31 +989,75 @@ lp_build_round_sse41(struct lp_build_context *bld,
                      enum lp_build_round_sse41_mode mode)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMTypeRef i32t = LLVMInt32Type();
    const char *intrinsic;
+   LLVMValueRef res;
 
    assert(type.floating);
-   assert(type.width*type.length == 128);
+
    assert(lp_check_value(type, a));
    assert(util_cpu_caps.has_sse4_1);
 
-   switch(type.width) {
-   case 32:
-      intrinsic = "llvm.x86.sse41.round.ps";
-      break;
-   case 64:
-      intrinsic = "llvm.x86.sse41.round.pd";
-      break;
-   default:
-      assert(0);
-      return bld->undef;
+   if (type.length == 1) {
+      LLVMTypeRef vec_type;
+      LLVMValueRef undef;
+      LLVMValueRef args[3];
+      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
+
+      switch(type.width) {
+      case 32:
+         intrinsic = "llvm.x86.sse41.round.ss";
+         break;
+      case 64:
+         intrinsic = "llvm.x86.sse41.round.sd";
+         break;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+
+      vec_type = LLVMVectorType(bld->elem_type, 4);
+
+      undef = LLVMGetUndef(vec_type);
+
+      args[0] = undef;
+      args[1] = LLVMBuildInsertElement(bld->builder, undef, a, index0, "");
+      args[2] = LLVMConstInt(i32t, mode, 0);
+
+      res = lp_build_intrinsic(bld->builder, intrinsic,
+                               vec_type, args, Elements(args));
+
+      res = LLVMBuildExtractElement(bld->builder, res, index0, "");
+   }
+   else {
+      assert(type.width*type.length == 128);
+
+      switch(type.width) {
+      case 32:
+         intrinsic = "llvm.x86.sse41.round.ps";
+         break;
+      case 64:
+         intrinsic = "llvm.x86.sse41.round.pd";
+         break;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+
+      res = lp_build_intrinsic_binary(bld->builder, intrinsic,
+                                      bld->vec_type, a,
+                                      LLVMConstInt(i32t, mode, 0));
    }
 
-   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
-                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
+   return res;
 }
 
 
+/**
+ * Return the integer part of a float (vector) value.  The returned value is
+ * a float (vector).
+ * Ex: trunc(-1.5) = -1.0
+ */
 LLVMValueRef
 lp_build_trunc(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -887,8 +1067,10 @@ lp_build_trunc(struct lp_build_context *bld,
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if(util_cpu_caps.has_sse4_1)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+   }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
@@ -900,6 +1082,12 @@ lp_build_trunc(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return float (vector) rounded to nearest integer (vector).  The returned
+ * value is a float (vector).
+ * Ex: round(0.9) = 1.0
+ * Ex: round(-1.5) = -2.0
+ */
 LLVMValueRef
 lp_build_round(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -909,8 +1097,10 @@ lp_build_round(struct lp_build_context *bld,
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if(util_cpu_caps.has_sse4_1)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       LLVMValueRef res;
@@ -921,6 +1111,11 @@ lp_build_round(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return floor of float (vector), result is a float (vector)
+ * Ex: floor(1.1) = 1.0
+ * Ex: floor(-1.1) = -2.0
+ */
 LLVMValueRef
 lp_build_floor(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -928,16 +1123,12 @@ lp_build_floor(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
 
-   if (type.length == 1) {
-      LLVMValueRef res;
-      res = lp_build_ifloor(bld, a);
-      res = LLVMBuildSIToFP(bld->builder, res, LLVMFloatType(), "");
-      return res;
-   }
-
-   if(util_cpu_caps.has_sse4_1)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       LLVMValueRef res;
@@ -948,6 +1139,11 @@ lp_build_floor(struct lp_build_context *bld,
 }
 
 
+/**
+ * Return ceiling of float (vector), returning float (vector).
+ * Ex: ceil( 1.1) = 2.0
+ * Ex: ceil(-1.1) = -1.0
+ */
 LLVMValueRef
 lp_build_ceil(struct lp_build_context *bld,
               LLVMValueRef a)
@@ -957,8 +1153,10 @@ lp_build_ceil(struct lp_build_context *bld,
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if(util_cpu_caps.has_sse4_1)
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   }
    else {
       LLVMTypeRef vec_type = lp_build_vec_type(type);
       LLVMValueRef res;
@@ -970,7 +1168,7 @@ lp_build_ceil(struct lp_build_context *bld,
 
 
 /**
- * Return fractional part of 'a' computed as a - floor(f)
+ * Return fractional part of 'a' computed as a - floor(a)
  * Typically used in texture coord arithmetic.
  */
 LLVMValueRef
@@ -983,31 +1181,29 @@ lp_build_fract(struct lp_build_context *bld,
 
 
 /**
- * Convert to integer, through whichever rounding method that's fastest,
- * typically truncating toward zero.
+ * Return the integer part of a float (vector) value.  The returned value is
+ * an integer (vector).
+ * Ex: itrunc(-1.5) = -1
  */
 LLVMValueRef
 lp_build_itrunc(struct lp_build_context *bld,
                 LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
 
-   if (type.length == 1) {
-      LLVMTypeRef int_type = LLVMIntType(type.width);
-      return LLVMBuildFPToSI(bld->builder, a, int_type, "");
-   }
-   else {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-      assert(lp_check_value(type, a));
-      return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
-   }
+   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
 }
 
 
 /**
- * Convert float[] to int[] with round().
+ * Return float (vector) rounded to nearest integer (vector).  The returned
+ * value is an integer (vector).
+ * Ex: iround(0.9) = 1
+ * Ex: iround(-1.5) = -2
  */
 LLVMValueRef
 lp_build_iround(struct lp_build_context *bld,
@@ -1019,17 +1215,10 @@ lp_build_iround(struct lp_build_context *bld,
 
    assert(type.floating);
 
-   if (type.length == 1) {
-      /* scalar float to int */
-      LLVMTypeRef int_type = LLVMIntType(type.width);
-      /* XXX we want rounding here! */
-      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
-      return res;
-   }
-
    assert(lp_check_value(type, a));
 
-   if(util_cpu_caps.has_sse4_1) {
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
    }
    else {
@@ -1048,7 +1237,7 @@ lp_build_iround(struct lp_build_context *bld,
       half = LLVMBuildOr(bld->builder, sign, half, "");
       half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
 
-      res = LLVMBuildAdd(bld->builder, a, half, "");
+      res = LLVMBuildFAdd(bld->builder, a, half, "");
    }
 
    res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
@@ -1058,7 +1247,9 @@ lp_build_iround(struct lp_build_context *bld,
 
 
 /**
- * Convert float[] to int[] with floor().
+ * Return floor of float (vector), result is an int (vector)
+ * Ex: ifloor(1.1) = 1
+ * Ex: ifloor(-1.1) = -2
  */
 LLVMValueRef
 lp_build_ifloor(struct lp_build_context *bld,
@@ -1069,17 +1260,10 @@ lp_build_ifloor(struct lp_build_context *bld,
    LLVMValueRef res;
 
    assert(type.floating);
-
-   if (type.length == 1) {
-      /* scalar float to int */
-      LLVMTypeRef int_type = LLVMIntType(type.width);
-      res = LLVMBuildFPToSI(bld->builder, a, int_type, "");
-      return res;
-   }
-
    assert(lp_check_value(type, a));
 
-   if(util_cpu_caps.has_sse4_1) {
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
    }
    else {
@@ -1093,29 +1277,31 @@ lp_build_ifloor(struct lp_build_context *bld,
       /* sign = a < 0 ? ~0 : 0 */
       sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
       sign = LLVMBuildAnd(bld->builder, sign, mask, "");
-      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "");
-      lp_build_name(sign, "floor.sign");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign");
 
       /* offset = -0.99999(9)f */
-      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
       offset = LLVMConstBitCast(offset, int_vec_type);
 
-      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
+      /* offset = a < 0 ? offset : 0.0f */
       offset = LLVMBuildAnd(bld->builder, offset, sign, "");
-      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
-      lp_build_name(offset, "floor.offset");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
 
-      res = LLVMBuildAdd(bld->builder, a, offset, "");
-      lp_build_name(res, "floor.res");
+      res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
    }
 
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
-   lp_build_name(res, "floor");
+   /* convert to integer, truncating toward zero */
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res");
 
    return res;
 }
 
 
+/**
+ * Return ceiling of float (vector), returning int (vector).
+ * Ex: iceil( 1.1) = 2
+ * Ex: iceil(-1.1) = -1
+ */
 LLVMValueRef
 lp_build_iceil(struct lp_build_context *bld,
                LLVMValueRef a)
@@ -1127,15 +1313,36 @@ lp_build_iceil(struct lp_build_context *bld,
    assert(type.floating);
    assert(lp_check_value(type, a));
 
-   if(util_cpu_caps.has_sse4_1) {
+   if (util_cpu_caps.has_sse4_1 &&
+       (type.length == 1 || type.width*type.length == 128)) {
       res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
    }
    else {
-      assert(0);
-      res = bld->undef;
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      unsigned mantissa = lp_mantissa(type);
+      LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef offset;
+
+      /* sign = a < 0 ? 0 : ~0 */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign");
+      sign = LLVMBuildNot(bld->builder, sign, "iceil.not");
+
+      /* offset = 0.99999(9)f */
+      offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa));
+      offset = LLVMConstBitCast(offset, int_vec_type);
+
+      /* offset = a < 0 ? 0.0 : offset */
+      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
+
+      res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
    }
 
-   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+   /* convert to integer, truncating toward zero */
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res");
 
    return res;
 }
@@ -1149,6 +1356,8 @@ lp_build_sqrt(struct lp_build_context *bld,
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
+   assert(lp_check_value(type, a));
+
    /* TODO: optimize the constant case */
    /* TODO: optimize the constant case */
 
@@ -1159,12 +1368,44 @@ lp_build_sqrt(struct lp_build_context *bld,
 }
 
 
+/**
+ * Do one Newton-Raphson step to improve reciprocal precision:
+ *
+ *   x_{i+1} = x_i * (2 - a * x_i)
+ *
+ * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or
+ * +/-Inf, giving NaN instead.  Certain applications rely on this behavior,
+ * such as Google Earth, which does RCP(RSQRT(0.0)) when drawing the Earth's
+ * halo. It would be necessary to clamp the argument to prevent this.
+ *
+ * See also:
+ * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
+ * - http://softwarecommunity.intel.com/articles/eng/1818.htm
+ */
+static INLINE LLVMValueRef
+lp_build_rcp_refine(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef rcp_a)
+{
+   LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
+   LLVMValueRef res;
+
+   res = LLVMBuildFMul(bld->builder, a, rcp_a, "");
+   res = LLVMBuildFSub(bld->builder, two, res, "");
+   res = LLVMBuildFMul(bld->builder, rcp_a, res, "");
+
+   return res;
+}
+
+
 LLVMValueRef
 lp_build_rcp(struct lp_build_context *bld,
              LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
 
+   assert(lp_check_value(type, a));
+
    if(a == bld->zero)
       return bld->undef;
    if(a == bld->one)
@@ -1177,15 +1418,65 @@ lp_build_rcp(struct lp_build_context *bld,
    if(LLVMIsConstant(a))
       return LLVMConstFDiv(bld->one, a);
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
-      /* FIXME: improve precision */
-      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
+   /*
+    * We don't use RCPPS because:
+    * - it only has 10 bits of precision
+    * - it doesn't even get the reciprocal of 1.0 exactly
+    * - doing Newton-Raphson steps yields wrong (NaN) values for 0.0 or Inf
+    * - for recent processors the benefit over DIVPS is marginal, and case
+    *   dependent
+    *
+    * We could still use it on certain processors if benchmarks show that the
+    * RCPPS plus necessary workarounds are still preferable to DIVPS; or for
+    * particular uses that require fewer workarounds.
+    */
+
+   if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
+      LLVMValueRef res;
+      unsigned i;
+
+      res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
+
+      for (i = 0; i < num_iterations; ++i) {
+         res = lp_build_rcp_refine(bld, a, res);
+      }
+
+      return res;
+   }
 
    return LLVMBuildFDiv(bld->builder, bld->one, a, "");
 }
 
 
 /**
+ * Do one Newton-Raphson step to improve rsqrt precision:
+ *
+ *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
+ *
+ * See also:
+ * - http://softwarecommunity.intel.com/articles/eng/1818.htm
+ */
+static INLINE LLVMValueRef
+lp_build_rsqrt_refine(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      LLVMValueRef rsqrt_a)
+{
+   LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
+   LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
+   LLVMValueRef res;
+
+   res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, "");
+   res = LLVMBuildFMul(bld->builder, a, res, "");
+   res = LLVMBuildFSub(bld->builder, three, res, "");
+   res = LLVMBuildFMul(bld->builder, rsqrt_a, res, "");
+   res = LLVMBuildFMul(bld->builder, half, res, "");
+
+   return res;
+}
+
+
+/**
  * Generate 1/sqrt(a)
  */
 LLVMValueRef
@@ -1194,70 +1485,476 @@ lp_build_rsqrt(struct lp_build_context *bld,
 {
    const struct lp_type type = bld->type;
 
+   assert(lp_check_value(type, a));
+
    assert(type.floating);
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
-      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
+   if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      const unsigned num_iterations = 0;
+      LLVMValueRef res;
+      unsigned i;
+
+      res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
+
+      for (i = 0; i < num_iterations; ++i) {
+         res = lp_build_rsqrt_refine(bld, a, res);
+      }
+
+      return res;
+   }
 
    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }
 
 
+static inline LLVMValueRef
+lp_build_const_v4si(unsigned long value)
+{
+   LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0);
+   LLVMValueRef elements[4] = { element, element, element, element };
+   return LLVMConstVector(elements, 4);
+}
+
+static inline LLVMValueRef
+lp_build_const_v4sf(float value)
+{
+   LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value);
+   LLVMValueRef elements[4] = { element, element, element, element };
+   return LLVMConstVector(elements, 4);
+}
+
+
 /**
- * Generate cos(a)
+ * Generate sin(a) using SSE2
  */
 LLVMValueRef
-lp_build_cos(struct lp_build_context *bld,
-              LLVMValueRef a)
+lp_build_sin(struct lp_build_context *bld,
+             LLVMValueRef a)
 {
-#ifdef PIPE_OS_WINDOWS
+   struct lp_type int_type = lp_int_type(bld->type);
+   LLVMBuilderRef b = bld->builder;
+   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
+   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
+
    /*
-    * FIXME: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf()
-    * which is neither efficient nor does the CRT linkage work on Windows
-    * causing segmentation fault. So simply disable the code for now.
+    *  take the absolute value,
+    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
     */
-   return bld->one;
-#else
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   char intrinsic[32];
 
-   /* TODO: optimize the constant case */
+   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
+   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
 
-   assert(type.floating);
-   util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
+   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
+   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
 
-   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
-#endif
+   /*
+    * extract the sign bit (upper one)
+    * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask);
+    */
+   LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000);
+   LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i");
+
+   /*
+    * scale by 4/Pi
+    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    */
+   
+   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
+   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
+
+   /*
+    * store the integer part of y in mm0
+    * emm2 = _mm_cvttps_epi32(y);
+    */
+   
+   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
+
+   /*
+    * j=(j+1) & (~1) (see the cephes sources)
+    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+    */
+
+   LLVMValueRef all_one = lp_build_const_v4si(1);
+   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
+   /*
+    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+    */
+   LLVMValueRef inv_one = lp_build_const_v4si(~1);
+   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
+
+   /*
+    * y = _mm_cvtepi32_ps(emm2);
+    */
+   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
+
+   /* get the swap sign flag
+    * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4);
+    */
+   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
+   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and");
+   
+   /*
+    * emm2 = _mm_slli_epi32(emm0, 29);
+    */  
+   LLVMValueRef const_29 = lp_build_const_v4si(29);
+   LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit");
+
+   /*
+    * get the polynom selection mask 
+    * there is one polynom for 0 <= x <= Pi/4
+    * and another one for Pi/4<x<=Pi/2
+    * Both branches will be computed.
+    *  
+    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+    */
+
+   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
+   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3");
+   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
+                                             emm2_3, lp_build_const_v4si(0));
+   /*
+    *   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+    */
+   LLVMValueRef sign_bit_1 =  LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit");
+
+   /*
+    * _PS_CONST(minus_cephes_DP1, -0.78515625);
+    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+    */
+   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
+   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
+   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
+
+   /*
+    * The magic pass: "Extended precision modular arithmetic" 
+    * x = ((x - y * DP1) - y * DP2) - y * DP3; 
+    * xmm1 = _mm_mul_ps(y, xmm1);
+    * xmm2 = _mm_mul_ps(y, xmm2);
+    * xmm3 = _mm_mul_ps(y, xmm3);
+    */
+   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
+   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
+   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
+
+   /*
+    * x = _mm_add_ps(x, xmm1);
+    * x = _mm_add_ps(x, xmm2);
+    * x = _mm_add_ps(x, xmm3);
+    */ 
+
+   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
+   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
+   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
+
+   /*
+    * Evaluate the first polynom  (0 <= x <= Pi/4)
+    *
+    * z = _mm_mul_ps(x,x);
+    */
+   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
+
+   /*
+    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
+    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
+    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
+    */
+   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
+   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
+   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
+
+   /*
+    * y = *(v4sf*)_ps_coscof_p0;
+    * y = _mm_mul_ps(y, z);
+    */
+   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
+   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
+   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
+   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
+   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
+   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
+
+
+   /*
+    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+    * y = _mm_sub_ps(y, tmp);
+    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
+    */ 
+   LLVMValueRef half = lp_build_const_v4sf(0.5);
+   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
+   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
+   LLVMValueRef one = lp_build_const_v4sf(1.0);
+   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
+
+   /*
+    * _PS_CONST(sincof_p0, -1.9515295891E-4);
+    * _PS_CONST(sincof_p1,  8.3321608736E-3);
+    * _PS_CONST(sincof_p2, -1.6666654611E-1);
+    */
+   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
+   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
+   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
+
+   /*
+    * Evaluate the second polynom  (Pi/4 <= x <= Pi/2)
+    *
+    * y2 = *(v4sf*)_ps_sincof_p0;
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_mul_ps(y2, x);
+    * y2 = _mm_add_ps(y2, x);
+    */
+
+   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
+   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
+   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
+   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
+   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
+   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
+   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
+
+   /*
+    * select the correct result from the two polynoms
+    * xmm3 = poly_mask;
+    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+    * y = _mm_andnot_ps(xmm3, y);
+    * y = _mm_add_ps(y,y2);
+    */
+   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
+   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
+   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
+   LLVMValueRef inv = lp_build_const_v4si(~0);
+   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
+   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
+   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
+
+   /*
+    * update the sign
+    * y = _mm_xor_ps(y, sign_bit);
+    */
+   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin");
+   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
+   return y_result;
 }
 
 
 /**
- * Generate sin(a)
+ * Generate cos(a) using SSE2
  */
 LLVMValueRef
-lp_build_sin(struct lp_build_context *bld,
-              LLVMValueRef a)
+lp_build_cos(struct lp_build_context *bld,
+             LLVMValueRef a)
 {
-#ifdef PIPE_OS_WINDOWS
+   struct lp_type int_type = lp_int_type(bld->type);
+   LLVMBuilderRef b = bld->builder;
+   LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4);
+   LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4);
+
    /*
-    * FIXME: X86 backend translates llvm.sin.v4f32 to 4 calls to CRT's sinf()
-    * which is neither efficient nor does the CRT linkage work on Windows
-    * causing segmentation fault. So simply disable the code for now.
+    *  take the absolute value,
+    *  x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask);
     */
-   return bld->zero;
-#else
-   const struct lp_type type = bld->type;
-   LLVMTypeRef vec_type = lp_build_vec_type(type);
-   char intrinsic[32];
 
-   /* TODO: optimize the constant case */
+   LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000);
+   LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si");
 
-   assert(type.floating);
-   util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
+   LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi");
+   LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs");
 
-   return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
-#endif
+   /*
+    * scale by 4/Pi
+    * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI);
+    */
+   
+   LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
+   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
+
+   /*
+    * store the integer part of y in mm0
+    * emm2 = _mm_cvttps_epi32(y);
+    */
+   
+   LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i");
+
+   /*
+    * j=(j+1) & (~1) (see the cephes sources)
+    * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1);
+    */
+
+   LLVMValueRef all_one = lp_build_const_v4si(1);
+   LLVMValueRef emm2_add =  LLVMBuildAdd(b, emm2_i, all_one, "emm2_add");
+   /*
+    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1);
+    */
+   LLVMValueRef inv_one = lp_build_const_v4si(~1);
+   LLVMValueRef emm2_and =  LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and");
+
+   /*
+    * y = _mm_cvtepi32_ps(emm2);
+    */
+   LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2");
+
+
+   /*
+    * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2);
+    */
+   LLVMValueRef const_2 = lp_build_const_v4si(2);
+   LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2");
+
+
+   /* get the swap sign flag
+    * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4);
+    */
+   LLVMValueRef inv = lp_build_const_v4si(~0);
+   LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not");
+   LLVMValueRef pi32_4 = lp_build_const_v4si(4);
+   LLVMValueRef emm0_and =  LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and");
+   
+   /*
+    * emm2 = _mm_slli_epi32(emm0, 29);
+    */  
+   LLVMValueRef const_29 = lp_build_const_v4si(29);
+   LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit");
+
+   /*
+    * get the polynom selection mask 
+    * there is one polynom for 0 <= x <= Pi/4
+    * and another one for Pi/4<x<=Pi/2
+    * Both branches will be computed.
+    *  
+    * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2);
+    * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+    */
+
+   LLVMValueRef pi32_2 = lp_build_const_v4si(2);
+   LLVMValueRef emm2_3 =  LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3");
+   LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL,
+   				             emm2_3, lp_build_const_v4si(0));
+
+   /*
+    * _PS_CONST(minus_cephes_DP1, -0.78515625);
+    * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
+    * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
+    */
+   LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625);
+   LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4);
+   LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8);
+
+   /*
+    * The magic pass: "Extended precision modular arithmetic" 
+    * x = ((x - y * DP1) - y * DP2) - y * DP3; 
+    * xmm1 = _mm_mul_ps(y, xmm1);
+    * xmm2 = _mm_mul_ps(y, xmm2);
+    * xmm3 = _mm_mul_ps(y, xmm3);
+    */
+   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
+   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
+   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
+
+   /*
+    * x = _mm_add_ps(x, xmm1);
+    * x = _mm_add_ps(x, xmm2);
+    * x = _mm_add_ps(x, xmm3);
+    */ 
+
+   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
+   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
+   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
+
+   /*
+    * Evaluate the first polynom  (0 <= x <= Pi/4)
+    *
+    * z = _mm_mul_ps(x,x);
+    */
+   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
+
+   /*
+    * _PS_CONST(coscof_p0,  2.443315711809948E-005);
+    * _PS_CONST(coscof_p1, -1.388731625493765E-003);
+    * _PS_CONST(coscof_p2,  4.166664568298827E-002);
+    */
+   LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005);
+   LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003);
+   LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002);
+
+   /*
+    * y = *(v4sf*)_ps_coscof_p0;
+    * y = _mm_mul_ps(y, z);
+    */
+   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
+   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
+   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
+   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
+   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
+   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
+
+
+   /*
+    * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5);
+    * y = _mm_sub_ps(y, tmp);
+    * y = _mm_add_ps(y, *(v4sf*)_ps_1);
+    */ 
+   LLVMValueRef half = lp_build_const_v4sf(0.5);
+   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
+   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
+   LLVMValueRef one = lp_build_const_v4sf(1.0);
+   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
+
+   /*
+    * _PS_CONST(sincof_p0, -1.9515295891E-4);
+    * _PS_CONST(sincof_p1,  8.3321608736E-3);
+    * _PS_CONST(sincof_p2, -1.6666654611E-1);
+    */
+   LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4);
+   LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3);
+   LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1);
+
+   /*
+    * Evaluate the second polynom  (Pi/4 <= x <= Pi/2)
+    *
+    * y2 = *(v4sf*)_ps_sincof_p0;
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1);
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2);
+    * y2 = _mm_mul_ps(y2, z);
+    * y2 = _mm_mul_ps(y2, x);
+    * y2 = _mm_add_ps(y2, x);
+    */
+
+   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
+   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
+   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
+   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
+   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
+   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
+   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
+
+   /*
+    * select the correct result from the two polynoms
+    * xmm3 = poly_mask;
+    * y2 = _mm_and_ps(xmm3, y2); //, xmm3);
+    * y = _mm_andnot_ps(xmm3, y);
+    * y = _mm_add_ps(y,y2);
+    */
+   LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i");
+   LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i");
+   LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and");
+   LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv");
+   LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and");
+   LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine");
+
+   /*
+    * update the sign
+    * y = _mm_xor_ps(y, sign_bit);
+    */
+   LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin");
+   LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result");
+   return y_result;
 }
 
 
@@ -1270,9 +1967,11 @@ lp_build_pow(struct lp_build_context *bld,
              LLVMValueRef y)
 {
    /* TODO: optimize the constant case */
-   if(LLVMIsConstant(x) && LLVMIsConstant(y))
+   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
+       LLVMIsConstant(x) && LLVMIsConstant(y)) {
       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                    __FUNCTION__);
+   }
 
    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
 }
@@ -1288,6 +1987,8 @@ lp_build_exp(struct lp_build_context *bld,
    /* log2(e) = 1/log(2) */
    LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
 
+   assert(lp_check_value(bld->type, x));
+
    return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
 }
 
@@ -1302,14 +2003,12 @@ lp_build_log(struct lp_build_context *bld,
    /* log(2) */
    LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
 
+   assert(lp_check_value(bld->type, x));
+
    return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
 }
 
 
-#define EXP_POLY_DEGREE 3
-#define LOG_POLY_DEGREE 5
-
-
 /**
  * Generate polynomial.
  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
@@ -1321,22 +2020,22 @@ lp_build_polynomial(struct lp_build_context *bld,
                     unsigned num_coeffs)
 {
    const struct lp_type type = bld->type;
-   LLVMTypeRef float_type = LLVMFloatType();
    LLVMValueRef res = NULL;
    unsigned i;
 
+   assert(lp_check_value(bld->type, x));
+
    /* TODO: optimize the constant case */
-   if(LLVMIsConstant(x))
+   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
+       LLVMIsConstant(x)) {
       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                    __FUNCTION__);
+   }
 
    for (i = num_coeffs; i--; ) {
       LLVMValueRef coeff;
 
-      if (type.length == 1)
-         coeff = LLVMConstReal(float_type, coeffs[i]);
-      else
-         coeff = lp_build_const_vec(type, coeffs[i]);
+      coeff = lp_build_const_vec(type, coeffs[i]);
 
       if(res)
          res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
@@ -1352,17 +2051,31 @@ lp_build_polynomial(struct lp_build_context *bld,
 
 
 /**
- * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
+ * Minimax polynomial fit of 2**x, in range [0, 1[
  */
 const double lp_build_exp2_polynomial[] = {
 #if EXP_POLY_DEGREE == 5
-   9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
+   0.999999999690134838155,
+   0.583974334321735217258,
+   0.164553105719676828492,
+   0.0292811063701710962255,
+   0.00354944426657875141846,
+   0.000296253726543423377365
 #elif EXP_POLY_DEGREE == 4
-   1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
+   1.00000001502262084505,
+   0.563586057338685991394,
+   0.150436017652442413623,
+   0.0243220604213317927308,
+   0.0025359088446580436489
 #elif EXP_POLY_DEGREE == 3
-   9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
+   0.999925218562710312959,
+   0.695833540494823811697,
+   0.226067155427249155588,
+   0.0780245226406372992967
 #elif EXP_POLY_DEGREE == 2
-   1.0017247, 6.5763628e-1, 3.3718944e-1
+   1.00172476321474503578,
+   0.657636275736077639316,
+   0.33718943461968720704
 #else
 #error
 #endif
@@ -1385,28 +2098,31 @@ lp_build_exp2_approx(struct lp_build_context *bld,
    LLVMValueRef expfpart = NULL;
    LLVMValueRef res = NULL;
 
+   assert(lp_check_value(bld->type, x));
+
    if(p_exp2_int_part || p_frac_part || p_exp2) {
       /* TODO: optimize the constant case */
-      if(LLVMIsConstant(x))
+      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
+          LLVMIsConstant(x)) {
          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                       __FUNCTION__);
+      }
 
       assert(type.floating && type.width == 32);
 
       x = lp_build_min(bld, x, lp_build_const_vec(type,  129.0));
       x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999));
 
-      /* ipart = int(x - 0.5) */
-      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_vec(type, 0.5f), "");
-      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
+      /* ipart = floor(x) */
+      ipart = lp_build_floor(bld, x);
 
       /* fpart = x - ipart */
-      fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
-      fpart = LLVMBuildSub(bld->builder, x, fpart, "");
+      fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
    }
 
    if(p_exp2_int_part || p_exp2) {
       /* expipart = (float) (1 << ipart) */
+      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
       expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), "");
       expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), "");
       expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
@@ -1416,7 +2132,7 @@ lp_build_exp2_approx(struct lp_build_context *bld,
       expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                      Elements(lp_build_exp2_polynomial));
 
-      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
+      res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
    }
 
    if(p_exp2_int_part)
@@ -1447,13 +2163,27 @@ lp_build_exp2(struct lp_build_context *bld,
  */
 const double lp_build_log2_polynomial[] = {
 #if LOG_POLY_DEGREE == 6
-   3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
+   3.11578814719469302614,
+   -3.32419399085241980044,
+   2.59883907202499966007,
+   -1.23152682416275988241,
+   0.318212422185251071475,
+   -0.0344359067839062357313
 #elif LOG_POLY_DEGREE == 5
-   2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
+   2.8882704548164776201,
+   -2.52074962577807006663,
+   1.48116647521213171641,
+   -0.465725644288844778798,
+   0.0596515482674574969533
 #elif LOG_POLY_DEGREE == 4
-   2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
+   2.61761038894603480148,
+   -1.75647175389045657003,
+   0.688243882994381274313,
+   -0.107254423828329604454
 #elif LOG_POLY_DEGREE == 3
-   2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
+   2.28330284476918490682,
+   -1.04913055217340124191,
+   0.204446009836232697516
 #else
 #error
 #endif
@@ -1485,11 +2215,15 @@ lp_build_log2_approx(struct lp_build_context *bld,
    LLVMValueRef logmant = NULL;
    LLVMValueRef res = NULL;
 
+   assert(lp_check_value(bld->type, x));
+
    if(p_exp || p_floor_log2 || p_log2) {
       /* TODO: optimize the constant case */
-      if(LLVMIsConstant(x))
+      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
+          LLVMIsConstant(x)) {
          debug_printf("%s: inefficient/imprecise constant arithmetic\n",
                       __FUNCTION__);
+      }
 
       assert(type.floating && type.width == 32);
 
@@ -1515,9 +2249,9 @@ lp_build_log2_approx(struct lp_build_context *bld,
                                     Elements(lp_build_log2_polynomial));
 
       /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
+      logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
 
-      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
+      res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
    }
 
    if(p_exp) {
@@ -1533,89 +2267,11 @@ lp_build_log2_approx(struct lp_build_context *bld,
 }
 
 
-/** scalar version of above function */
-static void
-lp_build_float_log2_approx(struct lp_build_context *bld,
-                           LLVMValueRef x,
-                           LLVMValueRef *p_exp,
-                           LLVMValueRef *p_floor_log2,
-                           LLVMValueRef *p_log2)
-{
-   const struct lp_type type = bld->type;
-   LLVMTypeRef float_type = LLVMFloatType();
-   LLVMTypeRef int_type = LLVMIntType(type.width);
-
-   LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0);
-   LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0);
-   LLVMValueRef one = LLVMConstBitCast(bld->one, int_type);
-
-   LLVMValueRef i = NULL;
-   LLVMValueRef exp = NULL;
-   LLVMValueRef mant = NULL;
-   LLVMValueRef logexp = NULL;
-   LLVMValueRef logmant = NULL;
-   LLVMValueRef res = NULL;
-
-   if(p_exp || p_floor_log2 || p_log2) {
-      /* TODO: optimize the constant case */
-      if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
-                      __FUNCTION__);
-
-      assert(type.floating && type.width == 32);
-
-      i = LLVMBuildBitCast(bld->builder, x, int_type, "");
-
-      /* exp = (float) exponent(x) */
-      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
-   }
-
-   if(p_floor_log2 || p_log2) {
-      LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0);
-      LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0);
-      logexp = LLVMBuildLShr(bld->builder, exp, c23, "");
-      logexp = LLVMBuildSub(bld->builder, logexp, c127, "");
-      logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, "");
-   }
-
-   if(p_log2) {
-      /* mant = (float) mantissa(x) */
-      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
-      mant = LLVMBuildOr(bld->builder, mant, one, "");
-      mant = LLVMBuildBitCast(bld->builder, mant, float_type, "");
-
-      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
-                                    Elements(lp_build_log2_polynomial));
-
-      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
-
-      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
-   }
-
-   if(p_exp) {
-      exp = LLVMBuildBitCast(bld->builder, exp, float_type, "");
-      *p_exp = exp;
-   }
-
-   if(p_floor_log2)
-      *p_floor_log2 = logexp;
-
-   if(p_log2)
-      *p_log2 = res;
-}
-
-
 LLVMValueRef
 lp_build_log2(struct lp_build_context *bld,
               LLVMValueRef x)
 {
    LLVMValueRef res;
-   if (bld->type.length == 1) {
-      lp_build_float_log2_approx(bld, x, NULL, NULL, &res);
-   }
-   else {
-      lp_build_log2_approx(bld, x, NULL, NULL, &res);
-   }
+   lp_build_log2_approx(bld, x, NULL, NULL, &res);
    return res;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_assert.c b/src/gallium/auxiliary/gallivm/lp_bld_assert.c
new file mode 100644
index 00000000000..f2ebd868a8d
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_assert.c
@@ -0,0 +1,101 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "lp_bld_assert.h"
+#include "lp_bld_init.h"
+#include "lp_bld_printf.h"
+
+
+/**
+ * Run-time half of lp_build_assert(): the generated code calls this function.
+ */
+static void
+lp_assert(int condition, const char *msg)
+{
+   if (condition)
+      return;
+   debug_printf("LLVM assertion '%s' failed!\n", msg);
+   assert(condition);
+}
+
+
+
+/**
+ * Build an assertion in LLVM IR by emitting a call to the lp_assert()
+ * function above.  lp_assert is declared in the builder's current module
+ * and mapped to the native function if the module does not have it yet.
+ *
+ * \param condition should be an 'i1' or 'i32' value
+ * \param msg  a string to print if the assertion fails.
+ * \return the value of the emitted call instruction
+ */
+LLVMValueRef
+lp_build_assert(LLVMBuilderRef builder, LLVMValueRef condition,
+                const char *msg)
+{
+   LLVMModuleRef module;
+   LLVMTypeRef arg_types[2];
+   LLVMValueRef msg_string, assert_func, params[2], r;
+
+   module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
+                            LLVMGetInsertBlock(builder)));
+
+   msg_string = lp_build_const_string_variable(module, msg, strlen(msg) + 1);
+
+   arg_types[0] = LLVMInt32Type();
+   arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0);
+
+   /* look up an earlier declaration of lp_assert in this module */
+   assert_func = LLVMGetNamedFunction(module, "lp_assert");
+
+   /* Not declared yet: declare it and map it to the native lp_assert() */
+   if (!assert_func) {
+      LLVMTypeRef func_type =
+         LLVMFunctionType(LLVMVoidType(), arg_types, 2, 0);
+
+      assert_func = LLVMAddFunction(module, "lp_assert", func_type);
+      LLVMSetFunctionCallConv(assert_func, LLVMCCallConv);
+      LLVMSetLinkage(assert_func, LLVMExternalLinkage);
+      LLVMAddGlobalMapping(lp_build_engine, assert_func,
+                           func_to_pointer((func_pointer)lp_assert));
+   }
+   assert(assert_func);
+
+   /* build function call param list: (i32 condition, i8* msg) */
+   params[0] = LLVMBuildZExt(builder, condition, arg_types[0], "");
+   params[1] = LLVMBuildBitCast(builder, msg_string, arg_types[1], "");
+
+   /* check that the converted arguments match the declared signature */
+   assert(LLVMTypeOf(params[0]) == arg_types[0]);
+   assert(LLVMTypeOf(params[1]) == arg_types[1]);
+
+   r = LLVMBuildCall(builder, assert_func, params, 2, "");
+
+   return r;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_assert.h b/src/gallium/auxiliary/gallivm/lp_bld_assert.h
new file mode 100644
index 00000000000..ddd879dc2c6
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_assert.h
@@ -0,0 +1,41 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_BLD_ASSERT_H
+#define LP_BLD_ASSERT_H
+
+
+#include "lp_bld.h"
+
+
+LLVMValueRef
+lp_build_assert(LLVMBuilderRef builder, LLVMValueRef condition,
+                const char *msg);
+
+
+#endif
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c b/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c
new file mode 100644
index 00000000000..706479b4d56
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c
@@ -0,0 +1,187 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_const.h"
+#include "lp_bld_bitarit.h"
+
+
+/**
+ * Bitwise a | b; float operands are combined as integers and cast back.
+ */
+LLVMValueRef
+lp_build_or(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
+{
+   LLVMBuilderRef builder = bld->builder;
+   const struct lp_type type = bld->type;
+   LLVMValueRef result;
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
+   /* LLVM's bitwise instructions only take integer operands */
+   if (type.floating) {
+      a = LLVMBuildBitCast(builder, a, bld->int_vec_type, "");
+      b = LLVMBuildBitCast(builder, b, bld->int_vec_type, "");
+   }
+
+   result = LLVMBuildOr(builder, a, b, "");
+
+   if (!type.floating)
+      return result;
+
+   return LLVMBuildBitCast(builder, result, bld->vec_type, "");
+}
+
+
+/**
+ * Bitwise a & b; float operands are combined as integers and cast back.
+ */
+LLVMValueRef
+lp_build_and(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
+{
+   LLVMBuilderRef builder = bld->builder;
+   const struct lp_type type = bld->type;
+   LLVMValueRef result;
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
+   /* LLVM's bitwise instructions only take integer operands */
+   if (type.floating) {
+      a = LLVMBuildBitCast(builder, a, bld->int_vec_type, "");
+      b = LLVMBuildBitCast(builder, b, bld->int_vec_type, "");
+   }
+
+   result = LLVMBuildAnd(builder, a, b, "");
+
+   if (!type.floating)
+      return result;
+
+   return LLVMBuildBitCast(builder, result, bld->vec_type, "");
+}
+
+
+/**
+ * Bitwise a & ~b; float operands are combined as integers and cast back.
+ */
+LLVMValueRef
+lp_build_andnot(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
+{
+   LLVMBuilderRef builder = bld->builder;
+   const struct lp_type type = bld->type;
+   LLVMValueRef not_b, result;
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
+   /* LLVM's bitwise instructions only take integer operands */
+   if (type.floating) {
+      a = LLVMBuildBitCast(builder, a, bld->int_vec_type, "");
+      b = LLVMBuildBitCast(builder, b, bld->int_vec_type, "");
+   }
+
+   not_b = LLVMBuildNot(builder, b, "");
+   result = LLVMBuildAnd(builder, a, not_b, "");
+
+   if (!type.floating)
+      return result;
+
+   return LLVMBuildBitCast(builder, result, bld->vec_type, "");
+}
+
+
+/**
+ * Shift left: a << b.
+ * Only valid for integer types (asserted).
+ */
+LLVMValueRef
+lp_build_shl(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
+{
+   LLVMBuilderRef builder = bld->builder;
+   const struct lp_type type = bld->type;
+
+   /* integer-only operation; operands are checked against the context type */
+   assert(!type.floating);
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
+   return LLVMBuildShl(builder, a, b, "");
+}
+
+
+/**
+ * Shift right: a >> b on integer types.
+ * Signed types get an arithmetic shift, unsigned types a logical one.
+ */
+LLVMValueRef
+lp_build_shr(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
+{
+   LLVMBuilderRef builder = bld->builder;
+
+   /* integer-only operation; operands are checked against the context type */
+   assert(!bld->type.floating);
+   assert(lp_check_value(bld->type, a));
+   assert(lp_check_value(bld->type, b));
+
+   /* unsigned: logical shift right */
+   if (!bld->type.sign) {
+      return LLVMBuildLShr(builder, a, b, "");
+   }
+
+   /* signed: arithmetic shift right */
+   return LLVMBuildAShr(builder, a, b, "");
+}
+
+
+/**
+ * Shift left by the constant imm, which must be smaller than the type width
+ * (LLVM leaves the result of a shift by >= the bit width undefined).
+ */
+LLVMValueRef
+lp_build_shl_imm(struct lp_build_context *bld, LLVMValueRef a, unsigned imm)
+{
+   assert(imm < bld->type.width);
+   return lp_build_shl(bld, a, lp_build_const_int_vec(bld->type, imm));
+}
+
+
+/**
+ * Shift right by the constant imm, which must be smaller than the type width
+ * (LLVM leaves the result of a shift by >= the bit width undefined).
+ */
+LLVMValueRef
+lp_build_shr_imm(struct lp_build_context *bld, LLVMValueRef a, unsigned imm)
+{
+   assert(imm < bld->type.width);
+   return lp_build_shr(bld, a, lp_build_const_int_vec(bld->type, imm));
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h b/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h
new file mode 100644
index 00000000000..5c5b9818519
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h
@@ -0,0 +1,69 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper bitwise arithmetic functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_BLD_BITARIT_H
+#define LP_BLD_BITARIT_H
+
+
+#include "gallivm/lp_bld.h"
+
+
+struct lp_type;
+struct lp_build_context;
+
+
+LLVMValueRef
+lp_build_or(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b);
+
+LLVMValueRef
+lp_build_and(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b);
+
+LLVMValueRef
+lp_build_andnot(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b);
+
+LLVMValueRef
+lp_build_shl(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b);
+
+LLVMValueRef
+lp_build_shr(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b);
+
+LLVMValueRef
+lp_build_shl_imm(struct lp_build_context *bld, LLVMValueRef a, unsigned imm);
+
+LLVMValueRef
+lp_build_shr_imm(struct lp_build_context *bld, LLVMValueRef a, unsigned imm);
+
+
+#endif /* !LP_BLD_BITARIT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
index 57843e9a60c..dd839c0bea5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -280,31 +280,45 @@ lp_build_one(struct lp_type type)
                
 
 /**
- * Build constant-valued vector from a scalar value.
+ * Build constant-valued element from a scalar value.
  */
 LLVMValueRef
-lp_build_const_vec(struct lp_type type,
-                   double val)
+lp_build_const_elem(struct lp_type type,
+                    double val)
 {
    LLVMTypeRef elem_type = lp_build_elem_type(type);
-   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
-   unsigned i;
-
-   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   LLVMValueRef elem;
 
    if(type.floating) {
-      elems[0] = LLVMConstReal(elem_type, val);
+      elem = LLVMConstReal(elem_type, val);
    }
    else {
       double dscale = lp_const_scale(type);
 
-      elems[0] = LLVMConstInt(elem_type, val*dscale + 0.5, 0);
+      elem = LLVMConstInt(elem_type, val*dscale + 0.5, 0);
    }
 
-   for(i = 1; i < type.length; ++i)
-      elems[i] = elems[0];
+   return elem;
+}
 
-   return LLVMConstVector(elems, type.length);
+
+/**
+ * Build constant-valued vector from a scalar value.
+ */
+LLVMValueRef
+lp_build_const_vec(struct lp_type type,
+                   double val)
+{
+   if (type.length == 1) {
+      return lp_build_const_elem(type, val);
+   } else {
+      LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+      unsigned i;
+      elems[0] = lp_build_const_elem(type, val);
+      for(i = 1; i < type.length; ++i)
+         elems[i] = elems[0];
+      return LLVMConstVector(elems, type.length);
+   }
 }
 
 
@@ -321,6 +335,9 @@ lp_build_const_int_vec(struct lp_type type,
    for(i = 0; i < type.length; ++i)
       elems[i] = LLVMConstInt(elem_type, val, type.sign ? 1 : 0);
 
+   if (type.length == 1)
+      return elems[0];
+
    return LLVMConstVector(elems, type.length);
 }
 
@@ -365,9 +382,12 @@ lp_build_const_aos(struct lp_type type,
 }
 
 
+/**
+ * @param mask TGSI_WRITEMASK_xxx
+ */
 LLVMValueRef
 lp_build_const_mask_aos(struct lp_type type,
-                        const boolean cond[4])
+                        unsigned mask)
 {
    LLVMTypeRef elem_type = LLVMIntType(type.width);
    LLVMValueRef masks[LP_MAX_VECTOR_LENGTH];
@@ -375,9 +395,13 @@ lp_build_const_mask_aos(struct lp_type type,
 
    assert(type.length <= LP_MAX_VECTOR_LENGTH);
 
-   for(j = 0; j < type.length; j += 4)
-      for(i = 0; i < 4; ++i)
-         masks[j + i] = LLVMConstInt(elem_type, cond[i] ? ~0 : 0, 0);
+   for (j = 0; j < type.length; j += 4) {
+      for( i = 0; i < 4; ++i) {
+         masks[j + i] = LLVMConstInt(elem_type,
+                                     mask & (1 << i) ? ~0ULL : 0,
+                                     1);
+      }
+   }
 
    return LLVMConstVector(masks, type.length);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
index 9ca2f0664eb..6b1fc590c17 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_const.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -85,6 +85,10 @@ lp_build_one(struct lp_type type);
 
 
 LLVMValueRef
+lp_build_const_elem(struct lp_type type,
+                    double val);
+
+LLVMValueRef
 lp_build_const_vec(struct lp_type type, double val);
 
 
@@ -100,7 +104,15 @@ lp_build_const_aos(struct lp_type type,
 
 LLVMValueRef
 lp_build_const_mask_aos(struct lp_type type,
-                        const boolean cond[4]);
+                        unsigned mask);
+
+
+static INLINE LLVMValueRef
+lp_build_const_int32(int i)
+{
+   return LLVMConstInt(LLVMInt32Type(), i, 0);
+}
+
 
 
 #endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 3f7f2ebde9c..8b477313d48 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -83,6 +83,9 @@
  *
  * Although the result values can be scaled to an arbitrary bit width specified
  * by dst_width, the actual result type will have the same width.
+ *
+ * Ex: src = { float, float, float, float }
+ * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
  */
 LLVMValueRef
 lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
@@ -114,8 +117,8 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
    scale = (double)mask/ubound;
    bias = (double)((unsigned long long)1 << (mantissa - n));
 
-   res = LLVMBuildMul(builder, src, lp_build_const_vec(src_type, scale), "");
-   res = LLVMBuildAdd(builder, res, lp_build_const_vec(src_type, bias), "");
+   res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
+   res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), "");
    res = LLVMBuildBitCast(builder, res, int_vec_type, "");
 
    if(dst_width > n) {
@@ -152,6 +155,8 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
 
 /**
  * Inverse of lp_build_clamped_float_to_unsigned_norm above.
+ * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
+ * return {float, float, float, float} with values in range [0, 1].
  */
 LLVMValueRef
 lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
@@ -170,6 +175,8 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
    double scale;
    double bias;
 
+   assert(dst_type.floating);
+
    mantissa = lp_mantissa(dst_type);
 
    n = MIN2(mantissa, src_width);
@@ -194,8 +201,8 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
    res = LLVMBuildBitCast(builder, res, vec_type, "");
 
-   res = LLVMBuildSub(builder, res, bias_, "");
-   res = LLVMBuildMul(builder, res, lp_build_const_vec(dst_type, scale), "");
+   res = LLVMBuildFSub(builder, res, bias_, "");
+   res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), "");
 
    return res;
 }
@@ -219,18 +226,19 @@ lp_build_conv(LLVMBuilderRef builder,
    unsigned num_tmps;
    unsigned i;
 
-   /* Register width must remain constant */
-   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
-
    /* We must not loose or gain channels. Only precision */
    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
 
    assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
    assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
+   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
 
    tmp_type = src_type;
-   for(i = 0; i < num_srcs; ++i)
+   for(i = 0; i < num_srcs; ++i) {
+      assert(lp_check_value(src_type, src[i]));
       tmp[i] = src[i];
+   }
    num_tmps = num_srcs;
 
    /*
@@ -290,7 +298,7 @@ lp_build_conv(LLVMBuilderRef builder,
          if (dst_scale != 1.0) {
             LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale);
             for(i = 0; i < num_tmps; ++i)
-               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
+               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
          }
 
          /* Use an equally sized integer for intermediate computations */
@@ -326,30 +334,25 @@ lp_build_conv(LLVMBuilderRef builder,
 
    /*
     * Truncate or expand bit width
+    *
+    * No data conversion should happen here, although the sign bits are
+    * crucial to avoid bad clamping.
     */
 
-   assert(!tmp_type.floating || tmp_type.width == dst_type.width);
+   {
+      struct lp_type new_type;
 
-   if(tmp_type.width > dst_type.width) {
-      assert(num_dsts == 1);
-      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
-      tmp_type.width = dst_type.width;
-      tmp_type.length = dst_type.length;
-      num_tmps = 1;
-   }
+      new_type = tmp_type;
+      new_type.sign   = dst_type.sign;
+      new_type.width  = dst_type.width;
+      new_type.length = dst_type.length;
 
-   if(tmp_type.width < dst_type.width) {
-      assert(num_tmps == 1);
-      lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
-      tmp_type.width = dst_type.width;
-      tmp_type.length = dst_type.length;
+      lp_build_resize(builder, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);
+
+      tmp_type = new_type;
       num_tmps = num_dsts;
    }
 
-   assert(tmp_type.width == dst_type.width);
-   assert(tmp_type.length == dst_type.length);
-   assert(num_tmps == num_dsts);
-
    /*
     * Scale to the widest range
     */
@@ -390,7 +393,7 @@ lp_build_conv(LLVMBuilderRef builder,
           if (src_scale != 1.0) {
              LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale);
              for(i = 0; i < num_tmps; ++i)
-                tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
+                tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
           }
       }
     }
@@ -406,8 +409,10 @@ lp_build_conv(LLVMBuilderRef builder,
        }
     }
 
-   for(i = 0; i < num_dsts; ++i)
+   for(i = 0; i < num_dsts; ++i) {
       dst[i] = tmp[i];
+      assert(lp_check_value(dst_type, dst[i]));
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
index 39dfc51e503..d3a5afff8c2 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
@@ -46,7 +46,7 @@
 boolean
 lp_check_alignment(const void *ptr, unsigned alignment)
 {
-   assert(util_is_pot(alignment));
+   assert(util_is_power_of_two(alignment));
    return ((uintptr_t)ptr & (alignment - 1)) == 0;
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index 7b010cbdb09..369c1bbf09a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -36,6 +36,20 @@
 #include "util/u_string.h"
 
 
+#define GALLIVM_DEBUG_TGSI      0x1
+#define GALLIVM_DEBUG_IR        0x2
+#define GALLIVM_DEBUG_ASM       0x4
+#define GALLIVM_DEBUG_NO_OPT    0x8
+#define GALLIVM_DEBUG_PERF      0x10
+
+
+#ifdef DEBUG
+extern unsigned gallivm_debug;
+#else
+#define gallivm_debug 0
+#endif
+
+
 static INLINE void
 lp_build_name(LLVMValueRef val, const char *format, ...)
 {
@@ -53,6 +67,10 @@ lp_build_name(LLVMValueRef val, const char *format, ...)
 }
 
 
+void
+lp_debug_dump_value(LLVMValueRef value);
+
+
 boolean
 lp_check_alignment(const void *ptr, unsigned alignment);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
index 8f15b1d287d..5bc9c741a88 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -38,7 +38,7 @@
 #include "lp_bld_flow.h"
 
 
-#define LP_BUILD_FLOW_MAX_VARIABLES 32
+#define LP_BUILD_FLOW_MAX_VARIABLES 64
 #define LP_BUILD_FLOW_MAX_DEPTH 32
 
 /**
@@ -407,6 +407,7 @@ lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
    /* for each variable, update the Phi node with a (variable, block) pair */
    for(i = 0; i < skip->num_variables; ++i) {
       assert(*flow->variables[i]);
+      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
       LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
    }
 
@@ -433,6 +434,7 @@ lp_build_flow_skip_end(struct lp_build_flow_context *flow)
    /* add (variable, block) tuples to the phi nodes */
    for(i = 0; i < skip->num_variables; ++i) {
       assert(*flow->variables[i]);
+      assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i]));
       LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
       *flow->variables[i] = skip->phi[i];
    }
@@ -821,8 +823,11 @@ lp_build_alloca(LLVMBuilderRef builder,
    LLVMBuilderRef first_builder = LLVMCreateBuilder();
    LLVMValueRef res;
 
-   LLVMPositionBuilderAtEnd(first_builder, first_block);
-   LLVMPositionBuilderBefore(first_builder, first_instr);
+   if (first_instr) {
+      LLVMPositionBuilderBefore(first_builder, first_instr);
+   } else {
+      LLVMPositionBuilderAtEnd(first_builder, first_block);
+   }
 
    res = LLVMBuildAlloca(first_builder, type, name);
 
@@ -840,7 +845,7 @@ lp_build_alloca(LLVMBuilderRef builder,
  * first block may prevent the X86 backend from successfully align the stack as
  * required.
  *
- * Also the scalarrepl pass is supossedly more powerful and can promote
+ * Also the scalarrepl pass is supposedly more powerful and can promote
  * arrays in many cases.
  *
  * See also:
@@ -859,7 +864,11 @@ lp_build_array_alloca(LLVMBuilderRef builder,
    LLVMBuilderRef first_builder = LLVMCreateBuilder();
    LLVMValueRef res;
 
-   LLVMPositionBuilderBefore(first_builder, first_instr);
+   if (first_instr) {
+      LLVMPositionBuilderBefore(first_builder, first_instr);
+   } else {
+      LLVMPositionBuilderAtEnd(first_builder, first_block);
+   }
 
    res = LLVMBuildArrayAlloca(first_builder, type, count, name);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 085937588ff..60e22d727ad 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -40,6 +40,7 @@
 
 struct util_format_description;
 struct lp_type;
+struct lp_build_context;
 
 
 /*
@@ -47,9 +48,9 @@ struct lp_type;
  */
 
 LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed);
+lp_build_format_swizzle_aos(const struct util_format_description *desc,
+                            struct lp_build_context *bld,
+                            LLVMValueRef unswizzled);
 
 LLVMValueRef
 lp_build_pack_rgba_aos(LLVMBuilderRef builder,
@@ -59,7 +60,9 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
 LLVMValueRef
 lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
                         const struct util_format_description *format_desc,
-                        LLVMValueRef ptr,
+                        struct lp_type type,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j);
 
@@ -70,17 +73,22 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
 
 void
 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
-                            struct lp_type type,
-                            const LLVMValueRef *unswizzled,
-                            LLVMValueRef *swizzled);
+                            struct lp_build_context *bld,
+                            const LLVMValueRef unswizzled[4],
+                            LLVMValueRef swizzled_out[4]);
 
 void
 lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
                          const struct util_format_description *format_desc,
                          struct lp_type type,
                          LLVMValueRef packed,
-                         LLVMValueRef *rgba);
+                         LLVMValueRef rgba_out[4]);
 
+void
+lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
+                          struct lp_type dst_type,
+                          LLVMValueRef packed,
+                          LLVMValueRef *rgba);
 
 void
 lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
@@ -90,7 +98,20 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
                         LLVMValueRef offsets,
                         LLVMValueRef i,
                         LLVMValueRef j,
-                        LLVMValueRef *rgba);
+                        LLVMValueRef rgba_out[4]);
+
+/*
+ * YUV
+ */
 
 
+LLVMValueRef
+lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder,
+                                   const struct util_format_description *format_desc,
+                                   unsigned n,
+                                   LLVMValueRef base_ptr,
+                                   LLVMValueRef offset,
+                                   LLVMValueRef i,
+                                   LLVMValueRef j);
+
 #endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 6257e9a4047..6b9189e1da5 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -38,33 +38,124 @@
 #include "util/u_math.h"
 #include "util/u_string.h"
 
+#include "lp_bld_arit.h"
 #include "lp_bld_init.h"
 #include "lp_bld_type.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_gather.h"
+#include "lp_bld_debug.h"
 #include "lp_bld_format.h"
 
 
 /**
+ * Basic swizzling.  Rearrange the channels of the unswizzled AoS vector
+ * according to the format description and return the swizzled vector.
+ * UTIL_FORMAT_SWIZZLE_0/1 selectors are forwarded to lp_build_swizzle_aos().
+ * Ex: if unswizzled = {B, G, R, x}, then the result may be {R, G, B, 1}.
+ */
+LLVMValueRef
+lp_build_format_swizzle_aos(const struct util_format_description *desc,
+                            struct lp_build_context *bld,
+                            LLVMValueRef unswizzled)
+{
+   unsigned char swizzles[4];   /* one selector per output channel */
+   unsigned chan;
+
+   assert(bld->type.length % 4 == 0);   /* whole 4-channel pixels only */
+
+   for (chan = 0; chan < 4; ++chan) {
+      enum util_format_swizzle swizzle;
+
+      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+         /*
+          * For ZS formats do RGBA = ZZZ1: the first three channels all use
+          * desc->swizzle[0] (or 0 when it is NONE) and the fourth is 1.
+          */
+         if (chan == 3) {
+            swizzle = UTIL_FORMAT_SWIZZLE_1;
+         } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) {
+            swizzle = UTIL_FORMAT_SWIZZLE_0;
+         } else {
+            swizzle = desc->swizzle[0];
+         }
+      } else {
+         swizzle = desc->swizzle[chan];
+      }
+      swizzles[chan] = swizzle;
+   }
+
+   return lp_build_swizzle_aos(bld, unswizzled, swizzles);
+}
+
+
+/**
+ * Whether the format matches the vector type, apart from swizzles.
+ */
+static INLINE boolean
+format_matches_type(const struct util_format_description *desc,
+                    struct lp_type type)
+{
+   enum util_format_type chan_type;
+   unsigned chan;
+
+   assert(type.length % 4 == 0);
+   /* Only plain RGB formats with a 1x1 block are candidates. */
+   if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+       desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
+       desc->block.width != 1 ||
+       desc->block.height != 1) {
+      return FALSE;
+   }
+   /* Channel type that a matching format must use for this vector type. */
+   if (type.floating) {
+      chan_type = UTIL_FORMAT_TYPE_FLOAT;
+   } else if (type.fixed) {
+      chan_type = UTIL_FORMAT_TYPE_FIXED;
+   } else if (type.sign) {
+      chan_type = UTIL_FORMAT_TYPE_SIGNED;
+   } else {
+      chan_type = UTIL_FORMAT_TYPE_UNSIGNED;
+   }
+
+   for (chan = 0; chan < desc->nr_channels; ++chan) {
+      if (desc->channel[chan].size != type.width) {
+         return FALSE;
+      }
+      /* Void channels are exempt from the type/normalization check. */
+      if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) {
+         if (desc->channel[chan].type != chan_type ||
+             desc->channel[chan].normalized != type.norm) {
+            return FALSE;
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+
+/**
  * Unpack a single pixel into its RGBA components.
  *
- * @param packed integer.
+ * @param desc  the pixel format for the packed pixel value
+ * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM
  *
- * @return RGBA in a 4 floats vector.
+ * @return RGBA in a float[4] or ubyte[4] or ushort[4] vector.
  */
-LLVMValueRef
-lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
-                         const struct util_format_description *desc,
-                         LLVMValueRef packed)
+static INLINE LLVMValueRef
+lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder,
+                               const struct util_format_description *desc,
+                               LLVMValueRef packed)
 {
    LLVMValueRef shifted, casted, scaled, masked;
    LLVMValueRef shifts[4];
    LLVMValueRef masks[4];
    LLVMValueRef scales[4];
-   LLVMValueRef swizzles[4];
-   LLVMValueRef aux[4];
-   bool normalized;
-   int empty_channel;
-   bool needs_uitofp;
+
+   boolean normalized;
+   boolean needs_uitofp;
    unsigned shift;
    unsigned i;
 
@@ -76,10 +167,12 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
 
    /* Do the intermediate integer computations with 32bit integers since it
     * matches floating point size */
-   if (desc->block.bits < 32)
-      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
+   assert (LLVMTypeOf(packed) == LLVMInt32Type());
 
-   /* Broadcast the packed value to all four channels */
+   /* Broadcast the packed value to all four channels
+    * before: packed = BGRA
+    * after: packed = {BGRA, BGRA, BGRA, BGRA}
+    */
    packed = LLVMBuildInsertElement(builder,
                                    LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
                                    packed,
@@ -94,8 +187,9 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
    /* Initialize vector constants */
    normalized = FALSE;
    needs_uitofp = FALSE;
-   empty_channel = -1;
    shift = 0;
+
+   /* Loop over 4 color components */
    for (i = 0; i < 4; ++i) {
       unsigned bits = desc->channel[i].size;
 
@@ -103,7 +197,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
          shifts[i] = LLVMGetUndef(LLVMInt32Type());
          masks[i] = LLVMConstNull(LLVMInt32Type());
          scales[i] =  LLVMConstNull(LLVMFloatType());
-         empty_channel = i;
       }
       else {
          unsigned long long mask = (1ULL << bits) - 1;
@@ -128,8 +221,13 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
       shift += bits;
    }
 
+   /* Ex: convert packed = {BGRA, BGRA, BGRA, BGRA}
+    * into masked = {B, G, R, A}
+    */
    shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
    masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+
+
    if (!needs_uitofp) {
       /* UIToFP can't be expressed in SSE2 */
       casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
@@ -137,55 +235,17 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
       casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
    }
 
+   /* At this point 'casted' may be a vector of floats such as
+    * {255.0, 255.0, 255.0, 255.0}.  Next, if the pixel values are normalized
+    * we'll scale this to {1.0, 1.0, 1.0, 1.0}.
+    */
+
    if (normalized)
-      scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), "");
+      scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
    else
       scaled = casted;
 
-   for (i = 0; i < 4; ++i)
-      aux[i] = LLVMGetUndef(LLVMFloatType());
-
-   for (i = 0; i < 4; ++i) {
-      enum util_format_swizzle swizzle;
-
-      if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
-         /*
-          * For ZS formats do RGBA = ZZZ1
-          */
-         if (i == 3) {
-            swizzle = UTIL_FORMAT_SWIZZLE_1;
-         } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) {
-            swizzle = UTIL_FORMAT_SWIZZLE_0;
-         } else {
-            swizzle = desc->swizzle[0];
-         }
-      } else {
-         swizzle = desc->swizzle[i];
-      }
-
-      switch (swizzle) {
-      case UTIL_FORMAT_SWIZZLE_X:
-      case UTIL_FORMAT_SWIZZLE_Y:
-      case UTIL_FORMAT_SWIZZLE_Z:
-      case UTIL_FORMAT_SWIZZLE_W:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_0:
-         assert(empty_channel >= 0);
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_1:
-         swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0);
-         aux[0] = LLVMConstReal(LLVMFloatType(), 1.0);
-         break;
-      case UTIL_FORMAT_SWIZZLE_NONE:
-         swizzles[i] = LLVMGetUndef(LLVMFloatType());
-         assert(0);
-         break;
-      }
-   }
-
-   return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4), LLVMConstVector(swizzles, 4), "");
+   return scaled;
 }
 
 
@@ -208,7 +268,7 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
    LLVMValueRef shifted, casted, scaled, unswizzled;
    LLVMValueRef shifts[4];
    LLVMValueRef scales[4];
-   bool normalized;
+   boolean normalized;
    unsigned shift;
    unsigned i, j;
 
@@ -263,7 +323,7 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
    }
 
    if (normalized)
-      scaled = LLVMBuildMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
+      scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
    else
       scaled = unswizzled;
 
@@ -292,44 +352,152 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
 }
 
 
+
+
 /**
  * Fetch a pixel into a 4 float AoS.
  *
- * i and j are the sub-block pixel coordinates.
+ * \param format_desc  describes format of the image we're fetching from
+ * \param base_ptr, offset  locate the pixel block (or the texel if uncompressed)
+ * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
+ *              these will always be (0, 0).
+ * \return  a vector of \p type holding RGBA values for type.length/4 pixels.
  */
 LLVMValueRef
 lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
                         const struct util_format_description *format_desc,
-                        LLVMValueRef ptr,
+                        struct lp_type type,
+                        LLVMValueRef base_ptr,
+                        LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j)
 {
+   unsigned num_pixels = type.length / 4;
+   struct lp_build_context bld;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(type.length % 4 == 0);
+
+   lp_build_context_init(&bld, builder, type);
+
+   /*
+    * Trivial case
+    *
+    * The format matches the type (apart from a swizzle) so no need for
+    * scaling or converting.
+    */
+
+   if (format_matches_type(format_desc, type) &&
+       format_desc->block.bits <= type.width * 4 &&
+       util_is_power_of_two(format_desc->block.bits)) {
+      LLVMValueRef packed;
+
+      /*
+       * The format matches the type (apart from a swizzle) so no need for
+       * scaling or converting.
+       */
+
+      packed = lp_build_gather(builder, type.length/4,
+                               format_desc->block.bits, type.width*4,
+                               base_ptr, offset);
+
+      assert(format_desc->block.bits <= type.width * type.length);
+
+      packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(type), "");
+
+      return lp_build_format_swizzle_aos(format_desc, &bld, packed);
+   }
+
+   /*
+    * Bit arithmetic
+    */
 
    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
        (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
         format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
        format_desc->block.width == 1 &&
        format_desc->block.height == 1 &&
-       util_is_pot(format_desc->block.bits) &&
+       util_is_power_of_two(format_desc->block.bits) &&
        format_desc->block.bits <= 32 &&
        format_desc->is_bitmask &&
        !format_desc->is_mixed &&
        (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED ||
-        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED))
-   {
-      LLVMValueRef packed;
+        format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) {
+
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res;
+      unsigned k;
+
+      /*
+       * Unpack a pixel at a time into a <4 x float> RGBA vector
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef packed;
+
+         packed = lp_build_gather_elem(builder, num_pixels,
+                                       format_desc->block.bits, 32,
+                                       base_ptr, offset, k);
+
+         tmps[k] = lp_build_unpack_arith_rgba_aos(builder, format_desc,
+                                                  packed);
+      }
+
+      /*
+       * Type conversion.
+       *
+       * TODO: We could avoid floating conversion for integer to
+       * integer conversions.
+       */
 
-      ptr = LLVMBuildBitCast(builder, ptr,
-                             LLVMPointerType(LLVMIntType(format_desc->block.bits), 0) ,
-                             "");
+      if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) {
+         debug_printf("%s: unpacking %s with floating point\n",
+                      __FUNCTION__, format_desc->short_name);
+      }
 
-      packed = LLVMBuildLoad(builder, ptr, "packed");
+      lp_build_conv(builder,
+                    lp_float32_vec4_type(),
+                    type,
+                    tmps, num_pixels, &res, 1);
 
-      return lp_build_unpack_rgba_aos(builder, format_desc, packed);
+      return lp_build_format_swizzle_aos(format_desc, &bld, res);
    }
-   else if (format_desc->fetch_rgba_float) {
+
+   /*
+    * YUV / subsampled formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_subsampled_rgba_aos(builder,
+                                               format_desc,
+                                               num_pixels,
+                                               base_ptr,
+                                               offset,
+                                               i, j);
+
+      lp_build_conv(builder,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+      return tmp;
+   }
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_8unorm().
+    */
+
+   if (format_desc->fetch_rgba_8unorm &&
+       !type.floating && type.width == 8 && !type.sign && type.norm) {
       /*
-       * Fallback to calling util_format_description::fetch_rgba_float.
+       * Fallback to calling util_format_description::fetch_rgba_8unorm.
        *
        * This is definitely not the most efficient way of fetching pixels, as
        * we miss the opportunity to do vectorization, but this it is a
@@ -339,13 +507,125 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
 
       LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
       char name[256];
+      LLVMTypeRef i8t = LLVMInt8Type();
+      LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
+      LLVMTypeRef i32t = LLVMInt32Type();
       LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
       LLVMValueRef tmp;
-      LLVMValueRef args[4];
+      LLVMValueRef res;
+      unsigned k;
+
+      util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm",
+                    format_desc->short_name);
+
+      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+         debug_printf("%s: falling back to %s\n", __FUNCTION__, name);
+      }
+
+      /*
+       * Declare and bind format_desc->fetch_rgba_8unorm().
+       */
+
+      function = LLVMGetNamedFunction(module, name);
+      if (!function) {
+         LLVMTypeRef ret_type;
+         LLVMTypeRef arg_types[4];
+         LLVMTypeRef function_type;
+
+         ret_type = LLVMVoidType();
+         arg_types[0] = pi8t;
+         arg_types[1] = pi8t;
+         arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8);
+         function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
+         function = LLVMAddFunction(module, name, function_type);
+
+         LLVMSetFunctionCallConv(function, LLVMCCallConv);
+         LLVMSetLinkage(function, LLVMExternalLinkage);
+
+         assert(LLVMIsDeclaration(function));
+
+         LLVMAddGlobalMapping(lp_build_engine, function,
+                              func_to_pointer((func_pointer)format_desc->fetch_rgba_8unorm));
+      }
+
+      tmp_ptr = lp_build_alloca(builder, i32t, "");
+
+      res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels));
+
+      /*
+       * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the
+       * packed 32-bit result into the AoS result vector.
+       */
+
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
+         LLVMValueRef args[4];
+
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
+         args[1] = lp_build_gather_elem_ptr(builder, num_pixels,
+                                            base_ptr, offset, k);
+
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, Elements(args), "");
+
+         tmp = LLVMBuildLoad(builder, tmp_ptr, "");
+
+         if (num_pixels == 1) {
+            res = tmp;
+         }
+         else {
+            res = LLVMBuildInsertElement(builder, res, tmp, index, "");
+         }
+      }
+
+      /* Bitcast from <n x i32> to <4n x i8> */
+      res = LLVMBuildBitCast(builder, res, bld.vec_type, "");
+
+      return res;
+   }
+
+
+   /*
+    * Fallback to util_format_description::fetch_rgba_float().
+    */
+
+   if (format_desc->fetch_rgba_float) {
+      /*
+       * Fallback to calling util_format_description::fetch_rgba_float.
+       *
+       * This is definitely not the most efficient way of fetching pixels, as
+       * we miss the opportunity to do vectorization, but it is a convenient
+       * fallback for formats or scenarios for which there was no opportunity
+       * or incentive to optimize.
+       */
+
+      LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder)));
+      char name[256];
+      LLVMTypeRef f32t = LLVMFloatType();
+      LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4);
+      LLVMTypeRef pf32t = LLVMPointerType(f32t, 0);
+      LLVMValueRef function;
+      LLVMValueRef tmp_ptr;
+      LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4];
+      LLVMValueRef res;
+      unsigned k;
 
       util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float",
                     format_desc->short_name);
 
+      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+         debug_printf("%s: falling back to %s\n", __FUNCTION__, name);
+      }
+
       /*
        * Declare and bind format_desc->fetch_rgba_float().
        */
@@ -357,7 +637,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
          LLVMTypeRef function_type;
 
          ret_type = LLVMVoidType();
-         arg_types[0] = LLVMPointerType(LLVMFloatType(), 0);
+         arg_types[0] = pf32t;
          arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0);
          arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8);
          function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0);
@@ -368,28 +648,47 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder,
 
          assert(LLVMIsDeclaration(function));
 
-         LLVMAddGlobalMapping(lp_build_engine, function, format_desc->fetch_rgba_float);
+         LLVMAddGlobalMapping(lp_build_engine, function,
+                              func_to_pointer((func_pointer)format_desc->fetch_rgba_float));
       }
 
-      tmp = lp_build_alloca(builder, LLVMVectorType(LLVMFloatType(), 4), "");
+      tmp_ptr = lp_build_alloca(builder, f32x4t, "");
 
       /*
        * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result
        * in the SoA vectors.
        */
 
-      args[0] = LLVMBuildBitCast(builder, tmp,
-                                 LLVMPointerType(LLVMFloatType(), 0), "");
-      args[1] = ptr;
-      args[2] = i;
-      args[3] = j;
+      for (k = 0; k < num_pixels; ++k) {
+         LLVMValueRef args[4];
 
-      LLVMBuildCall(builder, function, args, 4, "");
+         args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, "");
+         args[1] = lp_build_gather_elem_ptr(builder, num_pixels,
+                                            base_ptr, offset, k);
 
-      return LLVMBuildLoad(builder, tmp, "");
-   }
-   else {
-      assert(0);
-      return LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4));
+         if (num_pixels == 1) {
+            args[2] = i;
+            args[3] = j;
+         }
+         else {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
+            args[2] = LLVMBuildExtractElement(builder, i, index, "");
+            args[3] = LLVMBuildExtractElement(builder, j, index, "");
+         }
+
+         LLVMBuildCall(builder, function, args, Elements(args), "");
+
+         tmps[k] = LLVMBuildLoad(builder, tmp_ptr, "");
+      }
+
+      lp_build_conv(builder,
+                    lp_float32_vec4_type(),
+                    type,
+                    tmps, num_pixels, &res, 1);
+
+      return res;
    }
+
+   assert(0);
+   return lp_build_undef(type);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 26b947b3b1c..ce7e54afc76 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include "pipe/p_defines.h"
+
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_string.h"
@@ -33,51 +35,39 @@
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_conv.h"
-#include "lp_bld_sample.h" /* for lp_build_gather */
+#include "lp_bld_swizzle.h"
+#include "lp_bld_gather.h"
+#include "lp_bld_debug.h"
 #include "lp_bld_format.h"
 
 
-static LLVMValueRef
-lp_build_format_swizzle_chan_soa(struct lp_type type,
-                                 const LLVMValueRef *unswizzled,
-                                 enum util_format_swizzle swizzle)
-{
-   switch (swizzle) {
-   case UTIL_FORMAT_SWIZZLE_X:
-   case UTIL_FORMAT_SWIZZLE_Y:
-   case UTIL_FORMAT_SWIZZLE_Z:
-   case UTIL_FORMAT_SWIZZLE_W:
-      return unswizzled[swizzle];
-   case UTIL_FORMAT_SWIZZLE_0:
-      return lp_build_zero(type);
-   case UTIL_FORMAT_SWIZZLE_1:
-      return lp_build_one(type);
-   case UTIL_FORMAT_SWIZZLE_NONE:
-      return lp_build_undef(type);
-   default:
-      assert(0);
-      return lp_build_undef(type);
-   }
-}
-
-
 void
 lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
-                            struct lp_type type,
+                            struct lp_build_context *bld,
                             const LLVMValueRef *unswizzled,
-                            LLVMValueRef *swizzled)
+                            LLVMValueRef swizzled_out[4])
 {
-   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+   assert(UTIL_FORMAT_SWIZZLE_0 == PIPE_SWIZZLE_ZERO);
+   assert(UTIL_FORMAT_SWIZZLE_1 == PIPE_SWIZZLE_ONE);
+
+   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      /*
+       * Return zzz1 for depth-stencil formats.
+       *
+       * XXX: Allow the caller to control the depth swizzle with an additional
+       * parameter, as it may want a different depth swizzle or to retain the
+       * stencil value.
+       */
       enum util_format_swizzle swizzle = format_desc->swizzle[0];
-      LLVMValueRef depth = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle);
-      swizzled[2] = swizzled[1] = swizzled[0] = depth;
-      swizzled[3] = lp_build_one(type);
+      LLVMValueRef depth = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
+      swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth;
+      swizzled_out[3] = bld->one;
    }
    else {
       unsigned chan;
       for (chan = 0; chan < 4; ++chan) {
          enum util_format_swizzle swizzle = format_desc->swizzle[chan];
-         swizzled[chan] = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle);
+         swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle);
       }
    }
 }
@@ -100,14 +90,20 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
  * It requires that a packed pixel fits into an element of the output
  * channels. The common case is when converting pixel with a depth of 32 bit or
  * less into floats.
+ *
+ * \param format_desc  the format of the 'packed' incoming pixel vector
+ * \param type  the desired type for rgba_out (type.length = n, above)
+ * \param packed  the incoming vector of packed pixels
+ * \param rgba_out  returns the SoA R,G,B,A vectors
  */
 void
 lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
                          const struct util_format_description *format_desc,
                          struct lp_type type,
                          LLVMValueRef packed,
-                         LLVMValueRef *rgba)
+                         LLVMValueRef rgba_out[4])
 {
+   struct lp_build_context bld;
    LLVMValueRef inputs[4];
    unsigned start;
    unsigned chan;
@@ -120,11 +116,13 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
    assert(type.floating);
    assert(type.width == 32);
 
+   lp_build_context_init(&bld, builder, type);
+
    /* Decode the input vector components */
    start = 0;
    for (chan = 0; chan < format_desc->nr_channels; ++chan) {
-      unsigned width = format_desc->channel[chan].size;
-      unsigned stop = start + width;
+      const unsigned width = format_desc->channel[chan].size;
+      const unsigned stop = start + width;
       LLVMValueRef input;
 
       input = packed;
@@ -200,7 +198,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
             if (format_desc->channel[chan].normalized) {
                double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
                LLVMValueRef scale_val = lp_build_const_vec(type, scale);
-               input = LLVMBuildMul(builder, input, scale_val, "");
+               input = LLVMBuildFMul(builder, input, scale_val, "");
             }
          }
          else {
@@ -230,7 +228,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
             double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
             LLVMValueRef scale_val = lp_build_const_vec(type, scale);
             input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
-            input = LLVMBuildMul(builder, input, scale_val, "");
+            input = LLVMBuildFMul(builder, input, scale_val, "");
          }
          else {
             /* FIXME */
@@ -250,14 +248,59 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
       start = stop;
    }
 
-   lp_build_format_swizzle_soa(format_desc, type, inputs, rgba);
+   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
+}
+
+
+void
+lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
+                          struct lp_type dst_type,
+                          LLVMValueRef packed,
+                          LLVMValueRef *rgba)
+{
+   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
+   unsigned chan;
+
+   packed = LLVMBuildBitCast(builder, packed,
+                             lp_build_int_vec_type(dst_type), "");
+
+   /* Decode the input vector components */
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned start = chan*8;
+      unsigned stop = start + 8;
+      LLVMValueRef input;
+
+      input = packed;
+
+      if (start)
+         input = LLVMBuildLShr(builder, input,
+                               lp_build_const_int_vec(dst_type, start), "");
+
+      if (stop < 32)
+         input = LLVMBuildAnd(builder, input, mask, "");
+
+      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
+
+      rgba[chan] = input;
+   }
 }
 
 
+
 /**
- * Fetch a pixel into a SoA.
+ * Fetch texels from a texture, returning them in SoA layout.
  *
- * i and j are the sub-block pixel coordinates.
+ * \param type  the desired return type for 'rgba'.  The vector length
+ *              is the number of texels to fetch
+ *
+ * \param base_ptr  points to start of the texture image block.  For non-
+ *                  compressed formats, this simply points to the texel.
+ *                  For compressed formats, it points to the start of the
+ *                  compressed data block.
+ *
+ * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
+ *              these will always be (0,0).  For compressed formats, i will
+ *              be in [0, block_width-1] and j will be in [0, block_height-1].
  */
 void
 lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
@@ -267,7 +310,7 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
                         LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j,
-                        LLVMValueRef *rgba)
+                        LLVMValueRef rgba_out[4])
 {
 
    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
@@ -281,7 +324,7 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
    {
       /*
        * The packed pixel fits into an element of the destination format. Put
-       * the packed pixels into a vector and estract each component for all
+       * the packed pixels into a vector and extract each component for all
        * vector elements in parallel.
        */
 
@@ -289,6 +332,7 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
 
       /*
        * gather the texels from the texture
+       * Ex: packed = {BGRA, BGRA, BGRA, BGRA}.
        */
       packed = lp_build_gather(builder,
                                type.length,
@@ -302,49 +346,86 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder,
       lp_build_unpack_rgba_soa(builder,
                                format_desc,
                                type,
-                               packed, rgba);
+                               packed, rgba_out);
+      return;
    }
-   else {
-      /*
-       * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
-       *
-       * This is not the most efficient way of fetching pixels, as
-       * we miss some opportunities to do vectorization, but this it is a
-       * convenient for formats or scenarios for which there was no opportunity
-       * or incentive to optimize.
-       */
 
+   /*
+    * Try calling lp_build_fetch_rgba_aos for all pixels.
+    */
+
+   if (util_format_fits_8unorm(format_desc) &&
+       type.floating && type.width == 32 && type.length == 4) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = type.length * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
+                                    base_ptr, offset, i, j);
+
+      lp_build_rgba8_to_f32_soa(builder,
+                                type,
+                                tmp,
+                                rgba_out);
+
+      return;
+   }
+
+   /*
+    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
+    *
+    * This is not the most efficient way of fetching pixels, as we
+    * miss some opportunities to do vectorization, but this is
+    * convenient for formats or scenarios for which there was no
+    * opportunity or incentive to optimize.
+    */
+
+   {
       unsigned k, chan;
+      struct lp_type tmp_type;
 
-      assert(type.floating);
+      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+         debug_printf("%s: scalar unpacking of %s\n",
+                      __FUNCTION__, format_desc->short_name);
+      }
+
+      tmp_type = type;
+      tmp_type.length = 4;
 
       for (chan = 0; chan < 4; ++chan) {
-         rgba[chan] = lp_build_undef(type);
+         rgba_out[chan] = lp_build_undef(type);
       }
 
+      /* loop over number of pixels */
       for(k = 0; k < type.length; ++k) {
          LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0);
          LLVMValueRef offset_elem;
-         LLVMValueRef ptr;
          LLVMValueRef i_elem, j_elem;
          LLVMValueRef tmp;
 
          offset_elem = LLVMBuildExtractElement(builder, offset, index, "");
-         ptr = LLVMBuildGEP(builder, base_ptr, &offset_elem, 1, "");
 
          i_elem = LLVMBuildExtractElement(builder, i, index, "");
          j_elem = LLVMBuildExtractElement(builder, j, index, "");
 
-         tmp = lp_build_fetch_rgba_aos(builder, format_desc, ptr, i_elem, j_elem);
+         /* Get a single float[4]={R,G,B,A} pixel */
+         tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type,
+                                       base_ptr, offset_elem,
+                                       i_elem, j_elem);
 
          /*
-          * AoS to SoA
+          * Insert the AoS tmp value channels into the SoA result vectors at
+          * position = 'index'.
           */
-
          for (chan = 0; chan < 4; ++chan) {
             LLVMValueRef chan_val = LLVMConstInt(LLVMInt32Type(), chan, 0),
             tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, "");
-            rgba[chan] = LLVMBuildInsertElement(builder, rgba[chan], tmp_chan, index, "");
+            rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
+                                                    tmp_chan, index, "");
          }
       }
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
new file mode 100644
index 00000000000..2bce2895551
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c
@@ -0,0 +1,445 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * YUV pixel format manipulation.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_arit.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_gather.h"
+#include "lp_bld_format.h"
+#include "lp_bld_logic.h"
+
+/**
+ * Extract Y, U, V channels from packed UYVY.
+ * @param packed  is a <n x i32> vector with the packed UYVY blocks
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
+ */
+static void
+uyvy_to_yuv_soa(LLVMBuilderRef builder,
+                unsigned n,
+                LLVMValueRef packed,
+                LLVMValueRef i,
+                LLVMValueRef *y,
+                LLVMValueRef *u,
+                LLVMValueRef *v)
+{
+   struct lp_type type;
+   LLVMValueRef mask;
+
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   assert(lp_check_value(type, packed));
+   assert(lp_check_value(type, i));
+
+   /*
+    * y = (uyvy >> (16*i + 8)) & 0xff
+    * u = (uyvy        ) & 0xff
+    * v = (uyvy >> 16  ) & 0xff
+    */
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   /*
+    * Avoid shift with per-element count.
+    * No support on x86, gets translated to roughly 5 instructions
+    * per element. Didn't measure performance but cuts shader size
+    * by quite a bit (less difference if cpu has no sse4.1 support).
+    */
+   if (util_cpu_caps.has_sse2 && n == 4) {
+      LLVMValueRef sel, tmp, tmp2;
+      struct lp_build_context bld32;
+
+      lp_build_context_init(&bld32, builder, type);
+
+      tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), "");
+      tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(type, 16), "");
+      sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0));
+      *y = lp_build_select(&bld32, sel, tmp, tmp2);
+   } else
+#endif
+   {
+      LLVMValueRef shift;
+      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
+      shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), "");
+      *y = LLVMBuildLShr(builder, packed, shift, "");
+   }
+
+   *u = packed;
+   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");
+
+   mask = lp_build_const_int_vec(type, 0xff);
+
+   *y = LLVMBuildAnd(builder, *y, mask, "y");
+   *u = LLVMBuildAnd(builder, *u, mask, "u");
+   *v = LLVMBuildAnd(builder, *v, mask, "v");
+}
+
+
+/**
+ * Extract Y, U, V channels from packed YUYV.
+ * @param packed  is a <n x i32> vector with the packed YUYV blocks
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
+ */
+static void
+yuyv_to_yuv_soa(LLVMBuilderRef builder,
+                unsigned n,
+                LLVMValueRef packed,
+                LLVMValueRef i,
+                LLVMValueRef *y,
+                LLVMValueRef *u,
+                LLVMValueRef *v)
+{
+   struct lp_type type;
+   LLVMValueRef mask;
+
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   assert(lp_check_value(type, packed));
+   assert(lp_check_value(type, i));
+
+   /*
+    * y = (yuyv >> 16*i) & 0xff
+    * u = (yuyv >> 8   ) & 0xff
+    * v = (yuyv >> 24  ) & 0xff
+    */
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   /*
+    * Avoid shift with per-element count.
+    * No support on x86, gets translated to roughly 5 instructions
+    * per element. Didn't measure performance but cuts shader size
+    * by quite a bit (less difference if cpu has no sse4.1 support).
+    */
+   if (util_cpu_caps.has_sse2 && n == 4) {
+      LLVMValueRef sel, tmp;
+      struct lp_build_context bld32;
+
+      lp_build_context_init(&bld32, builder, type);
+
+      tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");
+      sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0));
+       *y = lp_build_select(&bld32, sel, packed, tmp);
+   } else
+#endif
+   {
+      LLVMValueRef shift;
+      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
+      *y = LLVMBuildLShr(builder, packed, shift, "");
+   }
+
+   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), "");
+   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 24), "");
+
+   mask = lp_build_const_int_vec(type, 0xff);
+
+   *y = LLVMBuildAnd(builder, *y, mask, "y");
+   *u = LLVMBuildAnd(builder, *u, mask, "u");
+   *v = LLVMBuildAnd(builder, *v, mask, "v");
+}
+
+
+static INLINE void
+yuv_to_rgb_soa(LLVMBuilderRef builder,
+               unsigned n,
+               LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
+               LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b)
+{
+   struct lp_type type;
+   struct lp_build_context bld;
+
+   LLVMValueRef c0;
+   LLVMValueRef c8;
+   LLVMValueRef c16;
+   LLVMValueRef c128;
+   LLVMValueRef c255;
+
+   LLVMValueRef cy;
+   LLVMValueRef cug;
+   LLVMValueRef cub;
+   LLVMValueRef cvr;
+   LLVMValueRef cvg;
+
+   memset(&type, 0, sizeof type);
+   type.sign = TRUE;
+   type.width = 32;
+   type.length = n;
+
+   lp_build_context_init(&bld, builder, type);
+
+   assert(lp_check_value(type, y));
+   assert(lp_check_value(type, u));
+   assert(lp_check_value(type, v));
+
+   /*
+    * Constants
+    */
+
+   c0   = lp_build_const_int_vec(type,   0);
+   c8   = lp_build_const_int_vec(type,   8);
+   c16  = lp_build_const_int_vec(type,  16);
+   c128 = lp_build_const_int_vec(type, 128);
+   c255 = lp_build_const_int_vec(type, 255);
+
+   cy  = lp_build_const_int_vec(type,  298);
+   cug = lp_build_const_int_vec(type, -100);
+   cub = lp_build_const_int_vec(type,  516);
+   cvr = lp_build_const_int_vec(type,  409);
+   cvg = lp_build_const_int_vec(type, -208);
+
+   /*
+    *  y -= 16;
+    *  u -= 128;
+    *  v -= 128;
+    */
+
+   y = LLVMBuildSub(builder, y, c16, "");
+   u = LLVMBuildSub(builder, u, c128, "");
+   v = LLVMBuildSub(builder, v, c128, "");
+
+   /*
+    * r = 298 * _y            + 409 * _v + 128;
+    * g = 298 * _y - 100 * _u - 208 * _v + 128;
+    * b = 298 * _y + 516 * _u            + 128;
+    */
+
+   y = LLVMBuildMul(builder, y, cy, "");
+   y = LLVMBuildAdd(builder, y, c128, "");
+
+   *r = LLVMBuildMul(builder, v, cvr, "");
+   *g = LLVMBuildAdd(builder,
+                     LLVMBuildMul(builder, u, cug, ""),
+                     LLVMBuildMul(builder, v, cvg, ""),
+                     "");
+   *b = LLVMBuildMul(builder, u, cub, "");
+
+   *r = LLVMBuildAdd(builder, *r, y, "");
+   *g = LLVMBuildAdd(builder, *g, y, "");
+   *b = LLVMBuildAdd(builder, *b, y, "");
+
+   /*
+    * r >>= 8;
+    * g >>= 8;
+    * b >>= 8;
+    */
+
+   *r = LLVMBuildAShr(builder, *r, c8, "r");
+   *g = LLVMBuildAShr(builder, *g, c8, "g");
+   *b = LLVMBuildAShr(builder, *b, c8, "b");
+
+   /*
+    * Clamp
+    */
+
+   *r = lp_build_clamp(&bld, *r, c0, c255);
+   *g = lp_build_clamp(&bld, *g, c0, c255);
+   *b = lp_build_clamp(&bld, *b, c0, c255);
+}
+
+
+static LLVMValueRef
+rgb_to_rgba_aos(LLVMBuilderRef builder,
+                unsigned n,
+                LLVMValueRef r, LLVMValueRef g, LLVMValueRef b)
+{
+   struct lp_type type;
+   LLVMValueRef a;
+   LLVMValueRef rgba;
+
+   memset(&type, 0, sizeof type);
+   type.sign = TRUE;
+   type.width = 32;
+   type.length = n;
+
+   assert(lp_check_value(type, r));
+   assert(lp_check_value(type, g));
+   assert(lp_check_value(type, b));
+
+   /*
+    * Make a 4 x unorm8 vector
+    */
+
+   r = r;
+   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(type, 8), "");
+   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(type, 16), "");
+   a = lp_build_const_int_vec(type, 0xff000000);
+
+   rgba = r;
+   rgba = LLVMBuildOr(builder, rgba, g, "");
+   rgba = LLVMBuildOr(builder, rgba, b, "");
+   rgba = LLVMBuildOr(builder, rgba, a, "");
+
+   rgba = LLVMBuildBitCast(builder, rgba,
+                           LLVMVectorType(LLVMInt8Type(), 4*n), "");
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed UYVY to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+uyvy_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef y, u, v;
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   uyvy_to_yuv_soa(builder, n, packed, i, &y, &u, &v);
+   yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed YUYV to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+yuyv_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef y, u, v;
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   yuyv_to_yuv_soa(builder, n, packed, i, &y, &u, &v);
+   yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed RG_BG to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+rgbg_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   uyvy_to_yuv_soa(builder, n, packed, i, &g, &r, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * Convert from <n x i32> packed GR_GB to <4n x i8> RGBA AoS
+ */
+static LLVMValueRef
+grgb_to_rgba_aos(LLVMBuilderRef builder,
+                 unsigned n,
+                 LLVMValueRef packed,
+                 LLVMValueRef i)
+{
+   LLVMValueRef r, g, b;
+   LLVMValueRef rgba;
+
+   yuyv_to_yuv_soa(builder, n, packed, i, &g, &r, &b);
+   rgba = rgb_to_rgba_aos(builder, n, r, g, b);
+
+   return rgba;
+}
+
+
+/**
+ * @param n  is the number of pixels processed
+ * @param base_ptr, offset  locate the packed 32-bit block of each pixel
+ * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
+ * @return  a <4*n x i8> vector with the pixel RGBA values in AoS
+ */
+LLVMValueRef
+lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder,
+                                   const struct util_format_description *format_desc,
+                                   unsigned n,
+                                   LLVMValueRef base_ptr,
+                                   LLVMValueRef offset,
+                                   LLVMValueRef i,
+                                   LLVMValueRef j)
+{
+   LLVMValueRef packed;
+   LLVMValueRef rgba;
+
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED);
+   assert(format_desc->block.bits == 32);
+   assert(format_desc->block.width == 2);
+   assert(format_desc->block.height == 1);
+
+   packed = lp_build_gather(builder, n, 32, 32, base_ptr, offset);
+
+   (void)j;
+
+   switch (format_desc->format) {
+   case PIPE_FORMAT_UYVY:
+      rgba = uyvy_to_rgba_aos(builder, n, packed, i);
+      break;
+   case PIPE_FORMAT_YUYV:
+      rgba = yuyv_to_rgba_aos(builder, n, packed, i);
+      break;
+   case PIPE_FORMAT_R8G8_B8G8_UNORM:
+      rgba = rgbg_to_rgba_aos(builder, n, packed, i);
+      break;
+   case PIPE_FORMAT_G8R8_G8B8_UNORM:
+      rgba = grgb_to_rgba_aos(builder, n, packed, i);
+      break;
+   default:
+      assert(0);
+      rgba =  LLVMGetUndef(LLVMVectorType(LLVMInt8Type(), 4*n));
+      break;
+   }
+
+   return rgba;
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
new file mode 100644
index 00000000000..d60472e0656
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c
@@ -0,0 +1,148 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "util/u_debug.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_const.h"
+#include "lp_bld_format.h"
+#include "lp_bld_gather.h"
+
+
+/**
+ * Get the pointer to one element from scattered positions in memory.
+ *
+ * @sa lp_build_gather()
+ */
+LLVMValueRef
+lp_build_gather_elem_ptr(LLVMBuilderRef builder,
+                         unsigned length,
+                         LLVMValueRef base_ptr,
+                         LLVMValueRef offsets,
+                         unsigned i)
+{
+   LLVMValueRef offset;
+   LLVMValueRef ptr;
+
+   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0));
+
+   if (length == 1) {
+      assert(i == 0);
+      offset = offsets;
+   } else {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      offset = LLVMBuildExtractElement(builder, offsets, index, "");
+   }
+
+   ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, "");
+
+   return ptr;
+}
+
+
+/**
+ * Gather one element from scattered positions in memory.
+ *
+ * @sa lp_build_gather()
+ */
+LLVMValueRef
+lp_build_gather_elem(LLVMBuilderRef builder,
+                     unsigned length,
+                     unsigned src_width,
+                     unsigned dst_width,
+                     LLVMValueRef base_ptr,
+                     LLVMValueRef offsets,
+                     unsigned i)
+{
+   LLVMTypeRef src_type = LLVMIntType(src_width);
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+   LLVMValueRef ptr;
+   LLVMValueRef res;
+
+   assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0));
+
+   ptr = lp_build_gather_elem_ptr(builder, length, base_ptr, offsets, i);
+   ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, "");
+   res = LLVMBuildLoad(builder, ptr, "");
+
+   assert(src_width <= dst_width);
+   if (src_width > dst_width)
+      res = LLVMBuildTrunc(builder, res, dst_elem_type, "");
+   if (src_width < dst_width)
+      res = LLVMBuildZExt(builder, res, dst_elem_type, "");
+
+   return res;
+}
+
+
+/**
+ * Gather elements from scattered positions in memory into a single vector.
+ * Use for fetching texels from a texture.
+ * For SSE, typical values are length=4, src_width=32, dst_width=32.
+ *
+ * @param length length of the offsets
+ * @param src_width src element width in bits
+ * @param dst_width result element width in bits (src will be expanded to fit)
+ * @param base_ptr base pointer, should be a i8 pointer type.
+ * @param offsets vector with offsets
+ */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets)
+{
+   LLVMValueRef res;
+
+   if (length == 1) {
+      /* Scalar */
+      return lp_build_gather_elem(builder, length,
+                                  src_width, dst_width,
+                                  base_ptr, offsets, 0);
+   } else {
+      /* Vector */
+
+      LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+      LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+      unsigned i;
+
+      res = LLVMGetUndef(dst_vec_type);
+      for (i = 0; i < length; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         LLVMValueRef elem;
+         elem = lp_build_gather_elem(builder, length,
+                                     src_width, dst_width,
+                                     base_ptr, offsets, i);
+         res = LLVMBuildInsertElement(builder, res, elem, index, "");
+      }
+   }
+
+   return res;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
new file mode 100644
index 00000000000..131af8ea07e
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h
@@ -0,0 +1,61 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_GATHER_H_
+#define LP_BLD_GATHER_H_
+
+
+#include "gallivm/lp_bld.h"
+
+
+LLVMValueRef
+lp_build_gather_elem_ptr(LLVMBuilderRef builder,
+                         unsigned length,
+                         LLVMValueRef base_ptr,
+                         LLVMValueRef offsets,
+                         unsigned i);
+
+LLVMValueRef
+lp_build_gather_elem(LLVMBuilderRef builder,
+                     unsigned length,
+                     unsigned src_width,
+                     unsigned dst_width,
+                     LLVMValueRef base_ptr,
+                     LLVMValueRef offsets,
+                     unsigned i);
+
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets);
+
+
+#endif /* LP_BLD_GATHER_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 5067d0a164f..761f33b578d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -29,18 +29,74 @@
 #include "pipe/p_compiler.h"
 #include "util/u_cpu_detect.h"
 #include "util/u_debug.h"
+#include "lp_bld_debug.h"
 #include "lp_bld_init.h"
 
+#include <llvm-c/Transforms/Scalar.h>
+
+
+#ifdef DEBUG
+unsigned gallivm_debug = 0;
+
+static const struct debug_named_value lp_bld_debug_flags[] = {
+   { "tgsi",   GALLIVM_DEBUG_TGSI, NULL },
+   { "ir",     GALLIVM_DEBUG_IR, NULL },
+   { "asm",    GALLIVM_DEBUG_ASM, NULL },
+   { "nopt",   GALLIVM_DEBUG_NO_OPT, NULL },
+   { "perf",   GALLIVM_DEBUG_PERF, NULL },
+   DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags, 0)
+#endif
+
 
 LLVMModuleRef lp_build_module = NULL;
 LLVMExecutionEngineRef lp_build_engine = NULL;
 LLVMModuleProviderRef lp_build_provider = NULL;
 LLVMTargetDataRef lp_build_target = NULL;
+LLVMPassManagerRef lp_build_pass = NULL;
+
+
+/*
+ * Optimization values are:
+ * - 0: None (-O0)
+ * - 1: Less (-O1)
+ * - 2: Default (-O2, -Os)
+ * - 3: Aggressive (-O3)
+ *
+ * See also CodeGenOpt::Level in llvm/Target/TargetMachine.h
+ */
+enum LLVM_CodeGenOpt_Level {
+#if HAVE_LLVM >= 0x207
+   None,        // -O0
+   Less,        // -O1
+   Default,     // -O2, -Os
+   Aggressive   // -O3
+#else
+   Default,
+   None,
+   Aggressive
+#endif
+};
+
+
+extern void
+lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE);
+
+extern void
+lp_set_target_options(void);
 
 
 void
 lp_build_init(void)
 {
+#ifdef DEBUG
+   gallivm_debug = debug_get_option_gallivm_debug();
+#endif
+
+   lp_set_target_options();
+
    LLVMInitializeNativeTarget();
 
    LLVMLinkInJIT();
@@ -52,18 +108,58 @@ lp_build_init(void)
       lp_build_provider = LLVMCreateModuleProviderForExistingModule(lp_build_module);
 
    if (!lp_build_engine) {
+      enum LLVM_CodeGenOpt_Level optlevel;
       char *error = NULL;
 
-      if (LLVMCreateJITCompiler(&lp_build_engine, lp_build_provider, 1, &error)) {
+      if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) {
+         optlevel = None;
+      }
+      else {
+         optlevel = Default;
+      }
+
+      if (LLVMCreateJITCompiler(&lp_build_engine, lp_build_provider,
+                                (unsigned)optlevel, &error)) {
          _debug_printf("%s\n", error);
          LLVMDisposeMessage(error);
          assert(0);
       }
+
+#if defined(DEBUG) || defined(PROFILE)
+      lp_register_oprofile_jit_event_listener(lp_build_engine);
+#endif
    }
 
    if (!lp_build_target)
       lp_build_target = LLVMGetExecutionEngineTargetData(lp_build_engine);
 
+   if (!lp_build_pass) {
+      lp_build_pass = LLVMCreateFunctionPassManager(lp_build_provider);
+      LLVMAddTargetData(lp_build_target, lp_build_pass);
+
+      if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) {
+         /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
+          * but there are more on SVN. */
+         /* TODO: Add more passes */
+         LLVMAddCFGSimplificationPass(lp_build_pass);
+         LLVMAddPromoteMemoryToRegisterPass(lp_build_pass);
+         LLVMAddConstantPropagationPass(lp_build_pass);
+         if(util_cpu_caps.has_sse4_1) {
+            /* FIXME: There is a bug in this pass, whereby the combination of fptosi
+             * and sitofp (necessary for trunc/floor/ceil/round implementation)
+             * somehow becomes invalid code.
+             */
+            LLVMAddInstructionCombiningPass(lp_build_pass);
+         }
+         LLVMAddGVNPass(lp_build_pass);
+      } else {
+         /* We need at least this pass to prevent the backends to fail in
+          * unexpected ways.
+          */
+         LLVMAddPromoteMemoryToRegisterPass(lp_build_pass);
+      }
+   }
+
    util_cpu_detect();
 
 #if 0
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 0ec2afcd1be..f26fdac4663 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -38,10 +38,13 @@ extern LLVMModuleRef lp_build_module;
 extern LLVMExecutionEngineRef lp_build_engine;
 extern LLVMModuleProviderRef lp_build_provider;
 extern LLVMTargetDataRef lp_build_target;
+extern LLVMPassManagerRef lp_build_pass;
 
 
 void
 lp_build_init(void);
 
+extern void
+lp_func_delete_body(LLVMValueRef func);
 
 #endif /* !LP_BLD_INIT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
new file mode 100644
index 00000000000..369c8121b5c
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -0,0 +1,55 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_LIMITS_H_
+#define LP_BLD_LIMITS_H_
+
+/*
+ * TGSI translation limits.
+ *
+ * Some are slightly above SM 3.0 requirements to give some wiggle room to
+ * the state trackers.
+ */
+
+#define LP_MAX_TGSI_TEMPS 256
+
+#define LP_MAX_TGSI_ADDRS 16
+
+#define LP_MAX_TGSI_IMMEDIATES 256
+
+#define LP_MAX_TGSI_PREDS 16
+
+/**
+ * Maximum control flow nesting
+ *
+ * SM3.0 requires 24
+ */
+#define LP_MAX_TGSI_NESTING 32
+
+
+#endif /* LP_BLD_LIMITS_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index d13fa1a5d04..d5c62a3f734 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -34,11 +34,13 @@
 
 
 #include "util/u_cpu_detect.h"
+#include "util/u_memory.h"
 #include "util/u_debug.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
+#include "lp_bld_debug.h"
 #include "lp_bld_logic.h"
 
 
@@ -82,6 +84,8 @@ lp_build_compare(LLVMBuilderRef builder,
 
    assert(func >= PIPE_FUNC_NEVER);
    assert(func <= PIPE_FUNC_ALWAYS);
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
 
    if(func == PIPE_FUNC_NEVER)
       return zeros;
@@ -187,12 +191,10 @@ lp_build_compare(LLVMBuilderRef builder,
             return lp_build_undef(type);
          }
 
-         /* There are no signed byte and unsigned word/dword comparison
-          * instructions. So flip the sign bit so that the results match.
+         /* There are no unsigned comparison instructions. So flip the sign bit
+          * so that the results match.
           */
-         if(table[func].gt &&
-            ((type.width == 8 && type.sign) ||
-             (type.width != 8 && !type.sign))) {
+         if (table[func].gt && !type.sign) {
             LLVMValueRef msb = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1));
             a = LLVMBuildXor(builder, a, msb, "");
             b = LLVMBuildXor(builder, b, msb, "");
@@ -324,8 +326,10 @@ lp_build_compare(LLVMBuilderRef builder,
 
          res = LLVMGetUndef(int_vec_type);
 
-         debug_printf("%s: warning: using slow element-wise int"
-                      " vector comparison\n", __FUNCTION__);
+         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
+            debug_printf("%s: using slow element-wise int"
+                         " vector comparison\n", __FUNCTION__);
+         }
 
          for(i = 0; i < type.length; ++i) {
             LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
@@ -364,9 +368,55 @@ lp_build_cmp(struct lp_build_context *bld,
 
 
 /**
+ * Return (mask & a) | (~mask & b);
+ */
+LLVMValueRef
+lp_build_select_bitwise(struct lp_build_context *bld,
+                        LLVMValueRef mask,
+                        LLVMValueRef a,
+                        LLVMValueRef b)
+{
+   struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
+   if (a == b) {
+      return a;   /* same value either way: emit no instructions */
+   }
+
+   if(type.floating) {   /* do the bit operations on the integer view */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
+   }
+
+   a = LLVMBuildAnd(bld->builder, a, mask, "");
+
+   /* This often gets translated to PANDN, but sometimes the NOT is
+    * pre-computed and stored in another constant. The best strategy depends
+    * on available registers, so it is not a big deal -- hopefully LLVM makes
+    * the right decision given the rest of the program.
+    */
+   b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), "");
+
+   res = LLVMBuildOr(bld->builder, a, b, "");
+
+   if(type.floating) {   /* back to the context's floating point vector type */
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+   }
+
+   return res;
+}
+
+
+/**
  * Return mask ? a : b;
  *
- * mask is a bitwise mask, composed of 0 or ~0 for each element.
+ * mask is a bitwise mask, composed of 0 or ~0 for each element. Any other value
+ * will yield unpredictable results.
  */
 LLVMValueRef
 lp_build_select(struct lp_build_context *bld,
@@ -377,6 +427,9 @@ lp_build_select(struct lp_build_context *bld,
    struct lp_type type = bld->type;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == b)
       return a;
 
@@ -384,49 +437,78 @@ lp_build_select(struct lp_build_context *bld,
       mask = LLVMBuildTrunc(bld->builder, mask, LLVMInt1Type(), "");
       res = LLVMBuildSelect(bld->builder, mask, a, b, "");
    }
-   else {
-      if(type.floating) {
-         LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
-         a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
-         b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
+   else if (util_cpu_caps.has_sse4_1 &&
+            type.width * type.length == 128 &&
+            !LLVMIsConstant(a) &&
+            !LLVMIsConstant(b) &&
+            !LLVMIsConstant(mask)) {
+      const char *intrinsic;
+      LLVMTypeRef arg_type;
+      LLVMValueRef args[3];
+
+      if (type.width == 64) {
+         intrinsic = "llvm.x86.sse41.blendvpd";
+         arg_type = LLVMVectorType(LLVMDoubleType(), 2);
+      } else if (type.width == 32) {
+         intrinsic = "llvm.x86.sse41.blendvps";
+         arg_type = LLVMVectorType(LLVMFloatType(), 4);
+      } else {
+         intrinsic = "llvm.x86.sse41.pblendvb";
+         arg_type = LLVMVectorType(LLVMInt8Type(), 16);
       }
 
-      a = LLVMBuildAnd(bld->builder, a, mask, "");
+      if (arg_type != bld->int_vec_type) {
+         mask = LLVMBuildBitCast(bld->builder, mask, arg_type, "");
+      }
 
-      /* This often gets translated to PANDN, but sometimes the NOT is
-       * pre-computed and stored in another constant. The best strategy depends
-       * on available registers, so it is not a big deal -- hopefully LLVM does
-       * the right decision attending the rest of the program.
-       */
-      b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), "");
+      if (arg_type != bld->vec_type) {
+         a = LLVMBuildBitCast(bld->builder, a, arg_type, "");
+         b = LLVMBuildBitCast(bld->builder, b, arg_type, "");
+      }
 
-      res = LLVMBuildOr(bld->builder, a, b, "");
+      args[0] = b;
+      args[1] = a;
+      args[2] = mask;
 
-      if(type.floating) {
-         LLVMTypeRef vec_type = lp_build_vec_type(type);
-         res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+      res = lp_build_intrinsic(bld->builder, intrinsic,
+                               arg_type, args, Elements(args));
+
+      if (arg_type != bld->vec_type) {
+         res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, "");
       }
    }
+   else {
+      res = lp_build_select_bitwise(bld, mask, a, b);
+   }
 
    return res;
 }
 
 
+/**
+ * Return mask ? a : b;
+ *
+ * mask is a TGSI_WRITEMASK_xxx.
+ */
 LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
+                    unsigned mask,
                     LLVMValueRef a,
-                    LLVMValueRef b,
-                    const boolean cond[4])
+                    LLVMValueRef b)
 {
    const struct lp_type type = bld->type;
    const unsigned n = type.length;
    unsigned i, j;
 
+   assert((mask & ~0xf) == 0);
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == b)
       return a;
-   if(cond[0] && cond[1] && cond[2] && cond[3])
+   if((mask & 0xf) == 0xf)
       return a;
-   if(!cond[0] && !cond[1] && !cond[2] && !cond[3])
+   if((mask & 0xf) == 0x0)
       return b;
    if(a == bld->undef || b == bld->undef)
       return bld->undef;
@@ -449,7 +531,9 @@ lp_build_select_aos(struct lp_build_context *bld,
 
       for(j = 0; j < n; j += 4)
          for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, (cond[i] ? 0 : n) + j + i, 0);
+            shuffles[j + i] = LLVMConstInt(elem_type,
+                                           (mask & (1 << i) ? 0 : n) + j + i,
+                                           0);
 
       return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
    }
@@ -458,26 +542,17 @@ lp_build_select_aos(struct lp_build_context *bld,
       /* XXX: Unfortunately select of vectors do not work */
       /* Use a select */
       LLVMTypeRef elem_type = LLVMInt1Type();
-      LLVMValueRef cond[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef cond_vec[LP_MAX_VECTOR_LENGTH];
 
       for(j = 0; j < n; j += 4)
          for(i = 0; i < 4; ++i)
-            cond[j + i] = LLVMConstInt(elem_type, cond[i] ? 1 : 0, 0);
+            cond_vec[j + i] = LLVMConstInt(elem_type,
+                                           mask & (1 << i) ? 1 : 0, 0);
 
-      return LLVMBuildSelect(bld->builder, LLVMConstVector(cond, n), a, b, "");
+      return LLVMBuildSelect(bld->builder, LLVMConstVector(cond_vec, n), a, b, "");
 #else
-      LLVMValueRef mask = lp_build_const_mask_aos(type, cond);
-      return lp_build_select(bld, mask, a, b);
+      LLVMValueRef mask_vec = lp_build_const_mask_aos(type, mask);
+      return lp_build_select(bld, mask_vec, a, b);
 #endif
    }
 }
-
-
-/** Return (a & ~b) */
-LLVMValueRef
-lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
-{
-   b = LLVMBuildNot(bld->builder, b, "");
-   b = LLVMBuildAnd(bld->builder, a, b, "");
-   return b;
-}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
index 29f9fc3b205..141fb92058a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -63,6 +63,11 @@ lp_build_cmp(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b);
 
+LLVMValueRef
+lp_build_select_bitwise(struct lp_build_context *bld,
+                        LLVMValueRef mask,
+                        LLVMValueRef a,
+                        LLVMValueRef b);
 
 LLVMValueRef
 lp_build_select(struct lp_build_context *bld,
@@ -72,13 +77,9 @@ lp_build_select(struct lp_build_context *bld,
 
 LLVMValueRef
 lp_build_select_aos(struct lp_build_context *bld,
+                    unsigned mask,
                     LLVMValueRef a,
-                    LLVMValueRef b,
-                    const boolean cond[4]);
-
-
-LLVMValueRef
-lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b);
+                    LLVMValueRef b);
 
 
 #endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
new file mode 100644
index 00000000000..48baf7c425c
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -0,0 +1,180 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef __STDC_LIMIT_MACROS
+#define __STDC_LIMIT_MACROS
+#endif
+
+#ifndef __STDC_CONSTANT_MACROS
+#define __STDC_CONSTANT_MACROS
+#endif
+
+#include <llvm-c/Core.h>
+#include <llvm-c/ExecutionEngine.h>
+#include <llvm/Target/TargetOptions.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
+#include <llvm/ExecutionEngine/JITEventListener.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/PrettyStackTrace.h>
+
+#include "pipe/p_config.h"
+#include "util/u_debug.h"
+
+
+#if (defined(PIPE_OS_WINDOWS) && !defined(PIPE_CC_MSVC)) || defined(PIPE_OS_EMBEDDED)
+
+#include "llvm/Support/raw_ostream.h"
+
+/** llvm::raw_ostream that sends everything it is given to _debug_printf(). */
+class raw_debug_ostream : public llvm::raw_ostream
+{
+   uint64_t pos;   /* number of bytes written so far */
+
+   void write_impl(const char *Ptr, size_t Size);
+   uint64_t current_pos() { return pos; }
+   uint64_t current_pos() const { return pos; }
+
+#if HAVE_LLVM >= 0x207
+   uint64_t preferred_buffer_size() { return 512; }
+#else
+   size_t preferred_buffer_size() { return 512; }
+#endif
+
+public:
+   raw_debug_ostream() : pos(0) {}
+};
+
+
+void
+raw_debug_ostream::write_impl(const char *Ptr, size_t Size)
+{
+   if (Size > 0) {
+      /* Ptr is not NUL-terminated: bound the print, never write Ptr[Size]. */
+      _debug_printf("%.*s", (int)Size, Ptr);
+      pos += Size;
+   }
+}
+
+
+/**
+ * Same as LLVMDumpValue, but through our debugging channels.
+ */
+extern "C" void
+lp_debug_dump_value(LLVMValueRef value)
+{
+   raw_debug_ostream os;
+   llvm::unwrap(value)->print(os);
+   os.flush();
+}
+
+
+#else
+
+
+extern "C" void
+lp_debug_dump_value(LLVMValueRef value)
+{
+   LLVMDumpValue(value);
+}
+
+
+#endif
+
+
+/**
+ * Register an OProfile JIT event listener with the execution engine EE.
+ *
+ * This makes the LLVM IR function names visible in oprofile output.
+ *
+ * For this to actually work, LLVM needs to be built with the --with-oprofile
+ * configure option.
+ *
+ * An oprofile:oprofile user:group is also necessary, which is not created by
+ * default on some distributions.
+ */
+extern "C" void
+lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE)
+{
+   llvm::unwrap(EE)->RegisterJITEventListener(llvm::createOProfileJITEventListener());
+}
+
+
+/** Set global LLVM options; lp_build_init() calls this before target init. */
+extern "C" void
+lp_set_target_options(void)
+{
+#if defined(DEBUG)
+#if HAVE_LLVM >= 0x0207
+   llvm::JITEmitDebugInfo = true;
+#endif
+#endif
+
+#if defined(DEBUG) || defined(PROFILE)
+   llvm::NoFramePointerElim = true;
+#endif
+
+   llvm::NoExcessFPPrecision = false;
+
+   /* XXX: Investigate this */
+#if 0
+   llvm::UnsafeFPMath = true;
+#endif
+
+   /*
+    * LLVM will generate MMX instructions for vectors <= 64 bits, leading to
+    * inefficient code, and in 32bit systems, to the corruption of the FPU
+    * stack given that it expects the user to generate the EMMS instructions.
+    * See also:
+    * - http://llvm.org/bugs/show_bug.cgi?id=3287
+    * - http://l4.me.uk/post/2009/06/07/llvm-wrinkle-3-configuration-what-configuration/
+    */
+   static boolean first = TRUE;
+   if (first) {   /* hand the option to LLVM's parser only on the first call */
+      static const char* options[] = {
+         "prog",   /* stands in for argv[0] */
+         "-disable-mmx"
+      };
+      llvm::cl::ParseCommandLineOptions(2, const_cast<char**>(options));
+      first = FALSE;
+   }
+
+   /*
+    * By default LLVM adds a signal handler to output a pretty stack trace.
+    * This signal handler is never removed, causing problems when unloading the
+    * shared object where the gallium driver resides.
+    */
+   llvm::DisablePrettyStackTrace = true;
+}
+
+
+/** Delete the body of the llvm::Function wrapped by FF. */
+extern "C" void
+lp_func_delete_body(LLVMValueRef FF)
+{
+   llvm::unwrap<llvm::Function>(FF)->deleteBody();
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 186f8849b8d..f7eb7148ab8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -111,8 +111,6 @@ lp_build_const_pack_shuffle(unsigned n)
 
    assert(n <= LP_MAX_VECTOR_LENGTH);
 
-   /* TODO: cache results in a static table */
-
    for(i = 0; i < n; ++i)
       elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0);
 
@@ -171,14 +169,13 @@ lp_build_unpack2(LLVMBuilderRef builder,
       msb = lp_build_zero(src_type);
 
    /* Interleave bits */
-   if(util_cpu_caps.little_endian) {
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
       *dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0);
       *dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1);
-   }
-   else {
+#else
       *dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0);
       *dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1);
-   }
+#endif
 
    /* Cast the result into the new type (twice as wide) */
 
@@ -261,13 +258,14 @@ lp_build_pack2(LLVMBuilderRef builder,
 #endif
    LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
    LLVMValueRef shuffle;
-   LLVMValueRef res;
+   LLVMValueRef res = NULL;
 
    assert(!src_type.floating);
    assert(!dst_type.floating);
    assert(src_type.width == dst_type.width * 2);
    assert(src_type.length * 2 == dst_type.length);
 
+   /* Check for special cases first */
    if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
       switch(src_type.width) {
       case 32:
@@ -283,8 +281,8 @@ lp_build_pack2(LLVMBuilderRef builder,
                return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
             }
             else {
-               assert(0);
-               return LLVMGetUndef(dst_vec_type);
+               /* use generic shuffle below */
+               res = NULL;
             }
          }
          break;
@@ -310,10 +308,13 @@ lp_build_pack2(LLVMBuilderRef builder,
          break;
       }
 
-      res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
-      return res;
+      if (res) {
+         res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+         return res;
+      }
    }
 
+   /* generic shuffle */
    lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
    hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
 
@@ -427,3 +428,123 @@ lp_build_pack(LLVMBuilderRef builder,
 
    return tmp[0];
 }
+
+
+/**
+ * Truncate or expand the bitwidth: num_srcs vectors of src_type are turned
+ * into num_dsts vectors of dst_type (1:N, M:1 or 1:1 only).
+ * NOTE: Getting the right sign flags is crucial here, as we employ some
+ * intrinsics that do saturation.
+ */
+void
+lp_build_resize(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                const LLVMValueRef *src, unsigned num_srcs,
+                LLVMValueRef *dst, unsigned num_dsts)
+{
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   /*
+    * We don't support float <-> int conversion here. That must be done
+    * before/after calling this function.
+    */
+   assert(src_type.floating == dst_type.floating);
+
+   /*
+    * We don't support double <-> float conversion yet, although it could be
+    * added with little effort.
+    */
+   assert((!src_type.floating && !dst_type.floating) ||
+          src_type.width == dst_type.width);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+   /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */
+   assert(num_srcs == 1 || num_dsts == 1);
+
+   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
+   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
+
+   if (src_type.width > dst_type.width) {
+      /*
+       * Truncate bit width.
+       */
+
+      assert(num_dsts == 1);
+
+      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
+        /*
+         * Register width remains constant -- use vector packing intrinsics
+         */
+
+         tmp[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
+      }
+      else {
+         /*
+          * Do it element-wise.
+          */
+
+         assert(src_type.length == dst_type.length);
+         tmp[0] = lp_build_undef(dst_type);
+         for (i = 0; i < dst_type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+            val = LLVMBuildTrunc(builder, val, lp_build_elem_type(dst_type), "");
+            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+         }
+      }
+   }
+   else if (src_type.width < dst_type.width) {
+      /*
+       * Expand bit width.
+       */
+
+      assert(num_srcs == 1);
+
+      if (src_type.width * src_type.length == dst_type.width * dst_type.length) {
+         /*
+          * Register width remains constant -- use vector unpack intrinsics
+          */
+         lp_build_unpack(builder, src_type, dst_type, src[0], tmp, num_dsts);
+      }
+      else {
+         /*
+          * Do it element-wise.
+          */
+
+         assert(src_type.length == dst_type.length);
+         tmp[0] = lp_build_undef(dst_type);
+         for (i = 0; i < dst_type.length; ++i) {
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+            LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, "");
+
+            if (src_type.sign && dst_type.sign) {   /* both signed: sign-extend */
+               val = LLVMBuildSExt(builder, val, lp_build_elem_type(dst_type), "");
+            } else {
+               val = LLVMBuildZExt(builder, val, lp_build_elem_type(dst_type), "");
+            }
+            tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, "");
+         }
+      }
+   }
+   else {
+      /*
+       * No-op
+       */
+
+      assert(num_srcs == 1);
+      assert(num_dsts == 1);
+
+      tmp[0] = src[0];
+   }
+
+   for(i = 0; i < num_dsts; ++i)   /* hand the results back to the caller */
+      dst[i] = tmp[i];
+}
+
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index 41adeed220c..e947b90d164 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -37,6 +37,8 @@
 #define LP_BLD_PACK_H
 
 
+#include "pipe/p_compiler.h"
+
 #include "gallivm/lp_bld.h"
 
 
@@ -92,4 +94,12 @@ lp_build_pack(LLVMBuilderRef builder,
               const LLVMValueRef *src, unsigned num_srcs);
 
 
+void
+lp_build_resize(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                const LLVMValueRef *src, unsigned num_srcs,
+                LLVMValueRef *dst, unsigned num_dsts);
+
+
 #endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
new file mode 100644
index 00000000000..7b1088939b9
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -0,0 +1,101 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "lp_bld_type.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_quad.h"
+
+
+static const unsigned char
+swizzle_left[4] = {
+   LP_BLD_QUAD_TOP_LEFT,     LP_BLD_QUAD_TOP_LEFT,
+   LP_BLD_QUAD_BOTTOM_LEFT,  LP_BLD_QUAD_BOTTOM_LEFT
+};
+
+static const unsigned char
+swizzle_right[4] = {
+   LP_BLD_QUAD_TOP_RIGHT,    LP_BLD_QUAD_TOP_RIGHT,
+   LP_BLD_QUAD_BOTTOM_RIGHT, LP_BLD_QUAD_BOTTOM_RIGHT
+};
+
+static const unsigned char
+swizzle_top[4] = {
+   LP_BLD_QUAD_TOP_LEFT,     LP_BLD_QUAD_TOP_RIGHT,
+   LP_BLD_QUAD_TOP_LEFT,     LP_BLD_QUAD_TOP_RIGHT
+};
+
+static const unsigned char
+swizzle_bottom[4] = {
+   LP_BLD_QUAD_BOTTOM_LEFT,  LP_BLD_QUAD_BOTTOM_RIGHT,
+   LP_BLD_QUAD_BOTTOM_LEFT,  LP_BLD_QUAD_BOTTOM_RIGHT
+};
+
+
+/** Return (a swizzled by swizzle_right) - (a swizzled by swizzle_left). */
+LLVMValueRef
+lp_build_ddx(struct lp_build_context *bld, LLVMValueRef a)
+{
+   LLVMValueRef lhs = lp_build_swizzle_aos(bld, a, swizzle_left);
+   LLVMValueRef rhs = lp_build_swizzle_aos(bld, a, swizzle_right);
+   return lp_build_sub(bld, rhs, lhs);
+}
+
+
+/** Return (a swizzled by swizzle_bottom) - (a swizzled by swizzle_top). */
+LLVMValueRef
+lp_build_ddy(struct lp_build_context *bld, LLVMValueRef a)
+{
+   LLVMValueRef upper = lp_build_swizzle_aos(bld, a, swizzle_top);
+   LLVMValueRef lower = lp_build_swizzle_aos(bld, a, swizzle_bottom);
+   return lp_build_sub(bld, lower, upper);
+}
+
+
+/** Return a[LP_BLD_QUAD_TOP_RIGHT] - a[LP_BLD_QUAD_TOP_LEFT] as one element. */
+LLVMValueRef
+lp_build_scalar_ddx(struct lp_build_context *bld, LLVMValueRef a)
+{
+   LLVMValueRef left_idx  = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0);
+   LLVMValueRef right_idx = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_RIGHT, 0);
+   LLVMValueRef left  = LLVMBuildExtractElement(bld->builder, a, left_idx, "");
+   LLVMValueRef right = LLVMBuildExtractElement(bld->builder, a, right_idx, "");
+   return lp_build_sub(bld, right, left);
+}
+
+
+/** Return a[LP_BLD_QUAD_BOTTOM_LEFT] - a[LP_BLD_QUAD_TOP_LEFT] as one element. */
+LLVMValueRef
+lp_build_scalar_ddy(struct lp_build_context *bld, LLVMValueRef a)
+{
+   LLVMValueRef upper_idx = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0);
+   LLVMValueRef lower_idx = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_BOTTOM_LEFT, 0);
+   LLVMValueRef upper = LLVMBuildExtractElement(bld->builder, a, upper_idx, "");
+   LLVMValueRef lower = LLVMBuildExtractElement(bld->builder, a, lower_idx, "");
+   return lp_build_sub(bld, lower, upper);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
new file mode 100644
index 00000000000..b7992912927
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h
@@ -0,0 +1,96 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef LP_BLD_QUAD_H_
+#define LP_BLD_QUAD_H_
+
+
+#include "gallivm/lp_bld.h"
+
+
+struct lp_build_context;
+
+
+/*
+ * Each quad is composed of four elements.
+ *
+ * #########
+ * # 0 | 1 #
+ * #---+---#
+ * # 2 | 3 #
+ * #########
+ */
+
+enum lp_bld_quad {
+   LP_BLD_QUAD_TOP_LEFT     = 0,
+   LP_BLD_QUAD_TOP_RIGHT    = 1,
+   LP_BLD_QUAD_BOTTOM_LEFT  = 2,
+   LP_BLD_QUAD_BOTTOM_RIGHT = 3
+};
+
+
+/*
+ * (Vector) derivatives.
+ *
+ * More than one quad is supported. The only requirement is that the vector
+ * contains a whole number of quads:
+ *
+ * ######### ######### ...
+ * # 0 | 1 # # 4 | 5 #
+ * #---+---# #---+---# ...
+ * # 2 | 3 # # 6 | 7 #
+ * ######### ######### ...
+ */
+
+LLVMValueRef
+lp_build_ddx(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+
+LLVMValueRef
+lp_build_ddy(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+
+/*
+ * Scalar derivatives.
+ *
+ * Same as taking the first element of the vector results above.
+ */
+
+LLVMValueRef
+lp_build_scalar_ddx(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
+
+LLVMValueRef
+lp_build_scalar_ddy(struct lp_build_context *bld,
+                    LLVMValueRef a);
+
+
+#endif /* LP_BLD_QUAD_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 195a4953ab1..aee94c1b866 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -36,12 +36,46 @@
 #include "pipe/p_state.h"
 #include "util/u_format.h"
 #include "util/u_math.h"
-#include "lp_bld_debug.h"
-#include "lp_bld_const.h"
 #include "lp_bld_arit.h"
-#include "lp_bld_type.h"
-#include "lp_bld_format.h"
+#include "lp_bld_const.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_flow.h"
 #include "lp_bld_sample.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_type.h"
+
+
+/**
+ * Does the given texture wrap mode allow sampling the texture border color?
+ * XXX maybe move this into gallium util code.
+ */
+boolean
+lp_sampler_wrap_mode_uses_border_color(unsigned mode,
+                                       unsigned min_img_filter,
+                                       unsigned mag_img_filter)
+{
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return FALSE;
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      if (min_img_filter == PIPE_TEX_FILTER_NEAREST &&
+          mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
+         return FALSE;
+      } else {
+         return TRUE;
+      }
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return TRUE;
+   default:
+      assert(0 && "unexpected wrap mode");
+      return FALSE;
+   }
+}
 
 
 /**
@@ -83,34 +117,34 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
    state->swizzle_a         = view->swizzle_a;
 
    state->target            = texture->target;
-   state->pot_width         = util_is_pot(texture->width0);
-   state->pot_height        = util_is_pot(texture->height0);
-   state->pot_depth         = util_is_pot(texture->depth0);
+   state->pot_width         = util_is_power_of_two(texture->width0);
+   state->pot_height        = util_is_power_of_two(texture->height0);
+   state->pot_depth         = util_is_power_of_two(texture->depth0);
 
    state->wrap_s            = sampler->wrap_s;
    state->wrap_t            = sampler->wrap_t;
    state->wrap_r            = sampler->wrap_r;
    state->min_img_filter    = sampler->min_img_filter;
    state->mag_img_filter    = sampler->mag_img_filter;
-   if (texture->last_level) {
+   if (view->last_level) {
       state->min_mip_filter = sampler->min_mip_filter;
    } else {
       state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
    }
 
+   /* If min_lod == max_lod we can greatly simplify mipmap selection.
+    * This is a case that occurs during automatic mipmap generation.
+    */
+   if (sampler->min_lod == sampler->max_lod) {
+      state->min_max_lod_equal = 1;
+   }
+
    state->compare_mode      = sampler->compare_mode;
    if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
       state->compare_func   = sampler->compare_func;
    }
 
    state->normalized_coords = sampler->normalized_coords;
-   state->lod_bias          = sampler->lod_bias;
-   state->min_lod           = sampler->min_lod;
-   state->max_lod           = sampler->max_lod;
-   state->border_color[0]   = sampler->border_color[0];
-   state->border_color[1]   = sampler->border_color[1];
-   state->border_color[2]   = sampler->border_color[2];
-   state->border_color[3]   = sampler->border_color[3];
 
    /*
     * FIXME: Handle the remainder of pipe_sampler_view.
@@ -119,84 +153,630 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 /**
- * Gather elements from scatter positions in memory into a single vector.
+ * Generate code to compute texture level of detail (lambda).
+ * \param ddx  partial derivatives of (s, t, r, q) with respect to X
+ * \param ddy  partial derivatives of (s, t, r, q) with respect to Y
+ * \param lod_bias  optional float vector with the shader lod bias
+ * \param explicit_lod  optional float vector with the explicit lod
+ * \param width  scalar int texture width
+ * \param height  scalar int texture height
+ * \param depth  scalar int texture depth
  *
- * @param src_width src element width
- * @param dst_width result element width (source will be expanded to fit)
- * @param length length of the offsets,
- * @param base_ptr base pointer, should be a i8 pointer type.
- * @param offsets vector with offsets
+ * XXX: The resulting lod is scalar, so ignore all but the first element of
+ * derivatives, lod_bias, etc that are passed by the shader.
  */
 LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets)
-{
-   LLVMTypeRef src_type = LLVMIntType(src_width);
-   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
-   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
-   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
-   LLVMValueRef res;
-   unsigned i;
+lp_build_lod_selector(struct lp_build_sample_context *bld,
+                      unsigned unit,
+                      const LLVMValueRef ddx[4],
+                      const LLVMValueRef ddy[4],
+                      LLVMValueRef lod_bias, /* optional */
+                      LLVMValueRef explicit_lod, /* optional */
+                      LLVMValueRef width,
+                      LLVMValueRef height,
+                      LLVMValueRef depth)
+
+{
+   LLVMValueRef min_lod =
+      bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit);
+
+   if (bld->static_state->min_max_lod_equal) {
+      /* User is forcing sampling from a particular mipmap level.
+       * This is hit during mipmap generation.
+       */
+      return min_lod;
+   }
+   else {
+      struct lp_build_context *float_bld = &bld->float_bld;
+      LLVMValueRef sampler_lod_bias =
+         bld->dynamic_state->lod_bias(bld->dynamic_state, bld->builder, unit);
+      LLVMValueRef max_lod =
+         bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit);
+      LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      LLVMValueRef lod;
+
+      if (explicit_lod) {
+         lod = LLVMBuildExtractElement(bld->builder, explicit_lod,
+                                       index0, "");
+      }
+      else {
+         const int dims = texture_dims(bld->static_state->target);
+         LLVMValueRef dsdx, dsdy;
+         LLVMValueRef dtdx = NULL, dtdy = NULL, drdx = NULL, drdy = NULL;
+         LLVMValueRef rho;
+
+         dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx");
+         dsdx = lp_build_abs(float_bld, dsdx);
+         dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy");
+         dsdy = lp_build_abs(float_bld, dsdy);
+         if (dims > 1) {
+            dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx");
+            dtdx = lp_build_abs(float_bld, dtdx);
+            dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy");
+            dtdy = lp_build_abs(float_bld, dtdy);
+            if (dims > 2) {
+               drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx");
+               drdx = lp_build_abs(float_bld, drdx);
+               drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy");
+               drdy = lp_build_abs(float_bld, drdy);
+            }
+         }
+
+         /* Compute rho = max of all partial derivatives scaled by texture size.
+          * XXX this could be vectorized somewhat
+          */
+         rho = LLVMBuildFMul(bld->builder,
+                            lp_build_max(float_bld, dsdx, dsdy),
+                            lp_build_int_to_float(float_bld, width), "");
+         if (dims > 1) {
+            LLVMValueRef max;
+            max = LLVMBuildFMul(bld->builder,
+                               lp_build_max(float_bld, dtdx, dtdy),
+                               lp_build_int_to_float(float_bld, height), "");
+            rho = lp_build_max(float_bld, rho, max);
+            if (dims > 2) {
+               max = LLVMBuildFMul(bld->builder,
+                                  lp_build_max(float_bld, drdx, drdy),
+                                  lp_build_int_to_float(float_bld, depth), "");
+               rho = lp_build_max(float_bld, rho, max);
+            }
+         }
+
+         /* compute lod = log2(rho) */
+         lod = lp_build_log2(float_bld, rho);
+
+         /* add shader lod bias */
+         if (lod_bias) {
+            lod_bias = LLVMBuildExtractElement(bld->builder, lod_bias,
+                                               index0, "");
+            lod = LLVMBuildFAdd(bld->builder, lod, lod_bias, "shader_lod_bias");
+         }
+      }
+
+      /* add sampler lod bias */
+      lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias");
+
+      /* clamp lod */
+      lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
+
+      return lod;
+   }
+}
+
+
+/**
+ * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
+ * mipmap level index.
+ * Note: this is all scalar code.
+ * \param lod  scalar float texture level of detail
+ * \param level_out  returns integer mipmap level index
+ */
+void
+lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level_out)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMValueRef last_level, level;
+
+   LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
+
+   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+                                               bld->builder, unit);
+
+   /* convert float lod to integer */
+   level = lp_build_iround(float_bld, lod);
+
+   /* clamp level to legal range of levels */
+   *level_out = lp_build_clamp(int_bld, level, zero, last_level);
+}
+
+
+/**
+ * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to two (adjacent)
+ * integer mipmap level indexes.  Later, we'll sample from those
+ * two mipmap levels and interpolate between them.
+ */
+void
+lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level0_out,
+                           LLVMValueRef *level1_out,
+                           LLVMValueRef *weight_out)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *int_bld = &bld->int_bld;
+   LLVMValueRef last_level, level;
+
+   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
+                                               bld->builder, unit);
+
+   /* convert float lod to integer */
+   level = lp_build_ifloor(float_bld, lod);
+
+   /* compute level 0 and clamp to legal range of levels */
+   *level0_out = lp_build_clamp(int_bld, level,
+                                int_bld->zero,
+                                last_level);
+   /* compute level 1 and clamp to legal range of levels */
+   level = lp_build_add(int_bld, level, int_bld->one);
+   *level1_out = lp_build_clamp(int_bld, level,
+                                int_bld->zero,
+                                last_level);
+
+   *weight_out = lp_build_fract(float_bld, lod);
+}
+
+
+/**
+ * Return pointer to a single mipmap level.
+ * \param data_array  array of pointers to mipmap levels
+ * \param level  integer mipmap level
+ */
+LLVMValueRef
+lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
+                          LLVMValueRef data_array, LLVMValueRef level)
+{
+   LLVMValueRef indexes[2], data_ptr;
+   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indexes[1] = level;
+   data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, "");
+   data_ptr = LLVMBuildLoad(bld->builder, data_ptr, "");
+   return data_ptr;
+}
 
-   res = LLVMGetUndef(dst_vec_type);
-   for(i = 0; i < length; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      LLVMValueRef elem_offset;
-      LLVMValueRef elem_ptr;
-      LLVMValueRef elem;
 
-      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
-      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
-      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
-      elem = LLVMBuildLoad(builder, elem_ptr, "");
+LLVMValueRef
+lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
+                                LLVMValueRef data_array, int level)
+{
+   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
+   return lp_build_get_mipmap_level(bld, data_array, lvl);
+}
 
-      assert(src_width <= dst_width);
-      if(src_width > dst_width)
-         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
-      if(src_width < dst_width)
-         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
 
-      res = LLVMBuildInsertElement(builder, res, elem, index, "");
+/**
+ * Codegen equivalent for u_minify().
+ * Return max(1, base_size >> level);
+ */
+static LLVMValueRef
+lp_build_minify(struct lp_build_sample_context *bld,
+                LLVMValueRef base_size,
+                LLVMValueRef level)
+{
+   if (level == bld->int_coord_bld.zero) {
+      /* if we're using mipmap level zero, no minification is needed */
+      return base_size;
+   }
+   else {
+      LLVMValueRef size =
+         LLVMBuildLShr(bld->builder, base_size, level, "minify");
+      size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one);
+      return size;
    }
+}
+
+
+/**
+ * Dereference stride_array[mipmap_level] array to get a stride.
+ * Return stride as a vector.
+ */
+static LLVMValueRef
+lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
+                              LLVMValueRef stride_array, LLVMValueRef level)
+{
+   LLVMValueRef indexes[2], stride;
+   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indexes[1] = level;
+   stride = LLVMBuildGEP(bld->builder, stride_array, indexes, 2, "");
+   stride = LLVMBuildLoad(bld->builder, stride, "");
+   stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride);
+   return stride;
+}
+
+
+/**
+ * When sampling a mipmap, we need to compute the width, height, depth
+ * of the source levels from the level indexes.  This helper function
+ * does that.
+ */
+void
+lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
+                            unsigned dims,
+                            LLVMValueRef width_vec,
+                            LLVMValueRef height_vec,
+                            LLVMValueRef depth_vec,
+                            LLVMValueRef ilevel0,
+                            LLVMValueRef ilevel1,
+                            LLVMValueRef row_stride_array,
+                            LLVMValueRef img_stride_array,
+                            LLVMValueRef *width0_vec,
+                            LLVMValueRef *width1_vec,
+                            LLVMValueRef *height0_vec,
+                            LLVMValueRef *height1_vec,
+                            LLVMValueRef *depth0_vec,
+                            LLVMValueRef *depth1_vec,
+                            LLVMValueRef *row_stride0_vec,
+                            LLVMValueRef *row_stride1_vec,
+                            LLVMValueRef *img_stride0_vec,
+                            LLVMValueRef *img_stride1_vec)
+{
+   const unsigned mip_filter = bld->static_state->min_mip_filter;
+   LLVMValueRef ilevel0_vec, ilevel1_vec;
 
+   ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0);
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR)
+      ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1);
+
+   /*
+    * Compute width, height, depth at mipmap level 'ilevel0'
+    */
+   *width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec);
+   if (dims >= 2) {
+      *height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec);
+      *row_stride0_vec = lp_build_get_level_stride_vec(bld,
+                                                       row_stride_array,
+                                                       ilevel0);
+      if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         *img_stride0_vec = lp_build_get_level_stride_vec(bld,
+                                                          img_stride_array,
+                                                          ilevel0);
+         if (dims == 3) {
+            *depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec);
+         }
+      }
+   }
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      /* compute width, height, depth for second mipmap level at 'ilevel1' */
+      *width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec);
+      if (dims >= 2) {
+         *height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec);
+         *row_stride1_vec = lp_build_get_level_stride_vec(bld,
+                                                          row_stride_array,
+                                                          ilevel1);
+         if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
+            *img_stride1_vec = lp_build_get_level_stride_vec(bld,
+                                                             img_stride_array,
+                                                             ilevel1);
+            if (dims == 3) {
+               *depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec);
+            }
+         }
+      }
+   }
+}
+
+
+
+/** Helper used by lp_build_cube_lookup() */
+static LLVMValueRef
+lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
+{
+   /* ima = -0.5 / abs(coord); */
+   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5);
+   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
+   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
+   return ima;
+}
+
+
+/**
+ * Helper used by lp_build_cube_lookup()
+ * \param sign  scalar +1 or -1
+ * \param coord  float vector
+ * \param ima  float vector
+ */
+static LLVMValueRef
+lp_build_cube_coord(struct lp_build_context *coord_bld,
+                    LLVMValueRef sign, int negate_coord,
+                    LLVMValueRef coord, LLVMValueRef ima)
+{
+   /* return negate(coord) * ima * sign + 0.5; */
+   LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
+   LLVMValueRef res;
+
+   assert(negate_coord == +1 || negate_coord == -1);
+
+   if (negate_coord == -1) {
+      coord = lp_build_negate(coord_bld, coord);
+   }
+
+   res = lp_build_mul(coord_bld, coord, ima);
+   if (sign) {
+      sign = lp_build_broadcast_scalar(coord_bld, sign);
+      res = lp_build_mul(coord_bld, res, sign);
+   }
+   res = lp_build_add(coord_bld, res, half);
+
+   return res;
+}
+
+
+/** Helper used by lp_build_cube_lookup()
+ * Return (major_coord >= 0) ? pos_face : neg_face;
+ */
+static LLVMValueRef
+lp_build_cube_face(struct lp_build_sample_context *bld,
+                   LLVMValueRef major_coord,
+                   unsigned pos_face, unsigned neg_face)
+{
+   LLVMValueRef cmp = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
+                                    major_coord,
+                                    bld->float_bld.zero, "");
+   LLVMValueRef pos = LLVMConstInt(LLVMInt32Type(), pos_face, 0);
+   LLVMValueRef neg = LLVMConstInt(LLVMInt32Type(), neg_face, 0);
+   LLVMValueRef res = LLVMBuildSelect(bld->builder, cmp, pos, neg, "");
    return res;
 }
 
 
+
+/**
+ * Generate code to do cube face selection and compute per-face texcoords.
+ */
+void
+lp_build_cube_lookup(struct lp_build_sample_context *bld,
+                     LLVMValueRef s,
+                     LLVMValueRef t,
+                     LLVMValueRef r,
+                     LLVMValueRef *face,
+                     LLVMValueRef *face_s,
+                     LLVMValueRef *face_t)
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   LLVMValueRef rx, ry, rz;
+   LLVMValueRef arx, ary, arz;
+   LLVMValueRef c25 = LLVMConstReal(LLVMFloatType(), 0.25);
+   LLVMValueRef arx_ge_ary, arx_ge_arz;
+   LLVMValueRef ary_ge_arx, ary_ge_arz;
+   LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
+   LLVMValueRef rx_pos, ry_pos, rz_pos;
+
+   assert(bld->coord_bld.type.length == 4);
+
+   /*
+    * Use the average of the four pixels' texcoords to choose the face.
+    */
+   rx = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, s));
+   ry = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, t));
+   rz = lp_build_mul(float_bld, c25,
+                     lp_build_sum_vector(&bld->coord_bld, r));
+
+   arx = lp_build_abs(float_bld, rx);
+   ary = lp_build_abs(float_bld, ry);
+   arz = lp_build_abs(float_bld, rz);
+
+   /*
+    * Compare sign/magnitude of rx,ry,rz to determine face
+    */
+   arx_ge_ary = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, ary, "");
+   arx_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, arz, "");
+   ary_ge_arx = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arx, "");
+   ary_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arz, "");
+
+   arx_ge_ary_arz = LLVMBuildAnd(bld->builder, arx_ge_ary, arx_ge_arz, "");
+   ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
+
+   rx_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rx, float_bld->zero, "");
+   ry_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ry, float_bld->zero, "");
+   rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, "");
+
+   {
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+
+      flow_ctx = lp_build_flow_create(bld->builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      *face_s = bld->coord_bld.undef;
+      *face_t = bld->coord_bld.undef;
+      *face = bld->int_bld.undef;
+
+      lp_build_name(*face_s, "face_s");
+      lp_build_name(*face_t, "face_t");
+      lp_build_name(*face, "face");
+
+      lp_build_flow_scope_declare(flow_ctx, face_s);
+      lp_build_flow_scope_declare(flow_ctx, face_t);
+      lp_build_flow_scope_declare(flow_ctx, face);
+
+      lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz);
+      {
+         /* +/- X face */
+         LLVMValueRef sign = lp_build_sgn(float_bld, rx);
+         LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
+         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
+         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+         *face = lp_build_cube_face(bld, rx,
+                                    PIPE_TEX_FACE_POS_X,
+                                    PIPE_TEX_FACE_NEG_X);
+      }
+      lp_build_else(&if_ctx);
+      {
+         struct lp_build_flow_context *flow_ctx2;
+         struct lp_build_if_state if_ctx2;
+
+         LLVMValueRef face_s2 = bld->coord_bld.undef;
+         LLVMValueRef face_t2 = bld->coord_bld.undef;
+         LLVMValueRef face2 = bld->int_bld.undef;
+
+         flow_ctx2 = lp_build_flow_create(bld->builder);
+         lp_build_flow_scope_begin(flow_ctx2);
+         lp_build_flow_scope_declare(flow_ctx2, &face_s2);
+         lp_build_flow_scope_declare(flow_ctx2, &face_t2);
+         lp_build_flow_scope_declare(flow_ctx2, &face2);
+
+         ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
+
+         lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz);
+         {
+            /* +/- Y face */
+            LLVMValueRef sign = lp_build_sgn(float_bld, ry);
+            LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
+            face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
+            face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
+            face2 = lp_build_cube_face(bld, ry,
+                                       PIPE_TEX_FACE_POS_Y,
+                                       PIPE_TEX_FACE_NEG_Y);
+         }
+         lp_build_else(&if_ctx2);
+         {
+            /* +/- Z face */
+            LLVMValueRef sign = lp_build_sgn(float_bld, rz);
+            LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
+            face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
+            face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
+            face2 = lp_build_cube_face(bld, rz,
+                                       PIPE_TEX_FACE_POS_Z,
+                                       PIPE_TEX_FACE_NEG_Z);
+         }
+         lp_build_endif(&if_ctx2);
+         lp_build_flow_scope_end(flow_ctx2);
+         lp_build_flow_destroy(flow_ctx2);
+         *face_s = face_s2;
+         *face_t = face_t2;
+         *face = face2;
+      }
+
+      lp_build_endif(&if_ctx);
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+   }
+}
+
+
+/**
+ * Compute the partial offset of a pixel block along an arbitrary axis.
+ *
+ * @param coord   coordinate in pixels
+ * @param stride  number of bytes between rows of successive pixel blocks
+ * @param block_length  number of pixels in a pixel block along the coordinate
+ *                      axis
+ * @param out_offset    resulting relative offset of the pixel block in bytes
+ * @param out_subcoord  resulting sub-block pixel coordinate
+ */
+void
+lp_build_sample_partial_offset(struct lp_build_context *bld,
+                               unsigned block_length,
+                               LLVMValueRef coord,
+                               LLVMValueRef stride,
+                               LLVMValueRef *out_offset,
+                               LLVMValueRef *out_subcoord)
+{
+   LLVMValueRef offset;
+   LLVMValueRef subcoord;
+
+   if (block_length == 1) {
+      subcoord = bld->zero;
+   }
+   else {
+      /*
+       * Pixel blocks have power of two dimensions. LLVM should convert the
+       * rem/div to bit arithmetic.
+       * TODO: Verify this.
+       * It does indeed BUT it does transform it to scalar (and back) when doing so
+       * (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
+       * The generated code looks seriously unfunny and is quite expensive.
+       */
+#if 0
+      LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
+      subcoord = LLVMBuildURem(bld->builder, coord, block_width, "");
+      coord    = LLVMBuildUDiv(bld->builder, coord, block_width, "");
+#else
+      unsigned logbase2 = util_unsigned_logbase2(block_length);
+      LLVMValueRef block_shift = lp_build_const_int_vec(bld->type, logbase2);
+      LLVMValueRef block_mask = lp_build_const_int_vec(bld->type, block_length - 1);
+      subcoord = LLVMBuildAnd(bld->builder, coord, block_mask, "");
+      coord = LLVMBuildLShr(bld->builder, coord, block_shift, "");
+#endif
+   }
+
+   offset = lp_build_mul(bld, coord, stride);
+
+   assert(out_offset);
+   assert(out_subcoord);
+
+   *out_offset = offset;
+   *out_subcoord = subcoord;
+}
+
+
 /**
  * Compute the offset of a pixel block.
  *
- * x, y, z, y_stride, z_stride are vectors, and they refer to pixel blocks, as
- * per format description, and not individual pixels.
+ * x, y, z, y_stride, z_stride are vectors, and they refer to pixels.
+ *
+ * Returns the relative offset and i,j sub-block coordinates
  */
-LLVMValueRef
+void
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
                        LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef z_stride)
+                       LLVMValueRef z_stride,
+                       LLVMValueRef *out_offset,
+                       LLVMValueRef *out_i,
+                       LLVMValueRef *out_j)
 {
    LLVMValueRef x_stride;
    LLVMValueRef offset;
 
    x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8);
-   offset = lp_build_mul(bld, x, x_stride);
+
+   lp_build_sample_partial_offset(bld,
+                                  format_desc->block.width,
+                                  x, x_stride,
+                                  &offset, out_i);
 
    if (y && y_stride) {
-      LLVMValueRef y_offset = lp_build_mul(bld, y, y_stride);
+      LLVMValueRef y_offset;
+      lp_build_sample_partial_offset(bld,
+                                     format_desc->block.height,
+                                     y, y_stride,
+                                     &y_offset, out_j);
       offset = lp_build_add(bld, offset, y_offset);
    }
+   else {
+      *out_j = bld->zero;
+   }
 
    if (z && z_stride) {
-      LLVMValueRef z_offset = lp_build_mul(bld, z, z_stride);
+      LLVMValueRef z_offset;
+      LLVMValueRef k;
+      lp_build_sample_partial_offset(bld,
+                                     1, /* pixel blocks are always 2D */
+                                     z, z_stride,
+                                     &z_offset, &k);
       offset = lp_build_add(bld, offset, z_offset);
    }
 
-   return offset;
+   *out_offset = offset;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index 8ceb20473d5..4d2eeaa5eb4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -36,7 +36,12 @@
 #define LP_BLD_SAMPLE_H
 
 
+#include "pipe/p_format.h"
+#include "util/u_debug.h"
 #include "gallivm/lp_bld.h"
+#include "gallivm/lp_bld_type.h"
+#include "gallivm/lp_bld_swizzle.h"
+
 
 struct pipe_resource;
 struct pipe_sampler_view;
@@ -56,14 +61,14 @@ struct lp_sampler_static_state
 {
    /* pipe_sampler_view's state */
    enum pipe_format format;
-   unsigned swizzle_r:3;
+   unsigned swizzle_r:3;     /**< PIPE_SWIZZLE_* */
    unsigned swizzle_g:3;
    unsigned swizzle_b:3;
    unsigned swizzle_a:3;
 
    /* pipe_texture's state */
-   unsigned target:3;
-   unsigned pot_width:1;
+   unsigned target:3;        /**< PIPE_TEXTURE_* */
+   unsigned pot_width:1;     /**< is the width a power of two? */
    unsigned pot_height:1;
    unsigned pot_depth:1;
 
@@ -77,8 +82,7 @@ struct lp_sampler_static_state
    unsigned compare_mode:1;
    unsigned compare_func:3;
    unsigned normalized_coords:1;
-   float lod_bias, min_lod, max_lod;
-   float border_color[4];
+   unsigned min_max_lod_equal:1;  /**< min_lod == max_lod ? */
 };
 
 
@@ -95,49 +99,170 @@ struct lp_sampler_static_state
 struct lp_sampler_dynamic_state
 {
 
-   /** Obtain the base texture width. */
+   /** Obtain the base texture width (returns int32) */
    LLVMValueRef
-   (*width)( struct lp_sampler_dynamic_state *state,
+   (*width)( const struct lp_sampler_dynamic_state *state,
              LLVMBuilderRef builder,
              unsigned unit);
 
-   /** Obtain the base texture height. */
+   /** Obtain the base texture height (returns int32) */
    LLVMValueRef
-   (*height)( struct lp_sampler_dynamic_state *state,
+   (*height)( const struct lp_sampler_dynamic_state *state,
               LLVMBuilderRef builder,
               unsigned unit);
 
-   /** Obtain the base texture depth. */
+   /** Obtain the base texture depth (returns int32) */
    LLVMValueRef
-   (*depth)( struct lp_sampler_dynamic_state *state,
+   (*depth)( const struct lp_sampler_dynamic_state *state,
              LLVMBuilderRef builder,
              unsigned unit);
 
-   /** Obtain the number of mipmap levels (minus one). */
+   /** Obtain the number of mipmap levels minus one (returns int32) */
    LLVMValueRef
-   (*last_level)( struct lp_sampler_dynamic_state *state,
+   (*last_level)( const struct lp_sampler_dynamic_state *state,
                   LLVMBuilderRef builder,
                   unsigned unit);
 
+   /** Obtain stride in bytes between image rows/blocks (returns int32) */
    LLVMValueRef
-   (*row_stride)( struct lp_sampler_dynamic_state *state,
+   (*row_stride)( const struct lp_sampler_dynamic_state *state,
                   LLVMBuilderRef builder,
                   unsigned unit);
 
+   /** Obtain stride in bytes between image slices (returns int32) */
    LLVMValueRef
-   (*img_stride)( struct lp_sampler_dynamic_state *state,
+   (*img_stride)( const struct lp_sampler_dynamic_state *state,
                   LLVMBuilderRef builder,
                   unsigned unit);
 
+   /** Obtain pointer to array of pointers to mipmap levels */
    LLVMValueRef
-   (*data_ptr)( struct lp_sampler_dynamic_state *state,
+   (*data_ptr)( const struct lp_sampler_dynamic_state *state,
                 LLVMBuilderRef builder,
                 unsigned unit);
 
+   /** Obtain texture min lod (returns float) */
+   LLVMValueRef
+   (*min_lod)(const struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder, unsigned unit);
+
+   /** Obtain texture max lod (returns float) */
+   LLVMValueRef
+   (*max_lod)(const struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder, unsigned unit);
+
+   /** Obtain texture lod bias (returns float) */
+   LLVMValueRef
+   (*lod_bias)(const struct lp_sampler_dynamic_state *state,
+               LLVMBuilderRef builder, unsigned unit);
+
+   /** Obtain texture border color (returns ptr to float[4]) */
+   LLVMValueRef
+   (*border_color)(const struct lp_sampler_dynamic_state *state,
+                   LLVMBuilderRef builder, unsigned unit);
 };
 
 
 /**
+ * Keep all information for sampling code generation in a single place.
+ */
+struct lp_build_sample_context
+{
+   LLVMBuilderRef builder;
+
+   const struct lp_sampler_static_state *static_state;
+
+   struct lp_sampler_dynamic_state *dynamic_state;
+
+   const struct util_format_description *format_desc;
+
+   /** regular scalar float type */
+   struct lp_type float_type;
+   struct lp_build_context float_bld;
+
+   /** float vector type */
+   struct lp_build_context float_vec_bld;
+
+   /** regular scalar int type */
+   struct lp_type int_type;
+   struct lp_build_context int_bld;
+
+   /** Incoming coordinates type and build context */
+   struct lp_type coord_type;
+   struct lp_build_context coord_bld;
+
+   /** Unsigned integer coordinates */
+   struct lp_type uint_coord_type;
+   struct lp_build_context uint_coord_bld;
+
+   /** Signed integer coordinates */
+   struct lp_type int_coord_type;
+   struct lp_build_context int_coord_bld;
+
+   /** Output texels type and build context */
+   struct lp_type texel_type;
+   struct lp_build_context texel_bld;
+};
+
+
+
+/**
+ * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at
+ * this time.  Return whether the given mode is supported by that function.
+ */
+static INLINE boolean
+lp_is_simple_wrap_mode(unsigned mode)
+{
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return TRUE;
+   default:
+      return FALSE;
+   }
+}
+
+
+static INLINE void
+apply_sampler_swizzle(struct lp_build_sample_context *bld,
+                      LLVMValueRef *texel)
+{
+   unsigned char swizzles[4];
+
+   swizzles[0] = bld->static_state->swizzle_r;
+   swizzles[1] = bld->static_state->swizzle_g;
+   swizzles[2] = bld->static_state->swizzle_b;
+   swizzles[3] = bld->static_state->swizzle_a;
+
+   lp_build_swizzle_soa_inplace(&bld->texel_bld, texel, swizzles);
+}
+
+
+static INLINE int
+texture_dims(enum pipe_texture_target tex)
+{
+   switch (tex) {
+   case PIPE_TEXTURE_1D:
+      return 1;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_CUBE:
+      return 2;
+   case PIPE_TEXTURE_3D:
+      return 3;
+   default:
+      assert(0 && "bad texture target in texture_dims()");
+      return 2;
+   }
+}
+
+
+boolean
+lp_sampler_wrap_mode_uses_border_color(unsigned mode,
+                                       unsigned min_img_filter,
+                                       unsigned mag_img_filter);
+
+/**
  * Derive the sampler static state.
  */
 void
@@ -147,22 +272,91 @@ lp_sampler_static_state(struct lp_sampler_static_state *state,
 
 
 LLVMValueRef
-lp_build_gather(LLVMBuilderRef builder,
-                unsigned length,
-                unsigned src_width,
-                unsigned dst_width,
-                LLVMValueRef base_ptr,
-                LLVMValueRef offsets);
+lp_build_lod_selector(struct lp_build_sample_context *bld,
+                      unsigned unit,
+                      const LLVMValueRef ddx[4],
+                      const LLVMValueRef ddy[4],
+                      LLVMValueRef lod_bias, /* optional */
+                      LLVMValueRef explicit_lod, /* optional */
+                      LLVMValueRef width,
+                      LLVMValueRef height,
+                      LLVMValueRef depth);
+
+void
+lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level_out);
+
+void
+lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
+                           unsigned unit,
+                           LLVMValueRef lod,
+                           LLVMValueRef *level0_out,
+                           LLVMValueRef *level1_out,
+                           LLVMValueRef *weight_out);
 
+LLVMValueRef
+lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
+                          LLVMValueRef data_array, LLVMValueRef level);
 
 LLVMValueRef
+lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
+                                LLVMValueRef data_array, int level);
+
+
+void
+lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld,
+                            unsigned dims,
+                            LLVMValueRef width_vec,
+                            LLVMValueRef height_vec,
+                            LLVMValueRef depth_vec,
+                            LLVMValueRef ilevel0,
+                            LLVMValueRef ilevel1,
+                            LLVMValueRef row_stride_array,
+                            LLVMValueRef img_stride_array,
+                            LLVMValueRef *width0_vec,
+                            LLVMValueRef *width1_vec,
+                            LLVMValueRef *height0_vec,
+                            LLVMValueRef *height1_vec,
+                            LLVMValueRef *depth0_vec,
+                            LLVMValueRef *depth1_vec,
+                            LLVMValueRef *row_stride0_vec,
+                            LLVMValueRef *row_stride1_vec,
+                            LLVMValueRef *img_stride0_vec,
+                            LLVMValueRef *img_stride1_vec);
+
+
+void
+lp_build_cube_lookup(struct lp_build_sample_context *bld,
+                     LLVMValueRef s,
+                     LLVMValueRef t,
+                     LLVMValueRef r,
+                     LLVMValueRef *face,
+                     LLVMValueRef *face_s,
+                     LLVMValueRef *face_t);
+
+
+void
+lp_build_sample_partial_offset(struct lp_build_context *bld,
+                               unsigned block_length,
+                               LLVMValueRef coord,
+                               LLVMValueRef stride,
+                               LLVMValueRef *out_offset,
+                               LLVMValueRef *out_i);
+
+
+void
 lp_build_sample_offset(struct lp_build_context *bld,
                        const struct util_format_description *format_desc,
                        LLVMValueRef x,
                        LLVMValueRef y,
                        LLVMValueRef z,
                        LLVMValueRef y_stride,
-                       LLVMValueRef z_stride);
+                       LLVMValueRef z_stride,
+                       LLVMValueRef *out_offset,
+                       LLVMValueRef *out_i,
+                       LLVMValueRef *out_j);
 
 
 void
@@ -173,9 +367,15 @@ lp_build_sample_soa(LLVMBuilderRef builder,
                     unsigned unit,
                     unsigned num_coords,
                     const LLVMValueRef *coords,
-                    LLVMValueRef lodbias,
-                    LLVMValueRef *texel);
+                    const LLVMValueRef *ddx,
+                    const LLVMValueRef *ddy,
+                    LLVMValueRef lod_bias,
+                    LLVMValueRef explicit_lod,
+                    LLVMValueRef texel_out[4]);
 
+void
+lp_build_sample_nop(struct lp_type type,
+                    LLVMValueRef texel_out[4]);
 
 
 #endif /* LP_BLD_SAMPLE_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
new file mode 100644
index 00000000000..49a6eed615f
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -0,0 +1,1119 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling -- AoS.
+ *
+ * @author Jose Fonseca <[email protected]>
+ * @author Brian Paul <[email protected]>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "util/u_dump.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_gather.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+#include "lp_bld_sample_aos.h"
+#include "lp_bld_quad.h"
+
+
+/**
+ * Build LLVM code for texture coord wrapping, for nearest filtering,
+ * for scaled integer texcoords.
+ * \param block_length  is the length of the pixel block along the
+ *                      coordinate axis
+ * \param coord  the incoming texcoord (s,t,r or q) scaled to the texture size
+ * \param length  the texture size along one dimension
+ * \param stride  pixel stride along the coordinate axis (in bytes)
+ * \param is_pot  if TRUE, length is a power of two
+ * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param out_offset  byte offset for the wrapped coordinate
+ * \param out_i  resulting sub-block pixel coordinate for coord
+ */
+static void
+lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
+                                 unsigned block_length,
+                                 LLVMValueRef coord,
+                                 LLVMValueRef length,
+                                 LLVMValueRef stride,
+                                 boolean is_pot,
+                                 unsigned wrap_mode,
+                                 LLVMValueRef *out_offset,
+                                 LLVMValueRef *out_i)
+{
+   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef length_minus_one;
+
+   length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if(is_pot)
+         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
+      else {
+         /* Add a bias to the texcoord to handle negative coords */
+         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         coord = LLVMBuildAdd(bld->builder, coord, bias, "");
+         coord = LLVMBuildURem(bld->builder, coord, length, "");
+      }
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
+      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+   default:
+      assert(0);
+   }
+
+   lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride,
+                                  out_offset, out_i);
+}
+
+
+/**
+ * Build LLVM code for texture coord wrapping, for linear filtering,
+ * for scaled integer texcoords.
+ * \param block_length  is the length of the pixel block along the
+ *                      coordinate axis
+ * \param coord0  the incoming texcoord (s,t,r or q) scaled to the texture size
+ * \param length  the texture size along one dimension
+ * \param stride  pixel stride along the coordinate axis (in bytes)
+ * \param is_pot  if TRUE, length is a power of two
+ * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \param offset0  resulting relative offset for coord0
+ * \param offset1  resulting relative offset for coord0 + 1
+ * \param i0  resulting sub-block pixel coordinate for coord0
+ * \param i1  resulting sub-block pixel coordinate for coord0 + 1
+ */
+static void
+lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
+                                unsigned block_length,
+                                LLVMValueRef coord0,
+                                LLVMValueRef length,
+                                LLVMValueRef stride,
+                                boolean is_pot,
+                                unsigned wrap_mode,
+                                LLVMValueRef *offset0,
+                                LLVMValueRef *offset1,
+                                LLVMValueRef *i0,
+                                LLVMValueRef *i1)
+{
+   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef length_minus_one;
+   LLVMValueRef lmask, umask, mask;
+
+   if (block_length != 1) {
+      /*
+       * If the pixel block covers more than one pixel then there is no easy
+       * way to calculate offset1 relative to offset0. Instead, compute them
+       * independently.
+       */
+
+      LLVMValueRef coord1;
+
+      lp_build_sample_wrap_nearest_int(bld,
+                                       block_length,
+                                       coord0,
+                                       length,
+                                       stride,
+                                       is_pot,
+                                       wrap_mode,
+                                       offset0, i0);
+
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+
+      lp_build_sample_wrap_nearest_int(bld,
+                                       block_length,
+                                       coord1,
+                                       length,
+                                       stride,
+                                       is_pot,
+                                       wrap_mode,
+                                       offset1, i1);
+
+      return;
+   }
+
+   /*
+    * Scalar pixels -- try to compute offset0 and offset1 with a single stride
+    * multiplication.
+    */
+
+   *i0 = uint_coord_bld->zero;
+   *i1 = uint_coord_bld->zero;
+
+   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if (is_pot) {
+         coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
+      }
+      else {
+         /* Add a bias to the texcoord to handle negative coords */
+         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         coord0 = LLVMBuildAdd(bld->builder, coord0, bias, "");
+         coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
+      }
+
+      mask = lp_build_compare(bld->builder, int_coord_bld->type,
+                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
+
+      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset1 = LLVMBuildAnd(bld->builder,
+                              lp_build_add(uint_coord_bld, *offset0, stride),
+                              mask, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
+                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
+      umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type,
+                               PIPE_FUNC_LESS, coord0, length_minus_one);
+
+      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
+      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);
+
+      mask = LLVMBuildAnd(bld->builder, lmask, umask, "");
+
+      *offset0 = lp_build_mul(uint_coord_bld, coord0, stride);
+      *offset1 = lp_build_add(uint_coord_bld,
+                              *offset0,
+                              LLVMBuildAnd(bld->builder, stride, mask, ""));
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+   default:
+      assert(0);
+      *offset0 = uint_coord_bld->zero;
+      *offset1 = uint_coord_bld->zero;
+      break;
+   }
+}
+
+
+/**
+ * Sample a single texture image with nearest sampling.
+ * If sampling a cube texture, r = cube face in [0,5].
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
+                              LLVMValueRef width_vec,
+                              LLVMValueRef height_vec,
+                              LLVMValueRef depth_vec,
+                              LLVMValueRef row_stride_vec,
+                              LLVMValueRef img_stride_vec,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef r,
+                              LLVMValueRef *colors_lo,
+                              LLVMValueRef *colors_hi)
+{
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMBuilderRef builder = bld->builder;
+   struct lp_build_context i32, h16, u8n;
+   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+   LLVMValueRef i32_c8;
+   LLVMValueRef s_ipart, t_ipart, r_ipart;
+   LLVMValueRef x_stride;
+   LLVMValueRef x_offset, offset;
+   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;
+
+   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
+   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
+   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
+
+   i32_vec_type = lp_build_vec_type(i32.type);
+   h16_vec_type = lp_build_vec_type(h16.type);
+   u8n_vec_type = lp_build_vec_type(u8n.type);
+
+   if (bld->static_state->normalized_coords) {
+      /* s = s * width, t = t * height */
+      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
+      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
+                                              coord_vec_type, "");
+      s = lp_build_mul(&bld->coord_bld, s, fp_width);
+      if (dims >= 2) {
+         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
+                                                  coord_vec_type, "");
+         t = lp_build_mul(&bld->coord_bld, t, fp_height);
+         if (dims >= 3) {
+            LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
+                                                    coord_vec_type, "");
+            r = lp_build_mul(&bld->coord_bld, r, fp_depth);
+         }
+      }
+   }
+
+   /* scale coords by 256 (8 fractional bits) */
+   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+   if (dims >= 2)
+      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+   if (dims >= 3)
+      r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+
+   /* convert float to int */
+   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
+   if (dims >= 2)
+      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
+   if (dims >= 3)
+      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");
+
+   /* compute floor (shift right 8) */
+   i32_c8 = lp_build_const_int_vec(i32.type, 8);
+   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
+   if (dims >= 2)
+      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
+   if (dims >= 3)
+      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
+
+   /* get pixel, row, image strides */
+   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+                                 bld->format_desc->block.bits/8);
+
+   /* Do texcoord wrapping, compute texel offset */
+   lp_build_sample_wrap_nearest_int(bld,
+                                    bld->format_desc->block.width,
+                                    s_ipart, width_vec, x_stride,
+                                    bld->static_state->pot_width,
+                                    bld->static_state->wrap_s,
+                                    &x_offset, &x_subcoord);
+   offset = x_offset;
+   if (dims >= 2) {
+      LLVMValueRef y_offset;
+      lp_build_sample_wrap_nearest_int(bld,
+                                       bld->format_desc->block.height,
+                                       t_ipart, height_vec, row_stride_vec,
+                                       bld->static_state->pot_height,
+                                       bld->static_state->wrap_t,
+                                       &y_offset, &y_subcoord);
+      offset = lp_build_add(&bld->uint_coord_bld, offset, y_offset);
+      if (dims >= 3) {
+         LLVMValueRef z_offset;
+         lp_build_sample_wrap_nearest_int(bld,
+                                          1, /* block length (depth) */
+                                          r_ipart, depth_vec, img_stride_vec,
+                                          bld->static_state->pot_depth,
+                                          bld->static_state->wrap_r,
+                                          &z_offset, &z_subcoord);
+         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
+      }
+      else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         LLVMValueRef z_offset;
+         /* The r coord is the cube face in [0,5] */
+         z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
+         offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset);
+      }
+   }
+
+   /*
+    * Fetch the pixels as 4 x 32bit (rgba order might differ):
+    *
+    *   rgba0 rgba1 rgba2 rgba3
+    *
+    * bit cast them into 16 x u8
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * unpack them into two 8 x i16:
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1
+    *   r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * The higher 8 bits of the resulting elements will be zero.
+    */
+   {
+      LLVMValueRef rgba8;
+
+      if (util_format_is_rgba8_variant(bld->format_desc)) {
+         /*
+          * Given the format is a rgba8, just read the pixels as is,
+          * without any swizzling. Swizzling will be done later.
+          */
+         rgba8 = lp_build_gather(bld->builder,
+                                 bld->texel_type.length,
+                                 bld->format_desc->block.bits,
+                                 bld->texel_type.width,
+                                 data_ptr, offset);
+
+         rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+      }
+      else {
+         rgba8 = lp_build_fetch_rgba_aos(bld->builder,
+                                         bld->format_desc,
+                                         u8n.type,
+                                         data_ptr, offset,
+                                         x_subcoord,
+                                         y_subcoord);
+      }
+
+      /* Expand one 4*rgba8 to two 2*rgba16 */
+      lp_build_unpack2(builder, u8n.type, h16.type,
+                       rgba8,
+                       colors_lo, colors_hi);
+   }
+}
+
+
+/**
+ * Sample a single texture image with (bi-)(tri-)linear sampling.
+ * Return filtered color as two vectors of 16-bit fixed point values.
+ */
+static void
+lp_build_sample_image_linear(struct lp_build_sample_context *bld,
+                             LLVMValueRef width_vec,
+                             LLVMValueRef height_vec,
+                             LLVMValueRef depth_vec,
+                             LLVMValueRef row_stride_vec,
+                             LLVMValueRef img_stride_vec,
+                             LLVMValueRef data_ptr,
+                             LLVMValueRef s,
+                             LLVMValueRef t,
+                             LLVMValueRef r,
+                             LLVMValueRef *colors_lo,
+                             LLVMValueRef *colors_hi)
+{
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMBuilderRef builder = bld->builder;
+   struct lp_build_context i32, h16, u8n;
+   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+   LLVMValueRef i32_c8, i32_c128, i32_c255;
+   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
+   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
+   LLVMValueRef r_ipart, r_fpart, r_fpart_lo, r_fpart_hi;
+   LLVMValueRef x_stride, y_stride, z_stride;
+   LLVMValueRef x_offset0, x_offset1;
+   LLVMValueRef y_offset0, y_offset1;
+   LLVMValueRef z_offset0, z_offset1;
+   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
+   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
+   LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */
+   LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */
+   LLVMValueRef packed_lo, packed_hi;
+   unsigned x, y, z;
+   unsigned i, j, k;
+   unsigned numj, numk;
+
+   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
+   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
+   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
+
+   i32_vec_type = lp_build_vec_type(i32.type);
+   h16_vec_type = lp_build_vec_type(h16.type);
+   u8n_vec_type = lp_build_vec_type(u8n.type);
+
+   if (bld->static_state->normalized_coords) {
+      /* s = s * width, t = t * height */
+      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
+      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec,
+                                              coord_vec_type, "");
+      s = lp_build_mul(&bld->coord_bld, s, fp_width);
+      if (dims >= 2) {
+         LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec,
+                                                  coord_vec_type, "");
+         t = lp_build_mul(&bld->coord_bld, t, fp_height);
+      }
+      if (dims >= 3) {
+         LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec,
+                                                 coord_vec_type, "");
+         r = lp_build_mul(&bld->coord_bld, r, fp_depth);
+      }
+   }
+
+   /* scale coords by 256 (8 fractional bits) */
+   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+   if (dims >= 2)
+      t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+   if (dims >= 3)
+      r = lp_build_mul_imm(&bld->coord_bld, r, 256);
+
+   /* convert float to int */
+   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
+   if (dims >= 2)
+      t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
+   if (dims >= 3)
+      r = LLVMBuildFPToSI(builder, r, i32_vec_type, "");
+
+   /* subtract 0.5 (add -128) */
+   i32_c128 = lp_build_const_int_vec(i32.type, -128);
+   s = LLVMBuildAdd(builder, s, i32_c128, "");
+   if (dims >= 2) {
+      t = LLVMBuildAdd(builder, t, i32_c128, "");
+   }
+   if (dims >= 3) {
+      r = LLVMBuildAdd(builder, r, i32_c128, "");
+   }
+
+   /* compute floor (shift right 8) */
+   i32_c8 = lp_build_const_int_vec(i32.type, 8);
+   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
+   if (dims >= 2)
+      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
+   if (dims >= 3)
+      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");
+
+   /* compute fractional part (AND with 0xff) */
+   i32_c255 = lp_build_const_int_vec(i32.type, 255);
+   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
+   if (dims >= 2)
+      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
+   if (dims >= 3)
+      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");
+
+   /* get pixel, row and image strides */
+   x_stride = lp_build_const_vec(bld->uint_coord_bld.type,
+                                 bld->format_desc->block.bits/8);
+   y_stride = row_stride_vec;
+   z_stride = img_stride_vec;
+
+   /* do texcoord wrapping and compute texel offsets */
+   lp_build_sample_wrap_linear_int(bld,
+                                   bld->format_desc->block.width,
+                                   s_ipart, width_vec, x_stride,
+                                   bld->static_state->pot_width,
+                                   bld->static_state->wrap_s,
+                                   &x_offset0, &x_offset1,
+                                   &x_subcoord[0], &x_subcoord[1]);
+   for (z = 0; z < 2; z++) {
+      for (y = 0; y < 2; y++) {
+         offset[z][y][0] = x_offset0;
+         offset[z][y][1] = x_offset1;
+      }
+   }
+
+   if (dims >= 2) {
+      lp_build_sample_wrap_linear_int(bld,
+                                      bld->format_desc->block.height,
+                                      t_ipart, height_vec, y_stride,
+                                      bld->static_state->pot_height,
+                                      bld->static_state->wrap_t,
+                                      &y_offset0, &y_offset1,
+                                      &y_subcoord[0], &y_subcoord[1]);
+
+      for (z = 0; z < 2; z++) {
+         for (x = 0; x < 2; x++) {
+            offset[z][0][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[z][0][x], y_offset0);
+            offset[z][1][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[z][1][x], y_offset1);
+         }
+      }
+   }
+
+   if (dims >= 3) {
+      lp_build_sample_wrap_linear_int(bld,
+                                      bld->format_desc->block.height,
+                                      r_ipart, depth_vec, z_stride,
+                                      bld->static_state->pot_depth,
+                                      bld->static_state->wrap_r,
+                                      &z_offset0, &z_offset1,
+                                      &z_subcoord[0], &z_subcoord[1]);
+      for (y = 0; y < 2; y++) {
+         for (x = 0; x < 2; x++) {
+            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[0][y][x], z_offset0);
+            offset[1][y][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[1][y][x], z_offset1);
+         }
+      }
+   }
+   else if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef z_offset;
+      z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec);
+      for (y = 0; y < 2; y++) {
+         for (x = 0; x < 2; x++) {
+            /* The r coord is the cube face in [0,5] */
+            offset[0][y][x] = lp_build_add(&bld->uint_coord_bld,
+                                           offset[0][y][x], z_offset);
+         }
+      }
+   }
+
+   /*
+    * Transform 4 x i32 in
+    *
+    *   s_fpart = {s0, s1, s2, s3}
+    *
+    * into 8 x i16
+    *
+    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+    *
+    * into two 8 x i16
+    *
+    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
+    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+    *
+    * and likewise for t_fpart and r_fpart. There is no risk of losing precision here
+    * since the fractional parts only use the lower 8bits.
+    */
+   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+   if (dims >= 2)
+      t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+   if (dims >= 3)
+      r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, "");
+
+   {
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffle_lo;
+      LLVMValueRef shuffle_hi;
+
+      for (j = 0; j < h16.type.length; j += 4) {
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+         unsigned subindex = 0;
+#else
+         unsigned subindex = 1;
+#endif
+         LLVMValueRef index;
+
+         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
+         for (i = 0; i < 4; ++i)
+            shuffles_lo[j + i] = index;
+
+         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+         for (i = 0; i < 4; ++i)
+            shuffles_hi[j + i] = index;
+      }
+
+      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
+      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+
+      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+                                          shuffle_lo, "");
+      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef,
+                                          shuffle_hi, "");
+      if (dims >= 2) {
+         t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+                                             shuffle_lo, "");
+         t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef,
+                                             shuffle_hi, "");
+      }
+      if (dims >= 3) {
+         r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+                                             shuffle_lo, "");
+         r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef,
+                                             shuffle_hi, "");
+      }
+   }
+
+   /*
+    * Fetch the pixels as 4 x 32bit (rgba order might differ):
+    *
+    *   rgba0 rgba1 rgba2 rgba3
+    *
+    * bit cast them into 16 x u8
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * unpack them into two 8 x i16:
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1
+    *   r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * The higher 8 bits of the resulting elements will be zero.
+    */
+   numj = 1 + (dims >= 2);
+   numk = 1 + (dims >= 3);
+
+   for (k = 0; k < numk; k++) {
+      for (j = 0; j < numj; j++) {
+         for (i = 0; i < 2; i++) {
+            LLVMValueRef rgba8;
+
+            if (util_format_is_rgba8_variant(bld->format_desc)) {
+               /*
+                * Given the format is a rgba8, just read the pixels as is,
+                * without any swizzling. Swizzling will be done later.
+                */
+               rgba8 = lp_build_gather(bld->builder,
+                                       bld->texel_type.length,
+                                       bld->format_desc->block.bits,
+                                       bld->texel_type.width,
+                                       data_ptr, offset[k][j][i]);
+
+               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
+            }
+            else {
+               rgba8 = lp_build_fetch_rgba_aos(bld->builder,
+                                               bld->format_desc,
+                                               u8n.type,
+                                               data_ptr, offset[k][j][i],
+                                               x_subcoord[i],
+                                               y_subcoord[j]);
+            }
+
+            /* Expand one 4*rgba8 to two 2*rgba16 */
+            lp_build_unpack2(builder, u8n.type, h16.type,
+                             rgba8,
+                             &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]);
+         }
+      }
+   }
+
+   /*
+    * Linear interpolation with 8.8 fixed point.
+    */
+   if (dims == 1) {
+      /* 1-D lerp */
+      packed_lo = lp_build_lerp(&h16,
+				s_fpart_lo,
+				neighbors_lo[0][0][0],
+				neighbors_lo[0][0][1]);
+
+      packed_hi = lp_build_lerp(&h16,
+				s_fpart_hi,
+				neighbors_hi[0][0][0],
+				neighbors_hi[0][0][1]);
+   }
+   else {
+      /* 2-D lerp */
+      packed_lo = lp_build_lerp_2d(&h16,
+				   s_fpart_lo, t_fpart_lo,
+				   neighbors_lo[0][0][0],
+				   neighbors_lo[0][0][1],
+				   neighbors_lo[0][1][0],
+				   neighbors_lo[0][1][1]);
+
+      packed_hi = lp_build_lerp_2d(&h16,
+				   s_fpart_hi, t_fpart_hi,
+				   neighbors_hi[0][0][0],
+				   neighbors_hi[0][0][1],
+				   neighbors_hi[0][1][0],
+				   neighbors_hi[0][1][1]);
+
+      if (dims >= 3) {
+	 LLVMValueRef packed_lo2, packed_hi2;
+
+	 /* lerp in the second z slice */
+	 packed_lo2 = lp_build_lerp_2d(&h16,
+				       s_fpart_lo, t_fpart_lo,
+				       neighbors_lo[1][0][0],
+				       neighbors_lo[1][0][1],
+				       neighbors_lo[1][1][0],
+				       neighbors_lo[1][1][1]);
+
+	 packed_hi2 = lp_build_lerp_2d(&h16,
+				       s_fpart_hi, t_fpart_hi,
+				       neighbors_hi[1][0][0],
+				       neighbors_hi[1][0][1],
+				       neighbors_hi[1][1][0],
+				       neighbors_hi[1][1][1]);
+	 /* interp between two z slices */
+	 packed_lo = lp_build_lerp(&h16, r_fpart_lo,
+				   packed_lo, packed_lo2);
+	 packed_hi = lp_build_lerp(&h16, r_fpart_hi,
+				   packed_hi, packed_hi2);
+      }
+   }
+
+   *colors_lo = packed_lo;
+   *colors_hi = packed_hi;
+}
+
+
+/**
+ * Sample one or two mipmap levels with the given image filter; for
+ * PIPE_TEX_MIPFILTER_LINEAR the two results are blended by lod_fpart.
+ * The '0' args (data_ptr0, width0_vec, ...) describe the first level, the '1'
+ * args the second (only read for linear mip filtering).  Result: colors_lo/hi.
+ */
+static void
+lp_build_sample_mipmap(struct lp_build_sample_context *bld,
+                       unsigned img_filter,
+                       unsigned mip_filter,
+                       LLVMValueRef s,
+                       LLVMValueRef t,
+                       LLVMValueRef r,
+                       LLVMValueRef lod_fpart,
+                       LLVMValueRef width0_vec,
+                       LLVMValueRef width1_vec,
+                       LLVMValueRef height0_vec,
+                       LLVMValueRef height1_vec,
+                       LLVMValueRef depth0_vec,
+                       LLVMValueRef depth1_vec,
+                       LLVMValueRef row_stride0_vec,
+                       LLVMValueRef row_stride1_vec,
+                       LLVMValueRef img_stride0_vec,
+                       LLVMValueRef img_stride1_vec,
+                       LLVMValueRef data_ptr0,
+                       LLVMValueRef data_ptr1,
+                       LLVMValueRef *colors_lo,
+                       LLVMValueRef *colors_hi)
+{
+   const int blend_levels = (mip_filter == PIPE_TEX_MIPFILTER_LINEAR);
+   LLVMValueRef level0_lo, level0_hi;
+   LLVMValueRef level1_lo, level1_hi;
+
+   /* Image filtering: level 0 is always sampled, level 1 only if blending. */
+   switch (img_filter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      lp_build_sample_image_nearest(bld,
+                                    width0_vec, height0_vec, depth0_vec,
+                                    row_stride0_vec, img_stride0_vec,
+                                    data_ptr0, s, t, r,
+                                    &level0_lo, &level0_hi);
+      if (blend_levels) {
+         lp_build_sample_image_nearest(bld,
+                                       width1_vec, height1_vec, depth1_vec,
+                                       row_stride1_vec, img_stride1_vec,
+                                       data_ptr1, s, t, r,
+                                       &level1_lo, &level1_hi);
+      }
+      break;
+
+   default:
+      /* the only other supported image filter is LINEAR */
+      assert(img_filter == PIPE_TEX_FILTER_LINEAR);
+      lp_build_sample_image_linear(bld,
+                                   width0_vec, height0_vec, depth0_vec,
+                                   row_stride0_vec, img_stride0_vec,
+                                   data_ptr0, s, t, r,
+                                   &level0_lo, &level0_hi);
+      if (blend_levels) {
+         lp_build_sample_image_linear(bld,
+                                      width1_vec, height1_vec, depth1_vec,
+                                      row_stride1_vec, img_stride1_vec,
+                                      data_ptr1, s, t, r,
+                                      &level1_lo, &level1_hi);
+      }
+      break;
+   }
+
+   if (!blend_levels) {
+      /* single mipmap level: hand its colors straight back */
+      *colors_lo = level0_lo;
+      *colors_hi = level0_hi;
+      return;
+   }
+
+   /* Mip filtering: blend the two levels' colors, weighted by lod_fpart. */
+   {
+      /* 16-bit unsigned fixed-point build context used for the lerp */
+      struct lp_build_context h16;
+      lp_build_context_init(&h16, bld->builder, lp_type_ufixed(16));
+
+      *colors_lo = lp_build_lerp(&h16, lod_fpart, level0_lo, level1_lo);
+      *colors_hi = lp_build_lerp(&h16, lod_fpart, level0_hi, level1_hi);
+   }
+}
+
+
+
+/**
+ * Texture sampling in AoS format.  Used when sampling common 32-bit/texel
+ * formats.  1D/2D/3D/cube texture supported.  All mipmap sampling modes
+ * but only limited texture coord wrap modes.
+ *
+ * Picks the mipmap level(s) from the lod, filters the texels as two packed
+ * 16-bit fixed-point halves, packs those to 8 bits, converts to SoA and
+ * applies the swizzles to produce texel_out[].
+ */
+void
+lp_build_sample_aos(struct lp_build_sample_context *bld,
+                    unsigned unit, /* forwarded to the lod/mip level helpers */
+                    LLVMValueRef s, /* s, t, r: texture coordinates */
+                    LLVMValueRef t,
+                    LLVMValueRef r,
+                    const LLVMValueRef *ddx, /* ddx, ddy: for lod selection */
+                    const LLVMValueRef *ddy,
+                    LLVMValueRef lod_bias, /* optional */
+                    LLVMValueRef explicit_lod, /* optional */
+                    LLVMValueRef width, /* width, height, depth: for lod selection */
+                    LLVMValueRef height,
+                    LLVMValueRef depth,
+                    LLVMValueRef width_vec, /* *_vec: base for the mipmap level sizes */
+                    LLVMValueRef height_vec,
+                    LLVMValueRef depth_vec,
+                    LLVMValueRef row_stride_array, /* strides and data: */
+                    LLVMValueRef img_stride_array, /* looked up per mipmap level */
+                    LLVMValueRef data_array,
+                    LLVMValueRef texel_out[4]) /* out: the four texel channels */
+{
+   struct lp_build_context *float_bld = &bld->float_bld;
+   LLVMBuilderRef builder = bld->builder;
+   const unsigned mip_filter = bld->static_state->min_mip_filter;
+   const unsigned min_filter = bld->static_state->min_img_filter;
+   const unsigned mag_filter = bld->static_state->mag_img_filter;
+   const int dims = texture_dims(bld->static_state->target);
+   LLVMValueRef lod = NULL, lod_fpart = NULL;
+   LLVMValueRef ilevel0, ilevel1 = NULL;
+   LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
+   LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
+   LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
+   LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
+   LLVMValueRef data_ptr0, data_ptr1 = NULL;
+   LLVMValueRef packed, packed_lo, packed_hi;
+   LLVMValueRef unswizzled[4];
+   LLVMValueRef face_ddx[4], face_ddy[4];
+   struct lp_build_context h16;
+   LLVMTypeRef h16_vec_type;
+
+   /* we only support the common/simple wrap modes at this time */
+   assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s));
+   if (dims >= 2)
+      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_t));
+   if (dims >= 3)
+      assert(lp_is_simple_wrap_mode(bld->static_state->wrap_r));
+
+   /* make 16-bit fixed-pt builder context */
+   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
+   h16_vec_type = lp_build_vec_type(h16.type);
+
+   /* cube face selection, compute per-face coords, etc. */
+   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef face, face_s, face_t;
+      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
+      s = face_s; /* vec */
+      t = face_t; /* vec */
+      /* use 'r' to indicate cube face */
+      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+
+      /* recompute ddx, ddy using the new (s,t) face texcoords */
+      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[2] = NULL;
+      face_ddx[3] = NULL;
+      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[2] = NULL;
+      face_ddy[3] = NULL;
+      ddx = face_ddx;
+      ddy = face_ddy;
+   }
+
+   /*
+    * Compute the level of detail (float).
+    */
+   if (min_filter != mag_filter ||
+       mip_filter != PIPE_TEX_MIPFILTER_NONE) {
+      /* Need to compute lod either to choose mipmap levels or to
+       * distinguish between minification/magnification with one mipmap level.
+       */
+      lod = lp_build_lod_selector(bld, unit, ddx, ddy,
+                                  lod_bias, explicit_lod,
+                                  width, height, depth);
+   }
+
+   /*
+    * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
+    * If mipfilter=linear, also compute the weight between the two
+    * mipmap levels: lod_fpart
+    */
+   switch (mip_filter) {
+   default:
+      assert(0 && "bad mip_filter value in lp_build_sample_aos()");
+      /* fall-through */
+   case PIPE_TEX_MIPFILTER_NONE:
+      /* always use mip level 0 */
+      if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         /* XXX this is a work-around for an apparent bug in LLVM 2.7.
+          * We should be able to set ilevel0 = const(0) but that causes
+          * bad x86 code to be emitted.  Keep 'lod' intact for the min/mag test.
+          */
+         LLVMValueRef lod_zero = lp_build_const_elem(bld->coord_bld.type, 0.0);
+         lp_build_nearest_mip_level(bld, unit, lod_zero, &ilevel0);
+      }
+      else {
+         ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      }
+      break;
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      assert(lod);
+      lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+      break;
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      {
+         LLVMValueRef f256 = LLVMConstReal(LLVMFloatType(), 256.0);
+         LLVMValueRef i255 = lp_build_const_int32(255);
+         LLVMTypeRef i16_type = LLVMIntType(16);
+
+         assert(lod);
+
+         lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1,
+                                    &lod_fpart);
+         lod_fpart = LLVMBuildFMul(builder, lod_fpart, f256, "");
+         lod_fpart = lp_build_ifloor(&bld->float_bld, lod_fpart);
+         lod_fpart = LLVMBuildAnd(builder, lod_fpart, i255, "");
+         lod_fpart = LLVMBuildTrunc(builder, lod_fpart, i16_type, "");
+         lod_fpart = lp_build_broadcast_scalar(&h16, lod_fpart);
+
+         /* the lod_fpart values will be fixed pt values in [0,1) */
+      }
+      break;
+   }
+
+   /* compute image size(s) of source mipmap level(s) */
+   lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec,
+                               ilevel0, ilevel1,
+                               row_stride_array, img_stride_array,
+                               &width0_vec, &width1_vec,
+                               &height0_vec, &height1_vec,
+                               &depth0_vec, &depth1_vec,
+                               &row_stride0_vec, &row_stride1_vec,
+                               &img_stride0_vec, &img_stride1_vec);
+
+   /*
+    * Get pointer(s) to image data for mipmap level(s).
+    */
+   data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0);
+   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
+      data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1);
+   }
+
+   /*
+    * Get/interpolate texture colors.
+    */
+   if (min_filter == mag_filter) {
+      /* no need to distinguish between minification and magnification */
+      lp_build_sample_mipmap(bld, min_filter, mip_filter,
+                             s, t, r, lod_fpart,
+                             width0_vec, width1_vec,
+                             height0_vec, height1_vec,
+                             depth0_vec, depth1_vec,
+                             row_stride0_vec, row_stride1_vec,
+                             img_stride0_vec, img_stride1_vec,
+                             data_ptr0, data_ptr1,
+                             &packed_lo, &packed_hi);
+   }
+   else {
+      /* Emit conditional to choose min image filter or mag image filter
+       * depending on the lod being >= 0 or < 0, respectively.
+       */
+      struct lp_build_flow_context *flow_ctx;
+      struct lp_build_if_state if_ctx;
+      LLVMValueRef minify;
+
+      flow_ctx = lp_build_flow_create(builder);
+      lp_build_flow_scope_begin(flow_ctx);
+
+      packed_lo = LLVMGetUndef(h16_vec_type);
+      packed_hi = LLVMGetUndef(h16_vec_type);
+
+      lp_build_flow_scope_declare(flow_ctx, &packed_lo);
+      lp_build_flow_scope_declare(flow_ctx, &packed_hi);
+
+      /* minify = lod >= 0.0 (LLVMRealUGE is unordered: NaN also minifies) */
+      minify = LLVMBuildFCmp(builder, LLVMRealUGE,
+                             lod, float_bld->zero, "");
+
+      lp_build_if(&if_ctx, flow_ctx, builder, minify);
+      {
+         /* Use the minification filter */
+         lp_build_sample_mipmap(bld, min_filter, mip_filter,
+                                s, t, r, lod_fpart,
+                                width0_vec, width1_vec,
+                                height0_vec, height1_vec,
+                                depth0_vec, depth1_vec,
+                                row_stride0_vec, row_stride1_vec,
+                                img_stride0_vec, img_stride1_vec,
+                                data_ptr0, data_ptr1,
+                                &packed_lo, &packed_hi);
+      }
+      lp_build_else(&if_ctx);
+      {
+         /* Use the magnification filter */
+         lp_build_sample_mipmap(bld, mag_filter, mip_filter,
+                                s, t, r, lod_fpart,
+                                width0_vec, width1_vec,
+                                height0_vec, height1_vec,
+                                depth0_vec, depth1_vec,
+                                row_stride0_vec, row_stride1_vec,
+                                img_stride0_vec, img_stride1_vec,
+                                data_ptr0, data_ptr1,
+                                &packed_lo, &packed_hi);
+      }
+      lp_build_endif(&if_ctx);
+
+      lp_build_flow_scope_end(flow_ctx);
+      lp_build_flow_destroy(flow_ctx);
+   }
+
+   /* combine 'packed_lo', 'packed_hi' into 'packed' */
+   {
+      struct lp_build_context u8n;
+
+      /* the function-level h16 context already describes the source type */
+      lp_build_context_init(&u8n, builder, lp_type_unorm(8));
+
+      packed = lp_build_pack2(builder, h16.type, u8n.type,
+                              packed_lo, packed_hi);
+   }
+
+   /*
+    * Convert to SoA and swizzle.
+    */
+   lp_build_rgba8_to_f32_soa(builder,
+                             bld->texel_type,
+                             packed, unswizzled);
+
+   if (util_format_is_rgba8_variant(bld->format_desc)) {
+      lp_build_format_swizzle_soa(bld->format_desc,
+                                  &bld->texel_bld,
+                                  unswizzled, texel_out);
+   }
+   else {
+      texel_out[0] = unswizzled[0];
+      texel_out[1] = unswizzled[1];
+      texel_out[2] = unswizzled[2];
+      texel_out[3] = unswizzled[3];
+   }
+
+   apply_sampler_swizzle(bld, texel_out);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
new file mode 100644
index 00000000000..e1045bbbc21
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h
@@ -0,0 +1,65 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling -- AoS.
+ *
+ * @author Jose Fonseca <[email protected]>
+ * @author Brian Paul <[email protected]>
+ */
+
+#ifndef LP_BLD_SAMPLE_AOS_H
+#define LP_BLD_SAMPLE_AOS_H
+
+
+#include "lp_bld_sample.h"
+
+
+void
+lp_build_sample_aos(struct lp_build_sample_context *bld,
+                    unsigned unit, /* forwarded to the lod/mip level helpers */
+                    LLVMValueRef s, /* s, t, r: texture coordinates */
+                    LLVMValueRef t,
+                    LLVMValueRef r,
+                    const LLVMValueRef *ddx, /* ddx, ddy: for lod selection */
+                    const LLVMValueRef *ddy,
+                    LLVMValueRef lod_bias, /* optional */
+                    LLVMValueRef explicit_lod, /* optional */
+                    LLVMValueRef width, /* width, height, depth: for lod selection */
+                    LLVMValueRef height,
+                    LLVMValueRef depth,
+                    LLVMValueRef width_vec, /* *_vec: base for the mipmap level sizes */
+                    LLVMValueRef height_vec,
+                    LLVMValueRef depth_vec,
+                    LLVMValueRef row_stride_array, /* strides and data: */
+                    LLVMValueRef img_stride_array, /* looked up per mipmap level */
+                    LLVMValueRef data_array,
+                    LLVMValueRef texel_out[4]); /* out: the four texel channels */
+
+
+#endif /* LP_BLD_SAMPLE_AOS_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 54ef921678d..36a77d3aff0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -30,6 +30,7 @@
  * Texture sampling -- SoA.
  *
  * @author Jose Fonseca <[email protected]>
+ * @author Brian Paul <[email protected]>
  */
 
 #include "pipe/p_defines.h"
@@ -39,204 +40,28 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_format.h"
-#include "util/u_cpu_detect.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_conv.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_bitarit.h"
 #include "lp_bld_logic.h"
+#include "lp_bld_printf.h"
 #include "lp_bld_swizzle.h"
-#include "lp_bld_pack.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_format.h"
 #include "lp_bld_sample.h"
-
-
-/**
- * Keep all information for sampling code generation in a single place.
- */
-struct lp_build_sample_context
-{
-   LLVMBuilderRef builder;
-
-   const struct lp_sampler_static_state *static_state;
-
-   struct lp_sampler_dynamic_state *dynamic_state;
-
-   const struct util_format_description *format_desc;
-
-   /** regular scalar float type */
-   struct lp_type float_type;
-   struct lp_build_context float_bld;
-
-   /** regular scalar float type */
-   struct lp_type int_type;
-   struct lp_build_context int_bld;
-
-   /** Incoming coordinates type and build context */
-   struct lp_type coord_type;
-   struct lp_build_context coord_bld;
-
-   /** Unsigned integer coordinates */
-   struct lp_type uint_coord_type;
-   struct lp_build_context uint_coord_bld;
-
-   /** Signed integer coordinates */
-   struct lp_type int_coord_type;
-   struct lp_build_context int_coord_bld;
-
-   /** Output texels type and build context */
-   struct lp_type texel_type;
-   struct lp_build_context texel_bld;
-};
-
-
-/**
- * Does the given texture wrap mode allow sampling the texture border color?
- * XXX maybe move this into gallium util code.
- */
-static boolean
-wrap_mode_uses_border_color(unsigned mode)
-{
-   switch (mode) {
-   case PIPE_TEX_WRAP_REPEAT:
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      return FALSE;
-   case PIPE_TEX_WRAP_CLAMP:
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      return TRUE;
-   default:
-      assert(0 && "unexpected wrap mode");
-      return FALSE;
-   }
-}
-
-
-static LLVMValueRef
-lp_build_get_mipmap_level(struct lp_build_sample_context *bld,
-                          LLVMValueRef data_array, LLVMValueRef level)
-{
-   LLVMValueRef indexes[2], data_ptr;
-   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   indexes[1] = level;
-   data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, "");
-   data_ptr = LLVMBuildLoad(bld->builder, data_ptr, "");
-   return data_ptr;
-}
-
-
-static LLVMValueRef
-lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld,
-                                LLVMValueRef data_array, int level)
-{
-   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
-   return lp_build_get_mipmap_level(bld, data_array, lvl);
-}
-
-
-/**
- * Dereference stride_array[mipmap_level] array to get a stride.
- * Return stride as a vector.
- */
-static LLVMValueRef
-lp_build_get_level_stride_vec(struct lp_build_sample_context *bld,
-                              LLVMValueRef stride_array, LLVMValueRef level)
-{
-   LLVMValueRef indexes[2], stride;
-   indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
-   indexes[1] = level;
-   stride = LLVMBuildGEP(bld->builder, stride_array, indexes, 2, "");
-   stride = LLVMBuildLoad(bld->builder, stride, "");
-   stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride);
-   return stride;
-}
-
-
-/** Dereference stride_array[0] array to get a stride (as vector). */
-static LLVMValueRef
-lp_build_get_const_level_stride_vec(struct lp_build_sample_context *bld,
-                                    LLVMValueRef stride_array, int level)
-{
-   LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0);
-   return lp_build_get_level_stride_vec(bld, stride_array, lvl);
-}
-
-
-static int
-texture_dims(enum pipe_texture_target tex)
-{
-   switch (tex) {
-   case PIPE_TEXTURE_1D:
-      return 1;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_CUBE:
-      return 2;
-   case PIPE_TEXTURE_3D:
-      return 3;
-   default:
-      assert(0 && "bad texture target in texture_dims()");
-      return 2;
-   }
-}
-
-
-static LLVMValueRef
-lp_build_swizzle_chan_soa(struct lp_type type,
-                          const LLVMValueRef *unswizzled,
-                          enum util_format_swizzle swizzle)
-{
-   switch (swizzle) {
-   case PIPE_SWIZZLE_RED:
-   case PIPE_SWIZZLE_GREEN:
-   case PIPE_SWIZZLE_BLUE:
-   case PIPE_SWIZZLE_ALPHA:
-      return unswizzled[swizzle];
-   case PIPE_SWIZZLE_ZERO:
-      return lp_build_zero(type);
-   case PIPE_SWIZZLE_ONE:
-      return lp_build_one(type);
-   default:
-      assert(0);
-      return lp_build_undef(type);
-   }
-}
-
-
-static void
-lp_build_swizzle_soa(struct lp_build_sample_context *bld,
-                     LLVMValueRef *texel)
-{
-   LLVMValueRef unswizzled[4];
-   unsigned char swizzles[4];
-   unsigned chan;
-
-   for (chan = 0; chan < 4; ++chan) {
-      unswizzled[chan] = texel[chan];
-   }
-
-   swizzles[0] = bld->static_state->swizzle_r;
-   swizzles[1] = bld->static_state->swizzle_g;
-   swizzles[2] = bld->static_state->swizzle_b;
-   swizzles[3] = bld->static_state->swizzle_a;
-
-   for (chan = 0; chan < 4; ++chan) {
-      unsigned swizzle = swizzles[chan];
-      texel[chan] = lp_build_swizzle_chan_soa(bld->texel_type,
-                                              unswizzled, swizzle);
-   }
-}
-
+#include "lp_bld_sample_aos.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_quad.h"
 
 
 /**
  * Generate code to fetch a texel from a texture at int coords (x, y, z).
  * The computation depends on whether the texture is 1D, 2D or 3D.
- * The result, texel, will be:
+ * The result, texel, will be float vectors:
  *   texel[0] = red values
  *   texel[1] = green values
  *   texel[2] = blue values
@@ -244,6 +69,7 @@ lp_build_swizzle_soa(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
+                          unsigned unit,
                           LLVMValueRef width,
                           LLVMValueRef height,
                           LLVMValueRef depth,
@@ -253,23 +79,29 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                           LLVMValueRef y_stride,
                           LLVMValueRef z_stride,
                           LLVMValueRef data_ptr,
-                          LLVMValueRef *texel)
+                          LLVMValueRef texel_out[4])
 {
-   const int dims = texture_dims(bld->static_state->target);
+   const struct lp_sampler_static_state *static_state = bld->static_state;
+   const int dims = texture_dims(static_state->target);
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    LLVMValueRef offset;
    LLVMValueRef i, j;
    LLVMValueRef use_border = NULL;
 
    /* use_border = x < 0 || x >= width || y < 0 || y >= height */
-   if (wrap_mode_uses_border_color(bld->static_state->wrap_s)) {
+   if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
+                                              static_state->min_img_filter,
+                                              static_state->mag_img_filter)) {
       LLVMValueRef b1, b2;
       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
       use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2");
    }
 
-   if (dims >= 2 && wrap_mode_uses_border_color(bld->static_state->wrap_t)) {
+   if (dims >= 2 &&
+       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
+                                              static_state->min_img_filter,
+                                              static_state->mag_img_filter)) {
       LLVMValueRef b1, b2;
       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
@@ -282,7 +114,10 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       }
    }
 
-   if (dims == 3 && wrap_mode_uses_border_color(bld->static_state->wrap_r)) {
+   if (dims == 3 &&
+       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
+                                              static_state->min_img_filter,
+                                              static_state->mag_img_filter)) {
       LLVMValueRef b1, b2;
       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
@@ -295,44 +130,30 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
       }
    }
 
-   /*
-    * Describe the coordinates in terms of pixel blocks.
-    *
-    * TODO: pixel blocks are power of two. LLVM should convert rem/div to
-    * bit arithmetic. Verify this.
-    */
-
-   if (bld->format_desc->block.width == 1) {
-      i = bld->uint_coord_bld.zero;
-   }
-   else {
-      LLVMValueRef block_width = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.width);
-      i = LLVMBuildURem(bld->builder, x, block_width, "");
-      x = LLVMBuildUDiv(bld->builder, x, block_width, "");
-   }
+   /* convert x,y,z coords to linear offset from start of texture, in bytes */
+   lp_build_sample_offset(&bld->uint_coord_bld,
+                          bld->format_desc,
+                          x, y, z, y_stride, z_stride,
+                          &offset, &i, &j);
 
-   if (bld->format_desc->block.height == 1) {
-      j = bld->uint_coord_bld.zero;
-   }
-   else {
-      LLVMValueRef block_height = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.height);
-      j = LLVMBuildURem(bld->builder, y, block_height, "");
-      y = LLVMBuildUDiv(bld->builder, y, block_height, "");
+   if (use_border) {
+      /* If we can sample the border color, it means that texcoords may
+       * lie outside the bounds of the texture image.  We need to do
+       * something to prevent reading out of bounds and causing a segfault.
+       *
+       * Simply AND the texel offset with !use_border.  This will cause
+       * the offset of any out-of-bounds coord to become zero.  Offset zero
+       * is guaranteed to be inside the texture image.
+       */
+      offset = lp_build_andnot(&bld->uint_coord_bld, offset, use_border);
    }
 
-   /* convert x,y,z coords to linear offset from start of texture, in bytes */
-   offset = lp_build_sample_offset(&bld->uint_coord_bld,
-                                   bld->format_desc,
-                                   x, y, z, y_stride, z_stride);
-
    lp_build_fetch_rgba_soa(bld->builder,
                            bld->format_desc,
                            bld->texel_type,
                            data_ptr, offset,
                            i, j,
-                           texel);
-
-   lp_build_swizzle_soa(bld, texel);
+                           texel_out);
 
    /*
     * Note: if we find an app which frequently samples the texture border
@@ -351,44 +172,22 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
 
    if (use_border) {
       /* select texel color or border color depending on use_border */
+      LLVMValueRef border_color_ptr = 
+         bld->dynamic_state->border_color(bld->dynamic_state,
+                                          bld->builder, unit);
       int chan;
       for (chan = 0; chan < 4; chan++) {
          LLVMValueRef border_chan =
-            lp_build_const_vec(bld->texel_type,
-                                  bld->static_state->border_color[chan]);
-         texel[chan] = lp_build_select(&bld->texel_bld, use_border,
-                                       border_chan, texel[chan]);
+            lp_build_array_get(bld->builder, border_color_ptr,
+                               lp_build_const_int32(chan));
+         LLVMValueRef border_chan_vec =
+            lp_build_broadcast_scalar(&bld->float_vec_bld, border_chan);
+         texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
+                                           border_chan_vec, texel_out[chan]);
       }
    }
-}
-
-
-static LLVMValueRef
-lp_build_sample_packed(struct lp_build_sample_context *bld,
-                       LLVMValueRef x,
-                       LLVMValueRef y,
-                       LLVMValueRef y_stride,
-                       LLVMValueRef data_array)
-{
-   LLVMValueRef offset;
-   LLVMValueRef data_ptr;
-
-   offset = lp_build_sample_offset(&bld->uint_coord_bld,
-                                   bld->format_desc,
-                                   x, y, NULL, y_stride, NULL);
 
-   assert(bld->format_desc->block.width == 1);
-   assert(bld->format_desc->block.height == 1);
-   assert(bld->format_desc->block.bits <= bld->texel_type.width);
-
-   /* get pointer to mipmap level 0 data */
-   data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0);
-
-   return lp_build_gather(bld->builder,
-                          bld->texel_type.length,
-                          bld->format_desc->block.bits,
-                          bld->texel_type.width,
-                          data_ptr, offset);
+   apply_sampler_swizzle(bld, texel_out);
 }
 
 
@@ -426,81 +225,6 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld,
 
 
 /**
- * We only support a few wrap modes in lp_build_sample_wrap_int() at this time.
- * Return whether the given mode is supported by that function.
- */
-static boolean
-is_simple_wrap_mode(unsigned mode)
-{
-   switch (mode) {
-   case PIPE_TEX_WRAP_REPEAT:
-   case PIPE_TEX_WRAP_CLAMP:
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      return TRUE;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-   default:
-      return FALSE;
-   }
-}
-
-
-/**
- * Build LLVM code for texture wrap mode, for scaled integer texcoords.
- * \param coord  the incoming texcoord (s,t,r or q) scaled to the texture size
- * \param length  the texture size along one dimension
- * \param is_pot  if TRUE, length is a power of two
- * \param wrap_mode  one of PIPE_TEX_WRAP_x
- */
-static LLVMValueRef
-lp_build_sample_wrap_int(struct lp_build_sample_context *bld,
-                         LLVMValueRef coord,
-                         LLVMValueRef length,
-                         boolean is_pot,
-                         unsigned wrap_mode)
-{
-   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
-   LLVMValueRef length_minus_one;
-
-   length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
-
-   switch(wrap_mode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      if(is_pot)
-         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
-      else
-         /* Signed remainder won't give the right results for negative
-          * dividends but unsigned remainder does.*/
-         coord = LLVMBuildURem(bld->builder, coord, length, "");
-      break;
-
-   case PIPE_TEX_WRAP_CLAMP:
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
-      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
-      break;
-
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      /* FIXME */
-      _debug_printf("llvmpipe: failed to translate texture wrap mode %s\n",
-                    util_dump_tex_wrap(wrap_mode, TRUE));
-      coord = lp_build_max(uint_coord_bld, coord, uint_coord_bld->zero);
-      coord = lp_build_min(uint_coord_bld, coord, length_minus_one);
-      break;
-
-   default:
-      assert(0);
-   }
-
-   return coord;
-}
-
-
-/**
  * Build LLVM code for texture wrap mode for linear filtering.
  * \param x0_out  returns first integer texcoord
  * \param x1_out  returns second integer texcoord
@@ -519,11 +243,9 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
    LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
-   LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
    LLVMValueRef coord0, coord1, weight;
 
    switch(wrap_mode) {
@@ -542,8 +264,10 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
          coord1 = LLVMBuildAnd(bld->builder, coord1, length_minus_one, "");
       }
       else {
-         /* Signed remainder won't give the right results for negative
-          * dividends but unsigned remainder does.*/
+         /* Add a bias to the texcoord to handle negative coords */
+         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         coord0 = LLVMBuildAdd(bld->builder, coord0, bias, "");
+         coord1 = LLVMBuildAdd(bld->builder, coord1, bias, "");
          coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
          coord1 = LLVMBuildURem(bld->builder, coord1, length, "");
       }
@@ -551,16 +275,18 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 
    case PIPE_TEX_WRAP_CLAMP:
       if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
+
+      /* clamp to [0, length] */
+      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
+
+      coord = lp_build_sub(coord_bld, coord, half);
+
       weight = lp_build_fract(coord_bld, coord);
-      coord0 = lp_build_clamp(coord_bld, coord, coord_bld->zero,
-                              length_f_minus_one);
-      coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
-      coord1 = lp_build_clamp(coord_bld, coord1, coord_bld->zero,
-                              length_f_minus_one);
-      coord0 = lp_build_ifloor(coord_bld, coord0);
-      coord1 = lp_build_ifloor(coord_bld, coord1);
+      coord0 = lp_build_ifloor(coord_bld, coord);
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       break;
 
    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
@@ -574,7 +300,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       else {
          LLVMValueRef min, max;
          /* clamp to [0.5, length - 0.5] */
-         min = lp_build_const_vec(coord_bld->type, 0.5F);
+         min = half;
          max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
       }
@@ -593,25 +319,14 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       {
          LLVMValueRef min, max;
          if (bld->static_state->normalized_coords) {
-            /* min = -1.0 / (2 * length) = -0.5 / length */
-            min = lp_build_mul(coord_bld,
-                               lp_build_const_vec(coord_bld->type, -0.5F),
-                               lp_build_rcp(coord_bld, length_f));
-            /* max = 1.0 - min */
-            max = lp_build_sub(coord_bld, coord_bld->one, min);
-            /* coord = clamp(coord, min, max) */
-            coord = lp_build_clamp(coord_bld, coord, min, max);
-            /* scale coord to length (and sub 0.5?) */
+            /* scale coord to length */
             coord = lp_build_mul(coord_bld, coord, length_f);
-            coord = lp_build_sub(coord_bld, coord, half);
-         }
-         else {
-            /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_vec(coord_bld->type, -0.5F);
-            max = lp_build_sub(coord_bld, length_f, min);
-            coord = lp_build_clamp(coord_bld, coord, min, max);
-            coord = lp_build_sub(coord_bld, coord, half);
          }
+         /* clamp to [-0.5, length + 0.5] */
+         min = lp_build_const_vec(coord_bld->type, -0.5F);
+         max = lp_build_sub(coord_bld, length_f, min);
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         coord = lp_build_sub(coord_bld, coord, half);
          /* compute lerp weight */
          weight = lp_build_fract(coord_bld, coord);
          /* convert to int */
@@ -642,35 +357,41 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = 1.0 - min */
-         max = lp_build_sub(coord_bld, coord_bld->one, min);
+      coord = lp_build_abs(coord_bld, coord);
 
-         coord = lp_build_abs(coord_bld, coord);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
+      if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
-         if(0)coord = lp_build_sub(coord_bld, coord, half);
-         weight = lp_build_fract(coord_bld, coord);
-         coord0 = lp_build_ifloor(coord_bld, coord);
-         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       }
+
+      /* clamp to [0, length] */
+      coord = lp_build_min(coord_bld, coord, length_f);
+
+      coord = lp_build_sub(coord_bld, coord, half);
+
+      weight = lp_build_fract(coord_bld, coord);
+      coord0 = lp_build_ifloor(coord_bld, coord);
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
       {
          LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = 1.0 - min */
-         max = lp_build_sub(coord_bld, coord_bld->one, min);
 
          coord = lp_build_abs(coord_bld, coord);
+
+         if (bld->static_state->normalized_coords) {
+            /* scale coord to length */
+            coord = lp_build_mul(coord_bld, coord, length_f);
+         }
+
+         /* clamp to [0.5, length - 0.5] */
+         min = half;
+         max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
-         coord = lp_build_mul(coord_bld, coord, length_f);
+
          coord = lp_build_sub(coord_bld, coord, half);
+
          weight = lp_build_fract(coord_bld, coord);
          coord0 = lp_build_ifloor(coord_bld, coord);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
@@ -680,17 +401,21 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
       {
          LLVMValueRef min, max;
-         /* min = -1.0 / (2 * length) = -0.5 / length */
-         min = lp_build_mul(coord_bld,
-                            lp_build_const_vec(coord_bld->type, -0.5F),
-                            lp_build_rcp(coord_bld, length_f));
-         /* max = 1.0 - min */
-         max = lp_build_sub(coord_bld, coord_bld->one, min);
 
          coord = lp_build_abs(coord_bld, coord);
+
+         if (bld->static_state->normalized_coords) {
+            /* scale coord to length */
+            coord = lp_build_mul(coord_bld, coord, length_f);
+         }
+
+         /* clamp to [-0.5, length + 0.5] */
+         min = lp_build_negate(coord_bld, half);
+         max = lp_build_sub(coord_bld, length_f, min);
          coord = lp_build_clamp(coord_bld, coord, min, max);
-         coord = lp_build_mul(coord_bld, coord, length_f);
+
          coord = lp_build_sub(coord_bld, coord, half);
+
          weight = lp_build_fract(coord_bld, coord);
          coord0 = lp_build_ifloor(coord_bld, coord);
          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
@@ -713,7 +438,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
 /**
  * Build LLVM code for texture wrap mode for nearest filtering.
  * \param coord  the incoming texcoord (nominally in [0,1])
- * \param length  the texture size along one dimension, as int
+ * \param length  the texture size along one dimension, as int vector
  * \param is_pot  if TRUE, length is a power of two
  * \param wrap_mode  one of PIPE_TEX_WRAP_x
  */
@@ -727,10 +452,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
    struct lp_build_context *coord_bld = &bld->coord_bld;
    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
    struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
-   LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0);
    LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
    LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
-   LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
    LLVMValueRef icoord;
    
    switch(wrap_mode) {
@@ -739,127 +462,89 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
       icoord = lp_build_ifloor(coord_bld, coord);
       if (is_pot)
          icoord = LLVMBuildAnd(bld->builder, icoord, length_minus_one, "");
-      else
-         /* Signed remainder won't give the right results for negative
-          * dividends but unsigned remainder does.*/
+      else {
+         /* Add a bias to the texcoord to handle negative coords */
+         LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024);
+         icoord = LLVMBuildAdd(bld->builder, icoord, bias, "");
          icoord = LLVMBuildURem(bld->builder, icoord, length, "");
+      }
       break;
 
    case PIPE_TEX_WRAP_CLAMP:
-      /* mul by size */
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
       if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
+
       /* floor */
       icoord = lp_build_ifloor(coord_bld, coord);
-      /* clamp to [0, size-1].  Note: int coord builder type */
+
+      /* clamp to [0, length - 1]. */
       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
                               length_minus_one);
       break;
 
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      {
-         LLVMValueRef min, max;
-         if (bld->static_state->normalized_coords) {
-            /* min = 1.0 / (2 * length) */
-            min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-            /* max = length - min */
-            max = lp_build_sub(coord_bld, length_f, min);
-            /* scale coord to length */
-            coord = lp_build_mul(coord_bld, coord, length_f);
-         }
-         else {
-            /* clamp to [0.5, length - 0.5] */
-            min = lp_build_const_vec(coord_bld->type, 0.5F);
-            max = lp_build_sub(coord_bld, length_f, min);
-         }
-         /* coord = clamp(coord, min, max) */
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
-      }
-      break;
-
    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
       /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */
       {
          LLVMValueRef min, max;
+
          if (bld->static_state->normalized_coords) {
-            /* min = -1.0 / (2 * length) = -0.5 / length */
-            min = lp_build_mul(coord_bld,
-                               lp_build_const_vec(coord_bld->type, -0.5F),
-                               lp_build_rcp(coord_bld, length_f));
-            /* max = length - min */
-            max = lp_build_sub(coord_bld, length_f, min);
             /* scale coord to length */
             coord = lp_build_mul(coord_bld, coord, length_f);
          }
-         else {
-            /* clamp to [-0.5, length + 0.5] */
-            min = lp_build_const_vec(coord_bld->type, -0.5F);
-            max = lp_build_sub(coord_bld, length_f, min);
-         }
-         /* coord = clamp(coord, min, max) */
-         coord = lp_build_clamp(coord_bld, coord, min, max);
+
          icoord = lp_build_ifloor(coord_bld, coord);
+
+         /* clamp to [-1, length] */
+         min = lp_build_negate(int_coord_bld, int_coord_bld->one);
+         max = length;
+         icoord = lp_build_clamp(int_coord_bld, icoord, min, max);
       }
       break;
 
    case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = length - min */
-         max = lp_build_sub(coord_bld, length_f, min);
+      /* compute mirror function */
+      coord = lp_build_coord_mirror(bld, coord);
 
-         /* compute mirror function */
-         coord = lp_build_coord_mirror(bld, coord);
+      /* scale coord to length */
+      assert(bld->static_state->normalized_coords);
+      coord = lp_build_mul(coord_bld, coord, length_f);
 
-         /* scale coord to length */
-         coord = lp_build_mul(coord_bld, coord, length_f);
+      icoord = lp_build_ifloor(coord_bld, coord);
 
-         /* coord = clamp(coord, min, max) */
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
-      }
+      /* clamp to [0, length - 1] */
+      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      coord = lp_build_abs(coord_bld, coord);
-      coord = lp_build_mul(coord_bld, coord, length_f);
-      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f_minus_one);
-      icoord = lp_build_ifloor(coord_bld, coord);
-      break;
-
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         /* max = length - min */
-         max = lp_build_sub(coord_bld, length_f, min);
+      coord = lp_build_abs(coord_bld, coord);
 
-         coord = lp_build_abs(coord_bld, coord);
+      if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
       }
+
+      icoord = lp_build_ifloor(coord_bld, coord);
+
+      /* clamp to [0, length - 1] */
+      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
       break;
 
    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         LLVMValueRef min, max;
-         /* min = 1.0 / (2 * length) */
-         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
-         min = lp_build_negate(coord_bld, min);
-         /* max = length - min */
-         max = lp_build_sub(coord_bld, length_f, min);
+      coord = lp_build_abs(coord_bld, coord);
 
-         coord = lp_build_abs(coord_bld, coord);
+      if (bld->static_state->normalized_coords) {
+         /* scale coord to length */
          coord = lp_build_mul(coord_bld, coord, length_f);
-         coord = lp_build_clamp(coord_bld, coord, min, max);
-         icoord = lp_build_ifloor(coord_bld, coord);
       }
+
+      icoord = lp_build_ifloor(coord_bld, coord);
+
+      /* clamp to [0, length] */
+      icoord = lp_build_min(int_coord_bld, icoord, length);
       break;
 
    default:
@@ -872,213 +557,12 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
 
 
 /**
- * Codegen equivalent for u_minify().
- * Return max(1, base_size >> level);
- */
-static LLVMValueRef
-lp_build_minify(struct lp_build_sample_context *bld,
-                LLVMValueRef base_size,
-                LLVMValueRef level)
-{
-   LLVMValueRef size = LLVMBuildAShr(bld->builder, base_size, level, "minify");
-   size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one);
-   return size;
-}
-
-
-/**
- * Generate code to compute texture level of detail (lambda).
- * \param s  vector of texcoord s values
- * \param t  vector of texcoord t values
- * \param r  vector of texcoord r values
- * \param shader_lod_bias  vector float with the shader lod bias,
- * \param width  scalar int texture width
- * \param height  scalar int texture height
- * \param depth  scalar int texture depth
- */
-static LLVMValueRef
-lp_build_lod_selector(struct lp_build_sample_context *bld,
-                      LLVMValueRef s,
-                      LLVMValueRef t,
-                      LLVMValueRef r,
-                      LLVMValueRef shader_lod_bias,
-                      LLVMValueRef width,
-                      LLVMValueRef height,
-                      LLVMValueRef depth)
-
-{
-   if (bld->static_state->min_lod == bld->static_state->max_lod) {
-      /* User is forcing sampling from a particular mipmap level.
-       * This is hit during mipmap generation.
-       */
-      return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod);
-   }
-   else {
-      const int dims = texture_dims(bld->static_state->target);
-      struct lp_build_context *float_bld = &bld->float_bld;
-      LLVMValueRef sampler_lod_bias = LLVMConstReal(LLVMFloatType(),
-                                                    bld->static_state->lod_bias);
-      LLVMValueRef min_lod = LLVMConstReal(LLVMFloatType(),
-                                           bld->static_state->min_lod);
-      LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(),
-                                           bld->static_state->max_lod);
-
-      LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
-      LLVMValueRef index1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
-      LLVMValueRef index2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
-
-      LLVMValueRef s0, s1, s2;
-      LLVMValueRef t0, t1, t2;
-      LLVMValueRef r0, r1, r2;
-      LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy;
-      LLVMValueRef rho, lod;
-
-      /*
-       * dsdx = abs(s[1] - s[0]);
-       * dsdy = abs(s[2] - s[0]);
-       * dtdx = abs(t[1] - t[0]);
-       * dtdy = abs(t[2] - t[0]);
-       * drdx = abs(r[1] - r[0]);
-       * drdy = abs(r[2] - r[0]);
-       * XXX we're assuming a four-element quad in 2x2 layout here.
-       */
-      s0 = LLVMBuildExtractElement(bld->builder, s, index0, "s0");
-      s1 = LLVMBuildExtractElement(bld->builder, s, index1, "s1");
-      s2 = LLVMBuildExtractElement(bld->builder, s, index2, "s2");
-      dsdx = LLVMBuildSub(bld->builder, s1, s0, "");
-      dsdx = lp_build_abs(float_bld, dsdx);
-      dsdy = LLVMBuildSub(bld->builder, s2, s0, "");
-      dsdy = lp_build_abs(float_bld, dsdy);
-      if (dims > 1) {
-         t0 = LLVMBuildExtractElement(bld->builder, t, index0, "t0");
-         t1 = LLVMBuildExtractElement(bld->builder, t, index1, "t1");
-         t2 = LLVMBuildExtractElement(bld->builder, t, index2, "t2");
-         dtdx = LLVMBuildSub(bld->builder, t1, t0, "");
-         dtdx = lp_build_abs(float_bld, dtdx);
-         dtdy = LLVMBuildSub(bld->builder, t2, t0, "");
-         dtdy = lp_build_abs(float_bld, dtdy);
-         if (dims > 2) {
-            r0 = LLVMBuildExtractElement(bld->builder, r, index0, "r0");
-            r1 = LLVMBuildExtractElement(bld->builder, r, index1, "r1");
-            r2 = LLVMBuildExtractElement(bld->builder, r, index2, "r2");
-            drdx = LLVMBuildSub(bld->builder, r1, r0, "");
-            drdx = lp_build_abs(float_bld, drdx);
-            drdy = LLVMBuildSub(bld->builder, r2, r0, "");
-            drdy = lp_build_abs(float_bld, drdy);
-         }
-      }
-
-      /* Compute rho = max of all partial derivatives scaled by texture size.
-       * XXX this could be vectorized somewhat
-       */
-      rho = LLVMBuildMul(bld->builder,
-                         lp_build_max(float_bld, dsdx, dsdy),
-                         lp_build_int_to_float(float_bld, width), "");
-      if (dims > 1) {
-         LLVMValueRef max;
-         max = LLVMBuildMul(bld->builder,
-                            lp_build_max(float_bld, dtdx, dtdy),
-                            lp_build_int_to_float(float_bld, height), "");
-         rho = lp_build_max(float_bld, rho, max);
-         if (dims > 2) {
-            max = LLVMBuildMul(bld->builder,
-                               lp_build_max(float_bld, drdx, drdy),
-                               lp_build_int_to_float(float_bld, depth), "");
-            rho = lp_build_max(float_bld, rho, max);
-         }
-      }
-
-      /* compute lod = log2(rho) */
-      lod = lp_build_log2(float_bld, rho);
-
-      /* add sampler lod bias */
-      lod = LLVMBuildAdd(bld->builder, lod, sampler_lod_bias, "sampler LOD bias");
-
-      /* add shader lod bias */
-      /* XXX for now we take only the first element since our lod is scalar */
-      shader_lod_bias = LLVMBuildExtractElement(bld->builder, shader_lod_bias,
-                                                LLVMConstInt(LLVMInt32Type(), 0, 0), "");
-      lod = LLVMBuildAdd(bld->builder, lod, shader_lod_bias, "shader LOD bias");
-
-      /* clamp lod */
-      lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
-
-      return lod;
-   }
-}
-
-
-/**
- * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer
- * mipmap level index.
- * Note: this is all scalar code.
- * \param lod  scalar float texture level of detail
- * \param level_out  returns integer 
- */
-static void
-lp_build_nearest_mip_level(struct lp_build_sample_context *bld,
-                           unsigned unit,
-                           LLVMValueRef lod,
-                           LLVMValueRef *level_out)
-{
-   struct lp_build_context *float_bld = &bld->float_bld;
-   struct lp_build_context *int_bld = &bld->int_bld;
-   LLVMValueRef last_level, level;
-
-   LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
-
-   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
-                                               bld->builder, unit);
-
-   /* convert float lod to integer */
-   level = lp_build_iround(float_bld, lod);
-
-   /* clamp level to legal range of levels */
-   *level_out = lp_build_clamp(int_bld, level, zero, last_level);
-}
-
-
-/**
- * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to
- * two (adjacent) mipmap level indexes.  Later, we'll sample from those
- * two mipmap levels and interpolate between them.
- */
-static void
-lp_build_linear_mip_levels(struct lp_build_sample_context *bld,
-                           unsigned unit,
-                           LLVMValueRef lod,
-                           LLVMValueRef *level0_out,
-                           LLVMValueRef *level1_out,
-                           LLVMValueRef *weight_out)
-{
-   struct lp_build_context *float_bld = &bld->float_bld;
-   struct lp_build_context *int_bld = &bld->int_bld;
-   LLVMValueRef last_level, level;
-
-   last_level = bld->dynamic_state->last_level(bld->dynamic_state,
-                                               bld->builder, unit);
-
-   /* convert float lod to integer */
-   level = lp_build_ifloor(float_bld, lod);
-
-   /* compute level 0 and clamp to legal range of levels */
-   *level0_out = lp_build_clamp(int_bld, level,
-                                int_bld->zero,
-                                last_level);
-   /* compute level 1 and clamp to legal range of levels */
-   *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one);
-   *level1_out = lp_build_min(int_bld, *level1_out, last_level);
-
-   *weight_out = lp_build_fract(float_bld, lod);
-}
-
-
-/**
  * Generate code to sample a mipmap level with nearest filtering.
  * If sampling a cube texture, r = cube face in [0,5].
  */
 static void
 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
+                              unsigned unit,
                               LLVMValueRef width_vec,
                               LLVMValueRef height_vec,
                               LLVMValueRef depth_vec,
@@ -1109,7 +593,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
 
       if (dims == 3) {
          z = lp_build_sample_wrap_nearest(bld, r, depth_vec,
-                                          bld->static_state->pot_height,
+                                          bld->static_state->pot_depth,
                                           bld->static_state->wrap_r);
          lp_build_name(z, "tex.z.wrapped");
       }
@@ -1127,7 +611,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
    /*
     * Get texture colors.
     */
-   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+   lp_build_sample_texel_soa(bld, unit,
+                             width_vec, height_vec, depth_vec,
                              x, y, z,
                              row_stride_vec, img_stride_vec,
                              data_ptr, colors_out);
@@ -1140,6 +625,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
+                             unsigned unit,
                              LLVMValueRef width_vec,
                              LLVMValueRef height_vec,
                              LLVMValueRef depth_vec,
@@ -1201,11 +687,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
     * Get texture colors.
     */
    /* get x0/x1 texels */
-   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+   lp_build_sample_texel_soa(bld, unit,
+                             width_vec, height_vec, depth_vec,
                              x0, y0, z0,
                              row_stride_vec, img_stride_vec,
                              data_ptr, neighbors[0][0]);
-   lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+   lp_build_sample_texel_soa(bld, unit,
+                             width_vec, height_vec, depth_vec,
                              x1, y0, z0,
                              row_stride_vec, img_stride_vec,
                              data_ptr, neighbors[0][1]);
@@ -1223,11 +711,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
       LLVMValueRef colors0[4];
 
       /* get x0/x1 texels at y1 */
-      lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+      lp_build_sample_texel_soa(bld, unit,
+                                width_vec, height_vec, depth_vec,
                                 x0, y1, z0,
                                 row_stride_vec, img_stride_vec,
                                 data_ptr, neighbors[1][0]);
-      lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+      lp_build_sample_texel_soa(bld, unit,
+                                width_vec, height_vec, depth_vec,
                                 x1, y1, z0,
                                 row_stride_vec, img_stride_vec,
                                 data_ptr, neighbors[1][1]);
@@ -1247,19 +737,23 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
          LLVMValueRef colors1[4];
 
          /* get x0/x1/y0/y1 texels at z1 */
-         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+         lp_build_sample_texel_soa(bld, unit,
+                                   width_vec, height_vec, depth_vec,
                                    x0, y0, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, neighbors1[0][0]);
-         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+         lp_build_sample_texel_soa(bld, unit,
+                                   width_vec, height_vec, depth_vec,
                                    x1, y0, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, neighbors1[0][1]);
-         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+         lp_build_sample_texel_soa(bld, unit,
+                                   width_vec, height_vec, depth_vec,
                                    x0, y1, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, neighbors1[1][0]);
-         lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec,
+         lp_build_sample_texel_soa(bld, unit,
+                                   width_vec, height_vec, depth_vec,
                                    x1, y1, z1,
                                    row_stride_vec, img_stride_vec,
                                    data_ptr, neighbors1[1][1]);
@@ -1291,209 +785,6 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld,
 }
 
 
-/** Helper used by lp_build_cube_lookup() */
-static LLVMValueRef
-lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
-{
-   /* ima = -0.5 / abs(coord); */
-   LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5);
-   LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
-   LLVMValueRef ima = lp_build_mul(coord_bld, negHalf,
-                                   lp_build_rcp(coord_bld, absCoord));
-   return ima;
-}
-
-
-/**
- * Helper used by lp_build_cube_lookup()
- * \param sign  scalar +1 or -1
- * \param coord  float vector
- * \param ima  float vector
- */
-static LLVMValueRef
-lp_build_cube_coord(struct lp_build_context *coord_bld,
-                    LLVMValueRef sign, int negate_coord,
-                    LLVMValueRef coord, LLVMValueRef ima)
-{
-   /* return negate(coord) * ima * sign + 0.5; */
-   LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5);
-   LLVMValueRef res;
-
-   assert(negate_coord == +1 || negate_coord == -1);
-
-   if (negate_coord == -1) {
-      coord = lp_build_negate(coord_bld, coord);
-   }
-
-   res = lp_build_mul(coord_bld, coord, ima);
-   if (sign) {
-      sign = lp_build_broadcast_scalar(coord_bld, sign);
-      res = lp_build_mul(coord_bld, res, sign);
-   }
-   res = lp_build_add(coord_bld, res, half);
-
-   return res;
-}
-
-
-/** Helper used by lp_build_cube_lookup()
- * Return (major_coord >= 0) ? pos_face : neg_face;
- */
-static LLVMValueRef
-lp_build_cube_face(struct lp_build_sample_context *bld,
-                   LLVMValueRef major_coord,
-                   unsigned pos_face, unsigned neg_face)
-{
-   LLVMValueRef cmp = LLVMBuildFCmp(bld->builder, LLVMRealUGE,
-                                    major_coord,
-                                    bld->float_bld.zero, "");
-   LLVMValueRef pos = LLVMConstInt(LLVMInt32Type(), pos_face, 0);
-   LLVMValueRef neg = LLVMConstInt(LLVMInt32Type(), neg_face, 0);
-   LLVMValueRef res = LLVMBuildSelect(bld->builder, cmp, pos, neg, "");
-   return res;
-}
-
-
-
-/**
- * Generate code to do cube face selection and per-face texcoords.
- */
-static void
-lp_build_cube_lookup(struct lp_build_sample_context *bld,
-                     LLVMValueRef s,
-                     LLVMValueRef t,
-                     LLVMValueRef r,
-                     LLVMValueRef *face,
-                     LLVMValueRef *face_s,
-                     LLVMValueRef *face_t)
-{
-   struct lp_build_context *float_bld = &bld->float_bld;
-   struct lp_build_context *coord_bld = &bld->coord_bld;
-   LLVMValueRef rx, ry, rz;
-   LLVMValueRef arx, ary, arz;
-   LLVMValueRef c25 = LLVMConstReal(LLVMFloatType(), 0.25);
-   LLVMValueRef arx_ge_ary, arx_ge_arz;
-   LLVMValueRef ary_ge_arx, ary_ge_arz;
-   LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz;
-   LLVMValueRef rx_pos, ry_pos, rz_pos;
-
-   assert(bld->coord_bld.type.length == 4);
-
-   /*
-    * Use the average of the four pixel's texcoords to choose the face.
-    */
-   rx = lp_build_mul(float_bld, c25,
-                     lp_build_sum_vector(&bld->coord_bld, s));
-   ry = lp_build_mul(float_bld, c25,
-                     lp_build_sum_vector(&bld->coord_bld, t));
-   rz = lp_build_mul(float_bld, c25,
-                     lp_build_sum_vector(&bld->coord_bld, r));
-
-   arx = lp_build_abs(float_bld, rx);
-   ary = lp_build_abs(float_bld, ry);
-   arz = lp_build_abs(float_bld, rz);
-
-   /*
-    * Compare sign/magnitude of rx,ry,rz to determine face
-    */
-   arx_ge_ary = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, ary, "");
-   arx_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, arz, "");
-   ary_ge_arx = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arx, "");
-   ary_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arz, "");
-
-   arx_ge_ary_arz = LLVMBuildAnd(bld->builder, arx_ge_ary, arx_ge_arz, "");
-   ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
-
-   rx_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rx, float_bld->zero, "");
-   ry_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ry, float_bld->zero, "");
-   rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, "");
-
-   {
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
-
-      *face_s = bld->coord_bld.undef;
-      *face_t = bld->coord_bld.undef;
-      *face = bld->int_bld.undef;
-
-      lp_build_name(*face_s, "face_s");
-      lp_build_name(*face_t, "face_t");
-      lp_build_name(*face, "face");
-
-      lp_build_flow_scope_declare(flow_ctx, face_s);
-      lp_build_flow_scope_declare(flow_ctx, face_t);
-      lp_build_flow_scope_declare(flow_ctx, face);
-
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz);
-      {
-         /* +/- X face */
-         LLVMValueRef sign = lp_build_sgn(float_bld, rx);
-         LLVMValueRef ima = lp_build_cube_ima(coord_bld, s);
-         *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima);
-         *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
-         *face = lp_build_cube_face(bld, rx,
-                                    PIPE_TEX_FACE_POS_X,
-                                    PIPE_TEX_FACE_NEG_X);
-      }
-      lp_build_else(&if_ctx);
-      {
-         struct lp_build_flow_context *flow_ctx2;
-         struct lp_build_if_state if_ctx2;
-
-         LLVMValueRef face_s2 = bld->coord_bld.undef;
-         LLVMValueRef face_t2 = bld->coord_bld.undef;
-         LLVMValueRef face2 = bld->int_bld.undef;
-
-         flow_ctx2 = lp_build_flow_create(bld->builder);
-         lp_build_flow_scope_begin(flow_ctx2);
-         lp_build_flow_scope_declare(flow_ctx2, &face_s2);
-         lp_build_flow_scope_declare(flow_ctx2, &face_t2);
-         lp_build_flow_scope_declare(flow_ctx2, &face2);
-
-         ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, "");
-
-         lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz);
-         {
-            /* +/- Y face */
-            LLVMValueRef sign = lp_build_sgn(float_bld, ry);
-            LLVMValueRef ima = lp_build_cube_ima(coord_bld, t);
-            face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima);
-            face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima);
-            face2 = lp_build_cube_face(bld, ry,
-                                       PIPE_TEX_FACE_POS_Y,
-                                       PIPE_TEX_FACE_NEG_Y);
-         }
-         lp_build_else(&if_ctx2);
-         {
-            /* +/- Z face */
-            LLVMValueRef sign = lp_build_sgn(float_bld, rz);
-            LLVMValueRef ima = lp_build_cube_ima(coord_bld, r);
-            face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima);
-            face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima);
-            face2 = lp_build_cube_face(bld, rz,
-                                       PIPE_TEX_FACE_POS_Z,
-                                       PIPE_TEX_FACE_NEG_Z);
-         }
-         lp_build_endif(&if_ctx2);
-         lp_build_flow_scope_end(flow_ctx2);
-         lp_build_flow_destroy(flow_ctx2);
-
-         *face_s = face_s2;
-         *face_t = face_t2;
-         *face = face2;
-      }
-
-      lp_build_endif(&if_ctx);
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
-   }
-}
-
-
-
 /**
  * Sample the texture/mipmap using given image filter and mip filter.
  * data0_ptr and data1_ptr point to the two mipmap levels to sample
@@ -1502,6 +793,7 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld,
  */
 static void
 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
+                       unsigned unit,
                        unsigned img_filter,
                        unsigned mip_filter,
                        LLVMValueRef s,
@@ -1526,14 +818,15 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    int chan;
 
    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
-      lp_build_sample_image_nearest(bld,
+      /* sample the first mipmap level */
+      lp_build_sample_image_nearest(bld, unit,
                                     width0_vec, height0_vec, depth0_vec,
                                     row_stride0_vec, img_stride0_vec,
                                     data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level, and interp */
-         lp_build_sample_image_nearest(bld,
+         /* sample the second mipmap level */
+         lp_build_sample_image_nearest(bld, unit,
                                        width1_vec, height1_vec, depth1_vec,
                                        row_stride1_vec, img_stride1_vec,
                                        data_ptr1, s, t, r, colors1);
@@ -1542,14 +835,15 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
    else {
       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
 
-      lp_build_sample_image_linear(bld,
+      /* sample the first mipmap level */
+      lp_build_sample_image_linear(bld, unit,
                                    width0_vec, height0_vec, depth0_vec,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, s, t, r, colors0);
 
       if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-         /* sample the second mipmap level, and interp */
-         lp_build_sample_image_linear(bld,
+         /* sample the second mipmap level */
+         lp_build_sample_image_linear(bld, unit,
                                       width1_vec, height1_vec, depth1_vec,
                                       row_stride1_vec, img_stride1_vec,
                                       data_ptr1, s, t, r, colors1);
@@ -1584,7 +878,10 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
                         LLVMValueRef s,
                         LLVMValueRef t,
                         LLVMValueRef r,
-                        LLVMValueRef lodbias,
+                        const LLVMValueRef *ddx,
+                        const LLVMValueRef *ddy,
+                        LLVMValueRef lod_bias, /* optional */
+                        LLVMValueRef explicit_lod, /* optional */
                         LLVMValueRef width,
                         LLVMValueRef height,
                         LLVMValueRef depth,
@@ -1602,12 +899,13 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    const unsigned mag_filter = bld->static_state->mag_img_filter;
    const int dims = texture_dims(bld->static_state->target);
    LLVMValueRef lod = NULL, lod_fpart = NULL;
-   LLVMValueRef ilevel0, ilevel1 = NULL, ilevel0_vec, ilevel1_vec = NULL;
+   LLVMValueRef ilevel0, ilevel1 = NULL;
    LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL;
    LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL;
    LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL;
    LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL;
    LLVMValueRef data_ptr0, data_ptr1 = NULL;
+   LLVMValueRef face_ddx[4], face_ddy[4];
 
    /*
    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
@@ -1615,6 +913,30 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
    */
 
    /*
+    * Choose cube face, recompute texcoords and derivatives for the chosen face.
+    */
+   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+      LLVMValueRef face, face_s, face_t;
+      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
+      s = face_s; /* vec */
+      t = face_t; /* vec */
+      /* use 'r' to indicate cube face */
+      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
+
+      /* recompute ddx, ddy using the new (s,t) face texcoords */
+      face_ddx[0] = lp_build_ddx(&bld->coord_bld, s);
+      face_ddx[1] = lp_build_ddx(&bld->coord_bld, t);
+      face_ddx[2] = NULL;
+      face_ddx[3] = NULL;
+      face_ddy[0] = lp_build_ddy(&bld->coord_bld, s);
+      face_ddy[1] = lp_build_ddy(&bld->coord_bld, t);
+      face_ddy[2] = NULL;
+      face_ddy[3] = NULL;
+      ddx = face_ddx;
+      ddy = face_ddy;
+   }
+
+   /*
     * Compute the level of detail (float).
     */
    if (min_filter != mag_filter ||
@@ -1622,7 +944,9 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       /* Need to compute lod either to choose mipmap levels or to
        * distinguish between minification/magnification with one mipmap level.
        */
-      lod = lp_build_lod_selector(bld, s, t, r, lodbias, width, height, depth);
+      lod = lp_build_lod_selector(bld, unit, ddx, ddy,
+                                  lod_bias, explicit_lod,
+                                  width, height, depth);
    }
 
    /*
@@ -1630,9 +954,20 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
     */
    if (mip_filter == PIPE_TEX_MIPFILTER_NONE) {
       /* always use mip level 0 */
-      ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
+         /* XXX this is a work-around for an apparent bug in LLVM 2.7.
+          * We should be able to set ilevel0 = const(0) but that causes
+          * bad x86 code to be emitted.
+          */
+         lod = lp_build_const_elem(bld->coord_bld.type, 0.0);
+         lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
+      }
+      else {
+         ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+      }
    }
    else {
+      assert(lod);
       if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
          lp_build_nearest_mip_level(bld, unit, lod, &ilevel0);
       }
@@ -1644,59 +979,15 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       }
    }
 
-   /*
-    * Convert scalar integer mipmap levels into vectors.
-    */
-   ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0);
-   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR)
-      ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1);
-
-   /*
-    * Compute width, height at mipmap level 'ilevel0'
-    */
-   width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec);
-   if (dims >= 2) {
-      height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec);
-      row_stride0_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
-                                                      ilevel0);
-      if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
-         img_stride0_vec = lp_build_get_level_stride_vec(bld,
-                                                         img_stride_array,
-                                                         ilevel0);
-         if (dims == 3) {
-            depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec);
-         }
-      }
-   }
-   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
-      /* compute width, height, depth for second mipmap level at 'ilevel1' */
-      width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec);
-      if (dims >= 2) {
-         height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec);
-         row_stride1_vec = lp_build_get_level_stride_vec(bld, row_stride_array,
-                                                         ilevel1);
-         if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) {
-            img_stride1_vec = lp_build_get_level_stride_vec(bld,
-                                                            img_stride_array,
-                                                            ilevel1);
-            if (dims ==3) {
-               depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec);
-            }
-         }
-      }
-   }
-
-   /*
-    * Choose cube face, recompute per-face texcoords.
-    */
-   if (bld->static_state->target == PIPE_TEXTURE_CUBE) {
-      LLVMValueRef face, face_s, face_t;
-      lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t);
-      s = face_s; /* vec */
-      t = face_t; /* vec */
-      /* use 'r' to indicate cube face */
-      r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */
-   }
+   /* compute image size(s) of source mipmap level(s) */
+   lp_build_mipmap_level_sizes(bld, dims, width_vec, height_vec, depth_vec,
+                               ilevel0, ilevel1,
+                               row_stride_array, img_stride_array,
+                               &width0_vec, &width1_vec,
+                               &height0_vec, &height1_vec,
+                               &depth0_vec, &depth1_vec,
+                               &row_stride0_vec, &row_stride1_vec,
+                               &img_stride0_vec, &img_stride1_vec);
 
    /*
     * Get pointer(s) to image data for mipmap level(s).
@@ -1711,7 +1002,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
     */
    if (min_filter == mag_filter) {
       /* no need to distinquish between minification and magnification */
-      lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, lod_fpart,
+      lp_build_sample_mipmap(bld, unit,
+                             min_filter, mip_filter, s, t, r, lod_fpart,
                              width0_vec, width1_vec,
                              height0_vec, height1_vec,
                              depth0_vec, depth1_vec,
@@ -1743,7 +1035,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       lp_build_if(&if_ctx, flow_ctx, bld->builder, minify);
       {
          /* Use the minification filter */
-         lp_build_sample_mipmap(bld, min_filter, mip_filter,
+         lp_build_sample_mipmap(bld, unit,
+                                min_filter, mip_filter,
                                 s, t, r, lod_fpart,
                                 width0_vec, width1_vec,
                                 height0_vec, height1_vec,
@@ -1756,7 +1049,8 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
       lp_build_else(&if_ctx);
       {
          /* Use the magnification filter */
-         lp_build_sample_mipmap(bld, mag_filter, mip_filter,
+         lp_build_sample_mipmap(bld, unit,
+                                mag_filter, mip_filter,
                                 s, t, r, lod_fpart,
                                 width0_vec, width1_vec,
                                 height0_vec, height1_vec,
@@ -1774,280 +1068,70 @@ lp_build_sample_general(struct lp_build_sample_context *bld,
 }
 
 
-
-static void
-lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
-                          struct lp_type dst_type,
-                          LLVMValueRef packed,
-                          LLVMValueRef *rgba)
-{
-   LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff);
-   unsigned chan;
-
-   /* Decode the input vector components */
-   for (chan = 0; chan < 4; ++chan) {
-      unsigned start = chan*8;
-      unsigned stop = start + 8;
-      LLVMValueRef input;
-
-      input = packed;
-
-      if(start)
-         input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(dst_type, start), "");
-
-      if(stop < 32)
-         input = LLVMBuildAnd(builder, input, mask, "");
-
-      input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input);
-
-      rgba[chan] = input;
-   }
-}
-
-
-static void
-lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
-                              LLVMValueRef s,
-                              LLVMValueRef t,
-                              LLVMValueRef width,
-                              LLVMValueRef height,
-                              LLVMValueRef stride_array,
-                              LLVMValueRef data_array,
-                              LLVMValueRef *texel)
-{
-   LLVMBuilderRef builder = bld->builder;
-   struct lp_build_context i32, h16, u8n;
-   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
-   LLVMValueRef i32_c8, i32_c128, i32_c255;
-   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
-   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
-   LLVMValueRef x0, x1;
-   LLVMValueRef y0, y1;
-   LLVMValueRef neighbors[2][2];
-   LLVMValueRef neighbors_lo[2][2];
-   LLVMValueRef neighbors_hi[2][2];
-   LLVMValueRef packed, packed_lo, packed_hi;
-   LLVMValueRef unswizzled[4];
-   LLVMValueRef stride;
-
-   lp_build_context_init(&i32, builder, lp_type_int_vec(32));
-   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
-   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
-
-   i32_vec_type = lp_build_vec_type(i32.type);
-   h16_vec_type = lp_build_vec_type(h16.type);
-   u8n_vec_type = lp_build_vec_type(u8n.type);
-
-   if (bld->static_state->normalized_coords) {
-      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
-      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width, coord_vec_type, "");
-      LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height, coord_vec_type, "");
-      s = lp_build_mul(&bld->coord_bld, s, fp_width);
-      t = lp_build_mul(&bld->coord_bld, t, fp_height);
-   }
-
-   /* scale coords by 256 (8 fractional bits) */
-   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
-   t = lp_build_mul_imm(&bld->coord_bld, t, 256);
-
-   /* convert float to int */
-   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
-   t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
-
-   /* subtract 0.5 (add -128) */
-   i32_c128 = lp_build_const_int_vec(i32.type, -128);
-   s = LLVMBuildAdd(builder, s, i32_c128, "");
-   t = LLVMBuildAdd(builder, t, i32_c128, "");
-
-   /* compute floor (shift right 8) */
-   i32_c8 = lp_build_const_int_vec(i32.type, 8);
-   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
-   t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
-
-   /* compute fractional part (AND with 0xff) */
-   i32_c255 = lp_build_const_int_vec(i32.type, 255);
-   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
-   t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
-
-   x0 = s_ipart;
-   y0 = t_ipart;
-
-   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
-   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
-
-   x0 = lp_build_sample_wrap_int(bld, x0, width,  bld->static_state->pot_width,
-                                 bld->static_state->wrap_s);
-   y0 = lp_build_sample_wrap_int(bld, y0, height, bld->static_state->pot_height,
-                                 bld->static_state->wrap_t);
-
-   x1 = lp_build_sample_wrap_int(bld, x1, width,  bld->static_state->pot_width,
-                                 bld->static_state->wrap_s);
-   y1 = lp_build_sample_wrap_int(bld, y1, height, bld->static_state->pot_height,
-                                 bld->static_state->wrap_t);
-
-   /*
-    * Transform 4 x i32 in
-    *
-    *   s_fpart = {s0, s1, s2, s3}
-    *
-    * into 8 x i16
-    *
-    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
-    *
-    * into two 8 x i16
-    *
-    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
-    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
-    *
-    * and likewise for t_fpart. There is no risk of loosing precision here
-    * since the fractional parts only use the lower 8bits.
-    */
-
-   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
-   t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
-
-   {
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
-      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
-      LLVMValueRef shuffle_lo;
-      LLVMValueRef shuffle_hi;
-      unsigned i, j;
-
-      for(j = 0; j < h16.type.length; j += 4) {
-         unsigned subindex = util_cpu_caps.little_endian ? 0 : 1;
-         LLVMValueRef index;
-
-         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
-         for(i = 0; i < 4; ++i)
-            shuffles_lo[j + i] = index;
-
-         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
-         for(i = 0; i < 4; ++i)
-            shuffles_hi[j + i] = index;
-      }
-
-      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
-      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
-
-      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_lo, "");
-      t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_lo, "");
-      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_hi, "");
-      t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
-   }
-
-   stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0);
-
-   /*
-    * Fetch the pixels as 4 x 32bit (rgba order might differ):
-    *
-    *   rgba0 rgba1 rgba2 rgba3
-    *
-    * bit cast them into 16 x u8
-    *
-    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
-    *
-    * unpack them into two 8 x i16:
-    *
-    *   r0 g0 b0 a0 r1 g1 b1 a1
-    *   r2 g2 b2 a2 r3 g3 b3 a3
-    *
-    * The higher 8 bits of the resulting elements will be zero.
-    */
-
-   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array);
-   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array);
-   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array);
-   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array);
-
-   neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
-   neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
-   neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, "");
-   neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, "");
-
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]);
-   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]);
-
-   /*
-    * Linear interpolate with 8.8 fixed point.
-    */
-
-   packed_lo = lp_build_lerp_2d(&h16,
-                                s_fpart_lo, t_fpart_lo,
-                                neighbors_lo[0][0],
-                                neighbors_lo[0][1],
-                                neighbors_lo[1][0],
-                                neighbors_lo[1][1]);
-
-   packed_hi = lp_build_lerp_2d(&h16,
-                                s_fpart_hi, t_fpart_hi,
-                                neighbors_hi[0][0],
-                                neighbors_hi[0][1],
-                                neighbors_hi[1][0],
-                                neighbors_hi[1][1]);
-
-   packed = lp_build_pack2(builder, h16.type, u8n.type, packed_lo, packed_hi);
-
-   /*
-    * Convert to SoA and swizzle.
-    */
-
-   packed = LLVMBuildBitCast(builder, packed, i32_vec_type, "");
-
-   lp_build_rgba8_to_f32_soa(bld->builder,
-                             bld->texel_type,
-                             packed, unswizzled);
-
-   lp_build_format_swizzle_soa(bld->format_desc,
-                               bld->texel_type, unswizzled,
-                               texel);
-
-   lp_build_swizzle_soa(bld, texel);
-}
-
-
+/**
+ * Do shadow test/comparison; returns early for PIPE_TEX_COMPARE_NONE.
+ * \param p  the texcoord Z (aka R, aka P) component
+ * \param texel  in: [0] is compared with p; out: [0..2]=result, [3]=one
+ */
 static void
 lp_build_sample_compare(struct lp_build_sample_context *bld,
                         LLVMValueRef p,
-                        LLVMValueRef *texel)
+                        LLVMValueRef texel[4])
 {
    struct lp_build_context *texel_bld = &bld->texel_bld;
    LLVMValueRef res;
-   unsigned chan;
+   const unsigned chan = 0;   /* only the first (X) channel is compared */
 
-   if(bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE)
+   if (bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE)
       return;
 
-   /* TODO: Compare before swizzling, to avoid redundant computations */
-   res = NULL;
-   for(chan = 0; chan < 4; ++chan) {
-      LLVMValueRef cmp;
-      cmp = lp_build_cmp(texel_bld, bld->static_state->compare_func, p, texel[chan]);
-      cmp = lp_build_select(texel_bld, cmp, texel_bld->one, texel_bld->zero);
-
-      if(res)
-         res = lp_build_add(texel_bld, res, cmp);
-      else
-         res = cmp;
+   /* debug code, disabled by if (0): prints element 0 of p and texel[0] */
+   if (0) {
+      LLVMValueRef indx = lp_build_const_int32(0);
+      LLVMValueRef coord = LLVMBuildExtractElement(bld->builder, p, indx, "");
+      LLVMValueRef tex = LLVMBuildExtractElement(bld->builder,
+                                                 texel[chan], indx, "");
+      lp_build_printf(bld->builder, "shadow compare coord %f to texture %f\n",
+                      coord, tex);
    }
 
-   assert(res);
-   res = lp_build_mul(texel_bld, res, lp_build_const_vec(texel_bld->type, 0.25));
+   /* result = (p FUNC texel[0]) ? one : zero, FUNC = static compare_func */
+   res = lp_build_cmp(texel_bld, bld->static_state->compare_func,
+                      p, texel[chan]);
+   res = lp_build_select(texel_bld, res, texel_bld->one, texel_bld->zero);
 
    /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   for(chan = 0; chan < 3; ++chan)
-      texel[chan] = res;
+   texel[0] =
+   texel[1] =
+   texel[2] = res;
    texel[3] = texel_bld->one;
 }
 
 
 /**
+ * Debug helper: instead of sampling the texture, store lp_build_one(type)
+ * ("white") in every one of the four output channels.
+ */
+void
+lp_build_sample_nop(struct lp_type type,        /* type given to lp_build_one() */
+                    LLVMValueRef texel_out[4])  /* out: all four entries written */
+{
+   LLVMValueRef one = lp_build_one(type);
+   unsigned chan;
+
+   for (chan = 0; chan < 4; chan++) {
+      texel_out[chan] = one;
+   }  
+}
+
+
+/**
  * Build texture sampling code.
  * 'texel' will return a vector of four LLVMValueRefs corresponding to
  * R, G, B, A.
  * \param type  vector float type to use for coords, etc.
+ * \param ddx  partial derivatives of (s,t,r,q) with respect to x
+ * \param ddy  partial derivatives of (s,t,r,q) with respect to y
  */
 void
 lp_build_sample_soa(LLVMBuilderRef builder,
@@ -2057,8 +1141,11 @@ lp_build_sample_soa(LLVMBuilderRef builder,
                     unsigned unit,
                     unsigned num_coords,
                     const LLVMValueRef *coords,
-                    LLVMValueRef lodbias,
-                    LLVMValueRef *texel)
+                    const LLVMValueRef ddx[4],
+                    const LLVMValueRef ddy[4],
+                    LLVMValueRef lod_bias, /* optional */
+                    LLVMValueRef explicit_lod, /* optional */
+                    LLVMValueRef texel_out[4])
 {
    struct lp_build_sample_context bld;
    LLVMValueRef width, width_vec;
@@ -2069,6 +1156,14 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    LLVMValueRef s;
    LLVMValueRef t;
    LLVMValueRef r;
+   struct lp_type float_vec_type;
+
+   if (0) {
+      enum pipe_format fmt = static_state->format;
+      debug_printf("Sample from %s\n", util_format_name(fmt));
+   }
+
+   assert(type.floating);
 
    /* Setup our build context */
    memset(&bld, 0, sizeof bld);
@@ -2084,7 +1179,10 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    bld.int_coord_type = lp_int_type(type);
    bld.texel_type = type;
 
+   float_vec_type = lp_type_float_vec(32);
+
    lp_build_context_init(&bld.float_bld, builder, bld.float_type);
+   lp_build_context_init(&bld.float_vec_bld, builder, float_vec_type);
    lp_build_context_init(&bld.int_bld, builder, bld.int_type);
    lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
    lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type);
@@ -2104,29 +1202,48 @@ lp_build_sample_soa(LLVMBuilderRef builder,
    t = coords[1];
    r = coords[2];
 
+   /* width, height, depth as uint vectors */
    width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
    height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
    depth_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, depth);
 
-   if (util_format_is_rgba8_variant(bld.format_desc) &&
-       static_state->target == PIPE_TEXTURE_2D &&
-       static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR &&
-       static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR &&
-       static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
-       is_simple_wrap_mode(static_state->wrap_s) &&
-       is_simple_wrap_mode(static_state->wrap_t)) {
-      /* special case */
-      lp_build_sample_2d_linear_aos(&bld, s, t, width_vec, height_vec,
-                                    row_stride_array, data_array, texel);
+   if (0) {
+      /* For debug: no-op texture sampling */
+      lp_build_sample_nop(bld.texel_type, texel_out);
+   }
+   else if (util_format_fits_8unorm(bld.format_desc) &&
+            lp_is_simple_wrap_mode(static_state->wrap_s) &&
+            lp_is_simple_wrap_mode(static_state->wrap_t)) {
+      /* do sampling/filtering with fixed pt arithmetic */
+      lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy,
+                          lod_bias, explicit_lod,
+                          width, height, depth,
+                          width_vec, height_vec, depth_vec,
+                          row_stride_array, img_stride_array,
+                          data_array, texel_out);
    }
+
    else {
-      lp_build_sample_general(&bld, unit, s, t, r, lodbias,
+      if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
+          util_format_fits_8unorm(bld.format_desc)) {
+         debug_printf("%s: using floating point linear filtering for %s\n",
+                      __FUNCTION__, bld.format_desc->short_name);
+         debug_printf("  min_img %d  mag_img %d  mip %d  wraps %d  wrapt %d\n",
+                      static_state->min_img_filter,
+                      static_state->mag_img_filter,
+                      static_state->min_mip_filter,
+                      static_state->wrap_s,
+                      static_state->wrap_t);
+      }
+
+      lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy,
+                              lod_bias, explicit_lod,
                               width, height, depth,
                               width_vec, height_vec, depth_vec,
                               row_stride_array, img_stride_array,
                               data_array,
-                              texel);
+                              texel_out);
    }
 
-   lp_build_sample_compare(&bld, r, texel);
+   lp_build_sample_compare(&bld, r, texel_out);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.c b/src/gallium/auxiliary/gallivm/lp_bld_struct.c
index 3998ac374fe..4693c2de6f9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_struct.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.c
@@ -49,6 +49,8 @@ lp_build_struct_get_ptr(LLVMBuilderRef builder,
 {
    LLVMValueRef indices[2];
    LLVMValueRef member_ptr;
+   assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind);
+   assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMStructTypeKind);
    indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
    indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0);
    member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
@@ -65,8 +67,91 @@ lp_build_struct_get(LLVMBuilderRef builder,
 {
    LLVMValueRef member_ptr;
    LLVMValueRef res;
+   assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind);
+   assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMStructTypeKind);
    member_ptr = lp_build_struct_get_ptr(builder, ptr, member, name);
    res = LLVMBuildLoad(builder, member_ptr, "");
    lp_build_name(res, "%s.%s", LLVMGetValueName(ptr), name);
    return res;
 }
+
+
+LLVMValueRef
+lp_build_array_get_ptr(LLVMBuilderRef builder,
+                       LLVMValueRef ptr,
+                       LLVMValueRef index)
+{
+   LLVMValueRef indices[2];
+   LLVMValueRef element_ptr;
+   assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind);
+   assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMArrayTypeKind);
+   indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   indices[1] = index;
+   element_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+#ifdef DEBUG
+   lp_build_name(element_ptr, "&%s[%s]",
+                 LLVMGetValueName(ptr), LLVMGetValueName(index));
+#endif
+   return element_ptr;
+}
+
+
+LLVMValueRef
+lp_build_array_get(LLVMBuilderRef builder,
+                   LLVMValueRef ptr,
+                   LLVMValueRef index)
+{
+   LLVMValueRef element_ptr;
+   LLVMValueRef res;
+   assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind);
+   assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMArrayTypeKind);
+   element_ptr = lp_build_array_get_ptr(builder, ptr, index);
+   res = LLVMBuildLoad(builder, element_ptr, "");
+#ifdef DEBUG
+   lp_build_name(res, "%s[%s]", LLVMGetValueName(ptr), LLVMGetValueName(index));
+#endif
+   return res;
+}
+
+
+void
+lp_build_array_set(LLVMBuilderRef builder,
+                   LLVMValueRef ptr,
+                   LLVMValueRef index,
+                   LLVMValueRef value)
+{
+   LLVMValueRef element_ptr;
+   assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind);
+   assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMArrayTypeKind);
+   element_ptr = lp_build_array_get_ptr(builder, ptr, index);
+   LLVMBuildStore(builder, value, element_ptr);
+}
+
+
+LLVMValueRef
+lp_build_pointer_get(LLVMBuilderRef builder,
+                     LLVMValueRef ptr,
+                     LLVMValueRef index)
+{
+   LLVMValueRef element_ptr;
+   LLVMValueRef res;
+   assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind);
+   element_ptr = LLVMBuildGEP(builder, ptr, &index, 1, "");
+   res = LLVMBuildLoad(builder, element_ptr, "");
+#ifdef DEBUG
+   lp_build_name(res, "%s[%s]", LLVMGetValueName(ptr), LLVMGetValueName(index));
+#endif
+   return res;
+}
+
+
+void
+lp_build_pointer_set(LLVMBuilderRef builder,
+                     LLVMValueRef ptr,
+                     LLVMValueRef index,
+                     LLVMValueRef value)
+{
+   /* Emit a store of value to ptr[index]; ptr must have LLVM pointer type. */
+   assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind);
+   LLVMBuildStore(builder, value, LLVMBuildGEP(builder, ptr, &index, 1, ""));
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.h b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
index 147336edb4b..eb87a8eee9e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_struct.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
@@ -71,5 +71,46 @@ lp_build_struct_get(LLVMBuilderRef builder,
                     unsigned member,
                     const char *name);
 
+/**
+ * Get value pointer to an array element.
+ */
+LLVMValueRef
+lp_build_array_get_ptr(LLVMBuilderRef builder,
+                       LLVMValueRef ptr,
+                       LLVMValueRef index);
+
+/**
+ * Get the value of an array element.
+ */
+LLVMValueRef
+lp_build_array_get(LLVMBuilderRef builder,
+                   LLVMValueRef ptr,
+                   LLVMValueRef index);
+
+/**
+ * Set the value of an array element.
+ */
+void
+lp_build_array_set(LLVMBuilderRef builder,
+                   LLVMValueRef ptr,
+                   LLVMValueRef index,
+                   LLVMValueRef value);
+
+/**
+ * Get the value of the element at the given index from a pointer.
+ */
+LLVMValueRef
+lp_build_pointer_get(LLVMBuilderRef builder,
+                   LLVMValueRef ptr,
+                   LLVMValueRef index);
+
+/**
+ * Set the value of the element at the given index through a pointer.
+ */
+void
+lp_build_pointer_set(LLVMBuilderRef builder,
+                     LLVMValueRef ptr,
+                     LLVMValueRef index,
+                     LLVMValueRef value);
 
 #endif /* !LP_BLD_STRUCT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
index 278c838eaca..2e9e8386de0 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -60,28 +60,53 @@ lp_build_broadcast(LLVMBuilderRef builder,
 }
 
 
+/**
+ * Broadcast a scalar value to all elements of a vector of the context's type.
+ */
 LLVMValueRef
 lp_build_broadcast_scalar(struct lp_build_context *bld,
                           LLVMValueRef scalar)
 {
    const struct lp_type type = bld->type;
-   LLVMValueRef res;
-   unsigned i;
 
-   res = bld->undef;
-   for(i = 0; i < type.length; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      res = LLVMBuildInsertElement(bld->builder, res, scalar, index, "");
-   }
+   assert(lp_check_elem_type(type, LLVMTypeOf(scalar)));
 
-   return res;
+   if (type.length == 1) {
+      return scalar;
+   }
+   else {
+      LLVMValueRef res;
+
+#if HAVE_LLVM >= 0x207
+      /* The shuffle vector is always made of int32 elements */
+      struct lp_type i32_vec_type = lp_type_int_vec(32);
+      i32_vec_type.length = type.length;
+
+      res = LLVMBuildInsertElement(bld->builder, bld->undef, scalar,
+                                   LLVMConstInt(LLVMInt32Type(), 0, 0), "");
+      res = LLVMBuildShuffleVector(bld->builder, res, bld->undef,
+                                   lp_build_const_int_vec(i32_vec_type, 0), "");
+#else
+      /* XXX: The above path provokes a bug in LLVM 2.6 */
+      unsigned i;
+      res = bld->undef;
+      for(i = 0; i < type.length; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         res = LLVMBuildInsertElement(bld->builder, res, scalar, index, "");
+      }
+#endif
+      return res;
+   }
 }
 
 
+/**
+ * Swizzle one channel into all other three channels.
+ */
 LLVMValueRef
-lp_build_broadcast_aos(struct lp_build_context *bld,
-                       LLVMValueRef a,
-                       unsigned channel)
+lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
+                            LLVMValueRef a,
+                            unsigned channel)
 {
    const struct lp_type type = bld->type;
    const unsigned n = type.length;
@@ -93,7 +118,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
    /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
     * using shuffles here actually causes worst results. More investigation is
     * needed. */
-   if (n <= 4) {
+   if (type.width >= 16) {
       /*
        * Shuffle.
        */
@@ -115,21 +140,25 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
        *   YY00 YY00 .... YY00
        *   YYYY YYYY .... YYYY  <= output
        */
-      struct lp_type type4 = type;
+      struct lp_type type4;
       const char shifts[4][2] = {
          { 1,  2},
          {-1,  2},
          { 1, -2},
          {-1, -2}
       };
-      boolean cond[4];
       unsigned i;
 
-      memset(cond, 0, sizeof cond);
-      cond[channel] = 1;
+      a = LLVMBuildAnd(bld->builder, a,
+                       lp_build_const_mask_aos(type, 1 << channel), "");
 
-      a = LLVMBuildAnd(bld->builder, a, lp_build_const_mask_aos(type, cond), "");
+      /*
+       * Build a type where each element is an integer that covers the four
+       * channels.
+       */
 
+      type4 = type;
+      type4.floating = FALSE;
       type4.width *= 4;
       type4.length /= 4;
 
@@ -159,81 +188,248 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 
 
 LLVMValueRef
-lp_build_swizzle1_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      const unsigned char swizzle[4])
+lp_build_swizzle_aos(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     const unsigned char swizzles[4])
 {
-   const unsigned n = bld->type.length;
+   const struct lp_type type = bld->type;
+   const unsigned n = type.length;
    unsigned i, j;
 
-   if(a == bld->undef || a == bld->zero || a == bld->one)
+   if (swizzles[0] == PIPE_SWIZZLE_RED &&
+       swizzles[1] == PIPE_SWIZZLE_GREEN &&
+       swizzles[2] == PIPE_SWIZZLE_BLUE &&
+       swizzles[3] == PIPE_SWIZZLE_ALPHA) {
       return a;
+   }
 
-   if(swizzle[0] == swizzle[1] && swizzle[1] == swizzle[2] && swizzle[2] == swizzle[3])
-      return lp_build_broadcast_aos(bld, a, swizzle[0]);
+   if (swizzles[0] == swizzles[1] &&
+       swizzles[1] == swizzles[2] &&
+       swizzles[2] == swizzles[3]) {
+      switch (swizzles[0]) {
+      case PIPE_SWIZZLE_RED:
+      case PIPE_SWIZZLE_GREEN:
+      case PIPE_SWIZZLE_BLUE:
+      case PIPE_SWIZZLE_ALPHA:
+         return lp_build_swizzle_scalar_aos(bld, a, swizzles[0]);
+      case PIPE_SWIZZLE_ZERO:
+         return bld->zero;
+      case PIPE_SWIZZLE_ONE:
+         return bld->one;
+      default:
+         assert(0);
+         return bld->undef;
+      }
+   }
 
-   {
+   if (type.width >= 16) {
       /*
        * Shuffle.
        */
-      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef undef = LLVMGetUndef(lp_build_elem_type(type));
+      LLVMTypeRef i32t = LLVMInt32Type();
       LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef aux[LP_MAX_VECTOR_LENGTH];
+
+      memset(aux, 0, sizeof aux);
+
+      for(j = 0; j < n; j += 4) {
+         for(i = 0; i < 4; ++i) {
+            unsigned shuffle;
+            switch (swizzles[i]) {
+            default:
+               assert(0);
+               /* fall through */
+            case PIPE_SWIZZLE_RED:
+            case PIPE_SWIZZLE_GREEN:
+            case PIPE_SWIZZLE_BLUE:
+            case PIPE_SWIZZLE_ALPHA:
+               shuffle = j + swizzles[i];
+               break;
+            case PIPE_SWIZZLE_ZERO:
+               shuffle = type.length + 0;
+               if (!aux[0]) {
+                  aux[0] = lp_build_const_elem(type, 0.0);
+               }
+               break;
+            case PIPE_SWIZZLE_ONE:
+               shuffle = type.length + 1;
+               if (!aux[1]) {
+                  aux[1] = lp_build_const_elem(type, 1.0);
+               }
+               break;
+            }
+            shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
+         }
+      }
 
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, j + swizzle[i], 0);
+      for (i = 0; i < n; ++i) {
+         if (!aux[i]) {
+            aux[i] = undef;
+         }
+      }
 
-      return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
+      return LLVMBuildShuffleVector(bld->builder, a,
+                                    LLVMConstVector(aux, n),
+                                    LLVMConstVector(shuffles, n), "");
+   } else {
+      /*
+       * Bit mask and shifts.
+       *
+       * For example, this will convert BGRA to RGBA by doing
+       *
+       *   rgba = (bgra & 0x00ff0000) >> 16
+       *        | (bgra & 0xff00ff00)
+       *        | (bgra & 0x000000ff) << 16
+       *
+       * This is necessary not only because it is faster, but also because the
+       * X86 backend will refuse shuffles of <4 x i8> vectors
+       */
+      LLVMValueRef res;
+      struct lp_type type4;
+      unsigned cond = 0;
+      unsigned chan;
+      int shift;
+
+      /*
+       * Start with a mixture of 1 and 0.
+       */
+      for (chan = 0; chan < 4; ++chan) {
+         if (swizzles[chan] == PIPE_SWIZZLE_ONE) {
+            cond |= 1 << chan;
+         }
+      }
+      res = lp_build_select_aos(bld, cond, bld->one, bld->zero);
+
+      /*
+       * Build a type where each element is an integer that covers the four
+       * channels.
+       */
+      type4 = type;
+      type4.floating = FALSE;
+      type4.width *= 4;
+      type4.length /= 4;
+
+      a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type4), "");
+      res = LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type4), "");
+
+      /*
+       * Mask and shift the channels, trying to group as many channels in the
+       * same shift as possible
+       */
+      for (shift = -3; shift <= 3; ++shift) {
+         unsigned long long mask = 0;
+
+         assert(type4.width <= sizeof(mask)*8);
+
+         for (chan = 0; chan < 4; ++chan) {
+            /* FIXME: big endian */
+            if (swizzles[chan] < 4 &&
+                chan - swizzles[chan] == shift) {
+               mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width);
+            }
+         }
+
+         if (mask) {
+            LLVMValueRef masked;
+            LLVMValueRef shifted;
+
+            if (0)
+               debug_printf("shift = %i, mask = 0x%08llx\n", shift, mask);
+
+            masked = LLVMBuildAnd(bld->builder, a,
+                                  lp_build_const_int_vec(type4, mask), "");
+            if (shift > 0) {
+               shifted = LLVMBuildShl(bld->builder, masked,
+                                      lp_build_const_int_vec(type4, shift*type.width), "");
+            } else if (shift < 0) {
+               shifted = LLVMBuildLShr(bld->builder, masked,
+                                       lp_build_const_int_vec(type4, -shift*type.width), "");
+            } else {
+               shifted = masked;
+            }
+
+            res = LLVMBuildOr(bld->builder, res, shifted, "");
+         }
+      }
+
+      return LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type), "");
    }
 }
 
 
+/**
+ * Extended swizzle of a single channel of a SoA vector.
+ *
+ * @param bld         building context
+ * @param unswizzled  array with the 4 unswizzled values
+ * @param swizzle     one of the PIPE_SWIZZLE_*
+ *
+ * @return  the swizzled value.
+ */
 LLVMValueRef
-lp_build_swizzle2_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      LLVMValueRef b,
-                      const unsigned char swizzle[4])
+lp_build_swizzle_soa_channel(struct lp_build_context *bld,
+                             const LLVMValueRef *unswizzled,
+                             unsigned swizzle)
 {
-   const unsigned n = bld->type.length;
-   unsigned i, j;
+   switch (swizzle) {
+   case PIPE_SWIZZLE_RED:
+   case PIPE_SWIZZLE_GREEN:
+   case PIPE_SWIZZLE_BLUE:
+   case PIPE_SWIZZLE_ALPHA:
+      return unswizzled[swizzle];
+   case PIPE_SWIZZLE_ZERO:
+      return bld->zero;
+   case PIPE_SWIZZLE_ONE:
+      return bld->one;
+   default:
+      assert(0);
+      return bld->undef;
+   }
+}
 
-   if(swizzle[0] < 4 && swizzle[1] < 4 && swizzle[2] < 4 && swizzle[3] < 4)
-      return lp_build_swizzle1_aos(bld, a, swizzle);
 
-   if(a == b) {
-      unsigned char swizzle1[4];
-      swizzle1[0] = swizzle[0] % 4;
-      swizzle1[1] = swizzle[1] % 4;
-      swizzle1[2] = swizzle[2] % 4;
-      swizzle1[3] = swizzle[3] % 4;
-      return lp_build_swizzle1_aos(bld, a, swizzle1);
-   }
+/**
+ * Extended swizzle of a SoA vector.
+ *
+ * @param bld         building context
+ * @param unswizzled  array with the 4 unswizzled values
+ * @param swizzles    array of PIPE_SWIZZLE_*
+ * @param swizzled    output swizzled values
+ */
+void
+lp_build_swizzle_soa(struct lp_build_context *bld,
+                     const LLVMValueRef *unswizzled,
+                     const unsigned char swizzles[4],
+                     LLVMValueRef *swizzled)
+{
+   unsigned chan;
 
-   if(swizzle[0] % 4 == 0 &&
-      swizzle[1] % 4 == 1 &&
-      swizzle[2] % 4 == 2 &&
-      swizzle[3] % 4 == 3) {
-      boolean cond[4];
-      cond[0] = swizzle[0] / 4;
-      cond[1] = swizzle[1] / 4;
-      cond[2] = swizzle[2] / 4;
-      cond[3] = swizzle[3] / 4;
-      return lp_build_select_aos(bld, a, b, cond);
+   for (chan = 0; chan < 4; ++chan) {
+      swizzled[chan] = lp_build_swizzle_soa_channel(bld, unswizzled,
+                                                    swizzles[chan]);
    }
+}
 
-   {
-      /*
-       * Shuffle.
-       */
-      LLVMTypeRef elem_type = LLVMInt32Type();
-      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
 
-      for(j = 0; j < n; j += 4)
-         for(i = 0; i < 4; ++i)
-            shuffles[j + i] = LLVMConstInt(elem_type, j + (swizzle[i] % 4) + (swizzle[i] / 4 * n), 0);
+/**
+ * Do an extended swizzle of a SoA vector inplace.
+ *
+ * @param bld         building context
+ * @param values      input/output array with the 4 values
+ * @param swizzles    array of PIPE_SWIZZLE_*
+ */
+void
+lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
+                             LLVMValueRef *values,
+                             const unsigned char swizzles[4])
+{
+   LLVMValueRef unswizzled[4];
+   unsigned chan;
 
-      return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
+   for (chan = 0; chan < 4; ++chan) {
+      unswizzled[chan] = values[chan];
    }
-}
-
 
+   lp_build_swizzle_soa(bld, unswizzled, swizzles, values);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
index 138ca620e63..f9b6a5e7258 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -60,7 +60,7 @@ lp_build_broadcast_scalar(struct lp_build_context *bld,
  * all four channel.
  */
 LLVMValueRef
-lp_build_broadcast_aos(struct lp_build_context *bld,
+lp_build_swizzle_scalar_aos(struct lp_build_context *bld,
                        LLVMValueRef a,
                        unsigned channel);
 
@@ -68,24 +68,31 @@ lp_build_broadcast_aos(struct lp_build_context *bld,
 /**
  * Swizzle a vector consisting of an array of XYZW structs.
  *
- * @param swizzle is the in [0,4[ range.
+ * @param swizzles  array of four PIPE_SWIZZLE_* values (ZERO and ONE included).
  */
 LLVMValueRef
-lp_build_swizzle1_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      const unsigned char swizzle[4]);
+lp_build_swizzle_aos(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     const unsigned char swizzles[4]);
 
 
-/**
- * Swizzle two vector consisting of an array of XYZW structs.
- *
- * @param swizzle is the in [0,8[ range. Values in [4,8[ range refer to b.
- */
 LLVMValueRef
-lp_build_swizzle2_aos(struct lp_build_context *bld,
-                      LLVMValueRef a,
-                      LLVMValueRef b,
-                      const unsigned char swizzle[4]);
+lp_build_swizzle_soa_channel(struct lp_build_context *bld,
+                             const LLVMValueRef *unswizzled,
+                             unsigned swizzle);
+
+
+void
+lp_build_swizzle_soa(struct lp_build_context *bld,
+                     const LLVMValueRef *unswizzled,
+                     const unsigned char swizzles[4],
+                     LLVMValueRef *swizzled);
+
+
+void
+lp_build_swizzle_soa_inplace(struct lp_build_context *bld,
+                             LLVMValueRef *values,
+                             const unsigned char swizzles[4]);
 
 
 #endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 2eac5da6c69..97318b3456c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -45,6 +45,15 @@ struct lp_build_context;
 struct lp_build_mask_context;
 
 
+enum lp_build_tex_modifier {
+   LP_BLD_TEX_MODIFIER_NONE = 0,
+   LP_BLD_TEX_MODIFIER_PROJECTED,
+   LP_BLD_TEX_MODIFIER_LOD_BIAS,
+   LP_BLD_TEX_MODIFIER_EXPLICIT_LOD,
+   LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV
+};
+
+
 /**
  * Sampler code generation interface.
  *
@@ -59,17 +68,34 @@ struct lp_build_sampler_soa
    (*destroy)( struct lp_build_sampler_soa *sampler );
 
    void
-   (*emit_fetch_texel)( struct lp_build_sampler_soa *sampler,
+   (*emit_fetch_texel)( const struct lp_build_sampler_soa *sampler,
                         LLVMBuilderRef builder,
                         struct lp_type type,
                         unsigned unit,
                         unsigned num_coords,
                         const LLVMValueRef *coords,
-                        LLVMValueRef lodbias,
+                        const LLVMValueRef *ddx,
+                        const LLVMValueRef *ddy,
+                        LLVMValueRef lod_bias, /* optional */
+                        LLVMValueRef explicit_lod, /* optional */
                         LLVMValueRef *texel);
 };
 
 
+struct lp_build_sampler_aos
+{
+   LLVMValueRef
+   (*emit_fetch_texel)( struct lp_build_sampler_aos *sampler,
+                        struct lp_build_context *bld,
+                        unsigned target, /* TGSI_TEXTURE_* */
+                        unsigned unit,
+                        LLVMValueRef coords,
+                        LLVMValueRef ddx,
+                        LLVMValueRef ddy,
+                        enum lp_build_tex_modifier modifier);
+};
+
+
 void
 lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const struct tgsi_token *tokens,
@@ -80,7 +106,19 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const LLVMValueRef (*inputs)[4],
                   LLVMValueRef (*outputs)[4],
                   struct lp_build_sampler_soa *sampler,
-                  struct tgsi_shader_info *info);
+                  const struct tgsi_shader_info *info);
+
+
+void
+lp_build_tgsi_aos(LLVMBuilderRef builder,
+                  const struct tgsi_token *tokens,
+                  struct lp_type type,
+                  const unsigned char swizzles[4],
+                  LLVMValueRef consts_ptr,
+                  const LLVMValueRef *inputs,
+                  LLVMValueRef *outputs,
+                  struct lp_build_sampler_aos *sampler,
+                  const struct tgsi_shader_info *info);
 
 
 #endif /* LP_BLD_TGSI_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
new file mode 100644
index 00000000000..d5f963be58d
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -0,0 +1,1176 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * TGSI to LLVM IR translation -- AoS.
+ *
+ * FIXME:
+ * - No control flow support: the existing control flow code should be factored
+ * out of the SoA code into a common module and shared.
+ * - No derivatives. Derivative logic should be pluggable, just like the samplers.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_config.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_scan.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_quad.h"
+#include "lp_bld_tgsi.h"
+#include "lp_bld_limits.h"
+#include "lp_bld_debug.h"
+
+
+#define LP_MAX_INSTRUCTIONS 256
+
+
+/**
+ * State shared by the translation helpers in this file while one TGSI shader
+ * is being translated to AoS LLVM IR.
+ */
+struct lp_build_tgsi_aos_context
+{
+   /* Builder for the register vector type; all arithmetic goes through it */
+   struct lp_build_context base;
+
+   /* Builder for integer masks and indices */
+   struct lp_build_context int_bld;
+
+   /*
+    * AoS swizzle used:
+    * - swizzles[0] = red index
+    * - swizzles[1] = green index
+    * - swizzles[2] = blue index
+    * - swizzles[3] = alpha index
+    *
+    * inv_swizzles is the inverse mapping: inv_swizzles[swizzles[i]] == i.
+    */
+   unsigned char swizzles[4];
+   unsigned char inv_swizzles[4];
+
+   /* Caller supplied constants pointer, input values and output variables */
+   LLVMValueRef consts_ptr;
+   const LLVMValueRef *inputs;
+   LLVMValueRef *outputs;
+
+   /* Texture sampling code generator; may be NULL */
+   struct lp_build_sampler_aos *sampler;
+
+   /*
+    * immediates[] holds constant vectors.  temps[], addr[] and preds[] hold
+    * pointers to alloca'd variables created by emit_declaration(); entries of
+    * undeclared registers stay NULL.
+    */
+   LLVMValueRef immediates[LP_MAX_TGSI_IMMEDIATES];
+   LLVMValueRef temps[LP_MAX_TGSI_TEMPS];
+   LLVMValueRef addr[LP_MAX_TGSI_ADDRS];
+   LLVMValueRef preds[LP_MAX_TGSI_PREDS];
+
+   /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
+    * set in the indirect_files field.
+    * The temps[] array above is unused then.
+    *
+    * NOTE(review): emit_fetch()/emit_store() assert on indirect registers and
+    * only ever access temps[], so this array is allocated but not read yet.
+    */
+   LLVMValueRef temps_array;
+
+   /** bitmask indicating which register files are accessed indirectly */
+   unsigned indirect_files;
+
+   /*
+    * Expanded copies of the shader's instructions in program order, and the
+    * number of entries the array currently has room for.
+    */
+   struct tgsi_full_instruction *instructions;
+   uint max_instructions;
+};
+
+
+/**
+ * Swizzle an AoS vector using TGSI channel selectors.
+ *
+ * The four selectors are expressed in TGSI (x, y, z, w) terms; they are
+ * remapped through the context's swizzles/inv_swizzles tables before being
+ * handed to lp_build_swizzle_aos().
+ */
+static LLVMValueRef
+swizzle_aos(struct lp_build_tgsi_aos_context *bld,
+            LLVMValueRef a,
+            unsigned swizzle_x,
+            unsigned swizzle_y,
+            unsigned swizzle_z,
+            unsigned swizzle_w)
+{
+   unsigned requested[4];
+   unsigned char remapped[4];
+   unsigned i;
+
+   requested[0] = swizzle_x;
+   requested[1] = swizzle_y;
+   requested[2] = swizzle_z;
+   requested[3] = swizzle_w;
+
+   /* Validate every selector before any of them is used as an index */
+   for (i = 0; i < 4; ++i) {
+      assert(requested[i] < 4);
+   }
+
+   for (i = 0; i < 4; ++i) {
+      remapped[bld->inv_swizzles[i]] = bld->swizzles[requested[i]];
+   }
+
+   return lp_build_swizzle_aos(&bld->base, a, remapped);
+}
+
+
+/**
+ * Broadcast one TGSI channel of an AoS vector.
+ *
+ * \p chan is a TGSI channel; it is translated to the context's channel
+ * ordering before calling lp_build_swizzle_scalar_aos().
+ */
+static LLVMValueRef
+swizzle_scalar_aos(struct lp_build_tgsi_aos_context *bld,
+                   LLVMValueRef a,
+                   unsigned chan)
+{
+   const unsigned aos_chan = bld->swizzles[chan];
+
+   return lp_build_swizzle_scalar_aos(&bld->base, a, aos_chan);
+}
+
+
+/**
+ * Register fetch.
+ *
+ * Reads source operand \p src_op of \p inst from its register file, then
+ * applies the operand's absolute-value and negate modifiers and finally its
+ * swizzle.  Indirect addressing is not supported (asserted).
+ *
+ * \return the operand value, or bld->base.undef when the register file is not
+ *         handled or the temporary has no variable allocated
+ */
+static LLVMValueRef
+emit_fetch(
+   struct lp_build_tgsi_aos_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned src_op)
+{
+   struct lp_type type = bld->base.type;
+   const struct tgsi_full_src_register *reg = &inst->Src[src_op];
+   LLVMValueRef res;
+   unsigned chan;
+
+   assert(!reg->Register.Indirect);
+
+   /*
+    * Fetch the value from the register file.
+    */
+
+   switch (reg->Register.File) {
+   case TGSI_FILE_CONSTANT:
+      /*
+       * Get the constant's components, one scalar load per channel
+       */
+
+      res = bld->base.undef;
+      for (chan = 0; chan < 4; ++chan) {
+         LLVMValueRef index;
+         LLVMValueRef scalar_ptr;
+         LLVMValueRef scalar;
+         LLVMValueRef swizzle;
+
+         index = LLVMConstInt(LLVMInt32Type(),
+                              reg->Register.Index*4 + chan,
+                              0);
+
+         scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr,
+                                   &index, 1, "");
+
+         scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
+
+         lp_build_name(scalar, "const[%u].%c", reg->Register.Index, "xyzw"[chan]);
+
+         /*
+          * NOTE: constants array is always assumed to be RGBA
+          */
+
+         swizzle = LLVMConstInt(LLVMInt32Type(), chan, 0);
+
+         res = LLVMBuildInsertElement(bld->base.builder, res, scalar, swizzle, "");
+      }
+
+      /*
+       * Broadcast the first quaternion to all others.
+       *
+       * XXX: could be factored into a reusable function.
+       */
+
+      if (type.length > 4) {
+         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+         unsigned i;
+
+         for (chan = 0; chan < 4; ++chan) {
+            shuffles[chan] = LLVMConstInt(LLVMInt32Type(), chan, 0);
+         }
+
+         for (i = 4; i < type.length; ++i) {
+            shuffles[i] = shuffles[i % 4];
+         }
+
+         res = LLVMBuildShuffleVector(bld->base.builder,
+                                      res, bld->base.undef,
+                                      LLVMConstVector(shuffles, type.length),
+                                      "");
+      }
+      break;
+
+   case TGSI_FILE_IMMEDIATE:
+      res = bld->immediates[reg->Register.Index];
+      assert(res);
+      break;
+
+   case TGSI_FILE_INPUT:
+      res = bld->inputs[reg->Register.Index];
+      assert(res);
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      {
+         LLVMValueRef temp_ptr;
+         temp_ptr = bld->temps[reg->Register.Index];
+         /*
+          * A temporary without an allocated variable has a NULL entry; test
+          * the pointer itself, before it is handed to LLVMBuildLoad.
+          */
+         if (!temp_ptr)
+            return bld->base.undef;
+         res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
+      }
+      break;
+
+   default:
+      assert(0 && "invalid src register in emit_fetch()");
+      return bld->base.undef;
+   }
+
+   /*
+    * Apply sign modifier.
+    */
+
+   if (reg->Register.Absolute) {
+      res = lp_build_abs(&bld->base, res);
+   }
+
+   if(reg->Register.Negate) {
+      res = lp_build_negate(&bld->base, res);
+   }
+
+   /*
+    * Swizzle the argument
+    */
+
+   res = swizzle_aos(bld, res,
+                     reg->Register.SwizzleX,
+                     reg->Register.SwizzleY,
+                     reg->Register.SwizzleZ,
+                     reg->Register.SwizzleW);
+
+   return res;
+}
+
+
+/**
+ * Register store.
+ *
+ * Saturates \p value as requested by the instruction, then writes it to
+ * destination operand \p index of \p inst.  When the instruction is
+ * predicated and/or the destination has a partial writemask, the new value is
+ * blended with the register's current contents so that only the enabled
+ * channels change.  Indirect addressing is not supported (asserted).
+ */
+static void
+emit_store(
+   struct lp_build_tgsi_aos_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   LLVMValueRef value)
+{
+   const struct tgsi_full_dst_register *reg = &inst->Dst[index];
+   LLVMValueRef mask = NULL;
+   LLVMValueRef ptr;
+
+   /*
+    * Saturate the value
+    */
+
+   switch (inst->Instruction.Saturate) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      value = lp_build_max(&bld->base, value, bld->base.zero);
+      value = lp_build_min(&bld->base, value, bld->base.one);
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      value = lp_build_max(&bld->base, value, lp_build_const_vec(bld->base.type, -1.0));
+      value = lp_build_min(&bld->base, value, bld->base.one);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   /*
+    * Translate the register file
+    */
+
+   assert(!reg->Register.Indirect);
+
+   switch (reg->Register.File) {
+   case TGSI_FILE_OUTPUT:
+      ptr = bld->outputs[reg->Register.Index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      ptr = bld->temps[reg->Register.Index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      /*
+       * The destination address register is named by Register.Index, like
+       * every other file; Indirect.Index describes indirect addressing of
+       * the destination, which is asserted off above.
+       */
+      ptr = bld->addr[reg->Register.Index];
+      break;
+
+   case TGSI_FILE_PREDICATE:
+      ptr = bld->preds[reg->Register.Index];
+      break;
+
+   default:
+      assert(0);
+      return;
+   }
+
+   /*
+    * Predicate
+    */
+
+   if (inst->Instruction.Predicate) {
+      LLVMValueRef pred;
+
+      assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS);
+
+      pred = LLVMBuildLoad(bld->base.builder,
+                           bld->preds[inst->Predicate.Index], "");
+
+      /*
+       * Convert the value to an integer mask.
+       */
+      pred = lp_build_compare(bld->base.builder,
+                               bld->base.type,
+                               PIPE_FUNC_NOTEQUAL,
+                               pred,
+                               bld->base.zero);
+
+      if (inst->Predicate.Negate) {
+         pred = LLVMBuildNot(bld->base.builder, pred, "");
+      }
+
+      pred = swizzle_aos(bld, pred,
+                         inst->Predicate.SwizzleX,
+                         inst->Predicate.SwizzleY,
+                         inst->Predicate.SwizzleZ,
+                         inst->Predicate.SwizzleW);
+
+      /* Nothing has contributed to the mask yet, so the predicate is it */
+      mask = pred;
+   }
+
+   /*
+    * Writemask
+    */
+
+   if (reg->Register.WriteMask != TGSI_WRITEMASK_XYZW) {
+      LLVMValueRef writemask;
+
+      writemask = lp_build_const_mask_aos(bld->base.type, reg->Register.WriteMask);
+
+      if (mask) {
+         mask = LLVMBuildAnd(bld->base.builder, mask, writemask, "");
+      } else {
+         mask = writemask;
+      }
+   }
+
+   /*
+    * Keep the previous contents of the channels that are masked off
+    */
+
+   if (mask) {
+      LLVMValueRef orig_value;
+
+      orig_value = LLVMBuildLoad(bld->base.builder, ptr, "");
+      value = lp_build_select(&bld->base,
+                              mask, value, orig_value);
+   }
+
+   LLVMBuildStore(bld->base.builder, value, ptr);
+}
+
+
+/**
+ * High-level instruction translators.
+ */
+
+/**
+ * Translate a texture instruction by delegating to the sampler generator.
+ *
+ * The coordinates are the first source operand.  With explicit derivatives
+ * the next two operands are ddx/ddy and the sampler is the fourth operand;
+ * otherwise the sampler is the second operand and constant one vectors are
+ * passed as derivatives.
+ *
+ * \return the sampler generator's result, or bld->base.undef when no sampler
+ *         generator was supplied
+ */
+static LLVMValueRef
+emit_tex(struct lp_build_tgsi_aos_context *bld,
+         const struct tgsi_full_instruction *inst,
+         enum lp_build_tex_modifier modifier)
+{
+   const boolean explicit_derivs =
+      modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV;
+   const unsigned sampler_src = explicit_derivs ? 3 : 1;
+   LLVMValueRef coords;
+   LLVMValueRef ddx;
+   LLVMValueRef ddy;
+
+   if (bld->sampler == NULL) {
+      _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
+      return bld->base.undef;
+   }
+
+   coords = emit_fetch( bld, inst, 0 );
+
+   if (explicit_derivs) {
+      ddx = emit_fetch( bld, inst, 1 );
+      ddy = emit_fetch( bld, inst, 2 );
+   }
+   else {
+      /* TODO: compute real derivatives of coords (lp_build_ddx/lp_build_ddy) */
+      ddx = bld->base.one;
+      ddy = bld->base.one;
+   }
+
+   return bld->sampler->emit_fetch_texel(bld->sampler,
+                                         &bld->base,
+                                         inst->Texture.Texture,
+                                         inst->Src[sampler_src].Register.Index,
+                                         coords, ddx, ddy,
+                                         modifier);
+}
+
+
+/**
+ * Allocate the variables backing the registers of one TGSI declaration.
+ *
+ * Temporaries, outputs, address and predicate registers each get an alloca'd
+ * vector variable per register in the declared range.  When temporaries are
+ * accessed indirectly a single array covering registers 0..last is allocated
+ * instead.  Other register files need no declaration handling.
+ */
+static void
+emit_declaration(
+   struct lp_build_tgsi_aos_context *bld,
+   const struct tgsi_full_declaration *decl)
+{
+   LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type);
+
+   unsigned first = decl->Range.First;
+   unsigned last = decl->Range.Last;
+   unsigned idx;
+
+   for (idx = first; idx <= last; ++idx) {
+      switch (decl->Declaration.File) {
+      case TGSI_FILE_TEMPORARY:
+         assert(idx < LP_MAX_TGSI_TEMPS);
+         if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
+            /*
+             * The array does not depend on idx: emit it once per declaration
+             * rather than once per register in the range.
+             */
+            if (idx == first) {
+               LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
+                                                      last + 1, 0);
+               bld->temps_array = lp_build_array_alloca(bld->base.builder,
+                                                        vec_type, array_size, "");
+            }
+         } else {
+            bld->temps[idx] = lp_build_alloca(bld->base.builder,
+                                              vec_type, "");
+         }
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         bld->outputs[idx] = lp_build_alloca(bld->base.builder,
+                                             vec_type, "");
+         break;
+
+      case TGSI_FILE_ADDRESS:
+         assert(idx < LP_MAX_TGSI_ADDRS);
+         bld->addr[idx] = lp_build_alloca(bld->base.builder,
+                                          vec_type, "");
+         break;
+
+      case TGSI_FILE_PREDICATE:
+         assert(idx < LP_MAX_TGSI_PREDS);
+         bld->preds[idx] = lp_build_alloca(bld->base.builder,
+                                           vec_type, "");
+         break;
+
+      default:
+         /* don't need to declare other vars */
+         break;
+      }
+   }
+}
+
+
+/**
+ * Emit LLVM for one TGSI instruction.
+ *
+ * Advances *pc to the following instruction, or sets it to -1 when
+ * TGSI_OPCODE_END is reached.
+ *
+ * \param bld   translation context
+ * \param inst  instruction to translate
+ * \param info  opcode info of \p inst; num_dst tells whether a result is stored
+ * \param pc    program counter, updated in place
+ * \return TRUE for success, FALSE when the opcode is not translated
+ */
+static boolean
+emit_instruction(
+   struct lp_build_tgsi_aos_context *bld,
+   const struct tgsi_full_instruction *inst,
+   const struct tgsi_opcode_info *info,
+   int *pc)
+{
+   LLVMValueRef src0, src1, src2;
+   LLVMValueRef tmp0, tmp1;
+   LLVMValueRef dst0;
+
+   /*
+    * Stores and write masks are handled in a general fashion after the long
+    * instruction opcode switch statement.
+    *
+    * Although not strictly necessary, we avoid generating instructions for
+    * channels which won't be stored, in cases where that's easy. For some
+    * complex instructions, like texture sampling, it is more convenient to
+    * assume a full writemask and then let LLVM optimization passes eliminate
+    * redundant code.
+    */
+
+   (*pc)++;
+
+   assert(info->num_dst <= 1);
+   if (info->num_dst) {
+      dst0 = bld->base.undef;
+   }
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      src0 = emit_fetch(bld, inst, 0);
+      dst0 = lp_build_floor(&bld->base, src0);
+      break;
+
+   case TGSI_OPCODE_MOV:
+      dst0 = emit_fetch(bld, inst, 0);
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      src0 = emit_fetch(bld, inst, 0);
+      dst0 = lp_build_rcp(&bld->base, src0);
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      src0 = emit_fetch(bld, inst, 0);
+      tmp0 = lp_build_abs(&bld->base, src0);
+      dst0 = lp_build_rsqrt(&bld->base, tmp0);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      dst0 = lp_build_mul(&bld->base, src0, src1);
+      break;
+
+   case TGSI_OPCODE_ADD:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      dst0 = lp_build_add(&bld->base, src0, src1);
+      break;
+
+   case TGSI_OPCODE_MIN:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      /* MIN must select the smaller operand (was lp_build_max) */
+      dst0 = lp_build_min(&bld->base, src0, src1);
+      break;
+
+   case TGSI_OPCODE_MAX:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      dst0 = lp_build_max(&bld->base, src0, src1);
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_LESS, src0, src1);
+      dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero);
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_GEQUAL, src0, src1);
+      dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero);
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      src2 = emit_fetch(bld, inst, 2);
+      tmp0 = lp_build_mul(&bld->base, src0, src1);
+      dst0 = lp_build_add(&bld->base, tmp0, src2);
+      break;
+
+   case TGSI_OPCODE_SUB:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      dst0 = lp_build_sub(&bld->base, src0, src1);
+      break;
+
+   case TGSI_OPCODE_LRP:
+      /* src0 * (src1 - src2) + src2 */
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      src2 = emit_fetch(bld, inst, 2);
+      tmp0 = lp_build_sub(&bld->base, src1, src2);
+      tmp0 = lp_build_mul(&bld->base, src0, tmp0);
+      dst0 = lp_build_add(&bld->base, tmp0, src2);
+      break;
+
+   case TGSI_OPCODE_CND:
+      /* src2 > 0.5 ? src0 : src1 */
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      src2 = emit_fetch(bld, inst, 2);
+      tmp1 = lp_build_const_vec(bld->base.type, 0.5);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, src2, tmp1);
+      dst0 = lp_build_select(&bld->base, tmp0, src0, src1);
+      break;
+
+   case TGSI_OPCODE_FRC:
+      src0 = emit_fetch(bld, inst, 0);
+      tmp0 = lp_build_floor(&bld->base, src0);
+      dst0 = lp_build_sub(&bld->base, src0, tmp0);
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      src2 = emit_fetch(bld, inst, 2);
+      tmp0 = lp_build_max(&bld->base, src0, src1);
+      dst0 = lp_build_min(&bld->base, tmp0, src2);
+      break;
+
+   case TGSI_OPCODE_FLR:
+      src0 = emit_fetch(bld, inst, 0);
+      dst0 = lp_build_floor(&bld->base, src0);
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      src0 = emit_fetch(bld, inst, 0);
+      dst0 = lp_build_round(&bld->base, src0);
+      break;
+
+   case TGSI_OPCODE_EX2:
+      src0 = emit_fetch(bld, inst, 0);
+      /*
+       * Go through swizzle_scalar_aos() like LG2/POW/COS/SIN do, so that the
+       * X channel is located through the context's channel ordering.
+       */
+      tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X);
+      dst0 = lp_build_exp2(&bld->base, tmp0);
+      break;
+
+   case TGSI_OPCODE_LG2:
+      src0 = emit_fetch(bld, inst, 0);
+      tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X);
+      dst0 = lp_build_log2(&bld->base, tmp0);
+      break;
+
+   case TGSI_OPCODE_POW:
+      src0 = emit_fetch(bld, inst, 0);
+      src0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X);
+      src1 = emit_fetch(bld, inst, 1);
+      src1 = swizzle_scalar_aos(bld, src1, TGSI_SWIZZLE_X);
+      dst0 = lp_build_pow(&bld->base, src0, src1);
+      break;
+
+   case TGSI_OPCODE_ABS:
+      src0 = emit_fetch(bld, inst, 0);
+      dst0 = lp_build_abs(&bld->base, src0);
+      break;
+
+   case TGSI_OPCODE_COS:
+      src0 = emit_fetch(bld, inst, 0);
+      tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X);
+      dst0 = lp_build_cos(&bld->base, tmp0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_EQUAL, src0, src1);
+      dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero);
+      break;
+
+   case TGSI_OPCODE_SFL:
+      dst0 = bld->base.zero;
+      break;
+
+   case TGSI_OPCODE_SGT:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, src0, src1);
+      dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero);
+      break;
+
+   case TGSI_OPCODE_SIN:
+      src0 = emit_fetch(bld, inst, 0);
+      tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X);
+      dst0 = lp_build_sin(&bld->base, tmp0);
+      break;
+
+   case TGSI_OPCODE_SLE:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_LEQUAL, src0, src1);
+      dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero);
+      break;
+
+   case TGSI_OPCODE_SNE:
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_NOTEQUAL, src0, src1);
+      dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero);
+      break;
+
+   case TGSI_OPCODE_STR:
+      dst0 = bld->base.one;
+      break;
+
+   case TGSI_OPCODE_TEX:
+      dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_NONE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD);
+      break;
+
+   case TGSI_OPCODE_TXP:
+      dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_PROJECTED);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      src0 = emit_fetch(bld, inst, 0);
+      dst0 = lp_build_round(&bld->base, src0);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* tell the caller to stop */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_SSG:
+   /* TGSI_OPCODE_SGN */
+      tmp0 = emit_fetch(bld, inst, 0);
+      dst0 = lp_build_sgn(&bld->base, tmp0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      /* src0 < 0 ? src1 : src2 */
+      src0 = emit_fetch(bld, inst, 0);
+      src1 = emit_fetch(bld, inst, 1);
+      src2 = emit_fetch(bld, inst, 2);
+      tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_LESS, src0, bld->base.zero);
+      dst0 = lp_build_select(&bld->base, tmp0, src1, src2);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      src0 = emit_fetch(bld, inst, 0);
+      dst0 = lp_build_ceil(&bld->base, src0);
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      src0 = emit_fetch(bld, inst, 0);
+      dst0 = lp_build_trunc(&bld->base, src0);
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   /*
+    * Opcodes which are not translated: cross-channel arithmetic, derivatives,
+    * kills, packing, control flow and geometry shader opcodes.
+    */
+   case TGSI_OPCODE_LIT:
+   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DST:
+   case TGSI_OPCODE_DP2A:
+   case TGSI_OPCODE_XPD:
+   case TGSI_OPCODE_DPH:
+   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDY:
+   case TGSI_OPCODE_KILP:
+   case TGSI_OPCODE_KIL:
+   case TGSI_OPCODE_PK2H:
+   case TGSI_OPCODE_PK2US:
+   case TGSI_OPCODE_PK4B:
+   case TGSI_OPCODE_PK4UB:
+   case TGSI_OPCODE_RFL:
+   case TGSI_OPCODE_CAL:
+   case TGSI_OPCODE_RET:
+   case TGSI_OPCODE_SCS:
+   case TGSI_OPCODE_NRM:
+   case TGSI_OPCODE_NRM4:
+   case TGSI_OPCODE_DP2:
+   case TGSI_OPCODE_BRK:
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_BGNLOOP:
+   case TGSI_OPCODE_BGNSUB:
+   case TGSI_OPCODE_ELSE:
+   case TGSI_OPCODE_ENDIF:
+   case TGSI_OPCODE_ENDLOOP:
+   case TGSI_OPCODE_ENDSUB:
+   case TGSI_OPCODE_CONT:
+   case TGSI_OPCODE_EMIT:
+   case TGSI_OPCODE_ENDPRIM:
+      return FALSE;
+
+   /*
+    * Opcodes which are (possibly) deprecated and not expected here: trap in
+    * debug builds, fail the translation otherwise.
+    */
+   case TGSI_OPCODE_RCC:
+   case TGSI_OPCODE_UP2H:
+   case TGSI_OPCODE_UP2US:
+   case TGSI_OPCODE_UP4B:
+   case TGSI_OPCODE_UP4UB:
+   case TGSI_OPCODE_X2D:
+   case TGSI_OPCODE_ARA:
+   case TGSI_OPCODE_BRA:
+   case TGSI_OPCODE_DIV:
+   case TGSI_OPCODE_PUSHA:
+   case TGSI_OPCODE_POPA:
+   case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_NOT:
+   case TGSI_OPCODE_SHL:
+   case TGSI_OPCODE_ISHR:
+   case TGSI_OPCODE_AND:
+   case TGSI_OPCODE_OR:
+   case TGSI_OPCODE_MOD:
+   case TGSI_OPCODE_XOR:
+   case TGSI_OPCODE_SAD:
+   case TGSI_OPCODE_TXF:
+   case TGSI_OPCODE_TXQ:
+      assert(0);
+      return FALSE;
+
+   default:
+      return FALSE;
+   }
+
+   if (info->num_dst) {
+      emit_store(bld, inst, 0, dst0);
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Translate a TGSI shader into LLVM IR operating on AoS vectors.
+ *
+ * The token stream is parsed once: declarations allocate their variables,
+ * immediates are turned into constant vectors and instructions are saved.
+ * The saved instructions are then translated in order until TGSI_OPCODE_END
+ * or the last saved instruction is reached.  Instructions which cannot be
+ * translated only produce a debug warning.
+ *
+ * \param builder     LLVM builder the code is emitted with
+ * \param tokens      TGSI token stream to translate
+ * \param type        vector type used for every register
+ * \param swizzles    AoS channel ordering (red, green, blue, alpha indices)
+ * \param consts_ptr  pointer to the constants
+ * \param inputs      input register values
+ * \param outputs     receives a variable for each declared output register
+ * \param sampler     texture sampling code generator, may be NULL
+ * \param info        shader scan info (indirect_files is read)
+ */
+void
+lp_build_tgsi_aos(LLVMBuilderRef builder,
+                  const struct tgsi_token *tokens,
+                  struct lp_type type,
+                  const unsigned char swizzles[4],
+                  LLVMValueRef consts_ptr,
+                  const LLVMValueRef *inputs,
+                  LLVMValueRef *outputs,
+                  struct lp_build_sampler_aos *sampler,
+                  const struct tgsi_shader_info *info)
+{
+   struct lp_build_tgsi_aos_context bld;
+   struct tgsi_parse_context parse;
+   uint num_immediates = 0;
+   uint num_instructions = 0;
+   unsigned chan;
+   int pc = 0;
+
+   /* Setup build context */
+   memset(&bld, 0, sizeof bld);
+   lp_build_context_init(&bld.base, builder, type);
+   lp_build_context_init(&bld.int_bld, builder, lp_int_type(type));
+
+   for (chan = 0; chan < 4; ++chan) {
+      bld.swizzles[chan] = swizzles[chan];
+      bld.inv_swizzles[swizzles[chan]] = chan;
+   }
+
+   bld.inputs = inputs;
+   bld.outputs = outputs;
+   bld.consts_ptr = consts_ptr;
+   bld.sampler = sampler;
+   bld.indirect_files = info->indirect_files;
+   bld.instructions = (struct tgsi_full_instruction *)
+                      MALLOC(LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction));
+   bld.max_instructions = LP_MAX_INSTRUCTIONS;
+
+   if (!bld.instructions) {
+      return;
+   }
+
+   tgsi_parse_init(&parse, tokens);
+
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch(parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         /* Inputs already interpolated */
+         emit_declaration(&bld, &parse.FullToken.FullDeclaration);
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         {
+            /* save expanded instruction, growing the array when it is full */
+            if (num_instructions == bld.max_instructions) {
+               struct tgsi_full_instruction *instructions;
+               instructions = REALLOC(bld.instructions,
+                                      bld.max_instructions
+                                      * sizeof(struct tgsi_full_instruction),
+                                      (bld.max_instructions + LP_MAX_INSTRUCTIONS)
+                                      * sizeof(struct tgsi_full_instruction));
+               if (!instructions) {
+                  /* out of memory: this instruction is dropped */
+                  break;
+               }
+               bld.instructions = instructions;
+               bld.max_instructions += LP_MAX_INSTRUCTIONS;
+            }
+
+            memcpy(bld.instructions + num_instructions,
+                   &parse.FullToken.FullInstruction,
+                   sizeof(bld.instructions[0]));
+
+            num_instructions++;
+         }
+
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* simply copy the immediate values into the next immediates[] slot */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
+            float imm[4];
+            assert(size <= 4);
+            assert(num_immediates < LP_MAX_TGSI_IMMEDIATES);
+            for (chan = 0; chan < 4; ++chan) {
+               imm[chan] = 0.0f;
+            }
+            /* place each component where the AoS channel ordering wants it */
+            for (chan = 0; chan < size; ++chan) {
+               unsigned swizzle = bld.swizzles[chan];
+               imm[swizzle] = parse.FullToken.FullImmediate.u[chan].Float;
+            }
+            bld.immediates[num_immediates] =
+                     lp_build_const_aos(type,
+                                        imm[0], imm[1], imm[2], imm[3],
+                                        NULL);
+            num_immediates++;
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   /*
+    * Translate the saved instructions.  Besides stopping at END (pc == -1),
+    * never run past the instructions that were actually saved: the shader may
+    * lack an END, or its END may have been dropped above.
+    */
+   while (pc != -1 && (uint) pc < num_instructions) {
+      struct tgsi_full_instruction *instr = bld.instructions + pc;
+      const struct tgsi_opcode_info *opcode_info =
+         tgsi_get_opcode_info(instr->Instruction.Opcode);
+      if (!emit_instruction(&bld, instr, opcode_info, &pc))
+         _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
+                       opcode_info->mnemonic);
+   }
+
+   /* debugging aid: dump the TGSI shader next to the generated function */
+   if (0) {
+      LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+      LLVMValueRef function = LLVMGetBasicBlockParent(block);
+      debug_printf("11111111111111111111111111111 \n");
+      tgsi_dump(tokens, 0);
+      lp_debug_dump_value(function);
+      debug_printf("2222222222222222222222222222 \n");
+   }
+   tgsi_parse_free(&parse);
+
+   /* debugging aid: dump the whole module */
+   if (0) {
+      LLVMModuleRef module = LLVMGetGlobalParent(
+         LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld.base.builder)));
+      LLVMDumpModule(module);
+   }
+
+   FREE(bld.instructions);
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index d3c769e28b8..441aebae298 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -45,22 +45,21 @@
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
 #include "tgsi/tgsi_scan.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_arit.h"
+#include "lp_bld_bitarit.h"
+#include "lp_bld_gather.h"
 #include "lp_bld_logic.h"
 #include "lp_bld_swizzle.h"
 #include "lp_bld_flow.h"
+#include "lp_bld_quad.h"
 #include "lp_bld_tgsi.h"
+#include "lp_bld_limits.h"
 #include "lp_bld_debug.h"
 
 
-#define LP_MAX_TEMPS 256
-#define LP_MAX_IMMEDIATES 256
-
-
 #define FOR_EACH_CHANNEL( CHAN )\
    for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
 
@@ -78,13 +77,10 @@
 #define CHAN_Y 1
 #define CHAN_Z 2
 #define CHAN_W 3
+#define NUM_CHANNELS 4
 
-#define QUAD_TOP_LEFT     0
-#define QUAD_TOP_RIGHT    1
-#define QUAD_BOTTOM_LEFT  2
-#define QUAD_BOTTOM_RIGHT 3
+#define LP_MAX_INSTRUCTIONS 256
 
-#define LP_TGSI_MAX_NESTING 16
 
 struct lp_exec_mask {
    struct lp_build_context *bld;
@@ -93,22 +89,28 @@ struct lp_exec_mask {
 
    LLVMTypeRef int_vec_type;
 
-   LLVMValueRef cond_stack[LP_TGSI_MAX_NESTING];
+   LLVMValueRef cond_stack[LP_MAX_TGSI_NESTING];
    int cond_stack_size;
    LLVMValueRef cond_mask;
 
-   LLVMValueRef break_stack[LP_TGSI_MAX_NESTING];
-   int break_stack_size;
-   LLVMValueRef break_mask;
-
-   LLVMValueRef cont_stack[LP_TGSI_MAX_NESTING];
-   int cont_stack_size;
+   LLVMBasicBlockRef loop_block;
    LLVMValueRef cont_mask;
-
-   LLVMBasicBlockRef loop_stack[LP_TGSI_MAX_NESTING];
+   LLVMValueRef break_mask;
+   LLVMValueRef break_var;
+   struct {
+      LLVMBasicBlockRef loop_block;
+      LLVMValueRef cont_mask;
+      LLVMValueRef break_mask;
+      LLVMValueRef break_var;
+   } loop_stack[LP_MAX_TGSI_NESTING];
    int loop_stack_size;
-   LLVMBasicBlockRef loop_block;
 
+   LLVMValueRef ret_mask;
+   struct {
+      int pc;
+      LLVMValueRef ret_mask;
+   } call_stack[LP_MAX_TGSI_NESTING];
+   int call_stack_size;
 
    LLVMValueRef exec_mask;
 };
@@ -117,48 +119,36 @@ struct lp_build_tgsi_soa_context
 {
    struct lp_build_context base;
 
+   /* Builder for integer masks and indices */
+   struct lp_build_context uint_bld;
+
    LLVMValueRef consts_ptr;
    const LLVMValueRef *pos;
    const LLVMValueRef (*inputs)[NUM_CHANNELS];
    LLVMValueRef (*outputs)[NUM_CHANNELS];
 
-   struct lp_build_sampler_soa *sampler;
+   const struct lp_build_sampler_soa *sampler;
 
-   LLVMValueRef immediates[LP_MAX_IMMEDIATES][NUM_CHANNELS];
-   LLVMValueRef temps[LP_MAX_TEMPS][NUM_CHANNELS];
-   LLVMValueRef addr[LP_MAX_TEMPS][NUM_CHANNELS];
+   LLVMValueRef immediates[LP_MAX_TGSI_IMMEDIATES][NUM_CHANNELS];
+   LLVMValueRef temps[LP_MAX_TGSI_TEMPS][NUM_CHANNELS];
+   LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS];
+   LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS];
 
-   /* we allocate an array of temps if we have indirect
-    * addressing and then the temps above is unused */
+   /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is
+    * set in the indirect_files field.
+    * The temps[] array above is unused then.
+    */
    LLVMValueRef temps_array;
-   boolean has_indirect_addressing;
+
+   const struct tgsi_shader_info *info;
+   /** bitmask indicating which register files are accessed indirectly */
+   unsigned indirect_files;
 
    struct lp_build_mask_context *mask;
    struct lp_exec_mask exec_mask;
-};
-
-static const unsigned char
-swizzle_left[4] = {
-   QUAD_TOP_LEFT,     QUAD_TOP_LEFT,
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_LEFT
-};
-
-static const unsigned char
-swizzle_right[4] = {
-   QUAD_TOP_RIGHT,    QUAD_TOP_RIGHT,
-   QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT
-};
 
-static const unsigned char
-swizzle_top[4] = {
-   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT,
-   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT
-};
-
-static const unsigned char
-swizzle_bottom[4] = {
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT,
-   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT
+   struct tgsi_full_instruction *instructions;
+   uint max_instructions;
 };
 
 static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
@@ -167,10 +157,11 @@ static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context
    mask->has_mask = FALSE;
    mask->cond_stack_size = 0;
    mask->loop_stack_size = 0;
-   mask->break_stack_size = 0;
-   mask->cont_stack_size = 0;
+   mask->call_stack_size = 0;
 
    mask->int_vec_type = lp_build_int_vec_type(mask->bld->type);
+   mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask = mask->cond_mask =
+         LLVMConstAllOnes(mask->int_vec_type);
 }
 
 static void lp_exec_mask_update(struct lp_exec_mask *mask)
@@ -190,33 +181,47 @@ static void lp_exec_mask_update(struct lp_exec_mask *mask)
    } else
       mask->exec_mask = mask->cond_mask;
 
+   if (mask->call_stack_size) {
+      mask->exec_mask = LLVMBuildAnd(mask->bld->builder,
+                                     mask->exec_mask,
+                                     mask->ret_mask,
+                                     "callmask");
+   }
 
    mask->has_mask = (mask->cond_stack_size > 0 ||
-                     mask->loop_stack_size > 0);
+                     mask->loop_stack_size > 0 ||
+                     mask->call_stack_size > 0);
 }
 
 static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
                                    LLVMValueRef val)
 {
+   assert(mask->cond_stack_size < LP_MAX_TGSI_NESTING);
+   if (mask->cond_stack_size == 0) {
+      assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type));
+   }
    mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask;
-   mask->cond_mask = LLVMBuildBitCast(mask->bld->builder, val,
-                                      mask->int_vec_type, "");
-
+   assert(LLVMTypeOf(val) == mask->int_vec_type);
+   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
+                                  mask->cond_mask,
+                                  val,
+                                  "");
    lp_exec_mask_update(mask);
 }
 
 static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
 {
-   LLVMValueRef prev_mask = mask->cond_stack[mask->cond_stack_size - 1];
-   LLVMValueRef inv_mask = LLVMBuildNot(mask->bld->builder,
-                                        mask->cond_mask, "");
-
-   /* means that we didn't have any mask before and that
-    * we were fully enabled */
-   if (mask->cond_stack_size <= 1) {
-      prev_mask = LLVMConstAllOnes(mask->int_vec_type);
+   LLVMValueRef prev_mask;
+   LLVMValueRef inv_mask;
+
+   assert(mask->cond_stack_size);
+   prev_mask = mask->cond_stack[mask->cond_stack_size - 1];
+   if (mask->cond_stack_size == 1) {
+      assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type));
    }
 
+   inv_mask = LLVMBuildNot(mask->bld->builder, mask->cond_mask, "");
+
    mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
                                   inv_mask,
                                   prev_mask, "");
@@ -225,27 +230,37 @@ static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
 
 static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
 {
+   assert(mask->cond_stack_size);
    mask->cond_mask = mask->cond_stack[--mask->cond_stack_size];
    lp_exec_mask_update(mask);
 }
 
 static void lp_exec_bgnloop(struct lp_exec_mask *mask)
 {
+   if (mask->loop_stack_size == 0) {
+      assert(mask->loop_block == NULL);
+      assert(mask->cont_mask == LLVMConstAllOnes(mask->int_vec_type));
+      assert(mask->break_mask == LLVMConstAllOnes(mask->int_vec_type));
+      assert(mask->break_var == NULL);
+   }
 
-   if (mask->cont_stack_size == 0)
-      mask->cont_mask = LLVMConstAllOnes(mask->int_vec_type);
-   if (mask->break_stack_size == 0)
-      mask->break_mask = LLVMConstAllOnes(mask->int_vec_type);
-   if (mask->cond_stack_size == 0)
-      mask->cond_mask = LLVMConstAllOnes(mask->int_vec_type);
+   assert(mask->loop_stack_size < LP_MAX_TGSI_NESTING);
+
+   mask->loop_stack[mask->loop_stack_size].loop_block = mask->loop_block;
+   mask->loop_stack[mask->loop_stack_size].cont_mask = mask->cont_mask;
+   mask->loop_stack[mask->loop_stack_size].break_mask = mask->break_mask;
+   mask->loop_stack[mask->loop_stack_size].break_var = mask->break_var;
+   ++mask->loop_stack_size;
+
+   mask->break_var = lp_build_alloca(mask->bld->builder, mask->int_vec_type, "");
+   LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var);
 
-   mask->break_stack[mask->break_stack_size++] = mask->break_mask;
-   mask->cont_stack[mask->cont_stack_size++] = mask->cont_mask;
-   mask->loop_stack[mask->loop_stack_size++] = mask->loop_block;
    mask->loop_block = lp_build_insert_new_block(mask->bld->builder, "bgnloop");
    LLVMBuildBr(mask->bld->builder, mask->loop_block);
    LLVMPositionBuilderAtEnd(mask->bld->builder, mask->loop_block);
 
+   mask->break_mask = LLVMBuildLoad(mask->bld->builder, mask->break_var, "");
+
    lp_exec_mask_update(mask);
 }
 
@@ -255,16 +270,9 @@ static void lp_exec_break(struct lp_exec_mask *mask)
                                          mask->exec_mask,
                                          "break");
 
-   /* mask->break_stack_size > 1 implies that we encountered a break
-    * statemant already and if that's the case we want to make sure
-    * our mask is a combination of the previous break and the current
-    * execution mask */
-   if (mask->break_stack_size > 1) {
-      mask->break_mask = LLVMBuildAnd(mask->bld->builder,
-                                      mask->break_mask,
-                                      exec_mask, "break_full");
-   } else
-      mask->break_mask = exec_mask;
+   mask->break_mask = LLVMBuildAnd(mask->bld->builder,
+                                   mask->break_mask,
+                                   exec_mask, "break_full");
 
    lp_exec_mask_update(mask);
 }
@@ -275,12 +283,9 @@ static void lp_exec_continue(struct lp_exec_mask *mask)
                                          mask->exec_mask,
                                          "");
 
-   if (mask->cont_stack_size > 1) {
-      mask->cont_mask = LLVMBuildAnd(mask->bld->builder,
-                                     mask->cont_mask,
-                                     exec_mask, "");
-   } else
-      mask->cont_mask = exec_mask;
+   mask->cont_mask = LLVMBuildAnd(mask->bld->builder,
+                                  mask->cont_mask,
+                                  exec_mask, "");
 
    lp_exec_mask_update(mask);
 }
@@ -295,11 +300,24 @@ static void lp_exec_endloop(struct lp_exec_mask *mask)
 
    assert(mask->break_mask);
 
+   /*
+    * Restore the cont_mask, but don't pop
+    */
+   assert(mask->loop_stack_size);
+   mask->cont_mask = mask->loop_stack[mask->loop_stack_size - 1].cont_mask;
+   lp_exec_mask_update(mask);
+
+   /*
+    * Unlike the continue mask, the break_mask must be preserved across loop
+    * iterations
+    */
+   LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var);
+
    /* i1cond = (mask == 0) */
    i1cond = LLVMBuildICmp(
       mask->bld->builder,
       LLVMIntNE,
-      LLVMBuildBitCast(mask->bld->builder, mask->break_mask, reg_type, ""),
+      LLVMBuildBitCast(mask->bld->builder, mask->exec_mask, reg_type, ""),
       LLVMConstNull(reg_type), "");
 
    endloop = lp_build_insert_new_block(mask->bld->builder, "endloop");
@@ -309,15 +327,12 @@ static void lp_exec_endloop(struct lp_exec_mask *mask)
 
    LLVMPositionBuilderAtEnd(mask->bld->builder, endloop);
 
-   mask->loop_block = mask->loop_stack[--mask->loop_stack_size];
-   /* pop the cont mask */
-   if (mask->cont_stack_size) {
-      mask->cont_mask = mask->cont_stack[--mask->cont_stack_size];
-   }
-   /* pop the break mask */
-   if (mask->break_stack_size) {
-      mask->break_mask = mask->break_stack[--mask->break_stack_size];
-   }
+   assert(mask->loop_stack_size);
+   --mask->loop_stack_size;
+   mask->loop_block = mask->loop_stack[mask->loop_stack_size].loop_block;
+   mask->cont_mask = mask->loop_stack[mask->loop_stack_size].cont_mask;
+   mask->break_mask = mask->loop_stack[mask->loop_stack_size].break_mask;
+   mask->break_var = mask->loop_stack[mask->loop_stack_size].break_var;
 
    lp_exec_mask_update(mask);
 }
@@ -328,15 +343,25 @@ static void lp_exec_endloop(struct lp_exec_mask *mask)
  * (0 means don't store this bit, 1 means do store).
  */
 static void lp_exec_mask_store(struct lp_exec_mask *mask,
+                               LLVMValueRef pred,
                                LLVMValueRef val,
                                LLVMValueRef dst)
 {
+   /* Mix the predicate and execution mask */
    if (mask->has_mask) {
+      if (pred) {
+         pred = LLVMBuildAnd(mask->bld->builder, pred, mask->exec_mask, "");
+      } else {
+         pred = mask->exec_mask;
+      }
+   }
+
+   if (pred) {
       LLVMValueRef real_val, dst_val;
 
       dst_val = LLVMBuildLoad(mask->bld->builder, dst, "");
       real_val = lp_build_select(mask->bld,
-                                 mask->exec_mask,
+                                 pred,
                                  val, dst_val);
 
       LLVMBuildStore(mask->bld->builder, real_val, dst);
@@ -344,44 +369,149 @@ static void lp_exec_mask_store(struct lp_exec_mask *mask,
       LLVMBuildStore(mask->bld->builder, val, dst);
 }
 
+static void lp_exec_mask_call(struct lp_exec_mask *mask,
+                              int func,   /* value written to *pc on return */
+                              int *pc)    /* in: pc saved on call_stack; out: func */
+{  /* Push the current pc and ret_mask on call_stack, then redirect pc. */
+   assert(mask->call_stack_size < LP_MAX_TGSI_NESTING);
+   mask->call_stack[mask->call_stack_size].pc = *pc;
+   mask->call_stack[mask->call_stack_size].ret_mask = mask->ret_mask;
+   mask->call_stack_size++;
+   *pc = func;   /* lp_exec_mask_endsub() pops the saved pc/ret_mask again */
+}
 
-static LLVMValueRef
-emit_ddx(struct lp_build_tgsi_soa_context *bld,
-         LLVMValueRef src)
+static void lp_exec_mask_ret(struct lp_exec_mask *mask, int *pc)
 {
-   LLVMValueRef src_left  = lp_build_swizzle1_aos(&bld->base, src, swizzle_left);
-   LLVMValueRef src_right = lp_build_swizzle1_aos(&bld->base, src, swizzle_right);
-   return lp_build_sub(&bld->base, src_right, src_left);
+   LLVMValueRef exec_mask;
+
+   if (mask->call_stack_size == 0) {
+      /* returning from main() */
+      *pc = -1;
+      return;
+   }
+   exec_mask = LLVMBuildNot(mask->bld->builder,
+                            mask->exec_mask,
+                            "ret");
+
+   mask->ret_mask = LLVMBuildAnd(mask->bld->builder,
+                                 mask->ret_mask,
+                                 exec_mask, "ret_full");
+
+   lp_exec_mask_update(mask);
 }
 
+static void lp_exec_mask_bgnsub(struct lp_exec_mask *mask)
+{
+}
 
-static LLVMValueRef
-emit_ddy(struct lp_build_tgsi_soa_context *bld,
-         LLVMValueRef src)
+static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc)
 {
-   LLVMValueRef src_top    = lp_build_swizzle1_aos(&bld->base, src, swizzle_top);
-   LLVMValueRef src_bottom = lp_build_swizzle1_aos(&bld->base, src, swizzle_bottom);
-   return lp_build_sub(&bld->base, src_top, src_bottom);
+   assert(mask->call_stack_size);
+   mask->call_stack_size--;
+   *pc = mask->call_stack[mask->call_stack_size].pc;
+   mask->ret_mask = mask->call_stack[mask->call_stack_size].ret_mask;
+   lp_exec_mask_update(mask);
 }
 
+
+/**
+ * Return pointer to a temporary register channel (src or dest).
+ * Note that indirect addressing cannot be handled here.
+ * \param index  which temporary register
+ * \param chan  which channel of the temp register.
+ */
 static LLVMValueRef
 get_temp_ptr(struct lp_build_tgsi_soa_context *bld,
              unsigned index,
-             unsigned swizzle,
-             boolean is_indirect,
-             LLVMValueRef addr)
+             unsigned chan)
 {
-   if (!bld->has_indirect_addressing) {
-      return bld->temps[index][swizzle];
-   } else {
-      LLVMValueRef lindex =
-         LLVMConstInt(LLVMInt32Type(), index*4 + swizzle, 0);
-      if (is_indirect)
-         lindex = lp_build_add(&bld->base, lindex, addr);
+   assert(chan < 4);
+   if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
+      LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan);
       return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, "");
    }
+   else {
+      return bld->temps[index][chan];
+   }
+}
+
+
+/**
+ * Gather a vector: element i of the result is loaded from
+ * base_ptr[indexes[i]], for i in [0, bld->base.type.length).
+ * XXX lp_build_gather() should be capable of doing this with a little work.
+ */
+static LLVMValueRef
+build_gather(struct lp_build_tgsi_soa_context *bld,
+             LLVMValueRef base_ptr,
+             LLVMValueRef indexes)
+{
+   LLVMValueRef res = bld->base.undef;
+   unsigned i;
+
+   /*
+    * Loop over elements of 'indexes', load scalar value, insert it into 'res'.
+    */
+   for (i = 0; i < bld->base.type.length; i++) {
+      LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder,
+                                                   indexes, ii, "");
+      LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr,
+                                             &index, 1, "");
+      LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
+
+      res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, "");
+   }
+
+   return res;
 }
 
+
+/**
+ * Read the ADDR register channel selected by indirect_reg->SwizzleX,
+ * convert the floats to ints, add reg_index and clamp the sum to
+ * info->file_max[reg_file].  Returns a per-element vector of register
+ * indices; no scaling by four is applied here (emit_fetch() does that).
+ */
+static LLVMValueRef
+get_indirect_index(struct lp_build_tgsi_soa_context *bld,
+                   unsigned reg_file, unsigned reg_index,
+                   const struct tgsi_src_register *indirect_reg)
+{
+   struct lp_build_context *uint_bld = &bld->uint_bld;
+   /* the address register channel is always taken from the X swizzle */
+   unsigned swizzle = indirect_reg->SwizzleX;
+   LLVMValueRef base;
+   LLVMValueRef rel;
+   LLVMValueRef max_index;
+   LLVMValueRef index;
+
+   assert(bld->indirect_files & (1 << reg_file));
+
+   base = lp_build_const_int_vec(uint_bld->type, reg_index);
+
+   assert(swizzle < 4);
+   rel = LLVMBuildLoad(bld->base.builder,
+                        bld->addr[indirect_reg->Index][swizzle],
+                        "load addr reg");
+
+   /* the address register holds floats; indexing needs integers */
+   rel = LLVMBuildFPToSI(bld->base.builder,
+                         rel,
+                         uint_bld->vec_type, "");
+
+   index = lp_build_add(uint_bld, base, rel);
+
+   max_index = lp_build_const_int_vec(uint_bld->type,
+                                      bld->info->file_max[reg_file]);
+
+   assert(!uint_bld->type.sign);   /* the min below relies on an unsigned type */
+   index = lp_build_min(uint_bld, index, max_index);
+
+   return index;
+}
+
+
 /**
  * Register fetch.
  */
@@ -389,81 +519,102 @@ static LLVMValueRef
 emit_fetch(
    struct lp_build_tgsi_soa_context *bld,
    const struct tgsi_full_instruction *inst,
-   unsigned index,
+   unsigned src_op,
    const unsigned chan_index )
 {
-   const struct tgsi_full_src_register *reg = &inst->Src[index];
-   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
+   struct lp_build_context *uint_bld = &bld->uint_bld;
+   const struct tgsi_full_src_register *reg = &inst->Src[src_op];
+   const unsigned swizzle =
+      tgsi_util_get_full_src_register_swizzle(reg, chan_index);
    LLVMValueRef res;
-   LLVMValueRef addr;
+   LLVMValueRef indirect_index = NULL;
+
+   if (swizzle > 3) {
+      assert(0 && "invalid swizzle in emit_fetch()");
+      return bld->base.undef;
+   }
 
-   switch (swizzle) {
-   case TGSI_SWIZZLE_X:
-   case TGSI_SWIZZLE_Y:
-   case TGSI_SWIZZLE_Z:
-   case TGSI_SWIZZLE_W:
+   if (reg->Register.Indirect) {
+      indirect_index = get_indirect_index(bld,
+                                          reg->Register.File,
+                                          reg->Register.Index,
+                                          &reg->Indirect);
+   } else {
+      assert(reg->Register.Index <= bld->info->file_max[reg->Register.File]);
+   }
 
+   switch (reg->Register.File) {
+   case TGSI_FILE_CONSTANT:
       if (reg->Register.Indirect) {
-         LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
-         unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
-         addr = LLVMBuildLoad(bld->base.builder,
-                              bld->addr[reg->Indirect.Index][swizzle],
-                              "");
-         /* for indexing we want integers */
-         addr = LLVMBuildFPToSI(bld->base.builder, addr,
-                                int_vec_type, "");
-         addr = LLVMBuildExtractElement(bld->base.builder,
-                                        addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
-                                        "");
-         addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
-      }
-
-      switch (reg->Register.File) {
-      case TGSI_FILE_CONSTANT: {
-         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), reg->Register.Index*4 + swizzle, 0);
+         LLVMValueRef swizzle_vec =
+            lp_build_const_int_vec(uint_bld->type, swizzle);
+         LLVMValueRef index_vec;  /* index into the const buffer */
+
+         /* index_vec = indirect_index * 4 + swizzle */
+         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
+
+         /* Gather values from the constant buffer */
+         res = build_gather(bld, bld->consts_ptr, index_vec);
+      }
+      else {
+         LLVMValueRef index;  /* index into the const buffer */
          LLVMValueRef scalar, scalar_ptr;
 
-         if (reg->Register.Indirect) {
-            /*lp_build_printf(bld->base.builder,
-              "\taddr = %d\n", addr);*/
-            index = lp_build_add(&bld->base, index, addr);
-         }
-         scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, &index, 1, "");
+         index = lp_build_const_int32(reg->Register.Index*4 + swizzle);
+
+         scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr,
+                                   &index, 1, "");
          scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
 
          res = lp_build_broadcast_scalar(&bld->base, scalar);
-         break;
       }
+      break;
 
-      case TGSI_FILE_IMMEDIATE:
-         res = bld->immediates[reg->Register.Index][swizzle];
-         assert(res);
-         break;
+   case TGSI_FILE_IMMEDIATE:
+      res = bld->immediates[reg->Register.Index][swizzle];
+      assert(res);
+      break;
 
-      case TGSI_FILE_INPUT:
-         res = bld->inputs[reg->Register.Index][swizzle];
-         assert(res);
-         break;
+   case TGSI_FILE_INPUT:
+      res = bld->inputs[reg->Register.Index][swizzle];
+      assert(res);
+      break;
 
-      case TGSI_FILE_TEMPORARY: {
-         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
-                                              swizzle,
-                                              reg->Register.Indirect,
-                                              addr);
+   case TGSI_FILE_TEMPORARY:
+      if (reg->Register.Indirect) {
+         LLVMValueRef swizzle_vec =
+            lp_build_const_int_vec(uint_bld->type, swizzle);
+         LLVMValueRef length_vec =
+            lp_build_const_int_vec(uint_bld->type, bld->base.type.length);
+         LLVMValueRef index_vec;  /* index into the const buffer */
+         LLVMValueRef temps_array;
+         LLVMTypeRef float4_ptr_type;
+
+         /* index_vec = (indirect_index * 4 + swizzle) * length */
+         index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2);
+         index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec);
+         index_vec = lp_build_mul(uint_bld, index_vec, length_vec);
+
+         /* cast temps_array pointer to float* */
+         float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0);
+         temps_array = LLVMBuildBitCast(uint_bld->builder, bld->temps_array,
+                                        float4_ptr_type, "");
+
+         /* Gather values from the temporary register array */
+         res = build_gather(bld, temps_array, index_vec);
+      }
+      else {
+         LLVMValueRef temp_ptr;
+         temp_ptr = get_temp_ptr(bld, reg->Register.Index, swizzle);
          res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
-         if(!res)
+         if (!res)
             return bld->base.undef;
-         break;
-      }
-
-      default:
-         assert( 0 );
-         return bld->base.undef;
       }
       break;
 
    default:
-      assert( 0 );
+      assert(0 && "invalid src register in emit_fetch()");
       return bld->base.undef;
    }
 
@@ -473,13 +624,10 @@ emit_fetch(
       break;
 
    case TGSI_UTIL_SIGN_SET:
-      /* TODO: Use bitwese OR for floating point */
       res = lp_build_abs( &bld->base, res );
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
-      break;
-
+      /* fall through */
    case TGSI_UTIL_SIGN_TOGGLE:
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
+      res = lp_build_negate( &bld->base, res );
       break;
 
    case TGSI_UTIL_SIGN_KEEP:
@@ -513,10 +661,77 @@ emit_fetch_deriv(
    /* TODO: use interpolation coeffs for inputs */
 
    if(ddx)
-      *ddx = emit_ddx(bld, src);
+      *ddx = lp_build_ddx(&bld->base, src);
 
    if(ddy)
-      *ddy = emit_ddy(bld, src);
+      *ddy = lp_build_ddy(&bld->base, src);
+}
+
+
+/**
+ * Fill pred[] with the instruction's per-channel predicate masks (NULL each if unpredicated).
+ */
+static void
+emit_fetch_predicate(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   LLVMValueRef *pred)
+{
+   unsigned index;
+   unsigned char swizzles[4];
+   LLVMValueRef unswizzled[4] = {NULL, NULL, NULL, NULL};
+   LLVMValueRef value;
+   unsigned chan;
+
+   if (!inst->Instruction.Predicate) {
+      FOR_EACH_CHANNEL( chan ) {
+         pred[chan] = NULL;
+      }
+      return;
+   }
+
+   swizzles[0] = inst->Predicate.SwizzleX;
+   swizzles[1] = inst->Predicate.SwizzleY;
+   swizzles[2] = inst->Predicate.SwizzleZ;
+   swizzles[3] = inst->Predicate.SwizzleW;
+
+   index = inst->Predicate.Index;
+   assert(index < LP_MAX_TGSI_PREDS);
+
+   FOR_EACH_CHANNEL( chan ) {
+      unsigned swizzle = swizzles[chan];
+
+      /*
+       * Only fetch the predicate register channels that are actually listed
+       * in the swizzles; unswizzled[] caches each channel once it is built.
+       */
+      if (!unswizzled[swizzle]) {
+         value = LLVMBuildLoad(bld->base.builder,
+                               bld->preds[index][swizzle], "");
+
+         /*
+          * Convert the value to an integer mask.
+          *
+          * TODO: Short-circuit this comparison -- a D3D setp_xx instruction
+          * needlessly causes two comparisons because the intermediate result
+          * is stored as a float vector instead of an integer mask vector.
+          */
+         value = lp_build_compare(bld->base.builder,
+                                  bld->base.type,
+                                  PIPE_FUNC_NOTEQUAL,
+                                  value,
+                                  bld->base.zero);
+         if (inst->Predicate.Negate) {
+            value = LLVMBuildNot(bld->base.builder, value, "");
+         }
+
+         unswizzled[swizzle] = value;
+      } else {
+         value = unswizzled[swizzle];
+      }
+
+      pred[chan] = value;
+   }
 }
 
 
@@ -529,10 +744,11 @@ emit_store(
    const struct tgsi_full_instruction *inst,
    unsigned index,
    unsigned chan_index,
+   LLVMValueRef pred,
    LLVMValueRef value)
 {
    const struct tgsi_full_dst_register *reg = &inst->Dst[index];
-   LLVMValueRef addr;
+   LLVMValueRef indirect_index = NULL;
 
    switch( inst->Instruction.Saturate ) {
    case TGSI_SAT_NONE:
@@ -553,43 +769,41 @@ emit_store(
    }
 
    if (reg->Register.Indirect) {
-      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type);
-      unsigned swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, chan_index );
-      addr = LLVMBuildLoad(bld->base.builder,
-                           bld->addr[reg->Indirect.Index][swizzle],
-                           "");
-      /* for indexing we want integers */
-      addr = LLVMBuildFPToSI(bld->base.builder, addr,
-                             int_vec_type, "");
-      addr = LLVMBuildExtractElement(bld->base.builder,
-                                     addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
-                                     "");
-      addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
+      indirect_index = get_indirect_index(bld,
+                                          reg->Register.File,
+                                          reg->Register.Index,
+                                          &reg->Indirect);
+   } else {
+      assert(reg->Register.Index <= bld->info->file_max[reg->Register.File]);
    }
 
    switch( reg->Register.File ) {
    case TGSI_FILE_OUTPUT:
-      lp_exec_mask_store(&bld->exec_mask, value,
+      lp_exec_mask_store(&bld->exec_mask, pred, value,
                          bld->outputs[reg->Register.Index][chan_index]);
       break;
 
-   case TGSI_FILE_TEMPORARY: {
-      LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
-                                           chan_index,
-                                           reg->Register.Indirect,
-                                           addr);
-      lp_exec_mask_store(&bld->exec_mask, value, temp_ptr);
+   case TGSI_FILE_TEMPORARY:
+      if (reg->Register.Indirect) {
+         /* XXX not done yet */
+         debug_printf("WARNING: LLVM scatter store of temp regs"
+                      " not implemented\n");
+      }
+      else {
+         LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index,
+                                              chan_index);
+         lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr);
+      }
       break;
-   }
 
    case TGSI_FILE_ADDRESS:
-      lp_exec_mask_store(&bld->exec_mask, value,
+      lp_exec_mask_store(&bld->exec_mask, pred, value,
                          bld->addr[reg->Indirect.Index][chan_index]);
       break;
 
    case TGSI_FILE_PREDICATE:
-      /* FIXME */
-      assert(0);
+      lp_exec_mask_store(&bld->exec_mask, pred, value,
+                         bld->preds[reg->Register.Index][chan_index]);
       break;
 
    default:
@@ -602,21 +816,29 @@ emit_store(
  * High-level instruction translators.
  */
 
-
 static void
 emit_tex( struct lp_build_tgsi_soa_context *bld,
           const struct tgsi_full_instruction *inst,
-          boolean apply_lodbias,
-          boolean projected,
+          enum lp_build_tex_modifier modifier,
           LLVMValueRef *texel)
 {
-   const uint unit = inst->Src[1].Register.Index;
-   LLVMValueRef lodbias;
+   unsigned unit;
+   LLVMValueRef lod_bias, explicit_lod;
    LLVMValueRef oow = NULL;
    LLVMValueRef coords[3];
+   LLVMValueRef ddx[3];
+   LLVMValueRef ddy[3];
    unsigned num_coords;
    unsigned i;
 
+   if (!bld->sampler) {
+      _debug_printf("warning: found texture instruction but no sampler generator supplied\n");
+      for (i = 0; i < 4; i++) {
+         texel[i] = bld->base.undef;
+      }
+      return;
+   }
+
    switch (inst->Texture.Texture) {
    case TGSI_TEXTURE_1D:
       num_coords = 1;
@@ -637,29 +859,57 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
       return;
    }
 
-   if(apply_lodbias)
-      lodbias = emit_fetch( bld, inst, 0, 3 );
-   else
-      lodbias = bld->base.zero;
+   if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) {
+      lod_bias = emit_fetch( bld, inst, 0, 3 );
+      explicit_lod = NULL;
+   }
+   else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) {
+      lod_bias = NULL;
+      explicit_lod = emit_fetch( bld, inst, 0, 3 );
+   }
+   else {
+      lod_bias = NULL;
+      explicit_lod = NULL;
+   }
 
-   if (projected) {
+   if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) {
       oow = emit_fetch( bld, inst, 0, 3 );
       oow = lp_build_rcp(&bld->base, oow);
    }
 
    for (i = 0; i < num_coords; i++) {
       coords[i] = emit_fetch( bld, inst, 0, i );
-      if (projected)
+      if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED)
          coords[i] = lp_build_mul(&bld->base, coords[i], oow);
    }
    for (i = num_coords; i < 3; i++) {
       coords[i] = bld->base.undef;
    }
 
+   if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) {
+      for (i = 0; i < num_coords; i++) {
+         ddx[i] = emit_fetch( bld, inst, 1, i );
+         ddy[i] = emit_fetch( bld, inst, 2, i );
+      }
+      unit = inst->Src[3].Register.Index;
+   }  else {
+      for (i = 0; i < num_coords; i++) {
+         ddx[i] = lp_build_ddx( &bld->base, coords[i] );
+         ddy[i] = lp_build_ddy( &bld->base, coords[i] );
+      }
+      unit = inst->Src[1].Register.Index;
+   }
+   for (i = num_coords; i < 3; i++) {
+      ddx[i] = bld->base.undef;
+      ddy[i] = bld->base.undef;
+   }
+
    bld->sampler->emit_fetch_texel(bld->sampler,
                                   bld->base.builder,
                                   bld->base.type,
-                                  unit, num_coords, coords, lodbias,
+                                  unit, num_coords, coords,
+                                  ddx, ddy,
+                                  lod_bias, explicit_lod,
                                   texel);
 }
 
@@ -739,25 +989,27 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld,
    lp_build_mask_update(bld->mask, mask);
 }
 
-static int
+static void
 emit_declaration(
    struct lp_build_tgsi_soa_context *bld,
    const struct tgsi_full_declaration *decl)
 {
-   LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type);
+   LLVMTypeRef vec_type = bld->base.vec_type;
 
    unsigned first = decl->Range.First;
    unsigned last = decl->Range.Last;
    unsigned idx, i;
 
    for (idx = first; idx <= last; ++idx) {
+      assert(last <= bld->info->file_max[decl->Declaration.File]);
       switch (decl->Declaration.File) {
       case TGSI_FILE_TEMPORARY:
-         if (bld->has_indirect_addressing) {
-            LLVMValueRef val = LLVMConstInt(LLVMInt32Type(),
-                                            last*4 + 4, 0);
+         assert(idx < LP_MAX_TGSI_TEMPS);
+         if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
+            LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(),
+                                                   last*4 + 4, 0);
             bld->temps_array = lp_build_array_alloca(bld->base.builder,
-                                                     vec_type, val, "");
+                                                     vec_type, array_size, "");
          } else {
             for (i = 0; i < NUM_CHANNELS; i++)
                bld->temps[idx][i] = lp_build_alloca(bld->base.builder,
@@ -772,18 +1024,24 @@ emit_declaration(
          break;
 
       case TGSI_FILE_ADDRESS:
+         assert(idx < LP_MAX_TGSI_ADDRS);
          for (i = 0; i < NUM_CHANNELS; i++)
             bld->addr[idx][i] = lp_build_alloca(bld->base.builder,
                                                 vec_type, "");
          break;
 
+      case TGSI_FILE_PREDICATE:
+         assert(idx < LP_MAX_TGSI_PREDS);
+         for (i = 0; i < NUM_CHANNELS; i++)
+            bld->preds[idx][i] = lp_build_alloca(bld->base.builder,
+                                                 vec_type, "");
+         break;
+
       default:
          /* don't need to declare other vars */
          break;
       }
    }
-
-   return TRUE;
 }
 
 
@@ -795,7 +1053,8 @@ static boolean
 emit_instruction(
    struct lp_build_tgsi_soa_context *bld,
    const struct tgsi_full_instruction *inst,
-   const struct tgsi_opcode_info *info)
+   const struct tgsi_opcode_info *info,
+   int *pc)
 {
    unsigned chan_index;
    LLVMValueRef src0, src1, src2;
@@ -819,8 +1078,10 @@ emit_instruction(
     * redundant code.
     */
 
+   (*pc)++;
+
    assert(info->num_dst <= 1);
-   if(info->num_dst) {
+   if (info->num_dst) {
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
          dst0[chan_index] = bld->base.undef;
       }
@@ -1359,12 +1620,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TEX:
-      emit_tex( bld, inst, FALSE, FALSE, dst0 );
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_NONE, dst0 );
       break;
 
    case TGSI_OPCODE_TXD:
-      /* FIXME */
-      return FALSE;
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV, dst0 );
       break;
 
    case TGSI_OPCODE_UP2H:
@@ -1418,16 +1678,18 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_CAL:
-      /* FIXME */
-      return FALSE;
+      lp_exec_mask_call(&bld->exec_mask,
+                        inst->Label.Label,
+                        pc);
+
       break;
 
    case TGSI_OPCODE_RET:
-      /* FIXME */
-      return FALSE;
+      lp_exec_mask_ret(&bld->exec_mask, pc);
       break;
 
    case TGSI_OPCODE_END:
+      *pc = -1;
       break;
 
    case TGSI_OPCODE_SSG:
@@ -1466,7 +1728,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TXB:
-      emit_tex( bld, inst, TRUE, FALSE, dst0 );
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS, dst0 );
       break;
 
    case TGSI_OPCODE_NRM:
@@ -1571,11 +1833,11 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TXL:
-      emit_tex( bld, inst, TRUE, FALSE, dst0 );
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD, dst0 );
       break;
 
    case TGSI_OPCODE_TXP:
-      emit_tex( bld, inst, FALSE, TRUE, dst0 );
+      emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_PROJECTED, dst0 );
       break;
 
    case TGSI_OPCODE_BRK:
@@ -1593,6 +1855,10 @@ emit_instruction(
       lp_exec_bgnloop(&bld->exec_mask);
       break;
 
+   case TGSI_OPCODE_BGNSUB:
+      lp_exec_mask_bgnsub(&bld->exec_mask);
+      break;
+
    case TGSI_OPCODE_ELSE:
       lp_exec_mask_cond_invert(&bld->exec_mask);
       break;
@@ -1605,6 +1871,10 @@ emit_instruction(
       lp_exec_endloop(&bld->exec_mask);
       break;
 
+   case TGSI_OPCODE_ENDSUB:
+      lp_exec_mask_endsub(&bld->exec_mask, pc);
+      break;
+
    case TGSI_OPCODE_PUSHA:
       /* deprecated? */
       assert(0);
@@ -1717,8 +1987,12 @@ emit_instruction(
    }
    
    if(info->num_dst) {
+      LLVMValueRef pred[NUM_CHANNELS];
+
+      emit_fetch_predicate( bld, inst, pred );
+
       FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
-         emit_store( bld, inst, 0, chan_index, dst0[chan_index]);
+         emit_store( bld, inst, 0, chan_index, pred[chan_index], dst0[chan_index]);
       }
    }
 
@@ -1736,24 +2010,42 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
                   const LLVMValueRef (*inputs)[NUM_CHANNELS],
                   LLVMValueRef (*outputs)[NUM_CHANNELS],
                   struct lp_build_sampler_soa *sampler,
-                  struct tgsi_shader_info *info)
+                  const struct tgsi_shader_info *info)
 {
    struct lp_build_tgsi_soa_context bld;
    struct tgsi_parse_context parse;
    uint num_immediates = 0;
+   uint num_instructions = 0;
    unsigned i;
+   int pc = 0;
+
+   struct lp_type res_type;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = type.width;
+   res_type.length = type.length;
+   res_type.sign = 1;
 
    /* Setup build context */
    memset(&bld, 0, sizeof bld);
    lp_build_context_init(&bld.base, builder, type);
+   lp_build_context_init(&bld.uint_bld, builder, lp_uint_type(type));
    bld.mask = mask;
    bld.pos = pos;
    bld.inputs = inputs;
    bld.outputs = outputs;
    bld.consts_ptr = consts_ptr;
    bld.sampler = sampler;
-   bld.has_indirect_addressing = info->opcode_count[TGSI_OPCODE_ARR] > 0 ||
-                                 info->opcode_count[TGSI_OPCODE_ARL] > 0;
+   bld.info = info;
+   bld.indirect_files = info->indirect_files;
+   bld.instructions = (struct tgsi_full_instruction *)
+                      MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) );
+   bld.max_instructions = LP_MAX_INSTRUCTIONS;
+
+   if (!bld.instructions) {
+      return;
+   }
 
    lp_exec_mask_init(&bld.exec_mask, &bld.base);
 
@@ -1765,19 +2057,31 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
          /* Inputs already interpolated */
-         {
-            if (!emit_declaration( &bld, &parse.FullToken.FullDeclaration ))
-               _debug_printf("warning: failed to define LLVM variable\n");
-         }
+         emit_declaration( &bld, &parse.FullToken.FullDeclaration );
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
          {
-            unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
-            const struct tgsi_opcode_info *opcode_info = tgsi_get_opcode_info(opcode);
-            if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, opcode_info ))
-               _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
-                             opcode_info->mnemonic);
+            /* save expanded instruction */
+            if (num_instructions == bld.max_instructions) {
+               struct tgsi_full_instruction *instructions;
+               instructions = REALLOC(bld.instructions,
+                                      bld.max_instructions
+                                      * sizeof(struct tgsi_full_instruction),
+                                      (bld.max_instructions + LP_MAX_INSTRUCTIONS)
+                                      * sizeof(struct tgsi_full_instruction));
+               if (!instructions) {
+                  break;
+               }
+               bld.instructions = instructions;
+               bld.max_instructions += LP_MAX_INSTRUCTIONS;
+            }
+
+            memcpy(bld.instructions + num_instructions,
+                   &parse.FullToken.FullInstruction,
+                   sizeof(bld.instructions[0]));
+
+            num_instructions++;
          }
 
          break;
@@ -1787,7 +2091,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
          {
             const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
             assert(size <= 4);
-            assert(num_immediates < LP_MAX_IMMEDIATES);
+            assert(num_immediates < LP_MAX_TGSI_IMMEDIATES);
             for( i = 0; i < size; ++i )
                bld.immediates[num_immediates][i] =
                   lp_build_const_vec(type, parse.FullToken.FullImmediate.u[i].Float);
@@ -1804,14 +2108,33 @@ lp_build_tgsi_soa(LLVMBuilderRef builder,
          assert( 0 );
       }
    }
+
+   while (pc != -1) {
+      struct tgsi_full_instruction *instr = bld.instructions + pc;
+      const struct tgsi_opcode_info *opcode_info =
+         tgsi_get_opcode_info(instr->Instruction.Opcode);
+      if (!emit_instruction( &bld, instr, opcode_info, &pc ))
+         _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
+                       opcode_info->mnemonic);
+   }
+
    if (0) {
       LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
       LLVMValueRef function = LLVMGetBasicBlockParent(block);
       debug_printf("11111111111111111111111111111 \n");
       tgsi_dump(tokens, 0);
-      LLVMDumpValue(function);
+      lp_debug_dump_value(function);
       debug_printf("2222222222222222222222222222 \n");
    }
    tgsi_parse_free( &parse );
+
+   if (0) {
+      LLVMModuleRef module = LLVMGetGlobalParent(
+         LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld.base.builder)));
+      LLVMDumpModule(module);
+
+   }
+
+   FREE( bld.instructions );
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
index 796af88caad..06f1aae6dcc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -195,6 +195,7 @@ lp_uint_type(struct lp_type type)
 {
    struct lp_type res_type;
 
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
    memset(&res_type, 0, sizeof res_type);
    res_type.width = type.width;
    res_type.length = type.length;
@@ -211,6 +212,7 @@ lp_int_type(struct lp_type type)
 {
    struct lp_type res_type;
 
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
    memset(&res_type, 0, sizeof res_type);
    res_type.width = type.width;
    res_type.length = type.length;
@@ -238,6 +240,131 @@ lp_wider_type(struct lp_type type)
 }
 
 
+/**
+ * Return the size of the LLVMType in bits.
+ * Only integer, float, double, vector and array types are handled (the
+ * last two by recursing into the element type); any other kind trips the
+ * assert and returns 0.
+ */
+unsigned
+lp_sizeof_llvm_type(LLVMTypeRef t)
+{
+   LLVMTypeKind k = LLVMGetTypeKind(t);
+
+   switch (k) {
+   case LLVMIntegerTypeKind:
+      return LLVMGetIntTypeWidth(t);
+   case LLVMFloatTypeKind:
+      return 8 * sizeof(float);
+   case LLVMDoubleTypeKind:
+      return 8 * sizeof(double);
+   case LLVMVectorTypeKind:
+      {
+         LLVMTypeRef elem = LLVMGetElementType(t);
+         unsigned len = LLVMGetVectorSize(t);
+         return len * lp_sizeof_llvm_type(elem);
+      }
+   case LLVMArrayTypeKind:
+      {
+         LLVMTypeRef elem = LLVMGetElementType(t);
+         unsigned len = LLVMGetArrayLength(t);
+         return len * lp_sizeof_llvm_type(elem);
+      }
+   default:
+      assert(0 && "Unexpected type in lp_sizeof_llvm_type()");
+      return 0;
+   }
+}
+
+
+/**
+ * Return string name for a LLVMTypeKind (unlisted kinds give a generic string).  For debugging.
+ */
+const char *
+lp_typekind_name(LLVMTypeKind t)
+{
+   switch (t) {
+   case LLVMVoidTypeKind:
+      return "LLVMVoidTypeKind";
+   case LLVMFloatTypeKind:
+      return "LLVMFloatTypeKind";
+   case LLVMDoubleTypeKind:
+      return "LLVMDoubleTypeKind";
+   case LLVMX86_FP80TypeKind:
+      return "LLVMX86_FP80TypeKind";
+   case LLVMFP128TypeKind:
+      return "LLVMFP128TypeKind";
+   case LLVMPPC_FP128TypeKind:
+      return "LLVMPPC_FP128TypeKind";
+   case LLVMLabelTypeKind:
+      return "LLVMLabelTypeKind";
+   case LLVMIntegerTypeKind:
+      return "LLVMIntegerTypeKind";
+   case LLVMFunctionTypeKind:
+      return "LLVMFunctionTypeKind";
+   case LLVMStructTypeKind:
+      return "LLVMStructTypeKind";
+   case LLVMArrayTypeKind:
+      return "LLVMArrayTypeKind";
+   case LLVMPointerTypeKind:
+      return "LLVMPointerTypeKind";
+   case LLVMOpaqueTypeKind:
+      return "LLVMOpaqueTypeKind";
+   case LLVMVectorTypeKind:
+      return "LLVMVectorTypeKind";
+   case LLVMMetadataTypeKind:
+      return "LLVMMetadataTypeKind";
+   /* Not handled: LLVMUnionTypeKind may exist only in LLVM 2.7 and later (unconfirmed).
+   case LLVMUnionTypeKind:
+      return "LLVMUnionTypeKind";
+   */
+   default:
+      return "unknown LLVMTypeKind";
+   }
+}
+
+
+/**
+ * Print an LLVMTypeRef.  Like LLVMDumpValue().  For debugging.
+ */
+void
+lp_dump_llvmtype(LLVMTypeRef t)
+{
+   const LLVMTypeKind kind = LLVMGetTypeKind(t);
+
+   switch (kind) {
+   case LLVMVectorTypeKind: {
+      LLVMTypeRef elem = LLVMGetElementType(t);
+      LLVMTypeKind elem_kind = LLVMGetTypeKind(elem);
+      unsigned count = LLVMGetVectorSize(t);
+      if (elem_kind == LLVMIntegerTypeKind)
+         debug_printf("Vector [%u] of %u-bit Integer\n", count,
+                      LLVMGetIntTypeWidth(elem));
+      else
+         debug_printf("Vector [%u] of %s\n", count, lp_typekind_name(elem_kind));
+      break;
+   }
+   case LLVMArrayTypeKind: {
+      LLVMTypeKind elem_kind = LLVMGetTypeKind(LLVMGetElementType(t));
+      unsigned count = LLVMGetArrayLength(t);
+      debug_printf("Array [%u] of %s\n", count, lp_typekind_name(elem_kind));
+      break;
+   }
+   case LLVMIntegerTypeKind:
+      debug_printf("%u-bit Integer\n", LLVMGetIntTypeWidth(t));
+      break;
+   case LLVMPointerTypeKind:
+      /* Print the prefix, then describe the pointee on the same line. */
+      debug_printf("Pointer to ");
+      lp_dump_llvmtype(LLVMGetElementType(t));
+      break;
+   default:
+      debug_printf("%s\n", lp_typekind_name(kind));
+      break;
+   }
+}
+
+
 void
 lp_build_context_init(struct lp_build_context *bld,
                       LLVMBuilderRef builder,
@@ -245,7 +372,23 @@ lp_build_context_init(struct lp_build_context *bld,
 {
    bld->builder = builder;
    bld->type = type;
-   bld->undef = lp_build_undef(type);
-   bld->zero = lp_build_zero(type);
+
+   bld->int_elem_type = lp_build_int_elem_type(type);
+   if (type.floating)
+      bld->elem_type = lp_build_elem_type(type);
+   else
+      bld->elem_type = bld->int_elem_type;
+
+   if (type.length == 1) {
+      bld->int_vec_type = bld->int_elem_type;
+      bld->vec_type = bld->elem_type;
+   }
+   else {
+      bld->int_vec_type = LLVMVectorType(bld->int_elem_type, type.length);
+      bld->vec_type = LLVMVectorType(bld->elem_type, type.length);
+   }
+
+   bld->undef = LLVMGetUndef(bld->vec_type);
+   bld->zero = LLVMConstNull(bld->vec_type);
    bld->one = lp_build_one(type);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
index cd59d2faa66..fec1d3dfbc6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_type.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -128,6 +128,18 @@ struct lp_build_context
     */
    struct lp_type type;
 
+   /** Same as lp_build_elem_type(type) */
+   LLVMTypeRef elem_type;
+
+   /** Same as lp_build_vec_type(type) */
+   LLVMTypeRef vec_type;
+
+   /** Same as lp_build_int_elem_type(type) */
+   LLVMTypeRef int_elem_type;
+
+   /** Same as lp_build_int_vec_type(type) */
+   LLVMTypeRef int_vec_type;
+
    /** Same as lp_build_undef(type) */
    LLVMValueRef undef;
 
@@ -304,6 +316,54 @@ LLVMTypeRef
 lp_build_int32_vec4_type(void);
 
 
+static INLINE struct lp_type
+lp_float32_vec4_type(void)
+{
+   struct lp_type f32x4;
+
+   /* Start from all-zero, then describe 4 x 32-bit signed float. */
+   memset(&f32x4, 0, sizeof f32x4);
+   f32x4.length = 4;
+   f32x4.width = 32;
+   f32x4.floating = TRUE;
+   f32x4.sign = TRUE;
+   f32x4.norm = FALSE;
+   return f32x4;
+}
+
+
+static INLINE struct lp_type
+lp_int32_vec4_type(void)
+{
+   struct lp_type i32x4;
+
+   /* Start from all-zero, then describe 4 x 32-bit signed integer. */
+   memset(&i32x4, 0, sizeof i32x4);
+   i32x4.length = 4;
+   i32x4.width = 32;
+   i32x4.floating = FALSE;
+   i32x4.sign = TRUE;
+   i32x4.norm = FALSE;
+   return i32x4;
+}
+
+
+static INLINE struct lp_type
+lp_unorm8_vec4_type(void)
+{
+   struct lp_type u8nx4;
+
+   /* Start from all-zero, then describe 4 x 8-bit unsigned normalized. */
+   memset(&u8nx4, 0, sizeof u8nx4);
+   u8nx4.length = 4;
+   u8nx4.width = 8;
+   u8nx4.floating = FALSE;
+   u8nx4.sign = FALSE;
+   u8nx4.norm = TRUE;
+   return u8nx4;
+}
+
+
 struct lp_type
 lp_uint_type(struct lp_type type);
 
@@ -316,6 +376,18 @@ struct lp_type
 lp_wider_type(struct lp_type type);
 
 
+unsigned
+lp_sizeof_llvm_type(LLVMTypeRef t);
+
+
+const char *
+lp_typekind_name(LLVMTypeKind t);
+
+
+void
+lp_dump_llvmtype(LLVMTypeRef t);
+
+
 void
 lp_build_context_init(struct lp_build_context *bld,
                       LLVMBuilderRef builder,
diff --git a/src/gallium/auxiliary/indices/.gitignore b/src/gallium/auxiliary/indices/.gitignore
new file mode 100644
index 00000000000..73740071451
--- /dev/null
+++ b/src/gallium/auxiliary/indices/.gitignore
@@ -0,0 +1,2 @@
+u_indices_gen.c
+u_unfilled_gen.c
diff --git a/src/gallium/auxiliary/os/os_stream.c b/src/gallium/auxiliary/os/os_stream.c
new file mode 100644
index 00000000000..3c55fc00d92
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_stream.c
@@ -0,0 +1,58 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_config.h"
+
+#include "os_stream.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+
+int
+os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap)
+{
+   char stack_buf[1024];
+   char *heap_buf;
+   va_list ap_copy;
+   int len;
+   /* Try the stack buffer first, on a copy so ap survives for a retry. */
+   va_copy(ap_copy, ap);
+   len = util_vsnprintf(stack_buf, sizeof(stack_buf), format, ap_copy);
+   va_end(ap_copy);
+   if (len <= 0)
+      return len;
+   if (len < sizeof(stack_buf)) {
+      stream->write(stream, stack_buf, len);
+      return len;
+   }
+   heap_buf = MALLOC(len + 1);   /* truncated: format again at full size */
+   if (!heap_buf)
+      return -1;
+   len = util_vsnprintf(heap_buf, len + 1, format, ap);
+   if (len > 0)
+      stream->write(stream, heap_buf, len);
+   FREE(heap_buf);
+   return len;
+}
diff --git a/src/gallium/auxiliary/os/os_stream.h b/src/gallium/auxiliary/os/os_stream.h
index 693a0621e2d..6c6050bb028 100644
--- a/src/gallium/auxiliary/os/os_stream.h
+++ b/src/gallium/auxiliary/os/os_stream.h
@@ -50,6 +50,9 @@ struct os_stream
 
    void
    (*flush)(struct os_stream *stream);
+
+   int
+   (*vprintf)(struct os_stream *stream, const char* format, va_list ap);
 };
 
 
@@ -90,6 +93,27 @@ os_stream_flush(struct os_stream *stream)
    stream->flush(stream);
 }
 
+int
+os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap);
+
+static INLINE int
+os_stream_vprintf (struct os_stream* stream, const char *format, va_list ap)
+{
+   return stream->vprintf(stream, format, ap);
+}
+
+static INLINE int
+os_stream_printf (struct os_stream* stream, const char *format, ...)
+{
+   va_list ap;
+   int result;
+
+   /* Gather the variadic arguments and delegate to the va_list variant. */
+   va_start(ap, format);
+   result = os_stream_vprintf(stream, format, ap);
+   va_end(ap);
+   return result;
+}
 
 struct os_stream *
 os_file_stream_create(const char *filename);
@@ -118,5 +142,4 @@ os_str_stream_get_and_close(struct os_stream *stream);
 #define os_file_stream_create(_filename) os_null_stream_create()
 #endif
 
-
 #endif /* _OS_STREAM_H_ */
diff --git a/src/gallium/auxiliary/os/os_stream_log.c b/src/gallium/auxiliary/os/os_stream_log.c
index 7cc2028a22c..b01377c3468 100644
--- a/src/gallium/auxiliary/os/os_stream_log.c
+++ b/src/gallium/auxiliary/os/os_stream_log.c
@@ -73,7 +73,8 @@ static struct os_stream
 os_log_stream_struct = {
    &os_log_stream_close,
    &os_log_stream_write,
-   &os_log_stream_flush
+   &os_log_stream_flush,
+   &os_default_stream_vprintf,
 };
 
 
diff --git a/src/gallium/auxiliary/os/os_stream_null.c b/src/gallium/auxiliary/os/os_stream_null.c
index 128c4e8f0e0..a549a789e62 100644
--- a/src/gallium/auxiliary/os/os_stream_null.c
+++ b/src/gallium/auxiliary/os/os_stream_null.c
@@ -56,12 +56,18 @@ os_null_stream_flush(struct os_stream *stream)
    (void)stream;
 }
 
+static int
+os_null_stream_vprintf (struct os_stream* stream, const char *format, va_list ap)
+{
+   return 0;
+}
 
 static struct os_stream
 os_null_stream = {
    &os_null_stream_close,
    &os_null_stream_write,
-   &os_null_stream_flush
+   &os_null_stream_flush,
+   &os_null_stream_vprintf
 };
 
 
diff --git a/src/gallium/auxiliary/os/os_stream_stdc.c b/src/gallium/auxiliary/os/os_stream_stdc.c
index 9e7ed711076..37e7d063e2b 100644
--- a/src/gallium/auxiliary/os/os_stream_stdc.c
+++ b/src/gallium/auxiliary/os/os_stream_stdc.c
@@ -83,6 +83,14 @@ os_stdc_stream_flush(struct os_stream *_stream)
    fflush(stream->file);
 }
 
+static int
+os_stdc_stream_vprintf (struct os_stream* _stream, const char *format, va_list ap)
+{
+   /* Hand the format and arguments straight to the wrapped stdio FILE. */
+   FILE *out = os_stdc_stream(_stream)->file;
+   return vfprintf(out, format, ap);
+}
+
 
 struct os_stream *
 os_file_stream_create(const char *filename)
@@ -96,6 +104,7 @@ os_file_stream_create(const char *filename)
    stream->base.close = &os_stdc_stream_close;
    stream->base.write = &os_stdc_stream_write;
    stream->base.flush = &os_stdc_stream_flush;
+   stream->base.vprintf = &os_stdc_stream_vprintf;
 
    stream->file = fopen(filename, "w");
    if(!stream->file)
diff --git a/src/gallium/auxiliary/os/os_stream_str.c b/src/gallium/auxiliary/os/os_stream_str.c
index b5c7270d2ae..be9478b2a17 100644
--- a/src/gallium/auxiliary/os/os_stream_str.c
+++ b/src/gallium/auxiliary/os/os_stream_str.c
@@ -118,6 +118,7 @@ os_str_stream_create(size_t size)
    stream->base.close = &os_str_stream_close;
    stream->base.write = &os_str_stream_write;
    stream->base.flush = &os_str_stream_flush;
+   stream->base.vprintf = &os_default_stream_vprintf;
 
    stream->str = os_malloc(size);
    if(!stream->str)
diff --git a/src/gallium/auxiliary/os/os_thread.h b/src/gallium/auxiliary/os/os_thread.h
index c09e8a7a76f..a084310d4ff 100644
--- a/src/gallium/auxiliary/os/os_thread.h
+++ b/src/gallium/auxiliary/os/os_thread.h
@@ -40,12 +40,11 @@
 #include "util/u_debug.h" /* for assert */
 
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN)
 
 #include <pthread.h> /* POSIX threads headers */
 #include <stdio.h> /* for perror() */
 
-#define PIPE_THREAD_HAVE_CONDVAR
 
 /* pipe_thread
  */
@@ -168,19 +167,59 @@ typedef CRITICAL_SECTION pipe_mutex;
 #define pipe_mutex_unlock(mutex) \
    LeaveCriticalSection(&mutex)
 
+/* TODO: Need a macro to declare "I don't care about WinXP compatibility" */
+#if 0 && defined (_WIN32_WINNT) && (_WIN32_WINNT >= 0x0600)
+/* CONDITION_VARIABLE is only available on newer versions of Windows
+ * (Server 2008/Vista or later).
+ * http://msdn.microsoft.com/en-us/library/ms682052(VS.85).aspx
+ *
+ * pipe_condvar
+ */
+typedef CONDITION_VARIABLE pipe_condvar;
+
+#define pipe_static_condvar(cond) \
+   /*static*/ pipe_condvar cond = CONDITION_VARIABLE_INIT
+
+#define pipe_condvar_init(cond) \
+   InitializeConditionVariable(&(cond))
+
+#define pipe_condvar_destroy(cond) \
+   (void) cond /* nothing to do */
+
+#define pipe_condvar_wait(cond, mutex) \
+   SleepConditionVariableCS(&(cond), &(mutex), INFINITE)
+
+#define pipe_condvar_signal(cond) \
+   WakeConditionVariable(&(cond))
+
+#define pipe_condvar_broadcast(cond) \
+   WakeAllConditionVariable(&(cond))
+
+#else /* need compatibility with pre-Vista Win32 */
 
 /* pipe_condvar (XXX FIX THIS)
+ * See http://www.cs.wustl.edu/~schmidt/win32-cv-1.html
+ * for potential pitfalls in implementation.
  */
-typedef unsigned pipe_condvar;
+typedef DWORD pipe_condvar;
+
+#define pipe_static_condvar(cond) \
+   /*static*/ pipe_condvar cond = 1
 
 #define pipe_condvar_init(cond) \
-   (void) cond
+   (void) (cond = 1)
 
 #define pipe_condvar_destroy(cond) \
    (void) cond
 
+/* Poor man's pthread_cond_wait():
+   Just release the mutex and sleep for one millisecond.
+   The caller's while() loop does all the work. */
 #define pipe_condvar_wait(cond, mutex) \
-   (void) cond; (void) mutex
+   do { pipe_mutex_unlock(mutex); \
+        Sleep(cond); \
+        pipe_mutex_lock(mutex); \
+   } while (0)
 
 #define pipe_condvar_signal(cond) \
    (void) cond
@@ -188,9 +227,12 @@ typedef unsigned pipe_condvar;
 #define pipe_condvar_broadcast(cond) \
    (void) cond
 
+#endif /* pre-Vista win32 */
 
 #else
 
+#include "os/os_time.h"
+
 /** Dummy definitions */
 
 typedef unsigned pipe_thread;
@@ -214,7 +256,6 @@ static INLINE int pipe_thread_destroy( pipe_thread thread )
 }
 
 typedef unsigned pipe_mutex;
-typedef unsigned pipe_condvar;
 
 #define pipe_static_mutex(mutex) \
    static pipe_mutex mutex = 0
@@ -231,17 +272,25 @@ typedef unsigned pipe_condvar;
 #define pipe_mutex_unlock(mutex) \
    (void) mutex
 
+typedef int64_t pipe_condvar;
+
 #define pipe_static_condvar(condvar) \
-   static unsigned condvar = 0
+   static pipe_condvar condvar = 1000
 
 #define pipe_condvar_init(condvar) \
-   (void) condvar
+   (void) (condvar = 1000)
 
 #define pipe_condvar_destroy(condvar) \
    (void) condvar
 
+/* Poor man's pthread_cond_wait():
+   Just release the mutex and sleep for one millisecond.
+   The caller's while() loop does all the work. */
 #define pipe_condvar_wait(condvar, mutex) \
-   (void) condvar
+   do { pipe_mutex_unlock(mutex); \
+        os_time_sleep(condvar); \
+        pipe_mutex_lock(mutex); \
+   } while (0)
 
 #define pipe_condvar_signal(condvar) \
    (void) condvar
@@ -277,27 +326,7 @@ static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
 }
 
 
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
-
-/* XXX FIX THIS */
-typedef unsigned pipe_barrier;
-
-static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
-{
-   /* XXX we could implement barriers with a mutex and condition var */
-}
-
-static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
-{
-}
-
-static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
-{
-   assert(0);
-}
-
-
-#else
+#else /* If the OS doesn't have its own, implement barriers using a mutex and a condvar */
 
 typedef struct {
    unsigned count;
@@ -405,7 +434,7 @@ pipe_semaphore_wait(pipe_semaphore *sema)
  */
 
 typedef struct {
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN)
    pthread_key_t key;
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
    DWORD key;
@@ -420,7 +449,7 @@ typedef struct {
 static INLINE void
 pipe_tsd_init(pipe_tsd *tsd)
 {
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN)
    if (pthread_key_create(&tsd->key, NULL/*free*/) != 0) {
       perror("pthread_key_create(): failed to allocate key for thread specific data");
       exit(-1);
@@ -437,7 +466,7 @@ pipe_tsd_get(pipe_tsd *tsd)
    if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
       pipe_tsd_init(tsd);
    }
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN)
    return pthread_getspecific(tsd->key);
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
    assert(0);
@@ -454,7 +483,7 @@ pipe_tsd_set(pipe_tsd *tsd, void *value)
    if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
       pipe_tsd_init(tsd);
    }
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN)
    if (pthread_setspecific(tsd->key, value) != 0) {
       perror("pthread_set_specific() failed");
       exit(-1);
diff --git a/src/gallium/auxiliary/os/os_time.c b/src/gallium/auxiliary/os/os_time.c
index 6259142bec0..84907215fe6 100644
--- a/src/gallium/auxiliary/os/os_time.c
+++ b/src/gallium/auxiliary/os/os_time.c
@@ -37,7 +37,7 @@
 
 #if !defined(PIPE_OS_EMBEDDED)
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_CYGWIN)
 #  include <sys/time.h> /* timeval */
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
 #  include <windows.h>
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index a6c50dcf0c1..5a13f39849f 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -130,7 +130,7 @@ struct pb_vtbl
     * flags is bitmask of PB_USAGE_CPU_READ/WRITE. 
     */
    void *(*map)( struct pb_buffer *buf, 
-                 unsigned flags );
+                 unsigned flags, void *flush_ctx );
    
    void (*unmap)( struct pb_buffer *buf );
 
@@ -164,13 +164,13 @@ struct pb_vtbl
  */
 static INLINE void *
 pb_map(struct pb_buffer *buf, 
-       unsigned flags)
+       unsigned flags, void *flush_ctx)
 {
    assert(buf);
    if(!buf)
       return NULL;
    assert(pipe_is_referenced(&buf->base.reference));
-   return buf->vtbl->map(buf, flags);
+   return buf->vtbl->map(buf, flags, flush_ctx);
 }
 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index d6cf6405825..c310f28f51f 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -624,7 +624,7 @@ fenced_buffer_copy_storage_to_gpu_locked(struct fenced_buffer *fenced_buf)
    assert(fenced_buf->data);
    assert(fenced_buf->buffer);
 
-   map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_WRITE);
+   map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_WRITE, NULL);
    if(!map)
       return PIPE_ERROR;
 
@@ -644,7 +644,7 @@ fenced_buffer_copy_storage_to_cpu_locked(struct fenced_buffer *fenced_buf)
    assert(fenced_buf->data);
    assert(fenced_buf->buffer);
 
-   map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_READ);
+   map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_READ, NULL);
    if(!map)
       return PIPE_ERROR;
 
@@ -674,7 +674,7 @@ fenced_buffer_destroy(struct pb_buffer *buf)
 
 static void *
 fenced_buffer_map(struct pb_buffer *buf,
-                  unsigned flags)
+                  unsigned flags, void *flush_ctx)
 {
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);
    struct fenced_manager *fenced_mgr = fenced_buf->mgr;
@@ -712,7 +712,7 @@ fenced_buffer_map(struct pb_buffer *buf,
    }
 
    if(fenced_buf->buffer) {
-      map = pb_map(fenced_buf->buffer, flags);
+      map = pb_map(fenced_buf->buffer, flags, flush_ctx);
    }
    else {
       assert(fenced_buf->data);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
index b706f429be5..c2322eed19b 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
@@ -70,7 +70,8 @@ malloc_buffer_destroy(struct pb_buffer *buf)
 
 static void *
 malloc_buffer_map(struct pb_buffer *buf, 
-                  unsigned flags)
+                  unsigned flags,
+		  void *flush_ctx)
 {
    return malloc_buffer(buf)->data;
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
index cec2524da2b..2ef02160f23 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -50,8 +50,7 @@
 #define PB_BUFMGR_H_
 
 
-#include "pipe/p_compiler.h"
-#include "pipe/p_defines.h"
+#include "pb_buffer.h"
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 88501e8d72d..a6eb4039621 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -167,10 +167,10 @@ pb_cache_buffer_destroy(struct pb_buffer *_buf)
 
 static void *
 pb_cache_buffer_map(struct pb_buffer *_buf, 
-                  unsigned flags)
+		    unsigned flags, void *flush_ctx)
 {
    struct pb_cache_buffer *buf = pb_cache_buffer(_buf);   
-   return pb_map(buf->buffer, flags);
+   return pb_map(buf->buffer, flags, flush_ctx);
 }
 
 
@@ -222,7 +222,7 @@ pb_cache_buffer_vtbl = {
 };
 
 
-static INLINE boolean
+static INLINE int
 pb_cache_is_buffer_compat(struct pb_cache_buffer *buf,  
                           pb_size size,
                           const struct pb_desc *desc)
@@ -230,26 +230,26 @@ pb_cache_is_buffer_compat(struct pb_cache_buffer *buf,
    void *map;
 
    if(buf->base.base.size < size)
-      return FALSE;
+      return 0;
 
    /* be lenient with size */
    if(buf->base.base.size >= 2*size)
-      return FALSE;
+      return 0;
    
    if(!pb_check_alignment(desc->alignment, buf->base.base.alignment))
-      return FALSE;
+      return 0;
    
    if(!pb_check_usage(desc->usage, buf->base.base.usage))
-      return FALSE;
+      return 0;
 
-   map = pb_map(buf->buffer, PB_USAGE_DONTBLOCK);
+   map = pb_map(buf->buffer, PB_USAGE_DONTBLOCK, NULL);
    if (!map) {
-      return FALSE;
+      return -1;
    }
 
    pb_unmap(buf->buffer);
    
-   return TRUE;
+   return 1;
 }
 
 
@@ -263,7 +263,8 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    struct pb_cache_buffer *curr_buf;
    struct list_head *curr, *next;
    int64_t now;
-   
+   int ret = 0;
+
    pipe_mutex_lock(mgr->mutex);
 
    buf = NULL;
@@ -274,25 +275,30 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    now = os_time_get();
    while(curr != &mgr->delayed) {
       curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
-      if(!buf && pb_cache_is_buffer_compat(curr_buf, size, desc))
-	 buf = curr_buf;
+      if(!buf && (ret = pb_cache_is_buffer_compat(curr_buf, size, desc)) > 0) /* assign first, then compare: ret must keep the raw result (-1 = non-blocking map failed), not the 0/1 of "> 0" */
+         buf = curr_buf;
       else if(os_time_timeout(curr_buf->start, curr_buf->end, now))
-	 _pb_cache_buffer_destroy(curr_buf);
+         _pb_cache_buffer_destroy(curr_buf);
       else
          /* This buffer (and all hereafter) are still hot in cache */
          break;
+      if (ret == -1)
+         break;
       curr = next; 
       next = curr->next;
    }
 
    /* keep searching in the hot buffers */
-   if(!buf) {
+   if(!buf && ret != -1) {
       while(curr != &mgr->delayed) {
          curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
-         if(pb_cache_is_buffer_compat(curr_buf, size, desc)) {
+         ret = pb_cache_is_buffer_compat(curr_buf, size, desc);
+         if (ret > 0) {
             buf = curr_buf;
             break;
          }
+         if (ret == -1)
+            break;
          /* no need to check the timeout here */
          curr = next;
          next = curr->next;
@@ -301,6 +307,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    
    if(buf) {
       LIST_DEL(&buf->head);
+      --mgr->numDelayed;
       pipe_mutex_unlock(mgr->mutex);
       /* Increase refcount */
       pipe_reference_init(&buf->base.base.reference, 1);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 0dc5b31a754..7604e75af8d 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -158,7 +158,7 @@ pb_debug_buffer_fill(struct pb_debug_buffer *buf)
 {
    uint8_t *map;
    
-   map = pb_map(buf->buffer, PB_USAGE_CPU_WRITE);
+   map = pb_map(buf->buffer, PB_USAGE_CPU_WRITE, NULL);
    assert(map);
    if(map) {
       fill_random_pattern(map, buf->underflow_size);
@@ -181,7 +181,7 @@ pb_debug_buffer_check(struct pb_debug_buffer *buf)
    
    map = pb_map(buf->buffer,
                 PB_USAGE_CPU_READ |
-                PB_USAGE_UNSYNCHRONIZED);
+                PB_USAGE_UNSYNCHRONIZED, NULL);
    assert(map);
    if(map) {
       boolean underflow, overflow;
@@ -247,14 +247,14 @@ pb_debug_buffer_destroy(struct pb_buffer *_buf)
 
 static void *
 pb_debug_buffer_map(struct pb_buffer *_buf, 
-                    unsigned flags)
+                    unsigned flags, void *flush_ctx)
 {
    struct pb_debug_buffer *buf = pb_debug_buffer(_buf);
    void *map;
    
    pb_debug_buffer_check(buf);
 
-   map = pb_map(buf->buffer, flags);
+   map = pb_map(buf->buffer, flags, flush_ctx);
    if(!map)
       return NULL;
    
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index faf7c352674..88da786216a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -108,11 +108,14 @@ mm_buffer_destroy(struct pb_buffer *buf)
 
 static void *
 mm_buffer_map(struct pb_buffer *buf,
-              unsigned flags)
+              unsigned flags,
+              void *flush_ctx)
 {
    struct mm_buffer *mm_buf = mm_buffer(buf);
    struct mm_pb_manager *mm = mm_buf->mgr;
 
+   /* XXX: it will be necessary to remap here to propagate flush_ctx */
+
    return (unsigned char *) mm->map + mm_buf->block->ofs;
 }
 
@@ -269,7 +272,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
 
    mm->map = pb_map(mm->buffer, 
 		    PB_USAGE_CPU_READ |
-		    PB_USAGE_CPU_WRITE);
+		    PB_USAGE_CPU_WRITE, NULL);
    if(!mm->map)
       goto failure;
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
index 31f1ebbeb7c..694a092f3c2 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c
@@ -103,13 +103,13 @@ pb_ondemand_buffer_destroy(struct pb_buffer *_buf)
 
 static void *
 pb_ondemand_buffer_map(struct pb_buffer *_buf, 
-                       unsigned flags)
+                       unsigned flags, void *flush_ctx)
 {
    struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf);
 
    if(buf->buffer) {
       assert(!buf->data);
-      return pb_map(buf->buffer, flags);
+      return pb_map(buf->buffer, flags, flush_ctx);
    }
    else {
       assert(buf->data);
@@ -150,7 +150,7 @@ pb_ondemand_buffer_instantiate(struct pb_ondemand_buffer *buf)
       if(!buf->buffer)
          return PIPE_ERROR_OUT_OF_MEMORY;
       
-      map = pb_map(buf->buffer, PB_USAGE_CPU_READ);
+      map = pb_map(buf->buffer, PB_USAGE_CPU_READ, NULL);
       if(!map) {
          pb_reference(&buf->buffer, NULL);
          return PIPE_ERROR;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index fdcce428784..2f7c7389ff4 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -118,12 +118,14 @@ pool_buffer_destroy(struct pb_buffer *buf)
 
 
 static void *
-pool_buffer_map(struct pb_buffer *buf, unsigned flags)
+pool_buffer_map(struct pb_buffer *buf, unsigned flags, void *flush_ctx)
 {
    struct pool_buffer *pool_buf = pool_buffer(buf);
    struct pool_pb_manager *pool = pool_buf->mgr;
    void *map;
 
+   /* XXX: it will be necessary to remap here to propagate flush_ctx */
+
    pipe_mutex_lock(pool->mutex);
    map = (unsigned char *) pool->map + pool_buf->start;
    pipe_mutex_unlock(pool->mutex);
@@ -285,7 +287,7 @@ pool_bufmgr_create(struct pb_manager *provider,
 
    pool->map = pb_map(pool->buffer,
                           PB_USAGE_CPU_READ |
-                          PB_USAGE_CPU_WRITE);
+                          PB_USAGE_CPU_WRITE, NULL);
    if(!pool->map)
       goto failure;
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index 7a3305aaf37..176f9aa38aa 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -227,10 +227,13 @@ pb_slab_buffer_destroy(struct pb_buffer *_buf)
 
 static void *
 pb_slab_buffer_map(struct pb_buffer *_buf, 
-                   unsigned flags)
+                   unsigned flags,
+                   void *flush_ctx)
 {
    struct pb_slab_buffer *buf = pb_slab_buffer(_buf);
 
+   /* XXX: it will be necessary to remap here to propagate flush_ctx */
+
    ++buf->mapCount;
    return (void *) ((uint8_t *) buf->slab->virtual + buf->start);
 }
@@ -316,7 +319,7 @@ pb_slab_create(struct pb_slab_manager *mgr)
     * through this address so it is required that the buffer is pinned. */
    slab->virtual = pb_map(slab->bo, 
                           PB_USAGE_CPU_READ |
-                          PB_USAGE_CPU_WRITE);
+                          PB_USAGE_CPU_WRITE, NULL);
    if(!slab->virtual) {
       ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto out_err1;
diff --git a/src/gallium/auxiliary/rbug/README b/src/gallium/auxiliary/rbug/README
index d984067893c..c5156438a1b 100644
--- a/src/gallium/auxiliary/rbug/README
+++ b/src/gallium/auxiliary/rbug/README
@@ -10,7 +10,7 @@ The code currently uses tcp and ip4v for connections.
 
 Information about driver integration can be found in:
 
-src/gallium/drivers/trace/README
+src/gallium/drivers/rbug/README
 
 for information about applications look in:
 
diff --git a/src/gallium/auxiliary/rbug/rbug_context.c b/src/gallium/auxiliary/rbug/rbug_context.c
index 1832425658f..a3fd7e8430e 100644
--- a/src/gallium/auxiliary/rbug/rbug_context.c
+++ b/src/gallium/auxiliary/rbug/rbug_context.c
@@ -480,7 +480,7 @@ struct rbug_proto_context_list * rbug_demarshal_context_list(struct rbug_proto_h
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_LIST)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_LIST)
 		return NULL;
 
 	pos = 0;
@@ -506,7 +506,7 @@ struct rbug_proto_context_info * rbug_demarshal_context_info(struct rbug_proto_h
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_INFO)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_INFO)
 		return NULL;
 
 	pos = 0;
@@ -533,7 +533,7 @@ struct rbug_proto_context_draw_block * rbug_demarshal_context_draw_block(struct
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_BLOCK)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_BLOCK)
 		return NULL;
 
 	pos = 0;
@@ -561,7 +561,7 @@ struct rbug_proto_context_draw_step * rbug_demarshal_context_draw_step(struct rb
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_STEP)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_STEP)
 		return NULL;
 
 	pos = 0;
@@ -589,7 +589,7 @@ struct rbug_proto_context_draw_unblock * rbug_demarshal_context_draw_unblock(str
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_UNBLOCK)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_UNBLOCK)
 		return NULL;
 
 	pos = 0;
@@ -617,7 +617,7 @@ struct rbug_proto_context_draw_rule * rbug_demarshal_context_draw_rule(struct rb
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_RULE)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_RULE)
 		return NULL;
 
 	pos = 0;
@@ -649,7 +649,7 @@ struct rbug_proto_context_flush * rbug_demarshal_context_flush(struct rbug_proto
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_FLUSH)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_FLUSH)
 		return NULL;
 
 	pos = 0;
@@ -677,7 +677,7 @@ struct rbug_proto_context_list_reply * rbug_demarshal_context_list_reply(struct
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_LIST_REPLY)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_LIST_REPLY)
 		return NULL;
 
 	pos = 0;
@@ -705,7 +705,7 @@ struct rbug_proto_context_info_reply * rbug_demarshal_context_info_reply(struct
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_INFO_REPLY)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_INFO_REPLY)
 		return NULL;
 
 	pos = 0;
@@ -739,7 +739,7 @@ struct rbug_proto_context_draw_blocked * rbug_demarshal_context_draw_blocked(str
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_BLOCKED)
+	if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_BLOCKED)
 		return NULL;
 
 	pos = 0;
diff --git a/src/gallium/auxiliary/rbug/rbug_core.c b/src/gallium/auxiliary/rbug/rbug_core.c
index 876ae5a0ce6..1d47d13c9f3 100644
--- a/src/gallium/auxiliary/rbug/rbug_core.c
+++ b/src/gallium/auxiliary/rbug/rbug_core.c
@@ -233,7 +233,7 @@ struct rbug_proto_noop * rbug_demarshal_noop(struct rbug_proto_header *header)
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_NOOP)
+	if (header->opcode != (int32_t)RBUG_OP_NOOP)
 		return NULL;
 
 	pos = 0;
@@ -259,7 +259,7 @@ struct rbug_proto_ping * rbug_demarshal_ping(struct rbug_proto_header *header)
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_PING)
+	if (header->opcode != (int32_t)RBUG_OP_PING)
 		return NULL;
 
 	pos = 0;
@@ -285,7 +285,7 @@ struct rbug_proto_error * rbug_demarshal_error(struct rbug_proto_header *header)
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_ERROR)
+	if (header->opcode != (int32_t)RBUG_OP_ERROR)
 		return NULL;
 
 	pos = 0;
@@ -312,7 +312,7 @@ struct rbug_proto_ping_reply * rbug_demarshal_ping_reply(struct rbug_proto_heade
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_PING_REPLY)
+	if (header->opcode != (int32_t)RBUG_OP_PING_REPLY)
 		return NULL;
 
 	pos = 0;
@@ -339,7 +339,7 @@ struct rbug_proto_error_reply * rbug_demarshal_error_reply(struct rbug_proto_hea
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_ERROR_REPLY)
+	if (header->opcode != (int32_t)RBUG_OP_ERROR_REPLY)
 		return NULL;
 
 	pos = 0;
diff --git a/src/gallium/auxiliary/rbug/rbug_demarshal.c b/src/gallium/auxiliary/rbug/rbug_demarshal.c
index 47390fbcee7..06caa45469d 100644
--- a/src/gallium/auxiliary/rbug/rbug_demarshal.c
+++ b/src/gallium/auxiliary/rbug/rbug_demarshal.c
@@ -91,3 +91,67 @@ struct rbug_header * rbug_demarshal(struct rbug_proto_header *header)
 		return NULL;
 	}
 }
+
+const char* rbug_proto_get_name(enum rbug_opcode opcode) /* returns the enumerator's own name as a string literal, or NULL for an opcode not listed below */
+{
+	switch(opcode) {
+	case RBUG_OP_NOOP:
+		return "RBUG_OP_NOOP";
+	case RBUG_OP_PING:
+		return "RBUG_OP_PING";
+	case RBUG_OP_ERROR:
+		return "RBUG_OP_ERROR";
+	case RBUG_OP_PING_REPLY:
+		return "RBUG_OP_PING_REPLY";
+	case RBUG_OP_ERROR_REPLY:
+		return "RBUG_OP_ERROR_REPLY";
+	case RBUG_OP_TEXTURE_LIST:
+		return "RBUG_OP_TEXTURE_LIST";
+	case RBUG_OP_TEXTURE_INFO:
+		return "RBUG_OP_TEXTURE_INFO";
+	case RBUG_OP_TEXTURE_WRITE:
+		return "RBUG_OP_TEXTURE_WRITE";
+	case RBUG_OP_TEXTURE_READ:
+		return "RBUG_OP_TEXTURE_READ";
+	case RBUG_OP_TEXTURE_LIST_REPLY:
+		return "RBUG_OP_TEXTURE_LIST_REPLY";
+	case RBUG_OP_TEXTURE_INFO_REPLY:
+		return "RBUG_OP_TEXTURE_INFO_REPLY";
+	case RBUG_OP_TEXTURE_READ_REPLY:
+		return "RBUG_OP_TEXTURE_READ_REPLY";
+	case RBUG_OP_CONTEXT_LIST:
+		return "RBUG_OP_CONTEXT_LIST";
+	case RBUG_OP_CONTEXT_INFO:
+		return "RBUG_OP_CONTEXT_INFO";
+	case RBUG_OP_CONTEXT_DRAW_BLOCK:
+		return "RBUG_OP_CONTEXT_DRAW_BLOCK";
+	case RBUG_OP_CONTEXT_DRAW_STEP:
+		return "RBUG_OP_CONTEXT_DRAW_STEP";
+	case RBUG_OP_CONTEXT_DRAW_UNBLOCK:
+		return "RBUG_OP_CONTEXT_DRAW_UNBLOCK";
+	case RBUG_OP_CONTEXT_DRAW_RULE:
+		return "RBUG_OP_CONTEXT_DRAW_RULE";
+	case RBUG_OP_CONTEXT_FLUSH:
+		return "RBUG_OP_CONTEXT_FLUSH";
+	case RBUG_OP_CONTEXT_LIST_REPLY:
+		return "RBUG_OP_CONTEXT_LIST_REPLY";
+	case RBUG_OP_CONTEXT_INFO_REPLY:
+		return "RBUG_OP_CONTEXT_INFO_REPLY";
+	case RBUG_OP_CONTEXT_DRAW_BLOCKED:
+		return "RBUG_OP_CONTEXT_DRAW_BLOCKED";
+	case RBUG_OP_SHADER_LIST:
+		return "RBUG_OP_SHADER_LIST";
+	case RBUG_OP_SHADER_INFO:
+		return "RBUG_OP_SHADER_INFO";
+	case RBUG_OP_SHADER_DISABLE:
+		return "RBUG_OP_SHADER_DISABLE";
+	case RBUG_OP_SHADER_REPLACE:
+		return "RBUG_OP_SHADER_REPLACE";
+	case RBUG_OP_SHADER_LIST_REPLY:
+		return "RBUG_OP_SHADER_LIST_REPLY";
+	case RBUG_OP_SHADER_INFO_REPLY:
+		return "RBUG_OP_SHADER_INFO_REPLY";
+	default:
+		return NULL; /* unknown opcode: callers must handle NULL before printing */
+	}
+}
diff --git a/src/gallium/auxiliary/rbug/rbug_proto.h b/src/gallium/auxiliary/rbug/rbug_proto.h
index 4f3eb75dc4d..2fce725bc9e 100644
--- a/src/gallium/auxiliary/rbug/rbug_proto.h
+++ b/src/gallium/auxiliary/rbug/rbug_proto.h
@@ -91,4 +91,9 @@ struct rbug_proto_header
  */
 struct rbug_connection;
 
+/**
+ * Get printable string for opcode.
+ */
+const char* rbug_proto_get_name(enum rbug_opcode opcode);
+
 #endif
diff --git a/src/gallium/auxiliary/rbug/rbug_shader.c b/src/gallium/auxiliary/rbug/rbug_shader.c
index fccd2f55efd..1742941cc17 100644
--- a/src/gallium/auxiliary/rbug/rbug_shader.c
+++ b/src/gallium/auxiliary/rbug/rbug_shader.c
@@ -305,7 +305,7 @@ struct rbug_proto_shader_list * rbug_demarshal_shader_list(struct rbug_proto_hea
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_SHADER_LIST)
+	if (header->opcode != (int32_t)RBUG_OP_SHADER_LIST)
 		return NULL;
 
 	pos = 0;
@@ -332,7 +332,7 @@ struct rbug_proto_shader_info * rbug_demarshal_shader_info(struct rbug_proto_hea
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_SHADER_INFO)
+	if (header->opcode != (int32_t)RBUG_OP_SHADER_INFO)
 		return NULL;
 
 	pos = 0;
@@ -360,7 +360,7 @@ struct rbug_proto_shader_disable * rbug_demarshal_shader_disable(struct rbug_pro
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_SHADER_DISABLE)
+	if (header->opcode != (int32_t)RBUG_OP_SHADER_DISABLE)
 		return NULL;
 
 	pos = 0;
@@ -389,7 +389,7 @@ struct rbug_proto_shader_replace * rbug_demarshal_shader_replace(struct rbug_pro
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_SHADER_REPLACE)
+	if (header->opcode != (int32_t)RBUG_OP_SHADER_REPLACE)
 		return NULL;
 
 	pos = 0;
@@ -418,7 +418,7 @@ struct rbug_proto_shader_list_reply * rbug_demarshal_shader_list_reply(struct rb
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_SHADER_LIST_REPLY)
+	if (header->opcode != (int32_t)RBUG_OP_SHADER_LIST_REPLY)
 		return NULL;
 
 	pos = 0;
@@ -446,7 +446,7 @@ struct rbug_proto_shader_info_reply * rbug_demarshal_shader_info_reply(struct rb
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_SHADER_INFO_REPLY)
+	if (header->opcode != (int32_t)RBUG_OP_SHADER_INFO_REPLY)
 		return NULL;
 
 	pos = 0;
diff --git a/src/gallium/auxiliary/rbug/rbug_texture.c b/src/gallium/auxiliary/rbug/rbug_texture.c
index 5a918fe6bc0..2ad577915e8 100644
--- a/src/gallium/auxiliary/rbug/rbug_texture.c
+++ b/src/gallium/auxiliary/rbug/rbug_texture.c
@@ -417,7 +417,7 @@ struct rbug_proto_texture_list * rbug_demarshal_texture_list(struct rbug_proto_h
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_TEXTURE_LIST)
+	if (header->opcode != (int32_t)RBUG_OP_TEXTURE_LIST)
 		return NULL;
 
 	pos = 0;
@@ -443,7 +443,7 @@ struct rbug_proto_texture_info * rbug_demarshal_texture_info(struct rbug_proto_h
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_TEXTURE_INFO)
+	if (header->opcode != (int32_t)RBUG_OP_TEXTURE_INFO)
 		return NULL;
 
 	pos = 0;
@@ -470,7 +470,7 @@ struct rbug_proto_texture_write * rbug_demarshal_texture_write(struct rbug_proto
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_TEXTURE_WRITE)
+	if (header->opcode != (int32_t)RBUG_OP_TEXTURE_WRITE)
 		return NULL;
 
 	pos = 0;
@@ -506,7 +506,7 @@ struct rbug_proto_texture_read * rbug_demarshal_texture_read(struct rbug_proto_h
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_TEXTURE_READ)
+	if (header->opcode != (int32_t)RBUG_OP_TEXTURE_READ)
 		return NULL;
 
 	pos = 0;
@@ -540,7 +540,7 @@ struct rbug_proto_texture_list_reply * rbug_demarshal_texture_list_reply(struct
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_TEXTURE_LIST_REPLY)
+	if (header->opcode != (int32_t)RBUG_OP_TEXTURE_LIST_REPLY)
 		return NULL;
 
 	pos = 0;
@@ -568,7 +568,7 @@ struct rbug_proto_texture_info_reply * rbug_demarshal_texture_info_reply(struct
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_TEXTURE_INFO_REPLY)
+	if (header->opcode != (int32_t)RBUG_OP_TEXTURE_INFO_REPLY)
 		return NULL;
 
 	pos = 0;
@@ -606,7 +606,7 @@ struct rbug_proto_texture_read_reply * rbug_demarshal_texture_read_reply(struct
 
 	if (!header)
 		return NULL;
-	if (header->opcode != (int16_t)RBUG_OP_TEXTURE_READ_REPLY)
+	if (header->opcode != (int32_t)RBUG_OP_TEXTURE_READ_REPLY)
 		return NULL;
 
 	pos = 0;
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
index 2e15751e508..0461c815504 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -30,7 +30,7 @@
 #include "rtasm_cpu.h"
 
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 static boolean rtasm_sse_enabled(void)
 {
    static boolean firsttime = 1;
@@ -49,7 +49,7 @@ static boolean rtasm_sse_enabled(void)
 int rtasm_cpu_has_sse(void)
 {
    /* FIXME: actually detect this at run-time */
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    return rtasm_sse_enabled();
 #else
    return 0;
@@ -59,7 +59,7 @@ int rtasm_cpu_has_sse(void)
 int rtasm_cpu_has_sse2(void) 
 {
    /* FIXME: actually detect this at run-time */
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    return rtasm_sse_enabled();
 #else
    return 0;
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 65d5ce795be..fbde1d191a4 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -58,7 +58,6 @@
 
 #include <unistd.h>
 #include <sys/mman.h>
-#include "os/os_thread.h"
 #include "util/u_mm.h"
 
 #define EXEC_HEAP_SIZE (10*1024*1024)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 7595214bdf2..75b0f6a68ea 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -22,8 +22,9 @@
  **************************************************************************/
 
 #include "pipe/p_config.h"
+#include "util/u_cpu_detect.h"
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
@@ -231,6 +232,10 @@ static void emit_modrm( struct x86_function *p,
    
    assert(reg.mod == mod_REG);
    
+   /* TODO: support extended x86-64 registers */
+   assert(reg.idx < 8);
+   assert(regmem.idx < 8);
+
    val |= regmem.mod << 6;     	/* mod field */
    val |= reg.idx << 3;		/* reg field */
    val |= regmem.idx;		/* r/m field */
@@ -363,6 +368,12 @@ int x86_get_label( struct x86_function *p )
  */
 
 
+void x64_rexw(struct x86_function *p) /* emits a single 0x48 prefix byte, and only when the target is not X86_32 */
+{
+   if(x86_target(p) != X86_32)
+      emit_1ub(p, 0x48); /* 0x48 = REX prefix with only the W bit set (64-bit operand size) */
+}
+
 void x86_jcc( struct x86_function *p,
 	      enum x86_cc cc,
 	      int label )
@@ -449,6 +460,52 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
    emit_1i(p, imm);
 }
 
+void x86_mov_imm( struct x86_function *p, struct x86_reg dst, int imm ) /* move immediate imm into dst; dst may be a register (mod_REG) or any other addressing mode */
+{
+   DUMP_RI( dst, imm );
+   if(dst.mod == mod_REG)
+      x86_mov_reg_imm(p, dst, imm); /* register destination: delegate to the register/immediate emitter */
+   else
+   {
+      emit_1ub(p, 0xc7); /* opcode C7 with 0 in the ModRM reg field: MOV r/m, imm */
+      emit_modrm_noreg(p, 0, dst);
+      emit_1i(p, imm);
+   }
+}
+
+void x86_mov16_imm( struct x86_function *p, struct x86_reg dst, uint16_t imm ) /* 16-bit immediate move into a register or non-register dst */
+{
+   DUMP_RI( dst, imm );
+   emit_1ub(p, 0x66); /* operand-size override prefix, emitted for both encodings below */
+   if(dst.mod == mod_REG)
+   {
+      emit_1ub(p, 0xb8 + dst.idx); /* B8+r form: register index is folded into the opcode byte */
+      emit_2ub(p, imm & 0xff, imm >> 8); /* immediate split into two bytes, low byte first */
+   }
+   else
+   {
+      emit_1ub(p, 0xc7); /* opcode C7 with 0 in the ModRM reg field: MOV r/m, imm */
+      emit_modrm_noreg(p, 0, dst);
+      emit_2ub(p, imm & 0xff, imm >> 8); /* same two-byte immediate as the register form */
+   }
+}
+
+void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm ) /* 8-bit immediate move into a register or non-register dst */
+{
+   DUMP_RI( dst, imm );
+   if(dst.mod == mod_REG)
+   {
+      emit_1ub(p, 0xb0 + dst.idx); /* B0+r form: register index is folded into the opcode byte */
+      emit_1ub(p, imm);
+   }
+   else
+   {
+      emit_1ub(p, 0xc6); /* opcode C6 with 0 in the ModRM reg field: MOV r/m8, imm8 */
+      emit_modrm_noreg(p, 0, dst);
+      emit_1ub(p, imm);
+   }
+}
+
 /**
  * Immediate group 1 instructions.
  */
@@ -520,7 +577,7 @@ void x86_push( struct x86_function *p,
    }
 
 
-   p->stack_offset += 4;
+   p->stack_offset += sizeof(void*);
 }
 
 void x86_push_imm32( struct x86_function *p,
@@ -530,7 +587,7 @@ void x86_push_imm32( struct x86_function *p,
    emit_1ub(p, 0x68);
    emit_1i(p,  imm32);
 
-   p->stack_offset += 4;
+   p->stack_offset += sizeof(void*);
 }
 
 
@@ -540,23 +597,33 @@ void x86_pop( struct x86_function *p,
    DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x58 + reg.idx);
-   p->stack_offset -= 4;
+   p->stack_offset -= sizeof(void*);
 }
 
 void x86_inc( struct x86_function *p,
 	      struct x86_reg reg )
 {
    DUMP_R( reg );
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x40 + reg.idx);
+   if(x86_target(p) == X86_32 && reg.mod == mod_REG)
+   {
+      emit_1ub(p, 0x40 + reg.idx);
+      return;
+   }
+   emit_1ub(p, 0xff);
+   emit_modrm_noreg(p, 0, reg);
 }
 
 void x86_dec( struct x86_function *p,
 	      struct x86_reg reg )
 {
    DUMP_R( reg );
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x48 + reg.idx);
+   if(x86_target(p) == X86_32 && reg.mod == mod_REG)
+   {
+      emit_1ub(p, 0x48 + reg.idx);
+      return;
+   }
+   emit_1ub(p, 0xff);
+   emit_modrm_noreg(p, 1, reg);
 }
 
 void x86_ret( struct x86_function *p )
@@ -583,9 +650,82 @@ void x86_mov( struct x86_function *p,
 	      struct x86_reg src )
 {
    DUMP_RR( dst, src );
+   /* special hack for reading arguments until we support x86-64 registers everywhere */
+   if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8))
+   {
+      uint8_t rex = 0x40;
+      if(dst.idx >= 8)
+      {
+         rex |= 4;
+         dst.idx -= 8;
+      }
+      if(src.idx >= 8)
+      {
+         rex |= 1;
+         src.idx -= 8;
+      }
+      emit_1ub(p, rex);
+   }
+   emit_op_modrm( p, 0x8b, 0x89, dst, src );
+}
+
+void x86_mov16( struct x86_function *p, /* 16-bit move: a 0x66 prefix followed by the same 8B/89 opcode pair x86_mov uses */
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_1ub(p, 0x66); /* operand-size override prefix */
    emit_op_modrm( p, 0x8b, 0x89, dst, src );
 }
 
+void x86_mov8( struct x86_function *p, /* 8-bit move between dst and src */
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_op_modrm( p, 0x8a, 0x88, dst, src ); /* 8A/88 are the byte-sized counterparts of the 8B/89 MOV opcodes */
+}
+
+void x64_mov64( struct x86_function *p, /* 64-bit move; asserts the target is not X86_32 and always emits a REX prefix */
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   uint8_t rex = 0x48; /* REX prefix with the W bit set; further bits are OR-ed in below */
+   DUMP_RR( dst, src );
+   assert(x86_target(p) != X86_32);
+
+   /* special hack for reading arguments until we support x86-64 registers everywhere */
+   if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8))
+   {
+      if(dst.idx >= 8)
+      {
+         rex |= 4; /* REX.R: high bit of the register index carried in the ModRM reg field */
+         dst.idx -= 8; /* keep only the low 3 bits for the ModRM byte */
+      }
+      if(src.idx >= 8)
+      {
+         rex |= 1; /* REX.B: high bit of the register index carried in the ModRM r/m field */
+         src.idx -= 8; /* keep only the low 3 bits for the ModRM byte */
+      }
+   }
+   emit_1ub(p, rex); /* unlike the extended-register path in x86_mov, the prefix is emitted unconditionally here */
+   emit_op_modrm( p, 0x8b, 0x89, dst, src );
+}
+
+void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) /* zero-extending load of an 8-bit src into dst */
+{
+   DUMP_RR( dst, src );
+   emit_2ub(p, 0x0f, 0xb6); /* two-byte opcode 0F B6 (MOVZX from an 8-bit operand) */
+   emit_modrm(p, dst, src); /* dst is passed as the reg operand, which emit_modrm asserts is mod_REG */
+}
+
+void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) /* zero-extending load of a 16-bit src into dst */
+{
+   DUMP_RR( dst, src );
+   emit_2ub(p, 0x0f, 0xb7); /* two-byte opcode 0F B7 (MOVZX from a 16-bit operand) */
+   emit_modrm(p, dst, src); /* dst is passed as the reg operand, which emit_modrm asserts is mod_REG */
+}
+
 void x86_xor( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
@@ -680,6 +820,61 @@ void x86_div( struct x86_function *p,
    emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src);
 }
 
+void x86_bswap( struct x86_function *p, struct x86_reg reg )
+{
+   DUMP_R(reg);
+   assert(reg.file == file_REG32);
+   assert(reg.mod == mod_REG);   /* NOTE(review): no REX prefix is emitted here, so reg.idx is presumably expected to be < 8 - confirm */
+   emit_2ub(p, 0x0f, 0xc8 + reg.idx);   /* 0F C8+rd = BSWAP r32; the register number is folded into the opcode byte */
+}
+
+void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm )
+{
+   /* D1 /5 is the SHR-by-one form; C1 /5 ib carries the count as an
+    * immediate byte emitted after the ModRM.
+    */
+   const int by_one = (imm == 1);
+
+   DUMP_RI(reg, imm);
+   emit_1ub(p, by_one ? 0xd1 : 0xc1);
+   emit_modrm_noreg(p, 5, reg);
+   if (!by_one)
+   {
+      emit_1ub(p, imm);
+   }
+}
+
+void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm )
+{
+   /* D1 /7 is the SAR-by-one form; C1 /7 ib carries the count as an
+    * immediate byte emitted after the ModRM.
+    */
+   const int by_one = (imm == 1);
+
+   DUMP_RI(reg, imm);
+   emit_1ub(p, by_one ? 0xd1 : 0xc1);
+   emit_modrm_noreg(p, 7, reg);
+   if (!by_one)
+   {
+      emit_1ub(p, imm);
+   }
+}
+
+void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm  )
+{
+   /* D1 /4 is the SHL-by-one form; C1 /4 ib carries the count as an
+    * immediate byte emitted after the ModRM.
+    */
+   const int by_one = (imm == 1);
+
+   DUMP_RI(reg, imm);
+   emit_1ub(p, by_one ? 0xd1 : 0xc1);
+   emit_modrm_noreg(p, 4, reg);
+   if (!by_one)
+   {
+      emit_1ub(p, imm);
+   }
+}
 
 
 /***********************************************************************
@@ -1013,6 +1208,77 @@ void sse_movmskps( struct x86_function *p,
  * SSE2 instructions
  */
 
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);
+   if (dst.mod != mod_REG || dst.file != file_REG32)
+   {
+      /* every other operand combination goes through the opcode-pair helper */
+      emit_op_modrm(p, 0x6e, 0x7e, dst, src);
+      return;
+   }
+   /* dst is a file_REG32 register: 66 0F 7E /r with src in the reg field */
+   emit_1ub(p, 0x7e);
+   emit_modrm(p, src, dst);
+}
+
+void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   if (dst.mod == mod_REG)   /* register destination: F3 0F 7E /r */
+   {
+      emit_3ub(p, 0xf3, 0x0f, 0x7e);
+      emit_modrm(p, dst, src);
+   }
+   else if (dst.mod == mod_INDIRECT || dst.mod == mod_DISP32 || dst.mod == mod_DISP8)
+   {
+      /* memory destination: 66 0F D6 /r, the source must be a register */
+      assert(src.mod == mod_REG);
+      emit_3ub(p, 0x66, 0x0f, 0xd6);
+      emit_modrm(p, src, dst);
+   }
+   else
+   {
+      assert(0);
+   }
+}
+
+void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0xf3, 0x0f);   /* F3 0F prefix of MOVDQU (unaligned 128-bit integer move) */
+   emit_op_modrm(p, 0x6f, 0x7f, dst, src);   /* 6F /r = MOVDQU xmm, xmm/m128; 7F /r = MOVDQU xmm/m128, xmm */
+}
+
+void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);   /* 66 0F prefix of MOVDQA (aligned 128-bit integer move) */
+   emit_op_modrm(p, 0x6f, 0x7f, dst, src);   /* 6F /r = MOVDQA xmm, xmm/m128; 7F /r = MOVDQA xmm/m128, xmm */
+}
+
+void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0xf2, 0x0f);   /* F2 0F prefix of MOVSD (scalar double move) */
+   emit_op_modrm(p, 0x10, 0x11, dst, src);   /* 10 /r = MOVSD xmm, xmm/m64; 11 /r = MOVSD xmm/m64, xmm */
+}
+
+void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);   /* 66 0F prefix of MOVUPD (unaligned packed double move) */
+   emit_op_modrm(p, 0x10, 0x11, dst, src);   /* 10 /r = MOVUPD xmm, xmm/m128; 11 /r = MOVUPD xmm/m128, xmm */
+}
+
+void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);   /* 66 0F prefix of MOVAPD (aligned packed double move) */
+   emit_op_modrm(p, 0x28, 0x29, dst, src);   /* 28 /r = MOVAPD xmm, xmm/m128; 29 /r = MOVAPD xmm/m128, xmm */
+}
+
 /**
  * Perform a reduced swizzle:
  */
@@ -1027,6 +1293,28 @@ void sse2_pshufd( struct x86_function *p,
    emit_1ub(p, shuf); 
 }
 
+void sse2_pshuflw( struct x86_function *p,
+                  struct x86_reg dst,
+                  struct x86_reg src,
+                  unsigned char shuf)
+{
+   DUMP_RRI( dst, src, shuf );
+   emit_3ub(p, 0xf2, X86_TWOB, 0x70);   /* F2 0F 70 /r ib = PSHUFLW, assuming X86_TWOB is the 0x0f escape byte - confirm */
+   emit_modrm(p, dst, src);
+   emit_1ub(p, shuf);   /* shuffle selector goes last, as the immediate byte */
+}
+
+void sse2_pshufhw( struct x86_function *p,
+                  struct x86_reg dst,
+                  struct x86_reg src,
+                  unsigned char shuf)
+{
+   DUMP_RRI( dst, src, shuf );
+   emit_3ub(p, 0xf3, X86_TWOB, 0x70);   /* F3 0F 70 /r ib = PSHUFHW, assuming X86_TWOB is the 0x0f escape byte - confirm */
+   emit_modrm(p, dst, src);
+   emit_1ub(p, shuf);   /* shuffle selector goes last, as the immediate byte */
+}
+
 void sse2_cvttps2dq( struct x86_function *p,
                      struct x86_reg dst,
                      struct x86_reg src )
@@ -1045,6 +1333,24 @@ void sse2_cvtps2dq( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse2_cvtsd2ss( struct x86_function *p,
+                    struct x86_reg dst,
+                    struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0xf2, 0x0f, 0x5a);   /* F2 0F 5A /r = CVTSD2SS (scalar double -> scalar single) */
+   emit_modrm( p, dst, src );
+}
+
+void sse2_cvtpd2ps( struct x86_function *p,
+                    struct x86_reg dst,
+                    struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x5a);   /* 66 0F 5A /r = CVTPD2PS (packed double -> packed single) */
+   emit_modrm( p, dst, src );
+}
+
 void sse2_packssdw( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
@@ -1081,6 +1387,97 @@ void sse2_punpcklbw( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x61);   /* 66 0F 61 /r = PUNPCKLWD (interleave low words) */
+   emit_modrm( p, dst, src );
+}
+
+void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x62);   /* 66 0F 62 /r = PUNPCKLDQ (interleave low doublewords) */
+   emit_modrm( p, dst, src );
+}
+
+void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x6c);   /* 66 0F 6C /r = PUNPCKLQDQ (interleave low quadwords) */
+   emit_modrm( p, dst, src );
+}
+
+void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x71);   /* 66 0F 71 /6 ib = PSLLW xmm, imm8 */
+   emit_modrm_noreg(p, 6, dst);     /* 6 is the opcode extension selecting the left shift */
+   emit_1ub(p, imm);
+}
+
+void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x72);   /* 66 0F 72 /6 ib = PSLLD xmm, imm8 */
+   emit_modrm_noreg(p, 6, dst);     /* 6 is the opcode extension selecting the left shift */
+   emit_1ub(p, imm);
+}
+
+void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x73);   /* 66 0F 73 /6 ib = PSLLQ xmm, imm8 */
+   emit_modrm_noreg(p, 6, dst);     /* 6 is the opcode extension selecting the left shift */
+   emit_1ub(p, imm);
+}
+
+void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x71);   /* 66 0F 71 /2 ib = PSRLW xmm, imm8 */
+   emit_modrm_noreg(p, 2, dst);     /* 2 is the opcode extension selecting the logical right shift */
+   emit_1ub(p, imm);
+}
+
+void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x72);   /* 66 0F 72 /2 ib = PSRLD xmm, imm8 */
+   emit_modrm_noreg(p, 2, dst);     /* 2 is the opcode extension selecting the logical right shift */
+   emit_1ub(p, imm);
+}
+
+void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x73);   /* 66 0F 73 /2 ib = PSRLQ xmm, imm8 */
+   emit_modrm_noreg(p, 2, dst);     /* 2 is the opcode extension selecting the logical right shift */
+   emit_1ub(p, imm);
+}
+
+void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x71);   /* 66 0F 71 /4 ib = PSRAW xmm, imm8 */
+   emit_modrm_noreg(p, 4, dst);     /* 4 is the opcode extension selecting the arithmetic right shift */
+   emit_1ub(p, imm);
+}
+
+void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x72);   /* 66 0F 72 /4 ib = PSRAD xmm, imm8 */
+   emit_modrm_noreg(p, 4, dst);     /* 4 is the opcode extension selecting the arithmetic right shift */
+   emit_1ub(p, imm);
+}
+
+void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_3ub(p, 0x66, 0x0f, 0xeb);   /* 66 0F EB /r = POR xmm, xmm/m128 (bitwise OR) */
+   emit_modrm(p, dst, src);
+}
 
 void sse2_rcpps( struct x86_function *p,
                  struct x86_reg dst,
@@ -1100,18 +1497,6 @@ void sse2_rcpss( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
-void sse2_movd( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   DUMP_RR( dst, src );
-   emit_2ub(p, 0x66, X86_TWOB);
-   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
-}
-
-
-
-
 /***********************************************************************
  * x87 instructions
  */
@@ -1702,23 +2087,80 @@ void x86_cdecl_caller_pop_regs( struct x86_function *p )
 }
 
 
-/* Retreive a reference to one of the function arguments, taking into
- * account any push/pop activity:
- */
 struct x86_reg x86_fn_arg( struct x86_function *p,
-			   unsigned arg )
+                           unsigned arg )
 {
-   return x86_make_disp(x86_make_reg(file_REG32, reg_SP), 
+   switch(x86_target(p))   /* arg is 1-based: arg == 1 selects the first argument */
+   {
+   case X86_64_WIN64_ABI:   /* first four arguments come back as CX, DX, R8, R9 (all built as file_REG32) */
+      /* Microsoft uses a different calling convention than the rest of the world */
+      switch(arg)
+      {
+      case 1:
+         return x86_make_reg(file_REG32, reg_CX);
+      case 2:
+         return x86_make_reg(file_REG32, reg_DX);
+      case 3:
+         return x86_make_reg(file_REG32, reg_R8);
+      case 4:
+         return x86_make_reg(file_REG32, reg_R9);
+      default:
+	 /* Win64 allocates stack slots as if it pushed the first 4 arguments too */
+         return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+               p->stack_offset + arg * 8);
+      }
+   case X86_64_STD_ABI:   /* first six arguments come back as DI, SI, DX, CX, R8, R9 (all built as file_REG32) */
+      switch(arg)
+      {
+      case 1:
+         return x86_make_reg(file_REG32, reg_DI);
+      case 2:
+         return x86_make_reg(file_REG32, reg_SI);
+      case 3:
+         return x86_make_reg(file_REG32, reg_DX);
+      case 4:
+         return x86_make_reg(file_REG32, reg_CX);
+      case 5:
+         return x86_make_reg(file_REG32, reg_R8);
+      case 6:
+         return x86_make_reg(file_REG32, reg_R9);
+      default:
+         return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+               p->stack_offset + (arg - 6) * 8);     /* ??? */
+      }
+   case X86_32:   /* every argument is addressed off SP, 4 bytes per slot */
+      return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
 			p->stack_offset + arg * 4);	/* ??? */
+   default:
+      abort();   /* x86_target() returned a value outside the enum */
+   }
 }
 
+static void x86_init_func_common( struct x86_function *p )
+{
+   unsigned detected = 0;
+
+   /* util_cpu_caps is consulted only after detection has been run;
+    * each host feature flag maps onto one X86_* capability bit.
+    */
+   util_cpu_detect();
+   detected |= util_cpu_caps.has_mmx    ? X86_MMX    : 0;
+   detected |= util_cpu_caps.has_mmx2   ? X86_MMX2   : 0;
+   detected |= util_cpu_caps.has_sse    ? X86_SSE    : 0;
+   detected |= util_cpu_caps.has_sse2   ? X86_SSE2   : 0;
+   detected |= util_cpu_caps.has_sse3   ? X86_SSE3   : 0;
+   detected |= util_cpu_caps.has_sse4_1 ? X86_SSE4_1 : 0;
+   p->caps = detected;
+
+   p->csr = p->store;   /* csr restarts at the beginning of store */
+   DUMP_START();
+}
 
 void x86_init_func( struct x86_function *p )
 {
    p->size = 0;
    p->store = NULL;
-   p->csr = p->store;
-   DUMP_START();
+   x86_init_func_common(p);   /* fills p->caps from the detected CPU features and sets csr = store (NULL here) */
 }
 
 void x86_init_func_size( struct x86_function *p, unsigned code_size )
@@ -1728,8 +2170,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size )
    if (p->store == NULL) {
       p->store = p->error_overflow;
    }
-   p->csr = p->store;
-   DUMP_START();
+   x86_init_func_common(p);
 }
 
 void x86_release_func( struct x86_function *p )
@@ -1743,20 +2184,35 @@ void x86_release_func( struct x86_function *p )
 }
 
 
-void (*x86_get_func( struct x86_function *p ))(void)
+static INLINE x86_func
+voidptr_to_x86_func(void *v)
+{
+   union {   /* pun: object pointer goes in, function pointer comes out */
+      void *obj;
+      x86_func fn;
+   } pun;
+   assert(sizeof(pun.obj) == sizeof(pun.fn));
+   pun.obj = v;
+   return pun.fn;
+}
+
+
+x86_func x86_get_func( struct x86_function *p )   /* hands back p->store as a callable pointer, or NULL */
 {
    DUMP_END();
    if (DISASSEM && p->store)
       debug_printf("disassemble %p %p\n", p->store, p->csr);
 
    if (p->store == p->error_overflow)   /* store was redirected to the error_overflow fallback (see x86_init_func_size) */
-      return (void (*)(void)) NULL;
+      return voidptr_to_x86_func(NULL);
    else
-      return (void (*)(void)) p->store;
+      return voidptr_to_x86_func(p->store);
 }
 
 #else
 
+void x86sse_dummy( void );   /* prototype ahead of the definition; presumably there to avoid a missing-prototype warning */
+
 void x86sse_dummy( void )   /* empty placeholder, compiled only in the #else branch where the x86 code is left out */
 {
 }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 319b836ffb1..2b9678b1765 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -24,22 +24,31 @@
 #ifndef _RTASM_X86SSE_H_
 #define _RTASM_X86SSE_H_
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_config.h"
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 
 /* It is up to the caller to ensure that instructions issued are
  * suitable for the host cpu.  There are no checks made in this module
  * for mmx/sse/sse2 support on the cpu.
  */
 struct x86_reg {
-   unsigned file:3;
-   unsigned idx:3;
+   unsigned file:2;
+   unsigned idx:4;		/* 4 bits, so register numbers up to reg_R15 fit */
    unsigned mod:2;		/* mod_REG if this is just a register */
    int      disp:24;		/* only +/- 23bits of offset - should be enough... */
 };
 
+#define X86_MMX 1   /* capability bits, OR'ed together in x86_function.caps */
+#define X86_MMX2 2
+#define X86_SSE 4
+#define X86_SSE2 8
+#define X86_SSE3 0x10
+#define X86_SSE4_1 0x20
+
 struct x86_function {
+   unsigned caps;
    unsigned size;
    unsigned char *store;
    unsigned char *csr;
@@ -75,7 +84,15 @@ enum x86_reg_name {
    reg_SP,
    reg_BP,
    reg_SI,
-   reg_DI
+   reg_DI,
+   reg_R8,
+   reg_R9,
+   reg_R10,
+   reg_R11,
+   reg_R12,
+   reg_R13,
+   reg_R14,
+   reg_R15
 };
 
 
@@ -102,14 +119,42 @@ enum sse_cc {
 #define cc_Z  cc_E
 #define cc_NZ cc_NE
 
+
+/** generic pointer to function */
+typedef void (*x86_func)(void);
+
+
 /* Begin/end/retrieve function creation:
  */
 
+enum x86_target   /* code-generation target / calling convention, as returned by x86_target() */
+{
+   X86_32,             /* 32-bit x86 */
+   X86_64_STD_ABI,     /* x86-64, every non-Win64 build */
+   X86_64_WIN64_ABI    /* x86-64 when _WIN64 is defined */
+};
+
+/* Target is fixed at compile time from the host macros (p is not read); make this read a member of x86_function if target != host is desired */
+static INLINE enum x86_target x86_target( struct x86_function* p )
+{
+#ifdef PIPE_ARCH_X86
+   return X86_32;
+#elif defined(_WIN64)   /* checked before the generic x86-64 case so Win64 builds get their own ABI */
+   return X86_64_WIN64_ABI;
+#elif defined(PIPE_ARCH_X86_64)
+   return X86_64_STD_ABI;
+#endif
+}
+
+static INLINE unsigned x86_target_caps( struct x86_function* p )
+{
+   return p->caps;   /* mask of X86_MMX .. X86_SSE4_1 bits */
+}
 
 void x86_init_func( struct x86_function *p );
 void x86_init_func_size( struct x86_function *p, unsigned code_size );
 void x86_release_func( struct x86_function *p );
-void (*x86_get_func( struct x86_function *p ))( void );
+x86_func x86_get_func( struct x86_function *p );
 
 /* Debugging:
  */
@@ -133,6 +178,8 @@ struct x86_reg x86_get_base_reg( struct x86_reg reg );
  */
 int x86_get_label( struct x86_function *p );
 
+void x64_rexw(struct x86_function *p);
+
 void x86_jcc( struct x86_function *p,
 	      enum x86_cc cc,
 	      int label );
@@ -173,18 +220,54 @@ void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
 void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvtsd2ss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvtpd2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
 void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
                   unsigned char shuf );
+void sse2_pshuflw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf );
+void sse2_pshufhw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf );
 void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+
+void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+
+void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+
+void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse2_pshuflw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm );
+void sse2_pshufhw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm );
+void sse2_pshufd( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm );
 
 void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
 void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
@@ -222,7 +305,6 @@ void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg
 void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
-void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_movmskps( struct x86_function *p, struct x86_reg dst, struct x86_reg src);
 
 void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -232,6 +314,14 @@ void x86_dec( struct x86_function *p, struct x86_reg reg );
 void x86_inc( struct x86_function *p, struct x86_reg reg );
 void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x64_mov64( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov8( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov16( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov_imm(struct x86_function *p, struct x86_reg dst, int imm );
+void x86_mov8_imm(struct x86_function *p, struct x86_reg dst, uint8_t imm );
+void x86_mov16_imm(struct x86_function *p, struct x86_reg dst, uint16_t imm );
 void x86_mul( struct x86_function *p, struct x86_reg src );
 void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -245,7 +335,10 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_sahf( struct x86_function *p );
 void x86_div( struct x86_function *p, struct x86_reg src );
-
+void x86_bswap( struct x86_function *p, struct x86_reg src );
+void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm );
+void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm );
+void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm  );
 
 void x86_cdecl_caller_push_regs( struct x86_function *p );
 void x86_cdecl_caller_pop_regs( struct x86_function *p );
diff --git a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
new file mode 100644
index 00000000000..0433da6141d
--- /dev/null
+++ b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
@@ -0,0 +1,44 @@
+
+#ifndef INLINE_DEBUG_HELPER_H
+#define INLINE_DEBUG_HELPER_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+
+
+/* Helper function to wrap a screen with
+ * one or more debug drivers: rbug, trace, galahad.
+ */
+
+#ifdef GALLIUM_TRACE
+#include "trace/tr_public.h"
+#endif
+
+#ifdef GALLIUM_RBUG
+#include "rbug/rbug_public.h"
+#endif
+
+#ifdef GALLIUM_GALAHAD
+#include "galahad/glhd_public.h"
+#endif
+
+static INLINE struct pipe_screen *
+debug_screen_wrap(struct pipe_screen *screen)
+{
+   struct pipe_screen *wrapped = screen;   /* innermost layer: the caller's screen */
+#ifdef GALLIUM_RBUG
+   wrapped = rbug_screen_create(wrapped);
+#endif
+   /* trace is layered over whatever rbug returned */
+#ifdef GALLIUM_TRACE
+   wrapped = trace_screen_create(wrapped);
+#endif
+   /* galahad is applied last */
+#ifdef GALLIUM_GALAHAD
+   wrapped = galahad_screen_create(wrapped);
+#endif
+
+   return wrapped;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
new file mode 100644
index 00000000000..036c1ee48a8
--- /dev/null
+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
@@ -0,0 +1,63 @@
+
+#ifndef INLINE_SW_HELPER_H
+#define INLINE_SW_HELPER_H
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h"
+#include "state_tracker/sw_winsys.h"
+
+
+/* Helper function to choose and instantiate one of the software rasterizers:
+ * cell, llvmpipe, softpipe.
+ */
+
+#ifdef GALLIUM_SOFTPIPE
+#include "softpipe/sp_public.h"
+#endif
+
+#ifdef GALLIUM_LLVMPIPE
+#include "llvmpipe/lp_public.h"
+#endif
+
+#ifdef GALLIUM_CELL
+#include "cell/ppu/cell_public.h"
+#endif
+
+static INLINE struct pipe_screen *
+sw_screen_create(struct sw_winsys *winsys)
+{
+   struct pipe_screen *screen = NULL;
+   const char *fallback_name;
+   const char *driver;
+   /* compile-time default: the first rasterizer built in, in priority order */
+#if defined(GALLIUM_CELL)
+   fallback_name = "cell";
+#elif defined(GALLIUM_LLVMPIPE)
+   fallback_name = "llvmpipe";
+#elif defined(GALLIUM_SOFTPIPE)
+   fallback_name = "softpipe";
+#else
+   fallback_name = "";
+#endif
+
+   driver = debug_get_option("GALLIUM_DRIVER", fallback_name);
+
+#if defined(GALLIUM_CELL)
+   if (!screen && !strcmp(driver, "cell"))
+      screen = cell_create_screen(winsys);
+#endif
+
+#if defined(GALLIUM_LLVMPIPE)
+   if (!screen && !strcmp(driver, "llvmpipe"))
+      screen = llvmpipe_create_screen(winsys);
+#endif
+   /* softpipe is tried whenever nothing above produced a screen, whatever name was asked for */
+#if defined(GALLIUM_SOFTPIPE)
+   if (!screen)
+      screen = softpipe_create_screen(winsys);
+#endif
+
+   return screen;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
new file mode 100644
index 00000000000..0b4e7404034
--- /dev/null
+++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h
@@ -0,0 +1,34 @@
+
+#ifndef INLINE_WRAPPER_SW_HELPER_H
+#define INLINE_WRAPPER_SW_HELPER_H
+
+#include "target-helpers/inline_sw_helper.h"
+#include "sw/wrapper/wrapper_sw_winsys.h"
+
+/**
+ * Try to wrap a hw screen with a software screen; returns the sw screen,
+ * or the given screen unchanged if the winsys or sw screen can't be made.
+ */
+static INLINE struct pipe_screen *
+sw_screen_wrap(struct pipe_screen *screen)
+{
+   struct sw_winsys *sws;
+   struct pipe_screen *sw_screen;
+
+   sws = wrapper_sw_winsys_warp_pipe_screen(screen);
+   if (!sws)
+      goto err;
+
+   sw_screen = sw_screen_create(sws);
+   if (!sw_screen)   /* sw_screen_create() signals failure with NULL; it never hands back 'screen' */
+      goto err_winsys;
+
+   return sw_screen;
+
+err_winsys:
+   sws->destroy(sws);   /* release the wrapper winsys created above */
+err:
+   return screen;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/target-helpers/wrap_screen.c b/src/gallium/auxiliary/target-helpers/wrap_screen.c
index eb475123198..df5d56a53c9 100644
--- a/src/gallium/auxiliary/target-helpers/wrap_screen.c
+++ b/src/gallium/auxiliary/target-helpers/wrap_screen.c
@@ -33,6 +33,7 @@
 
 #include "target-helpers/wrap_screen.h"
 #include "trace/tr_public.h"
+#include "rbug/rbug_public.h"
 #include "identity/id_public.h"
 #include "util/u_debug.h"
 
@@ -56,6 +57,9 @@ gallium_wrap_screen( struct pipe_screen *screen )
    /* Trace does its own checking if it should run */
    screen = trace_screen_create(screen);
 
+   /* Rbug does its own checking if it should run */
+   screen = rbug_screen_create(screen);
+
    return screen;
 }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi-instruction-set.txt b/src/gallium/auxiliary/tgsi/tgsi-instruction-set.txt
deleted file mode 100644
index 5d9eed92580..00000000000
--- a/src/gallium/auxiliary/tgsi/tgsi-instruction-set.txt
+++ /dev/null
@@ -1,1127 +0,0 @@
-TGSI Instruction Specification
-==============================
-==============================
-
-
-1  Instruction Set Operations
-=============================
-
-
-1.1  GL_NV_vertex_program
--------------------------
-
-
-1.1.1  ARL - Address Register Load
-
-  dst.x = floor(src.x)
-  dst.y = floor(src.y)
-  dst.z = floor(src.z)
-  dst.w = floor(src.w)
-
-
-1.1.2  MOV - Move
-
-  dst.x = src.x
-  dst.y = src.y
-  dst.z = src.z
-  dst.w = src.w
-
-
-1.1.3  LIT - Light Coefficients
-
-  dst.x = 1.0
-  dst.y = max(src.x, 0.0)
-  dst.z = (src.x > 0.0) ? pow(max(src.y, 0.0), clamp(src.w, -128.0, 128.0)) : 0.0
-  dst.w = 1.0
-
-
-1.1.4  RCP - Reciprocal
-
-  dst.x = 1.0 / src.x
-  dst.y = 1.0 / src.x
-  dst.z = 1.0 / src.x
-  dst.w = 1.0 / src.x
-
-
-1.1.5  RSQ - Reciprocal Square Root
-
-  dst.x = 1.0 / sqrt(abs(src.x))
-  dst.y = 1.0 / sqrt(abs(src.x))
-  dst.z = 1.0 / sqrt(abs(src.x))
-  dst.w = 1.0 / sqrt(abs(src.x))
-
-
-1.1.6  EXP - Approximate Exponential Base 2
-
-  dst.x = pow(2.0, floor(src.x))
-  dst.y = src.x - floor(src.x)
-  dst.z = pow(2.0, src.x)
-  dst.w = 1.0
-
-
-1.1.7  LOG - Approximate Logarithm Base 2
-
-  dst.x = floor(lg2(abs(src.x)))
-  dst.y = abs(src.x) / pow(2.0, floor(lg2(abs(src.x))))
-  dst.z = lg2(abs(src.x))
-  dst.w = 1.0
-
-
-1.1.8  MUL - Multiply
-
-  dst.x = src0.x * src1.x
-  dst.y = src0.y * src1.y
-  dst.z = src0.z * src1.z
-  dst.w = src0.w * src1.w
-
-
-1.1.9  ADD - Add
-
-  dst.x = src0.x + src1.x
-  dst.y = src0.y + src1.y
-  dst.z = src0.z + src1.z
-  dst.w = src0.w + src1.w
-
-
-1.1.10  DP3 - 3-component Dot Product
-
-  dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
-  dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
-  dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
-  dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z
-
-
-1.1.11  DP4 - 4-component Dot Product
-
-  dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
-  dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
-  dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
-  dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w
-
-
-1.1.12  DST - Distance Vector
-
-  dst.x = 1.0
-  dst.y = src0.y * src1.y
-  dst.z = src0.z
-  dst.w = src1.w
-
-
-1.1.13  MIN - Minimum
-
-  dst.x = min(src0.x, src1.x)
-  dst.y = min(src0.y, src1.y)
-  dst.z = min(src0.z, src1.z)
-  dst.w = min(src0.w, src1.w)
-
-
-1.1.14  MAX - Maximum
-
-  dst.x = max(src0.x, src1.x)
-  dst.y = max(src0.y, src1.y)
-  dst.z = max(src0.z, src1.z)
-  dst.w = max(src0.w, src1.w)
-
-
-1.1.15  SLT - Set On Less Than
-
-  dst.x = (src0.x < src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y < src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z < src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w < src1.w) ? 1.0 : 0.0
-
-
-1.1.16  SGE - Set On Greater Equal Than
-
-  dst.x = (src0.x >= src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y >= src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z >= src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w >= src1.w) ? 1.0 : 0.0
-
-
-1.1.17  MAD - Multiply And Add
-
-  dst.x = src0.x * src1.x + src2.x
-  dst.y = src0.y * src1.y + src2.y
-  dst.z = src0.z * src1.z + src2.z
-  dst.w = src0.w * src1.w + src2.w
-
-
-1.2  GL_ATI_fragment_shader
----------------------------
-
-
-1.2.1  SUB - Subtract
-
-  dst.x = src0.x - src1.x
-  dst.y = src0.y - src1.y
-  dst.z = src0.z - src1.z
-  dst.w = src0.w - src1.w
-
-
-1.2.2  DOT3 - 3-component Dot Product
-
-  Alias for DP3.
-
-
-1.2.3  DOT4 - 4-component Dot Product
-
-  Alias for DP4.
-
-
-1.2.4  LERP - Linear Interpolate
-
-  dst.x = src0.x * (src1.x - src2.x) + src2.x
-  dst.y = src0.y * (src1.y - src2.y) + src2.y
-  dst.z = src0.z * (src1.z - src2.z) + src2.z
-  dst.w = src0.w * (src1.w - src2.w) + src2.w
-
-
-1.2.5  CND - Condition
-
-  dst.x = (src2.x > 0.5) ? src0.x : src1.x
-  dst.y = (src2.y > 0.5) ? src0.y : src1.y
-  dst.z = (src2.z > 0.5) ? src0.z : src1.z
-  dst.w = (src2.w > 0.5) ? src0.w : src1.w
-
-
-1.2.6  CND0 - Condition Zero
-
-       Removed.  Use (CMP src2, src1, src0) instead.
-
-1.2.7  DOT2ADD - 2-component Dot Product And Add
-
-  dst.x = src0.x * src1.x + src0.y * src1.y + src2.x
-  dst.y = src0.x * src1.x + src0.y * src1.y + src2.x
-  dst.z = src0.x * src1.x + src0.y * src1.y + src2.x
-  dst.w = src0.x * src1.x + src0.y * src1.y + src2.x
-
-
-1.3  GL_EXT_vertex_shader
--------------------------
-
-
-1.3.1  INDEX - Array Lookup
-
-  Considered for removal from language.
-
-
-1.3.2  NEGATE - Negate
-
-  Considered for removal from language.
-
-
-1.3.3  MADD - Multiply And Add
-
-  Alias for MAD.
-
-
-1.3.4  FRAC - Fraction
-
-  dst.x = src.x - floor(src.x)
-  dst.y = src.y - floor(src.y)
-  dst.z = src.z - floor(src.z)
-  dst.w = src.w - floor(src.w)
-
-
-1.3.5  SETGE - Set On Greater Equal
-
-  Alias for SGE.
-
-
-1.3.6  SETLT - Set On Less Than
-
-  Alias for SLT.
-
-
-1.3.7  CLAMP - Clamp
-
-  dst.x = clamp(src0.x, src1.x, src2.x)
-  dst.y = clamp(src0.y, src1.y, src2.y)
-  dst.z = clamp(src0.z, src1.z, src2.z)
-  dst.w = clamp(src0.w, src1.w, src2.w)
-
-
-1.3.8  FLOOR - Floor
-
-  dst.x = floor(src.x)
-  dst.y = floor(src.y)
-  dst.z = floor(src.z)
-  dst.w = floor(src.w)
-
-
-1.3.9  ROUND - Round
-
-  dst.x = round(src.x)
-  dst.y = round(src.y)
-  dst.z = round(src.z)
-  dst.w = round(src.w)
-
-
-1.3.10  EXPBASE2 - Exponential Base 2
-
-  dst.x = pow(2.0, src.x)
-  dst.y = pow(2.0, src.x)
-  dst.z = pow(2.0, src.x)
-  dst.w = pow(2.0, src.x)
-
-
-1.3.11  LOGBASE2 - Logarithm Base 2
-
-  dst.x = lg2(src.x)
-  dst.y = lg2(src.x)
-  dst.z = lg2(src.x)
-  dst.w = lg2(src.x)
-
-
-1.3.12  POWER - Power
-
-  dst.x = pow(src0.x, src1.x)
-  dst.y = pow(src0.x, src1.x)
-  dst.z = pow(src0.x, src1.x)
-  dst.w = pow(src0.x, src1.x)
-
-
-1.3.13  RECIP - Reciprocal
-
-  Alias for RCP.
-
-
-1.3.14  RECIPSQRT - Reciprocal Square Root
-
-  Alias for RSQ.
-
-
-1.3.15  CROSSPRODUCT - Cross Product
-
-  dst.x = src0.y * src1.z - src1.y * src0.z
-  dst.y = src0.z * src1.x - src1.z * src0.x
-  dst.z = src0.x * src1.y - src1.x * src0.y
-  dst.w = 1.0
-
-
-1.3.16  MULTIPLYMATRIX - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.4  GL_NV_vertex_program1_1
-----------------------------
-
-
-1.4.1  ABS - Absolute
-
-  dst.x = abs(src.x)
-  dst.y = abs(src.y)
-  dst.z = abs(src.z)
-  dst.w = abs(src.w)
-
-
-1.4.2  RCC - Reciprocal Clamped
-
-  dst.x = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
-  dst.y = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
-  dst.z = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
-  dst.w = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020)
-
-
-1.4.3  DPH - Homogeneous Dot Product
-
-  dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
-  dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
-  dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
-  dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w
-
-
-1.5  GL_NV_fragment_program
----------------------------
-
-
-1.5.1  COS - Cosine
-
-  dst.x = cos(src.x)
-  dst.y = cos(src.x)
-  dst.z = cos(src.x)
-  dst.w = cos(src.w)
-
-
-1.5.2  DDX - Derivative Relative To X
-
-  dst.x = partialx(src.x)
-  dst.y = partialx(src.y)
-  dst.z = partialx(src.z)
-  dst.w = partialx(src.w)
-
-
-1.5.3  DDY - Derivative Relative To Y
-
-  dst.x = partialy(src.x)
-  dst.y = partialy(src.y)
-  dst.z = partialy(src.z)
-  dst.w = partialy(src.w)
-
-
-1.5.4  EX2 - Exponential Base 2
-
-  Alias for EXPBASE2.
-
-
-1.5.5  FLR - Floor
-
-  Alias for FLOOR.
-
-
-1.5.6  FRC - Fraction
-
-  Alias for FRAC.
-
-
-1.5.7  KILP - Predicated Discard
-
-  discard
-
-
-1.5.8  LG2 - Logarithm Base 2
-
-  Alias for LOGBASE2.
-
-
-1.5.9  LRP - Linear Interpolate
-
-  Alias for LERP.
-
-
-1.5.10  PK2H - Pack Two 16-bit Floats
-
-  TBD
-
-
-1.5.11  PK2US - Pack Two Unsigned 16-bit Scalars
-
-  TBD
-
-
-1.5.12  PK4B - Pack Four Signed 8-bit Scalars
-
-  TBD
-
-
-1.5.13  PK4UB - Pack Four Unsigned 8-bit Scalars
-
-  TBD
-
-
-1.5.14  POW - Power
-
-  Alias for POWER.
-
-
-1.5.15  RFL - Reflection Vector
-
-  dst.x = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.x - src1.x
-  dst.y = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.y - src1.y
-  dst.z = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.z - src1.z
-  dst.w = 1.0
-
-
-1.5.16  SEQ - Set On Equal
-
-  dst.x = (src0.x == src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y == src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z == src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w == src1.w) ? 1.0 : 0.0
-
-
-1.5.17  SFL - Set On False
-
-  dst.x = 0.0
-  dst.y = 0.0
-  dst.z = 0.0
-  dst.w = 0.0
-
-
-1.5.18  SGT - Set On Greater Than
-
-  dst.x = (src0.x > src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y > src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z > src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w > src1.w) ? 1.0 : 0.0
-
-
-1.5.19  SIN - Sine
-
-  dst.x = sin(src.x)
-  dst.y = sin(src.x)
-  dst.z = sin(src.x)
-  dst.w = sin(src.w)
-
-
-1.5.20  SLE - Set On Less Equal Than
-
-  dst.x = (src0.x <= src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y <= src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z <= src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w <= src1.w) ? 1.0 : 0.0
-
-
-1.5.21  SNE - Set On Not Equal
-
-  dst.x = (src0.x != src1.x) ? 1.0 : 0.0
-  dst.y = (src0.y != src1.y) ? 1.0 : 0.0
-  dst.z = (src0.z != src1.z) ? 1.0 : 0.0
-  dst.w = (src0.w != src1.w) ? 1.0 : 0.0
-
-
-1.5.22  STR - Set On True
-
-  dst.x = 1.0
-  dst.y = 1.0
-  dst.z = 1.0
-  dst.w = 1.0
-
-
-1.5.23  TEX - Texture Lookup
-
-  TBD
-
-
-1.5.24  TXD - Texture Lookup with Derivatives
-
-  TBD
-
-
-1.5.25  TXP - Projective Texture Lookup
-
-  TBD
-
-
-1.5.26  UP2H - Unpack Two 16-Bit Floats
-
-  TBD
-
-
-1.5.27  UP2US - Unpack Two Unsigned 16-Bit Scalars
-
-  TBD
-
-
-1.5.28  UP4B - Unpack Four Signed 8-Bit Values
-
-  TBD
-
-
-1.5.29  UP4UB - Unpack Four Unsigned 8-Bit Scalars
-
-  TBD
-
-
-1.5.30  X2D - 2D Coordinate Transformation
-
-  dst.x = src0.x + src1.x * src2.x + src1.y * src2.y
-  dst.y = src0.y + src1.x * src2.z + src1.y * src2.w
-  dst.z = src0.x + src1.x * src2.x + src1.y * src2.y
-  dst.w = src0.y + src1.x * src2.z + src1.y * src2.w
-
-
-1.6  GL_NV_vertex_program2
---------------------------
-
-
-1.6.1  ARA - Address Register Add
-
-  TBD
-
-
-1.6.2  ARR - Address Register Load With Round
-
-  dst.x = round(src.x)
-  dst.y = round(src.y)
-  dst.z = round(src.z)
-  dst.w = round(src.w)
-
-
-1.6.3  BRA - Branch
-
-  pc = target
-
-
-1.6.4  CAL - Subroutine Call
-
-  push(pc)
-  pc = target
-
-
-1.6.5  RET - Subroutine Call Return
-
-  pc = pop()
-
-
-1.6.6  SSG - Set Sign
-
-  dst.x = (src.x > 0.0) ? 1.0 : (src.x < 0.0) ? -1.0 : 0.0
-  dst.y = (src.y > 0.0) ? 1.0 : (src.y < 0.0) ? -1.0 : 0.0
-  dst.z = (src.z > 0.0) ? 1.0 : (src.z < 0.0) ? -1.0 : 0.0
-  dst.w = (src.w > 0.0) ? 1.0 : (src.w < 0.0) ? -1.0 : 0.0
-
-
-1.7  GL_ARB_vertex_program
---------------------------
-
-
-1.7.1  SWZ - Extended Swizzle
-
-  dst.x = src.x
-  dst.y = src.y
-  dst.z = src.z
-  dst.w = src.w
-
-
-1.7.2  XPD - Cross Product
-
-  Alias for CROSSPRODUCT.
-
-
-1.8  GL_ARB_fragment_program
-----------------------------
-
-
-1.8.1  CMP - Compare
-
-  dst.x = (src0.x < 0.0) ? src1.x : src2.x
-  dst.y = (src0.y < 0.0) ? src1.y : src2.y
-  dst.z = (src0.z < 0.0) ? src1.z : src2.z
-  dst.w = (src0.w < 0.0) ? src1.w : src2.w
-
-
-1.8.2  KIL - Conditional Discard
-
-  if (src.x < 0.0 || src.y < 0.0 || src.z < 0.0 || src.w < 0.0)
-    discard
-  endif
-
-
-1.8.3  SCS - Sine Cosine
-
-  dst.x = cos(src.x)
-  dst.y = sin(src.x)
-  dst.z = 0.0
-  dst.y = 1.0
-
-
-1.8.4  TXB - Texture Lookup With Bias
-
-  TBD
-
-
-1.9  GL_NV_fragment_program2
-----------------------------
-
-
-1.9.1  NRM - 3-component Vector Normalise
-
-  dst.x = src.x / (src.x * src.x + src.y * src.y + src.z * src.z)
-  dst.y = src.y / (src.x * src.x + src.y * src.y + src.z * src.z)
-  dst.z = src.z / (src.x * src.x + src.y * src.y + src.z * src.z)
-  dst.w = 1.0
-
-
-1.9.2  DIV - Divide
-
-  dst.x = src0.x / src1.x
-  dst.y = src0.y / src1.y
-  dst.z = src0.z / src1.z
-  dst.w = src0.w / src1.w
-
-
-1.9.3  DP2 - 2-component Dot Product
-
-  dst.x = src0.x * src1.x + src0.y * src1.y
-  dst.y = src0.x * src1.x + src0.y * src1.y
-  dst.z = src0.x * src1.x + src0.y * src1.y
-  dst.w = src0.x * src1.x + src0.y * src1.y
-
-
-1.9.4  DP2A - 2-component Dot Product And Add
-
-  Alias for DOT2ADD.
-
-
-1.9.5  TXL - Texture Lookup With LOD
-
-  TBD
-
-
-1.9.6  BRK - Break
-
-  TBD
-
-
-1.9.7  IF - If
-
-  TBD
-
-
-1.9.10  ELSE - Else
-
-  TBD
-
-
-1.9.11  ENDIF - End If
-
-  TBD
-
-
-1.10  GL_NV_vertex_program3
----------------------------
-
-
-1.10.1  PUSHA - Push Address Register On Stack
-
-  push(src.x)
-  push(src.y)
-  push(src.z)
-  push(src.w)
-
-
-1.10.2  POPA - Pop Address Register From Stack
-
-  dst.w = pop()
-  dst.z = pop()
-  dst.y = pop()
-  dst.x = pop()
-
-
-1.11  GL_NV_gpu_program4
-------------------------
-
-
-1.11.1  CEIL - Ceiling
-
-  dst.x = ceil(src.x)
-  dst.y = ceil(src.y)
-  dst.z = ceil(src.z)
-  dst.w = ceil(src.w)
-
-
-1.11.2  I2F - Integer To Float
-
-  dst.x = (float) src.x
-  dst.y = (float) src.y
-  dst.z = (float) src.z
-  dst.w = (float) src.w
-
-
-1.11.3  NOT - Bitwise Not
-
-  dst.x = ~src.x
-  dst.y = ~src.y
-  dst.z = ~src.z
-  dst.w = ~src.w
-
-
-1.11.4  TRUNC - Truncate
-
-  dst.x = trunc(src.x)
-  dst.y = trunc(src.y)
-  dst.z = trunc(src.z)
-  dst.w = trunc(src.w)
-
-
-1.11.5  SHL - Shift Left
-
-  dst.x = src0.x << src1.x
-  dst.y = src0.y << src1.x
-  dst.z = src0.z << src1.x
-  dst.w = src0.w << src1.x
-
-
-1.11.6  SHR - Shift Right
-
-  dst.x = src0.x >> src1.x
-  dst.y = src0.y >> src1.x
-  dst.z = src0.z >> src1.x
-  dst.w = src0.w >> src1.x
-
-
-1.11.7  AND - Bitwise And
-
-  dst.x = src0.x & src1.x
-  dst.y = src0.y & src1.y
-  dst.z = src0.z & src1.z
-  dst.w = src0.w & src1.w
-
-
-1.11.8  OR - Bitwise Or
-
-  dst.x = src0.x | src1.x
-  dst.y = src0.y | src1.y
-  dst.z = src0.z | src1.z
-  dst.w = src0.w | src1.w
-
-
-1.11.9  MOD - Modulus
-
-  dst.x = src0.x % src1.x
-  dst.y = src0.y % src1.y
-  dst.z = src0.z % src1.z
-  dst.w = src0.w % src1.w
-
-
-1.11.10  XOR - Bitwise Xor
-
-  dst.x = src0.x ^ src1.x
-  dst.y = src0.y ^ src1.y
-  dst.z = src0.z ^ src1.z
-  dst.w = src0.w ^ src1.w
-
-
-1.11.11  SAD - Sum Of Absolute Differences
-
-  dst.x = abs(src0.x - src1.x) + src2.x
-  dst.y = abs(src0.y - src1.y) + src2.y
-  dst.z = abs(src0.z - src1.z) + src2.z
-  dst.w = abs(src0.w - src1.w) + src2.w
-
-
-1.11.12  TXF - Texel Fetch
-
-  TBD
-
-
-1.11.13  TXQ - Texture Size Query
-
-  TBD
-
-
-1.11.14  CONT - Continue
-
-  TBD
-
-
-1.12  GL_NV_geometry_program4
------------------------------
-
-
-1.12.1  EMIT - Emit
-
-  TBD
-
-
-1.12.2  ENDPRIM - End Primitive
-
-  TBD
-
-
-1.13  GLSL
-----------
-
-
-1.13.1  BGNLOOP - Begin a Loop
-
-  TBD
-
-
-1.13.2  BGNSUB - Begin Subroutine
-
-  TBD
-
-
-1.13.3  ENDLOOP - End a Loop
-
-  TBD
-
-
-1.13.4  ENDSUB - End Subroutine
-
-  TBD
-
-
-1.13.5  INT - Truncate
-
-  Alias for TRUNC.
-
-
-1.13.6  NOISE1 - 1D Noise
-
-  TBD
-
-
-1.13.7  NOISE2 - 2D Noise
-
-  TBD
-
-
-1.13.8  NOISE3 - 3D Noise
-
-  TBD
-
-
-1.13.9  NOISE4 - 4D Noise
-
-  TBD
-
-
-1.13.10  NOP - No Operation
-
-  Do nothing.
-
-
-1.14  ps_1_1
-------------
-
-
-1.14.1  TEXKILL - Conditional Discard
-
-  Alias for KIL.
-
-
-1.15  ps_1_4
-------------
-
-
-1.15.1  TEXLD - Texture Lookup
-
-  Alias for TEX.
-
-
-1.16  ps_2_0
-------------
-
-
-1.16.1  M4X4 - Multiply Matrix
-
-  Alias for MULTIPLYMATRIX.
-
-
-1.16.2  M4X3 - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.16.3  M3X4 - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.16.4  M3X3 - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.16.5  M3X2 - Multiply Matrix
-
-  Considered for removal from language.
-
-
-1.16.6  CRS - Cross Product
-
-  Alias for XPD.
-
-
-1.16.7  NRM4 - 4-component Vector Normalise
-
-  dst.x = src.x / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
-  dst.y = src.y / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
-  dst.z = src.z / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
-  dst.w = src.w / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w)
-
-
-1.16.8  SINCOS - Sine Cosine
-
-  Alias for SCS.
-
-
-1.16.9  TEXLDB - Texture Lookup With Bias
-
-  Alias for TXB.
-
-
-1.16.10  DP2ADD - 2-component Dot Product And Add
-
-  Alias for DP2A.
-
-
-1.17  ps_2_x
-------------
-
-
-1.17.1  CALL - Subroutine Call
-
-  Alias for CAL.
-
-
-1.17.2  CALLNZ - Subroutine Call If Not Zero
-
-  TBD
-
-
-1.17.3  IFC - If
-
-  TBD
-
-
-1.17.4  BREAK - Break
-
-  Alias for BRK.
-
-
-1.17.5  BREAKC - Break Conditional
-
-  TBD
-
-
-1.17.6  DSX - Derivative Relative To X
-
-  Alias for DDX.
-
-
-1.17.7  DSY - Derivative Relative To Y
-
-  Alias for DDY.
-
-
-1.17.8  TEXLDD - Texture Lookup with Derivatives
-
-  Alias for TXD.
-
-
-1.18  vs_1_1
-------------
-
-
-1.18.1  EXPP - Approximate Exponential Base 2
-
-  Use EXP. See also 1.19.3.
-
-
-1.18.2  LOGP - Logarithm Base 2
-
-  Use LOG. See also 1.19.4.
-
-
-1.19  vs_2_0
-------------
-
-
-1.19.1  SGN - Set Sign
-
-  Alias for SSG.
-
-
-1.19.2  MOVA - Move Address Register
-
-  Alias for ARR.
-
-
-1.19.3  EXPP - Approximate Exponential Base 2
-
-  Use EX2.
-
-
-1.19.4  LOGP - Logarithm Base 2
-
-  Use LG2.
-
-
-2  Explanation of symbols used
-==============================
-
-
-2.1  Functions
---------------
-
-
-  abs(x)            Absolute value of x.
-                    |x|
-                    (x < 0.0) ? -x : x
-
-  ceil(x)           Ceiling of x.
-
-  clamp(x,y,z)      Clamp x between y and z.
-                    (x < y) ? y : (x > z) ? z : x
-
-  cos(x)            Cosine of x.
-
-  floor(x)          Floor of x.
-
-  lg2(x)            Logarithm base 2 of x.
-
-  max(x,y)          Maximum of x and y.
-                    (x > y) ? x : y
-
-  min(x,y)          Minimum of x and y.
-                    (x < y) ? x : y
-
-  partialx(x)       Derivative of x relative to fragment's X.
-
-  partialy(x)       Derivative of x relative to fragment's Y.
-
-  pop()             Pop from stack.
-
-  pow(x,y)          Raise x to power of y.
-
-  push(x)           Push x on stack.
-
-  round(x)          Round x.
-
-  sin(x)            Sine of x.
-
-  sqrt(x)           Square root of x.
-
-  trunc(x)          Truncate x.
-
-
-2.2  Keywords
--------------
-
-
-  discard           Discard fragment.
-
-  dst               First destination register.
-
-  dst0              First destination register.
-
-  pc                Program counter.
-
-  src               First source register.
-
-  src0              First source register.
-
-  src1              Second source register.
-
-  src2              Third source register.
-
-  target            Label of target instruction.
-
-
-3  Other tokens
-===============
-
-
-3.1  Declaration Semantic
--------------------------
-
-
-  Follows Declaration token if Semantic bit is set.
-
-  Since its purpose is to link a shader with other stages of the pipeline,
-  it is valid to follow only those Declaration tokens that declare a register
-  either in INPUT or OUTPUT file.
-
-  SemanticName field contains the semantic name of the register being declared.
-  There is no default value.
-
-  SemanticIndex is an optional subscript that can be used to distinguish
-  different register declarations with the same semantic name. The default value
-  is 0.
-
-  The meanings of the individual semantic names are explained in the following
-  sections.
-
-
-3.1.1  FACE
-
-  Valid only in a fragment shader INPUT declaration.
-
-  FACE.x is negative when the primitive is back facing. FACE.x is positive
-  when the primitive is front facing.
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 0890078cd05..6dbedf15ca8 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -164,6 +164,7 @@ tgsi_default_full_declaration( void )
    full_declaration.Declaration  = tgsi_default_declaration();
    full_declaration.Range = tgsi_default_declaration_range();
    full_declaration.Semantic = tgsi_default_declaration_semantic();
+   full_declaration.ImmediateData.u = NULL;
 
    return full_declaration;
 }
@@ -180,7 +181,7 @@ tgsi_build_full_declaration(
    struct tgsi_declaration_range *dr;
 
    if( maxsize <= size )
-     return 0;
+      return 0;
    declaration = (struct tgsi_declaration *) &tokens[size];
    size++;
 
@@ -235,6 +236,24 @@ tgsi_build_full_declaration(
          header );
    }
 
+   if (full_decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) {
+      unsigned i, j;
+      union tgsi_immediate_data *data;
+
+      for (i = 0; i <= dr->Last; ++i) {
+         for (j = 0; j < 4; ++j) {
+            unsigned idx = i*4 + j;
+            if (maxsize <= size)
+               return 0;
+            data = (union tgsi_immediate_data *) &tokens[size];
+            ++size;
+
+            *data = full_decl->ImmediateData.u[idx];
+            declaration_grow( declaration, header );
+         }
+      }
+   }
+
    return size;
 }
 
@@ -613,6 +632,7 @@ tgsi_build_full_instruction(
          reg->Register.File,
          reg->Register.WriteMask,
          reg->Register.Indirect,
+         reg->Register.Dimension,
          reg->Register.Index,
          instruction,
          header );
@@ -640,6 +660,46 @@ tgsi_build_full_instruction(
             instruction,
             header );
       }
+
+      if( reg->Register.Dimension ) {
+         struct  tgsi_dimension *dim;
+
+         assert( !reg->Dimension.Dimension );
+
+         if( maxsize <= size )
+            return 0;
+         dim = (struct tgsi_dimension *) &tokens[size];
+         size++;
+
+         *dim = tgsi_build_dimension(
+            reg->Dimension.Indirect,
+            reg->Dimension.Index,
+            instruction,
+            header );
+
+         if( reg->Dimension.Indirect ) {
+            struct tgsi_src_register *ind;
+
+            if( maxsize <= size )
+               return 0;
+            ind = (struct tgsi_src_register *) &tokens[size];
+            size++;
+
+            *ind = tgsi_build_src_register(
+               reg->DimIndirect.File,
+               reg->DimIndirect.SwizzleX,
+               reg->DimIndirect.SwizzleY,
+               reg->DimIndirect.SwizzleZ,
+               reg->DimIndirect.SwizzleW,
+               reg->DimIndirect.Negate,
+               reg->DimIndirect.Absolute,
+               reg->DimIndirect.Indirect,
+               reg->DimIndirect.Dimension,
+               reg->DimIndirect.Index,
+               instruction,
+               header );
+         }
+      }
    }
 
    for( i = 0;  i < full_inst->Instruction.NumSrcRegs; i++ ) {
@@ -959,6 +1019,7 @@ tgsi_build_dst_register(
    unsigned file,
    unsigned mask,
    unsigned indirect,
+   unsigned dimension,
    int index,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header )
@@ -974,6 +1035,7 @@ tgsi_build_dst_register(
    dst_register.WriteMask = mask;
    dst_register.Index = index;
    dst_register.Indirect = indirect;
+   dst_register.Dimension = dimension;
 
    instruction_grow( instruction, header );
 
@@ -987,6 +1049,8 @@ tgsi_default_full_dst_register( void )
 
    full_dst_register.Register = tgsi_default_dst_register();
    full_dst_register.Indirect = tgsi_default_src_register();
+   full_dst_register.Dimension = tgsi_default_dimension();
+   full_dst_register.DimIndirect = tgsi_default_src_register();
 
    return full_dst_register;
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h
index 13d7f5272d6..112107a0881 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.h
@@ -263,6 +263,7 @@ tgsi_build_dst_register(
    unsigned file,
    unsigned mask,
    unsigned indirect,
+   unsigned dimension,
    int index,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 83000200189..f71ffb70308 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -56,7 +56,7 @@ dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...)
    va_list ap;
    (void)ctx;
    va_start(ap, format);
-   debug_vprintf(format, ap);
+   _debug_vprintf(format, ap);
    va_end(ap);
 }
 
@@ -100,9 +100,10 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "SAMP",
    "ADDR",
    "IMM",
-   "LOOP",
    "PRED",
-   "SV"
+   "SV",
+   "IMMX",
+   "TEMPX"
 };
 
 static const char *interpolate_names[] =
@@ -175,7 +176,11 @@ static const char *primitive_names[] =
    "TRIANGLE_FAN",
    "QUADS",
    "QUAD_STRIP",
-   "POLYGON"
+   "POLYGON",
+   "LINES_ADJACENCY",
+   "LINE_STRIP_ADJACENCY",
+   "TRIANGLES_ADJACENCY",
+   "TRIANGLE_STRIP_ADJACENCY"
 };
 
 static const char *fs_coord_origin_names[] =
@@ -192,29 +197,30 @@ static const char *fs_coord_pixel_center_names[] =
 
 
 static void
-_dump_register_dst(
-   struct dump_ctx *ctx,
-   uint file,
-   int index)
-{
-   ENM( file, file_names );
-
-   CHR( '[' );
-   SID( index );
-   CHR( ']' );
-}
-
-
-static void
 _dump_register_src(
    struct dump_ctx *ctx,
    const struct tgsi_full_src_register *src )
 {
    ENM(src->Register.File, file_names);
    if (src->Register.Dimension) {
-      CHR('[');
-      SID(src->Dimension.Index);
-      CHR(']');
+      if (src->Dimension.Indirect) {
+         CHR( '[' );
+         ENM( src->DimIndirect.File, file_names );
+         CHR( '[' );
+         SID( src->DimIndirect.Index );
+         TXT( "]." );
+         ENM( src->DimIndirect.SwizzleX, swizzle_names );
+         if (src->Dimension.Index != 0) {
+            if (src->Dimension.Index > 0)
+               CHR( '+' );
+            SID( src->Dimension.Index );
+         }
+         CHR( ']' );
+      } else {
+         CHR('[');
+         SID(src->Dimension.Index);
+         CHR(']');
+      }
    }
    if (src->Register.Indirect) {
       CHR( '[' );
@@ -236,30 +242,52 @@ _dump_register_src(
    }
 }
 
+/* Print a destination register: file name, optional 2D subscript, index. */
 static void
-_dump_register_ind(
+_dump_register_dst(
    struct dump_ctx *ctx,
-   uint file,
-   int index,
-   uint ind_file,
-   int ind_index,
-   uint ind_swizzle )
+   const struct tgsi_full_dst_register *dst )
 {
-   ENM( file, file_names );
-   CHR( '[' );
-   ENM( ind_file, file_names );
-   CHR( '[' );
-   SID( ind_index );
-   TXT( "]." );
-   ENM( ind_swizzle, swizzle_names );
-   if (index != 0) {
-      if (index > 0)
-         CHR( '+' );
-      SID( index );
+   ENM(dst->Register.File, file_names);
+   if (dst->Register.Dimension) {   /* extra subscript printed before the index */
+      if (dst->Dimension.Indirect) {   /* relative: [FILE[n].swz] or [FILE[n].swz+k] */
+         CHR( '[' );
+         ENM( dst->DimIndirect.File, file_names );
+         CHR( '[' );
+         SID( dst->DimIndirect.Index );
+         TXT( "]." );
+         ENM( dst->DimIndirect.SwizzleX, swizzle_names );
+         if (dst->Dimension.Index != 0) {   /* offset is omitted when zero */
+            if (dst->Dimension.Index > 0)
+               CHR( '+' );   /* explicit sign for a positive offset */
+            SID( dst->Dimension.Index );
+         }
+         CHR( ']' );
+      } else {   /* direct: [index] */
+         CHR('[');
+         SID(dst->Dimension.Index);
+         CHR(']');
+      }
+   }
+   if (dst->Register.Indirect) {   /* relative: [FILE[n].swz] or [FILE[n].swz+k] */
+      CHR( '[' );
+      ENM( dst->Indirect.File, file_names );
+      CHR( '[' );
+      SID( dst->Indirect.Index );
+      TXT( "]." );
+      ENM( dst->Indirect.SwizzleX, swizzle_names );
+      if (dst->Register.Index != 0) {   /* offset is omitted when zero */
+         if (dst->Register.Index > 0)
+            CHR( '+' );   /* explicit sign for a positive offset */
+         SID( dst->Register.Index );
+      }
+      CHR( ']' );
+   } else {   /* direct: [index] */
+      CHR( '[' );
+      SID( dst->Register.Index );
+      CHR( ']' );
    }
-   CHR( ']' );
 }
-
 static void
 _dump_writemask(
    struct dump_ctx *ctx,
@@ -278,6 +306,39 @@ _dump_writemask(
    }
 }
 
+/* Print num_tokens values from data, formatted per data_type, as " {a, b}". */
+static void
+dump_imm_data(struct tgsi_iterate_context *iter,
+              union tgsi_immediate_data *data,
+              unsigned num_tokens,
+              unsigned data_type)
+{
+   struct dump_ctx *ctx = (struct dump_ctx *) iter;
+   unsigned n;
+
+   TXT( " {" );
+   assert( num_tokens <= 4 );
+   for (n = 0; n < num_tokens; ++n) {
+      if (n != 0)
+         TXT( ", " );   /* separator only between values */
+
+      switch (data_type) {
+      case TGSI_IMM_FLOAT32:
+         FLT( data[n].Float );
+         break;
+      case TGSI_IMM_UINT32:
+         UID( data[n].Uint );
+         break;
+      case TGSI_IMM_INT32:
+         SID( data[n].Int );
+         break;
+      default:
+         assert( 0 );   /* unknown immediate data type */
+      }
+   }
+   TXT( "}" );
+}
+
 static boolean
 iter_declaration(
    struct tgsi_iterate_context *iter,
@@ -358,6 +419,43 @@ iter_declaration(
       }
    }
 
+   if (decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) {
+      unsigned i;
+      char range_indent[4];   /* at most three spaces plus the NUL */
+
+      TXT(" {");
+
+      if (decl->Range.Last < 10)
+         range_indent[0] = '\0';
+      else if (decl->Range.Last < 100) {
+         range_indent[0] = ' ';
+         range_indent[1] = '\0';
+      } else if (decl->Range.Last < 1000) {
+         range_indent[0] = ' ';
+         range_indent[1] = ' ';
+         range_indent[2] = '\0';
+      } else {
+         range_indent[0] = ' ';
+         range_indent[1] = ' ';
+         range_indent[2] = ' ';
+         range_indent[3] = '\0';
+      }
+
+      dump_imm_data(iter, decl->ImmediateData.u, 4, TGSI_IMM_FLOAT32);
+      for(i = 1; i <= decl->Range.Last; ++i) {
+         /* Continuation rows: indent by strlen("DCL IMMX[0..1] {"), plus
+          * one space per extra digit of Range.Last (range_indent). */
+         CHR('\n');
+         TXT( "                " );
+         TXT( range_indent );
+         /* Register i starts at scalar 4*i: four components per register. */
+         dump_imm_data(iter, decl->ImmediateData.u + 4 * i,
+                       4, TGSI_IMM_FLOAT32);
+      }
+
+      TXT(" }");
+   }
+
    EOL();
 
    return TRUE;
@@ -431,33 +529,11 @@ iter_immediate(
 {
    struct dump_ctx *ctx = (struct dump_ctx *) iter;
 
-   uint i;
-
    TXT( "IMM " );
    ENM( imm->Immediate.DataType, immediate_type_names );
 
-   TXT( " { " );
-
-   assert( imm->Immediate.NrTokens <= 4 + 1 );
-   for (i = 0; i < imm->Immediate.NrTokens - 1; i++) {
-      switch (imm->Immediate.DataType) {
-      case TGSI_IMM_FLOAT32:
-         FLT( imm->u[i].Float );
-         break;
-      case TGSI_IMM_UINT32:
-         UID(imm->u[i].Uint);
-         break;
-      case TGSI_IMM_INT32:
-         SID(imm->u[i].Int);
-         break;
-      default:
-         assert( 0 );
-      }
-
-      if (i < imm->Immediate.NrTokens - 2)
-         TXT( ", " );
-   }
-   TXT( " }" );
+   dump_imm_data(iter, imm->u, imm->Immediate.NrTokens - 1,
+                 imm->Immediate.DataType);
 
    EOL();
 
@@ -488,12 +564,36 @@ iter_instruction(
 
    INSTID( instno );
    TXT( ": " );
-   
+
    ctx->indent -= info->pre_dedent;
    for(i = 0; (int)i < ctx->indent; ++i)
       TXT( "  " );
    ctx->indent += info->post_indent;
-   
+
+   if (inst->Instruction.Predicate) {
+      CHR( '(' );
+
+      if (inst->Predicate.Negate)
+         CHR( '!' );
+
+      TXT( "PRED[" );
+      SID( inst->Predicate.Index );
+      CHR( ']' );
+
+      if (inst->Predicate.SwizzleX != TGSI_SWIZZLE_X ||
+          inst->Predicate.SwizzleY != TGSI_SWIZZLE_Y ||
+          inst->Predicate.SwizzleZ != TGSI_SWIZZLE_Z ||
+          inst->Predicate.SwizzleW != TGSI_SWIZZLE_W) {
+         CHR( '.' );
+         ENM( inst->Predicate.SwizzleX, swizzle_names );
+         ENM( inst->Predicate.SwizzleY, swizzle_names );
+         ENM( inst->Predicate.SwizzleZ, swizzle_names );
+         ENM( inst->Predicate.SwizzleW, swizzle_names );
+      }
+
+      TXT( ") " );
+   }
+
    TXT( info->mnemonic );
 
    switch (inst->Instruction.Saturate) {
@@ -516,21 +616,7 @@ iter_instruction(
          CHR( ',' );
       CHR( ' ' );
 
-      if (dst->Register.Indirect) {
-         _dump_register_ind(
-            ctx,
-            dst->Register.File,
-            dst->Register.Index,
-            dst->Indirect.File,
-            dst->Indirect.Index,
-            dst->Indirect.SwizzleX );
-      }
-      else {
-         _dump_register_dst(
-            ctx,
-            dst->Register.File,
-            dst->Register.Index );
-      }
+      _dump_register_dst( ctx, dst );
       _dump_writemask( ctx, dst->Register.WriteMask );
 
       first_reg = FALSE;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h
index 4cd27317b36..dd78b361007 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h
@@ -28,6 +28,7 @@
 #ifndef TGSI_DUMP_H
 #define TGSI_DUMP_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
 
 #if defined __cplusplus
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 82eac05dc4d..3a71540506d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -557,6 +557,23 @@ print_temp(const struct tgsi_exec_machine *mach, uint index)
 #endif
 
 
+/** Store the first num_bufs buffer pointers and sizes into the machine. */
+void
+tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
+                               unsigned num_bufs,
+                               const void **bufs,
+                               const unsigned *buf_sizes)
+{
+   unsigned slot = 0;
+
+   while (slot < num_bufs) {
+      mach->Consts[slot] = bufs[slot];
+      mach->ConstsSize[slot] = buf_sizes[slot];
+      ++slot;
+   }
+}
+
+
 /**
  * Check if there's a potential src/dst register data dependency when
  * using SOA execution.
@@ -588,8 +605,10 @@ tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst)
    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
       if ((inst->Src[i].Register.File ==
            inst->Dst[0].Register.File) &&
-          (inst->Src[i].Register.Index ==
-           inst->Dst[0].Register.Index)) {
+          ((inst->Src[i].Register.Index ==
+            inst->Dst[0].Register.Index) ||
+	   inst->Src[i].Register.Indirect ||
+	   inst->Dst[0].Register.Indirect)) {
          /* loop over dest channels */
          uint channelsWritten = 0x0;
          FOR_EACH_ENABLED_CHANNEL(*inst, chan) {
@@ -621,12 +640,10 @@ tgsi_exec_machine_bind_shader(
 {
    uint k;
    struct tgsi_parse_context parse;
-   struct tgsi_exec_labels *labels = &mach->Labels;
    struct tgsi_full_instruction *instructions;
    struct tgsi_full_declaration *declarations;
    uint maxInstructions = 10, numInstructions = 0;
    uint maxDeclarations = 10, numDeclarations = 0;
-   uint instno = 0;
 
 #if 0
    tgsi_dump(tokens, 0);
@@ -634,9 +651,30 @@ tgsi_exec_machine_bind_shader(
 
    util_init_math();
 
+   if (numSamplers) {
+      assert(samplers);
+   }
+
    mach->Tokens = tokens;
    mach->Samplers = samplers;
 
+   if (!tokens) {
+      /* unbind and free all */
+      if (mach->Declarations) {
+         FREE( mach->Declarations );
+      }
+      mach->Declarations = NULL;
+      mach->NumDeclarations = 0;
+
+      if (mach->Instructions) {
+         FREE( mach->Instructions );
+      }
+      mach->Instructions = NULL;
+      mach->NumInstructions = 0;
+
+      return;
+   }
+
    k = tgsi_parse_init (&parse, mach->Tokens);
    if (k != TGSI_PARSE_OK) {
       debug_printf( "Problem parsing!\n" );
@@ -645,7 +683,6 @@ tgsi_exec_machine_bind_shader(
 
    mach->Processor = parse.FullHeader.Processor.Processor;
    mach->ImmLimit = 0;
-   labels->count = 0;
 
    declarations = (struct tgsi_full_declaration *)
       MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
@@ -663,7 +700,6 @@ tgsi_exec_machine_bind_shader(
    }
 
    while( !tgsi_parse_end_of_tokens( &parse ) ) {
-      uint pointer = parse.Position;
       uint i;
 
       tgsi_parse_token( &parse );
@@ -686,6 +722,19 @@ tgsi_exec_machine_bind_shader(
                ++mach->NumOutputs;
             }
          }
+         if (parse.FullToken.FullDeclaration.Declaration.File ==
+             TGSI_FILE_IMMEDIATE_ARRAY) {
+            unsigned reg;
+            struct tgsi_full_declaration *decl =
+               &parse.FullToken.FullDeclaration;
+            debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES);
+            for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) {
+               for( i = 0; i < 4; i++ ) {
+                  int idx = reg * 4 + i;
+                  mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float;
+               }
+            }
+         }
          memcpy(declarations + numDeclarations,
                 &parse.FullToken.FullDeclaration,
                 sizeof(declarations[0]));
@@ -707,11 +756,6 @@ tgsi_exec_machine_bind_shader(
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         assert( labels->count < MAX_LABELS );
-
-         labels->labels[labels->count][0] = instno;
-         labels->labels[labels->count][1] = pointer;
-         labels->count++;
 
          /* save expanded instruction */
          if (numInstructions == maxInstructions) {
@@ -801,7 +845,9 @@ void
 tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
 {
    if (mach) {
-      FREE(mach->Instructions);
+      if (mach->Instructions)
+         FREE(mach->Instructions);
+      if (mach->Declarations)
       FREE(mach->Declarations);
    }
 
@@ -1017,6 +1063,8 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach,
 {
    uint i;
 
+   assert(swizzle < 4);
+
    switch (file) {
    case TGSI_FILE_CONSTANT:
       for (i = 0; i < QUAD_SIZE; i++) {
@@ -1026,9 +1074,23 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach,
          if (index->i[i] < 0) {
             chan->u[i] = 0;
          } else {
-            const uint *p = (const uint *)mach->Consts[index2D->i[i]];
-
-            chan->u[i] = p[index->i[i] * 4 + swizzle];
+            /* NOTE: copying the const value as a uint instead of float */
+            const uint constbuf = index2D->i[i];
+            const uint *buf = (const uint *)mach->Consts[constbuf];
+            const int pos = index->i[i] * 4 + swizzle;
+            /* const buffer bounds check */
+            if (pos < 0 || pos >= mach->ConstsSize[constbuf]) {
+               if (0) {
+                  /* Debug: print warning */
+                  static int count = 0;
+                  if (count++ < 100)
+                     debug_printf("TGSI Exec: const buffer index %d"
+                                  " out of bounds\n", pos);
+               }
+               chan->u[i] = 0;
+            }
+            else
+               chan->u[i] = buf[pos];
          }
       }
       break;
@@ -1036,8 +1098,16 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach,
    case TGSI_FILE_INPUT:
    case TGSI_FILE_SYSTEM_VALUE:
       for (i = 0; i < QUAD_SIZE; i++) {
-         /* XXX: 2D indexing */
-         chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
+         /*
+         if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
+            debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n",
+                         index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i],
+                         index2D->i[i], index->i[i]);
+                         }*/
+         int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i];
+         assert(pos >= 0);
+         assert(pos < Elements(mach->Inputs));
+         chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i];
       }
       break;
 
@@ -1050,6 +1120,16 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach,
       }
       break;
 
+   case TGSI_FILE_TEMPORARY_ARRAY:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
+         assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS);
+
+         chan->u[i] =
+            mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i];
+      }
+      break;
+
    case TGSI_FILE_IMMEDIATE:
       for (i = 0; i < QUAD_SIZE; i++) {
          assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
@@ -1059,6 +1139,14 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach,
       }
       break;
 
+   case TGSI_FILE_IMMEDIATE_ARRAY:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index2D->i[i] == 0);
+
+         chan->f[i] = mach->ImmArray[index->i[i]][swizzle];
+      }
+      break;
+
    case TGSI_FILE_ADDRESS:
       for (i = 0; i < QUAD_SIZE; i++) {
          assert(index->i[i] >= 0);
@@ -1139,7 +1227,7 @@ fetch_source(const struct tgsi_exec_machine *mach,
       index2.i[1] =
       index2.i[2] =
       index2.i[3] = reg->Indirect.Index;
-
+      assert(reg->Indirect.File == TGSI_FILE_ADDRESS);
       /* get current value of address register[swizzle] */
       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
       fetch_src_file_channel(mach,
@@ -1270,6 +1358,7 @@ store_dest(struct tgsi_exec_machine *mach,
    uint i;
    union tgsi_exec_channel null;
    union tgsi_exec_channel *dst;
+   union tgsi_exec_channel index2D;
    uint execmask = mach->ExecMask;
    int offset = 0;  /* indirection offset */
    int index;
@@ -1315,6 +1404,77 @@ store_dest(struct tgsi_exec_machine *mach,
       offset = indir_index.i[0];
    }
 
+   /* There is an extra source register that is a second
+    * subscript to a register file. Effectively it means that
+    * the register file is actually a 2D array of registers.
+    *
+    *    file[3][1],
+    *    where:
+    *       [3] = Dimension.Index
+    */
+   if (reg->Register.Dimension) {
+      index2D.i[0] =
+      index2D.i[1] =
+      index2D.i[2] =
+      index2D.i[3] = reg->Dimension.Index;
+
+      /* Again, the second subscript index can be addressed indirectly
+       * identically to the first one.
+       * Nothing stops us from indirectly addressing the indirect register,
+       * but there is no need for that, so we won't exercise it.
+       *
+       *    file[ind[4].y+3][1],
+       *    where:
+       *       ind = DimIndirect.File
+       *       [4] = DimIndirect.Index
+       *       .y = DimIndirect.SwizzleX
+       */
+      if (reg->Dimension.Indirect) {
+         union tgsi_exec_channel index2;
+         union tgsi_exec_channel indir_index;
+         const uint execmask = mach->ExecMask;
+         unsigned swizzle;
+         uint i;
+
+         index2.i[0] =
+         index2.i[1] =
+         index2.i[2] =
+         index2.i[3] = reg->DimIndirect.Index;
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
+         fetch_src_file_channel(mach,
+                                reg->DimIndirect.File,
+                                swizzle,
+                                &index2,
+                                &ZeroVec,
+                                &indir_index);
+
+         index2D.i[0] += indir_index.i[0];
+         index2D.i[1] += indir_index.i[1];
+         index2D.i[2] += indir_index.i[2];
+         index2D.i[3] += indir_index.i[3];
+
+         /* for disabled execution channels, zero-out the index to
+          * avoid using a potential garbage value.
+          */
+         for (i = 0; i < QUAD_SIZE; i++) {
+            if ((execmask & (1 << i)) == 0) {
+               index2D.i[i] = 0;
+            }
+         }
+      }
+
+      /* If by any chance there was a need for a 3D array of register
+       * files, we would have to check whether Dimension is followed
+       * by a dimension register and continue the saga.
+       */
+   } else {
+      index2D.i[0] =
+      index2D.i[1] =
+      index2D.i[2] =
+      index2D.i[3] = 0;
+   }
+
    switch (reg->Register.File) {
    case TGSI_FILE_NULL:
       dst = &null;
@@ -1341,16 +1501,19 @@ store_dest(struct tgsi_exec_machine *mach,
       dst = &mach->Temps[offset + index].xyzw[chan_index];
       break;
 
-   case TGSI_FILE_ADDRESS:
+   case TGSI_FILE_TEMPORARY_ARRAY:
       index = reg->Register.Index;
-      dst = &mach->Addrs[index].xyzw[chan_index];
+      assert( index < TGSI_EXEC_NUM_TEMPS );
+      assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS );
+      /* XXX we use index2D.i[0] here but somehow we might
+       * end up with someone trying to store indirectly in
+       * different buffers */
+      dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index];
       break;
 
-   case TGSI_FILE_LOOP:
-      assert(reg->Register.Index == 0);
-      assert(mach->LoopCounterStackTop > 0);
-      assert(chan_index == CHAN_X);
-      dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index];
+   case TGSI_FILE_ADDRESS:
+      index = reg->Register.Index;
+      dst = &mach->Addrs[index].xyzw[chan_index];
       break;
 
    case TGSI_FILE_PREDICATE:
@@ -1533,6 +1696,19 @@ emit_primitive(struct tgsi_exec_machine *mach)
    }
 }
 
+static void
+conditional_emit_primitive(struct tgsi_exec_machine *mach)
+{
+   if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
+      int emitted_verts =
+         mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]];
+      if (emitted_verts) {
+         emit_primitive(mach);
+      }
+   }
+}
+
+
 /*
  * Fetch four texture samples using STR texture coordinates.
  */
@@ -3065,6 +3241,8 @@ exec_instruction(
 
          if (mach->CallStackTop == 0) {
             /* returning from main() */
+            mach->CondStackTop = 0;
+            mach->LoopStackTop = 0;
             *pc = -1;
             return;
          }
@@ -3133,7 +3311,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DIV:
-      assert( 0 );
+      exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DP2:
@@ -3182,6 +3360,9 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_END:
+      /* make sure we end primitives which haven't
+       * been explicitly emitted */
+      conditional_emit_primitive(mach);
       /* halt execution */
       *pc = -1;
       break;
@@ -3590,6 +3771,9 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
    }
 #endif
 
+   /* Strictly speaking, these assertions aren't really needed but they
+    * can potentially catch some bugs in the control flow code.
+    */
    assert(mach->CondStackTop == 0);
    assert(mach->LoopStackTop == 0);
    assert(mach->ContStackTop == 0);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index a22873e4c2b..9d62c1d7e7e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -37,8 +37,6 @@ extern "C" {
 #endif
 
 
-#define MAX_LABELS (4 * 1024)  /**< basically, max instructions */
-
 #define NUM_CHANNELS 4  /* R,G,B,A */
 #define QUAD_SIZE    4  /* 4 pixel/quad */
 
@@ -93,18 +91,9 @@ struct tgsi_sampler
                        float rgba[NUM_CHANNELS][QUAD_SIZE]);
 };
 
-/**
- * For branching/calling subroutines.
- */
-struct tgsi_exec_labels
-{
-   unsigned labels[MAX_LABELS][2];
-   unsigned count;
-};
-
-
 #define TGSI_EXEC_NUM_TEMPS       128
 #define TGSI_EXEC_NUM_IMMEDIATES  256
+#define TGSI_EXEC_NUM_TEMP_ARRAYS 8
 
 /*
  * Locations of various utility registers (_I = Index, _C = Channel)
@@ -186,10 +175,11 @@ struct tgsi_exec_labels
 
 
 
-#define TGSI_EXEC_MAX_COND_NESTING  32
-#define TGSI_EXEC_MAX_LOOP_NESTING  32
-#define TGSI_EXEC_MAX_SWITCH_NESTING 32
-#define TGSI_EXEC_MAX_CALL_NESTING  32
+#define TGSI_EXEC_MAX_NESTING  32
+#define TGSI_EXEC_MAX_COND_NESTING  TGSI_EXEC_MAX_NESTING
+#define TGSI_EXEC_MAX_LOOP_NESTING  TGSI_EXEC_MAX_NESTING
+#define TGSI_EXEC_MAX_SWITCH_NESTING TGSI_EXEC_MAX_NESTING
+#define TGSI_EXEC_MAX_CALL_NESTING  TGSI_EXEC_MAX_NESTING
 
 /* The maximum number of input attributes per vertex. For 2D
  * input register files, this is the stride between two 1D
@@ -248,9 +238,12 @@ struct tgsi_exec_machine
     */
    struct tgsi_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS +
                                        TGSI_EXEC_NUM_TEMP_EXTRAS];
+   struct tgsi_exec_vector       TempArray[TGSI_EXEC_NUM_TEMP_ARRAYS][TGSI_EXEC_NUM_TEMPS];
 
    float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
 
+   float                         ImmArray[TGSI_EXEC_NUM_IMMEDIATES][4];
+
    struct tgsi_exec_vector       Inputs[TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS];
    struct tgsi_exec_vector       Outputs[TGSI_MAX_TOTAL_VERTICES];
 
@@ -260,7 +253,10 @@ struct tgsi_exec_machine
    struct tgsi_sampler           **Samplers;
 
    unsigned                      ImmLimit;
+
    const void *Consts[PIPE_MAX_CONSTANT_BUFFERS];
+   unsigned ConstsSize[PIPE_MAX_CONSTANT_BUFFERS];
+
    const struct tgsi_token       *Tokens;   /**< Declarations, instructions */
    unsigned                      Processor; /**< TGSI_PROCESSOR_x */
 
@@ -299,10 +295,6 @@ struct tgsi_exec_machine
    uint LoopLabelStack[TGSI_EXEC_MAX_LOOP_NESTING];
    int LoopLabelStackTop;
 
-   /** Loop counter stack (x = index, y = counter, z = step) */
-   struct tgsi_exec_vector LoopCounterStack[TGSI_EXEC_MAX_LOOP_NESTING];
-   int LoopCounterStackTop;
-   
    /** Loop continue mask stack (see comments in tgsi_exec.c) */
    uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
    int ContStackTop;
@@ -328,7 +320,6 @@ struct tgsi_exec_machine
    struct tgsi_full_declaration *Declarations;
    uint NumDeclarations;
 
-   struct tgsi_exec_labels Labels;
 };
 
 struct tgsi_exec_machine *
@@ -379,6 +370,43 @@ tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
 }
 
 
+extern void
+tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
+                               unsigned num_bufs,
+                               const void **bufs,
+                               const unsigned *buf_sizes);
+
+
+static INLINE int
+tgsi_exec_get_shader_param(enum pipe_shader_cap param)
+{
+   switch(param) {
+   case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+   case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+      return INT_MAX;
+   case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+      return TGSI_EXEC_MAX_NESTING;
+   case PIPE_SHADER_CAP_MAX_INPUTS:
+      return TGSI_EXEC_MAX_INPUT_ATTRIBS;
+   case PIPE_SHADER_CAP_MAX_CONSTS:
+      return TGSI_EXEC_MAX_CONST_BUFFER;
+   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+      return PIPE_MAX_CONSTANT_BUFFERS;
+   case PIPE_SHADER_CAP_MAX_TEMPS:
+      return TGSI_EXEC_NUM_TEMPS;
+   case PIPE_SHADER_CAP_MAX_ADDRS:
+      return TGSI_EXEC_NUM_ADDRS;
+   case PIPE_SHADER_CAP_MAX_PREDS:
+      return TGSI_EXEC_NUM_PREDS;
+   case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
+      return 1;
+   default:
+      return 0;
+   }
+}
+
 #if defined __cplusplus
 } /* extern "C" */
 #endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index cfa2f631bd8..e59e964ffa7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -205,3 +205,18 @@ tgsi_get_opcode_name( uint opcode )
    return info->mnemonic;
 }
 
+
+const char *
+tgsi_get_processor_name( uint processor )
+{
+   switch (processor) {
+   case TGSI_PROCESSOR_VERTEX:
+      return "vertex shader";
+   case TGSI_PROCESSOR_FRAGMENT:
+      return "fragment shader";
+   case TGSI_PROCESSOR_GEOMETRY:
+      return "geometry shader";
+   default:
+      return "unknown shader type!";
+   }
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.h b/src/gallium/auxiliary/tgsi/tgsi_info.h
index 74713c3b98a..1992d11bbe8 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.h
@@ -28,6 +28,7 @@
 #ifndef TGSI_INFO_H
 #define TGSI_INFO_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
 
 #if defined __cplusplus
@@ -52,6 +53,9 @@ tgsi_get_opcode_info( uint opcode );
 const char *
 tgsi_get_opcode_name( uint opcode );
 
+const char *
+tgsi_get_processor_name( uint processor );
+
 
 #if defined __cplusplus
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index e472947507d..b3123ed016d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -163,6 +163,10 @@ OP12(USGE)
 OP12(USHR)
 OP12(USLT)
 OP12(USNE)
+OP01(SWITCH)
+OP01(CASE)
+OP00(DEFAULT)
+OP00(ENDSWITCH)
 
 
 #undef OP00
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 7e19e1fe36f..1891203abe1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -117,6 +117,17 @@ tgsi_parse_token(
          next_token( ctx, &decl->Semantic );
       }
 
+      if (decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) {
+         unsigned i, j;
+         decl->ImmediateData.u = (union tgsi_immediate_data*)
+                                 &ctx->Tokens[ctx->Position];
+         for (i = 0; i <= decl->Range.Last; ++i) {
+            for (j = 0; j < 4; ++j) {
+               ctx->Position++;
+            }
+         }
+      }
+
       break;
    }
 
@@ -181,11 +192,6 @@ tgsi_parse_token(
 
          next_token( ctx, &inst->Dst[i].Register );
 
-         /*
-          * No support for indirect or multi-dimensional addressing.
-          */
-         assert( !inst->Dst[i].Register.Dimension );
-
          if( inst->Dst[i].Register.Indirect ) {
             next_token( ctx, &inst->Dst[i].Indirect );
 
@@ -195,6 +201,24 @@ tgsi_parse_token(
             assert( !inst->Dst[i].Indirect.Dimension );
             assert( !inst->Dst[i].Indirect.Indirect );
          }
+         if( inst->Dst[i].Register.Dimension ) {
+            next_token( ctx, &inst->Dst[i].Dimension );
+
+            /*
+             * No support for multi-dimensional addressing.
+             */
+            assert( !inst->Dst[i].Dimension.Dimension );
+
+            if( inst->Dst[i].Dimension.Indirect ) {
+               next_token( ctx, &inst->Dst[i].DimIndirect );
+
+               /*
+                * No support for indirect or multi-dimensional addressing.
+                */
+               assert( !inst->Dst[i].Indirect.Indirect );
+               assert( !inst->Dst[i].Indirect.Dimension );
+            }
+         }
       }
 
       assert( inst->Instruction.NumSrcRegs <= TGSI_FULL_MAX_SRC_REGISTERS );
@@ -258,17 +282,6 @@ tgsi_parse_token(
 }
 
 
-unsigned
-tgsi_num_tokens(const struct tgsi_token *tokens)
-{
-   struct tgsi_parse_context ctx;
-   if (tgsi_parse_init(&ctx, tokens) == TGSI_PARSE_OK) {
-      unsigned len = (ctx.FullHeader.Header.HeaderSize +
-                      ctx.FullHeader.Header.BodySize);
-      return len;
-   }
-   return 0;
-}
 
 
 /**
@@ -295,3 +308,19 @@ tgsi_alloc_tokens(unsigned num_tokens)
    unsigned bytes = num_tokens * sizeof(struct tgsi_token);
    return (struct tgsi_token *) MALLOC(bytes);
 }
+
+
+void
+tgsi_dump_tokens(const struct tgsi_token *tokens)
+{
+   const unsigned *dwords = (const unsigned *)tokens;
+   int nr = tgsi_num_tokens(tokens);
+   int i;
+   
+   assert(sizeof(*tokens) == sizeof(unsigned));
+
+   debug_printf("const unsigned tokens[%d] = {\n", nr);
+   for (i = 0; i < nr; i++)
+      debug_printf("0x%08x,\n", dwords[i]);
+   debug_printf("};\n");
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index b45ccee2f63..d4df5851764 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -28,6 +28,7 @@
 #ifndef TGSI_PARSE_H
 #define TGSI_PARSE_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
 
 #if defined __cplusplus
@@ -44,6 +45,8 @@ struct tgsi_full_dst_register
 {
    struct tgsi_dst_register               Register;
    struct tgsi_src_register               Indirect;
+   struct tgsi_dimension                  Dimension;
+   struct tgsi_src_register               DimIndirect;
 };
 
 struct tgsi_full_src_register
@@ -54,12 +57,18 @@ struct tgsi_full_src_register
    struct tgsi_src_register         DimIndirect;
 };
 
+struct tgsi_immediate_array_data
+{
+   union tgsi_immediate_data *u;
+};
+
 struct tgsi_full_declaration
 {
    struct tgsi_declaration Declaration;
    struct tgsi_declaration_range Range;
    struct tgsi_declaration_dimension Dim;
    struct tgsi_declaration_semantic Semantic;
+   struct tgsi_immediate_array_data ImmediateData;
 };
 
 struct tgsi_full_immediate
@@ -124,8 +133,15 @@ void
 tgsi_parse_token(
    struct tgsi_parse_context *ctx );
 
-unsigned
-tgsi_num_tokens(const struct tgsi_token *tokens);
+static INLINE unsigned
+tgsi_num_tokens(const struct tgsi_token *tokens)
+{
+   struct tgsi_header header = *(const struct tgsi_header *) tokens;
+   return header.HeaderSize + header.BodySize;
+}
+
+void
+tgsi_dump_tokens(const struct tgsi_token *tokens);
 
 struct tgsi_token *
 tgsi_dup_tokens(const struct tgsi_token *tokens);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index ad553c71a57..3521847b619 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -1366,4 +1366,12 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
    return ok;
 }
 
+#else
+
+void ppc_dummy_func(void);
+
+void ppc_dummy_func(void)
+{
+}
+
 #endif /* PIPE_ARCH_PPC */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 76b7564cc36..acbff103efe 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -33,6 +33,10 @@
 #include "tgsi_info.h"
 #include "tgsi_iterate.h"
 
+
+DEBUG_GET_ONCE_BOOL_OPTION(print_sanity, "TGSI_PRINT_SANITY", FALSE)
+
+
 typedef struct {
    uint file : 28;
    /* max 2 dimensions */
@@ -54,6 +58,8 @@ struct sanity_check_ctx
    uint errors;
    uint warnings;
    uint implied_array_size;
+
+   boolean print;
 };
 
 static INLINE unsigned
@@ -90,9 +96,18 @@ static void
 scan_register_dst(scan_register *reg,
                   struct tgsi_full_dst_register *dst)
 {
-   fill_scan_register1d(reg,
-                        dst->Register.File,
-                        dst->Register.Index);
+   if (dst->Register.Dimension) {
+      /*FIXME: right now we don't support indirect
+       * multidimensional addressing */
+      fill_scan_register2d(reg,
+                           dst->Register.File,
+                           dst->Register.Index,
+                           dst->Dimension.Index);
+   } else {
+      fill_scan_register1d(reg,
+                           dst->Register.File,
+                           dst->Register.Index);
+   }
 }
 
 static void
@@ -102,7 +117,6 @@ scan_register_src(scan_register *reg,
    if (src->Register.Dimension) {
       /*FIXME: right now we don't support indirect
        * multidimensional addressing */
-      debug_assert(!src->Dimension.Indirect);
       fill_scan_register2d(reg,
                            src->Register.File,
                            src->Register.Index,
@@ -140,6 +154,9 @@ report_error(
 {
    va_list args;
 
+   if (!ctx->print)
+      return;
+
    debug_printf( "Error  : " );
    va_start( args, format );
    _debug_vprintf( format, args );
@@ -156,6 +173,9 @@ report_warning(
 {
    va_list args;
 
+   if (!ctx->print)
+      return;
+
    debug_printf( "Warning: " );
    va_start( args, format );
    _debug_vprintf( format, args );
@@ -235,8 +255,10 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "SAMP",
    "ADDR",
    "IMM",
-   "LOOP",
-   "PRED"
+   "PRED",
+   "SV",
+   "IMMX",
+   "TEMPX"
 };
 
 static boolean
@@ -529,6 +551,7 @@ tgsi_sanity_check(
    ctx.errors = 0;
    ctx.warnings = 0;
    ctx.implied_array_size = 0;
+   ctx.print = debug_get_option_print_sanity();
 
    if (!tgsi_iterate_shader( tokens, &ctx.iter ))
       return FALSE;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
index 52263ff8832..73f0f414e3f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
@@ -35,7 +35,8 @@ extern "C" {
 #endif
 
 /* Check the given token stream for errors and common mistakes.
- * Diagnostic messages are printed out to the debug output.
+ * Diagnostic messages are printed out to the debug output; printing is
+ * controlled by the debug option TGSI_PRINT_SANITY (default false).
  * Returns TRUE if there are no errors, even though there could be some warnings.
  */
 boolean
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 232fc537c1d..90198a4f604 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -36,6 +36,7 @@
 
 #include "util/u_math.h"
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_scan.h"
 
 
@@ -84,31 +85,46 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
          {
             const struct tgsi_full_instruction *fullinst
                = &parse.FullToken.FullInstruction;
+            uint i;
 
             assert(fullinst->Instruction.Opcode < TGSI_OPCODE_LAST);
             info->opcode_count[fullinst->Instruction.Opcode]++;
 
-            /* special case: scan fragment shaders for use of the fog
-             * input/attribute.  The X component is fog, the Y component
-             * is the front/back-face flag.
-             */
-            if (procType == TGSI_PROCESSOR_FRAGMENT) {
-               uint i;
-               for (i = 0; i < fullinst->Instruction.NumSrcRegs; i++) {
-                  const struct tgsi_full_src_register *src =
-                     &fullinst->Src[i];
-                  if (src->Register.File == TGSI_FILE_INPUT ||
-                      src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
-                     const int ind = src->Register.Index;
-                     if (info->input_semantic_name[ind] == TGSI_SEMANTIC_FOG) {
-                        info->uses_fogcoord = TRUE;
-                     }
-                     else if (info->input_semantic_name[ind] == TGSI_SEMANTIC_FACE) {
-                        info->uses_frontfacing = TRUE;
+            for (i = 0; i < fullinst->Instruction.NumSrcRegs; i++) {
+               const struct tgsi_full_src_register *src =
+                  &fullinst->Src[i];
+               int ind = src->Register.Index;
+
+               /* Mark which inputs are effectively used */
+               if (src->Register.File == TGSI_FILE_INPUT) {
+                  unsigned usage_mask;
+                  usage_mask = tgsi_util_get_inst_usage_mask(fullinst, i);
+                  if (src->Register.Indirect) {
+                     for (ind = 0; ind < info->num_inputs; ++ind) {
+                        info->input_usage_mask[ind] |= usage_mask;
                      }
+                  } else {
+                     assert(ind >= 0);
+                     assert(ind < PIPE_MAX_SHADER_INPUTS);
+                     info->input_usage_mask[ind] |= usage_mask;
                   }
                }
+
+               /* check for indirect register reads */
+               if (src->Register.Indirect) {
+                  info->indirect_files |= (1 << src->Register.File);
+               }
             }
+
+            /* check for indirect register writes */
+            for (i = 0; i < fullinst->Instruction.NumDstRegs; i++) {
+               const struct tgsi_full_dst_register *dst = &fullinst->Dst[i];
+               if (dst->Register.Indirect) {
+                  info->indirect_files |= (1 << dst->Register.File);
+               }
+            }
+
+            info->num_instructions++;
          }
          break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 741aa7d5c42..f8aa90cf065 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -45,6 +45,7 @@ struct tgsi_shader_info
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
    ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS];
    ubyte input_cylindrical_wrap[PIPE_MAX_SHADER_INPUTS];
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
@@ -54,14 +55,19 @@ struct tgsi_shader_info
    int file_max[TGSI_FILE_COUNT];  /**< highest index of declared registers */
 
    uint immediate_count; /**< number of immediates declared */
+   uint num_instructions;
 
    uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
 
    boolean writes_z;  /**< does fragment shader write Z value? */
    boolean writes_edgeflag; /**< vertex shader outputs edgeflag */
    boolean uses_kill;  /**< KIL or KILP instruction used? */
-   boolean uses_fogcoord; /**< fragment shader uses fog coord? */
-   boolean uses_frontfacing; /**< fragment shader uses front/back-face flag? */
+
+   /**
+    * Bitmask indicating which register files are accessed with
+    * indirect addressing.  The bits are (1 << TGSI_FILE_x), etc.
+    */
+   unsigned indirect_files;
 
    struct {
       unsigned name;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 1071298b497..086d983a73a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -1244,16 +1244,9 @@ emit_sub(
       make_xmm( xmm_src ) );
 }
 
-
-
-
-
-
-
 /**
  * Register fetch.
  */
-
 static void
 emit_fetch(
    struct x86_function *func,
@@ -1338,7 +1331,6 @@ emit_fetch(
 /**
  * Register store.
  */
-
 static void
 emit_store(
    struct x86_function *func,
@@ -1455,7 +1447,6 @@ fetch_texel( struct tgsi_sampler **sampler,
 /**
  * High-level instruction translators.
  */
-
 static void
 emit_tex( struct x86_function *func,
           const struct tgsi_full_instruction *inst,
@@ -1507,7 +1498,6 @@ emit_tex( struct x86_function *func,
                get_temp( TEMP_R0, 3 ),
                make_xmm( 3 ) );
 
-   
    if (projected) {
       FETCH( func, *inst, 3, 0, 3 );
 
@@ -1535,7 +1525,6 @@ emit_tex( struct x86_function *func,
    args[0] = get_temp( TEMP_R0, 0 );
    args[1] = get_sampler_ptr( unit );
 
-
    emit_func_call( func,
                    0,
                    args,
@@ -1569,7 +1558,8 @@ emit_kil(
 
    /* This mask stores component bits that were already tested. Note that
     * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested. */
+    * tested.
+    */
    uniquemask = 0;
 
    FOR_EACH_CHANNEL( chan_index ) {
@@ -1715,22 +1705,26 @@ emit_cmp(
 
 
 /**
- * Check if inst src/dest regs use indirect addressing into temporary
- * register file.
+ * Check if inst src/dest regs use indirect addressing into temporary,
+ * input or output register files.
  */
 static boolean
-indirect_temp_reference(const struct tgsi_full_instruction *inst)
+indirect_reg_reference(const struct tgsi_full_instruction *inst)
 {
    uint i;
    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
       const struct tgsi_full_src_register *reg = &inst->Src[i];
-      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
+      if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
+           reg->Register.File == TGSI_FILE_INPUT ||
+           reg->Register.File == TGSI_FILE_OUTPUT) &&
           reg->Register.Indirect)
          return TRUE;
    }
    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
       const struct tgsi_full_dst_register *reg = &inst->Dst[i];
-      if (reg->Register.File == TGSI_FILE_TEMPORARY &&
+      if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
+           reg->Register.File == TGSI_FILE_INPUT ||
+           reg->Register.File == TGSI_FILE_OUTPUT) &&
           reg->Register.Indirect)
          return TRUE;
    }
@@ -1746,7 +1740,7 @@ emit_instruction(
    unsigned chan_index;
 
    /* we can't handle indirect addressing into temp register file yet */
-   if (indirect_temp_reference(inst))
+   if (indirect_reg_reference(inst))
       return FALSE;
 
    switch (inst->Instruction.Opcode) {
@@ -1929,20 +1923,32 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_MUL:
+      /* do all fetches and multiplies, storing results in temp regs */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
+         int r = chan_index + 1;
+         FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
+         FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
+         emit_mul( func, r, 0 );   /* xmm[r] = xmm[r] * xmm[0] */
+      }
+      /* do all stores of the temp regs */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         int r = chan_index + 1;
+         STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
       }
       break;
 
    case TGSI_OPCODE_ADD:
+      /* do all fetches and adds, storing results in temp regs */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_add( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
+         int r = chan_index + 1;
+         FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
+         FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
+         emit_add( func, r, 0 );   /* xmm[r] = xmm[r] + xmm[0] */
+      }
+      /* do all stores of the temp regs */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         int r = chan_index + 1;
+         STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
       }
       break;
 
@@ -2697,8 +2703,7 @@ static void aos_to_soa( struct x86_function *func,
    struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
    struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
    struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
-   int inner_loop;
-
+   int loop_top, loop_exit_fixup;
 
    /* Save EBX */
    x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
@@ -2711,8 +2716,11 @@ static void aos_to_soa( struct x86_function *func,
    x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
    x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
 
-   /* do */
-   inner_loop = x86_get_label( func );
+   /* while (num_inputs != 0) */
+   loop_top = x86_get_label( func );
+   x86_cmp_imm( func, num_inputs, 0 );
+   loop_exit_fixup = x86_jcc_forward( func, cc_E );
+
    {
       x86_push( func, aos_input );
       sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
@@ -2744,9 +2752,10 @@ static void aos_to_soa( struct x86_function *func,
       x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
       x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
    }
-   /* while --num_inputs */
+   /* --num_inputs */
    x86_dec( func, num_inputs );
-   x86_jcc( func, cc_NE, inner_loop );
+   x86_jmp( func, loop_top );
+   x86_fixup_fwd_jump( func, loop_exit_fixup );
 
    /* Restore EBX */
    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
@@ -2816,6 +2825,61 @@ static void soa_to_aos( struct x86_function *func,
    x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
 }
 
+
+/**
+ * Check if the instruction's dst register is the same as any src
+ * register and warn if there's a possible SOA dependency.
+ */
+static boolean
+check_soa_dependencies(const struct tgsi_full_instruction *inst)
+{
+   uint opcode = inst->Instruction.Opcode;
+
+   /* XXX: we only handle src/dst aliasing in a few opcodes currently.
+    * Need to use an additional temporary to hold the result in the
+    * cases where the code is too opaque to fix.
+    */
+
+   switch (opcode) {
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_DP3:
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DP2A:
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_XPD:
+   case TGSI_OPCODE_DPH:
+   case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_SIN:
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_NRM:
+   case TGSI_OPCODE_NRM4:
+   case TGSI_OPCODE_DP2:
+      /* OK - these opcodes correctly handle SOA dependencies */
+      return TRUE;
+   default:
+      if (!tgsi_check_soa_dependencies(inst))
+         return TRUE;
+
+      debug_printf("Warning: src/dst aliasing in instruction"
+                   " is not handled:\n");
+      debug_printf("Warning: ");
+      tgsi_dump_instruction(inst, 1);
+
+      return FALSE;
+   }
+}
+
+
 /**
  * Translate a TGSI vertex/fragment shader to SSE2 code.
  * Slightly different things are done for vertex vs. fragment shaders.
@@ -2885,7 +2949,6 @@ tgsi_emit_sse2(
       x86_make_disp( get_machine_base(),
                      Offset( struct tgsi_exec_machine, Samplers ) ) );
 
-
    while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
       tgsi_parse_token( &parse );
 
@@ -2905,27 +2968,15 @@ tgsi_emit_sse2(
 
 	 if (!ok) {
             uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
+            uint proc = parse.FullHeader.Processor.Processor;
 	    debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n", 
 			 opcode,
                          tgsi_get_opcode_name(opcode),
-                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
-                         "vertex shader" : "fragment shader");
+                         tgsi_get_processor_name(proc));
 	 }
 
-         if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) {
-            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
-
-            /* XXX: we only handle src/dst aliasing in a few opcodes
-             * currently.  Need to use an additional temporay to hold
-             * the result in the cases where the code is too opaque to
-             * fix.
-             */
-            if (opcode != TGSI_OPCODE_MOV) {
-               debug_printf("Warning: src/dst aliasing in instruction"
-                            " is not handled:\n");
-               tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1);
-            }
-         }
+         if (ok)
+            ok = check_soa_dependencies(&parse.FullToken.FullInstruction);
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
@@ -2982,4 +3033,3 @@ tgsi_emit_sse2(
 }
 
 #endif /* PIPE_ARCH_X86 */
-
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
index d81ee3d00ec..00aa8b84fe9 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
@@ -32,9 +32,12 @@
 extern "C" {
 #endif
 
+#include "pipe/p_compiler.h"
+
+struct tgsi_exec_machine;
+struct tgsi_interp_coef;
 struct tgsi_token;
 struct x86_function;
-struct tgsi_interp_coef;
 
 unsigned
 tgsi_emit_sse2(
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 0b468a9184e..b01d2ff4689 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -58,7 +58,7 @@ static boolean is_digit_alpha_underscore( const char *cur )
 static char uprcase( char c )
 {
    if (c >= 'a' && c <= 'z')
-      return c += 'A' - 'a';
+      return c + 'A' - 'a';
    return c;
 }
 
@@ -138,6 +138,7 @@ static boolean parse_identifier( const char **pcur, char *ret )
       ret[i++] = *cur++;
       while (is_alpha_underscore( cur ))
          ret[i++] = *cur++;
+      ret[i++] = '\0';
       *pcur = cur;
       return TRUE;
    }
@@ -278,9 +279,10 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "SAMP",
    "ADDR",
    "IMM",
-   "LOOP",
    "PRED",
-   "SV"
+   "SV",
+   "IMMX",
+   "TEMPX"
 };
 
 static boolean
@@ -345,12 +347,68 @@ parse_opt_writemask(
    return TRUE;
 }
 
+
+/* <register_file_bracket> ::= <file> `['
+ */
+static boolean
+parse_register_file_bracket(
+   struct translate_ctx *ctx,
+   uint *file )
+{
+   if (!parse_file( &ctx->cur, file )) {
+      report_error( ctx, "Unknown register file" );
+      return FALSE;
+   }
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '[') {
+      report_error( ctx, "Expected `['" );
+      return FALSE;
+   }
+   ctx->cur++;
+   return TRUE;
+}
+
+/* <register_file_bracket_index> ::= <register_file_bracket> <uint>
+ */
+static boolean
+parse_register_file_bracket_index(
+   struct translate_ctx *ctx,
+   uint *file,
+   int *index )
+{
+   uint uindex;
+
+   if (!parse_register_file_bracket( ctx, file ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   if (!parse_uint( &ctx->cur, &uindex )) {
+      report_error( ctx, "Expected literal unsigned integer" );
+      return FALSE;
+   }
+   *index = (int) uindex;
+   return TRUE;
+}
+
+/* Parse simple 1d register operand.
+ *    <register_1d> ::= <register_file_bracket_index> `]'
+ */
 static boolean
-parse_register_dst( struct translate_ctx *ctx,
-                    uint *file,
-                    int *index );
+parse_register_1d(struct translate_ctx *ctx,
+                  uint *file,
+                  int *index )
+{
+   if (!parse_register_file_bracket_index( ctx, file, index ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != ']') {
+      report_error( ctx, "Expected `]'" );
+      return FALSE;
+   }
+   ctx->cur++;
+   return TRUE;
+}
 
-struct parsed_src_bracket {
+struct parsed_bracket {
    int index;
 
    uint ind_file;
@@ -360,21 +418,21 @@ struct parsed_src_bracket {
 
 
 static boolean
-parse_register_src_bracket(
+parse_register_bracket(
    struct translate_ctx *ctx,
-   struct parsed_src_bracket *brackets)
+   struct parsed_bracket *brackets)
 {
    const char *cur;
    uint uindex;
 
-   memset(brackets, 0, sizeof(struct parsed_src_bracket));
+   memset(brackets, 0, sizeof(struct parsed_bracket));
 
    eat_opt_white( &ctx->cur );
 
    cur = ctx->cur;
    if (parse_file( &cur, &brackets->ind_file )) {
-      if (!parse_register_dst( ctx, &brackets->ind_file,
-                               &brackets->ind_index ))
+      if (!parse_register_1d( ctx, &brackets->ind_file,
+                              &brackets->ind_index ))
          return FALSE;
       eat_opt_white( &ctx->cur );
 
@@ -443,7 +501,7 @@ parse_register_src_bracket(
 static boolean
 parse_opt_register_src_bracket(
    struct translate_ctx *ctx,
-   struct parsed_src_bracket *brackets,
+   struct parsed_bracket *brackets,
    int *parsed_brackets)
 {
    const char *cur = ctx->cur;
@@ -455,7 +513,7 @@ parse_opt_register_src_bracket(
       ++cur;
       ctx->cur = cur;
 
-      if (!parse_register_src_bracket(ctx, brackets))
+      if (!parse_register_bracket(ctx, brackets))
          return FALSE;
 
       *parsed_brackets = 1;
@@ -464,46 +522,6 @@ parse_opt_register_src_bracket(
    return TRUE;
 }
 
-/* <register_file_bracket> ::= <file> `['
- */
-static boolean
-parse_register_file_bracket(
-   struct translate_ctx *ctx,
-   uint *file )
-{
-   if (!parse_file( &ctx->cur, file )) {
-      report_error( ctx, "Unknown register file" );
-      return FALSE;
-   }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '[') {
-      report_error( ctx, "Expected `['" );
-      return FALSE;
-   }
-   ctx->cur++;
-   return TRUE;
-}
-
-/* <register_file_bracket_index> ::= <register_file_bracket> <uint>
- */
-static boolean
-parse_register_file_bracket_index(
-   struct translate_ctx *ctx,
-   uint *file,
-   int *index )
-{
-   uint uindex;
-
-   if (!parse_register_file_bracket( ctx, file ))
-      return FALSE;
-   eat_opt_white( &ctx->cur );
-   if (!parse_uint( &ctx->cur, &uindex )) {
-      report_error( ctx, "Expected literal unsigned integer" );
-      return FALSE;
-   }
-   *index = (int) uindex;
-   return TRUE;
-}
 
 /* Parse source register operand.
  *    <register_src> ::= <register_file_bracket_index> `]' |
@@ -515,13 +533,12 @@ static boolean
 parse_register_src(
    struct translate_ctx *ctx,
    uint *file,
-   struct parsed_src_bracket *brackets)
+   struct parsed_bracket *brackets)
 {
-
    brackets->ind_comp = TGSI_SWIZZLE_X;
    if (!parse_register_file_bracket( ctx, file ))
       return FALSE;
-   if (!parse_register_src_bracket( ctx, brackets ))
+   if (!parse_register_bracket( ctx, brackets ))
        return FALSE;
 
    return TRUE;
@@ -629,23 +646,19 @@ parse_register_dcl(
 }
 
 
-/* Parse destination register operand.
- *    <register_dst> ::= <register_file_bracket_index> `]'
- */
+/* Parse destination register operand.*/
 static boolean
 parse_register_dst(
    struct translate_ctx *ctx,
    uint *file,
-   int *index )
+   struct parsed_bracket *brackets)
 {
-   if (!parse_register_file_bracket_index( ctx, file, index ))
-      return FALSE;
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != ']') {
-      report_error( ctx, "Expected `]'" );
+   brackets->ind_comp = TGSI_SWIZZLE_X;
+   if (!parse_register_file_bracket( ctx, file ))
       return FALSE;
-   }
-   ctx->cur++;
+   if (!parse_register_bracket( ctx, brackets ))
+       return FALSE;
+
    return TRUE;
 }
 
@@ -655,11 +668,14 @@ parse_dst_operand(
    struct tgsi_full_dst_register *dst )
 {
    uint file;
-   int index;
    uint writemask;
    const char *cur;
+   struct parsed_bracket bracket[2];
+   int parsed_opt_brackets;
 
-   if (!parse_register_dst( ctx, &file, &index ))
+   if (!parse_register_dst( ctx, &file, &bracket[0] ))
+      return FALSE;
+   if (!parse_opt_register_src_bracket(ctx, &bracket[1], &parsed_opt_brackets))
       return FALSE;
 
    cur = ctx->cur;
@@ -669,8 +685,24 @@ parse_dst_operand(
       return FALSE;
 
    dst->Register.File = file;
-   dst->Register.Index = index;
+   if (parsed_opt_brackets) {
+      dst->Register.Dimension = 1;
+      dst->Dimension.Indirect = 0;
+      dst->Dimension.Dimension = 0;
+      dst->Dimension.Index = bracket[0].index;
+      bracket[0] = bracket[1];
+   }
+   dst->Register.Index = bracket[0].index;
    dst->Register.WriteMask = writemask;
+   if (bracket[0].ind_file != TGSI_FILE_NULL) {
+      dst->Register.Indirect = 1;
+      dst->Indirect.File = bracket[0].ind_file;
+      dst->Indirect.Index = bracket[0].ind_index;
+      dst->Indirect.SwizzleX = bracket[0].ind_comp;
+      dst->Indirect.SwizzleY = bracket[0].ind_comp;
+      dst->Indirect.SwizzleZ = bracket[0].ind_comp;
+      dst->Indirect.SwizzleW = bracket[0].ind_comp;
+   }
    return TRUE;
 }
 
@@ -700,7 +732,7 @@ parse_optional_swizzle(
          else if (uprcase( *cur ) == 'W')
             swizzle[i] = TGSI_SWIZZLE_W;
          else {
-	    report_error( ctx, "Expected register swizzle component `x', `y', `z', `w', `0' or `1'" );
+	    report_error( ctx, "Expected register swizzle component `x', `y', `z' or `w'" );
 	    return FALSE;
          }
          cur++;
@@ -719,7 +751,7 @@ parse_src_operand(
    uint file;
    uint swizzle[4];
    boolean parsed_swizzle;
-   struct parsed_src_bracket bracket[2];
+   struct parsed_bracket bracket[2];
    int parsed_opt_brackets;
 
    if (*ctx->cur == '-') {
@@ -816,6 +848,45 @@ parse_instruction(
    struct tgsi_full_instruction inst;
    uint advance;
 
+   inst = tgsi_default_full_instruction();
+
+   /* Parse predicate.
+    */
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur == '(') {
+      uint file;
+      int index;
+      uint swizzle[4];
+      boolean parsed_swizzle;
+
+      inst.Instruction.Predicate = 1;
+
+      ctx->cur++;
+      if (*ctx->cur == '!') {
+         ctx->cur++;
+         inst.Predicate.Negate = 1;
+      }
+
+      if (!parse_register_1d( ctx, &file, &index ))
+         return FALSE;
+
+      if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle )) {
+         if (parsed_swizzle) {
+            inst.Predicate.SwizzleX = swizzle[0];
+            inst.Predicate.SwizzleY = swizzle[1];
+            inst.Predicate.SwizzleZ = swizzle[2];
+            inst.Predicate.SwizzleW = swizzle[3];
+         }
+      }
+
+      if (*ctx->cur != ')') {
+         report_error( ctx, "Expected `)'" );
+         return FALSE;
+      }
+
+      ctx->cur++;
+   }
+
    /* Parse instruction name.
     */
    eat_opt_white( &ctx->cur );
@@ -849,7 +920,6 @@ parse_instruction(
       return FALSE;
    }
 
-   inst = tgsi_default_full_instruction();
    inst.Instruction.Opcode = i;
    inst.Instruction.Saturate = saturate;
    inst.Instruction.NumDstRegs = info->num_dst;
@@ -947,6 +1017,45 @@ static const char *interpolate_names[TGSI_INTERPOLATE_COUNT] =
    "PERSPECTIVE"
 };
 
+
+/* parses a 4-tuple of the form {x, y, z, w}
+ * where x, y, z, w are numbers */
+static boolean parse_immediate_data(struct translate_ctx *ctx,
+                                    float *values)
+{
+   unsigned i;
+
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '{') {
+      report_error( ctx, "Expected `{'" );
+      return FALSE;
+   }
+   ctx->cur++;
+   for (i = 0; i < 4; i++) {
+      eat_opt_white( &ctx->cur );
+      if (i > 0) {
+         if (*ctx->cur != ',') {
+            report_error( ctx, "Expected `,'" );
+            return FALSE;
+         }
+         ctx->cur++;
+         eat_opt_white( &ctx->cur );
+      }
+      if (!parse_float( &ctx->cur, &values[i] )) {
+         report_error( ctx, "Expected literal floating point" );
+         return FALSE;
+      }
+   }
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '}') {
+      report_error( ctx, "Expected `}'" );
+      return FALSE;
+   }
+   ctx->cur++;
+
+   return TRUE;
+}
+
 static boolean parse_declaration( struct translate_ctx *ctx )
 {
    struct tgsi_full_declaration decl;
@@ -956,6 +1065,8 @@ static boolean parse_declaration( struct translate_ctx *ctx )
    uint writemask;
    const char *cur;
    uint advance;
+   boolean is_vs_input;
+   boolean is_imm_array;
 
    assert(Elements(semantic_names) == TGSI_SEMANTIC_COUNT);
    assert(Elements(interpolate_names) == TGSI_INTERPOLATE_COUNT);
@@ -984,9 +1095,13 @@ static boolean parse_declaration( struct translate_ctx *ctx )
       decl.Dim.Index2D = brackets[0].first;
    }
 
+   is_vs_input = (file == TGSI_FILE_INPUT &&
+                  ctx->processor == TGSI_PROCESSOR_VERTEX);
+   is_imm_array = (file == TGSI_FILE_IMMEDIATE_ARRAY);
+
    cur = ctx->cur;
    eat_opt_white( &cur );
-   if (*cur == ',') {
+   if (*cur == ',' && !is_vs_input) {
       uint i;
 
       cur++;
@@ -1025,11 +1140,49 @@ static boolean parse_declaration( struct translate_ctx *ctx )
             break;
          }
       }
+   } else if (is_imm_array) {
+      unsigned i;
+      float *vals_itr;
+      /* we have our immediate data */
+      if (*cur != '{') {
+         report_error( ctx, "Immediate array without data" );
+         return FALSE;
+      }
+      ++cur;
+      ctx->cur = cur;
+
+      decl.ImmediateData.u =
+         MALLOC(sizeof(union tgsi_immediate_data) * 4 *
+                (decl.Range.Last + 1));
+      vals_itr = (float*)decl.ImmediateData.u;
+      for (i = 0; i <= decl.Range.Last; ++i) {
+         if (!parse_immediate_data(ctx, vals_itr)) {
+            FREE(decl.ImmediateData.u);
+            return FALSE;
+         }
+         vals_itr += 4;
+         eat_opt_white( &ctx->cur );
+         if (*ctx->cur != ',') {
+            if (i !=  decl.Range.Last) {
+               report_error( ctx, "Not enough data in immediate array!" );
+               FREE(decl.ImmediateData.u);
+               return FALSE;
+            }
+         } else
+            ++ctx->cur;
+      }
+      eat_opt_white( &ctx->cur );
+      if (*ctx->cur != '}') {
+         FREE(decl.ImmediateData.u);
+         report_error( ctx, "Immediate array data missing closing '}'" );
+         return FALSE;
+      }
+      ++ctx->cur;
    }
 
    cur = ctx->cur;
    eat_opt_white( &cur );
-   if (*cur == ',') {
+   if (*cur == ',' && !is_vs_input) {
       uint i;
 
       cur++;
@@ -1055,6 +1208,10 @@ static boolean parse_declaration( struct translate_ctx *ctx )
       ctx->tokens_cur,
       ctx->header,
       (uint) (ctx->tokens_end - ctx->tokens_cur) );
+
+   if (is_imm_array)
+      FREE(decl.ImmediateData.u);
+
    if (advance == 0)
       return FALSE;
    ctx->tokens_cur += advance;
@@ -1065,7 +1222,6 @@ static boolean parse_declaration( struct translate_ctx *ctx )
 static boolean parse_immediate( struct translate_ctx *ctx )
 {
    struct tgsi_full_immediate imm;
-   uint i;
    float values[4];
    uint advance;
 
@@ -1073,37 +1229,13 @@ static boolean parse_immediate( struct translate_ctx *ctx )
       report_error( ctx, "Syntax error" );
       return FALSE;
    }
-   if (!str_match_no_case( &ctx->cur, "FLT32" ) || is_digit_alpha_underscore( ctx->cur )) {
+   if (!str_match_no_case( &ctx->cur, "FLT32" ) ||
+       is_digit_alpha_underscore( ctx->cur )) {
       report_error( ctx, "Expected `FLT32'" );
       return FALSE;
    }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '{') {
-      report_error( ctx, "Expected `{'" );
-      return FALSE;
-   }
-   ctx->cur++;
-   for (i = 0; i < 4; i++) {
-      eat_opt_white( &ctx->cur );
-      if (i > 0) {
-         if (*ctx->cur != ',') {
-            report_error( ctx, "Expected `,'" );
-            return FALSE;
-         }
-         ctx->cur++;
-         eat_opt_white( &ctx->cur );
-      }
-      if (!parse_float( &ctx->cur, &values[i] )) {
-         report_error( ctx, "Expected literal floating point" );
-         return FALSE;
-      }
-   }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '}') {
-      report_error( ctx, "Expected `}'" );
-      return FALSE;
-   }
-   ctx->cur++;
+
+   parse_immediate_data(ctx, values);
 
    imm = tgsi_default_full_immediate();
    imm.Immediate.NrTokens += 4;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index f725405ade1..7d13a17bdbc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -74,7 +74,6 @@ struct ureg_tokens {
 #define UREG_MAX_IMMEDIATE 32
 #define UREG_MAX_TEMP 256
 #define UREG_MAX_ADDR 2
-#define UREG_MAX_LOOP 1
 #define UREG_MAX_PRED 1
 
 struct const_decl {
@@ -97,7 +96,8 @@ struct ureg_program
       unsigned semantic_name;
       unsigned semantic_index;
       unsigned interp;
-      unsigned cylindrical_wrap;
+      unsigned char cylindrical_wrap;
+      unsigned char centroid;
    } fs_input[UREG_MAX_INPUT];
    unsigned nr_fs_inputs;
 
@@ -151,7 +151,6 @@ struct ureg_program
 
    unsigned nr_addrs;
    unsigned nr_preds;
-   unsigned nr_loops;
    unsigned nr_instructions;
 
    struct ureg_tokens domain[2];
@@ -288,11 +287,12 @@ ureg_property_fs_coord_pixel_center(struct ureg_program *ureg,
 
 
 struct ureg_src
-ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
+ureg_DECL_fs_input_cyl_centroid(struct ureg_program *ureg,
                        unsigned semantic_name,
                        unsigned semantic_index,
                        unsigned interp_mode,
-                       unsigned cylindrical_wrap)
+                       unsigned cylindrical_wrap,
+                       unsigned centroid)
 {
    unsigned i;
 
@@ -308,6 +308,7 @@ ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
       ureg->fs_input[i].semantic_index = semantic_index;
       ureg->fs_input[i].interp = interp_mode;
       ureg->fs_input[i].cylindrical_wrap = cylindrical_wrap;
+      ureg->fs_input[i].centroid = centroid;
       ureg->nr_fs_inputs++;
    } else {
       set_bad(ureg);
@@ -537,19 +538,6 @@ struct ureg_dst ureg_DECL_address( struct ureg_program *ureg )
    return ureg_dst_register( TGSI_FILE_ADDRESS, 0 );
 }
 
-/* Allocate a new loop register.
- */
-struct ureg_dst
-ureg_DECL_loop(struct ureg_program *ureg)
-{
-   if (ureg->nr_loops < UREG_MAX_LOOP) {
-      return ureg_dst_register(TGSI_FILE_LOOP, ureg->nr_loops++);
-   }
-
-   assert(0);
-   return ureg_dst_register(TGSI_FILE_LOOP, 0);
-}
-
 /* Allocate a new predicate register.
  */
 struct ureg_dst
@@ -747,11 +735,12 @@ ureg_DECL_immediate_int( struct ureg_program *ureg,
 }
 
 
-void 
+void
 ureg_emit_src( struct ureg_program *ureg,
                struct ureg_src src )
 {
-   unsigned size = 1 + (src.Indirect ? 1 : 0) + (src.Dimension ? 1 : 0);
+   unsigned size = 1 + (src.Indirect ? 1 : 0) +
+                   (src.Dimension ? (src.DimIndirect ? 2 : 1) : 0);
 
    union tgsi_any_token *out = get_tokens( ureg, DOMAIN_INSN, size );
    unsigned n = 0;
@@ -784,11 +773,27 @@ ureg_emit_src( struct ureg_program *ureg,
    }
 
    if (src.Dimension) {
-      out[0].src.Dimension = 1;
-      out[n].dim.Indirect = 0;
-      out[n].dim.Dimension = 0;
-      out[n].dim.Padding = 0;
-      out[n].dim.Index = src.DimensionIndex;
+      if (src.DimIndirect) {
+         out[0].src.Dimension = 1;
+         out[n].dim.Indirect = 1;
+         out[n].dim.Dimension = 0;
+         out[n].dim.Padding = 0;
+         out[n].dim.Index = src.DimensionIndex;
+         n++;
+         out[n].value = 0;
+         out[n].src.File = src.DimIndFile;
+         out[n].src.SwizzleX = src.DimIndSwizzle;
+         out[n].src.SwizzleY = src.DimIndSwizzle;
+         out[n].src.SwizzleZ = src.DimIndSwizzle;
+         out[n].src.SwizzleW = src.DimIndSwizzle;
+         out[n].src.Index = src.DimIndIndex;
+      } else {
+         out[0].src.Dimension = 1;
+         out[n].dim.Indirect = 0;
+         out[n].dim.Dimension = 0;
+         out[n].dim.Padding = 0;
+         out[n].dim.Index = src.DimensionIndex;
+      }
       n++;
    }
 
@@ -1124,7 +1129,8 @@ emit_decl_fs(struct ureg_program *ureg,
              unsigned semantic_name,
              unsigned semantic_index,
              unsigned interpolate,
-             unsigned cylindrical_wrap)
+             unsigned cylindrical_wrap,
+             unsigned centroid)
 {
    union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);
 
@@ -1136,6 +1142,7 @@ emit_decl_fs(struct ureg_program *ureg,
    out[0].decl.Interpolate = interpolate;
    out[0].decl.Semantic = 1;
    out[0].decl.CylindricalWrap = cylindrical_wrap;
+   out[0].decl.Centroid = centroid;
 
    out[1].value = 0;
    out[1].decl_range.First = index;
@@ -1251,7 +1258,7 @@ static void emit_decls( struct ureg_program *ureg )
       assert(ureg->processor == TGSI_PROCESSOR_GEOMETRY);
 
       emit_property(ureg,
-                    TGSI_PROPERTY_GS_MAX_VERTICES,
+                    TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES,
                     ureg->property_gs_max_vertices);
    }
 
@@ -1285,7 +1292,8 @@ static void emit_decls( struct ureg_program *ureg )
                       ureg->fs_input[i].semantic_name,
                       ureg->fs_input[i].semantic_index,
                       ureg->fs_input[i].interp,
-                      ureg->fs_input[i].cylindrical_wrap);
+                      ureg->fs_input[i].cylindrical_wrap,
+                      ureg->fs_input[i].centroid);
       }
    } else {
       for (i = 0; i < ureg->nr_gs_inputs; i++) {
@@ -1356,13 +1364,6 @@ static void emit_decls( struct ureg_program *ureg )
                        0, ureg->nr_addrs );
    }
 
-   if (ureg->nr_loops) {
-      emit_decl_range(ureg,
-                      TGSI_FILE_LOOP,
-                      0,
-                      ureg->nr_loops);
-   }
-
    if (ureg->nr_preds) {
       emit_decl_range(ureg,
                       TGSI_FILE_PREDICATE,
@@ -1489,6 +1490,12 @@ const struct tgsi_token *ureg_get_tokens( struct ureg_program *ureg,
 }
 
 
+void ureg_free_tokens( const struct tgsi_token *tokens )
+{
+   FREE((struct tgsi_token *)tokens);
+}
+
+
 struct ureg_program *ureg_create( unsigned processor )
 {
    struct ureg_program *ureg = CALLOC_STRUCT( ureg_program );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 0130a77aadb..acc463200a6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -49,14 +49,18 @@ struct ureg_src
    unsigned SwizzleZ    : 2;  /* TGSI_SWIZZLE_ */
    unsigned SwizzleW    : 2;  /* TGSI_SWIZZLE_ */
    unsigned Indirect    : 1;  /* BOOL */
+   unsigned DimIndirect : 1;  /* BOOL */
    unsigned Dimension   : 1;  /* BOOL */
    unsigned Absolute    : 1;  /* BOOL */
    unsigned Negate      : 1;  /* BOOL */
    int      Index       : 16; /* SINT */
-   unsigned IndirectFile    : 4;  /* TGSI_FILE_ */
-   int      IndirectIndex   : 16; /* SINT */
-   unsigned IndirectSwizzle : 2;  /* TGSI_SWIZZLE_ */
-   int      DimensionIndex  : 16; /* SINT */
+   unsigned IndirectFile     : 4;  /* TGSI_FILE_ */
+   int      IndirectIndex    : 16; /* SINT */
+   unsigned IndirectSwizzle  : 2;  /* TGSI_SWIZZLE_ */
+   int      DimensionIndex   : 16; /* SINT */
+   unsigned DimIndFile       : 4;  /* TGSI_FILE_ */
+   int      DimIndIndex      : 16; /* SINT */
+   unsigned DimIndSwizzle    : 2;  /* TGSI_SWIZZLE_ */
 };
 
 /* Very similar to a tgsi_dst_register, removing unsupported fields
@@ -104,6 +108,10 @@ ureg_get_tokens( struct ureg_program *ureg,
                  unsigned *nr_tokens );
 
 
+/* Free the tokens created by ureg_get_tokens() */
+void ureg_free_tokens( const struct tgsi_token *tokens );
+
+
 void 
 ureg_destroy( struct ureg_program * );
 
@@ -150,11 +158,27 @@ ureg_property_fs_coord_pixel_center(struct ureg_program *ureg,
  */
 
 struct ureg_src
-ureg_DECL_fs_input_cyl(struct ureg_program *,
+ureg_DECL_fs_input_cyl_centroid(struct ureg_program *,
                        unsigned semantic_name,
                        unsigned semantic_index,
                        unsigned interp_mode,
-                       unsigned cylindrical_wrap);
+                       unsigned cylindrical_wrap,
+                       unsigned centroid);
+
+static INLINE struct ureg_src
+ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
+                       unsigned semantic_name,
+                       unsigned semantic_index,
+                       unsigned interp_mode,
+                       unsigned cylindrical_wrap)
+{
+   return ureg_DECL_fs_input_cyl_centroid(ureg,
+                                 semantic_name,
+                                 semantic_index,
+                                 interp_mode,
+                                 cylindrical_wrap,
+                                 0);
+}
 
 static INLINE struct ureg_src
 ureg_DECL_fs_input(struct ureg_program *ureg,
@@ -162,11 +186,11 @@ ureg_DECL_fs_input(struct ureg_program *ureg,
                    unsigned semantic_index,
                    unsigned interp_mode)
 {
-   return ureg_DECL_fs_input_cyl(ureg,
+   return ureg_DECL_fs_input_cyl_centroid(ureg,
                                  semantic_name,
                                  semantic_index,
                                  interp_mode,
-                                 0);
+                                 0, 0);
 }
 
 struct ureg_src
@@ -231,9 +255,6 @@ struct ureg_dst
 ureg_DECL_address( struct ureg_program * );
 
 struct ureg_dst
-ureg_DECL_loop( struct ureg_program * );
-
-struct ureg_dst
 ureg_DECL_predicate(struct ureg_program *);
 
 /* Supply an index to the sampler declaration as this is the hook to
@@ -820,15 +841,31 @@ ureg_src_indirect( struct ureg_src reg, struct ureg_src addr )
    return reg;
 }
 
-static INLINE struct ureg_src 
+static INLINE struct ureg_src
 ureg_src_dimension( struct ureg_src reg, int index )
 {
    assert(reg.File != TGSI_FILE_NULL);
    reg.Dimension = 1;
+   reg.DimIndirect = 0;
    reg.DimensionIndex = index;
    return reg;
 }
 
+
+static INLINE struct ureg_src
+ureg_src_dimension_indirect( struct ureg_src reg, struct ureg_src addr,
+                             int index )
+{
+   assert(reg.File != TGSI_FILE_NULL);
+   reg.Dimension = 1;
+   reg.DimIndirect = 1;
+   reg.DimensionIndex = index;
+   reg.DimIndFile = addr.File;
+   reg.DimIndIndex = addr.Index;
+   reg.DimIndSwizzle = addr.SwizzleX;
+   return reg;
+}
+
 static INLINE struct ureg_dst
 ureg_dst( struct ureg_src src )
 {
@@ -873,6 +910,10 @@ ureg_src_register(unsigned file,
    src.Negate = 0;
    src.Dimension = 0;
    src.DimensionIndex = 0;
+   src.DimIndirect = 0;
+   src.DimIndFile = TGSI_FILE_NULL;
+   src.DimIndIndex = 0;
+   src.DimIndSwizzle = 0;
 
    return src;
 }
@@ -896,6 +937,10 @@ ureg_src( struct ureg_dst dst )
    src.Negate    = 0;
    src.Dimension = 0;
    src.DimensionIndex = 0;
+   src.DimIndirect = 0;
+   src.DimIndFile = TGSI_FILE_NULL;
+   src.DimIndIndex = 0;
+   src.DimIndSwizzle = 0;
 
    return src;
 }
@@ -943,7 +988,11 @@ ureg_src_undef( void )
    src.Negate    = 0;
    src.Dimension = 0;
    src.DimensionIndex = 0;
-   
+   src.DimIndirect = 0;
+   src.DimIndFile = TGSI_FILE_NULL;
+   src.DimIndIndex = 0;
+   src.DimIndSwizzle = 0;
+
    return src;
 }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 0a7e4105a80..08e7e89bd67 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -163,3 +163,150 @@ tgsi_util_set_full_src_register_sign_mode(
       assert( 0 );
    }
 }
+
+/**
+ * Return the mask of channels of source operand src_idx that inst actually
+ * reads, based on the opcode, the dst write mask and the operand's swizzle.
+ */
+unsigned
+tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst,
+                              unsigned src_idx)
+{
+   const struct tgsi_full_src_register *src = &inst->Src[src_idx];
+   unsigned write_mask = inst->Dst[0].Register.WriteMask;
+   unsigned read_mask;   /* channels read, before the swizzle is applied */
+   unsigned usage_mask;  /* channels read, after the swizzle is applied */
+   unsigned chan;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_MOV:
+   case TGSI_OPCODE_ARL:
+   case TGSI_OPCODE_ARR:
+   case TGSI_OPCODE_MUL:
+   case TGSI_OPCODE_DIV:
+   case TGSI_OPCODE_ADD:
+   case TGSI_OPCODE_MIN:
+   case TGSI_OPCODE_MAX:
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_SGE:
+   case TGSI_OPCODE_MAD:
+   case TGSI_OPCODE_SUB:
+   case TGSI_OPCODE_LRP:
+   case TGSI_OPCODE_CND:
+   case TGSI_OPCODE_FRC:
+   case TGSI_OPCODE_CEIL:
+   case TGSI_OPCODE_CLAMP:
+   case TGSI_OPCODE_FLR:
+   case TGSI_OPCODE_ROUND:
+   case TGSI_OPCODE_ABS:
+   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDY:
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SGT:
+   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SSG:
+   case TGSI_OPCODE_CMP:
+   case TGSI_OPCODE_TRUNC:
+   case TGSI_OPCODE_NOT:
+   case TGSI_OPCODE_AND:
+   case TGSI_OPCODE_OR:
+   case TGSI_OPCODE_XOR:
+   case TGSI_OPCODE_SAD:
+      /* Channel-wise operations */
+      read_mask = write_mask;
+      break;
+
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_RCC:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_SIN:
+      read_mask = TGSI_WRITEMASK_X; /* scalar ops: only X of each source is read */
+      break;
+
+   case TGSI_OPCODE_SCS:
+      read_mask = write_mask & TGSI_WRITEMASK_XY ? TGSI_WRITEMASK_X : 0;
+      break;
+
+   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_LOG:
+      read_mask = write_mask & TGSI_WRITEMASK_XYZ ? TGSI_WRITEMASK_X : 0;
+      break;
+
+   case TGSI_OPCODE_DP2A:
+      read_mask = src_idx == 2 ? TGSI_WRITEMASK_X : TGSI_WRITEMASK_XY;
+      break;
+
+   case TGSI_OPCODE_DP2:
+      read_mask = TGSI_WRITEMASK_XY;
+      break;
+
+   case TGSI_OPCODE_DP3:
+      read_mask = TGSI_WRITEMASK_XYZ;
+      break;
+
+   case TGSI_OPCODE_DP4:
+      read_mask = TGSI_WRITEMASK_XYZW;
+      break;
+
+   case TGSI_OPCODE_DPH:
+      read_mask = src_idx == 0 ? TGSI_WRITEMASK_XYZ : TGSI_WRITEMASK_XYZW;
+      break;
+
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXD:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXP:
+      if (src_idx == 0) {
+         /* Note that the SHADOW variants use the Z component too */
+         switch (inst->Texture.Texture) {
+         case TGSI_TEXTURE_1D:
+            read_mask = TGSI_WRITEMASK_X;
+            break;
+         case TGSI_TEXTURE_SHADOW1D:
+            read_mask = TGSI_WRITEMASK_XZ;
+            break;
+         case TGSI_TEXTURE_2D:
+         case TGSI_TEXTURE_RECT:
+            read_mask = TGSI_WRITEMASK_XY;
+            break;
+         case TGSI_TEXTURE_SHADOW2D:
+         case TGSI_TEXTURE_SHADOWRECT:
+         case TGSI_TEXTURE_3D:
+         case TGSI_TEXTURE_CUBE:
+            read_mask = TGSI_WRITEMASK_XYZ;
+            break;
+
+         default:
+            assert(0);
+            read_mask = 0;
+         }
+
+         if (inst->Instruction.Opcode != TGSI_OPCODE_TEX) {
+            read_mask |= TGSI_WRITEMASK_W; /* every non-TEX opcode in this group also reads W */
+         }
+      } else {
+         /* A safe approximation */
+         read_mask = TGSI_WRITEMASK_XYZW;
+      }
+      break;
+
+   default:
+      /* Assume all channels are read */
+      read_mask = TGSI_WRITEMASK_XYZW;
+      break;
+   }
+
+   usage_mask = 0;
+   for (chan = 0; chan < 4; ++chan) {
+      if (read_mask & (1 << chan)) {
+         usage_mask |= 1 << tgsi_util_get_full_src_register_swizzle(src, chan); /* map through the swizzle */
+      }
+   }
+
+   return usage_mask;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h
index 19ee2e7cf2a..04702ba9826 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -34,6 +34,7 @@ extern "C" {
 
 struct tgsi_src_register;
 struct tgsi_full_src_register;
+struct tgsi_full_instruction;
 
 void *
 tgsi_align_128bit(
@@ -71,6 +72,10 @@ tgsi_util_set_full_src_register_sign_mode(
    struct tgsi_full_src_register *reg,
    unsigned sign_mode );
 
+unsigned
+tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst,
+                              unsigned src_idx);
+
 #if defined __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c
index a9b7253bf44..73287b667db 100644
--- a/src/gallium/auxiliary/translate/translate.c
+++ b/src/gallium/auxiliary/translate/translate.c
@@ -38,7 +38,7 @@ struct translate *translate_create( const struct translate_key *key )
 {
    struct translate *translate = NULL;
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    translate = translate_sse2_create( key );
    if (translate)
       return translate;
@@ -48,3 +48,8 @@ struct translate *translate_create( const struct translate_key *key )
 
    return translate_generic_create( key );
 }
+
+boolean translate_is_output_format_supported(enum pipe_format format)
+{
+   return translate_generic_is_output_format_supported(format); /* only the generic backend is consulted */
+}
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index edd95e07882..a75380228b1 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -85,6 +85,18 @@ struct translate {
                                 unsigned instance_id,
                                 void *output_buffer);
 
+   void (PIPE_CDECL *run_elts16)( struct translate *,
+                                const uint16_t *elts,
+                                unsigned count,
+                                unsigned instance_id,
+                                void *output_buffer);
+
+   void (PIPE_CDECL *run_elts8)( struct translate *,
+                                const uint8_t *elts,
+                                unsigned count,
+                                unsigned instance_id,
+                                void *output_buffer);
+
    void (PIPE_CDECL *run)( struct translate *,
                            unsigned start,
                            unsigned count,
@@ -105,6 +117,8 @@ struct translate *translate_lookup_or_create( struct translate_context *tctx,
 
 struct translate *translate_create( const struct translate_key *key );
 
+boolean translate_is_output_format_supported(enum pipe_format format);
+
 static INLINE int translate_keysize( const struct translate_key *key )
 {
    return 2 * sizeof(int) + key->nr_elements * sizeof(struct translate_element);
@@ -138,5 +152,6 @@ struct translate *translate_sse2_create( const struct translate_key *key );
 
 struct translate *translate_generic_create( const struct translate_key *key );
 
+boolean translate_generic_is_output_format_supported(enum pipe_format format);
 
 #endif
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index a9272fbb491..ad809db720d 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -31,6 +31,7 @@
   */
 
 #include "util/u_memory.h"
+#include "util/u_format.h"
 #include "util/u_math.h"
 #include "pipe/p_state.h"
 #include "translate.h"
@@ -38,7 +39,9 @@
 
 #define DRAW_DBG 0
 
-typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*fetch_func)(float *dst,
+                           const uint8_t *src,
+                           unsigned i, unsigned j);
 typedef void (*emit_func)(const float *attrib, void *ptr);
 
 
@@ -57,10 +60,18 @@ struct translate_generic {
       emit_func emit;
       unsigned output_offset;
       
-      char *input_ptr;
+      const uint8_t *input_ptr;
       unsigned input_stride;
       unsigned max_index;
 
+      /* this value is set to -1 if this is a normal element with output_format != input_format:
+       * in this case, u_format is used to do a full conversion
+       *
+       * this value is set to the format size in bytes if output_format == input_format or for 32-bit instance ids:
+       * in this case, memcpy is used to copy this amount of bytes
+       */
+      int copy_size;
+
    } attrib[PIPE_MAX_ATTRIBS];
 
    unsigned nr_attrib;
@@ -79,22 +90,7 @@ static struct translate_generic *translate_generic( struct translate *translate
  * This is probably needed/dupliocated elsewhere, eg format
  * conversion, texture sampling etc.
  */
-#define ATTRIB( NAME, SZ, TYPE, FROM, TO )		\
-static void						\
-fetch_##NAME(const void *ptr, float *attrib)		\
-{							\
-   const float defaults[4] = { 0.0f,0.0f,0.0f,1.0f };	\
-   unsigned i;						\
-							\
-   for (i = 0; i < SZ; i++) {				\
-      attrib[i] = FROM(i);				\
-   }							\
-							\
-   for (; i < 4; i++) {					\
-      attrib[i] = defaults[i];				\
-   }							\
-}							\
-							\
+#define ATTRIB( NAME, SZ, TYPE, TO )	    	        \
 static void						\
 emit_##NAME(const float *attrib, void *ptr)		\
 {  \
@@ -107,27 +103,6 @@ emit_##NAME(const float *attrib, void *ptr)		\
 }
 
 
-#define FROM_64_FLOAT(i)   ((float) ((double *) ptr)[i])
-#define FROM_32_FLOAT(i)   (((float *) ptr)[i])
-
-#define FROM_8_USCALED(i)  ((float) ((unsigned char *) ptr)[i])
-#define FROM_16_USCALED(i) ((float) ((unsigned short *) ptr)[i])
-#define FROM_32_USCALED(i) ((float) ((unsigned int *) ptr)[i])
-
-#define FROM_8_SSCALED(i)  ((float) ((char *) ptr)[i])
-#define FROM_16_SSCALED(i) ((float) ((short *) ptr)[i])
-#define FROM_32_SSCALED(i) ((float) ((int *) ptr)[i])
-
-#define FROM_8_UNORM(i)    ((float) ((unsigned char *) ptr)[i] / 255.0f)
-#define FROM_16_UNORM(i)   ((float) ((unsigned short *) ptr)[i] / 65535.0f)
-#define FROM_32_UNORM(i)   ((float) ((unsigned int *) ptr)[i] / 4294967295.0f)
-
-#define FROM_8_SNORM(i)    ((float) ((char *) ptr)[i] / 127.0f)
-#define FROM_16_SNORM(i)   ((float) ((short *) ptr)[i] / 32767.0f)
-#define FROM_32_SNORM(i)   ((float) ((int *) ptr)[i] / 2147483647.0f)
-
-#define FROM_32_FIXED(i)   (((int *) ptr)[i] / 65536.0f)
-
 #define TO_64_FLOAT(x)   ((double) x)
 #define TO_32_FLOAT(x)   (x)
 
@@ -150,94 +125,84 @@ emit_##NAME(const float *attrib, void *ptr)		\
 #define TO_32_FIXED(x)   ((int) (x * 65536.0f))
 
 
-
-ATTRIB( R64G64B64A64_FLOAT,   4, double, FROM_64_FLOAT, TO_64_FLOAT )
-ATTRIB( R64G64B64_FLOAT,      3, double, FROM_64_FLOAT, TO_64_FLOAT )
-ATTRIB( R64G64_FLOAT,         2, double, FROM_64_FLOAT, TO_64_FLOAT )
-ATTRIB( R64_FLOAT,            1, double, FROM_64_FLOAT, TO_64_FLOAT )
-
-ATTRIB( R32G32B32A32_FLOAT,   4, float, FROM_32_FLOAT, TO_32_FLOAT )
-ATTRIB( R32G32B32_FLOAT,      3, float, FROM_32_FLOAT, TO_32_FLOAT )
-ATTRIB( R32G32_FLOAT,         2, float, FROM_32_FLOAT, TO_32_FLOAT )
-ATTRIB( R32_FLOAT,            1, float, FROM_32_FLOAT, TO_32_FLOAT )
-
-ATTRIB( R32G32B32A32_USCALED, 4, unsigned, FROM_32_USCALED, TO_32_USCALED )
-ATTRIB( R32G32B32_USCALED,    3, unsigned, FROM_32_USCALED, TO_32_USCALED )
-ATTRIB( R32G32_USCALED,       2, unsigned, FROM_32_USCALED, TO_32_USCALED )
-ATTRIB( R32_USCALED,          1, unsigned, FROM_32_USCALED, TO_32_USCALED )
-
-ATTRIB( R32G32B32A32_SSCALED, 4, int, FROM_32_SSCALED, TO_32_SSCALED )
-ATTRIB( R32G32B32_SSCALED,    3, int, FROM_32_SSCALED, TO_32_SSCALED )
-ATTRIB( R32G32_SSCALED,       2, int, FROM_32_SSCALED, TO_32_SSCALED )
-ATTRIB( R32_SSCALED,          1, int, FROM_32_SSCALED, TO_32_SSCALED )
-
-ATTRIB( R32G32B32A32_UNORM, 4, unsigned, FROM_32_UNORM, TO_32_UNORM )
-ATTRIB( R32G32B32_UNORM,    3, unsigned, FROM_32_UNORM, TO_32_UNORM )
-ATTRIB( R32G32_UNORM,       2, unsigned, FROM_32_UNORM, TO_32_UNORM )
-ATTRIB( R32_UNORM,          1, unsigned, FROM_32_UNORM, TO_32_UNORM )
-
-ATTRIB( R32G32B32A32_SNORM, 4, int, FROM_32_SNORM, TO_32_SNORM )
-ATTRIB( R32G32B32_SNORM,    3, int, FROM_32_SNORM, TO_32_SNORM )
-ATTRIB( R32G32_SNORM,       2, int, FROM_32_SNORM, TO_32_SNORM )
-ATTRIB( R32_SNORM,          1, int, FROM_32_SNORM, TO_32_SNORM )
-
-ATTRIB( R16G16B16A16_USCALED, 4, ushort, FROM_16_USCALED, TO_16_USCALED )
-ATTRIB( R16G16B16_USCALED,    3, ushort, FROM_16_USCALED, TO_16_USCALED )
-ATTRIB( R16G16_USCALED,       2, ushort, FROM_16_USCALED, TO_16_USCALED )
-ATTRIB( R16_USCALED,          1, ushort, FROM_16_USCALED, TO_16_USCALED )
-
-ATTRIB( R16G16B16A16_SSCALED, 4, short, FROM_16_SSCALED, TO_16_SSCALED )
-ATTRIB( R16G16B16_SSCALED,    3, short, FROM_16_SSCALED, TO_16_SSCALED )
-ATTRIB( R16G16_SSCALED,       2, short, FROM_16_SSCALED, TO_16_SSCALED )
-ATTRIB( R16_SSCALED,          1, short, FROM_16_SSCALED, TO_16_SSCALED )
-
-ATTRIB( R16G16B16A16_UNORM, 4, ushort, FROM_16_UNORM, TO_16_UNORM )
-ATTRIB( R16G16B16_UNORM,    3, ushort, FROM_16_UNORM, TO_16_UNORM )
-ATTRIB( R16G16_UNORM,       2, ushort, FROM_16_UNORM, TO_16_UNORM )
-ATTRIB( R16_UNORM,          1, ushort, FROM_16_UNORM, TO_16_UNORM )
-
-ATTRIB( R16G16B16A16_SNORM, 4, short, FROM_16_SNORM, TO_16_SNORM )
-ATTRIB( R16G16B16_SNORM,    3, short, FROM_16_SNORM, TO_16_SNORM )
-ATTRIB( R16G16_SNORM,       2, short, FROM_16_SNORM, TO_16_SNORM )
-ATTRIB( R16_SNORM,          1, short, FROM_16_SNORM, TO_16_SNORM )
-
-ATTRIB( R8G8B8A8_USCALED,   4, ubyte, FROM_8_USCALED, TO_8_USCALED )
-ATTRIB( R8G8B8_USCALED,     3, ubyte, FROM_8_USCALED, TO_8_USCALED )
-ATTRIB( R8G8_USCALED,       2, ubyte, FROM_8_USCALED, TO_8_USCALED )
-ATTRIB( R8_USCALED,         1, ubyte, FROM_8_USCALED, TO_8_USCALED )
-
-ATTRIB( R8G8B8A8_SSCALED,  4, char, FROM_8_SSCALED, TO_8_SSCALED )
-ATTRIB( R8G8B8_SSCALED,    3, char, FROM_8_SSCALED, TO_8_SSCALED )
-ATTRIB( R8G8_SSCALED,      2, char, FROM_8_SSCALED, TO_8_SSCALED )
-ATTRIB( R8_SSCALED,        1, char, FROM_8_SSCALED, TO_8_SSCALED )
-
-ATTRIB( R8G8B8A8_UNORM,  4, ubyte, FROM_8_UNORM, TO_8_UNORM )
-ATTRIB( R8G8B8_UNORM,    3, ubyte, FROM_8_UNORM, TO_8_UNORM )
-ATTRIB( R8G8_UNORM,      2, ubyte, FROM_8_UNORM, TO_8_UNORM )
-ATTRIB( R8_UNORM,        1, ubyte, FROM_8_UNORM, TO_8_UNORM )
-
-ATTRIB( R8G8B8A8_SNORM,  4, char, FROM_8_SNORM, TO_8_SNORM )
-ATTRIB( R8G8B8_SNORM,    3, char, FROM_8_SNORM, TO_8_SNORM )
-ATTRIB( R8G8_SNORM,      2, char, FROM_8_SNORM, TO_8_SNORM )
-ATTRIB( R8_SNORM,        1, char, FROM_8_SNORM, TO_8_SNORM )
-
-ATTRIB( A8R8G8B8_UNORM,       4, ubyte, FROM_8_UNORM, TO_8_UNORM )
-/*ATTRIB( R8G8B8A8_UNORM,       4, ubyte, FROM_8_UNORM, TO_8_UNORM )*/
-
-ATTRIB( R32G32B32A32_FIXED,   4, int, FROM_32_FIXED, TO_32_FIXED )
-ATTRIB( R32G32B32_FIXED,      3, int, FROM_32_FIXED, TO_32_FIXED )
-ATTRIB( R32G32_FIXED,         2, int, FROM_32_FIXED, TO_32_FIXED )
-ATTRIB( R32_FIXED,            1, int, FROM_32_FIXED, TO_32_FIXED )
-
-
+ATTRIB( R64G64B64A64_FLOAT,   4, double, TO_64_FLOAT )
+ATTRIB( R64G64B64_FLOAT,      3, double, TO_64_FLOAT )
+ATTRIB( R64G64_FLOAT,         2, double, TO_64_FLOAT )
+ATTRIB( R64_FLOAT,            1, double, TO_64_FLOAT )
+
+ATTRIB( R32G32B32A32_FLOAT,   4, float, TO_32_FLOAT )
+ATTRIB( R32G32B32_FLOAT,      3, float, TO_32_FLOAT )
+ATTRIB( R32G32_FLOAT,         2, float, TO_32_FLOAT )
+ATTRIB( R32_FLOAT,            1, float, TO_32_FLOAT )
+
+ATTRIB( R32G32B32A32_USCALED, 4, unsigned, TO_32_USCALED )
+ATTRIB( R32G32B32_USCALED,    3, unsigned, TO_32_USCALED )
+ATTRIB( R32G32_USCALED,       2, unsigned, TO_32_USCALED )
+ATTRIB( R32_USCALED,          1, unsigned, TO_32_USCALED )
+
+ATTRIB( R32G32B32A32_SSCALED, 4, int, TO_32_SSCALED )
+ATTRIB( R32G32B32_SSCALED,    3, int, TO_32_SSCALED )
+ATTRIB( R32G32_SSCALED,       2, int, TO_32_SSCALED )
+ATTRIB( R32_SSCALED,          1, int, TO_32_SSCALED )
+
+ATTRIB( R32G32B32A32_UNORM, 4, unsigned, TO_32_UNORM )
+ATTRIB( R32G32B32_UNORM,    3, unsigned, TO_32_UNORM )
+ATTRIB( R32G32_UNORM,       2, unsigned, TO_32_UNORM )
+ATTRIB( R32_UNORM,          1, unsigned, TO_32_UNORM )
+
+ATTRIB( R32G32B32A32_SNORM, 4, int, TO_32_SNORM )
+ATTRIB( R32G32B32_SNORM,    3, int, TO_32_SNORM )
+ATTRIB( R32G32_SNORM,       2, int, TO_32_SNORM )
+ATTRIB( R32_SNORM,          1, int, TO_32_SNORM )
+
+ATTRIB( R16G16B16A16_USCALED, 4, ushort, TO_16_USCALED )
+ATTRIB( R16G16B16_USCALED,    3, ushort, TO_16_USCALED )
+ATTRIB( R16G16_USCALED,       2, ushort, TO_16_USCALED )
+ATTRIB( R16_USCALED,          1, ushort, TO_16_USCALED )
+
+ATTRIB( R16G16B16A16_SSCALED, 4, short, TO_16_SSCALED )
+ATTRIB( R16G16B16_SSCALED,    3, short, TO_16_SSCALED )
+ATTRIB( R16G16_SSCALED,       2, short, TO_16_SSCALED )
+ATTRIB( R16_SSCALED,          1, short, TO_16_SSCALED )
+
+ATTRIB( R16G16B16A16_UNORM, 4, ushort, TO_16_UNORM )
+ATTRIB( R16G16B16_UNORM,    3, ushort, TO_16_UNORM )
+ATTRIB( R16G16_UNORM,       2, ushort, TO_16_UNORM )
+ATTRIB( R16_UNORM,          1, ushort, TO_16_UNORM )
+
+ATTRIB( R16G16B16A16_SNORM, 4, short, TO_16_SNORM )
+ATTRIB( R16G16B16_SNORM,    3, short, TO_16_SNORM )
+ATTRIB( R16G16_SNORM,       2, short, TO_16_SNORM )
+ATTRIB( R16_SNORM,          1, short, TO_16_SNORM )
+
+ATTRIB( R8G8B8A8_USCALED,   4, ubyte, TO_8_USCALED )
+ATTRIB( R8G8B8_USCALED,     3, ubyte, TO_8_USCALED )
+ATTRIB( R8G8_USCALED,       2, ubyte, TO_8_USCALED )
+ATTRIB( R8_USCALED,         1, ubyte, TO_8_USCALED )
+
+ATTRIB( R8G8B8A8_SSCALED,  4, char, TO_8_SSCALED )
+ATTRIB( R8G8B8_SSCALED,    3, char, TO_8_SSCALED )
+ATTRIB( R8G8_SSCALED,      2, char, TO_8_SSCALED )
+ATTRIB( R8_SSCALED,        1, char, TO_8_SSCALED )
+
+ATTRIB( R8G8B8A8_UNORM,  4, ubyte, TO_8_UNORM )
+ATTRIB( R8G8B8_UNORM,    3, ubyte, TO_8_UNORM )
+ATTRIB( R8G8_UNORM,      2, ubyte, TO_8_UNORM )
+ATTRIB( R8_UNORM,        1, ubyte, TO_8_UNORM )
+
+ATTRIB( R8G8B8A8_SNORM,  4, char, TO_8_SNORM )
+ATTRIB( R8G8B8_SNORM,    3, char, TO_8_SNORM )
+ATTRIB( R8G8_SNORM,      2, char, TO_8_SNORM )
+ATTRIB( R8_SNORM,        1, char, TO_8_SNORM )
 
 static void
-fetch_B8G8R8A8_UNORM(const void *ptr, float *attrib)
+emit_A8R8G8B8_UNORM( const float *attrib, void *ptr)
 {
-   attrib[2] = FROM_8_UNORM(0);
-   attrib[1] = FROM_8_UNORM(1);
-   attrib[0] = FROM_8_UNORM(2);
-   attrib[3] = FROM_8_UNORM(3);
+   ubyte *out = (ubyte *)ptr;
+   out[0] = TO_8_UNORM(attrib[3]);
+   out[1] = TO_8_UNORM(attrib[0]);
+   out[2] = TO_8_UNORM(attrib[1]);
+   out[3] = TO_8_UNORM(attrib[2]);
 }
 
 static void
@@ -251,181 +216,13 @@ emit_B8G8R8A8_UNORM( const float *attrib, void *ptr)
 }
 
 static void 
-fetch_NULL( const void *ptr, float *attrib )
-{
-   attrib[0] = 0;
-   attrib[1] = 0;
-   attrib[2] = 0;
-   attrib[3] = 1;
-}
-
-static void 
 emit_NULL( const float *attrib, void *ptr )
 {
    /* do nothing is the only sensible option */
 }
 
-static fetch_func get_fetch_func( enum pipe_format format )
-{
-   switch (format) {
-   case PIPE_FORMAT_R64_FLOAT:
-      return &fetch_R64_FLOAT;
-   case PIPE_FORMAT_R64G64_FLOAT:
-      return &fetch_R64G64_FLOAT;
-   case PIPE_FORMAT_R64G64B64_FLOAT:
-      return &fetch_R64G64B64_FLOAT;
-   case PIPE_FORMAT_R64G64B64A64_FLOAT:
-      return &fetch_R64G64B64A64_FLOAT;
-
-   case PIPE_FORMAT_R32_FLOAT:
-      return &fetch_R32_FLOAT;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return &fetch_R32G32_FLOAT;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return &fetch_R32G32B32_FLOAT;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return &fetch_R32G32B32A32_FLOAT;
-
-   case PIPE_FORMAT_R32_UNORM:
-      return &fetch_R32_UNORM;
-   case PIPE_FORMAT_R32G32_UNORM:
-      return &fetch_R32G32_UNORM;
-   case PIPE_FORMAT_R32G32B32_UNORM:
-      return &fetch_R32G32B32_UNORM;
-   case PIPE_FORMAT_R32G32B32A32_UNORM:
-      return &fetch_R32G32B32A32_UNORM;
-
-   case PIPE_FORMAT_R32_USCALED:
-      return &fetch_R32_USCALED;
-   case PIPE_FORMAT_R32G32_USCALED:
-      return &fetch_R32G32_USCALED;
-   case PIPE_FORMAT_R32G32B32_USCALED:
-      return &fetch_R32G32B32_USCALED;
-   case PIPE_FORMAT_R32G32B32A32_USCALED:
-      return &fetch_R32G32B32A32_USCALED;
-
-   case PIPE_FORMAT_R32_SNORM:
-      return &fetch_R32_SNORM;
-   case PIPE_FORMAT_R32G32_SNORM:
-      return &fetch_R32G32_SNORM;
-   case PIPE_FORMAT_R32G32B32_SNORM:
-      return &fetch_R32G32B32_SNORM;
-   case PIPE_FORMAT_R32G32B32A32_SNORM:
-      return &fetch_R32G32B32A32_SNORM;
-
-   case PIPE_FORMAT_R32_SSCALED:
-      return &fetch_R32_SSCALED;
-   case PIPE_FORMAT_R32G32_SSCALED:
-      return &fetch_R32G32_SSCALED;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-      return &fetch_R32G32B32_SSCALED;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return &fetch_R32G32B32A32_SSCALED;
-
-   case PIPE_FORMAT_R16_UNORM:
-      return &fetch_R16_UNORM;
-   case PIPE_FORMAT_R16G16_UNORM:
-      return &fetch_R16G16_UNORM;
-   case PIPE_FORMAT_R16G16B16_UNORM:
-      return &fetch_R16G16B16_UNORM;
-   case PIPE_FORMAT_R16G16B16A16_UNORM:
-      return &fetch_R16G16B16A16_UNORM;
-
-   case PIPE_FORMAT_R16_USCALED:
-      return &fetch_R16_USCALED;
-   case PIPE_FORMAT_R16G16_USCALED:
-      return &fetch_R16G16_USCALED;
-   case PIPE_FORMAT_R16G16B16_USCALED:
-      return &fetch_R16G16B16_USCALED;
-   case PIPE_FORMAT_R16G16B16A16_USCALED:
-      return &fetch_R16G16B16A16_USCALED;
-
-   case PIPE_FORMAT_R16_SNORM:
-      return &fetch_R16_SNORM;
-   case PIPE_FORMAT_R16G16_SNORM:
-      return &fetch_R16G16_SNORM;
-   case PIPE_FORMAT_R16G16B16_SNORM:
-      return &fetch_R16G16B16_SNORM;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      return &fetch_R16G16B16A16_SNORM;
-
-   case PIPE_FORMAT_R16_SSCALED:
-      return &fetch_R16_SSCALED;
-   case PIPE_FORMAT_R16G16_SSCALED:
-      return &fetch_R16G16_SSCALED;
-   case PIPE_FORMAT_R16G16B16_SSCALED:
-      return &fetch_R16G16B16_SSCALED;
-   case PIPE_FORMAT_R16G16B16A16_SSCALED:
-      return &fetch_R16G16B16A16_SSCALED;
-
-   case PIPE_FORMAT_R8_UNORM:
-      return &fetch_R8_UNORM;
-   case PIPE_FORMAT_R8G8_UNORM:
-      return &fetch_R8G8_UNORM;
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      return &fetch_R8G8B8_UNORM;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return &fetch_R8G8B8A8_UNORM;
-
-   case PIPE_FORMAT_R8_USCALED:
-      return &fetch_R8_USCALED;
-   case PIPE_FORMAT_R8G8_USCALED:
-      return &fetch_R8G8_USCALED;
-   case PIPE_FORMAT_R8G8B8_USCALED:
-      return &fetch_R8G8B8_USCALED;
-   case PIPE_FORMAT_R8G8B8A8_USCALED:
-      return &fetch_R8G8B8A8_USCALED;
-
-   case PIPE_FORMAT_R8_SNORM:
-      return &fetch_R8_SNORM;
-   case PIPE_FORMAT_R8G8_SNORM:
-      return &fetch_R8G8_SNORM;
-   case PIPE_FORMAT_R8G8B8_SNORM:
-      return &fetch_R8G8B8_SNORM;
-   case PIPE_FORMAT_R8G8B8A8_SNORM:
-      return &fetch_R8G8B8A8_SNORM;
-
-   case PIPE_FORMAT_R8_SSCALED:
-      return &fetch_R8_SSCALED;
-   case PIPE_FORMAT_R8G8_SSCALED:
-      return &fetch_R8G8_SSCALED;
-   case PIPE_FORMAT_R8G8B8_SSCALED:
-      return &fetch_R8G8B8_SSCALED;
-   case PIPE_FORMAT_R8G8B8A8_SSCALED:
-      return &fetch_R8G8B8A8_SSCALED;
-
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return &fetch_B8G8R8A8_UNORM;
-
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return &fetch_A8R8G8B8_UNORM;
-
-   case PIPE_FORMAT_R32_FIXED:
-      return &fetch_R32_FIXED;
-   case PIPE_FORMAT_R32G32_FIXED:
-      return &fetch_R32G32_FIXED;
-   case PIPE_FORMAT_R32G32B32_FIXED:
-      return &fetch_R32G32B32_FIXED;
-   case PIPE_FORMAT_R32G32B32A32_FIXED:
-      return &fetch_R32G32B32A32_FIXED;
-
-   default:
-      assert(0); 
-      return &fetch_NULL;
-   }
-}
-
-
-
-
 static emit_func get_emit_func( enum pipe_format format )
 {
-   /* silence warnings */
-   (void) emit_R32G32B32A32_FIXED;
-   (void) emit_R32G32B32_FIXED;
-   (void) emit_R32G32_FIXED;
-   (void) emit_R32_FIXED;
-
    switch (format) {
    case PIPE_FORMAT_R64_FLOAT:
       return &emit_R64_FLOAT;
@@ -565,60 +362,116 @@ static emit_func get_emit_func( enum pipe_format format )
    }
 }
 
-
-
-/**
- * Fetch vertex attributes for 'count' vertices.
- */
-static void PIPE_CDECL generic_run_elts( struct translate *translate,
-                                         const unsigned *elts,
-                                         unsigned count,
+static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg,
+                                         unsigned elt,
                                          unsigned instance_id,
-                                         void *output_buffer )
+                                         void *vert )
 {
-   struct translate_generic *tg = translate_generic(translate);
-   char *vert = output_buffer;
    unsigned nr_attrs = tg->nr_attrib;
    unsigned attr;
-   unsigned i;
 
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-   for (i = 0; i < count; i++) {
-      unsigned elt = *elts++;
+   for (attr = 0; attr < nr_attrs; attr++) {
+      float data[4]; /* scratch for the fetch/emit conversion path */
+      uint8_t *dst = (uint8_t *)vert + tg->attrib[attr].output_offset;
 
-      for (attr = 0; attr < nr_attrs; attr++) {
-	 float data[4];
-         const char *src;
+      if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
+         const uint8_t *src;
          unsigned index;
-
-	 char *dst = (vert + 
-		      tg->attrib[attr].output_offset);
+         int copy_size;
 
          if (tg->attrib[attr].instance_divisor) {
             index = instance_id / tg->attrib[attr].instance_divisor;
-         } else {
+         }
+         else {
             index = elt;
          }
 
+         /* clamp to avoid going out of bounds */
          index = MIN2(index, tg->attrib[attr].max_index);
 
          src = tg->attrib[attr].input_ptr +
                tg->attrib[attr].input_stride * index;
 
-	 tg->attrib[attr].fetch( src, data );
+         copy_size = tg->attrib[attr].copy_size;
+         if(likely(copy_size >= 0))
+            memcpy(dst, src, copy_size); /* input and output formats match: raw copy */
+         else
+         {
+            tg->attrib[attr].fetch( data, src, 0, 0 );
+
+            if (0)
+               debug_printf("Fetch linear attr %d  from %p  stride %d  index %d: "
+                         " %f, %f, %f, %f \n",
+                         attr,
+                         tg->attrib[attr].input_ptr,
+                         tg->attrib[attr].input_stride,
+                         index,
+                         data[0], data[1],data[2], data[3]);
+
+            tg->attrib[attr].emit( data, dst );
+         }
+      } else {
+         if(likely(tg->attrib[attr].copy_size >= 0))
+            memcpy(dst, &instance_id, 4); /* write the raw 32-bit id to the output, not to scratch */
+         else
+         {
+            data[0] = (float)instance_id;
+            tg->attrib[attr].emit( data, dst );
+         }
+      }
+   }
+}
+
+/**
+ * Translate 'count' vertices, one per 32-bit index read from 'elts'.
+ */
+static void PIPE_CDECL generic_run_elts( struct translate *translate,
+                                         const unsigned *elts,
+                                         unsigned count,
+                                         unsigned instance_id,
+                                         void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
+
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, *elts++, instance_id, vert);
+      vert += tg->translate.key.output_stride; /* step to the next output vertex */
+   }
+}
 
-         if (0) debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
-                             i, elt, attr, data[0], data[1], data[2], data[3]);
+static void PIPE_CDECL generic_run_elts16( struct translate *translate,
+                                         const uint16_t *elts,
+                                         unsigned count,
+                                         unsigned instance_id,
+                                         void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
 
-	 tg->attrib[attr].emit( data, dst );
-      }
-      
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, *elts++, instance_id, vert);
       vert += tg->translate.key.output_stride;
    }
 }
 
+static void PIPE_CDECL generic_run_elts8( struct translate *translate,
+                                         const uint8_t *elts,
+                                         unsigned count,
+                                         unsigned instance_id,
+                                         void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
 
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, *elts++, instance_id, vert);
+      vert += tg->translate.key.output_stride;
+   }
+}
 
 static void PIPE_CDECL generic_run( struct translate *translate,
                                     unsigned start,
@@ -628,44 +481,10 @@ static void PIPE_CDECL generic_run( struct translate *translate,
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
-   unsigned nr_attrs = tg->nr_attrib;
-   unsigned attr;
    unsigned i;
 
-   /* loop over vertex attributes (vertex shader inputs)
-    */
    for (i = 0; i < count; i++) {
-      unsigned elt = start + i;
-
-      for (attr = 0; attr < nr_attrs; attr++) {
-	 float data[4];
-
-	 char *dst = (vert + 
-		      tg->attrib[attr].output_offset);
-
-         if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
-            const char *src;
-
-            if (tg->attrib[attr].instance_divisor) {
-               src = tg->attrib[attr].input_ptr +
-                     tg->attrib[attr].input_stride *
-                     (instance_id / tg->attrib[attr].instance_divisor);
-            } else {
-               src = tg->attrib[attr].input_ptr +
-                     tg->attrib[attr].input_stride * elt;
-            }
-
-            tg->attrib[attr].fetch( src, data );
-         } else {
-            data[0] = (float)instance_id;
-         }
-
-         if (0) debug_printf("vert %d attr %d: %f %f %f %f\n",
-                             i, attr, data[0], data[1], data[2], data[3]);
-
-	 tg->attrib[attr].emit( data, dst );
-      }
-      
+      generic_run_one(tg, start + i, instance_id, vert);
       vert += tg->translate.key.output_stride;
    }
 }
@@ -683,7 +502,7 @@ static void generic_set_buffer( struct translate *translate,
 
    for (i = 0; i < tg->nr_attrib; i++) {
       if (tg->attrib[i].buffer == buf) {
-	 tg->attrib[i].input_ptr = ((char *)ptr +
+	 tg->attrib[i].input_ptr = ((const uint8_t *)ptr +
 				    tg->attrib[i].input_offset);
 	 tg->attrib[i].input_stride = stride;
          tg->attrib[i].max_index = max_index;
@@ -711,19 +530,46 @@ struct translate *translate_generic_create( const struct translate_key *key )
    tg->translate.release = generic_release;
    tg->translate.set_buffer = generic_set_buffer;
    tg->translate.run_elts = generic_run_elts;
+   tg->translate.run_elts16 = generic_run_elts16;
+   tg->translate.run_elts8 = generic_run_elts8;
    tg->translate.run = generic_run;
 
    for (i = 0; i < key->nr_elements; i++) {
+      const struct util_format_description *format_desc =
+            util_format_description(key->element[i].input_format);
+
+      assert(format_desc);
+      assert(format_desc->fetch_rgba_float);
+
       tg->attrib[i].type = key->element[i].type;
 
-      tg->attrib[i].fetch = get_fetch_func(key->element[i].input_format);
+      tg->attrib[i].fetch = format_desc->fetch_rgba_float;
       tg->attrib[i].buffer = key->element[i].input_buffer;
       tg->attrib[i].input_offset = key->element[i].input_offset;
       tg->attrib[i].instance_divisor = key->element[i].instance_divisor;
 
-      tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
       tg->attrib[i].output_offset = key->element[i].output_offset;
 
+      tg->attrib[i].copy_size = -1;
+      if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID)
+      {
+            if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED
+                  || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED)
+               tg->attrib[i].copy_size = 4;
+      }
+      else
+      {
+         if(key->element[i].input_format == key->element[i].output_format
+               && format_desc->block.width == 1
+               && format_desc->block.height == 1
+               && !(format_desc->block.bits & 7))
+            tg->attrib[i].copy_size = format_desc->block.bits >> 3;
+      }
+
+      if(tg->attrib[i].copy_size < 0)
+	      tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
+      else
+	      tg->attrib[i].emit  = NULL;
    }
 
    tg->nr_attrib = key->nr_elements;
@@ -731,3 +577,83 @@ struct translate *translate_generic_create( const struct translate_key *key )
 
    return &tg->translate;
 }
+
+/**
+ * Tell whether \p format is one of the output formats listed below.
+ *
+ * \return TRUE for every format named in the switch, FALSE for any other
+ *         value.
+ */
+boolean translate_generic_is_output_format_supported(enum pipe_format format)
+{
+   switch(format)
+   {
+   /* 64-bit float */
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+   case PIPE_FORMAT_R64G64_FLOAT:
+   case PIPE_FORMAT_R64_FLOAT:
+      return TRUE;
+
+   /* 32-bit float */
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+   case PIPE_FORMAT_R32G32_FLOAT:
+   case PIPE_FORMAT_R32_FLOAT:
+      return TRUE;
+
+   /* 32-bit integer channels */
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+   case PIPE_FORMAT_R32G32B32_USCALED:
+   case PIPE_FORMAT_R32G32_USCALED:
+   case PIPE_FORMAT_R32_USCALED:
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+   case PIPE_FORMAT_R32G32_SSCALED:
+   case PIPE_FORMAT_R32_SSCALED:
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+   case PIPE_FORMAT_R32G32B32_UNORM:
+   case PIPE_FORMAT_R32G32_UNORM:
+   case PIPE_FORMAT_R32_UNORM:
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+   case PIPE_FORMAT_R32G32B32_SNORM:
+   case PIPE_FORMAT_R32G32_SNORM:
+   case PIPE_FORMAT_R32_SNORM:
+      return TRUE;
+
+   /* 16-bit integer channels */
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+   case PIPE_FORMAT_R16G16B16_USCALED:
+   case PIPE_FORMAT_R16G16_USCALED:
+   case PIPE_FORMAT_R16_USCALED:
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+   case PIPE_FORMAT_R16G16_SSCALED:
+   case PIPE_FORMAT_R16_SSCALED:
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+   case PIPE_FORMAT_R16G16B16_UNORM:
+   case PIPE_FORMAT_R16G16_UNORM:
+   case PIPE_FORMAT_R16_UNORM:
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+   case PIPE_FORMAT_R16G16B16_SNORM:
+   case PIPE_FORMAT_R16G16_SNORM:
+   case PIPE_FORMAT_R16_SNORM:
+      return TRUE;
+
+   /* 8-bit integer channels */
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+   case PIPE_FORMAT_R8G8B8_USCALED:
+   case PIPE_FORMAT_R8G8_USCALED:
+   case PIPE_FORMAT_R8_USCALED:
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+   case PIPE_FORMAT_R8G8_SSCALED:
+   case PIPE_FORMAT_R8_SSCALED:
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+   case PIPE_FORMAT_R8G8B8_UNORM:
+   case PIPE_FORMAT_R8G8_UNORM:
+   case PIPE_FORMAT_R8_UNORM:
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+   case PIPE_FORMAT_R8G8B8_SNORM:
+   case PIPE_FORMAT_R8G8_SNORM:
+   case PIPE_FORMAT_R8_SNORM:
+      return TRUE;
+
+   /* swizzled 8-bit UNORM layouts */
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return TRUE;
+
+   default:
+      return FALSE;
+   }
+}
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index ef3aa674a34..f8bf5b46692 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -30,11 +30,12 @@
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_format.h"
 
 #include "translate.h"
 
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 
 #include "rtasm/rtasm_cpu.h"
 #include "rtasm/rtasm_x86sse.h"
@@ -46,21 +47,9 @@
 #define W    3
 
 
-typedef void (PIPE_CDECL *run_func)( struct translate *translate,
-                                     unsigned start,
-                                     unsigned count,
-                                     unsigned instance_id,
-                                     void *output_buffer);
-
-typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
-                                          const unsigned *elts,
-                                          unsigned count,
-                                          unsigned instance_id,
-                                          void *output_buffer);
-
+/* Element type of the buffer[] array in struct translate_sse. */
 struct translate_buffer {
    const void *base_ptr;
-   unsigned stride;
+   /* NOTE(review): widened from unsigned to uintptr_t by this change,
+    * presumably so it can be used in pointer-sized address arithmetic
+    * on x86-64 -- confirm against the code generator.
+    */
+   uintptr_t stride;
    unsigned max_index;
 };
 
@@ -73,21 +62,43 @@ struct translate_buffer_varient {
 
 #define ELEMENT_BUFFER_INSTANCE_ID  1001
 
+/* Number of rows in the constant table below; it has to match the number
+ * of enumerators in the enum that follows.
+ */
+#define NUM_CONSTS 7
+
+/* Row indices into consts[].  The enumerators are listed in the same
+ * order as the rows of the initializer below, so the two must be kept
+ * in step when either is edited.
+ */
+enum
+{
+   CONST_IDENTITY,
+   CONST_INV_127,
+   CONST_INV_255,
+   CONST_INV_32767,
+   CONST_INV_65535,
+   CONST_INV_2147483647,
+   CONST_255
+};
+
+/* C(v) expands to a row holding the same float value in all four lanes. */
+#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
+/* Four-float constants, one row per enumerator above.  The first row
+ * (CONST_IDENTITY) is the only one whose lanes differ: (0, 0, 0, 1).
+ */
+static float consts[NUM_CONSTS][4] = {
+      {0, 0, 0, 1},
+      C(1.0 / 127.0),
+      C(1.0 / 255.0),
+      C(1.0 / 32767.0),
+      C(1.0 / 65535.0),
+      C(1.0 / 2147483647.0),
+      C(255.0)
+};
+#undef C
 
 struct translate_sse {
    struct translate translate;
 
    struct x86_function linear_func;
    struct x86_function elt_func;
+   struct x86_function elt16_func;
+   struct x86_function elt8_func;
    struct x86_function *func;
 
-   boolean loaded_identity;
-   boolean loaded_255;
-   boolean loaded_inv_255;
-
-   float identity[4];
-   float float_255[4];
-   float inv_255[4];
+   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
+   int8_t reg_to_const[16];
+   int8_t const_to_reg[NUM_CONSTS];
 
    struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
    unsigned nr_buffers;
@@ -102,17 +113,16 @@ struct translate_sse {
    boolean use_instancing;
    unsigned instance_id;
 
-   run_func      gen_run;
-   run_elts_func gen_run_elts;
-
    /* these are actually known values, but putting them in a struct
     * like this is helpful to keep them in sync across the file.
     */
    struct x86_reg tmp_EAX;
-   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */
-   struct x86_reg outbuf_ECX;
-   struct x86_reg machine_EDX;
-   struct x86_reg count_ESI;    /* decrements to zero */
+   struct x86_reg tmp2_EDX;
+   struct x86_reg src_ECX;
+   struct x86_reg idx_ESI;     /* either start+i or &elt[i] */
+   struct x86_reg machine_EDI;
+   struct x86_reg outbuf_EBX;
+   struct x86_reg count_EBP;    /* decrements to zero */
 };
 
 static int get_offset( const void *a, const void *b )
@@ -120,281 +130,950 @@ static int get_offset( const void *a, const void *b )
    return (const char *)b - (const char *)a;
 }
 
+/**
+ * Return an XMM register holding row \p id of p->consts, emitting a load
+ * into the function being generated when the constant is not already
+ * assigned to a register.
+ *
+ * p->const_to_reg[] and p->reg_to_const[] record which constant lives in
+ * which XMM register; a negative entry means "none".  Only XMM2..XMM7 are
+ * handed out here (XMM0 and XMM1 are used as data/scratch registers by
+ * the load and copy helpers in this file).
+ */
+static struct x86_reg get_const( struct translate_sse *p, unsigned id)
+{
+   struct x86_reg reg;
+   unsigned i;
 
+   /* already assigned: reuse that register and emit nothing */
+   if(p->const_to_reg[id] >= 0)
+      return x86_make_reg(file_XMM, p->const_to_reg[id]);
 
-static struct x86_reg get_identity( struct translate_sse *p )
-{
-   struct x86_reg reg = x86_make_reg(file_XMM, 6);
-
-   if (!p->loaded_identity) {
-      p->loaded_identity = TRUE;
-      p->identity[0] = 0;
-      p->identity[1] = 0;
-      p->identity[2] = 0;
-      p->identity[3] = 1;
-
-      sse_movups(p->func, reg, 
-		 x86_make_disp(p->machine_EDX, 
-			       get_offset(p, &p->identity[0])));
+   /* look for an XMM register that has no constant assigned to it */
+   for(i = 2; i < 8; ++i)
+   {
+      if(p->reg_to_const[i] < 0)
+         break;
    }
 
+   /* TODO: be smarter here */
+   /* none free: always fall back to XMM7 and evict what it holds */
+   if(i == 8)
+      --i;
+
+   reg = x86_make_reg(file_XMM, i);
+
+   /* forget the constant that was mapped to this register, if any */
+   if(p->reg_to_const[i] >= 0)
+      p->const_to_reg[p->reg_to_const[i]] = -1;
+
+   p->reg_to_const[i] = id;
+   p->const_to_reg[id] = i;
+
+   /* TODO: this should happen outside the loop, if possible */
+   /* NOTE(review): movaps needs a 16-byte aligned address.  consts[] is
+    * declared PIPE_ALIGN_VAR(16) inside struct translate_sse; confirm the
+    * struct itself is allocated with that alignment.
+    */
+   sse_movaps(p->func, reg,
+         x86_make_disp(p->machine_EDI,
+               get_offset(p, &p->consts[id][0])));
+
    return reg;
 }
 
-static struct x86_reg get_255( struct translate_sse *p )
+/**
+ * Emit code that loads \p size bytes from \p src into the low bytes of
+ * the XMM register \p data, padding the rest of the register with zeros.
+ *
+ * Sizes 1, 2, 3, 4, 6, 8, 12 and 16 are handled.  For any other size no
+ * code is emitted and FALSE is returned; otherwise TRUE is returned.
+ *
+ * p->tmp_EAX (sizes 1, 2, 3 and 6) and XMM1 (sizes 6 and 12) are used as
+ * scratch, after \p data has been written, so \p data must not be XMM1.
+ */
+static boolean emit_load_sse2( struct translate_sse *p,
+				       struct x86_reg data,
+				       struct x86_reg src,
+				       unsigned size)
 {
-   struct x86_reg reg = x86_make_reg(file_XMM, 7);
-
-   if (!p->loaded_255) {
-      p->loaded_255 = TRUE;
-      p->float_255[0] =
-	 p->float_255[1] =
-	 p->float_255[2] =
-	 p->float_255[3] = 255.0f;
-
-      sse_movups(p->func, reg, 
-		 x86_make_disp(p->machine_EDX, 
-			       get_offset(p, &p->float_255[0])));
+   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+   struct x86_reg tmp = p->tmp_EAX;
+   switch(size)
+   {
+   case 1:
+      x86_movzx8(p->func, tmp, src);
+      sse2_movd(p->func, data, tmp);
+      break;
+   case 2:
+      x86_movzx16(p->func, tmp, src);
+      sse2_movd(p->func, data, tmp);
+      break;
+   case 3:
+      /* assemble the three bytes in EAX: third byte shifted up to bits
+       * 16..23, then the first two bytes moved into the low 16 bits
+       */
+      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
+      x86_shl_imm(p->func, tmp, 16);
+      x86_mov16(p->func, tmp, src);
+      sse2_movd(p->func, data, tmp);
+      break;
+   case 4:
+      sse2_movd(p->func, data, src);
+      break;
+   case 6:
+      /* first dword straight from memory, the remaining 16 bits via EAX
+       * and XMM1, then interleave the two dwords
+       */
+      sse2_movd(p->func, data, src);
+      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
+      sse2_movd(p->func, tmpXMM, tmp);
+      sse2_punpckldq(p->func, data, tmpXMM);
+      break;
+   case 8:
+      sse2_movq(p->func, data, src);
+      break;
+   case 12:
+      /* first qword straight from memory, the third dword via XMM1,
+       * then combine the two low qwords
+       */
+      sse2_movq(p->func, data, src);
+      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
+      sse2_punpcklqdq(p->func, data, tmpXMM);
+      break;
+   case 16:
+      sse2_movdqu(p->func, data, src);
+      break;
+   default:
+      /* unsupported size: nothing has been emitted */
+      return FALSE;
    }
-
-   return reg;
+   return TRUE;
 }
 
-static struct x86_reg get_inv_255( struct translate_sse *p )
+/* this value can be passed for the out_chans argument */
+#define CHANNELS_0001 5
+
+/* Emit code that loads #chans consecutive 32-bit floats from arg0 into
+ * the XMM register data, padding the register with zeroes at least up
+ * to out_chans.
+ *
+ * If out_chans is set to CHANNELS_0001, then the fourth
+ * value will be padded with 1. Only pass this value if
+ * chans < 4 or results are undefined.
+ *
+ * chans is expected to be 1..4: the switch has no default case, so any
+ * other value emits no code at all.
+ */
+static void emit_load_float32( struct translate_sse *p,
+                                       struct x86_reg data,
+                                       struct x86_reg arg0,
+                                       unsigned out_chans,
+                                       unsigned chans)
 {
-   struct x86_reg reg = x86_make_reg(file_XMM, 5);
-
-   if (!p->loaded_inv_255) {
-      p->loaded_inv_255 = TRUE;
-      p->inv_255[0] =
-	 p->inv_255[1] =
-	 p->inv_255[2] =
-	 p->inv_255[3] = 1.0f / 255.0f;
-
-      sse_movups(p->func, reg, 
-		 x86_make_disp(p->machine_EDX, 
-			       get_offset(p, &p->inv_255[0])));
+   switch(chans)
+   {
+   case 1:
+      /* a 0 0 0
+       * a 0 0 1
+       */
+      sse_movss(p->func, data, arg0);
+      if(out_chans == CHANNELS_0001)
+         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
+      break;
+   case 2:
+      /* 0 0 0 1
+       * a b 0 1
+       */
+      /* The upper half is filled from the identity constant (0, 0, 0, 1)
+       * before the two floats are loaded: its Z/W lanes give (0, 1), its
+       * low two lanes give (0, 0).
+       */
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
+      else if(out_chans > 2)
+         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
+      sse_movlps(p->func, data, arg0);
+      break;
+   case 3:
+      /* Have to jump through some hoops:
+       *
+       * c 0 0 0
+       * c 0 0 1 if out_chans == CHANNELS_0001
+       * 0 0 c 0/1
+       * a b c 0/1
+       */
+      sse_movss(p->func, data, x86_make_disp(arg0, 8));
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
+      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
+      sse_movlps(p->func, data, arg0);
+      break;
+   case 4:
+      /* all four lanes come from memory, nothing to pad */
+      sse_movups(p->func, data, arg0);
+      break;
    }
-
-   return reg;
 }
 
+/* This function behaves like emit_load_float32, but loads
+ * 64-bit floating point numbers from arg0, converting them to
+ * 32-bit ones (precision is lost in the conversion).
+ *
+ * out_chans has the same meaning as for emit_load_float32, including
+ * CHANNELS_0001.  XMM1 is used as scratch when chans is 3 or 4.
+ * chans is expected to be 1..4: the switch has no default case, so any
+ * other value emits no code.
+ *
+ * Only sse2_* emitters are used for the loads and conversions; the
+ * caller visible in this file checks X86_SSE2 before calling.
+ */
+static void emit_load_float64to32( struct translate_sse *p,
+                                       struct x86_reg data,
+                                       struct x86_reg arg0,
+                                       unsigned out_chans,
+                                       unsigned chans)
+{
+   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+   switch(chans)
+   {
+   case 1:
+      sse2_movsd(p->func, data, arg0);
+      /* packed conversion when more than one output lane is needed,
+       * scalar conversion when only the first lane matters
+       */
+      if(out_chans > 1)
+         sse2_cvtpd2ps(p->func, data, data);
+      else
+         sse2_cvtsd2ss(p->func, data, data);
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W)  );
+      break;
+   case 2:
+      sse2_movupd(p->func, data, arg0);
+      sse2_cvtpd2ps(p->func, data, data);
+      /* fill the upper half from the identity constant (0, 0, 0, 1):
+       * its Z/W lanes for "0 1", its low two lanes for "0 0"
+       */
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
+      else if(out_chans > 2)
+         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
+       break;
+   case 3:
+      /* first two doubles into data, third into XMM1, then merge */
+      sse2_movupd(p->func, data, arg0);
+      sse2_cvtpd2ps(p->func, data, data);
+      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
+      if(out_chans > 3)
+         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
+      else
+         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
+      sse_movlhps(p->func, data, tmpXMM);
+      if(out_chans == CHANNELS_0001)
+         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
+      break;
+   case 4:
+      /* two doubles at a time: low pair into data, high pair via XMM1 */
+      sse2_movupd(p->func, data, arg0);
+      sse2_cvtpd2ps(p->func, data, data);
+      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
+      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
+      sse_movlhps(p->func, data, tmpXMM);
+      break;
+   }
+}
 
-static void emit_load_R32G32B32A32( struct translate_sse *p, 			   
-				    struct x86_reg data,
-				    struct x86_reg arg0 )
+/* Emit one 64-bit move.
+ *
+ * On targets other than X86_32 the move is emitted with x64_mov64 on the
+ * dst_gpr/src_gpr operands.  On X86_32 it goes through the dst_xmm/src_xmm
+ * operands instead: movq when the target reports SSE2, movlps otherwise.
+ *
+ * emit_load64 and emit_store64 pass their memory operand in both the
+ * _gpr and the _xmm slot of one side, so only the operand pair matching
+ * the target is actually used.
+ */
+static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr,  struct x86_reg src_xmm)
 {
-   sse_movups(p->func, data, arg0);
+   if(x86_target(p->func) != X86_32)
+      x64_mov64(p->func, dst_gpr, src_gpr);
+   else
+   {
+      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
+      if(x86_target_caps(p->func) & X86_SSE2)
+         sse2_movq(p->func, dst_xmm, src_xmm);
+      else
+         sse_movlps(p->func, dst_xmm, src_xmm);
+   }
 }
 
-static void emit_load_R32G32B32( struct translate_sse *p, 			   
-				 struct x86_reg data,
-				 struct x86_reg arg0 )
+/* Emit a 64-bit load from src into dst_gpr or dst_xmm.  src is passed to
+ * emit_mov64 as both source operands; emit_mov64 decides from the target
+ * which of the two destinations is written.
+ */
+static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
 {
-   /* Have to jump through some hoops:
-    *
-    * c 0 0 0
-    * c 0 0 1
-    * 0 0 c 1
-    * a b c 1
-    */
-   sse_movss(p->func, data, x86_make_disp(arg0, 8));
-   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
-   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
-   sse_movlps(p->func, data, arg0);
+   emit_mov64(p, dst_gpr, dst_xmm, src, src);
 }
 
-static void emit_load_R32G32( struct translate_sse *p, 
-			   struct x86_reg data,
-			   struct x86_reg arg0 )
+/* Emit a 64-bit store to dst from src_gpr or src_xmm.  dst is passed to
+ * emit_mov64 as both destination operands; emit_mov64 decides from the
+ * target which of the two sources is read.
+ */
+static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
 {
-   /* 0 0 0 1
-    * a b 0 1
-    */
-   sse_movups(p->func, data, get_identity(p) );
-   sse_movlps(p->func, data, arg0);
+   emit_mov64(p, dst, dst, src_gpr, src_xmm);
 }
 
+/* Emit one 16-byte move from src to dst: movdqu when the target reports
+ * SSE2, movups otherwise.
+ */
+static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
+{
+   const boolean have_sse2 = (x86_target_caps(p->func) & X86_SSE2) != 0;
+
+   if(!have_sse2)
+   {
+      sse_movups(p->func, dst, src);
+      return;
+   }
+
+   sse2_movdqu(p->func, dst, src);
+}
 
-static void emit_load_R32( struct translate_sse *p, 
-			   struct x86_reg data,
-			   struct x86_reg arg0 )
+/* Emit code that copies a fixed number of bytes from src to dst.
+ *
+ * Supported sizes:
+ *  - below 8 bytes: 0, 1, 2, 3, 4 and 6, copied through general purpose
+ *    registers;
+ *  - 8 bytes and up without SSE: any multiple of 4, copied one dword at
+ *    a time;
+ *  - 8 bytes and up with SSE: 8, 12, 16, 24 and 32.
+ * Any other size trips an assert instead of silently emitting no copy.
+ *
+ * Scratch registers used: p->tmp_EAX, p->tmp2_EDX, XMM0 and XMM1.
+ *
+ * TODO: this uses unaligned accesses liberally, which is great on Nehalem,
+ * but may or may not be good on older processors
+ * TODO: may perhaps want to use non-temporal stores here if possible
+ */
+static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
 {
-   /* a 0 0 0
-    * a 0 0 1
-    */
-   sse_movss(p->func, data, arg0);
-   sse_orps(p->func, data, get_identity(p) );
+   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
+   struct x86_reg dataGPR = p->tmp_EAX;
+   struct x86_reg dataGPR2 = p->tmp2_EDX;
+
+   if(size < 8)
+   {
+      switch (size)
+      {
+      case 0:
+         /* nothing to copy */
+         break;
+      case 1:
+         x86_mov8(p->func, dataGPR, src);
+         x86_mov8(p->func, dst, dataGPR);
+         break;
+      case 2:
+         x86_mov16(p->func, dataGPR, src);
+         x86_mov16(p->func, dst, dataGPR);
+         break;
+      case 3:
+         x86_mov16(p->func, dataGPR, src);
+         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
+         x86_mov16(p->func, dst, dataGPR);
+         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
+         break;
+      case 4:
+         x86_mov(p->func, dataGPR, src);
+         x86_mov(p->func, dst, dataGPR);
+         break;
+      case 6:
+         x86_mov(p->func, dataGPR, src);
+         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
+         x86_mov(p->func, dst, dataGPR);
+         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
+         break;
+      default:
+         /* 5 and 7 byte copies are not implemented; fail loudly like the
+          * SSE path below rather than emitting no copy at all
+          */
+         assert(0);
+         break;
+      }
+   }
+   else if(!(x86_target_caps(p->func) & X86_SSE))
+   {
+      /* no SSE: plain dword-by-dword copy through EAX */
+      unsigned i = 0;
+      assert((size & 3) == 0);
+      for(i = 0; i < size; i += 4)
+      {
+         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
+         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
+      }
+   }
+   else
+   {
+      switch(size)
+      {
+      case 8:
+         emit_load64(p, dataGPR, dataXMM, src);
+         emit_store64(p, dst, dataGPR, dataXMM);
+         break;
+      case 12:
+         /* the 64-bit part goes through EDX/XMM0 so that EAX stays free
+          * for the trailing dword
+          */
+         emit_load64(p, dataGPR2, dataXMM, src);
+         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
+         emit_store64(p, dst, dataGPR2, dataXMM);
+         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
+         break;
+      case 16:
+         emit_mov128(p, dataXMM, src);
+         emit_mov128(p, dst, dataXMM);
+         break;
+      case 24:
+         emit_mov128(p, dataXMM, src);
+         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
+         emit_mov128(p, dst, dataXMM);
+         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
+         break;
+      case 32:
+         emit_mov128(p, dataXMM, src);
+         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
+         emit_mov128(p, dst, dataXMM);
+         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
+         break;
+      default:
+         assert(0);
+      }
+   }
 }
 
+static boolean translate_attr_convert( struct translate_sse *p,
+                               const struct translate_element *a,
+                               struct x86_reg src,
+                               struct x86_reg dst)
 
-static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
-				       struct x86_reg data,
-				       struct x86_reg src )
 {
+   const struct util_format_description* input_desc = util_format_description(a->input_format);
+   const struct util_format_description* output_desc = util_format_description(a->output_format);
+   unsigned i;
+   boolean id_swizzle = TRUE;
+   unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
+   unsigned needed_chans = 0;
+   unsigned imms[2] = {0, 0x3f800000};
 
-   /* Load and unpack twice:
-    */
-   sse_movss(p->func, data, src);
-   sse2_punpcklbw(p->func, data, get_identity(p));
-   sse2_punpcklbw(p->func, data, get_identity(p));
+   if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
+      return FALSE;
 
-   /* Convert to float:
-    */
-   sse2_cvtdq2ps(p->func, data, data);
+   if(input_desc->channel[0].size & 7)
+      return FALSE;
 
+   if(input_desc->colorspace != output_desc->colorspace)
+      return FALSE;
 
-   /* Scale by 1/255.0
-    */
-   sse_mulps(p->func, data, get_inv_255(p));
-}
+   for(i = 1; i < input_desc->nr_channels; ++i)
+   {
+      if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
+         return FALSE;
+   }
 
+   for(i = 1; i < output_desc->nr_channels; ++i)
+   {
+      if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
+         return FALSE;
+   }
 
+   for(i = 0; i < output_desc->nr_channels; ++i)
+   {
+      if(output_desc->swizzle[i] < 4)
+         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
+   }
 
+   if((x86_target_caps(p->func) & X86_SSE) && (0
+         || a->output_format == PIPE_FORMAT_R32_FLOAT
+         || a->output_format == PIPE_FORMAT_R32G32_FLOAT
+         || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
+         || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
+   {
+      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 
-static void emit_store_R32G32B32A32( struct translate_sse *p, 			   
-				     struct x86_reg dest,
-				     struct x86_reg dataXMM )
-{
-   sse_movups(p->func, dest, dataXMM);
-}
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+            swizzle[i] = i;
+      }
 
-static void emit_store_R32G32B32( struct translate_sse *p, 
-				  struct x86_reg dest,
-				  struct x86_reg dataXMM )
-{
-   /* Emit two, shuffle, emit one.
-    */
-   sse_movlps(p->func, dest, dataXMM);
-   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
-   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
-}
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] < 4)
+            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
+         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+            id_swizzle = FALSE;
+      }
 
-static void emit_store_R32G32( struct translate_sse *p, 
-			       struct x86_reg dest,
-			       struct x86_reg dataXMM )
-{
-   sse_movlps(p->func, dest, dataXMM);
-}
+      if(needed_chans > 0)
+      {
+         switch(input_desc->channel[0].type)
+         {
+         case UTIL_FORMAT_TYPE_UNSIGNED:
+            if(!(x86_target_caps(p->func) & X86_SSE2))
+               return FALSE;
+            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+            /* TODO: add support for SSE4.1 pmovzx */
+            switch(input_desc->channel[0].size)
+            {
+            case 8:
+               /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
+               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+               break;
+            case 16:
+               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+               break;
+            case 32: /* we lose precision here */
+               sse2_psrld_imm(p->func, dataXMM, 1);
+               break;
+            default:
+               return FALSE;
+            }
+            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+            if(input_desc->channel[0].normalized)
+            {
+               struct x86_reg factor;
+               switch(input_desc->channel[0].size)
+               {
+               case 8:
+                  factor = get_const(p, CONST_INV_255);
+                  break;
+               case 16:
+                  factor = get_const(p, CONST_INV_65535);
+                  break;
+               case 32:
+                  factor = get_const(p, CONST_INV_2147483647);
+                  break;
+               default:
+                  assert(0);
+                  factor.disp = 0;
+                  factor.file = 0;
+                  factor.idx = 0;
+                  factor.mod = 0;
+                  break;
+               }
+               sse_mulps(p->func, dataXMM, factor);
+            }
+            else if(input_desc->channel[0].size == 32)
+               sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
+            break;
+         case UTIL_FORMAT_TYPE_SIGNED:
+            if(!(x86_target_caps(p->func) & X86_SSE2))
+               return FALSE;
+            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+            /* TODO: add support for SSE4.1 pmovsx */
+            switch(input_desc->channel[0].size)
+            {
+            case 8:
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_psrad_imm(p->func, dataXMM, 24);
+               break;
+            case 16:
+               sse2_punpcklwd(p->func, dataXMM, dataXMM);
+               sse2_psrad_imm(p->func, dataXMM, 16);
+               break;
+            case 32: /* we lose precision here */
+               break;
+            default:
+               return FALSE;
+            }
+            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+            if(input_desc->channel[0].normalized)
+            {
+               struct x86_reg factor;
+               switch(input_desc->channel[0].size)
+               {
+               case 8:
+                  factor = get_const(p, CONST_INV_127);
+                  break;
+               case 16:
+                  factor = get_const(p, CONST_INV_32767);
+                  break;
+               case 32:
+                  factor = get_const(p, CONST_INV_2147483647);
+                  break;
+               default:
+                  assert(0);
+                  factor.disp = 0;
+                  factor.file = 0;
+                  factor.idx = 0;
+                  factor.mod = 0;
+                  break;
+               }
+               sse_mulps(p->func, dataXMM, factor);
+            }
+            break;
+
+            break;
+         case UTIL_FORMAT_TYPE_FLOAT:
+            if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
+               return FALSE;
+            if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
+            {
+               swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
+               needed_chans = CHANNELS_0001;
+            }
+            switch(input_desc->channel[0].size)
+            {
+            case 32:
+               emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+               break;
+            case 64: /* we lose precision here */
+               if(!(x86_target_caps(p->func) & X86_SSE2))
+                  return FALSE;
+               emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+               break;
+            default:
+               return FALSE;
+            }
+            break;
+         default:
+            return FALSE;
+         }
 
-static void emit_store_R32( struct translate_sse *p, 
-			    struct x86_reg dest,
-			    struct x86_reg dataXMM )
-{
-   sse_movss(p->func, dest, dataXMM);
-}
+         if(!id_swizzle)
+            sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
+      }
 
+      if(output_desc->nr_channels >= 4
+            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+            )
+         sse_movups(p->func, dst, dataXMM);
+      else
+      {
+         if(output_desc->nr_channels >= 2
+               && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+               && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+            sse_movlps(p->func, dst, dataXMM);
+         else
+         {
+            if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+               sse_movss(p->func, dst, dataXMM);
+            else
+               x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+
+            if(output_desc->nr_channels >= 2)
+            {
+               if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
+                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
+               }
+               else
+                  x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+            }
+         }
 
+         if(output_desc->nr_channels >= 3)
+         {
+            if(output_desc->nr_channels >= 4
+                  && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+                  && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
+            else
+            {
+               if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
+                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
+               }
+               else
+                  x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+
+               if(output_desc->nr_channels >= 4)
+               {
+                  if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+                  {
+                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
+                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
+                  }
+                  else
+                     x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+               }
+            }
+         }
+      }
+      return TRUE;
+   }
+   else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
+         && output_desc->channel[0].normalized == input_desc->channel[0].normalized
+         && (0
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+               ))
+   {
+      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+      struct x86_reg tmp = p->tmp_EAX;
+      unsigned imms[2] = {0, 1};
+
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+            swizzle[i] = i;
+      }
 
-static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
-				       struct x86_reg dest,
-				       struct x86_reg dataXMM )
-{
-   /* Scale by 255.0
-    */
-   sse_mulps(p->func, dataXMM, get_255(p));
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] < 4)
+            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
+         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+            id_swizzle = FALSE;
+      }
 
-   /* Pack and emit:
-    */
-   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
-   sse2_packssdw(p->func, dataXMM, dataXMM);
-   sse2_packuswb(p->func, dataXMM, dataXMM);
-   sse_movss(p->func, dest, dataXMM);
-}
+      if(needed_chans > 0)
+      {
+         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+         switch(input_desc->channel[0].type)
+         {
+         case UTIL_FORMAT_TYPE_UNSIGNED:
+            if(input_desc->channel[0].normalized)
+            {
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+        	       sse2_psrlw_imm(p->func, dataXMM, 1);
+            }
+            else
+               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
+            break;
+         case UTIL_FORMAT_TYPE_SIGNED:
+            if(input_desc->channel[0].normalized)
+            {
+               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
+               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
+               sse2_psllw_imm(p->func, dataXMM, 9);
+               sse2_psrlw_imm(p->func, dataXMM, 8);
+               sse2_por(p->func, tmpXMM, dataXMM);
+               sse2_psrlw_imm(p->func, dataXMM, 7);
+               sse2_por(p->func, tmpXMM, dataXMM);
+               {
+                  struct x86_reg t = dataXMM;
+                  dataXMM = tmpXMM;
+                  tmpXMM = t;
+               }
+            }
+            else
+            {
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_psraw_imm(p->func, dataXMM, 8);
+            }
+            break;
+         default:
+            assert(0);
+         }
 
+         if(output_desc->channel[0].normalized)
+            imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
 
+         if(!id_swizzle)
+            sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
+      }
 
+      if(output_desc->nr_channels >= 4
+            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+            )
+         sse2_movq(p->func, dst, dataXMM);
+      else
+      {
+         if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+         {
+            if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+               sse2_movd(p->func, dst, dataXMM);
+            else
+            {
+               sse2_movd(p->func, tmp, dataXMM);
+               x86_mov16(p->func, dst, tmp);
+               if(output_desc->nr_channels >= 2)
+                  x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+            }
+         }
+         else
+         {
+            if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
+               x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+            else
+            {
+               x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+               if(output_desc->nr_channels >= 2)
+               {
+                  sse2_movd(p->func, tmp, dataXMM);
+                  x86_shr_imm(p->func, tmp, 16);
+                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
+               }
+            }
+         }
 
+         if(output_desc->nr_channels >= 3)
+         {
+            if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+            {
+               if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse2_psrlq_imm(p->func, dataXMM, 32);
+                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
+               }
+               else
+               {
+                  sse2_psrlq_imm(p->func, dataXMM, 32);
+                  sse2_movd(p->func, tmp, dataXMM);
+                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
+                  if(output_desc->nr_channels >= 4)
+                  {
+                     x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+                  }
+               }
+            }
+            else
+            {
+               if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
+                  x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+               else
+               {
+                  x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+
+                  if(output_desc->nr_channels >= 4)
+                  {
+                     sse2_psrlq_imm(p->func, dataXMM, 48);
+                     sse2_movd(p->func, tmp, dataXMM);
+                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
+                  }
+               }
+            }
+         }
+      }
+      return TRUE;
+   }
+   else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
+   {
+      struct x86_reg tmp = p->tmp_EAX;
+      unsigned i;
+      if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
+                     && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
+                     && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
+                     && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
+                     && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
+      {
+         /* TODO: support movbe */
+         x86_mov(p->func, tmp, src);
+         x86_bswap(p->func, tmp);
+         x86_mov(p->func, dst, tmp);
+         return TRUE;
+      }
 
-/* Extended swizzles?  Maybe later.
- */  
-static void emit_swizzle( struct translate_sse *p,
-			  struct x86_reg dest,
-			  struct x86_reg src,
-			  unsigned char shuffle )
-{
-   sse_shufps(p->func, dest, src, shuffle);
-}
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         switch(output_desc->channel[0].size)
+         {
+         case 8:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned v = 0;
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[0].type)
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     v = output_desc->channel[0].normalized ? 0xff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     v = output_desc->channel[0].normalized ? 0x7f : 1;
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
+            }
+            else
+            {
+               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
+               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
+            }
+            break;
+         case 16:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned v = 0;
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[1].type)
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     v = output_desc->channel[1].normalized ? 0xffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_FLOAT:
+                     v = 0x3c00;
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
+            }
+            else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
+               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
+            else
+            {
+               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
+               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
+            }
+            break;
+         case 32:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned v = 0;
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[1].type)
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_FLOAT:
+                     v = 0x3f800000;
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
+            }
+            else
+            {
+               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
+               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
+            }
+            break;
+         case 64:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned l = 0;
+               unsigned h = 0;
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[1].type)
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
+                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
+                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_FLOAT:
+                     h = 0x3ff00000;
+                     l = 0;
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
+               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
+            }
+            else
+            {
+               if(x86_target_caps(p->func) & X86_SSE)
+               {
+                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
+                  emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
+                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
+               }
+               else
+               {
+                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
+                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
+                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
+                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
+               }
+            }
+            break;
+         default:
+            return FALSE;
+         }
+      }
+      return TRUE;
+   }
+   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
+   else if((x86_target_caps(p->func) & X86_SSE2) &&
+         a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
+               || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
+               || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
+         ))
+   {
+      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 
+      /* load */
+      sse_movups(p->func, dataXMM, src);
 
-static boolean translate_attr( struct translate_sse *p,
-			       const struct translate_element *a,
-			       struct x86_reg srcECX,
-			       struct x86_reg dstEAX)
-{
-   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
+         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
 
-   switch (a->input_format) {
-   case PIPE_FORMAT_R32_FLOAT:
-      emit_load_R32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      emit_load_R32G32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      emit_load_R32G32B32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      emit_load_R32G32B32A32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
-      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-      break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
-      break;
-   default:
-      return FALSE;
-   }
+      /* scale by 255.0 */
+      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
 
-   switch (a->output_format) {
-   case PIPE_FORMAT_R32_FLOAT:
-      emit_store_R32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      emit_store_R32G32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      emit_store_R32G32B32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
-      break;
-   default:
-      return FALSE;
+      /* pack and emit */
+      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
+      sse2_packssdw(p->func, dataXMM, dataXMM);
+      sse2_packuswb(p->func, dataXMM, dataXMM);
+      sse2_movd(p->func, dst, dataXMM);
+
+      return TRUE;
    }
 
-   return TRUE;
+   return FALSE;
 }
 
+/* Emit one attribute: direct copy when formats match, conversion otherwise. */
+static boolean translate_attr( struct translate_sse *p,
+			       const struct translate_element *a,
+			       struct x86_reg src,
+			       struct x86_reg dst)
+{
+   if(a->input_format != a->output_format)
+      return translate_attr_convert(p, a, src, dst);
+
+   /* Identical formats: hand the attribute to emit_memcpy unconverted. */
+   emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
+   return TRUE;
+}
 
 static boolean init_inputs( struct translate_sse *p,
-                            boolean linear )
+                            unsigned index_size )
 {
    unsigned i;
-   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
+   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                               get_offset(p, &p->instance_id));
 
    for (i = 0; i < p->nr_buffer_varients; i++) {
       struct translate_buffer_varient *varient = &p->buffer_varient[i];
       struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
 
-      if (linear || varient->instance_divisor) {
-         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
+      if (!index_size || varient->instance_divisor) {
+         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &buffer->stride));
-         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &varient->ptr));
-         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &buffer->base_ptr));
-         struct x86_reg elt = p->idx_EBX;
+         struct x86_reg elt = p->idx_ESI;
          struct x86_reg tmp_EAX = p->tmp_EAX;
 
          /* Calculate pointer to first attrib:
@@ -406,20 +1085,16 @@ static boolean init_inputs( struct translate_sse *p,
             x86_mov(p->func, tmp_EAX, instance_id);
 
             if (varient->instance_divisor != 1) {
-               struct x86_reg tmp_EDX = p->machine_EDX;
-               struct x86_reg tmp_ECX = p->outbuf_ECX;
+               struct x86_reg tmp_EDX = p->tmp2_EDX;
+               struct x86_reg tmp_ECX = p->src_ECX;
 
                /* TODO: Add x86_shr() to rtasm and use it whenever
                 *       instance divisor is power of two.
                 */
 
-               x86_push(p->func, tmp_EDX);
-               x86_push(p->func, tmp_ECX);
                x86_xor(p->func, tmp_EDX, tmp_EDX);
                x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
                x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
-               x86_pop(p->func, tmp_ECX);
-               x86_pop(p->func, tmp_EDX);
             }
          } else {
             x86_mov(p->func, tmp_EAX, elt);
@@ -430,16 +1105,23 @@ static boolean init_inputs( struct translate_sse *p,
           */
 
          x86_imul(p->func, tmp_EAX, buf_stride);
+         x64_rexw(p->func);
          x86_add(p->func, tmp_EAX, buf_base_ptr);
 
 
          /* In the linear case, keep the buffer pointer instead of the
           * index number.
           */
-         if (linear && p->nr_buffer_varients == 1)
+         if (!index_size && p->nr_buffer_varients == 1)
+         {
+            x64_rexw(p->func);
             x86_mov(p->func, elt, tmp_EAX);
+         }
          else
+         {
+            x64_rexw(p->func);
             x86_mov(p->func, buf_ptr, tmp_EAX);
+         }
       }
    }
 
@@ -448,44 +1130,57 @@ static boolean init_inputs( struct translate_sse *p,
 
 
 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
-                                      boolean linear,
+                                      unsigned index_size,
                                       unsigned var_idx,
                                       struct x86_reg elt )
 {
    if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
-      return x86_make_disp(p->machine_EDX,
+      return x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->instance_id));
    }
-   if (linear && p->nr_buffer_varients == 1) {
-      return p->idx_EBX;
+   if (!index_size && p->nr_buffer_varients == 1) {
+      return p->idx_ESI;
    }
-   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
-      struct x86_reg ptr = p->tmp_EAX;
+   else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
+      struct x86_reg ptr = p->src_ECX;
       struct x86_reg buf_ptr = 
-         x86_make_disp(p->machine_EDX, 
+         x86_make_disp(p->machine_EDI,
                        get_offset(p, &p->buffer_varient[var_idx].ptr));
       
+      x64_rexw(p->func);
       x86_mov(p->func, ptr, buf_ptr);
       return ptr;
    }
    else {
-      struct x86_reg ptr = p->tmp_EAX;
+      struct x86_reg ptr = p->src_ECX;
       const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
 
       struct x86_reg buf_stride = 
-         x86_make_disp(p->machine_EDX, 
+         x86_make_disp(p->machine_EDI,
                        get_offset(p, &p->buffer[varient->buffer_index].stride));
 
       struct x86_reg buf_base_ptr = 
-         x86_make_disp(p->machine_EDX, 
+         x86_make_disp(p->machine_EDI,
                        get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
 
 
 
       /* Calculate pointer to current attrib:
        */
-      x86_mov(p->func, ptr, buf_stride);
-      x86_imul(p->func, ptr, elt);
+      switch(index_size)
+      {
+      case 1:
+         x86_movzx8(p->func, ptr, elt);
+         break;
+      case 2:
+         x86_movzx16(p->func, ptr, elt);
+         break;
+      case 4:
+         x86_mov(p->func, ptr, elt);
+         break;
+      }
+      x86_imul(p->func, ptr, buf_stride);
+      x64_rexw(p->func);
       x86_add(p->func, ptr, buf_base_ptr);
       return ptr;
    }
@@ -494,39 +1189,43 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p,
 
 
 static boolean incr_inputs( struct translate_sse *p, 
-                            boolean linear )
+                            unsigned index_size )
 {
-   if (linear && p->nr_buffer_varients == 1) {
-      struct x86_reg stride = x86_make_disp(p->machine_EDX,
+   if (!index_size && p->nr_buffer_varients == 1) {
+      struct x86_reg stride = x86_make_disp(p->machine_EDI,
                                             get_offset(p, &p->buffer[0].stride));
 
       if (p->buffer_varient[0].instance_divisor == 0) {
-         x86_add(p->func, p->idx_EBX, stride);
-         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+         x64_rexw(p->func);
+         x86_add(p->func, p->idx_ESI, stride);
+         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
       }
    }
-   else if (linear) {
+   else if (!index_size) {
       unsigned i;
 
       /* Is this worthwhile??
        */
       for (i = 0; i < p->nr_buffer_varients; i++) {
          struct translate_buffer_varient *varient = &p->buffer_varient[i];
-         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                 get_offset(p, &varient->ptr));
-         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                    get_offset(p, &p->buffer[varient->buffer_index].stride));
 
          if (varient->instance_divisor == 0) {
-            x86_mov(p->func, p->tmp_EAX, buf_ptr);
-            x86_add(p->func, p->tmp_EAX, buf_stride);
+            x86_mov(p->func, p->tmp_EAX, buf_stride);
+            x64_rexw(p->func);
+            x86_add(p->func, p->tmp_EAX, buf_ptr);
             if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+            x64_rexw(p->func);
             x86_mov(p->func, buf_ptr, p->tmp_EAX);
          }
       }
    } 
    else {
-      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
+      x64_rexw(p->func);
+      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
    }
    
    return TRUE;
@@ -551,35 +1250,52 @@ static boolean incr_inputs( struct translate_sse *p,
  */
 static boolean build_vertex_emit( struct translate_sse *p,
 				  struct x86_function *func,
-				  boolean linear )
+				  unsigned index_size )
 {
    int fixup, label;
    unsigned j;
 
+   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
+   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
+
    p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
-   p->idx_EBX       = x86_make_reg(file_REG32, reg_BX);
-   p->outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
-   p->machine_EDX   = x86_make_reg(file_REG32, reg_DX);
-   p->count_ESI     = x86_make_reg(file_REG32, reg_SI);
+   p->idx_ESI       = x86_make_reg(file_REG32, reg_SI);
+   p->outbuf_EBX    = x86_make_reg(file_REG32, reg_BX);
+   p->machine_EDI   = x86_make_reg(file_REG32, reg_DI);
+   p->count_EBP     = x86_make_reg(file_REG32, reg_BP);
+   p->tmp2_EDX     = x86_make_reg(file_REG32, reg_DX);
+   p->src_ECX     = x86_make_reg(file_REG32, reg_CX);
 
    p->func = func;
-   p->loaded_inv_255 = FALSE;
-   p->loaded_255 = FALSE;
-   p->loaded_identity = FALSE;
 
    x86_init_func(p->func);
 
-   /* Push a few regs?
-    */
-   x86_push(p->func, p->idx_EBX);
-   x86_push(p->func, p->count_ESI);
+   if(x86_target(p->func) == X86_64_WIN64_ABI)
+   {
+	   /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
+	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
+	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
+   }
 
-   /* Load arguments into regs:
-    */
-   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
-   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
-   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
-   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
+   x86_push(p->func, p->outbuf_EBX);
+   x86_push(p->func, p->count_EBP);
+
+/* on non-Win64 x86-64, these are already in the right registers */
+   if(x86_target(p->func) != X86_64_STD_ABI)
+   {
+      x86_push(p->func, p->machine_EDI);
+      x86_push(p->func, p->idx_ESI);
+
+      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
+      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
+   }
+
+   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
+
+   if(x86_target(p->func) != X86_32)
+      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
+   else
+      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
 
    /* Load instance ID.
     */
@@ -588,25 +1304,25 @@ static boolean build_vertex_emit( struct translate_sse *p,
               p->tmp_EAX,
               x86_fn_arg(p->func, 4));
       x86_mov(p->func,
-              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
+              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
               p->tmp_EAX);
    }
 
    /* Get vertex count, compare to zero
     */
    x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
-   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
+   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
    fixup = x86_jcc_forward(p->func, cc_E);
 
    /* always load, needed or not:
     */
-   init_inputs(p, linear);
+   init_inputs(p, index_size);
 
    /* Note address for loop jump
     */
    label = x86_get_label(p->func);
    {
-      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
+      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
       int last_varient = -1;
       struct x86_reg vb;
 
@@ -618,30 +1334,31 @@ static boolean build_vertex_emit( struct translate_sse *p,
           */
          if (varient != last_varient) {
             last_varient = varient;
-            vb = get_buffer_ptr(p, linear, varient, elt);
+            vb = get_buffer_ptr(p, index_size, varient, elt);
          }
          
          if (!translate_attr( p, a, 
                               x86_make_disp(vb, a->input_offset), 
-                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
+                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
             return FALSE;
       }
 
       /* Next output vertex:
        */
+      x64_rexw(p->func);
       x86_lea(p->func, 
-              p->outbuf_ECX, 
-              x86_make_disp(p->outbuf_ECX, 
+              p->outbuf_EBX,
+              x86_make_disp(p->outbuf_EBX,
                             p->translate.key.output_stride));
 
       /* Incr index
        */ 
-      incr_inputs( p, linear );
+      incr_inputs( p, index_size );
    }
 
    /* decr count, loop if not zero
     */
-   x86_dec(p->func, p->count_ESI);
+   x86_dec(p->func, p->count_EBP);
    x86_jcc(p->func, cc_NZ, label);
 
    /* Exit mmx state?
@@ -656,8 +1373,20 @@ static boolean build_vertex_emit( struct translate_sse *p,
    /* Pop regs and return
     */
    
-   x86_pop(p->func, p->count_ESI);
-   x86_pop(p->func, p->idx_EBX);
+   if(x86_target(p->func) != X86_64_STD_ABI)
+   {
+      x86_pop(p->func, p->idx_ESI);
+      x86_pop(p->func, p->machine_EDI);
+   }
+
+   x86_pop(p->func, p->count_EBP);
+   x86_pop(p->func, p->outbuf_EBX);
+
+   if(x86_target(p->func) == X86_64_WIN64_ABI)
+   {
+	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
+	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
+   }
    x86_ret(p->func);
 
    return TRUE;
@@ -697,37 +1426,7 @@ static void translate_sse_release( struct translate *translate )
    x86_release_func( &p->linear_func );
    x86_release_func( &p->elt_func );
 
-   FREE(p);
-}
-
-static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
-			      const unsigned *elts,
-			      unsigned count,
-                              unsigned instance_id,
-			      void *output_buffer )
-{
-   struct translate_sse *p = (struct translate_sse *)translate;
-
-   p->gen_run_elts( translate,
-		    elts,
-		    count,
-                    instance_id,
-                    output_buffer);
-}
-
-static void PIPE_CDECL translate_sse_run( struct translate *translate,
-			 unsigned start,
-			 unsigned count,
-                         unsigned instance_id,
-			 void *output_buffer )
-{
-   struct translate_sse *p = (struct translate_sse *)translate;
-
-   p->gen_run( translate,
-	       start,
-	       count,
-               instance_id,
-               output_buffer);
+   os_free_aligned(p);
 }
 
 
@@ -736,18 +1435,19 @@ struct translate *translate_sse2_create( const struct translate_key *key )
    struct translate_sse *p = NULL;
    unsigned i;
 
-   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
+   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
+   if (!rtasm_cpu_has_sse())
       goto fail;
 
-   p = CALLOC_STRUCT( translate_sse );
+   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
    if (p == NULL) 
       goto fail;
+   memset(p, 0, sizeof(*p));
+   memcpy(p->consts, consts, sizeof(consts));
 
    p->translate.key = *key;
    p->translate.release = translate_sse_release;
    p->translate.set_buffer = translate_sse_set_buffer;
-   p->translate.run_elts = translate_sse_run_elts;
-   p->translate.run = translate_sse_run;
 
    for (i = 0; i < key->nr_elements; i++) {
       if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
@@ -783,18 +1483,32 @@ struct translate *translate_sse2_create( const struct translate_key *key )
 
    if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
 
-   if (!build_vertex_emit(p, &p->linear_func, TRUE))
+   if (!build_vertex_emit(p, &p->linear_func, 0))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt_func, 4))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt16_func, 2))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt8_func, 1))
+      goto fail;
+
+   p->translate.run = (void*)x86_get_func(&p->linear_func);
+   if (p->translate.run == NULL)
       goto fail;
 
-   if (!build_vertex_emit(p, &p->elt_func, FALSE))
+   p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
+   if (p->translate.run_elts == NULL)
       goto fail;
 
-   p->gen_run = (run_func)x86_get_func(&p->linear_func);
-   if (p->gen_run == NULL)
+   p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
+   if (p->translate.run_elts16 == NULL)
       goto fail;
 
-   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
-   if (p->gen_run_elts == NULL)
+   p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
+   if (p->translate.run_elts8 == NULL)
       goto fail;
 
    return &p->translate;
diff --git a/src/gallium/auxiliary/util/u_atomic.h b/src/gallium/auxiliary/util/u_atomic.h
index a1568233906..8434491a421 100644
--- a/src/gallium/auxiliary/util/u_atomic.h
+++ b/src/gallium/auxiliary/util/u_atomic.h
@@ -29,6 +29,8 @@
 #define PIPE_ATOMIC_ASM_MSVC_X86                
 #elif (defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86))
 #define PIPE_ATOMIC_ASM_GCC_X86
+#elif (defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64))
+#define PIPE_ATOMIC_ASM_GCC_X86_64
 #elif defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 401)
 #define PIPE_ATOMIC_GCC_INTRINSIC
 #else
@@ -36,6 +38,51 @@
 #endif
 
 
+#if defined(PIPE_ATOMIC_ASM_GCC_X86_64)
+#define PIPE_ATOMIC "GCC x86_64 assembly"
+/* x86-64 GCC backend: lock-prefixed inline asm plus one GCC __sync builtin. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Set/read are a plain store/load through the pointer; no asm is involved. */
+#define p_atomic_set(_v, _i) (*(_v) = (_i))
+#define p_atomic_read(_v) (*(_v))
+/* Lock-prefixed decrement of *v; returns nonzero iff the new value is zero. */
+static INLINE boolean
+p_atomic_dec_zero(int32_t *v)
+{
+   unsigned char c;
+   /* sete stores 1 in c when the locked decl left ZF set (result == 0) */
+   __asm__ __volatile__("lock; decl %0; sete %1":"+m"(*v), "=qm"(c)
+			::"memory");
+   /* turn the flag byte into the boolean handed back to the caller */
+   return c != 0;
+}
+/* Lock-prefixed 32-bit increment of *v in memory; nothing is returned. */
+static INLINE void
+p_atomic_inc(int32_t *v)
+{
+   __asm__ __volatile__("lock; incl %0":"+m"(*v));
+}
+/* Lock-prefixed 32-bit decrement of *v in memory; nothing is returned. */
+static INLINE void
+p_atomic_dec(int32_t *v)
+{
+   __asm__ __volatile__("lock; decl %0":"+m"(*v));
+}
+/* GCC CAS builtin: stores _new if *v == old; returns the prior *v either way. */
+static INLINE int32_t
+p_atomic_cmpxchg(int32_t *v, int32_t old, int32_t _new)
+{
+   return __sync_val_compare_and_swap(v, old, _new);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* PIPE_ATOMIC_ASM_GCC_X86_64 */
+
 
 #if defined(PIPE_ATOMIC_ASM_GCC_X86)
 
diff --git a/src/gallium/auxiliary/util/u_bitmask.h b/src/gallium/auxiliary/util/u_bitmask.h
index 87f1110296a..98b85ddecd5 100644
--- a/src/gallium/auxiliary/util/u_bitmask.h
+++ b/src/gallium/auxiliary/util/u_bitmask.h
@@ -36,6 +36,9 @@
 #define U_HANDLE_BITMASK_H_
 
 
+#include "pipe/p_compiler.h"
+
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index e45310b9bb7..dfb142b9e1c 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -47,8 +47,6 @@
 #include "util/u_memory.h"
 #include "util/u_sampler.h"
 #include "util/u_simple_shaders.h"
-#include "util/u_surface.h"
-#include "util/u_rect.h"
 
 #include "cso_cache/cso_context.h"
 
@@ -59,15 +57,18 @@ struct blit_state
    struct cso_context *cso;
 
    struct pipe_blend_state blend;
-   struct pipe_depth_stencil_alpha_state depthstencil;
+   struct pipe_depth_stencil_alpha_state depthstencil_keep;
+   struct pipe_depth_stencil_alpha_state depthstencil_write;
    struct pipe_rasterizer_state rasterizer;
    struct pipe_sampler_state sampler;
    struct pipe_viewport_state viewport;
    struct pipe_clip_state clip;
    struct pipe_vertex_element velem[2];
+   enum pipe_texture_target internal_target;
 
    void *vs;
    void *fs[TGSI_WRITEMASK_XYZW + 1];
+   void *fs_depth;
 
    struct pipe_resource *vbuf;  /**< quad vertices */
    unsigned vbuf_slot;
@@ -98,12 +99,15 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->blend.rt[0].colormask = PIPE_MASK_RGBA;
 
    /* no-op depth/stencil/alpha */
-   memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil));
+   memset(&ctx->depthstencil_keep, 0, sizeof(ctx->depthstencil_keep));
+   memset(&ctx->depthstencil_write, 0, sizeof(ctx->depthstencil_write));
+   ctx->depthstencil_write.depth.enabled = 1;
+   ctx->depthstencil_write.depth.writemask = 1;
+   ctx->depthstencil_write.depth.func = PIPE_FUNC_ALWAYS;
 
    /* rasterizer */
    memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
-   ctx->rasterizer.front_winding = PIPE_WINDING_CW;
-   ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
+   ctx->rasterizer.cull_face = PIPE_FACE_NONE;
    ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* samplers */
@@ -114,7 +118,6 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
    ctx->sampler.min_img_filter = 0; /* set later */
    ctx->sampler.mag_img_filter = 0; /* set later */
-   ctx->sampler.normalized_coords = 1;
 
    /* vertex elements state */
    memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2);
@@ -138,7 +141,8 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
 
    /* fragment shader */
    ctx->fs[TGSI_WRITEMASK_XYZW] =
-      util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D);
+      util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D,
+                                    TGSI_INTERPOLATE_LINEAR);
    ctx->vbuf = NULL;
 
    /* init vertex data that doesn't change */
@@ -148,6 +152,11 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
       ctx->vertices[i][1][3] = 1.0f; /* q */
    }
 
+   if(pipe->screen->get_param(pipe->screen, PIPE_CAP_NPOT_TEXTURES))
+      ctx->internal_target = PIPE_TEXTURE_2D;
+   else
+      ctx->internal_target = PIPE_TEXTURE_RECT;
+
    return ctx;
 }
 
@@ -167,6 +176,9 @@ util_destroy_blit(struct blit_state *ctx)
       if (ctx->fs[i])
          pipe->delete_fs_state(pipe, ctx->fs[i]);
 
+   if (ctx->fs_depth)
+      pipe->delete_fs_state(pipe, ctx->fs_depth);
+
    pipe_resource_reference(&ctx->vbuf, NULL);
 
    FREE(ctx);
@@ -192,7 +204,6 @@ get_next_slot( struct blit_state *ctx )
    
    return ctx->vbuf_slot++ * sizeof ctx->vertices;
 }
-                               
 
 
 
@@ -275,14 +286,15 @@ regions_overlap(int srcX0, int srcY0,
  * \param writemask  controls which channels in the dest surface are sourced
  *                   from the src surface.  Disabled channels are sourced
  *                   from (0,0,0,1).
- * XXX need some control over blitting Z and/or stencil.
+ * XXX need some control over blitting stencil.
  */
 void
 util_blit_pixels_writemask(struct blit_state *ctx,
-                           struct pipe_surface *src,
-                           struct pipe_sampler_view *src_sampler_view,
+                           struct pipe_resource *src_tex,
+                           struct pipe_subresource srcsub,
                            int srcX0, int srcY0,
                            int srcX1, int srcY1,
+                           int srcZ0,
                            struct pipe_surface *dst,
                            int dstX0, int dstY0,
                            int dstX1, int dstY1,
@@ -292,23 +304,25 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_screen *screen = pipe->screen;
    struct pipe_sampler_view *sampler_view = NULL;
+   struct pipe_sampler_view sv_templ;
    struct pipe_framebuffer_state fb;
    const int srcW = abs(srcX1 - srcX0);
    const int srcH = abs(srcY1 - srcY0);
    unsigned offset;
-   boolean overlap;
+   boolean overlap, dst_is_depth;
    float s0, t0, s1, t1;
+   boolean normalized;
 
    assert(filter == PIPE_TEX_MIPFILTER_NEAREST ||
           filter == PIPE_TEX_MIPFILTER_LINEAR);
 
-   assert(screen->is_format_supported(screen, src->format, PIPE_TEXTURE_2D,
-                                      PIPE_BIND_SAMPLER_VIEW, 0));
-   assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D,
-                                      PIPE_BIND_RENDER_TARGET, 0));
+   assert(srcsub.level <= src_tex->last_level);
 
    /* do the regions overlap? */
-   overlap = util_same_surface(src, dst) &&
+   overlap = src_tex == dst->texture &&
+             dst->face == srcsub.face &&
+             dst->level == srcsub.level &&
+             dst->zslice == srcZ0 &&
       regions_overlap(srcX0, srcY0, srcX1, srcY1,
                       dstX0, dstY0, dstX1, dstY1);
 
@@ -317,8 +331,7 @@ util_blit_pixels_writemask(struct blit_state *ctx,
     * no overlapping.
     * Filter mode should not matter since there's no stretching.
     */
-   if (pipe->surface_copy &&
-       dst->format == src->format &&
+   if (dst->format == src_tex->format &&
        srcX0 < srcX1 &&
        dstX0 < dstX1 &&
        srcY0 < srcY1 &&
@@ -326,29 +339,36 @@ util_blit_pixels_writemask(struct blit_state *ctx,
        (dstX1 - dstX0) == (srcX1 - srcX0) &&
        (dstY1 - dstY0) == (srcY1 - srcY0) &&
        !overlap) {
-      pipe->surface_copy(pipe,
-			 dst, dstX0, dstY0, /* dest */
-			 src, srcX0, srcY0, /* src */
-			 srcW, srcH);       /* size */
+      struct pipe_subresource subdst;
+      subdst.face = dst->face;
+      subdst.level = dst->level;
+      pipe->resource_copy_region(pipe,
+                                 dst->texture, subdst,
+                                 dstX0, dstY0, dst->zslice,/* dest */
+                                 src_tex, srcsub,
+                                 srcX0, srcY0, srcZ0,/* src */
+                                 srcW, srcH);       /* size */
       return;
    }
-   
-   assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D,
-                                      PIPE_BIND_RENDER_TARGET, 0));
 
    /* Create a temporary texture when src and dest alias or when src
-    * is anything other than a single-level 2d texture.
+    * is anything other than a 2d or rect texture.
+    * XXX should just use appropriate shader to access 1d / 3d slice / cube face,
+    * much like the u_blitter code does (should be pretty trivial).
     * 
     * This can still be improved upon.
     */
-   if (util_same_surface(src, dst) ||
-       src->texture->target != PIPE_TEXTURE_2D ||
-       src->texture->last_level != 0)
+   if ((src_tex == dst->texture &&
+       dst->face == srcsub.face &&
+       dst->level == srcsub.level &&
+       dst->zslice == srcZ0) ||
+       (src_tex->target != PIPE_TEXTURE_2D &&
+       src_tex->target != PIPE_TEXTURE_RECT))
    {
       struct pipe_resource texTemp;
       struct pipe_resource *tex;
       struct pipe_sampler_view sv_templ;
-      struct pipe_surface *texSurf;
+      struct pipe_subresource texsub;
       const int srcLeft = MIN2(srcX0, srcX1);
       const int srcTop = MIN2(srcY0, srcY1);
 
@@ -368,8 +388,8 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 
       /* create temp texture */
       memset(&texTemp, 0, sizeof(texTemp));
-      texTemp.target = PIPE_TEXTURE_2D;
-      texTemp.format = src->format;
+      texTemp.target = ctx->internal_target;
+      texTemp.format = src_tex->format;
       texTemp.last_level = 0;
       texTemp.width0 = srcW;
       texTemp.height0 = srcH;
@@ -380,50 +400,69 @@ util_blit_pixels_writemask(struct blit_state *ctx,
       if (!tex)
          return;
 
+      texsub.face = 0;
+      texsub.level = 0;
+      /* load temp texture */
+      pipe->resource_copy_region(pipe,
+                                 tex, texsub, 0, 0, 0,  /* dest */
+                                 src_tex, srcsub, srcLeft, srcTop, srcZ0, /* src */
+                                 srcW, srcH);     /* size */
+
+      normalized = tex->target != PIPE_TEXTURE_RECT;
+      if(normalized) {
+         s0 = 0.0f;
+         s1 = 1.0f;
+         t0 = 0.0f;
+         t1 = 1.0f;
+      }
+      else {
+         s0 = 0;
+         s1 = srcW;
+         t0 = 0;
+         t1 = srcH;
+      }
+
       u_sampler_view_default_template(&sv_templ, tex, tex->format);
+      sampler_view = pipe->create_sampler_view(pipe, tex, &sv_templ);
 
-      sampler_view = ctx->pipe->create_sampler_view(ctx->pipe, tex, &sv_templ);
       if (!sampler_view) {
          pipe_resource_reference(&tex, NULL);
          return;
       }
-
-      texSurf = screen->get_tex_surface(screen, tex, 0, 0, 0, 
-                                        PIPE_BIND_BLIT_DESTINATION);
-
-      /* load temp texture */
-      if (pipe->surface_copy) {
-         pipe->surface_copy(pipe,
-                            texSurf, 0, 0,   /* dest */
-                            src, srcLeft, srcTop, /* src */
-                            srcW, srcH);     /* size */
-      } else {
-         util_surface_copy(pipe, FALSE,
-                           texSurf, 0, 0,   /* dest */
-                           src, srcLeft, srcTop, /* src */
-                           srcW, srcH);     /* size */
-      }
-
-      /* free the surface, update the texture if necessary.
-       */
-      pipe_surface_reference(&texSurf, NULL);
-      s0 = 0.0f; 
-      s1 = 1.0f;
-      t0 = 0.0f;
-      t1 = 1.0f;
-
       pipe_resource_reference(&tex, NULL);
    }
    else {
-      pipe_sampler_view_reference(&sampler_view, src_sampler_view);
-      s0 = srcX0 / (float)src->texture->width0;
-      s1 = srcX1 / (float)src->texture->width0;
-      t0 = srcY0 / (float)src->texture->height0;
-      t1 = srcY1 / (float)src->texture->height0;
+      u_sampler_view_default_template(&sv_templ, src_tex, src_tex->format);
+      sv_templ.first_level = sv_templ.last_level = srcsub.level;
+      sampler_view = pipe->create_sampler_view(pipe, src_tex, &sv_templ);
+
+      if (!sampler_view) {
+         return;
+      }
+
+      s0 = srcX0;
+      s1 = srcX1;
+      t0 = srcY0;
+      t1 = srcY1;
+      normalized = sampler_view->texture->target != PIPE_TEXTURE_RECT;
+      if(normalized)
+      {
+         s0 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level));
+         s1 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level));
+         t0 /= (float)(u_minify(sampler_view->texture->height0, srcsub.level));
+         t1 /= (float)(u_minify(sampler_view->texture->height0, srcsub.level));
+      }
    }
 
-   
+   dst_is_depth = util_format_is_depth_or_stencil(dst->format);
 
+   assert(screen->is_format_supported(screen, sampler_view->format, ctx->internal_target,
+                                      sampler_view->texture->nr_samples,
+                                      PIPE_BIND_SAMPLER_VIEW, 0));
+   assert(screen->is_format_supported(screen, dst->format, ctx->internal_target,
+                                      dst->texture->nr_samples,
+                                      dst_is_depth ? PIPE_BIND_DEPTH_STENCIL :
+                                                     PIPE_BIND_RENDER_TARGET, 0));
    /* save state (restored below) */
    cso_save_blend(ctx->cso);
    cso_save_depth_stencil_alpha(ctx->cso);
@@ -439,14 +478,20 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 
    /* set misc state we care about */
    cso_set_blend(ctx->cso, &ctx->blend);
-   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
+   cso_set_depth_stencil_alpha(ctx->cso,
+                               dst_is_depth ? &ctx->depthstencil_write :
+                                              &ctx->depthstencil_keep);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
    cso_set_clip(ctx->cso, &ctx->clip);
    cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
 
    /* sampler */
+   ctx->sampler.normalized_coords = normalized;
    ctx->sampler.min_img_filter = filter;
    ctx->sampler.mag_img_filter = filter;
+   /* we've limited this already with the sampler view but you never know... */
+   ctx->sampler.min_lod = srcsub.level;
+   ctx->sampler.max_lod = srcsub.level;
    cso_single_sampler(ctx->cso, 0, &ctx->sampler);
    cso_single_sampler_done(ctx->cso);
 
@@ -464,21 +509,35 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    /* texture */
    cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view);
 
-   if (ctx->fs[writemask] == NULL)
-      ctx->fs[writemask] =
-         util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D,
-                                                 writemask);
-
    /* shaders */
-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]);
+   if (dst_is_depth) {
+      if (ctx->fs_depth == NULL)
+         ctx->fs_depth =
+            util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_2D,
+                                                     TGSI_INTERPOLATE_LINEAR);
+
+      cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth);
+   } else {
+      if (ctx->fs[writemask] == NULL)
+         ctx->fs[writemask] =
+            util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D,
+                                                    TGSI_INTERPOLATE_LINEAR,
+                                                    writemask);
+
+      cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]);
+   }
    cso_set_vertex_shader_handle(ctx->cso, ctx->vs);
 
    /* drawing dest */
    memset(&fb, 0, sizeof(fb));
    fb.width = dst->width;
    fb.height = dst->height;
-   fb.nr_cbufs = 1;
-   fb.cbufs[0] = dst;
+   if (dst_is_depth) {
+      fb.zsbuf = dst;
+   } else {
+      fb.nr_cbufs = 1;
+      fb.cbufs[0] = dst;
+   }
    cso_set_framebuffer(ctx->cso, &fb);
 
    /* draw quad */
@@ -515,18 +574,21 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 
 void
 util_blit_pixels(struct blit_state *ctx,
-                 struct pipe_surface *src,
-                 struct pipe_sampler_view *src_sampler_view,
+                 struct pipe_resource *src_tex,
+                 struct pipe_subresource srcsub,
                  int srcX0, int srcY0,
                  int srcX1, int srcY1,
+                 int srcZ,
                  struct pipe_surface *dst,
                  int dstX0, int dstY0,
                  int dstX1, int dstY1,
                  float z, uint filter )
 {
-   util_blit_pixels_writemask( ctx, src, src_sampler_view,
+   util_blit_pixels_writemask( ctx, src_tex,
+                               srcsub,
                                srcX0, srcY0,
                                srcX1, srcY1,
+                               srcZ,
                                dst,
                                dstX0, dstY0,
                                dstX1, dstY1,
@@ -548,7 +610,6 @@ void util_blit_flush( struct blit_state *ctx )
 
 /**
  * Copy pixel block from src texture to dst surface.
- * Overlapping regions are acceptable.
  *
  * XXX Should support selection of level.
  * XXX need some control over blitting Z and/or stencil.
@@ -563,6 +624,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
                      int dstX1, int dstY1,
                      float z, uint filter)
 {
+   boolean normalized = src_sampler_view->texture->target != PIPE_TEXTURE_RECT;
    struct pipe_framebuffer_state fb;
    float s0, t0, s1, t1;
    unsigned offset;
@@ -575,13 +637,22 @@ util_blit_pixels_tex(struct blit_state *ctx,
    assert(tex->width0 != 0);
    assert(tex->height0 != 0);
 
-   s0 = srcX0 / (float)tex->width0;
-   s1 = srcX1 / (float)tex->width0;
-   t0 = srcY0 / (float)tex->height0;
-   t1 = srcY1 / (float)tex->height0;
+   s0 = srcX0;
+   s1 = srcX1;
+   t0 = srcY0;
+   t1 = srcY1;
+
+   if(normalized)
+   {
+      s0 /= (float)tex->width0;
+      s1 /= (float)tex->width0;
+      t0 /= (float)tex->height0;
+      t1 /= (float)tex->height0;
+   }
 
    assert(ctx->pipe->screen->is_format_supported(ctx->pipe->screen, dst->format,
                                                  PIPE_TEXTURE_2D,
+                                                 dst->texture->nr_samples,
                                                  PIPE_BIND_RENDER_TARGET,
                                                  0));
 
@@ -599,12 +670,13 @@ util_blit_pixels_tex(struct blit_state *ctx,
 
    /* set misc state we care about */
    cso_set_blend(ctx->cso, &ctx->blend);
-   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
+   cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil_keep);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
    cso_set_clip(ctx->cso, &ctx->clip);
    cso_set_vertex_elements(ctx->cso, 2, ctx->velem);
 
    /* sampler */
+   ctx->sampler.normalized_coords = normalized;
    ctx->sampler.min_img_filter = filter;
    ctx->sampler.mag_img_filter = filter;
    cso_single_sampler(ctx->cso, 0, &ctx->sampler);
diff --git a/src/gallium/auxiliary/util/u_blit.h b/src/gallium/auxiliary/util/u_blit.h
index 464ff9aaced..b8a0dfce13f 100644
--- a/src/gallium/auxiliary/util/u_blit.h
+++ b/src/gallium/auxiliary/util/u_blit.h
@@ -30,18 +30,20 @@
 #define U_BLIT_H
 
 
+#include "pipe/p_compiler.h"
+
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
    
+struct cso_context;
 struct pipe_context;
-struct pipe_surface;
 struct pipe_resource;
-struct cso_context;
-
-
-struct blit_state;
+struct pipe_sampler_view;
+struct pipe_subresource;
+struct pipe_surface;
 
 
 extern struct blit_state *
@@ -52,10 +54,11 @@ util_destroy_blit(struct blit_state *ctx);
 
 extern void
 util_blit_pixels(struct blit_state *ctx,
-                 struct pipe_surface *src,
-                 struct pipe_sampler_view *src_sampler_view,
+                 struct pipe_resource *src_tex,
+                 struct pipe_subresource srcsub,
                  int srcX0, int srcY0,
                  int srcX1, int srcY1,
+                 int srcZ0,
                  struct pipe_surface *dst,
                  int dstX0, int dstY0,
                  int dstX1, int dstY1,
@@ -63,10 +66,11 @@ util_blit_pixels(struct blit_state *ctx,
 
 void
 util_blit_pixels_writemask(struct blit_state *ctx,
-                           struct pipe_surface *src,
-                           struct pipe_sampler_view *src_sampler_view,
+                           struct pipe_resource *src_tex,
+                           struct pipe_subresource srcsub,
                            int srcX0, int srcY0,
                            int srcX1, int srcY1,
+                           int srcZ0,
                            struct pipe_surface *dst,
                            int dstX0, int dstY0,
                            int dstX1, int dstY1,
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 956aedc8a15..a163f93cb82 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -26,8 +26,8 @@
 
 /**
  * @file
- * Blitter utility to facilitate acceleration of the clear, surface_copy,
- * and surface_fill functions.
+ * Blitter utility to facilitate acceleration of the clear, clear_render_target,
+ * clear_depth_stencil, and resource_copy_region functions.
  *
  * @author Marek Olšák
  */
@@ -43,19 +43,17 @@
 #include "util/u_math.h"
 #include "util/u_blitter.h"
 #include "util/u_draw_quad.h"
-#include "util/u_pack_color.h"
-#include "util/u_rect.h"
 #include "util/u_sampler.h"
 #include "util/u_simple_shaders.h"
+#include "util/u_surface.h"
 #include "util/u_texture.h"
 
 #define INVALID_PTR ((void*)~0)
 
 struct blitter_context_priv
 {
-   struct blitter_context blitter;
+   struct blitter_context base;
 
-   struct pipe_context *pipe; /**< pipe context */
    struct pipe_resource *vbuf;  /**< quad */
 
    float vertices[4][2][4];   /**< {pos, color} or {pos, texcoord} */
@@ -69,8 +67,8 @@ struct blitter_context_priv
    void *vs_tex; /**< Vertex shader which passes {pos, texcoord} to the output.*/
 
    /* Fragment shaders. */
-   /* FS which outputs a color to multiple color buffers. */
-   void *fs_col[PIPE_MAX_COLOR_BUFS];
+   /* The shader at index i outputs color to color buffers 0,1,...,i-1. */
+   void *fs_col[PIPE_MAX_COLOR_BUFS+1];
 
    /* FS which outputs a color from a texture,
       where the index is PIPE_TEXTURE_* to be sampled. */
@@ -88,24 +86,36 @@ struct blitter_context_priv
    void *dsa_write_depth_stencil;
    void *dsa_write_depth_keep_stencil;
    void *dsa_keep_depth_stencil;
+   void *dsa_keep_depth_write_stencil;
+   void *dsa_flush_depth_stencil;
 
    void *velem_state;
 
    /* Sampler state for clamping to a miplevel. */
-   void *sampler_state[PIPE_MAX_TEXTURE_LEVELS];
+   void *sampler_state[PIPE_MAX_TEXTURE_LEVELS * 2];
 
    /* Rasterizer state. */
    void *rs_state;
 
-   struct pipe_sampler_view *sampler_view;
-
    /* Viewport state. */
    struct pipe_viewport_state viewport;
 
    /* Clip state. */
    struct pipe_clip_state clip;
+
+   /* Destination surface dimensions. */
+   unsigned dst_width;
+   unsigned dst_height;
 };
 
+static void blitter_draw_rectangle(struct blitter_context *blitter,
+                                   unsigned x, unsigned y,
+                                   unsigned width, unsigned height,
+                                   float depth,
+                                   enum blitter_attrib_type type,
+                                   const float attrib[4]);
+
+
 struct blitter_context *util_blitter_create(struct pipe_context *pipe)
 {
    struct blitter_context_priv *ctx;
@@ -120,19 +130,20 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    if (!ctx)
       return NULL;
 
-   ctx->pipe = pipe;
+   ctx->base.pipe = pipe;
+   ctx->base.draw_rectangle = blitter_draw_rectangle;
 
    /* init state objects for them to be considered invalid */
-   ctx->blitter.saved_blend_state = INVALID_PTR;
-   ctx->blitter.saved_dsa_state = INVALID_PTR;
-   ctx->blitter.saved_rs_state = INVALID_PTR;
-   ctx->blitter.saved_fs = INVALID_PTR;
-   ctx->blitter.saved_vs = INVALID_PTR;
-   ctx->blitter.saved_velem_state = INVALID_PTR;
-   ctx->blitter.saved_fb_state.nr_cbufs = ~0;
-   ctx->blitter.saved_num_sampler_views = ~0;
-   ctx->blitter.saved_num_sampler_states = ~0;
-   ctx->blitter.saved_num_vertex_buffers = ~0;
+   ctx->base.saved_blend_state = INVALID_PTR;
+   ctx->base.saved_dsa_state = INVALID_PTR;
+   ctx->base.saved_rs_state = INVALID_PTR;
+   ctx->base.saved_fs = INVALID_PTR;
+   ctx->base.saved_vs = INVALID_PTR;
+   ctx->base.saved_velem_state = INVALID_PTR;
+   ctx->base.saved_fb_state.nr_cbufs = ~0;
+   ctx->base.saved_num_sampler_views = ~0;
+   ctx->base.saved_num_sampler_states = ~0;
+   ctx->base.saved_num_vertex_buffers = ~0;
 
    /* blend state objects */
    memset(&blend, 0, sizeof(blend));
@@ -146,6 +157,10 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    ctx->dsa_keep_depth_stencil =
       pipe->create_depth_stencil_alpha_state(pipe, &dsa);
 
+   dsa.depth.writemask = 1;
+   ctx->dsa_flush_depth_stencil =
+      pipe->create_depth_stencil_alpha_state(pipe, &dsa);
+
    dsa.depth.enabled = 1;
    dsa.depth.writemask = 1;
    dsa.depth.func = PIPE_FUNC_ALWAYS;
@@ -161,8 +176,12 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    dsa.stencil[0].writemask = 0xff;
    ctx->dsa_write_depth_stencil =
       pipe->create_depth_stencil_alpha_state(pipe, &dsa);
-   /* The DSA state objects which write depth and stencil are created
-    * on-demand. */
+
+
+   dsa.depth.enabled = 0;
+   dsa.depth.writemask = 0;
+   ctx->dsa_keep_depth_write_stencil =
+      pipe->create_depth_stencil_alpha_state(pipe, &dsa);
 
    /* sampler state */
    sampler_state = &ctx->template_sampler_state;
@@ -175,8 +194,7 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
 
    /* rasterizer state */
    memset(&rs_state, 0, sizeof(rs_state));
-   rs_state.front_winding = PIPE_WINDING_CW;
-   rs_state.cull_mode = PIPE_WINDING_NONE;
+   rs_state.cull_face = PIPE_FACE_NONE;
    rs_state.gl_rasterization_rules = 1;
    rs_state.flatshade = 1;
    ctx->rs_state = pipe->create_rasterizer_state(pipe, &rs_state);
@@ -216,17 +234,17 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
       ctx->vertices[i][0][3] = 1; /*v.w*/
 
    /* create the vertex buffer */
-   ctx->vbuf = pipe_buffer_create(ctx->pipe->screen,
+   ctx->vbuf = pipe_buffer_create(ctx->base.pipe->screen,
                                   PIPE_BIND_VERTEX_BUFFER,
                                   sizeof(ctx->vertices));
 
-   return &ctx->blitter;
+   return &ctx->base;
 }
 
 void util_blitter_destroy(struct blitter_context *blitter)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = blitter->pipe;
    int i;
 
    pipe->delete_blend_state(pipe, ctx->blend_write_color);
@@ -235,6 +253,8 @@ void util_blitter_destroy(struct blitter_context *blitter)
    pipe->delete_depth_stencil_alpha_state(pipe,
                                           ctx->dsa_write_depth_keep_stencil);
    pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil);
+   pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_write_stencil);
+   pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_flush_depth_stencil);
 
    pipe->delete_rasterizer_state(pipe, ctx->rs_state);
    pipe->delete_vs_state(pipe, ctx->vs_col);
@@ -248,18 +268,14 @@ void util_blitter_destroy(struct blitter_context *blitter)
          pipe->delete_fs_state(pipe, ctx->fs_texfetch_depth[i]);
    }
 
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS && ctx->fs_col[i]; i++)
+   for (i = 0; i <= PIPE_MAX_COLOR_BUFS; i++)
       if (ctx->fs_col[i])
          pipe->delete_fs_state(pipe, ctx->fs_col[i]);
 
-   for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++)
+   for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS * 2; i++)
       if (ctx->sampler_state[i])
          pipe->delete_sampler_state(pipe, ctx->sampler_state[i]);
 
-   if (ctx->sampler_view) {
-      pipe_sampler_view_reference(&ctx->sampler_view, NULL);
-   }
-
    pipe_resource_reference(&ctx->vbuf, NULL);
    FREE(ctx);
 }
@@ -267,104 +283,117 @@ void util_blitter_destroy(struct blitter_context *blitter)
 static void blitter_check_saved_CSOs(struct blitter_context_priv *ctx)
 {
    /* make sure these CSOs have been saved */
-   assert(ctx->blitter.saved_blend_state != INVALID_PTR &&
-          ctx->blitter.saved_dsa_state != INVALID_PTR &&
-          ctx->blitter.saved_rs_state != INVALID_PTR &&
-          ctx->blitter.saved_fs != INVALID_PTR &&
-          ctx->blitter.saved_vs != INVALID_PTR &&
-          ctx->blitter.saved_velem_state != INVALID_PTR);
+   assert(ctx->base.saved_blend_state != INVALID_PTR &&
+          ctx->base.saved_dsa_state != INVALID_PTR &&
+          ctx->base.saved_rs_state != INVALID_PTR &&
+          ctx->base.saved_fs != INVALID_PTR &&
+          ctx->base.saved_vs != INVALID_PTR &&
+          ctx->base.saved_velem_state != INVALID_PTR);
 }
 
 static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
+   unsigned i;
 
    /* restore the state objects which are always required to be saved */
-   pipe->bind_blend_state(pipe, ctx->blitter.saved_blend_state);
-   pipe->bind_depth_stencil_alpha_state(pipe, ctx->blitter.saved_dsa_state);
-   pipe->bind_rasterizer_state(pipe, ctx->blitter.saved_rs_state);
-   pipe->bind_fs_state(pipe, ctx->blitter.saved_fs);
-   pipe->bind_vs_state(pipe, ctx->blitter.saved_vs);
-   pipe->bind_vertex_elements_state(pipe, ctx->blitter.saved_velem_state);
+   pipe->bind_blend_state(pipe, ctx->base.saved_blend_state);
+   pipe->bind_depth_stencil_alpha_state(pipe, ctx->base.saved_dsa_state);
+   pipe->bind_rasterizer_state(pipe, ctx->base.saved_rs_state);
+   pipe->bind_fs_state(pipe, ctx->base.saved_fs);
+   pipe->bind_vs_state(pipe, ctx->base.saved_vs);
+   pipe->bind_vertex_elements_state(pipe, ctx->base.saved_velem_state);
 
-   ctx->blitter.saved_blend_state = INVALID_PTR;
-   ctx->blitter.saved_dsa_state = INVALID_PTR;
-   ctx->blitter.saved_rs_state = INVALID_PTR;
-   ctx->blitter.saved_fs = INVALID_PTR;
-   ctx->blitter.saved_vs = INVALID_PTR;
-   ctx->blitter.saved_velem_state = INVALID_PTR;
+   ctx->base.saved_blend_state = INVALID_PTR;
+   ctx->base.saved_dsa_state = INVALID_PTR;
+   ctx->base.saved_rs_state = INVALID_PTR;
+   ctx->base.saved_fs = INVALID_PTR;
+   ctx->base.saved_vs = INVALID_PTR;
+   ctx->base.saved_velem_state = INVALID_PTR;
 
-   pipe->set_stencil_ref(pipe, &ctx->blitter.saved_stencil_ref);
+   pipe->set_stencil_ref(pipe, &ctx->base.saved_stencil_ref);
 
-   pipe->set_viewport_state(pipe, &ctx->blitter.saved_viewport);
-   pipe->set_clip_state(pipe, &ctx->blitter.saved_clip);
+   pipe->set_viewport_state(pipe, &ctx->base.saved_viewport);
+   pipe->set_clip_state(pipe, &ctx->base.saved_clip);
 
    /* restore the state objects which are required to be saved before copy/fill
     */
-   if (ctx->blitter.saved_fb_state.nr_cbufs != ~0) {
-      pipe->set_framebuffer_state(pipe, &ctx->blitter.saved_fb_state);
-      ctx->blitter.saved_fb_state.nr_cbufs = ~0;
+   if (ctx->base.saved_fb_state.nr_cbufs != ~0) {
+      pipe->set_framebuffer_state(pipe, &ctx->base.saved_fb_state);
+      util_unreference_framebuffer_state(&ctx->base.saved_fb_state);
+      ctx->base.saved_fb_state.nr_cbufs = ~0;
    }
 
-   if (ctx->blitter.saved_num_sampler_states != ~0) {
+   if (ctx->base.saved_num_sampler_states != ~0) {
       pipe->bind_fragment_sampler_states(pipe,
-                                         ctx->blitter.saved_num_sampler_states,
-                                         ctx->blitter.saved_sampler_states);
-      ctx->blitter.saved_num_sampler_states = ~0;
+                                         ctx->base.saved_num_sampler_states,
+                                         ctx->base.saved_sampler_states);
+      ctx->base.saved_num_sampler_states = ~0;
    }
 
-   if (ctx->blitter.saved_num_sampler_views != ~0) {
+   if (ctx->base.saved_num_sampler_views != ~0) {
       pipe->set_fragment_sampler_views(pipe,
-                                       ctx->blitter.saved_num_sampler_views,
-                                       ctx->blitter.saved_sampler_views);
-      ctx->blitter.saved_num_sampler_views = ~0;
+                                       ctx->base.saved_num_sampler_views,
+                                       ctx->base.saved_sampler_views);
+
+      for (i = 0; i < ctx->base.saved_num_sampler_views; i++)
+         pipe_sampler_view_reference(&ctx->base.saved_sampler_views[i],
+                                     NULL);
+
+      ctx->base.saved_num_sampler_views = ~0;
    }
 
-   if (ctx->blitter.saved_num_vertex_buffers != ~0) {
+   if (ctx->base.saved_num_vertex_buffers != ~0) {
       pipe->set_vertex_buffers(pipe,
-                                       ctx->blitter.saved_num_vertex_buffers,
-                                       ctx->blitter.saved_vertex_buffers);
-      ctx->blitter.saved_num_vertex_buffers = ~0;
+                               ctx->base.saved_num_vertex_buffers,
+                               ctx->base.saved_vertex_buffers);
+
+      for (i = 0; i < ctx->base.saved_num_vertex_buffers; i++) {
+         if (ctx->base.saved_vertex_buffers[i].buffer) {
+            pipe_resource_reference(&ctx->base.saved_vertex_buffers[i].buffer,
+                                    NULL);
+         }
+      }
+      ctx->base.saved_num_vertex_buffers = ~0;
    }
 }
 
 static void blitter_set_rectangle(struct blitter_context_priv *ctx,
                                   unsigned x1, unsigned y1,
                                   unsigned x2, unsigned y2,
-                                  unsigned width, unsigned height,
                                   float depth)
 {
    int i;
 
    /* set vertex positions */
-   ctx->vertices[0][0][0] = (float)x1 / width * 2.0f - 1.0f; /*v0.x*/
-   ctx->vertices[0][0][1] = (float)y1 / height * 2.0f - 1.0f; /*v0.y*/
+   ctx->vertices[0][0][0] = (float)x1 / ctx->dst_width * 2.0f - 1.0f; /*v0.x*/
+   ctx->vertices[0][0][1] = (float)y1 / ctx->dst_height * 2.0f - 1.0f; /*v0.y*/
 
-   ctx->vertices[1][0][0] = (float)x2 / width * 2.0f - 1.0f; /*v1.x*/
-   ctx->vertices[1][0][1] = (float)y1 / height * 2.0f - 1.0f; /*v1.y*/
+   ctx->vertices[1][0][0] = (float)x2 / ctx->dst_width * 2.0f - 1.0f; /*v1.x*/
+   ctx->vertices[1][0][1] = (float)y1 / ctx->dst_height * 2.0f - 1.0f; /*v1.y*/
 
-   ctx->vertices[2][0][0] = (float)x2 / width * 2.0f - 1.0f; /*v2.x*/
-   ctx->vertices[2][0][1] = (float)y2 / height * 2.0f - 1.0f; /*v2.y*/
+   ctx->vertices[2][0][0] = (float)x2 / ctx->dst_width * 2.0f - 1.0f; /*v2.x*/
+   ctx->vertices[2][0][1] = (float)y2 / ctx->dst_height * 2.0f - 1.0f; /*v2.y*/
 
-   ctx->vertices[3][0][0] = (float)x1 / width * 2.0f - 1.0f; /*v3.x*/
-   ctx->vertices[3][0][1] = (float)y2 / height * 2.0f - 1.0f; /*v3.y*/
+   ctx->vertices[3][0][0] = (float)x1 / ctx->dst_width * 2.0f - 1.0f; /*v3.x*/
+   ctx->vertices[3][0][1] = (float)y2 / ctx->dst_height * 2.0f - 1.0f; /*v3.y*/
 
    for (i = 0; i < 4; i++)
       ctx->vertices[i][0][2] = depth; /*z*/
 
    /* viewport */
-   ctx->viewport.scale[0] = 0.5f * width;
-   ctx->viewport.scale[1] = 0.5f * height;
+   ctx->viewport.scale[0] = 0.5f * ctx->dst_width;
+   ctx->viewport.scale[1] = 0.5f * ctx->dst_height;
    ctx->viewport.scale[2] = 1.0f;
    ctx->viewport.scale[3] = 1.0f;
-   ctx->viewport.translate[0] = 0.5f * width;
-   ctx->viewport.translate[1] = 0.5f * height;
+   ctx->viewport.translate[0] = 0.5f * ctx->dst_width;
+   ctx->viewport.translate[1] = 0.5f * ctx->dst_height;
    ctx->viewport.translate[2] = 0.0f;
    ctx->viewport.translate[3] = 0.0f;
-   ctx->pipe->set_viewport_state(ctx->pipe, &ctx->viewport);
+   ctx->base.pipe->set_viewport_state(ctx->base.pipe, &ctx->viewport);
 
    /* clip */
-   ctx->pipe->set_clip_state(ctx->pipe, &ctx->clip);
+   ctx->base.pipe->set_clip_state(ctx->base.pipe, &ctx->clip);
 }
 
 static void blitter_set_clear_color(struct blitter_context_priv *ctx,
@@ -372,36 +401,72 @@ static void blitter_set_clear_color(struct blitter_context_priv *ctx,
 {
    int i;
 
-   for (i = 0; i < 4; i++) {
-      ctx->vertices[i][1][0] = rgba[0];
-      ctx->vertices[i][1][1] = rgba[1];
-      ctx->vertices[i][1][2] = rgba[2];
-      ctx->vertices[i][1][3] = rgba[3];
+   if (rgba) {
+      for (i = 0; i < 4; i++) {
+         ctx->vertices[i][1][0] = rgba[0];
+         ctx->vertices[i][1][1] = rgba[1];
+         ctx->vertices[i][1][2] = rgba[2];
+         ctx->vertices[i][1][3] = rgba[3];
+      }
+   } else {
+      for (i = 0; i < 4; i++) {
+         ctx->vertices[i][1][0] = 0;
+         ctx->vertices[i][1][1] = 0;
+         ctx->vertices[i][1][2] = 0;
+         ctx->vertices[i][1][3] = 0;
+      }
    }
 }
 
-static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx,
-                                     struct pipe_surface *surf,
+static void get_texcoords(struct pipe_resource *src,
+                                     struct pipe_subresource subsrc,
                                      unsigned x1, unsigned y1,
-                                     unsigned x2, unsigned y2)
+                                     unsigned x2, unsigned y2,
+                                     boolean normalized, float out[4])
 {
-   int i;
-   float s1 = x1 / (float)surf->width;
-   float t1 = y1 / (float)surf->height;
-   float s2 = x2 / (float)surf->width;
-   float t2 = y2 / (float)surf->height;
-
-   ctx->vertices[0][1][0] = s1; /*t0.s*/
-   ctx->vertices[0][1][1] = t1; /*t0.t*/
+   if(normalized)
+   {
+      out[0] = x1 / (float)u_minify(src->width0,  subsrc.level);
+      out[1] = y1 / (float)u_minify(src->height0, subsrc.level);
+      out[2] = x2 / (float)u_minify(src->width0,  subsrc.level);
+      out[3] = y2 / (float)u_minify(src->height0, subsrc.level);
+   }
+   else
+   {
+      out[0] = x1;
+      out[1] = y1;
+      out[2] = x2;
+      out[3] = y2;
+   }
+}
 
-   ctx->vertices[1][1][0] = s2; /*t1.s*/
-   ctx->vertices[1][1][1] = t1; /*t1.t*/
+static void set_texcoords_in_vertices(const float coord[4], /* {s1, t1, s2, t2} */
+                                      float *out, unsigned stride)
+{ /* Write the four corner (s, t) pairs of the rectangle, one pair per vertex. */
+   out[0] = coord[0]; /*t0.s*/
+   out[1] = coord[1]; /*t0.t*/
+   out += stride; /* step to the next vertex; stride is counted in floats */
+   out[0] = coord[2]; /*t1.s*/
+   out[1] = coord[1]; /*t1.t*/
+   out += stride;
+   out[0] = coord[2]; /*t2.s*/
+   out[1] = coord[3]; /*t2.t*/
+   out += stride;
+   out[0] = coord[0]; /*t3.s*/
+   out[1] = coord[3]; /*t3.t*/
+}
 
-   ctx->vertices[2][1][0] = s2; /*t2.s*/
-   ctx->vertices[2][1][1] = t2; /*t2.t*/
+static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx,
+                                     struct pipe_resource *src,
+                                     struct pipe_subresource subsrc,
+                                     unsigned x1, unsigned y1,
+                                     unsigned x2, unsigned y2)
+{
+   unsigned i;
+   float coord[4];
 
-   ctx->vertices[3][1][0] = s1; /*t3.s*/
-   ctx->vertices[3][1][1] = t2; /*t3.t*/
+   get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord);
+   set_texcoords_in_vertices(coord, &ctx->vertices[0][1][0], 8);
 
    for (i = 0; i < 4; i++) {
       ctx->vertices[i][1][2] = 0; /*r*/
@@ -410,42 +475,35 @@ static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx,
 }
 
 static void blitter_set_texcoords_3d(struct blitter_context_priv *ctx,
-                                     struct pipe_surface *surf,
+                                     struct pipe_resource *src,
+                                     struct pipe_subresource subsrc,
+                                     unsigned zslice,
                                      unsigned x1, unsigned y1,
                                      unsigned x2, unsigned y2)
 {
    int i;
-   float depth = u_minify(surf->texture->depth0, surf->level);
-   float r = surf->zslice / depth;
+   float r = zslice / (float)u_minify(src->depth0, subsrc.level);
 
-   blitter_set_texcoords_2d(ctx, surf, x1, y1, x2, y2);
+   blitter_set_texcoords_2d(ctx, src, subsrc, x1, y1, x2, y2);
 
    for (i = 0; i < 4; i++)
       ctx->vertices[i][1][2] = r; /*r*/
 }
 
 static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx,
-                                       struct pipe_surface *surf,
+                                       struct pipe_resource *src,
+                                       struct pipe_subresource subsrc,
                                        unsigned x1, unsigned y1,
                                        unsigned x2, unsigned y2)
 {
    int i;
-   float s1 = x1 / (float)surf->width;
-   float t1 = y1 / (float)surf->height;
-   float s2 = x2 / (float)surf->width;
-   float t2 = y2 / (float)surf->height;
+   float coord[4];
    float st[4][2];
 
-   st[0][0] = s1;
-   st[0][1] = t1;
-   st[1][0] = s2;
-   st[1][1] = t1;
-   st[2][0] = s2;
-   st[2][1] = t2;
-   st[3][0] = s1;
-   st[3][1] = t2;
+   get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord);
+   set_texcoords_in_vertices(coord, &st[0][0], 2);
 
-   util_map_texcoords2d_onto_cubemap(surf->face,
+   util_map_texcoords2d_onto_cubemap(subsrc.face,
                                      /* pointer, stride in floats */
                                      &st[0][0], 2,
                                      &ctx->vertices[0][1][0], 8);
@@ -454,9 +512,16 @@ static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx,
       ctx->vertices[i][1][3] = 1; /*q*/
 }
 
+static void blitter_set_dst_dimensions(struct blitter_context_priv *ctx,
+                                       unsigned width, unsigned height)
+{ /* Record the destination size in the context for later quad setup. */
+   ctx->dst_width = width; /* used to normalize x positions and for the viewport x scale/translate */
+   ctx->dst_height = height; /* same, for y */
+}
+
 static void blitter_draw_quad(struct blitter_context_priv *ctx)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
 
    /* write vertices and draw them */
    pipe_buffer_write(pipe, ctx->vbuf,
@@ -469,72 +534,79 @@ static void blitter_draw_quad(struct blitter_context_priv *ctx)
 
 static INLINE
 void **blitter_get_sampler_state(struct blitter_context_priv *ctx,
-                                 int miplevel)
+                                 int miplevel, boolean normalized)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
    struct pipe_sampler_state *sampler_state = &ctx->template_sampler_state;
 
    assert(miplevel < PIPE_MAX_TEXTURE_LEVELS);
 
    /* Create the sampler state on-demand. */
-   if (!ctx->sampler_state[miplevel]) {
+   if (!ctx->sampler_state[miplevel * 2 + normalized]) {
       sampler_state->lod_bias = miplevel;
       sampler_state->min_lod = miplevel;
       sampler_state->max_lod = miplevel;
+      sampler_state->normalized_coords = normalized;
 
-      ctx->sampler_state[miplevel] = pipe->create_sampler_state(pipe,
+      ctx->sampler_state[miplevel * 2 + normalized] = pipe->create_sampler_state(pipe,
                                                                 sampler_state);
    }
 
    /* Return void** so that it can be passed to bind_fragment_sampler_states
     * directly. */
-   return &ctx->sampler_state[miplevel];
+   return &ctx->sampler_state[miplevel * 2 + normalized];
 }
 
 static INLINE
 void *blitter_get_fs_col(struct blitter_context_priv *ctx, unsigned num_cbufs)
 {
-   struct pipe_context *pipe = ctx->pipe;
-   unsigned index = num_cbufs ? num_cbufs - 1 : 0;
+   struct pipe_context *pipe = ctx->base.pipe;
 
    assert(num_cbufs <= PIPE_MAX_COLOR_BUFS);
 
-   if (!ctx->fs_col[index])
-      ctx->fs_col[index] =
+   if (!ctx->fs_col[num_cbufs])
+      ctx->fs_col[num_cbufs] =
          util_make_fragment_clonecolor_shader(pipe, num_cbufs);
 
-   return ctx->fs_col[index];
+   return ctx->fs_col[num_cbufs];
 }
 
+/** Convert PIPE_TEXTURE_x to TGSI_TEXTURE_x (asserts on any target not listed below). */
+static unsigned
+pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target)
+{
+   switch (pipe_tex_target) {
+   case PIPE_TEXTURE_1D:
+      return TGSI_TEXTURE_1D;
+   case PIPE_TEXTURE_2D:
+      return TGSI_TEXTURE_2D;
+   case PIPE_TEXTURE_RECT:
+      return TGSI_TEXTURE_RECT;
+   case PIPE_TEXTURE_3D:
+      return TGSI_TEXTURE_3D;
+   case PIPE_TEXTURE_CUBE:
+      return TGSI_TEXTURE_CUBE;
+   default:
+      assert(0 && "unexpected texture target");
+      return TGSI_TEXTURE_UNKNOWN; /* fallback value if the assert does not abort */
+   }
+}
+
+
 static INLINE
 void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
                                   unsigned tex_target)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
 
    assert(tex_target < PIPE_MAX_TEXTURE_TYPES);
 
    /* Create the fragment shader on-demand. */
    if (!ctx->fs_texfetch_col[tex_target]) {
-      switch (tex_target) {
-         case PIPE_TEXTURE_1D:
-            ctx->fs_texfetch_col[PIPE_TEXTURE_1D] =
-               util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_1D);
-            break;
-         case PIPE_TEXTURE_2D:
-            ctx->fs_texfetch_col[PIPE_TEXTURE_2D] =
-               util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D);
-            break;
-         case PIPE_TEXTURE_3D:
-            ctx->fs_texfetch_col[PIPE_TEXTURE_3D] =
-               util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_3D);
-            break;
-         case PIPE_TEXTURE_CUBE:
-            ctx->fs_texfetch_col[PIPE_TEXTURE_CUBE] =
-               util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE);
-            break;
-         default:;
-      }
+      unsigned tgsi_tex = pipe_tex_to_tgsi_tex(tex_target);
+
+      ctx->fs_texfetch_col[tex_target] =
+        util_make_fragment_tex_shader(pipe, tgsi_tex, TGSI_INTERPOLATE_LINEAR);
    }
 
    return ctx->fs_texfetch_col[tex_target];
@@ -544,36 +616,47 @@ static INLINE
 void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
                                     unsigned tex_target)
 {
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
 
    assert(tex_target < PIPE_MAX_TEXTURE_TYPES);
 
    /* Create the fragment shader on-demand. */
    if (!ctx->fs_texfetch_depth[tex_target]) {
-      switch (tex_target) {
-         case PIPE_TEXTURE_1D:
-            ctx->fs_texfetch_depth[PIPE_TEXTURE_1D] =
-               util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_1D);
-            break;
-         case PIPE_TEXTURE_2D:
-            ctx->fs_texfetch_depth[PIPE_TEXTURE_2D] =
-               util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_2D);
-            break;
-         case PIPE_TEXTURE_3D:
-            ctx->fs_texfetch_depth[PIPE_TEXTURE_3D] =
-               util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_3D);
-            break;
-         case PIPE_TEXTURE_CUBE:
-            ctx->fs_texfetch_depth[PIPE_TEXTURE_CUBE] =
-               util_make_fragment_tex_shader_writedepth(pipe,TGSI_TEXTURE_CUBE);
-            break;
-         default:;
-      }
+      unsigned tgsi_tex = pipe_tex_to_tgsi_tex(tex_target);
+
+      ctx->fs_texfetch_depth[tex_target] =
+         util_make_fragment_tex_shader_writedepth(pipe, tgsi_tex,
+                                                  TGSI_INTERPOLATE_LINEAR);
    }
 
    return ctx->fs_texfetch_depth[tex_target];
 }
 
+static void blitter_draw_rectangle(struct blitter_context *blitter,
+                                   unsigned x1, unsigned y1,
+                                   unsigned x2, unsigned y2,
+                                   float depth,
+                                   enum blitter_attrib_type type,
+                                   const float attrib[4])
+{ /* Fill vertex attribute slot 1 according to 'type', then position the quad and draw it. */
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+
+   switch (type) {
+      case UTIL_BLITTER_ATTRIB_COLOR:
+         blitter_set_clear_color(ctx, attrib); /* attrib is an RGBA color; NULL writes zeros */
+         break;
+
+      case UTIL_BLITTER_ATTRIB_TEXCOORD:
+         set_texcoords_in_vertices(attrib, &ctx->vertices[0][1][0], 8); /* attrib = {s1, t1, s2, t2}; stride of 8 floats */
+         break;
+
+      default:; /* any other type (e.g. UTIL_BLITTER_ATTRIB_NONE): attribute slot left untouched */
+   }
+
+   blitter_set_rectangle(ctx, x1, y1, x2, y2, depth);
+   blitter_draw_quad(ctx);
+}
+
 void util_blitter_clear(struct blitter_context *blitter,
                         unsigned width, unsigned height,
                         unsigned num_cbufs,
@@ -582,7 +665,7 @@ void util_blitter_clear(struct blitter_context *blitter,
                         double depth, unsigned stencil)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
    struct pipe_stencil_ref sr = { { 0 } };
 
    assert(num_cbufs <= PIPE_MAX_COLOR_BUFS);
@@ -595,11 +678,19 @@ void util_blitter_clear(struct blitter_context *blitter,
    else
       pipe->bind_blend_state(pipe, ctx->blend_keep_color);
 
-   if (clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) {
+   if ((clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL) {
       sr.ref_value[0] = stencil & 0xff;
       pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil);
       pipe->set_stencil_ref(pipe, &sr);
    }
+   else if (clear_buffers & PIPE_CLEAR_DEPTH) {
+      pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_keep_stencil);
+   }
+   else if (clear_buffers & PIPE_CLEAR_STENCIL) {
+      sr.ref_value[0] = stencil & 0xff;
+      pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_write_stencil);
+      pipe->set_stencil_ref(pipe, &sr);
+   }
    else
       pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
 
@@ -608,244 +699,284 @@ void util_blitter_clear(struct blitter_context *blitter,
    pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, num_cbufs));
    pipe->bind_vs_state(pipe, ctx->vs_col);
 
-   blitter_set_clear_color(ctx, rgba);
-   blitter_set_rectangle(ctx, 0, 0, width, height, width, height, depth);
-   blitter_draw_quad(ctx);
+   blitter_set_dst_dimensions(ctx, width, height);
+   blitter->draw_rectangle(blitter, 0, 0, width, height, depth,
+                           UTIL_BLITTER_ATTRIB_COLOR, rgba);
    blitter_restore_CSOs(ctx);
 }
 
-static boolean
-is_overlap(unsigned sx1, unsigned sx2, unsigned sy1, unsigned sy2,
-           unsigned dx1, unsigned dx2, unsigned dy1, unsigned dy2)
+static
+boolean is_overlap(unsigned sx1, unsigned sx2, unsigned sy1, unsigned sy2,
+                   unsigned dx1, unsigned dx2, unsigned dy1, unsigned dy2)
 {
-    if (sx1 >= dx2 || sx2 <= dx1 || sy1 >= dy2 || sy2 <= dy1) {
-        return FALSE;
-    } else {
-        return TRUE;
-    }
+   return sx1 < dx2 && sx2 > dx1 && sy1 < dy2 && sy2 > dy1;
 }
 
-static void util_blitter_do_copy(struct blitter_context *blitter,
-				 struct pipe_surface *dst,
-				 unsigned dstx, unsigned dsty,
-				 struct pipe_surface *src,
-				 unsigned srcx, unsigned srcy,
-				 unsigned width, unsigned height,
-				 boolean is_depth)
+void util_blitter_copy_region(struct blitter_context *blitter,
+                              struct pipe_resource *dst,
+                              struct pipe_subresource subdst,
+                              unsigned dstx, unsigned dsty, unsigned dstz,
+                              struct pipe_resource *src,
+                              struct pipe_subresource subsrc,
+                              unsigned srcx, unsigned srcy, unsigned srcz,
+                              unsigned width, unsigned height,
+                              boolean ignore_stencil)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_context *pipe = ctx->base.pipe;
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_surface *dstsurf;
    struct pipe_framebuffer_state fb_state;
    struct pipe_sampler_view viewTempl, *view;
+   unsigned bind;
+   boolean is_stencil, is_depth;
+   boolean normalized;
+
+   /* Give up if textures are not set. */
+   assert(dst && src);
+   if (!dst || !src)
+      return;
 
+   /* Sanity checks. */
+   if (dst == src) {
+      assert(!is_overlap(srcx, srcx + width, srcy, srcy + height,
+                         dstx, dstx + width, dsty, dsty + height));
+   } else {
+      assert(dst->format == src->format);
+   }
+   assert(src->target < PIPE_MAX_TEXTURE_TYPES);
+
+   /* Is this a ZS format? */
+   is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
+   is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0;
+
+   if (is_depth || is_stencil)
+      bind = PIPE_BIND_DEPTH_STENCIL;
+   else
+      bind = PIPE_BIND_RENDER_TARGET;
+
+   /* Check if we can sample from and render to the surfaces. */
+   /* (assuming copying a stencil buffer is not possible) */
+    if ((!ignore_stencil && is_stencil) ||
+       !screen->is_format_supported(screen, dst->format, dst->target,
+                                    dst->nr_samples, bind, 0) ||
+       !screen->is_format_supported(screen, src->format, src->target,
+                                    src->nr_samples, PIPE_BIND_SAMPLER_VIEW, 0)) {
+      util_resource_copy_region(pipe, dst, subdst, dstx, dsty, dstz,
+                                src, subsrc, srcx, srcy, srcz, width, height);
+      return;
+   }
+
+   /* Get surfaces. */
+   dstsurf = screen->get_tex_surface(screen, dst,
+                                     subdst.face, subdst.level, dstz,
+                                     bind);
+
+   /* Check whether the states are properly saved. */
+   blitter_check_saved_CSOs(ctx);
    assert(blitter->saved_fb_state.nr_cbufs != ~0);
    assert(blitter->saved_num_sampler_views != ~0);
    assert(blitter->saved_num_sampler_states != ~0);
-   assert(src->texture->target < PIPE_MAX_TEXTURE_TYPES);
 
-   /* bind CSOs */
-   fb_state.width = dst->width;
-   fb_state.height = dst->height;
+   /* Initialize framebuffer state. */
+   fb_state.width = dstsurf->width;
+   fb_state.height = dstsurf->height;
 
    if (is_depth) {
       pipe->bind_blend_state(pipe, ctx->blend_keep_color);
       pipe->bind_depth_stencil_alpha_state(pipe,
                                            ctx->dsa_write_depth_keep_stencil);
       pipe->bind_fs_state(pipe,
-         blitter_get_fs_texfetch_depth(ctx, src->texture->target));
+                          blitter_get_fs_texfetch_depth(ctx, src->target));
 
       fb_state.nr_cbufs = 0;
-      fb_state.zsbuf = dst;
+      fb_state.zsbuf = dstsurf;
    } else {
       pipe->bind_blend_state(pipe, ctx->blend_write_color);
       pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
       pipe->bind_fs_state(pipe,
-         blitter_get_fs_texfetch_col(ctx, src->texture->target));
+                          blitter_get_fs_texfetch_col(ctx, src->target));
 
       fb_state.nr_cbufs = 1;
-      fb_state.cbufs[0] = dst;
+      fb_state.cbufs[0] = dstsurf;
       fb_state.zsbuf = 0;
    }
 
-   u_sampler_view_default_template(&viewTempl,
-                                   src->texture,
-                                   src->texture->format);
-   view = pipe->create_sampler_view(pipe,
-                                    src->texture,
-                                    &viewTempl);
+   normalized = src->target != PIPE_TEXTURE_RECT;
 
-   if (ctx->sampler_view) {
-      pipe_sampler_view_reference(&ctx->sampler_view, NULL);
-   }
-   ctx->sampler_view = view;
+   /* Initialize sampler view. */
+   u_sampler_view_default_template(&viewTempl, src, src->format);
+   view = pipe->create_sampler_view(pipe, src, &viewTempl);
 
+   /* Set rasterizer state, shaders, and textures. */
    pipe->bind_rasterizer_state(pipe, ctx->rs_state);
    pipe->bind_vs_state(pipe, ctx->vs_tex);
    pipe->bind_fragment_sampler_states(pipe, 1,
-      blitter_get_sampler_state(ctx, src->level));
+                                      blitter_get_sampler_state(ctx, subsrc.level, normalized));
    pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
    pipe->set_fragment_sampler_views(pipe, 1, &view);
    pipe->set_framebuffer_state(pipe, &fb_state);
 
-   /* set texture coordinates */
-   switch (src->texture->target) {
+   blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height);
+
+   switch (src->target) {
+      /* Draw the quad with the draw_rectangle callback. */
       case PIPE_TEXTURE_1D:
       case PIPE_TEXTURE_2D:
-         blitter_set_texcoords_2d(ctx, src, srcx, srcy,
-                                  srcx+width, srcy+height);
+      case PIPE_TEXTURE_RECT:
+         {
+            /* Set texture coordinates. */
+            float coord[4];
+            get_texcoords(src, subsrc, srcx, srcy,
+                                     srcx+width, srcy+height, normalized, coord);
+
+            /* Draw. */
+            blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0,
+                                    UTIL_BLITTER_ATTRIB_TEXCOORD, coord);
+         }
          break;
+
+      /* Draw the quad with the generic codepath. */
       case PIPE_TEXTURE_3D:
-         blitter_set_texcoords_3d(ctx, src, srcx, srcy,
-                                  srcx+width, srcy+height);
-         break;
       case PIPE_TEXTURE_CUBE:
-         blitter_set_texcoords_cube(ctx, src, srcx, srcy,
-                                    srcx+width, srcy+height);
+         /* Set texture coordinates. */
+         if (src->target == PIPE_TEXTURE_3D)
+            blitter_set_texcoords_3d(ctx, src, subsrc, srcz,
+                                     srcx, srcy, srcx+width, srcy+height);
+         else
+            blitter_set_texcoords_cube(ctx, src, subsrc,
+                                       srcx, srcy, srcx+width, srcy+height);
+
+         /* Draw. */
+         blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height, 0);
+         blitter_draw_quad(ctx);
          break;
+
       default:
          assert(0);
+         return;
    }
 
-   blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height, dst->width, dst->height, 0);
-   blitter_draw_quad(ctx);
+   blitter_restore_CSOs(ctx);
 
+   pipe_surface_reference(&dstsurf, NULL);
+   pipe_sampler_view_reference(&view, NULL);
 }
 
-static void util_blitter_overlap_copy(struct blitter_context *blitter,
-				      struct pipe_surface *dst,
-				      unsigned dstx, unsigned dsty,
-				      struct pipe_surface *src,
-				      unsigned srcx, unsigned srcy,
-				      unsigned width, unsigned height)
+/* Clear a region of a color surface to a constant value. */
+void util_blitter_clear_render_target(struct blitter_context *blitter,
+                                      struct pipe_surface *dstsurf,
+                                      const float *rgba,
+                                      unsigned dstx, unsigned dsty,
+                                      unsigned width, unsigned height)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
-   struct pipe_screen *screen = pipe->screen;
+   struct pipe_context *pipe = ctx->base.pipe;
+   struct pipe_framebuffer_state fb_state;
 
-   struct pipe_resource texTemp;
-   struct pipe_resource *texture;
-   struct pipe_surface *tex_surf;
+   assert(dstsurf->texture);
+   if (!dstsurf->texture)
+      return;
 
-   /* check whether the states are properly saved */
+   /* check the saved state */
    blitter_check_saved_CSOs(ctx);
+   assert(blitter->saved_fb_state.nr_cbufs != ~0);
 
-   memset(&texTemp, 0, sizeof(texTemp));
-   texTemp.target = PIPE_TEXTURE_2D;
-   texTemp.format = dst->texture->format; /* XXX verify supported by driver! */
-   texTemp.last_level = 0;
-   texTemp.width0 = width;
-   texTemp.height0 = height;
-   texTemp.depth0 = 1;
+   /* bind CSOs */
+   pipe->bind_blend_state(pipe, ctx->blend_write_color);
+   pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
+   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
+   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 1));
+   pipe->bind_vs_state(pipe, ctx->vs_col);
+   pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
 
-   texture = screen->resource_create(screen, &texTemp);
-   if (!texture)
-      return;
+   /* set a framebuffer state */
+   fb_state.width = dstsurf->width;
+   fb_state.height = dstsurf->height;
+   fb_state.nr_cbufs = 1;
+   fb_state.cbufs[0] = dstsurf;
+   fb_state.zsbuf = 0;
+   pipe->set_framebuffer_state(pipe, &fb_state);
 
-   tex_surf = screen->get_tex_surface(screen, texture, 0, 0, 0,
-				      PIPE_BIND_BLIT_SOURCE | 
-				      PIPE_BIND_BLIT_DESTINATION);
-
-   /* blit from the src to the temp */
-   util_blitter_do_copy(blitter, tex_surf, 0, 0,
-			src, srcx, srcy,
-			width, height,
-			FALSE);
-   util_blitter_do_copy(blitter, dst, dstx, dsty,
-			tex_surf, 0, 0,
-			width, height,
-			FALSE);
-   pipe_surface_reference(&tex_surf, NULL);
-   pipe_resource_reference(&texture, NULL);
+   blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height);
+   blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0,
+                           UTIL_BLITTER_ATTRIB_COLOR, rgba);
    blitter_restore_CSOs(ctx);
 }
 
-void util_blitter_copy(struct blitter_context *blitter,
-                       struct pipe_surface *dst,
-                       unsigned dstx, unsigned dsty,
-                       struct pipe_surface *src,
-                       unsigned srcx, unsigned srcy,
-                       unsigned width, unsigned height,
-                       boolean ignore_stencil)
+/* Clear a region of a depth stencil surface. */
+void util_blitter_clear_depth_stencil(struct blitter_context *blitter,
+                                      struct pipe_surface *dstsurf,
+                                      unsigned clear_flags,
+                                      double depth,
+                                      unsigned stencil,
+                                      unsigned dstx, unsigned dsty,
+                                      unsigned width, unsigned height)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
-   struct pipe_screen *screen = pipe->screen;
-   boolean is_stencil, is_depth;
-   unsigned dst_tex_usage;
+   struct pipe_context *pipe = ctx->base.pipe;
+   struct pipe_framebuffer_state fb_state;
+   struct pipe_stencil_ref sr = { { 0 } };
 
-   /* give up if textures are not set */
-   assert(dst->texture && src->texture);
-   if (!dst->texture || !src->texture)
+   assert(dstsurf->texture);
+   if (!dstsurf->texture)
       return;
 
-   if (dst->texture == src->texture) {
-      if (is_overlap(srcx, srcx + width, srcy, srcy + height,
-		             dstx, dstx + width, dsty, dsty + height)) {
-         util_blitter_overlap_copy(blitter, dst, dstx, dsty, src, srcx, srcy,
-                                   width, height);
-         return;
-      }
-   }
-		   
-   is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
-   is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0;
-   dst_tex_usage = is_depth || is_stencil ? PIPE_BIND_DEPTH_STENCIL :
-                                            PIPE_BIND_RENDER_TARGET;
+   /* check the saved state */
+   blitter_check_saved_CSOs(ctx);
+   assert(blitter->saved_fb_state.nr_cbufs != ~0);
 
-   /* check if we can sample from and render to the surfaces */
-   /* (assuming copying a stencil buffer is not possible) */
-   if ((!ignore_stencil && is_stencil) ||
-       !screen->is_format_supported(screen, dst->format, dst->texture->target,
-                                    dst_tex_usage, 0) ||
-       !screen->is_format_supported(screen, src->format, src->texture->target,
-                                    PIPE_BIND_SAMPLER_VIEW, 0)) {
-      util_surface_copy(pipe, FALSE, dst, dstx, dsty, src, srcx, srcy,
-                        width, height);
-      return;
+   /* bind CSOs */
+   pipe->bind_blend_state(pipe, ctx->blend_keep_color);
+   if ((clear_flags & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL) {
+      sr.ref_value[0] = stencil & 0xff;
+      pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil);
+      pipe->set_stencil_ref(pipe, &sr);
+   }
+   else if (clear_flags & PIPE_CLEAR_DEPTH) {
+      pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_keep_stencil);
+   }
+   else if (clear_flags & PIPE_CLEAR_STENCIL) {
+      sr.ref_value[0] = stencil & 0xff;
+      pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_write_stencil);
+      pipe->set_stencil_ref(pipe, &sr);
    }
+   else
+      /* hmm that should be illegal probably, or make it a no-op somewhere */
+      pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
 
-   /* check whether the states are properly saved */
-   blitter_check_saved_CSOs(ctx);
-   util_blitter_do_copy(blitter,
-			dst, dstx, dsty,
-			src, srcx, srcy,
-			width, height, is_depth);
+   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
+   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 0));
+   pipe->bind_vs_state(pipe, ctx->vs_col);
+   pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
+
+   /* set a framebuffer state */
+   fb_state.width = dstsurf->width;
+   fb_state.height = dstsurf->height;
+   fb_state.nr_cbufs = 0;
+   fb_state.cbufs[0] = 0;
+   fb_state.zsbuf = dstsurf;
+   pipe->set_framebuffer_state(pipe, &fb_state);
+
+   blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height);
+   blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, depth,
+                           UTIL_BLITTER_ATTRIB_NONE, NULL);
    blitter_restore_CSOs(ctx);
 }
 
-void util_blitter_fill(struct blitter_context *blitter,
-                       struct pipe_surface *dst,
-                       unsigned dstx, unsigned dsty,
-                       unsigned width, unsigned height,
-                       unsigned value)
+/* draw a rectangle covering the whole zsurf using a caller-supplied dsa state - for r600g */
+void util_blitter_custom_depth_stencil(struct blitter_context *blitter,
+				       struct pipe_surface *zsurf,
+				       struct pipe_surface *cbsurf,
+				       void *dsa_stage, float depth)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
-   struct pipe_context *pipe = ctx->pipe;
-   struct pipe_screen *screen = pipe->screen;
+   struct pipe_context *pipe = ctx->base.pipe;
    struct pipe_framebuffer_state fb_state;
-   float rgba[4];
-   ubyte ub_rgba[4] = {0};
-   union util_color color;
-   int i;
-
-   assert(dst->texture);
-   if (!dst->texture)
-      return;
 
-   /* check if we can render to the surface */
-   if (util_format_is_depth_or_stencil(dst->format) || /* unlikely, but you never know */
-       !screen->is_format_supported(screen, dst->format, dst->texture->target,
-                                    PIPE_BIND_RENDER_TARGET, 0)) {
-      util_surface_fill(pipe, dst, dstx, dsty, width, height, value);
+   assert(zsurf->texture);
+   if (!zsurf->texture)
       return;
-   }
-
-   /* unpack the color */
-   color.ui = value;
-   util_unpack_color_ub(dst->format, &color,
-                        ub_rgba, ub_rgba+1, ub_rgba+2, ub_rgba+3);
-   for (i = 0; i < 4; i++)
-      rgba[i] = ubyte_to_float(ub_rgba[i]);
 
    /* check the saved state */
    blitter_check_saved_CSOs(ctx);
@@ -853,22 +984,38 @@ void util_blitter_fill(struct blitter_context *blitter,
 
    /* bind CSOs */
    pipe->bind_blend_state(pipe, ctx->blend_write_color);
-   pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
+   pipe->bind_depth_stencil_alpha_state(pipe, dsa_stage);
+
    pipe->bind_rasterizer_state(pipe, ctx->rs_state);
-   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 1));
+   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 0));
    pipe->bind_vs_state(pipe, ctx->vs_col);
    pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
 
    /* set a framebuffer state */
-   fb_state.width = dst->width;
-   fb_state.height = dst->height;
+   fb_state.width = zsurf->width;
+   fb_state.height = zsurf->height;
    fb_state.nr_cbufs = 1;
-   fb_state.cbufs[0] = dst;
-   fb_state.zsbuf = 0;
+   if (cbsurf) {
+	   fb_state.cbufs[0] = cbsurf;
+	   fb_state.nr_cbufs = 1;
+   } else {
+	   fb_state.cbufs[0] = NULL;
+	   fb_state.nr_cbufs = 0;
+   }
+   fb_state.zsbuf = zsurf;
    pipe->set_framebuffer_state(pipe, &fb_state);
 
-   blitter_set_clear_color(ctx, rgba);
-   blitter_set_rectangle(ctx, 0, 0, width, height, dst->width, dst->height, 0);
-   blitter_draw_quad(ctx);
+   blitter_set_dst_dimensions(ctx, zsurf->width, zsurf->height);
+   blitter->draw_rectangle(blitter, 0, 0, zsurf->width, zsurf->height, depth,
+                           UTIL_BLITTER_ATTRIB_NONE, NULL);
    blitter_restore_CSOs(ctx);
 }
+
+/* flush a region of a depth stencil surface for r300g */
+void util_blitter_flush_depth_stencil(struct blitter_context *blitter,
+                                      struct pipe_surface *dstsurf)
+{
+	struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+	util_blitter_custom_depth_stencil(blitter, dstsurf, NULL,
+					  ctx->dsa_flush_depth_stencil, 0.0f);
+}
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index f6e3ce4874e..f9f96f25c77 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -27,6 +27,8 @@
 #ifndef U_BLITTER_H
 #define U_BLITTER_H
 
+#include "util/u_framebuffer.h"
+#include "util/u_inlines.h"
 #include "util/u_memory.h"
 
 #include "pipe/p_state.h"
@@ -38,9 +40,48 @@ extern "C" {
 
 struct pipe_context;
 
+enum blitter_attrib_type {
+   UTIL_BLITTER_ATTRIB_NONE,
+   UTIL_BLITTER_ATTRIB_COLOR,
+   UTIL_BLITTER_ATTRIB_TEXCOORD
+};
+
 struct blitter_context
 {
+   /**
+    * Draw a rectangle.
+    *
+    * \param x1      An X coordinate of the top-left corner.
+    * \param y1      A Y coordinate of the top-left corner.
+    * \param x2      An X coordinate of the bottom-right corner.
+    * \param y2      A Y coordinate of the bottom-right corner.
+    * \param depth  A depth which the rectangle is rendered at.
+    *
+    * \param type   Semantics of the attributes "attrib".
+    *               If type is UTIL_BLITTER_ATTRIB_NONE, ignore them.
+    *               If type is UTIL_BLITTER_ATTRIB_COLOR, the attributes
+    *               make up a constant RGBA color, and should go to the COLOR0
+    *               varying slot of a fragment shader.
+    *               If type is UTIL_BLITTER_ATTRIB_TEXCOORD, {a1, a2} and
+    *               {a3, a4} specify top-left and bottom-right texture
+    *               coordinates of the rectangle, respectively, and should go
+    *               to the GENERIC0 varying slot of a fragment shader.
+    *
+    * \param attrib See type.
+    *
+    * \note A driver may optionally override this callback to implement
+    *       a specialized hardware path for drawing a rectangle, e.g. using
+    *       a rectangular point sprite.
+    */
+   void (*draw_rectangle)(struct blitter_context *blitter,
+                          unsigned x1, unsigned y1, unsigned x2, unsigned y2,
+                          float depth,
+                          enum blitter_attrib_type type,
+                          const float attrib[4]);
+
    /* Private members, really. */
+   struct pipe_context *pipe; /**< pipe context */
+
    void *saved_blend_state;   /**< blend state */
    void *saved_dsa_state;     /**< depth stencil alpha state */
    void *saved_velem_state;   /**< vertex elements state */
@@ -72,6 +113,15 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe);
  */
 void util_blitter_destroy(struct blitter_context *blitter);
 
+/**
+ * Return the pipe context associated with a blitter context.
+ */
+static INLINE
+struct pipe_context *util_blitter_get_pipe(struct blitter_context *blitter)
+{
+   return blitter->pipe;
+}
+
 /*
  * These CSOs must be saved before any of the following functions is called:
  * - blend state
@@ -112,52 +162,56 @@ void util_blitter_clear(struct blitter_context *blitter,
  * - fragment sampler states
  * - fragment sampler textures
  */
-void util_blitter_copy(struct blitter_context *blitter,
-                       struct pipe_surface *dst,
-                       unsigned dstx, unsigned dsty,
-                       struct pipe_surface *src,
-                       unsigned srcx, unsigned srcy,
-                       unsigned width, unsigned height,
-                       boolean ignore_stencil);
+void util_blitter_copy_region(struct blitter_context *blitter,
+                              struct pipe_resource *dst,
+                              struct pipe_subresource subdst,
+                              unsigned dstx, unsigned dsty, unsigned dstz,
+                              struct pipe_resource *src,
+                              struct pipe_subresource subsrc,
+                              unsigned srcx, unsigned srcy, unsigned srcz,
+                              unsigned width, unsigned height,
+                              boolean ignore_stencil);
 
 /**
- * Fill a region of a surface with a constant value.
- *
- * If the surface cannot be rendered to or it's a depth-stencil format,
- * a software fallback path is taken.
+ * Clear a region of a (color) surface to a constant value.
  *
  * These states must be saved in the blitter in addition to the state objects
  * already required to be saved:
  * - framebuffer state
  */
-void util_blitter_fill(struct blitter_context *blitter,
-                       struct pipe_surface *dst,
-                       unsigned dstx, unsigned dsty,
-                       unsigned width, unsigned height,
-                       unsigned value);
+void util_blitter_clear_render_target(struct blitter_context *blitter,
+                                      struct pipe_surface *dst,
+                                      const float *rgba,
+                                      unsigned dstx, unsigned dsty,
+                                      unsigned width, unsigned height);
 
 /**
- * Copy all pixels from one surface to another.
+ * Clear a region of a depth-stencil surface, both stencil and depth
+ * or only one of them if this is a combined depth-stencil surface.
  *
- * The rules are the same as in util_blitter_copy with the addition that
- * surfaces must have the same size.
+ * These states must be saved in the blitter in addition to the state objects
+ * already required to be saved:
+ * - framebuffer state
  */
-static INLINE
-void util_blitter_copy_surface(struct blitter_context *blitter,
-                               struct pipe_surface *dst,
-                               struct pipe_surface *src,
-                               boolean ignore_stencil)
-{
-   assert(dst->width == src->width && dst->height == src->height);
-
-   util_blitter_copy(blitter, dst, 0, 0, src, 0, 0, src->width, src->height,
-                     ignore_stencil);
-}
-
+void util_blitter_clear_depth_stencil(struct blitter_context *blitter,
+                                      struct pipe_surface *dst,
+                                      unsigned clear_flags,
+                                      double depth,
+                                      unsigned stencil,
+                                      unsigned dstx, unsigned dsty,
+                                      unsigned width, unsigned height);
+
+void util_blitter_flush_depth_stencil(struct blitter_context *blitter,
+                                      struct pipe_surface *dstsurf);
+
+void util_blitter_custom_depth_stencil(struct blitter_context *blitter,
+				       struct pipe_surface *zsurf,
+				       struct pipe_surface *cbsurf,
+				       void *dsa_stage, float depth);
 
 /* The functions below should be used to save currently bound constant state
  * objects inside a driver. The objects are automatically restored at the end
- * of the util_blitter_{clear, fill, copy, copy_surface} functions and then
+ * of the util_blitter_{clear, clear_*, copy_region} functions and then
  * forgotten.
  *
  * CSOs not listed here are not affected by util_blitter. */
@@ -213,9 +267,10 @@ void util_blitter_save_vertex_shader(struct blitter_context *blitter,
 
 static INLINE
 void util_blitter_save_framebuffer(struct blitter_context *blitter,
-                                   struct pipe_framebuffer_state *state)
+                                   const struct pipe_framebuffer_state *state)
 {
-   blitter->saved_fb_state = *state;
+   blitter->saved_fb_state.nr_cbufs = 0; /* It's ~0 now, meaning it's unsaved. */
+   util_copy_framebuffer_state(&blitter->saved_fb_state, state);
 }
 
 static INLINE
@@ -250,12 +305,13 @@ util_blitter_save_fragment_sampler_views(struct blitter_context *blitter,
                                          int num_views,
                                          struct pipe_sampler_view **views)
 {
+   unsigned i;
    assert(num_views <= Elements(blitter->saved_sampler_views));
 
    blitter->saved_num_sampler_views = num_views;
-   memcpy(blitter->saved_sampler_views,
-          views,
-          num_views * sizeof(struct pipe_sampler_view *));
+   for (i = 0; i < num_views; i++)
+      pipe_sampler_view_reference(&blitter->saved_sampler_views[i],
+                                  views[i]);
 }
 
 static INLINE void
@@ -263,9 +319,18 @@ util_blitter_save_vertex_buffers(struct blitter_context *blitter,
                                          int num_vertex_buffers,
                                          struct pipe_vertex_buffer *vertex_buffers)
 {
+   unsigned i;
    assert(num_vertex_buffers <= Elements(blitter->saved_vertex_buffers));
 
    blitter->saved_num_vertex_buffers = num_vertex_buffers;
+
+   for (i = 0; i < num_vertex_buffers; i++) {
+      if (vertex_buffers[i].buffer) {
+         pipe_resource_reference(&blitter->saved_vertex_buffers[i].buffer,
+                                 vertex_buffers[i].buffer);
+      }
+   }
+
    memcpy(blitter->saved_vertex_buffers,
           vertex_buffers,
           num_vertex_buffers * sizeof(struct pipe_vertex_buffer));
diff --git a/src/gallium/auxiliary/util/u_box.h b/src/gallium/auxiliary/util/u_box.h
index 919967b55a7..e9c71743fc8 100644
--- a/src/gallium/auxiliary/util/u_box.h
+++ b/src/gallium/auxiliary/util/u_box.h
@@ -60,6 +60,25 @@ void u_box_2d_zslice( unsigned x,
    box->depth = 1;
 }
 
+
+static INLINE
+void u_box_3d( unsigned x,
+	       unsigned y,
+	       unsigned z,
+	       unsigned w,
+	       unsigned h,
+	       unsigned d,
+	       struct pipe_box *box )
+{
+   box->x = x;
+   box->y = y;
+   box->z = z;
+   box->width = w;
+   box->height = h;
+   box->depth = d;
+}
+
+
 static INLINE
 struct pipe_subresource u_subresource( unsigned face,
 				       unsigned level )
diff --git a/src/gallium/auxiliary/util/u_caps.c b/src/gallium/auxiliary/util/u_caps.c
new file mode 100644
index 00000000000..e209a98b706
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_caps.c
@@ -0,0 +1,271 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Vmware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_screen.h"
+#include "util/u_format.h"
+#include "util/u_debug.h"
+#include "u_caps.h"
+
+/**
+ * Iterates over a list of caps checks as defined in u_caps.h. If all
+ * checks pass, returns TRUE and sets *out to the index of the list
+ * terminator (TERMINATE). If a check fails or is of an unknown type,
+ * returns FALSE and sets *out to the index where that check starts.
+ */
+boolean
+util_check_caps_out(struct pipe_screen *screen, const unsigned *list, int *out)
+{
+   int i, tmpi;
+   float tmpf;
+
+   for (i = 0; list[i];) {
+      switch(list[i++]) {
+      case UTIL_CAPS_CHECK_CAP:
+         if (!screen->get_param(screen, list[i++])) {
+            *out = i - 2; /* step back over the check type and the cap */
+            return FALSE;
+         }
+         break;
+      case UTIL_CAPS_CHECK_INT:
+         tmpi = screen->get_param(screen, list[i++]);
+         if (tmpi < (int)list[i++]) {
+            *out = i - 3; /* step back over type, cap and required value */
+            return FALSE;
+         }
+         break;
+      case UTIL_CAPS_CHECK_FLOAT:
+         tmpf = screen->get_paramf(screen, list[i++]);
+         if (tmpf < (float)list[i++]) {
+            *out = i - 3;
+            return FALSE;
+         }
+         break;
+      case UTIL_CAPS_CHECK_FORMAT:
+         if (!screen->is_format_supported(screen,
+                                          list[i++],
+                                          PIPE_TEXTURE_2D,
+                                          0,
+                                          PIPE_BIND_SAMPLER_VIEW, 0)) {
+            *out = i - 2;
+            return FALSE;
+         }
+         break;
+      case UTIL_CAPS_CHECK_SHADER:
+         tmpi = screen->get_shader_param(screen, list[i] >> 24, list[i] & ((1 << 24) - 1));
+         ++i;
+         if (tmpi < (int)list[i++]) {
+            *out = i - 3;
+            return FALSE;
+         }
+         break;
+      case UTIL_CAPS_CHECK_UNIMPLEMENTED:
+         *out = i - 1;
+         return FALSE;
+      default:
+         assert(!"Unsupported check");
+         *out = i - 1; /* don't leave *out stale when asserts are compiled out */
+         return FALSE;
+      }
+   }
+
+   *out = i;
+   return TRUE;
+}
+
+/**
+ * Iterates over a list of caps checks as defined in u_caps.h.
+ * Returns TRUE if all caps checks pass, FALSE otherwise.
+ */
+boolean
+util_check_caps(struct pipe_screen *screen, const unsigned *list)
+{
+   int out;
+   return util_check_caps_out(screen, list, &out);
+}
+
+
+/*
+ * Below follow some demo lists.
+ *
+ * None of these lists is an exhaustive list of what is actually
+ * needed to support the given API; they are here mainly as examples
+ * of how to use the above functions. This is especially true for
+ * DX10 and DX11, where Gallium is missing features.
+ */
+
+/* DX 9_1 */
+static unsigned caps_dx_9_1[] = {
+   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 1),
+   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 12),    /* 2048 */
+   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 9),     /* 256 */
+   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 10),  /* 512 */
+   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 2),
+   UTIL_CHECK_TERMINATE
+};
+
+/* DX 9_2 */
+static unsigned caps_dx_9_2[] = {
+   UTIL_CHECK_CAP(OCCLUSION_QUERY),
+   UTIL_CHECK_CAP(BLEND_EQUATION_SEPARATE),
+   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 1),
+   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 12),    /* 2048 */
+   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 9),     /* 256 */
+   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 10),  /* 512 */
+   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16),
+   UTIL_CHECK_TERMINATE
+};
+
+/* DX 9_3 */
+static unsigned caps_dx_9_3[] = {
+   UTIL_CHECK_CAP(SM3),
+ //UTIL_CHECK_CAP(INSTANCING),
+   UTIL_CHECK_CAP(OCCLUSION_QUERY),
+   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 4),
+   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 13),    /* 4096 */
+   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 9),     /* 256 */
+   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 10),  /* 512 */
+   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16),
+   UTIL_CHECK_TERMINATE
+};
+
+/* DX 10 */
+static unsigned caps_dx_10[] = {
+   UTIL_CHECK_CAP(SM3),
+ //UTIL_CHECK_CAP(INSTANCING),
+   UTIL_CHECK_CAP(OCCLUSION_QUERY),
+   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8),
+   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 14),    /* 8192 */
+   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 12),    /* 2048 */
+   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 14),  /* 8192 */
+   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16),
+   UTIL_CHECK_UNIMPLEMENTED, /* XXX Unimplemented features in Gallium */
+   UTIL_CHECK_TERMINATE
+};
+
+/* DX11 (level counts are log2(max size) + 1, so 16384 needs 15 levels) */
+static unsigned caps_dx_11[] = {
+   UTIL_CHECK_CAP(SM3),
+ //UTIL_CHECK_CAP(INSTANCING),
+   UTIL_CHECK_CAP(OCCLUSION_QUERY),
+   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8),
+   UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 15),    /* 16384 */
+   UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 12),    /* 2048 */
+   UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 15),  /* 16384 */
+   UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16),
+   UTIL_CHECK_FORMAT(B8G8R8A8_UNORM),
+   UTIL_CHECK_UNIMPLEMENTED, /* XXX Unimplemented features in Gallium */
+   UTIL_CHECK_TERMINATE
+};
+
+/* OpenGL 2.1 */
+static unsigned caps_opengl_2_1[] = {
+   UTIL_CHECK_CAP(GLSL),
+   UTIL_CHECK_CAP(OCCLUSION_QUERY),
+   UTIL_CHECK_CAP(TWO_SIDED_STENCIL),
+   UTIL_CHECK_CAP(BLEND_EQUATION_SEPARATE),
+   UTIL_CHECK_INT(MAX_RENDER_TARGETS, 2),
+   UTIL_CHECK_TERMINATE
+};
+
+/* OpenGL 3.0 */
+/* UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8), */
+
+/* Shader Model 3 */
+static unsigned caps_sm3[] = {
+    UTIL_CHECK_SHADER(FRAGMENT, MAX_INSTRUCTIONS, 512),
+    UTIL_CHECK_SHADER(FRAGMENT, MAX_INPUTS, 10),
+    UTIL_CHECK_SHADER(FRAGMENT, MAX_TEMPS, 32),
+    UTIL_CHECK_SHADER(FRAGMENT, MAX_ADDRS, 1),
+    UTIL_CHECK_SHADER(FRAGMENT, MAX_CONSTS, 224),
+
+    UTIL_CHECK_SHADER(VERTEX, MAX_INSTRUCTIONS, 512),
+    UTIL_CHECK_SHADER(VERTEX, MAX_INPUTS, 16),
+    UTIL_CHECK_SHADER(VERTEX, MAX_TEMPS, 32),
+    UTIL_CHECK_SHADER(VERTEX, MAX_ADDRS, 2),
+    UTIL_CHECK_SHADER(VERTEX, MAX_CONSTS, 256),
+
+    UTIL_CHECK_TERMINATE
+};
+
+/**
+ * Demo: prints whether each sample caps list passes, or its first failing check.
+ */
+void util_caps_demo_print(struct pipe_screen *screen)
+{
+   struct {
+      const char *name;
+      unsigned *list;
+   } list[] = {
+      {"DX 9.1", caps_dx_9_1},
+      {"DX 9.2", caps_dx_9_2},
+      {"DX 9.3", caps_dx_9_3},
+      {"DX 10", caps_dx_10},
+      {"DX 11", caps_dx_11},
+      {"OpenGL 2.1", caps_opengl_2_1},
+/*    {"OpenGL 3.0", caps_opengl_3_0},*/
+      {"SM3", caps_sm3},
+      {NULL, NULL}
+   };
+   int i, out = 0;
+
+   for (i = 0; list[i].name; i++) {
+      if (util_check_caps_out(screen, list[i].list, &out)) {
+         debug_printf("%s: %s yes\n", __FUNCTION__, list[i].name);
+         continue;
+      }
+      switch (list[i].list[out]) {
+      case UTIL_CAPS_CHECK_CAP:
+         debug_printf("%s: %s no (cap %u not supported)\n", __FUNCTION__,
+                      list[i].name, list[i].list[out + 1]);
+         break;
+      case UTIL_CAPS_CHECK_INT:
+         debug_printf("%s: %s no (cap %u less then %u)\n", __FUNCTION__,
+                      list[i].name, list[i].list[out + 1], list[i].list[out + 2]);
+         break;
+      case UTIL_CAPS_CHECK_FLOAT:
+         debug_printf("%s: %s no (cap %u less then %f)\n", __FUNCTION__,
+                      list[i].name, list[i].list[out + 1],
+                      (double)(int)list[i].list[out + 2]);
+         break;
+      case UTIL_CAPS_CHECK_FORMAT:
+         debug_printf("%s: %s no (format %s not supported)\n", __FUNCTION__,
+                      list[i].name, util_format_name(list[i].list[out + 1]) + 12);
+         break;
+      case UTIL_CAPS_CHECK_SHADER: /* shader type is packed above bit 24 */
+         debug_printf("%s: %s no (shader %u cap %u less than %u)\n",
+                      __FUNCTION__, list[i].name, list[i].list[out + 1] >> 24,
+                      list[i].list[out + 1] & 0xffffff, list[i].list[out + 2]);
+         break;
+      case UTIL_CAPS_CHECK_UNIMPLEMENTED:
+         debug_printf("%s: %s no (not implemented in gallium or state tracker)\n",
+                      __FUNCTION__, list[i].name);
+         break;
+      default:
+            assert(!"Unsupported check");
+      }
+   }
+}
diff --git a/src/gallium/auxiliary/util/u_caps.h b/src/gallium/auxiliary/util/u_caps.h
new file mode 100644
index 00000000000..7bd23800414
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_caps.h
@@ -0,0 +1,71 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Vmware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_CAPS_H
+#define U_CAPS_H
+
+#include "pipe/p_compiler.h"
+
+struct pipe_screen;
+
+enum u_caps_check_enum {
+   UTIL_CAPS_CHECK_TERMINATE = 0,
+   UTIL_CAPS_CHECK_CAP,
+   UTIL_CAPS_CHECK_INT,
+   UTIL_CAPS_CHECK_FLOAT,
+   UTIL_CAPS_CHECK_FORMAT,
+   UTIL_CAPS_CHECK_SHADER,
+   UTIL_CAPS_CHECK_UNIMPLEMENTED,
+};
+
+#define UTIL_CHECK_CAP(cap) \
+   UTIL_CAPS_CHECK_CAP, PIPE_CAP_##cap
+
+#define UTIL_CHECK_INT(cap, higher) \
+   UTIL_CAPS_CHECK_INT, PIPE_CAP_##cap, (unsigned)(higher)
+
+/* Floats currently lose precision */
+#define UTIL_CHECK_FLOAT(cap, higher) \
+   UTIL_CAPS_CHECK_FLOAT, PIPE_CAP_##cap, (unsigned)(int)(higher)
+
+#define UTIL_CHECK_FORMAT(format) \
+   UTIL_CAPS_CHECK_FORMAT, PIPE_FORMAT_##format
+
+#define UTIL_CHECK_SHADER(shader, cap, higher) \
+   UTIL_CAPS_CHECK_SHADER, (PIPE_SHADER_##shader << 24) | PIPE_SHADER_CAP_##cap, (unsigned)(higher)
+
+#define UTIL_CHECK_UNIMPLEMENTED \
+   UTIL_CAPS_CHECK_UNIMPLEMENTED
+
+#define UTIL_CHECK_TERMINATE \
+   UTIL_CAPS_CHECK_TERMINATE
+
+boolean util_check_caps(struct pipe_screen *screen, const unsigned *list);
+boolean util_check_caps_out(struct pipe_screen *screen, const unsigned *list, int *out);
+void util_caps_demo_print(struct pipe_screen *screen);
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_clear.h b/src/gallium/auxiliary/util/u_clear.h
index 2c32db61756..ad69df3f898 100644
--- a/src/gallium/auxiliary/util/u_clear.h
+++ b/src/gallium/auxiliary/util/u_clear.h
@@ -31,8 +31,6 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
-#include "util/u_pack_color.h"
-#include "util/u_rect.h"
 
 
 /**
@@ -45,26 +43,17 @@ util_clear(struct pipe_context *pipe,
            const float *rgba, double depth, unsigned stencil)
 {
    if (buffers & PIPE_CLEAR_COLOR) {
-      struct pipe_surface *ps = framebuffer->cbufs[0];
-      union util_color uc;
-
-      util_pack_color(rgba, ps->format, &uc);
-      if (pipe->surface_fill) {
-         pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, uc.ui);
-      } else {
-         util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height, uc.ui);
+      unsigned i;
+      for (i = 0; i < framebuffer->nr_cbufs; i++) {
+         struct pipe_surface *ps = framebuffer->cbufs[i];
+         pipe->clear_render_target(pipe, ps, rgba, 0, 0, ps->width, ps->height);
       }
    }
 
    if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
       struct pipe_surface *ps = framebuffer->zsbuf;
-
-      if (pipe->surface_fill) {
-         pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height,
-                            util_pack_z_stencil(ps->format, depth, stencil));
-      } else {
-         util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height,
-                           util_pack_z_stencil(ps->format, depth, stencil));
-      }
+      pipe->clear_depth_stencil(pipe, ps, buffers & PIPE_CLEAR_DEPTHSTENCIL,
+                                depth, stencil,
+                                0, 0, ps->width, ps->height);
    }
 }
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index a08241971ca..32519b148b6 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -38,7 +38,7 @@
 #include "u_cpu_detect.h"
 
 #if defined(PIPE_ARCH_PPC)
-#if defined(PIPE_OS_DARWIN)
+#if defined(PIPE_OS_APPLE)
 #include <sys/sysctl.h>
 #else
 #include <signal.h>
@@ -73,66 +73,19 @@
 #endif
 
 
-struct util_cpu_caps util_cpu_caps;
-
-static int has_cpuid(void);
-
-#if defined(PIPE_ARCH_X86)
-
-/* The sigill handlers */
-#if defined(PIPE_OS_LINUX) /*&& defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)*/
-static void
-sigill_handler_sse(int signal, struct sigcontext sc)
-{
-   /* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1"
-    * instructions are 3 bytes long.  We must increment the instruction
-    * pointer manually to avoid repeated execution of the offending
-    * instruction.
-    *
-    * If the SIGILL is caused by a divide-by-zero when unmasked
-    * exceptions aren't supported, the SIMD FPU status and control
-    * word will be restored at the end of the test, so we don't need
-    * to worry about doing it here.  Besides, we may not be able to...
-    */
-   sc.eip += 3;
-
-   util_cpu_caps.has_sse=0;
-}
+#ifdef DEBUG
+DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE)
+#endif
 
-static void
-sigfpe_handler_sse(int signal, struct sigcontext sc)
-{
-   if (sc.fpstate->magic != 0xffff) {
-      /* Our signal context has the extended FPU state, so reset the
-       * divide-by-zero exception mask and clear the divide-by-zero
-       * exception bit.
-       */
-      sc.fpstate->mxcsr |= 0x00000200;
-      sc.fpstate->mxcsr &= 0xfffffffb;
-   } else {
-      /* If we ever get here, we're completely hosed.
-      */
-   }
-}
-#endif /* PIPE_OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */
 
-#if defined(PIPE_OS_WINDOWS)
-static LONG CALLBACK
-win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
-{
-   if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){
-      ep->ContextRecord->Eip +=3;
-      util_cpu_caps.has_sse=0;
-      return EXCEPTION_CONTINUE_EXECUTION;
-   }
-   return EXCEPTION_CONTINUE_SEARCH;
-}
-#endif /* PIPE_OS_WINDOWS */
+struct util_cpu_caps util_cpu_caps;
 
-#endif /* PIPE_ARCH_X86 */
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+static int has_cpuid(void);
+#endif
 
 
-#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_DARWIN)
+#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE)
 static jmp_buf  __lv_powerpc_jmpbuf;
 static volatile sig_atomic_t __lv_powerpc_canjump = 0;
 
@@ -153,7 +106,7 @@ sigill_handler(int sig)
 static void
 check_os_altivec_support(void)
 {
-#if defined(PIPE_OS_DARWIN)
+#if defined(PIPE_OS_APPLE)
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    int len = sizeof (has_vu);
@@ -166,8 +119,8 @@ check_os_altivec_support(void)
          util_cpu_caps.has_altivec = 1;
       }
    }
-#else /* !PIPE_OS_DARWIN */
-   /* no Darwin, do it the brute-force way */
+#else /* !PIPE_OS_APPLE */
+   /* not on Apple/Darwin, do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    signal(SIGILL, sigill_handler);
    if (setjmp(__lv_powerpc_jmpbuf)) {
@@ -184,127 +137,12 @@ check_os_altivec_support(void)
       signal(SIGILL, SIG_DFL);
       util_cpu_caps.has_altivec = 1;
    }
-#endif /* PIPE_OS_DARWIN */
+#endif /* !PIPE_OS_APPLE */
 }
 #endif /* PIPE_ARCH_PPC */
 
-/* If we're running on a processor that can do SSE, let's see if we
- * are allowed to or not.  This will catch 2.4.0 or later kernels that
- * haven't been configured for a Pentium III but are running on one,
- * and RedHat patched 2.2 kernels that have broken exception handling
- * support for user space apps that do SSE.
- */
-#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
-static void
-check_os_katmai_support(void)
-{
-#if defined(PIPE_ARCH_X86)
-#if defined(PIPE_OS_FREEBSD)
-   int has_sse=0, ret;
-   int len = sizeof (has_sse);
-
-   ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
-   if (ret || !has_sse)
-      util_cpu_caps.has_sse=0;
-
-#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
-   int has_sse, has_sse2, ret, mib[2];
-   int varlen;
-
-   mib[0] = CTL_MACHDEP;
-   mib[1] = CPU_SSE;
-   varlen = sizeof (has_sse);
-
-   ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
-   if (ret < 0 || !has_sse) {
-      util_cpu_caps.has_sse = 0;
-   } else {
-      util_cpu_caps.has_sse = 1;
-   }
-
-   mib[1] = CPU_SSE2;
-   varlen = sizeof (has_sse2);
-   ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
-   if (ret < 0 || !has_sse2) {
-      util_cpu_caps.has_sse2 = 0;
-   } else {
-      util_cpu_caps.has_sse2 = 1;
-   }
-   util_cpu_caps.has_sse = 0; /* FIXME ?!?!? */
-
-#elif defined(PIPE_OS_WINDOWS)
-   LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
-   if (util_cpu_caps.has_sse) {
-      exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
-#if defined(PIPE_CC_GCC)
-      __asm __volatile ("xorps %xmm0, %xmm0");
-#elif defined(PIPE_CC_MSVC)
-      __asm {
-          xorps xmm0, xmm0        /* executing SSE instruction */
-      }
-#else
-#error Unsupported compiler
-#endif
-      SetUnhandledExceptionFilter(exc_fil);
-   }
-#elif defined(PIPE_OS_LINUX)
-   struct sigaction saved_sigill;
-   struct sigaction saved_sigfpe;
-
-   /* Save the original signal handlers.
-   */
-   sigaction(SIGILL, NULL, &saved_sigill);
-   sigaction(SIGFPE, NULL, &saved_sigfpe);
-
-   signal(SIGILL, (void (*)(int))sigill_handler_sse);
-   signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
-
-   /* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
-    * supports the extended FPU save and restore required for SSE.  If
-    * we execute an SSE instruction on a PIII and get a SIGILL, the OS
-    * doesn't support Streaming SIMD Exceptions, even if the processor
-    * does.
-    */
-   if (util_cpu_caps.has_sse) {
-      __asm __volatile ("xorps %xmm1, %xmm0");
-   }
-
-   /* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
-    * it supports unmasked SIMD FPU exceptions.  If we unmask the
-    * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
-    * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
-    * as expected, we're okay but we need to clean up after it.
-    *
-    * Are we being too stringent in our requirement that the OS support
-    * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
-    * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
-    * doesn't even support them.  We at least know the user-space SSE
-    * support is good in kernels that do support unmasked exceptions,
-    * and therefore to be safe I'm going to leave this test in here.
-    */
-   if (util_cpu_caps.has_sse) {
-      /* test_os_katmai_exception_support(); */
-   }
-
-   /* Restore the original signal handlers.
-   */
-   sigaction(SIGILL, &saved_sigill, NULL);
-   sigaction(SIGFPE, &saved_sigfpe, NULL);
-
-#else
-   /* We can't use POSIX signal handling to test the availability of
-    * SSE, so we disable it by default.
-    */
-   util_cpu_caps.has_sse = 0;
-#endif /* __linux__ */
-#endif
-
-#if defined(PIPE_ARCH_X86_64)
-   util_cpu_caps.has_sse = 1;
-#endif
-}
-
 
+#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
 static int has_cpuid(void)
 {
 #if defined(PIPE_ARCH_X86)
@@ -385,23 +223,6 @@ util_cpu_detect(void)
 
    memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
 
-   /* Check for arch type */
-#if defined(PIPE_ARCH_MIPS)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_MIPS;
-#elif defined(PIPE_ARCH_ALPHA)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_ALPHA;
-#elif defined(PIPE_ARCH_SPARC)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_SPARC;
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_X86;
-   util_cpu_caps.little_endian = 1;
-#elif defined(PIPE_ARCH_PPC)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_POWERPC;
-   util_cpu_caps.little_endian = 0;
-#else
-   util_cpu_caps.arch = UTIL_CPU_ARCH_UNKNOWN;
-#endif
-
    /* Count the number of CPUs in system */
 #if defined(PIPE_OS_WINDOWS)
    {
@@ -480,9 +301,6 @@ util_cpu_detect(void)
          util_cpu_caps.cacheline = regs2[2] & 0xFF;
       }
 
-      if (util_cpu_caps.has_sse)
-         check_os_katmai_support();
-
       if (!util_cpu_caps.has_sse) {
          util_cpu_caps.has_sse2 = 0;
          util_cpu_caps.has_sse3 = 0;
@@ -497,23 +315,24 @@ util_cpu_detect(void)
 #endif /* PIPE_ARCH_PPC */
 
 #ifdef DEBUG
-   debug_printf("util_cpu_caps.arch = %i\n", util_cpu_caps.arch);
-   debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
-
-   debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
-   debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
-
-   debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
-   debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
-   debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
-   debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
-   debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
-   debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
-   debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
-   debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
-   debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
-   debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
-   debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
+   if (debug_get_option_dump_cpu()) {
+      debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
+
+      debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
+      debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline);
+
+      debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc);
+      debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx);
+      debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2);
+      debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse);
+      debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2);
+      debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3);
+      debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3);
+      debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1);
+      debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow);
+      debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext);
+      debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec);
+   }
 #endif
 
    util_cpu_detect_initialized = TRUE;
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 4b3dc39c342..f3bef0993c7 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -36,26 +36,15 @@
 #define _UTIL_CPU_DETECT_H
 
 #include "pipe/p_compiler.h"
-
-enum util_cpu_arch {
-   UTIL_CPU_ARCH_UNKNOWN = 0,
-   UTIL_CPU_ARCH_MIPS,
-   UTIL_CPU_ARCH_ALPHA,
-   UTIL_CPU_ARCH_SPARC,
-   UTIL_CPU_ARCH_X86,
-   UTIL_CPU_ARCH_POWERPC
-};
+#include "pipe/p_config.h"
 
 struct util_cpu_caps {
-   enum util_cpu_arch arch;
    unsigned nr_cpus;
 
    /* Feature flags */
    int x86_cpu_type;
    unsigned cacheline;
 
-   unsigned little_endian:1;
-
    unsigned has_tsc:1;
    unsigned has_mmx:1;
    unsigned has_mmx2:1;
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 0de38e791d6..504e6d2a18f 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -42,6 +42,7 @@
 #include "util/u_tile.h" 
 #include "util/u_prim.h" 
 
+#include <limits.h> /* CHAR_BIT */
 
 void _debug_vprintf(const char *format, va_list ap)
 {
@@ -87,7 +88,7 @@ debug_get_option_should_print(void)
     * but its cool since we set first to false
     */
    first = FALSE;
-   value = debug_get_bool_option("GALLIUM_PRINT_OPTIONS", TRUE);
+   value = debug_get_bool_option("GALLIUM_PRINT_OPTIONS", FALSE);
    /* XXX should we print this option? Currently it wont */
    return value;
 }
@@ -123,8 +124,12 @@ debug_get_bool_option(const char *name, boolean dfault)
       result = FALSE;
    else if(!util_strcmp(str, "f"))
       result = FALSE;
+   else if(!util_strcmp(str, "F"))
+      result = FALSE;
    else if(!util_strcmp(str, "false"))
       result = FALSE;
+   else if(!util_strcmp(str, "FALSE"))
+      result = FALSE;
    else
       result = TRUE;
 
@@ -177,16 +182,21 @@ debug_get_flags_option(const char *name,
 {
    unsigned long result;
    const char *str;
+   const struct debug_named_value *orig = flags;
+   int namealign = 0;
    
    str = os_get_option(name);
    if(!str)
       result = dfault;
    else if (!util_strcmp(str, "help")) {
       result = dfault;
-      while (flags->name) {
-         debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value);
-         flags++;
-      }
+      _debug_printf("%s: help for %s:\n", __FUNCTION__, name);
+      for (; flags->name; ++flags)
+         namealign = MAX2(namealign, strlen(flags->name));
+      for (flags = orig; flags->name; ++flags)
+         _debug_printf("| %*s [0x%0*lx]%s%s\n", namealign, flags->name,
+                      (int)sizeof(unsigned long)*CHAR_BIT/4, flags->value,
+                      flags->desc ? " " : "", flags->desc ? flags->desc : "");
    }
    else {
       result = 0;
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index e8ff2773e69..1c9624ea3ed 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -230,6 +230,7 @@ struct debug_named_value
 {
    const char *name;
    unsigned long value;
+   const char *desc;
 };
 
 
@@ -252,8 +253,9 @@ struct debug_named_value
  *    ...
  * @endcode
  */
-#define DEBUG_NAMED_VALUE(__symbol) {#__symbol, (unsigned long)__symbol} 
-#define DEBUG_NAMED_VALUE_END {NULL, 0} 
+#define DEBUG_NAMED_VALUE(__symbol) DEBUG_NAMED_VALUE_WITH_DESCRIPTION(__symbol, NULL)
+#define DEBUG_NAMED_VALUE_WITH_DESCRIPTION(__symbol, __desc) {#__symbol, (unsigned long)__symbol, __desc}
+#define DEBUG_NAMED_VALUE_END {NULL, 0, NULL}
 
 
 /**
diff --git a/src/gallium/auxiliary/util/u_debug_describe.c b/src/gallium/auxiliary/util/u_debug_describe.c
new file mode 100644
index 00000000000..1c90ff31069
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_describe.c
@@ -0,0 +1,81 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <pipe/p_state.h>
+#include <util/u_format.h>
+#include <util/u_debug_describe.h>
+#include <util/u_string.h>
+
+void
+debug_describe_reference(char* buf, const struct pipe_reference*ptr)
+{
+   strcpy(buf, "pipe_object");
+}
+
+void
+debug_describe_resource(char* buf, const struct pipe_resource *ptr)
+{
+   switch(ptr->target)
+   {
+   case PIPE_BUFFER:
+      util_sprintf(buf, "pipe_buffer<%u>", (unsigned)util_format_get_stride(ptr->format, ptr->width0));
+      break;
+   case PIPE_TEXTURE_1D:
+      util_sprintf(buf, "pipe_texture1d<%u,%s,%u>", ptr->width0, util_format_short_name(ptr->format), ptr->last_level);
+      break;
+   case PIPE_TEXTURE_2D:
+      util_sprintf(buf, "pipe_texture2d<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level);
+      break;
+   case PIPE_TEXTURE_RECT:
+      util_sprintf(buf, "pipe_texture_rect<%u,%u,%s>", ptr->width0, ptr->height0, util_format_short_name(ptr->format));
+      break;
+   case PIPE_TEXTURE_CUBE:
+      util_sprintf(buf, "pipe_texture_cube<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level);
+      break;
+   case PIPE_TEXTURE_3D:
+      util_sprintf(buf, "pipe_texture3d<%u,%u,%u,%s,%u>", ptr->width0, ptr->height0, ptr->depth0, util_format_short_name(ptr->format), ptr->last_level);
+      break;
+   default:
+      util_sprintf(buf, "pipe_martian_resource<%u>", ptr->target);
+      break;
+   }
+}
+
+void
+debug_describe_surface(char* buf, const struct pipe_surface *ptr)
+{
+   char res[128];
+   debug_describe_resource(res, ptr->texture);
+   util_sprintf(buf, "pipe_surface<%s,%u,%u,%u>", res, ptr->face, ptr->level, ptr->zslice);
+}
+
+void
+debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr)
+{
+   char res[128];
+   debug_describe_resource(res, ptr->texture);
+   util_sprintf(buf, "pipe_sampler_view<%s,%s>", res, util_format_short_name(ptr->format));
+}
diff --git a/src/gallium/auxiliary/util/u_debug_describe.h b/src/gallium/auxiliary/util/u_debug_describe.h
new file mode 100644
index 00000000000..26d1f803bf0
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_describe.h
@@ -0,0 +1,49 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_DEBUG_DESCRIBE_H_
+#define U_DEBUG_DESCRIBE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct pipe_reference;
+struct pipe_resource;
+struct pipe_surface;
+struct pipe_sampler_view;
+
+/* a 256-byte buffer is necessary and sufficient */
+void debug_describe_reference(char* buf, const struct pipe_reference*ptr);
+void debug_describe_resource(char* buf, const struct pipe_resource *ptr);
+void debug_describe_surface(char* buf, const struct pipe_surface *ptr);
+void debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_DEBUG_DESCRIBE_H_ */
diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.c b/src/gallium/auxiliary/util/u_debug_refcnt.c
new file mode 100644
index 00000000000..40a26c9c697
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_refcnt.c
@@ -0,0 +1,181 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER))
+
+/* see http://www.mozilla.org/performance/refcnt-balancer.html for what to do with the output
+ * on Linux, use tools/addr2line.sh to postprocess it before anything else
+ **/
+#include <util/u_debug.h>
+#include <util/u_debug_refcnt.h>
+#include <util/u_debug_stack.h>
+#include <util/u_debug_symbol.h>
+#include <util/u_string.h>
+#include <util/u_hash_table.h>
+#include <os/os_thread.h>
+#include <os/os_stream.h>
+
+int debug_refcnt_state;
+
+static struct os_stream* stream; /* log sink created from GALLIUM_REFCNT_LOG; file-private */
+
+/* TODO: maybe move this serial machinery to a stand-alone module and expose it? */
+static pipe_mutex serials_mutex;
+static struct util_hash_table* serials_hash;
+static unsigned serials_last;
+
+static unsigned hash_ptr(void* p)
+{
+   return (unsigned)(uintptr_t)p;
+}
+
+static int compare_ptr(void* a, void* b)
+{
+   if(a == b)
+      return 0;
+   else if(a < b)
+      return -1;
+   else
+      return 1;
+}
+
+static boolean debug_serial(void* p, unsigned* pserial)
+{
+   unsigned serial;
+   boolean found = TRUE;
+   pipe_mutex_lock(serials_mutex);
+   if(!serials_hash)
+      serials_hash = util_hash_table_create(hash_ptr, compare_ptr);
+   serial = (unsigned)(uintptr_t)util_hash_table_get(serials_hash, p);
+   if(!serial)
+   {
+      /* time to stop logging... (you'll have a 100 GB logfile at least at this point)
+       * TODO: avoid this
+       */
+      serial = ++serials_last;
+      if(!serial)
+      {
+         debug_error("More than 2^32 objects detected, aborting.\n");
+         os_abort();
+      }
+
+      util_hash_table_set(serials_hash, p, (void*)(uintptr_t)serial);
+      found = FALSE;
+   }
+   pipe_mutex_unlock(serials_mutex);
+   *pserial = serial;
+   return found;
+}
+
+static void debug_serial_delete(void* p)
+{
+   pipe_mutex_lock(serials_mutex);
+   util_hash_table_remove(serials_hash, p);
+   pipe_mutex_unlock(serials_mutex);
+}
+
+#define STACK_LEN 64
+
+static void dump_stack(const char* symbols[STACK_LEN])
+{
+   unsigned i;
+   for(i = 0; i < STACK_LEN; ++i)
+   {
+      if(symbols[i])
+         os_stream_printf(stream, "%s\n", symbols[i]);
+   }
+   os_stream_write(stream, "\n", 1);
+}
+
+void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+{
+   if(debug_refcnt_state < 0)
+      return;
+
+   if(!debug_refcnt_state)
+   {
+      const char* filename = debug_get_option("GALLIUM_REFCNT_LOG", NULL);
+      if(filename && filename[0])
+         stream = os_file_stream_create(filename);
+
+      if(stream)
+         debug_refcnt_state = 1;
+      else
+         debug_refcnt_state = -1;
+   }
+
+   if(debug_refcnt_state > 0)
+   {
+      struct debug_stack_frame frames[STACK_LEN];
+      const char* symbols[STACK_LEN];
+      char buf[1024];
+
+      unsigned i;
+      unsigned refcnt = p->count;
+      unsigned serial;
+      boolean existing = debug_serial((void*)p, &serial);
+
+      debug_backtrace_capture(frames, 1, STACK_LEN);
+      for(i = 0; i < STACK_LEN; ++i)
+      {
+         if(frames[i].function)
+            symbols[i] = debug_symbol_name_cached(frames[i].function);
+         else
+            symbols[i] = 0;
+      }
+
+      get_desc(buf, p);
+
+      if(!existing)
+      {
+         os_stream_printf(stream, "<%s> %p %u Create\n", buf, p, serial);
+         dump_stack(symbols);
+
+         /* this is there to provide a gradual change even if we don't see the initialization */
+         for(i = 1; i <= refcnt - change; ++i)
+         {
+            os_stream_printf(stream, "<%s> %p %u AddRef %u\n", buf, p, serial, i);
+            dump_stack(symbols);
+         }
+      }
+
+      if(change)
+      {
+         os_stream_printf(stream, "<%s> %p %u %s %u\n", buf, p, serial, change > 0 ? "AddRef" : "Release", refcnt);
+         dump_stack(symbols);
+      }
+
+      if(!refcnt)
+      {
+         debug_serial_delete((void*)p);
+         os_stream_printf(stream, "<%s> %p %u Destroy\n", buf, p, serial);
+         dump_stack(symbols);
+      }
+
+      os_stream_flush(stream);
+   }
+}
+#endif
diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.h b/src/gallium/auxiliary/util/u_debug_refcnt.h
new file mode 100644
index 00000000000..bea2d1c478a
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_debug_refcnt.h
@@ -0,0 +1,63 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_DEBUG_REFCNT_H_
+#define U_DEBUG_REFCNT_H_
+
+#include <pipe/p_config.h>
+#include <pipe/p_state.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*debug_reference_descriptor)(char*, const struct pipe_reference*);
+
+#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER))
+
+extern int debug_refcnt_state;
+
+void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change);
+
+static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+{
+   if (debug_refcnt_state >= 0)
+      debug_reference_slowpath(p, get_desc, change);
+}
+
+#else
+
+static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change)
+{
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_DEBUG_REFCNT_H_ */
diff --git a/src/gallium/auxiliary/util/u_debug_symbol.c b/src/gallium/auxiliary/util/u_debug_symbol.c
index 417d0cf04c9..332952af88b 100644
--- a/src/gallium/auxiliary/util/u_debug_symbol.c
+++ b/src/gallium/auxiliary/util/u_debug_symbol.c
@@ -33,9 +33,12 @@
  */
 
 #include "pipe/p_compiler.h"
+#include "os/os_thread.h"
+#include "u_string.h"
 
 #include "u_debug.h"
 #include "u_debug_symbol.h"
+#include "u_hash_table.h"
 
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86)
    
@@ -67,21 +70,6 @@ BOOL WINAPI j_SymInitialize(HANDLE hProcess, PSTR UserSearchPath, BOOL fInvadePr
       return FALSE;
 }
 
-typedef BOOL (WINAPI *PFNSYMCLEANUP)(HANDLE);
-static PFNSYMCLEANUP pfnSymCleanup = NULL;
-
-static
-BOOL WINAPI j_SymCleanup(HANDLE hProcess)
-{
-   if(
-      (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) &&
-      (pfnSymCleanup || (pfnSymCleanup = (PFNSYMCLEANUP) GetProcAddress(hModule_Imagehlp, "SymCleanup")))
-   )
-      return pfnSymCleanup(hProcess);
-   else
-      return FALSE;
-}
-
 typedef DWORD (WINAPI *PFNSYMSETOPTIONS)(DWORD);
 static PFNSYMSETOPTIONS pfnSymSetOptions = NULL;
 
@@ -97,36 +85,6 @@ DWORD WINAPI j_SymSetOptions(DWORD SymOptions)
       return FALSE;
 }
 
-typedef BOOL (WINAPI *PFNSYMUNDNAME)(PIMAGEHLP_SYMBOL, PSTR, DWORD);
-static PFNSYMUNDNAME pfnSymUnDName = NULL;
-
-static
-BOOL WINAPI j_SymUnDName(PIMAGEHLP_SYMBOL Symbol, PSTR UnDecName, DWORD UnDecNameLength)
-{
-   if(
-      (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) &&
-      (pfnSymUnDName || (pfnSymUnDName = (PFNSYMUNDNAME) GetProcAddress(hModule_Imagehlp, "SymUnDName")))
-   )
-      return pfnSymUnDName(Symbol, UnDecName, UnDecNameLength);
-   else
-      return FALSE;
-}
-
-typedef PFUNCTION_TABLE_ACCESS_ROUTINE PFNSYMFUNCTIONTABLEACCESS;
-static PFNSYMFUNCTIONTABLEACCESS pfnSymFunctionTableAccess = NULL;
-
-static
-PVOID WINAPI j_SymFunctionTableAccess(HANDLE hProcess, DWORD AddrBase)
-{
-   if(
-      (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) &&
-      (pfnSymFunctionTableAccess || (pfnSymFunctionTableAccess = (PFNSYMFUNCTIONTABLEACCESS) GetProcAddress(hModule_Imagehlp, "SymFunctionTableAccess")))
-   )
-      return pfnSymFunctionTableAccess(hProcess, AddrBase);
-   else
-      return NULL;
-}
-
 typedef PGET_MODULE_BASE_ROUTINE PFNSYMGETMODULEBASE;
 static PFNSYMGETMODULEBASE pfnSymGetModuleBase = NULL;
 
@@ -142,41 +100,6 @@ DWORD WINAPI j_SymGetModuleBase(HANDLE hProcess, DWORD dwAddr)
       return 0;
 }
 
-typedef BOOL (WINAPI *PFNSTACKWALK)(DWORD, HANDLE, HANDLE, LPSTACKFRAME, LPVOID, PREAD_PROCESS_MEMORY_ROUTINE, PFUNCTION_TABLE_ACCESS_ROUTINE, PGET_MODULE_BASE_ROUTINE, PTRANSLATE_ADDRESS_ROUTINE);
-static PFNSTACKWALK pfnStackWalk = NULL;
-
-static
-BOOL WINAPI j_StackWalk(
-   DWORD MachineType, 
-   HANDLE hProcess, 
-   HANDLE hThread, 
-   LPSTACKFRAME StackFrame, 
-   PVOID ContextRecord, 
-   PREAD_PROCESS_MEMORY_ROUTINE ReadMemoryRoutine,  
-   PFUNCTION_TABLE_ACCESS_ROUTINE FunctionTableAccessRoutine,
-   PGET_MODULE_BASE_ROUTINE GetModuleBaseRoutine, 
-   PTRANSLATE_ADDRESS_ROUTINE TranslateAddress 
-)
-{
-   if(
-      (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) &&
-      (pfnStackWalk || (pfnStackWalk = (PFNSTACKWALK) GetProcAddress(hModule_Imagehlp, "StackWalk")))
-   )
-      return pfnStackWalk(
-         MachineType, 
-         hProcess, 
-         hThread, 
-         StackFrame, 
-         ContextRecord, 
-         ReadMemoryRoutine,  
-         FunctionTableAccessRoutine,
-         GetModuleBaseRoutine, 
-         TranslateAddress 
-      );
-   else
-      return FALSE;
-}
-
 typedef BOOL (WINAPI *PFNSYMGETSYMFROMADDR)(HANDLE, DWORD, LPDWORD, PIMAGEHLP_SYMBOL);
 static PFNSYMGETSYMFROMADDR pfnSymGetSymFromAddr = NULL;
 
@@ -192,24 +115,9 @@ BOOL WINAPI j_SymGetSymFromAddr(HANDLE hProcess, DWORD Address, PDWORD Displacem
       return FALSE;
 }
 
-typedef BOOL (WINAPI *PFNSYMGETLINEFROMADDR)(HANDLE, DWORD, LPDWORD, PIMAGEHLP_LINE);
-static PFNSYMGETLINEFROMADDR pfnSymGetLineFromAddr = NULL;
 
-static
-BOOL WINAPI j_SymGetLineFromAddr(HANDLE hProcess, DWORD dwAddr, PDWORD pdwDisplacement, PIMAGEHLP_LINE Line)
-{
-   if(
-      (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) &&
-      (pfnSymGetLineFromAddr || (pfnSymGetLineFromAddr = (PFNSYMGETLINEFROMADDR) GetProcAddress(hModule_Imagehlp, "SymGetLineFromAddr")))
-   )
-      return pfnSymGetLineFromAddr(hProcess, dwAddr, pdwDisplacement, Line);
-   else
-      return FALSE;
-}
-
-
-static INLINE boolean
-debug_symbol_print_imagehlp(const void *addr)
+static INLINE void
+debug_symbol_name_imagehlp(const void *addr, char* buf, unsigned size)
 {
    HANDLE hProcess;
    BYTE symbolBuffer[1024];
@@ -226,25 +134,95 @@ debug_symbol_print_imagehlp(const void *addr)
       if(j_SymInitialize(hProcess, NULL, TRUE))
          bSymInitialized = TRUE;
    }
-      
+
    if(!j_SymGetSymFromAddr(hProcess, (DWORD)addr, &dwDisplacement, pSymbol))
-      return FALSE;
+      buf[0] = 0;
+   else
+   {
+      strncpy(buf, pSymbol->Name, size);
+      buf[size - 1] = 0;
+   }
+}
+#endif
 
-   debug_printf("\t%s\n", pSymbol->Name);
+#ifdef __GLIBC__
+#include <execinfo.h>
 
-   return TRUE;
-   
+/* This can only provide dynamic symbols, or binary offsets into a file.
+ *
+ * To fix this, post-process the output with tools/addr2line.sh
+ */
+static INLINE void
+debug_symbol_name_glibc(const void *addr, char* buf, unsigned size)
+{
+   char** syms = backtrace_symbols((void**)&addr, 1);
+   /* backtrace_symbols() returns NULL on failure: yield "" so the caller falls back to "%p" */
+   util_snprintf(buf, size, "%s", syms ? syms[0] : "");
+   free(syms);
+}
 #endif
 
-
 void
-debug_symbol_print(const void *addr)
+debug_symbol_name(const void *addr, char* buf, unsigned size)
 {
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86)
-   if(debug_symbol_print_imagehlp(addr))
+   debug_symbol_name_imagehlp(addr, buf, size);
+   if(buf[0])
       return;
 #endif
-   
-   debug_printf("\t%p\n", addr);
+
+#ifdef __GLIBC__
+   debug_symbol_name_glibc(addr, buf, size);
+   if(buf[0])
+      return;
+#endif
+
+   util_snprintf(buf, size, "%p", addr);
+   buf[size - 1] = 0;
+}
+
+void
+debug_symbol_print(const void *addr)
+{
+   char buf[1024];
+   debug_symbol_name(addr, buf, sizeof(buf));
+   debug_printf("\t%s\n", buf);
+}
+
+static struct util_hash_table* symbols_hash; /* address -> strdup'ed symbol name, created lazily */
+static pipe_mutex symbols_mutex; /* guards symbols_hash */
+
+static unsigned hash_ptr(void* p)
+{
+   return (unsigned)(uintptr_t)p;
+}
+
+static int compare_ptr(void* a, void* b)
+{
+   if(a == b)
+      return 0;
+   else if(a < b)
+      return -1;
+   else
+      return 1;
+}
+
+const char*
+debug_symbol_name_cached(const void *addr)
+{
+   const char* name;
+   pipe_mutex_lock(symbols_mutex);
+   if(!symbols_hash)
+      symbols_hash = util_hash_table_create(hash_ptr, compare_ptr);
+   name = util_hash_table_get(symbols_hash, (void*)addr);
+   if(!name)
+   {
+      char buf[1024];
+      debug_symbol_name(addr, buf, sizeof(buf));
+      name = strdup(buf);
+
+      util_hash_table_set(symbols_hash, (void*)addr, (void*)name);
+   }
+   pipe_mutex_unlock(symbols_mutex);
+   return name;
 }
diff --git a/src/gallium/auxiliary/util/u_debug_symbol.h b/src/gallium/auxiliary/util/u_debug_symbol.h
index 021586987b6..b247706c2a0 100644
--- a/src/gallium/auxiliary/util/u_debug_symbol.h
+++ b/src/gallium/auxiliary/util/u_debug_symbol.h
@@ -43,8 +43,13 @@ extern "C" {
 
 
 void
-debug_symbol_print(const void *addr);
+debug_symbol_name(const void *addr, char* buf, unsigned size);
+
+const char*
+debug_symbol_name_cached(const void *addr);
 
+void
+debug_symbol_print(const void *addr);
 
 #ifdef	__cplusplus
 }
diff --git a/src/gallium/auxiliary/util/u_dirty_flags.h b/src/gallium/auxiliary/util/u_dirty_flags.h
new file mode 100644
index 00000000000..7e1be45ad5a
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_dirty_flags.h
@@ -0,0 +1,28 @@
+#ifndef U_DIRTY_FLAGS_H
+#define U_DIRTY_FLAGS_H
+
+/* Here's a convenient list of dirty flags to use in a driver.  Either
+ * include it directly or use it as a starting point for your own
+ * list.
+ */
+#define U_NEW_VIEWPORT              0x1
+#define U_NEW_RASTERIZER            0x2
+#define U_NEW_FS                    0x4
+#define U_NEW_FS_CONSTANTS          0x8
+#define U_NEW_FS_SAMPLER_VIEW       0x10
+#define U_NEW_FS_SAMPLER_STATES     0x20
+#define U_NEW_VS                    0x40
+#define U_NEW_VS_CONSTANTS          0x80
+#define U_NEW_VS_SAMPLER_VIEW       0x100
+#define U_NEW_VS_SAMPLER_STATES     0x200
+#define U_NEW_BLEND                 0x400
+#define U_NEW_CLIP                  0x800
+#define U_NEW_SCISSOR               0x1000
+#define U_NEW_POLYGON_STIPPLE       0x2000
+#define U_NEW_FRAMEBUFFER           0x4000
+#define U_NEW_VERTEX_ELEMENTS       0x8000
+#define U_NEW_VERTEX_BUFFER         0x10000
+#define U_NEW_QUERY                 0x20000
+#define U_NEW_DEPTH_STENCIL         0x40000
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_dirty_surfaces.h b/src/gallium/auxiliary/util/u_dirty_surfaces.h
index 99f260bf967..fd1bbe5ffdf 100644
--- a/src/gallium/auxiliary/util/u_dirty_surfaces.h
+++ b/src/gallium/auxiliary/util/u_dirty_surfaces.h
@@ -1,9 +1,39 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #ifndef U_DIRTY_SURFACES_H_
 #define U_DIRTY_SURFACES_H_
 
+#include "pipe/p_state.h"
+
 #include "util/u_double_list.h"
 #include "util/u_math.h"
 
+struct pipe_context;
+
 typedef void (*util_dirty_surface_flush_t) (struct pipe_context *, struct pipe_surface *);
 
 struct util_dirty_surfaces
diff --git a/src/gallium/auxiliary/util/u_dl.h b/src/gallium/auxiliary/util/u_dl.h
index 2853b447c61..80a00ed6796 100644
--- a/src/gallium/auxiliary/util/u_dl.h
+++ b/src/gallium/auxiliary/util/u_dl.h
@@ -35,10 +35,13 @@
 
 #if defined(PIPE_OS_WINDOWS)
 #  define UTIL_DL_EXT ".dll"
+#  define UTIL_DL_PREFIX ""
 #elif defined(PIPE_OS_APPLE)
 #  define UTIL_DL_EXT ".dylib"
+#  define UTIL_DL_PREFIX "lib"
 #else
 #  define UTIL_DL_EXT ".so"
+#  define UTIL_DL_PREFIX "lib"
 #endif
 
 
diff --git a/src/gallium/auxiliary/util/u_double_list.h b/src/gallium/auxiliary/util/u_double_list.h
index 53bb1342ddc..42adb1f0699 100644
--- a/src/gallium/auxiliary/util/u_double_list.h
+++ b/src/gallium/auxiliary/util/u_double_list.h
@@ -98,5 +98,20 @@ struct list_head
 #define LIST_IS_EMPTY(__list)                   \
     ((__list)->next == (__list))
 
-
+#ifndef container_of
+#define container_of(ptr, sample, member)				\
+    (void *)((char *)(ptr)						\
+	     - ((char *)&(sample)->member - (char *)(sample)))
+#endif
+
+#define LIST_FOR_EACH_ENTRY(pos, head, member)				\
+   for (pos = container_of((head)->next, pos, member);			\
+	&pos->member != (head);						\
+	pos = container_of(pos->member.next, pos, member))
+
+#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member)	\
+   for (pos = container_of((head)->next, pos, member),			\
+	storage = container_of(pos->member.next, pos, member);	\
+	&pos->member != (head);						\
+	pos = storage, storage = container_of(storage->member.next, storage, member))
 #endif /*_U_DOUBLE_LIST_H_*/
diff --git a/src/gallium/auxiliary/util/u_draw.h b/src/gallium/auxiliary/util/u_draw.h
new file mode 100644
index 00000000000..f06d09ef91d
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_draw.h
@@ -0,0 +1,139 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_DRAW_H
+#define U_DRAW_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+
+static INLINE void
+util_draw_init_info(struct pipe_draw_info *info)
+{
+   memset(info, 0, sizeof(*info));
+   info->instance_count = 1;
+   info->max_index = 0xffffffff;
+}
+
+
+static INLINE void
+util_draw_arrays(struct pipe_context *pipe, uint mode, uint start, uint count)
+{
+   struct pipe_draw_info info;
+
+   util_draw_init_info(&info);
+   info.mode = mode;
+   info.start = start;
+   info.count = count;
+   info.min_index = start;
+   info.max_index = start + count - 1;
+
+   pipe->draw_vbo(pipe, &info);
+}
+
+static INLINE void
+util_draw_elements(struct pipe_context *pipe, int index_bias,
+                   uint mode, uint start, uint count)
+{
+   struct pipe_draw_info info;
+
+   util_draw_init_info(&info);
+   info.indexed = TRUE;
+   info.mode = mode;
+   info.start = start;
+   info.count = count;
+   info.index_bias = index_bias;
+
+   pipe->draw_vbo(pipe, &info);
+}
+
+static INLINE void
+util_draw_arrays_instanced(struct pipe_context *pipe,
+                           uint mode, uint start, uint count,
+                           uint start_instance,
+                           uint instance_count)
+{
+   struct pipe_draw_info info;
+
+   util_draw_init_info(&info);
+   info.mode = mode;
+   info.start = start;
+   info.count = count;
+   info.start_instance = start_instance;
+   info.instance_count = instance_count;
+   info.min_index = start;
+   info.max_index = start + count - 1;
+
+   pipe->draw_vbo(pipe, &info);
+}
+
+static INLINE void
+util_draw_elements_instanced(struct pipe_context *pipe,
+                             int index_bias,
+                             uint mode, uint start, uint count,
+                             uint start_instance,
+                             uint instance_count)
+{
+   struct pipe_draw_info info;
+
+   util_draw_init_info(&info);
+   info.indexed = TRUE;
+   info.mode = mode;
+   info.start = start;
+   info.count = count;
+   info.index_bias = index_bias;
+   info.start_instance = start_instance;
+   info.instance_count = instance_count;
+
+   pipe->draw_vbo(pipe, &info);
+}
+
+static INLINE void
+util_draw_range_elements(struct pipe_context *pipe,
+                         int index_bias,
+                         uint min_index,
+                         uint max_index,
+                         uint mode, uint start, uint count)
+{
+   struct pipe_draw_info info;
+
+   util_draw_init_info(&info);
+   info.indexed = TRUE;
+   info.mode = mode;
+   info.start = start;
+   info.count = count;
+   info.index_bias = index_bias;
+   info.min_index = min_index;
+   info.max_index = max_index;
+
+   pipe->draw_vbo(pipe, &info);
+}
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_draw_quad.c b/src/gallium/auxiliary/util/u_draw_quad.c
index b37b48b5aef..0b6dc5880f3 100644
--- a/src/gallium/auxiliary/util/u_draw_quad.c
+++ b/src/gallium/auxiliary/util/u_draw_quad.c
@@ -60,7 +60,7 @@ util_draw_vertex_buffer(struct pipe_context *pipe,
    /* note: vertex elements already set by caller */
 
    /* draw */
-   pipe->draw_arrays(pipe, prim_type, 0, num_verts);
+   util_draw_arrays(pipe, prim_type, 0, num_verts);
 }
 
 
diff --git a/src/gallium/auxiliary/util/u_draw_quad.h b/src/gallium/auxiliary/util/u_draw_quad.h
index 42eb1844289..52994fe05c3 100644
--- a/src/gallium/auxiliary/util/u_draw_quad.h
+++ b/src/gallium/auxiliary/util/u_draw_quad.h
@@ -29,12 +29,18 @@
 #define U_DRAWQUAD_H
 
 
+#include "pipe/p_compiler.h"
+#include "pipe/p_context.h"
+
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 struct pipe_resource;
 
+#include "util/u_draw.h"
+
 extern void 
 util_draw_vertex_buffer(struct pipe_context *pipe,
                         struct pipe_resource *vbuf, uint offset,
diff --git a/src/gallium/auxiliary/util/u_dump.h b/src/gallium/auxiliary/util/u_dump.h
index bdc73ac47d2..49536c0d593 100644
--- a/src/gallium/auxiliary/util/u_dump.h
+++ b/src/gallium/auxiliary/util/u_dump.h
@@ -71,9 +71,15 @@ const char *
 util_dump_blend_func(unsigned value, boolean shortened);
 
 const char *
+util_dump_logicop(unsigned value, boolean shortened);
+
+const char *
 util_dump_func(unsigned value, boolean shortened);
 
 const char *
+util_dump_stencil_op(unsigned value, boolean shortened);
+
+const char *
 util_dump_tex_target(unsigned value, boolean shortened);
 
 const char *
diff --git a/src/gallium/auxiliary/util/u_dump_defines.c b/src/gallium/auxiliary/util/u_dump_defines.c
index 96a22563473..692d4447c66 100644
--- a/src/gallium/auxiliary/util/u_dump_defines.c
+++ b/src/gallium/auxiliary/util/u_dump_defines.c
@@ -160,6 +160,49 @@ DEFINE_UTIL_DUMP_CONTINUOUS(blend_func)
 
 
 static const char *
+util_dump_logicop_names[] = {
+   "PIPE_LOGICOP_CLEAR",
+   "PIPE_LOGICOP_NOR",
+   "PIPE_LOGICOP_AND_INVERTED",
+   "PIPE_LOGICOP_COPY_INVERTED",
+   "PIPE_LOGICOP_AND_REVERSE",
+   "PIPE_LOGICOP_INVERT",
+   "PIPE_LOGICOP_XOR",
+   "PIPE_LOGICOP_NAND",
+   "PIPE_LOGICOP_AND",
+   "PIPE_LOGICOP_EQUIV",
+   "PIPE_LOGICOP_NOOP",
+   "PIPE_LOGICOP_OR_INVERTED",
+   "PIPE_LOGICOP_COPY",
+   "PIPE_LOGICOP_OR_REVERSE",
+   "PIPE_LOGICOP_OR",
+   "PIPE_LOGICOP_SET"
+};
+
+static const char *
+util_dump_logicop_short_names[] = {
+   "clear",
+   "nor",
+   "and_inverted",
+   "copy_inverted",
+   "and_reverse",
+   "invert",
+   "xor",
+   "nand",
+   "and",
+   "equiv",
+   "noop",
+   "or_inverted",
+   "copy",
+   "or_reverse",
+   "or",
+   "set"
+};
+
+DEFINE_UTIL_DUMP_CONTINUOUS(logicop)
+
+
+static const char *
 util_dump_func_names[] = {
    "PIPE_FUNC_NEVER",
    "PIPE_FUNC_LESS",
@@ -187,7 +230,35 @@ DEFINE_UTIL_DUMP_CONTINUOUS(func)
 
 
 static const char *
+util_dump_stencil_op_names[] = {
+   "PIPE_STENCIL_OP_KEEP",
+   "PIPE_STENCIL_OP_ZERO",
+   "PIPE_STENCIL_OP_REPLACE",
+   "PIPE_STENCIL_OP_INCR",
+   "PIPE_STENCIL_OP_DECR",
+   "PIPE_STENCIL_OP_INCR_WRAP",
+   "PIPE_STENCIL_OP_DECR_WRAP",
+   "PIPE_STENCIL_OP_INVERT"
+};
+
+static const char *
+util_dump_stencil_op_short_names[] = {
+   "keep",
+   "zero",
+   "replace",
+   "incr",
+   "decr",
+   "incr_wrap",
+   "decr_wrap",
+   "invert"
+};
+
+DEFINE_UTIL_DUMP_CONTINUOUS(stencil_op)
+
+
+static const char *
 util_dump_tex_target_names[] = {
+   "PIPE_BUFFER",
    "PIPE_TEXTURE_1D",
    "PIPE_TEXTURE_2D",
    "PIPE_TEXTURE_3D",
@@ -196,6 +267,7 @@ util_dump_tex_target_names[] = {
 
 static const char *
 util_dump_tex_target_short_names[] = {
+   "buffer",
    "1d",
    "2d",
    "3d",
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index 2ce643e90cd..cda5b8ba512 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -300,12 +300,13 @@ util_dump_rasterizer_state(struct os_stream *stream, const struct pipe_rasterize
 
    util_dump_member(stream, bool, state, flatshade);
    util_dump_member(stream, bool, state, light_twoside);
-   util_dump_member(stream, uint, state, front_winding);
-   util_dump_member(stream, uint, state, cull_mode);
-   util_dump_member(stream, uint, state, fill_cw);
-   util_dump_member(stream, uint, state, fill_ccw);
-   util_dump_member(stream, bool, state, offset_cw);
-   util_dump_member(stream, bool, state, offset_ccw);
+   util_dump_member(stream, uint, state, front_ccw);
+   util_dump_member(stream, uint, state, cull_face);
+   util_dump_member(stream, uint, state, fill_front);
+   util_dump_member(stream, uint, state, fill_back);
+   util_dump_member(stream, bool, state, offset_point);
+   util_dump_member(stream, bool, state, offset_line);
+   util_dump_member(stream, bool, state, offset_tri);
    util_dump_member(stream, bool, state, scissor);
    util_dump_member(stream, bool, state, poly_smooth);
    util_dump_member(stream, bool, state, poly_stipple_enable);
diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h
index 9d1c1713a7c..980cadf22d1 100644
--- a/src/gallium/auxiliary/util/u_dynarray.h
+++ b/src/gallium/auxiliary/util/u_dynarray.h
@@ -106,6 +106,9 @@ util_dynarray_trim(struct util_dynarray *buf)
 #define util_dynarray_pop_ptr(buf, type) (type*)((char*)(buf)->data + ((buf)->size -= sizeof(type)))
 #define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type)
 #define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type))
+#define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx))
+#define util_dynarray_begin(buf) ((buf)->data)
+#define util_dynarray_end(buf) ((void*)util_dynarray_element((buf), char, (buf)->size))
 
 #endif /* U_DYNARRAY_H */
 
diff --git a/src/gallium/auxiliary/util/u_format.c b/src/gallium/auxiliary/util/u_format.c
index c50c807eb89..4896faa12bf 100644
--- a/src/gallium/auxiliary/util/u_format.c
+++ b/src/gallium/auxiliary/util/u_format.c
@@ -120,11 +120,67 @@ util_format_write_4ub(enum pipe_format format, const uint8_t *src, unsigned src_
 }
 
 
-static INLINE boolean
+boolean
+util_is_format_compatible(const struct util_format_description *src_desc,
+                          const struct util_format_description *dst_desc)
+{
+   unsigned chan;
+
+   if (src_desc->format == dst_desc->format) {
+      return TRUE;
+   }
+
+   if (src_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+       dst_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
+      return FALSE;
+   }
+
+   if (src_desc->block.bits != dst_desc->block.bits ||
+       src_desc->nr_channels != dst_desc->nr_channels ||
+       src_desc->colorspace != dst_desc->colorspace) {
+      return FALSE;
+   }
+
+   for (chan = 0; chan < 4; ++chan) {
+      if (src_desc->channel[chan].size !=
+          dst_desc->channel[chan].size) {
+         return FALSE;
+      }
+   }
+
+   for (chan = 0; chan < 4; ++chan) {
+      enum util_format_swizzle swizzle = dst_desc->swizzle[chan];
+
+      if (swizzle < 4) {
+         if (src_desc->swizzle[chan] != swizzle) {
+            return FALSE;
+         }
+         if ((src_desc->channel[swizzle].type !=
+              dst_desc->channel[swizzle].type) ||
+             (src_desc->channel[swizzle].normalized !=
+              dst_desc->channel[swizzle].normalized)) {
+            return FALSE;
+         }
+      }
+   }
+
+   return TRUE;
+}
+
+
+boolean
 util_format_fits_8unorm(const struct util_format_description *format_desc)
 {
    unsigned chan;
 
+   /*
+    * After linearization, sRGB values require more than 8 bits.
+    */
+
+   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+      return FALSE;
+   }
+
    switch (format_desc->layout) {
 
    case UTIL_FORMAT_LAYOUT_S3TC:
@@ -189,11 +245,14 @@ util_format_translate(enum pipe_format dst_format,
    const struct util_format_description *src_format_desc;
    uint8_t *dst_row;
    const uint8_t *src_row;
-   unsigned y_step;
+   unsigned x_step, y_step;
    unsigned dst_step;
    unsigned src_step;
 
-   if (dst_format == src_format) {
+   dst_format_desc = util_format_description(dst_format);
+   src_format_desc = util_format_description(src_format);
+
+   if (util_is_format_compatible(src_format_desc, dst_format_desc)) {
       /*
        * Trivial case.
        */
@@ -204,9 +263,6 @@ util_format_translate(enum pipe_format dst_format,
       return;
    }
 
-   dst_format_desc = util_format_description(dst_format);
-   src_format_desc = util_format_description(src_format);
-
    assert(dst_x % dst_format_desc->block.width == 0);
    assert(dst_y % dst_format_desc->block.height == 0);
    assert(src_x % src_format_desc->block.width == 0);
@@ -221,6 +277,7 @@ util_format_translate(enum pipe_format dst_format,
     */
 
    y_step = MAX2(dst_format_desc->block.height, src_format_desc->block.height);
+   x_step = MAX2(dst_format_desc->block.width, src_format_desc->block.width);
    assert(y_step % dst_format_desc->block.height == 0);
    assert(y_step % src_format_desc->block.height == 0);
 
@@ -237,7 +294,7 @@ util_format_translate(enum pipe_format dst_format,
       unsigned tmp_stride;
       uint8_t *tmp_row;
 
-      tmp_stride = width * 4 * sizeof *tmp_row;
+      tmp_stride = MAX2(width, x_step) * 4 * sizeof *tmp_row;
       tmp_row = MALLOC(y_step * tmp_stride);
       if (!tmp_row)
          return;
@@ -262,7 +319,7 @@ util_format_translate(enum pipe_format dst_format,
       unsigned tmp_stride;
       float *tmp_row;
 
-      tmp_stride = width * 4 * sizeof *tmp_row;
+      tmp_stride = MAX2(width, x_step) * 4 * sizeof *tmp_row;
       tmp_row = MALLOC(y_step * tmp_stride);
       if (!tmp_row)
          return;
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index 605b13bd114..03b73c0e98f 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -213,6 +213,16 @@ struct util_format_description
                        unsigned width, unsigned height);
 
    /**
+    * Fetch a single pixel (i, j) from a block.
+    *
+    * XXX: Only defined for a very few select formats.
+    */
+   void
+   (*fetch_rgba_8unorm)(uint8_t *dst,
+                        const uint8_t *src,
+                        unsigned i, unsigned j);
+
+   /**
     * Unpack pixel blocks to R32G32B32A32_FLOAT.
     * Note: strides are in bytes.
     *
@@ -332,12 +342,60 @@ util_format_name(enum pipe_format format)
 
    assert(desc);
    if (!desc) {
-      return "???";
+      return "PIPE_FORMAT_???";
    }
 
    return desc->name;
 }
 
+static INLINE const char *
+util_format_short_name(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(desc);
+   if (!desc) {
+      return "???";
+   }
+
+   return desc->short_name;
+}
+
+/**
+ * Whether this format is plain, see UTIL_FORMAT_LAYOUT_PLAIN for more info.
+ */
+static INLINE boolean
+util_format_is_plain(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   if (!format) {
+      return FALSE;
+   }
+
+   return desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ? TRUE : FALSE;
+}
+
+static INLINE boolean 
+util_format_is_compressed(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(desc);
+   if (!desc) {
+      return FALSE;
+   }
+
+   switch (desc->layout) {
+   case UTIL_FORMAT_LAYOUT_S3TC:
+   case UTIL_FORMAT_LAYOUT_RGTC:
+      /* XXX add other formats in the future */
+      return TRUE;
+   default:
+      return FALSE;
+   }
+}
+
 static INLINE boolean 
 util_format_is_s3tc(enum pipe_format format)
 {
@@ -382,6 +440,48 @@ util_format_is_depth_and_stencil(enum pipe_format format)
            desc->swizzle[1] != UTIL_FORMAT_SWIZZLE_NONE) ? TRUE : FALSE;
 }
 
+
+/**
+ * Give the RGBA colormask of the channels that can be represented in this
+ * format.
+ *
+ * That is, the channels whose values are preserved.
+ */
+static INLINE unsigned
+util_format_colormask(const struct util_format_description *desc)
+{
+   unsigned colormask;
+   unsigned chan;
+
+   switch (desc->colorspace) {
+   case UTIL_FORMAT_COLORSPACE_RGB:
+   case UTIL_FORMAT_COLORSPACE_SRGB:
+   case UTIL_FORMAT_COLORSPACE_YUV:
+      colormask = 0;
+      for (chan = 0; chan < 4; ++chan) {
+         if (desc->swizzle[chan] < 4) {
+            colormask |= (1 << chan);
+         }
+      }
+      return colormask;
+   case UTIL_FORMAT_COLORSPACE_ZS:
+      return 0;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+
+/**
+ * Whether the src format can be blitted to the destination format with a
+ * simple memcpy.
+ */
+boolean
+util_is_format_compatible(const struct util_format_description *src_desc,
+                          const struct util_format_description *dst_desc);
+
+
 /**
  * Whether this format is a rgab8 variant.
  *
@@ -573,6 +673,44 @@ util_format_has_alpha(enum pipe_format format)
 }
 
 /**
+ * Return the matching SRGB format, or PIPE_FORMAT_NONE if none.
+ */
+static INLINE enum pipe_format
+util_format_srgb(enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_L8_UNORM:
+      return PIPE_FORMAT_L8_SRGB;
+   case PIPE_FORMAT_L8A8_UNORM:
+      return PIPE_FORMAT_L8A8_SRGB;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return PIPE_FORMAT_R8G8B8_SRGB;
+   case PIPE_FORMAT_A8B8G8R8_UNORM:
+      return PIPE_FORMAT_A8B8G8R8_SRGB;
+   case PIPE_FORMAT_X8B8G8R8_UNORM:
+      return PIPE_FORMAT_X8B8G8R8_SRGB;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return PIPE_FORMAT_B8G8R8A8_SRGB;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return PIPE_FORMAT_B8G8R8X8_SRGB;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return PIPE_FORMAT_A8R8G8B8_SRGB;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      return PIPE_FORMAT_X8R8G8B8_SRGB;
+   case PIPE_FORMAT_DXT1_RGB:
+      return PIPE_FORMAT_DXT1_SRGB;
+   case PIPE_FORMAT_DXT1_RGBA:
+      return PIPE_FORMAT_DXT1_SRGBA;
+   case PIPE_FORMAT_DXT3_RGBA:
+      return PIPE_FORMAT_DXT3_SRGBA;
+   case PIPE_FORMAT_DXT5_RGBA:
+      return PIPE_FORMAT_DXT5_SRGBA;
+   default:
+      return PIPE_FORMAT_NONE;
+   }
+}
+
+/**
  * Return the number of components stored.
  * Formats with block size != 1x1 will always have 1 component (the block).
  */
@@ -615,6 +753,9 @@ util_format_write_4ub(enum pipe_format format,
  * Generic format conversion;
  */
 
+boolean
+util_format_fits_8unorm(const struct util_format_description *format_desc);
+
 void
 util_format_translate(enum pipe_format dst_format,
                       void *dst, unsigned dst_stride,
diff --git a/src/gallium/auxiliary/util/u_format_other.c b/src/gallium/auxiliary/util/u_format_other.c
index 723fa8c3bf9..fa42ec37138 100644
--- a/src/gallium/auxiliary/util/u_format_other.c
+++ b/src/gallium/auxiliary/util/u_format_other.c
@@ -121,6 +121,15 @@ util_format_r1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
  * A.k.a. D3DFMT_CxV8U8
  */
 
+static uint8_t
+r8g8bx_derive(int16_t r, int16_t g)
+{
+   /* Derive blue from red and green components.
+    * Apparently, we must always use integers to perform calculations,
+    * otherwise the results won't match D3D's CxV8U8 definition.
+    */
+   return (uint8_t)sqrtf(0x7f * 0x7f - r * r - g * g) * 0xff / 0x7f;
+}
 
 void
 util_format_r8g8bx_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
@@ -145,7 +154,7 @@ util_format_r8g8bx_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
 
          dst[0] = (float)(r * (1.0f/0x7f)); /* r */
          dst[1] = (float)(g * (1.0f/0x7f)); /* g */
-         dst[2] = sqrtf(1.0f - dst[0] * dst[0] - dst[1] * dst[1]); /* b */
+         dst[2] = r8g8bx_derive(r, g) * (1.0f/0xff); /* b */
          dst[3] = 1.0f; /* a */
          dst += 4;
       }
@@ -177,7 +186,7 @@ util_format_r8g8bx_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_strid
 
          dst[0] = (uint8_t)(((uint16_t)MAX2(r, 0)) * 0xff / 0x7f); /* r */
          dst[1] = (uint8_t)(((uint16_t)MAX2(g, 0)) * 0xff / 0x7f); /* g */
-         dst[2] = (uint8_t)sqrtf(0x7f*0x7f - r * r - g * g) * 0xff / 0x7f; /* b */
+         dst[2] = r8g8bx_derive(r, g); /* b */
          dst[3] = 255; /* a */
          dst += 4;
       }
@@ -262,6 +271,6 @@ util_format_r8g8bx_snorm_fetch_rgba_float(float *dst, const uint8_t *src,
 
    dst[0] = r * (1.0f/0x7f); /* r */
    dst[1] = g * (1.0f/0x7f); /* g */
-   dst[2] = sqrtf(1.0f - dst[0] * dst[0] - dst[1] * dst[1]); /* b */
+   dst[2] = r8g8bx_derive(r, g) * (1.0f/0xff); /* b */
    dst[3] = 1.0f; /* a */
 }
diff --git a/src/gallium/auxiliary/util/u_format_pack.py b/src/gallium/auxiliary/util/u_format_pack.py
index 0c1bbc84c17..6d0016c0ad8 100644
--- a/src/gallium/auxiliary/util/u_format_pack.py
+++ b/src/gallium/auxiliary/util/u_format_pack.py
@@ -37,9 +37,6 @@
 '''
 
 
-import sys
-import math
-
 from u_format_parse import *
 
 
diff --git a/src/gallium/auxiliary/util/u_format_parse.py b/src/gallium/auxiliary/util/u_format_parse.py
index 7076c676aaf..ddb9f2443d9 100755
--- a/src/gallium/auxiliary/util/u_format_parse.py
+++ b/src/gallium/auxiliary/util/u_format_parse.py
@@ -43,7 +43,7 @@ ZS = 'zs'
 
 
 def is_pot(x):
-   return (x & (x - 1)) == 0;
+   return (x & (x - 1)) == 0
 
 
 VERY_LARGE = 99999999999999999999999
diff --git a/src/gallium/auxiliary/util/u_format_s3tc.c b/src/gallium/auxiliary/util/u_format_s3tc.c
index 5b279b8fe26..bb989c29d81 100644
--- a/src/gallium/auxiliary/util/u_format_s3tc.c
+++ b/src/gallium/auxiliary/util/u_format_s3tc.c
@@ -120,7 +120,7 @@ util_format_s3tc_init(void)
    library = util_dl_open(DXTN_LIBNAME);
    if (!library) {
       debug_printf("couldn't open " DXTN_LIBNAME ", software DXTn "
-         "compression/decompression unavailable");
+         "compression/decompression unavailable\n");
       return;
    }
 
@@ -142,7 +142,7 @@ util_format_s3tc_init(void)
        !util_format_dxtn_pack) {
       debug_printf("couldn't reference all symbols in " DXTN_LIBNAME
                    ", software DXTn compression/decompression "
-                   "unavailable");
+                   "unavailable\n");
       util_dl_close(library);
       return;
    }
diff --git a/src/gallium/auxiliary/util/u_format_srgb.py b/src/gallium/auxiliary/util/u_format_srgb.py
index a4c76dc00b3..3e8000f3687 100644
--- a/src/gallium/auxiliary/util/u_format_srgb.py
+++ b/src/gallium/auxiliary/util/u_format_srgb.py
@@ -39,7 +39,6 @@
 '''
 
 
-import sys
 import math
 
 
diff --git a/src/gallium/auxiliary/util/u_format_table.py b/src/gallium/auxiliary/util/u_format_table.py
index ae9a5981973..f0b407b8b8e 100755
--- a/src/gallium/auxiliary/util/u_format_table.py
+++ b/src/gallium/auxiliary/util/u_format_table.py
@@ -132,12 +132,17 @@ def write_format_table(formats):
         if format.colorspace != ZS:
             print "   &util_format_%s_unpack_rgba_8unorm," % format.short_name() 
             print "   &util_format_%s_pack_rgba_8unorm," % format.short_name() 
+            if format.layout == 's3tc':
+                print "   &util_format_%s_fetch_rgba_8unorm," % format.short_name()
+            else:
+                print "   NULL, /* fetch_rgba_8unorm */" 
             print "   &util_format_%s_unpack_rgba_float," % format.short_name() 
             print "   &util_format_%s_pack_rgba_float," % format.short_name() 
             print "   &util_format_%s_fetch_rgba_float," % format.short_name()
         else:
             print "   NULL, /* unpack_rgba_8unorm */" 
             print "   NULL, /* pack_rgba_8unorm */" 
+            print "   NULL, /* fetch_rgba_8unorm */" 
             print "   NULL, /* unpack_rgba_float */" 
             print "   NULL, /* pack_rgba_float */" 
             print "   NULL, /* fetch_rgba_float */" 
diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c
new file mode 100644
index 00000000000..7803ec6a8b5
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_framebuffer.c
@@ -0,0 +1,148 @@
+/**************************************************************************
+ *
+ * Copyright 2009-2010 VMware, Inc.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Framebuffer utility functions.
+ *  
+ * @author Brian Paul
+ */
+
+
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+
+#include "util/u_memory.h"
+#include "util/u_framebuffer.h"
+
+
+/**
+ * Compare pipe_framebuffer_state objects.
+ * \return TRUE if same, FALSE if different
+ */
+boolean
+util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
+                             const struct pipe_framebuffer_state *src)
+{
+   unsigned i;
+
+   if (dst->width != src->width ||
+       dst->height != src->height)
+      return FALSE;
+
+   for (i = 0; i < Elements(src->cbufs); i++) {
+      if (dst->cbufs[i] != src->cbufs[i]) {
+         return FALSE;
+      }
+   }
+
+   if (dst->nr_cbufs != src->nr_cbufs) {
+      return FALSE;
+   }
+
+   if (dst->zsbuf != src->zsbuf) {
+      return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Copy framebuffer state from src to dst, updating refcounts.
+ */
+void
+util_copy_framebuffer_state(struct pipe_framebuffer_state *dst,
+                            const struct pipe_framebuffer_state *src)
+{
+   unsigned i;
+
+   dst->width = src->width;
+   dst->height = src->height;
+
+   for (i = 0; i < src->nr_cbufs; i++)
+      pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]);
+
+   for (i = src->nr_cbufs; i < dst->nr_cbufs; i++)
+      pipe_surface_reference(&dst->cbufs[i], NULL);
+
+   dst->nr_cbufs = src->nr_cbufs;
+
+   pipe_surface_reference(&dst->zsbuf, src->zsbuf);
+}
+
+
+void
+util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb)
+{
+   unsigned i;
+
+   for (i = 0; i < fb->nr_cbufs; i++) {
+      pipe_surface_reference(&fb->cbufs[i], NULL);
+   }
+
+   pipe_surface_reference(&fb->zsbuf, NULL);
+
+   fb->width = fb->height = 0;
+   fb->nr_cbufs = 0;
+}
+
+
+/* Where multiple sizes are allowed for framebuffer surfaces, find the
+ * minimum width and height of all bound surfaces.
+ */
+boolean
+util_framebuffer_min_size(const struct pipe_framebuffer_state *fb,
+                          unsigned *width,
+                          unsigned *height)
+{
+   unsigned w = ~0;
+   unsigned h = ~0;
+   unsigned i;
+
+   for (i = 0; i < fb->nr_cbufs; i++) {
+      w = MIN2(w, fb->cbufs[i]->width);
+      h = MIN2(h, fb->cbufs[i]->height);
+   }
+
+   if (fb->zsbuf) {
+      w = MIN2(w, fb->zsbuf->width);
+      h = MIN2(h, fb->zsbuf->height);
+   }
+
+   if (w == ~0) {
+      *width = 0;
+      *height = 0;
+      return FALSE;
+   }
+   else {
+      *width = w;
+      *height = h;
+      return TRUE;
+   }
+}
diff --git a/src/gallium/auxiliary/util/u_framebuffer.h b/src/gallium/auxiliary/util/u_framebuffer.h
new file mode 100644
index 00000000000..e7dc1e9e41d
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_framebuffer.h
@@ -0,0 +1,54 @@
+/**************************************************************************
+ *
+ * Copyright 2009-2010 VMware, Inc.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef U_FRAMEBUFFER_H
+#define U_FRAMEBUFFER_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+
+extern boolean
+util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
+                             const struct pipe_framebuffer_state *src);
+
+extern void
+util_copy_framebuffer_state(struct pipe_framebuffer_state *dst,
+                            const struct pipe_framebuffer_state *src);
+
+
+extern void
+util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb);
+
+
+extern boolean
+util_framebuffer_min_size(const struct pipe_framebuffer_state *fb,
+                          unsigned *width,
+                          unsigned *height);
+
+#endif /* U_FRAMEBUFFER_H */
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index eee6030ddcc..6a931a95819 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -1255,6 +1255,7 @@ fallback_gen_mipmap(struct gen_mipmap_state *ctx,
       make_1d_mipmap(ctx, pt, face, baseLevel, lastLevel);
       break;
    case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_RECT:
    case PIPE_TEXTURE_CUBE:
       make_2d_mipmap(ctx, pt, face, baseLevel, lastLevel);
       break;
@@ -1295,8 +1296,7 @@ util_create_gen_mipmap(struct pipe_context *pipe,
 
    /* rasterizer */
    memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
-   ctx->rasterizer.front_winding = PIPE_WINDING_CW;
-   ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
+   ctx->rasterizer.cull_face = PIPE_FACE_NONE;
    ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* sampler state */
@@ -1328,8 +1328,10 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    }
 
    /* fragment shader */
-   ctx->fs2d = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D);
-   ctx->fsCube = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE);
+   ctx->fs2d = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D,
+                                             TGSI_INTERPOLATE_LINEAR);
+   ctx->fsCube = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE,
+                                               TGSI_INTERPOLATE_LINEAR);
 
    /* vertex data that doesn't change */
    for (i = 0; i < 4; i++) {
@@ -1494,7 +1496,7 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
 
    /* check if we can render in the texture's format */
    if (!screen->is_format_supported(screen, psv->format, PIPE_TEXTURE_2D,
-                                    PIPE_BIND_RENDER_TARGET, 0)) {
+                                    pt->nr_samples, PIPE_BIND_RENDER_TARGET, 0)) {
       fallback_gen_mipmap(ctx, pt, face, baseLevel, lastLevel);
       return;
    }
diff --git a/src/gallium/auxiliary/util/u_half.py b/src/gallium/auxiliary/util/u_half.py
index 8007482e971..915cf3b9273 100644
--- a/src/gallium/auxiliary/util/u_half.py
+++ b/src/gallium/auxiliary/util/u_half.py
@@ -83,11 +83,11 @@ for i in xrange(1, 1024):
 
 	# normalize number
 	while (m & 0x00800000) == 0:
-		e -= 0x00800000;
-		m <<= 1;
+		e -= 0x00800000
+		m <<= 1
 
-	m &= ~0x00800000;
-	e += 0x38800000;
+	m &= ~0x00800000
+	e += 0x38800000
 	value(m | e)
 
 # normals
diff --git a/src/gallium/auxiliary/util/u_index_modify.c b/src/gallium/auxiliary/util/u_index_modify.c
new file mode 100644
index 00000000000..65b079ed537
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_index_modify.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2010 Marek Olšák <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "pipe/p_context.h"
+#include "util/u_index_modify.h"
+#include "util/u_inlines.h"
+
+/* Replace *elts with a newly created 16-bit index buffer that holds the
+ * ubyte indices [start, start + count) of the old buffer, each one
+ * offset by index_bias.  The old buffer is not unreferenced here. */
+void util_shorten_ubyte_elts(struct pipe_context *context,
+			     struct pipe_resource **elts,
+			     int index_bias,
+			     unsigned start,
+			     unsigned count)
+{
+    struct pipe_transfer *src_transfer, *dst_transfer;
+    struct pipe_resource *new_elts;
+    const unsigned char *src;
+    unsigned short *dst;
+    unsigned i;
+
+    /* Two bytes per output index. */
+    new_elts = pipe_buffer_create(context->screen, PIPE_BIND_INDEX_BUFFER,
+                                  2 * count);
+
+    src = pipe_buffer_map(context, *elts, PIPE_TRANSFER_READ, &src_transfer);
+    dst = pipe_buffer_map(context, new_elts, PIPE_TRANSFER_WRITE, &dst_transfer);
+
+    src += start;
+
+    for (i = 0; i < count; i++)
+        dst[i] = (unsigned short)(src[i] + index_bias);
+
+    pipe_buffer_unmap(context, *elts, src_transfer);
+    pipe_buffer_unmap(context, new_elts, dst_transfer);
+
+    *elts = new_elts;
+}
+
+/* Build a new 16-bit index buffer from the ushort indices
+ * [start, start + count) of *elts with index_bias added to each one,
+ * and store it in *elts.  The old buffer is not unreferenced here. */
+void util_rebuild_ushort_elts(struct pipe_context *context,
+			      struct pipe_resource **elts,
+			      int index_bias,
+			      unsigned start, unsigned count)
+{
+    struct pipe_transfer *in_transfer = NULL;
+    struct pipe_transfer *out_transfer = NULL;
+    struct pipe_resource *new_elts;
+    const unsigned short *src;
+    unsigned short *dst;
+    unsigned i;
+
+    /* Two bytes per output index. */
+    new_elts = pipe_buffer_create(context->screen, PIPE_BIND_INDEX_BUFFER,
+                                  2 * count);
+
+    src = pipe_buffer_map(context, *elts, PIPE_TRANSFER_READ, &in_transfer);
+    dst = pipe_buffer_map(context, new_elts, PIPE_TRANSFER_WRITE,
+                          &out_transfer);
+
+    src += start;
+
+    for (i = 0; i < count; i++)
+        dst[i] = (unsigned short)(src[i] + index_bias);
+
+    pipe_buffer_unmap(context, *elts, in_transfer);
+    pipe_buffer_unmap(context, new_elts, out_transfer);
+
+    *elts = new_elts;
+}
+
+/* Build a new 32-bit index buffer from the uint indices
+ * [start, start + count) of *elts with index_bias added to each one,
+ * and store it in *elts.  The old buffer is not unreferenced here. */
+void util_rebuild_uint_elts(struct pipe_context *context,
+			    struct pipe_resource **elts,
+			    int index_bias,
+			    unsigned start, unsigned count)
+{
+    struct pipe_transfer *in_transfer = NULL;
+    struct pipe_transfer *out_transfer = NULL;
+    struct pipe_resource *new_elts;
+    unsigned int *in_map;
+    unsigned int *out_map;
+    unsigned i;
+
+    /* Each output index is a full unsigned int, not two bytes. */
+    new_elts = pipe_buffer_create(context->screen, PIPE_BIND_INDEX_BUFFER,
+                                  count * sizeof(unsigned int));
+
+    in_map = pipe_buffer_map(context, *elts,
+                             PIPE_TRANSFER_READ, &in_transfer);
+    out_map = pipe_buffer_map(context, new_elts,
+                              PIPE_TRANSFER_WRITE, &out_transfer);
+
+    in_map += start;
+    for (i = 0; i < count; i++)
+        out_map[i] = (unsigned int)(in_map[i] + index_bias);
+
+    pipe_buffer_unmap(context, *elts, in_transfer);
+    pipe_buffer_unmap(context, new_elts, out_transfer);
+
+    *elts = new_elts;
+}
diff --git a/src/gallium/auxiliary/util/u_index_modify.h b/src/gallium/auxiliary/util/u_index_modify.h
new file mode 100644
index 00000000000..01a6cae94fc
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_index_modify.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2010 Marek Olšák <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef UTIL_INDEX_MODIFY_H
+#define UTIL_INDEX_MODIFY_H
+
+void util_shorten_ubyte_elts(struct pipe_context *context,
+			     struct pipe_resource **elts,
+			     int index_bias,
+			     unsigned start,
+			     unsigned count);
+
+void util_rebuild_ushort_elts(struct pipe_context *context,
+			      struct pipe_resource **elts,
+			      int index_bias,
+			      unsigned start, unsigned count);
+
+void util_rebuild_uint_elts(struct pipe_context *context,
+			    struct pipe_resource **elts,
+			    int index_bias,
+			    unsigned start, unsigned count);
+#endif
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
index a48689ee8be..6ed39561fbe 100644
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -33,6 +33,8 @@
 #include "pipe/p_state.h"
 #include "pipe/p_screen.h"
 #include "util/u_debug.h"
+#include "util/u_debug_describe.h"
+#include "util/u_debug_refcnt.h"
 #include "util/u_atomic.h"
 #include "util/u_box.h"
 #include "util/u_math.h"
@@ -67,7 +69,9 @@ pipe_is_referenced(struct pipe_reference *reference)
  * \return TRUE if the object's refcount hits zero and should be destroyed.
  */
 static INLINE boolean
-pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
+pipe_reference_described(struct pipe_reference *ptr, 
+                         struct pipe_reference *reference, 
+                         debug_reference_descriptor get_desc)
 {
    boolean destroy = FALSE;
 
@@ -76,6 +80,7 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
       if (reference) {
          assert(pipe_is_referenced(reference));
          p_atomic_inc(&reference->count);
+         debug_reference(reference, get_desc, 1);
       }
 
       if (ptr) {
@@ -83,41 +88,49 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
          if (p_atomic_dec_zero(&ptr->count)) {
             destroy = TRUE;
          }
+         debug_reference(ptr, get_desc, -1);
       }
    }
 
    return destroy;
 }
 
+static INLINE boolean
+pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
+{
+   return pipe_reference_described(ptr, reference, 
+                                   (debug_reference_descriptor)debug_describe_reference);
+}
 
 static INLINE void
 pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
 {
    struct pipe_surface *old_surf = *ptr;
 
-   if (pipe_reference(&(*ptr)->reference, &surf->reference))
+   if (pipe_reference_described(&(*ptr)->reference, &surf->reference, 
+                                (debug_reference_descriptor)debug_describe_surface))
       old_surf->texture->screen->tex_surface_destroy(old_surf);
    *ptr = surf;
 }
 
-
 static INLINE void
 pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex)
 {
    struct pipe_resource *old_tex = *ptr;
 
-   if (pipe_reference(&(*ptr)->reference, &tex->reference))
+   if (pipe_reference_described(&(*ptr)->reference, &tex->reference, 
+                                (debug_reference_descriptor)debug_describe_resource))
       old_tex->screen->resource_destroy(old_tex->screen, old_tex);
    *ptr = tex;
 }
 
-
 static INLINE void
 pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_view *view)
 {
    struct pipe_sampler_view *old_view = *ptr;
 
-   if (pipe_reference(&(*ptr)->reference, &view->reference))
+   if (pipe_reference_described(&(*ptr)->reference, &view->reference,
+                                (debug_reference_descriptor)debug_describe_sampler_view))
       old_view->context->sampler_view_destroy(old_view->context, old_view);
    *ptr = view;
 }
@@ -369,6 +382,23 @@ pipe_transfer_destroy( struct pipe_context *context,
 }
 
 
+/* Return the polygon-offset enable of templ that applies to fill_mode. */
+static INLINE boolean
+util_get_offset(const struct pipe_rasterizer_state *templ, unsigned fill_mode)
+{
+   if (fill_mode == PIPE_POLYGON_MODE_POINT)
+      return templ->offset_point;
+
+   if (fill_mode == PIPE_POLYGON_MODE_LINE)
+      return templ->offset_line;
+
+   if (fill_mode == PIPE_POLYGON_MODE_FILL)
+      return templ->offset_tri;
+   /* Not a known polygon mode. */
+   assert(0);
+   return FALSE;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/util/u_linear.h b/src/gallium/auxiliary/util/u_linear.h
index 42c40b2aa75..81ffc9fb27d 100644
--- a/src/gallium/auxiliary/util/u_linear.h
+++ b/src/gallium/auxiliary/util/u_linear.h
@@ -33,6 +33,7 @@
 #ifndef U_LINEAR_H
 #define U_LINEAR_H
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 
 struct u_linear_format_block
diff --git a/src/gallium/auxiliary/util/u_linkage.c b/src/gallium/auxiliary/util/u_linkage.c
new file mode 100644
index 00000000000..2f6f41ba843
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_linkage.c
@@ -0,0 +1,149 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_debug.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+#include "util/u_linkage.h"
+
+/* Test-and-set one bit; TRUE if it was already set.  Only registers actually used (not just declared) get recorded. */
+static INLINE boolean
+util_semantic_set_test_and_set(struct util_semantic_set *set, unsigned value)
+{
+   unsigned long mask = 1ul << (value % (sizeof(long) * 8)); /* word-wide: long may be 64 bits */
+   unsigned long *p = &set->masks[value / (sizeof(long) * 8)];
+   boolean was_set = (*p & mask) != 0;
+   *p |= mask;
+   return was_set;
+}
+
+/* Collect into `set` the GENERIC semantic indices of the `file` (INPUT or OUTPUT)
+ * registers that instructions actually reference; returns how many were found. */
+unsigned
+util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file)
+{
+   struct tgsi_shader_info info;
+   struct tgsi_parse_context parse;
+   unsigned count = 0;
+   ubyte *semantic_name;
+   ubyte *semantic_index;
+
+   memset(set->masks, 0, sizeof(set->masks));
+   tgsi_scan_shader(tokens, &info);
+
+   if(file == TGSI_FILE_INPUT)
+   {
+      semantic_name = info.input_semantic_name;
+      semantic_index = info.input_semantic_index;
+   }
+   else if(file == TGSI_FILE_OUTPUT)
+   {
+      semantic_name = info.output_semantic_name;
+      semantic_index = info.output_semantic_index;
+   }
+   else
+   {
+      /* Unsupported file: return the empty set instead of reading through NULL. */
+      assert(0);
+      return 0;
+   }
+
+   tgsi_parse_init(&parse, tokens);
+
+   while(!tgsi_parse_end_of_tokens(&parse))
+   {
+      tgsi_parse_token(&parse);
+
+      if(parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION)
+      {
+	 const struct tgsi_full_instruction *finst = &parse.FullToken.FullInstruction;
+	 unsigned i;
+	 /* destination operands */
+	 for(i = 0; i < finst->Instruction.NumDstRegs; ++i)
+	 {
+	    if(finst->Dst[i].Register.File == file)
+	    {
+	       unsigned idx = finst->Dst[i].Register.Index;
+	       if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC &&
+		  !util_semantic_set_test_and_set(set, semantic_index[idx]))
+		  ++count;
+	    }
+	 }
+
+	 /* source operands */
+	 for(i = 0; i < finst->Instruction.NumSrcRegs; ++i)
+	 {
+	    if(finst->Src[i].Register.File == file)
+	    {
+	       unsigned idx = finst->Src[i].Register.Index;
+	       if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC &&
+		  !util_semantic_set_test_and_set(set, semantic_index[idx]))
+		  ++count;
+	    }
+	 }
+      }
+   }
+   tgsi_parse_free(&parse);
+
+   return count;
+}
+
+#define UTIL_SEMANTIC_SET_FOR_EACH(i, set) for(i = 0; i < 256; ++i) if((set)->masks[i / (sizeof(long) * 8)] & (1ul << (i % (sizeof(long) * 8))))
+
+/* Map each semantic in the set to a layout[] slot (0xff = unused slot): identity if
+ * the last one fits in efficient_slots, else offset by the first, else packed densely. */
+void
+util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots)
+{
+   int first = -1;
+   int last = -1;
+   unsigned i;
+   memset(layout, 0xff, num_slots);
+   UTIL_SEMANTIC_SET_FOR_EACH(i, set)
+   {
+      if(first < 0)
+	 first = i;
+      last = i;
+   }
+   /* Compare as signed: first and last stay -1 for an empty set. */
+   if(last < (int)efficient_slots)
+   {
+      UTIL_SEMANTIC_SET_FOR_EACH(i, set)
+         layout[i] = i;
+   }
+   else if((last - first) < (int)efficient_slots)
+   {
+      UTIL_SEMANTIC_SET_FOR_EACH(i, set)
+         layout[i - first] = i;
+   }
+   else
+   {
+      unsigned idx = 0;
+      UTIL_SEMANTIC_SET_FOR_EACH(i, set)
+         layout[idx++] = i;
+   }
+}
diff --git a/src/gallium/auxiliary/util/u_linkage.h b/src/gallium/auxiliary/util/u_linkage.h
new file mode 100644
index 00000000000..4720e0ee603
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_linkage.h
@@ -0,0 +1,66 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_LINKAGE_H_
+#define U_LINKAGE_H_
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_shader_tokens.h"
+
+struct util_semantic_set
+{
+   unsigned long masks[256 / 8 / sizeof(unsigned long)]; /* 256-bit bitmap: one bit per value 0..255 */
+};
+/* TRUE if `value` is in the set: word = value / bits-per-long, bit = value % bits-per-long. */
+static INLINE bool
+util_semantic_set_contains(struct util_semantic_set *set, unsigned char value)
+{
+   return !!(set->masks[value / (sizeof(long) * 8)] & (1ul << (value % (sizeof(long) * 8))));
+}
+
+unsigned util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file);
+
+/* efficient_slots is the number of slots such that hardware performance is
+ * the same whether that many slots are used with holes, or fewer slots are
+ * used with fewer holes.
+ *
+ * num_slots is the size of the layout array, i.e. the hardware limit.
+ *
+ * efficient_slots == 0 or efficient_slots == num_slots are typical settings.
+ */
+void util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots);
+
+static INLINE void
+util_semantic_table_from_layout(unsigned char *table, unsigned char *layout, unsigned char first_slot_value, unsigned char num_slots)
+{
+   int i;
+   /* table is indexed by an unsigned char (incl. 0xff), so it is assumed to hold 256 entries; sizeof(table) was only the pointer size */
+   memset(table, 0xff, 256);
+   for(i = 0; i < num_slots; ++i)
+      table[layout[i]] = first_slot_value + i;
+}
+
+#endif /* U_LINKAGE_H_ */
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index d1ec13def30..69a76814945 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -168,6 +168,9 @@ static INLINE float logf( float f )
 #undef logf
 #define logf(x) ((float)log((double)(x)))
 #endif /* logf */
+
+#define isfinite(x) _finite((double)(x))
+#define isnan(x) _isnan((double)(x))
 #endif
 
 static INLINE double log2( double x )
@@ -335,26 +338,25 @@ util_iround(float f)
 }
 
 
-
 /**
- * Test if x is NaN or +/- infinity.
+ * Approximate floating point comparison
  */
 static INLINE boolean
-util_is_inf_or_nan(float x)
+util_is_approx(float a, float b, float tol)
 {
-   union fi tmp;
-   tmp.f = x;
-   return !(int)((unsigned int)((tmp.i & 0x7fffffff)-0x7f800000) >> 31);
+   return fabs(b - a) <= tol;
 }
 
 
 /**
- * Test whether x is a power of two.
+ * Test if x is NaN or +/- infinity.
  */
 static INLINE boolean
-util_is_pot(unsigned x)
+util_is_inf_or_nan(float x)
 {
-   return (x & (x - 1)) == 0;
+   union fi tmp;
+   tmp.f = x;
+   return !(int)((unsigned int)((tmp.i & 0x7fffffff)-0x7f800000) >> 31);
 }
 
 
@@ -554,13 +556,30 @@ util_bswap16(uint16_t n)
 #define MIN3( A, B, C ) MIN2( MIN2( A, B ), C )
 #define MAX3( A, B, C ) MAX2( MAX2( A, B ), C )
 
+#define MIN4( A, B, C, D ) MIN2( MIN2( A, B ), MIN2(C, D) )
+#define MAX4( A, B, C, D ) MAX2( MAX2( A, B ), MAX2(C, D) )
+
 
+/**
+ * Align a value; only works for power-of-two (pot) alignments.
+ */
 static INLINE int
 align(int value, int alignment)
 {
    return (value + alignment - 1) & ~(alignment - 1);
 }
 
+/**
+ * Round value up to the next multiple of alignment; unlike align() the alignment may be npot.
+ */
+static INLINE size_t
+util_align_npot(size_t value, size_t alignment)
+{
+   const size_t remainder = value % alignment;
+
+   return remainder ? value + (alignment - remainder) : value;
+}
+
 static INLINE unsigned
 u_minify(unsigned value, unsigned levels)
 {
diff --git a/src/gallium/auxiliary/util/u_mempool.c b/src/gallium/auxiliary/util/u_mempool.c
new file mode 100644
index 00000000000..1f336b39a1a
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_mempool.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright 2010 Marek Olšák <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "util/u_mempool.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+
+#include <stdio.h>
+
+#define UTIL_MEMPOOL_MAGIC 0xcafe4321
+
+/* Header at the start of every block; a block is either allocated memory or free space. */
+struct util_mempool_block {
+   /* Next block on the pool's free list. */
+   struct util_mempool_block *next_free;
+
+   /* Holds UTIL_MEMPOOL_MAGIC; asserted by the malloc and free paths. */
+   intptr_t magic;
+
+   /* Memory after the last member is dedicated to the block itself.
+    * The allocated size is always larger than this structure. */
+};
+
+/* Address of block number `index` inside `page`; blocks follow the page header. */
+static struct util_mempool_block *
+util_mempool_get_block(struct util_mempool *pool,
+                       struct util_mempool_page *page, unsigned index)
+{
+   uint8_t *blocks = (uint8_t*)page + sizeof(struct util_mempool_page);
+   return (struct util_mempool_block*)(blocks + pool->block_size * index);
+}
+
+/* Allocate one more page and push all of its blocks onto the free list.
+ * The pool is left unchanged if the page cannot be allocated. */
+static void util_mempool_add_new_page(struct util_mempool *pool)
+{
+   struct util_mempool_page *page;
+   struct util_mempool_block *block;
+   unsigned i;
+
+   page = MALLOC(pool->page_size);
+   if (!page)
+      return;
+   insert_at_tail(&pool->list, page);
+
+   /* Chain the blocks; the last one links to the previous free list. */
+   for (i = 0; i < pool->num_blocks; i++) {
+      block = util_mempool_get_block(pool, page, i);
+      block->next_free = (i + 1 < pool->num_blocks) ?
+         util_mempool_get_block(pool, page, i + 1) : pool->first_free;
+      block->magic = UTIL_MEMPOOL_MAGIC;
+   }
+   pool->first_free = util_mempool_get_block(pool, page, 0);
+   pool->num_pages++;
+}
+
+/* Pop a block off the free list, growing the pool by a page when it is
+ * empty.  Returns NULL if that page cannot be allocated. */
+static void *util_mempool_malloc_st(struct util_mempool *pool)
+{
+   struct util_mempool_block *block;
+
+   if (!pool->first_free)
+      util_mempool_add_new_page(pool);
+   block = pool->first_free;
+   if (!block)
+      return NULL;
+   assert(block->magic == UTIL_MEMPOOL_MAGIC);
+   pool->first_free = block->next_free;
+
+   return (uint8_t*)block + sizeof(struct util_mempool_block);
+}
+
+/* Push the block that owns ptr back onto the head of the free list. */
+static void util_mempool_free_st(struct util_mempool *pool, void *ptr)
+{
+   /* The block header sits immediately before the user pointer. */
+   struct util_mempool_block *block = (struct util_mempool_block*)ptr - 1;
+
+   assert(block->magic == UTIL_MEMPOOL_MAGIC);
+   block->next_free = pool->first_free;
+   pool->first_free = block;
+}
+
+/* Allocation entry point for shared pools: util_mempool_malloc_st under the pool mutex. */
+static void *util_mempool_malloc_mt(struct util_mempool *pool)
+{
+   void *result;
+   pipe_mutex_lock(pool->mutex);
+   result = util_mempool_malloc_st(pool);
+   pipe_mutex_unlock(pool->mutex);
+   return result;
+}
+
+static void util_mempool_free_mt(struct util_mempool *pool, void *ptr)
+{
+   pipe_mutex_lock(pool->mutex); /* same as util_mempool_free_st, but under the pool mutex */
+   util_mempool_free_st(pool, ptr);
+   pipe_mutex_unlock(pool->mutex);
+}
+
+/* Record the threading mode and install the matching malloc/free entry points. */
+void util_mempool_set_thread_safety(struct util_mempool *pool,
+                                    enum util_mempool_threading threading)
+{
+   pool->threading = threading;
+   if (!threading) {
+      pool->malloc = util_mempool_malloc_st;
+      pool->free = util_mempool_free_st;
+      return;
+   }
+   pool->malloc = util_mempool_malloc_mt;
+   pool->free = util_mempool_free_mt;
+}
+
+/* Initialize an empty pool whose pages hold num_blocks blocks of item_size bytes each. */
+void util_mempool_create(struct util_mempool *pool,
+                         unsigned item_size,
+                         unsigned num_blocks,
+                         enum util_mempool_threading threading)
+{
+   unsigned block_size = align(item_size, sizeof(intptr_t));
+
+   block_size = align(sizeof(struct util_mempool_block) + block_size,
+                      sizeof(intptr_t));
+
+   pool->first_free = NULL;
+   pool->num_pages = 0;
+   pool->num_blocks = num_blocks;
+   pool->block_size = block_size;
+   pool->page_size = sizeof(struct util_mempool_page) + num_blocks * block_size;
+
+   make_empty_list(&pool->list);
+   pipe_mutex_init(pool->mutex);
+   util_mempool_set_thread_safety(pool, threading);
+}
+
+/* Unlink and free every page of the pool, then destroy its mutex. */
+void util_mempool_destroy(struct util_mempool *pool)
+{
+   struct util_mempool_page *cur, *next;
+
+   foreach_s(cur, next, &pool->list) {
+      remove_from_list(cur);
+      FREE(cur);
+   }
+   pipe_mutex_destroy(pool->mutex);
+}
diff --git a/src/gallium/auxiliary/util/u_mempool.h b/src/gallium/auxiliary/util/u_mempool.h
new file mode 100644
index 00000000000..a5b5d6a9b7c
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_mempool.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2010 Marek Olšák <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/**
+ * @file
+ * Simple memory pool for equally sized memory allocations.
+ * util_mempool_malloc and util_mempool_free are in O(1).
+ *
+ * Good for allocations which have very low lifetime and are allocated
+ * and freed very often. Use a profiler first!
+ *
+ * Candidates: get_transfer, user_buffer_create
+ *
+ * @author Marek Olšák
+ */
+
+#ifndef U_MEMPOOL_H
+#define U_MEMPOOL_H
+
+#include "os/os_thread.h"
+
+enum util_mempool_threading {
+   UTIL_MEMPOOL_SINGLETHREADED = FALSE,
+   UTIL_MEMPOOL_MULTITHREADED = TRUE
+};
+
+/* The page is an array of blocks (allocations). */
+struct util_mempool_page {
+   /* The header (linked-list pointers). */
+   struct util_mempool_page *prev, *next;
+
+   /* Memory after the last member is dedicated to the page itself.
+    * The allocated size is always larger than this structure. */
+};
+
+struct util_mempool {
+   /* Public members. */
+   void *(*malloc)(struct util_mempool *pool); /* installed by util_mempool_set_thread_safety */
+   void (*free)(struct util_mempool *pool, void *ptr); /* installed by util_mempool_set_thread_safety */
+
+   /* Private members. */
+   struct util_mempool_block *first_free; /* head of the free-block list; NULL when empty */
+
+   struct util_mempool_page list; /* head of the list of allocated pages */
+
+   unsigned block_size; /* block header + aligned item size, in bytes */
+   unsigned page_size; /* page header + num_blocks * block_size, in bytes */
+   unsigned num_blocks; /* blocks per page */
+   unsigned num_pages; /* pages allocated so far */
+   enum util_mempool_threading threading;
+
+   pipe_mutex mutex; /* taken by the multithreaded malloc/free entry points */
+};
+
+void util_mempool_create(struct util_mempool *pool,
+                         unsigned item_size,
+                         unsigned num_blocks,
+                         enum util_mempool_threading threading);
+
+void util_mempool_destroy(struct util_mempool *pool);
+
+void util_mempool_set_thread_safety(struct util_mempool *pool,
+                                    enum util_mempool_threading threading);
+
+#define util_mempool_malloc(pool)    (pool)->malloc(pool)
+#define util_mempool_free(pool, ptr) (pool)->free(pool, ptr)
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_network.c b/src/gallium/auxiliary/util/u_network.c
index 87ee0e47685..77f2c5fc7de 100644
--- a/src/gallium/auxiliary/util/u_network.c
+++ b/src/gallium/auxiliary/util/u_network.c
@@ -6,7 +6,7 @@
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  include <winsock2.h>
 #  include <windows.h>
-#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_CYGWIN)
 #  include <sys/socket.h>
 #  include <netinet/in.h>
 #  include <unistd.h>
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index 3ebef9fb749..c90b0fdbc3f 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -42,12 +42,18 @@
 #include "util/u_math.h"
 
 
-
+/**
+ * Helper union for packing pixel values.
+ * Will often contain values in formats which are too complex to be described
+ * in simple terms, hence might just effectively contain a number of bytes.
+ * Must be big enough to hold data for all formats (currently 256 bits).
+ */
 union util_color {
    ubyte ub;
    ushort us;
    uint ui;
    float f[4];
+   double d[4];
 };
 
 /**
@@ -388,7 +394,7 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color *
       return;
    case PIPE_FORMAT_B4G4R4A4_UNORM:
       {
-         uc->ub = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4);
+         uc->us = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4);
       }
       return;
    case PIPE_FORMAT_A8_UNORM:
@@ -425,6 +431,53 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color *
    }
 }
  
+/* Integer versions of util_pack_z and util_pack_z_stencil - useful for
+ * constructing clear masks.
+ */
+static INLINE uint
+util_pack_uint_z(enum pipe_format format, unsigned z)
+{
+   switch (format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      return z & 0xffff;
+   case PIPE_FORMAT_Z32_UNORM:
+   case PIPE_FORMAT_Z32_FLOAT:
+      return z;
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+   case PIPE_FORMAT_Z24X8_UNORM:
+      return z & 0xffffff;          /* depth kept in the low 24 bits */
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
+      return (z & 0xffffff) << 8;   /* depth moved to the high 24 bits */
+   case PIPE_FORMAT_S8_USCALED:
+      return 0;                     /* stencil-only: no depth bits */
+   default:
+      debug_print_format("gallium: unhandled format in util_pack_uint_z()", format);
+      assert(0);
+      return 0;
+   }
+}
+
+static INLINE uint
+util_pack_uint_z_stencil(enum pipe_format format, unsigned z, uint s)
+{
+   unsigned packed = util_pack_uint_z(format, z);   /* z is an integer, not [0,1] */
+
+   s &= 0xff;
+
+   switch (format) {
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+      return packed | (s << 24);   /* stencil in the top byte */
+   case PIPE_FORMAT_S8_USCALED_Z24_UNORM:
+      return packed | s;           /* stencil in the bottom byte */
+   case PIPE_FORMAT_S8_USCALED:
+      return packed | s;           /* packed is 0 here: stencil only */
+   default:
+      return packed;               /* format carries no stencil */
+   }
+}
+
+
 
 /**
  * Note: it's assumed that z is in [0,1]
diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h
index e1af9f11cb9..cce0c7430e7 100644
--- a/src/gallium/auxiliary/util/u_pointer.h
+++ b/src/gallium/auxiliary/util/u_pointer.h
@@ -98,6 +98,29 @@ align16( void *unaligned )
    return align_pointer( unaligned, 16 );
 }
 
+typedef void (*func_pointer)(void);
+
+static INLINE func_pointer
+pointer_to_func( void *p )
+{
+   union {
+      void *p;
+      func_pointer f;
+   } pf;
+   pf.p = p;
+   return pf.f;
+}
+
+static INLINE void *
+func_to_pointer( func_pointer f )
+{
+   union {
+      void *p;
+      func_pointer f;
+   } pf;
+   pf.f = f;
+   return pf.p;
+}
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/auxiliary/util/u_prim.h b/src/gallium/auxiliary/util/u_prim.h
index 64390e13851..3c851f73401 100644
--- a/src/gallium/auxiliary/util/u_prim.h
+++ b/src/gallium/auxiliary/util/u_prim.h
@@ -108,6 +108,20 @@ static INLINE boolean u_trim_pipe_prim( unsigned pipe_prim, unsigned *nr )
       ok = (*nr >= 4);
       *nr -= (*nr % 2);
       break;
+   case PIPE_PRIM_LINES_ADJACENCY:
+      ok = (*nr >= 4);
+      *nr -= (*nr % 4);
+      break;
+   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      ok = (*nr >= 4);
+      break;
+   case PIPE_PRIM_TRIANGLES_ADJACENCY:
+      ok = (*nr >= 6);
+      *nr -= (*nr % 6);   /* each triangle with adjacency uses 6 vertices */
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      ok = (*nr >= 4);
+      break;
    default:
       ok = 0;
       break;
@@ -169,6 +183,52 @@ u_vertices_per_prim(int primitive)
    }
 }
 
+/**
+ * Returns the number of decomposed primitives for the given
+ * vertex count.
+ * Geometry shader is invoked once for each triangle in
+ * triangle strip, triangle fans and triangles and once
+ * for each line in line strip, line loop, lines.
+ */
+static INLINE unsigned
+u_gs_prims_for_vertices(int primitive, int vertices)
+{
+   switch(primitive) {
+   case PIPE_PRIM_POINTS:
+      return vertices;
+   case PIPE_PRIM_LINES:
+      return vertices / 2;
+   case PIPE_PRIM_LINE_LOOP:
+      return vertices;
+   case PIPE_PRIM_LINE_STRIP:
+      return vertices - 1;
+   case PIPE_PRIM_TRIANGLES:
+      return vertices /  3;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      return vertices - 2;
+   case PIPE_PRIM_TRIANGLE_FAN:
+      return vertices - 2;
+   case PIPE_PRIM_LINES_ADJACENCY:
+      return vertices / 2;
+   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      return vertices - 1;
+   case PIPE_PRIM_TRIANGLES_ADJACENCY:
+      return vertices / 3;
+   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      return vertices - 2;
+
+   /* following primitives should never be used
+    * with geometry shaders and their size is
+    * undefined */
+   case PIPE_PRIM_POLYGON:
+   case PIPE_PRIM_QUADS:
+   case PIPE_PRIM_QUAD_STRIP:
+   default:
+      debug_printf("Unrecognized geometry shader primitive");
+      return 3;
+   }
+}
+
 const char *u_prim_name( unsigned pipe_prim );
 
 #endif
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index 098cdfd58b1..56fcfac0693 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -30,13 +30,9 @@
  */
 
 
-#include "pipe/p_defines.h"
-#include "pipe/p_format.h"
-#include "pipe/p_context.h"
-#include "pipe/p_screen.h"
 #include "util/u_format.h"
-#include "util/u_inlines.h"
 #include "util/u_rect.h"
+#include "util/u_pack_color.h"
 
 
 /**
@@ -99,7 +95,7 @@ util_fill_rect(ubyte * dst,
                unsigned dst_y,
                unsigned width,
                unsigned height,
-               uint32_t value)
+               union util_color *uc)
 {
    unsigned i, j;
    unsigned width_size;
@@ -115,193 +111,54 @@ util_fill_rect(ubyte * dst,
    dst_y /= blockheight;
    width = (width + blockwidth - 1)/blockwidth;
    height = (height + blockheight - 1)/blockheight;
-   
+
    dst += dst_x * blocksize;
    dst += dst_y * dst_stride;
    width_size = width * blocksize;
-   
+
    switch (blocksize) {
    case 1:
       if(dst_stride == width_size)
-	 memset(dst, (ubyte) value, height * width_size);
+         memset(dst, uc->ub, height * width_size);
       else {
-	 for (i = 0; i < height; i++) {
-	    memset(dst, (ubyte) value, width_size);
-	    dst += dst_stride;
-	 }
+         for (i = 0; i < height; i++) {
+            memset(dst, uc->ub, width_size);
+            dst += dst_stride;
+         }
       }
       break;
    case 2:
       for (i = 0; i < height; i++) {
-	 uint16_t *row = (uint16_t *)dst;
-	 for (j = 0; j < width; j++)
-	    *row++ = (uint16_t) value;
-	 dst += dst_stride;
+         uint16_t *row = (uint16_t *)dst;
+         for (j = 0; j < width; j++)
+            *row++ = uc->us;
+         dst += dst_stride;
       }
       break;
    case 4:
       for (i = 0; i < height; i++) {
-	 uint32_t *row = (uint32_t *)dst;
-	 for (j = 0; j < width; j++)
-	    *row++ = value;
-	 dst += dst_stride;
+         uint32_t *row = (uint32_t *)dst;
+         for (j = 0; j < width; j++)
+            *row++ = uc->ui;
+         dst += dst_stride;
       }
       break;
-   default:
-	 assert(0);
-	 break;
-   }
-}
-
-
-
-/**
- * Fallback function for pipe->surface_copy().
- * Note: (X,Y)=(0,0) is always the upper-left corner.
- * if do_flip, flip the image vertically on its way from src rect to dst rect.
- * XXX should probably put this in new u_surface.c file...
- */
-void
-util_surface_copy(struct pipe_context *pipe,
-                  boolean do_flip,
-                  struct pipe_surface *dst,
-                  unsigned dst_x, unsigned dst_y,
-                  struct pipe_surface *src,
-                  unsigned src_x, unsigned src_y, 
-                  unsigned w, unsigned h)
-{
-   struct pipe_transfer *src_trans, *dst_trans;
-   void *dst_map;
-   const void *src_map;
-   enum pipe_format src_format, dst_format;
-
-   assert(src->texture && dst->texture);
-   if (!src->texture || !dst->texture)
-      return;
-
-   src_format = src->texture->format;
-   dst_format = dst->texture->format;
-
-   src_trans = pipe_get_transfer(pipe,
-				 src->texture,
-				 src->face,
-				 src->level,
-				 src->zslice,
-				 PIPE_TRANSFER_READ,
-				 src_x, src_y, w, h);
-
-   dst_trans = pipe_get_transfer(pipe,
-				 dst->texture,
-				 dst->face,
-				 dst->level,
-				 dst->zslice,
-				 PIPE_TRANSFER_WRITE,
-				 dst_x, dst_y, w, h);
-
-   assert(util_format_get_blocksize(dst_format) == util_format_get_blocksize(src_format));
-   assert(util_format_get_blockwidth(dst_format) == util_format_get_blockwidth(src_format));
-   assert(util_format_get_blockheight(dst_format) == util_format_get_blockheight(src_format));
-
-   src_map = pipe->transfer_map(pipe, src_trans);
-   dst_map = pipe->transfer_map(pipe, dst_trans);
-
-   assert(src_map);
-   assert(dst_map);
-
-   if (src_map && dst_map) {
-      /* If do_flip, invert src_y position and pass negative src stride */
-      util_copy_rect(dst_map,
-                     dst_format,
-                     dst_trans->stride,
-                     0, 0,
-                     w, h,
-                     src_map,
-                     do_flip ? -(int) src_trans->stride : src_trans->stride,
-                     0,
-                     do_flip ? h - 1 : 0);
-   }
-
-   pipe->transfer_unmap(pipe, src_trans);
-   pipe->transfer_unmap(pipe, dst_trans);
-
-   pipe->transfer_destroy(pipe, src_trans);
-   pipe->transfer_destroy(pipe, dst_trans);
-}
-
-
-
-#define UBYTE_TO_USHORT(B) ((B) | ((B) << 8))
-
-
-/**
- * Fallback for pipe->surface_fill() function.
- * XXX should probably put this in new u_surface.c file...
- */
-void
-util_surface_fill(struct pipe_context *pipe,
-                  struct pipe_surface *dst,
-                  unsigned dstx, unsigned dsty,
-                  unsigned width, unsigned height, unsigned value)
-{
-   struct pipe_transfer *dst_trans;
-   void *dst_map;
-
-   assert(dst->texture);
-   if (!dst->texture)
-      return;
-   dst_trans = pipe_get_transfer(pipe,
-				 dst->texture,
-				 dst->face,
-				 dst->level,
-				 dst->zslice,
-				 PIPE_TRANSFER_WRITE,
-				 dstx, dsty, width, height);
-
-   dst_map = pipe->transfer_map(pipe, dst_trans);
-
-   assert(dst_map);
-
-   if (dst_map) {
-      assert(dst_trans->stride > 0);
-
-      switch (util_format_get_blocksize(dst->texture->format)) {
-      case 1:
-      case 2:
-      case 4:
-         util_fill_rect(dst_map, dst->texture->format,
-			dst_trans->stride,
-                        0, 0, width, height, value);
-         break;
-      case 8:
-      {
-	 /* expand the 4-byte clear value to an 8-byte value */
-	 ushort *row = (ushort *) dst_map;
-	 ushort val0 = UBYTE_TO_USHORT((value >>  0) & 0xff);
-	 ushort val1 = UBYTE_TO_USHORT((value >>  8) & 0xff);
-	 ushort val2 = UBYTE_TO_USHORT((value >> 16) & 0xff);
-	 ushort val3 = UBYTE_TO_USHORT((value >> 24) & 0xff);
-	 unsigned i, j;
-	 val0 = (val0 << 8) | val0;
-	 val1 = (val1 << 8) | val1;
-	 val2 = (val2 << 8) | val2;
-	 val3 = (val3 << 8) | val3;
-	 for (i = 0; i < height; i++) {
-	    for (j = 0; j < width; j++) {
-	       row[j*4+0] = val0;
-	       row[j*4+1] = val1;
-	       row[j*4+2] = val2;
-	       row[j*4+3] = val3;
-	    }
-	    row += dst_trans->stride/2;
-	 }
+   case 8:
+   case 12:
+   case 16:
+   case 24:
+   case 32:
+      for (i = 0; i < height; i++) {
+         ubyte *row = dst;
+         for (j = 0; j < width; j++) {
+            memcpy(row, uc, blocksize);
+            row += blocksize;
+         }
+         dst += dst_stride;
       }
       break;
-      default:
-         assert(0);
-         break;
-      }
+   default:
+      assert(0);
+      break;
    }
-
-   pipe->transfer_unmap(pipe, dst_trans);
-   pipe->transfer_destroy(pipe, dst_trans);
 }
diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h
index b44d821904b..4cb90d3c316 100644
--- a/src/gallium/auxiliary/util/u_rect.h
+++ b/src/gallium/auxiliary/util/u_rect.h
@@ -26,20 +26,67 @@
  **************************************************************************/
 
 
-/**
- * Pipe copy/fill rect helpers.
+#ifndef U_RECT_H
+#define U_RECT_H
+
+#include "pipe/p_compiler.h"
+
+struct u_rect {
+   int x0, x1;
+   int y0, y1;
+};
+
+/* Do two rectangles intersect?
  */
+static INLINE boolean
+u_rect_test_intersection(const struct u_rect *a,
+                         const struct u_rect *b)
+{
+   return (!(a->x1 < b->x0 ||
+             b->x1 < a->x0 ||
+             a->y1 < b->y0 ||
+             b->y1 < a->y0));
+}
 
+/* Find the intersection of two rectangles known to intersect.
+ */
+static INLINE void
+u_rect_find_intersection(const struct u_rect *a,
+                         struct u_rect *b)
+{
+   /* Caller should verify intersection exists before calling.
+    */
+   if (b->x0 < a->x0) b->x0 = a->x0;
+   if (b->x1 > a->x1) b->x1 = a->x1;
+   if (b->y0 < a->y0) b->y0 = a->y0;
+   if (b->y1 > a->y1) b->y1 = a->y1;
+}
 
-#ifndef U_RECT_H
-#define U_RECT_H
 
+static INLINE void
+u_rect_possible_intersection(const struct u_rect *a,
+                             struct u_rect *b)
+{
+   if (u_rect_test_intersection(a,b)) {
+      u_rect_find_intersection(a,b);
+   }
+   else {
+      b->x0 = b->x1 = b->y0 = b->y1 = 0;
+   }
+}
 
 #include "pipe/p_format.h"
+#include "util/u_pack_color.h"
+
 
-struct pipe_context;
-struct pipe_surface;
 
+/**********************************************************************
+ * Pipe copy/fill rect helpers.
+ */
+
+/* These really should move to a different file:
+ */
+#include "pipe/p_format.h"
 
 extern void
 util_copy_rect(ubyte * dst, enum pipe_format format,
@@ -50,23 +97,7 @@ util_copy_rect(ubyte * dst, enum pipe_format format,
 extern void
 util_fill_rect(ubyte * dst, enum pipe_format format,
                unsigned dst_stride, unsigned dst_x, unsigned dst_y,
-               unsigned width, unsigned height, uint32_t value);
-
-
-extern void
-util_surface_copy(struct pipe_context *pipe,
-                  boolean do_flip,
-                  struct pipe_surface *dst,
-                  unsigned dst_x, unsigned dst_y,
-                  struct pipe_surface *src,
-                  unsigned src_x, unsigned src_y, 
-                  unsigned w, unsigned h);
-
-extern void
-util_surface_fill(struct pipe_context *pipe,
-                  struct pipe_surface *dst,
-                  unsigned dstx, unsigned dsty,
-                  unsigned width, unsigned height, unsigned value);
+               unsigned width, unsigned height, union util_color *uc);
 
 
 #endif /* U_RECT_H */
diff --git a/src/gallium/auxiliary/util/u_simple_list.h b/src/gallium/auxiliary/util/u_simple_list.h
index f5f43b0faa2..fe59771371b 100644
--- a/src/gallium/auxiliary/util/u_simple_list.h
+++ b/src/gallium/auxiliary/util/u_simple_list.h
@@ -46,6 +46,8 @@
 do {						\
    (elem)->next->prev = (elem)->prev;		\
    (elem)->prev->next = (elem)->next;		\
+   (elem)->next = elem;                         \
+   (elem)->prev = elem;                         \
 } while (0)
 
 /**
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 019dda767d0..58ef68377fc 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -37,6 +37,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_state.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_debug.h"
 #include "tgsi/tgsi_ureg.h"
@@ -87,10 +88,15 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
  *  MOV OUT[0], IMM[0]                    // (if writemask != 0xf)
  *  TEX OUT[0].writemask, IN[0], SAMP[0], 2D;
  *  END;
+ *
+ * \param tex_target  one of PIPE_TEXTURE_x
+ * \param interp_mode  either TGSI_INTERPOLATE_LINEAR or PERSPECTIVE
+ * \param writemask  mask of TGSI_WRITEMASK_x
  */
 void *
 util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
                                         unsigned tex_target,
+                                        unsigned interp_mode,
                                         unsigned writemask )
 {
    struct ureg_program *ureg;
@@ -98,6 +104,9 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
    struct ureg_src tex;
    struct ureg_dst out;
 
+   assert(interp_mode == TGSI_INTERPOLATE_LINEAR ||
+          interp_mode == TGSI_INTERPOLATE_PERSPECTIVE);
+
    ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
    if (ureg == NULL)
       return NULL;
@@ -106,7 +115,7 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
 
    tex = ureg_DECL_fs_input( ureg, 
                              TGSI_SEMANTIC_GENERIC, 0, 
-                             TGSI_INTERPOLATE_PERSPECTIVE );
+                             interp_mode );
 
    out = ureg_DECL_output( ureg, 
                            TGSI_SEMANTIC_COLOR,
@@ -133,10 +142,12 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
  * \param tex_target  one of PIPE_TEXTURE_x
  */
 void *
-util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target )
+util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target,
+                              unsigned interp_mode)
 {
    return util_make_fragment_tex_shader_writemask( pipe,
                                                    tex_target,
+                                                   interp_mode,
                                                    TGSI_WRITEMASK_XYZW );
 }
 
@@ -147,7 +158,8 @@ util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target )
  */
 void *
 util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
-                                         unsigned tex_target)
+                                         unsigned tex_target,
+                                         unsigned interp_mode)
 {
    struct ureg_program *ureg;
    struct ureg_src sampler;
@@ -163,7 +175,7 @@ util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
 
    tex = ureg_DECL_fs_input( ureg,
                              TGSI_SEMANTIC_GENERIC, 0,
-                             TGSI_INTERPOLATE_PERSPECTIVE );
+                             interp_mode );
 
    out = ureg_DECL_output( ureg,
                            TGSI_SEMANTIC_COLOR,
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.h b/src/gallium/auxiliary/util/u_simple_shaders.h
index 6e760942e25..4aa34bc4757 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.h
+++ b/src/gallium/auxiliary/util/u_simple_shaders.h
@@ -52,15 +52,18 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
 extern void *
 util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, 
                                         unsigned tex_target,
+                                        unsigned interp_mode,
                                         unsigned writemask);
 
 extern void *
-util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target);
+util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target,
+                              unsigned interp_mode);
 
 
 extern void *
 util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
-                                         unsigned tex_target);
+                                         unsigned tex_target,
+                                         unsigned interp_mode);
 
 
 extern void *
diff --git a/src/gallium/auxiliary/util/u_split_prim.h b/src/gallium/auxiliary/util/u_split_prim.h
new file mode 100644
index 00000000000..7f80fc12700
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_split_prim.h
@@ -0,0 +1,114 @@
+/* Originally written by Ben Skeggs for the nv50 driver*/
+
+#ifndef U_SPLIT_PRIM_H
+#define U_SPLIT_PRIM_H
+
+#include "pipe/p_defines.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_debug.h"
+
+struct util_split_prim {
+   void *priv;
+   void (*emit)(void *priv, unsigned start, unsigned count);
+   void (*edge)(void *priv, boolean enabled);
+
+   unsigned mode;
+   unsigned start;
+   unsigned p_start;
+   unsigned p_end;
+
+   uint repeat_first:1;
+   uint close_first:1;
+   uint edgeflag_off:1;
+};
+
+static INLINE void
+util_split_prim_init(struct util_split_prim *s,
+                  unsigned mode, unsigned start, unsigned count)
+{
+   if (mode == PIPE_PRIM_LINE_LOOP) {
+      s->mode = PIPE_PRIM_LINE_STRIP;
+      s->close_first = 1;
+   } else {
+      s->mode = mode;
+      s->close_first = 0;
+   }
+   s->start = start;
+   s->p_start = start;
+   s->p_end = start + count;
+   s->edgeflag_off = 0;
+   s->repeat_first = 0;
+}
+
+static INLINE boolean
+util_split_prim_next(struct util_split_prim *s, unsigned max_verts)
+{
+   int repeat = 0;
+
+   if (s->repeat_first) {
+      s->emit(s->priv, s->start, 1);
+      max_verts--;
+      if (s->edgeflag_off) {
+         s->edge(s->priv, TRUE);
+         s->edgeflag_off = FALSE;
+      }
+   }
+
+   if ((s->p_end - s->p_start) + s->close_first <= max_verts) {
+      s->emit(s->priv, s->p_start, s->p_end - s->p_start);
+      if (s->close_first)
+         s->emit(s->priv, s->start, 1);
+      return TRUE;
+   }
+
+   switch (s->mode) {
+   case PIPE_PRIM_LINES:
+      max_verts &= ~1;
+      break;
+   case PIPE_PRIM_LINE_STRIP:
+      repeat = 1;
+      break;
+   case PIPE_PRIM_POLYGON:
+      max_verts--;
+      s->emit(s->priv, s->p_start, max_verts);
+      s->edge(s->priv, FALSE);
+      s->emit(s->priv, s->p_start + max_verts, 1);
+      s->p_start += max_verts;
+      s->repeat_first = TRUE;
+      s->edgeflag_off = TRUE;
+      return FALSE;
+   case PIPE_PRIM_TRIANGLES:
+      max_verts = max_verts - (max_verts % 3);
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      /* to ensure winding stays correct, always split
+       * on an even number of generated triangles
+       */
+      max_verts = max_verts & ~1;
+      repeat = 2;
+      break;
+   case PIPE_PRIM_TRIANGLE_FAN:
+      s->repeat_first = TRUE;
+      repeat = 1;
+      break;
+   case PIPE_PRIM_QUADS:
+      max_verts &= ~3;
+      break;
+   case PIPE_PRIM_QUAD_STRIP:
+      max_verts &= ~1;
+      repeat = 2;
+      break;
+   case PIPE_PRIM_POINTS:
+      break;
+   default:
+      /* TODO: implement adjacency primitives */
+      assert(0);
+   }
+
+   s->emit (s->priv, s->p_start, max_verts);
+   s->p_start += (max_verts - repeat);
+   return FALSE;
+}
+
+#endif /* U_SPLIT_PRIM_H */
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index e2a8491e62c..03198c91da4 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -41,7 +41,6 @@
 
 #if defined(PIPE_ARCH_SSE)
 
-#include <xmmintrin.h>
 #include <emmintrin.h>
 
 
@@ -72,6 +71,33 @@ _mm_castps_si128(__m128 a)
 
 #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
 
+
+#if defined(PIPE_ARCH_SSSE3)
+
+#include <tmmintrin.h>
+
+#else /* !PIPE_ARCH_SSSE3 */
+
+/**
+ * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases
+ * where -mssse3 is not supported/enabled.
+ *
+ * MSVC will never get in here as its intrinsics support do not rely on
+ * compiler command line options.
+ */
+static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi8(__m128i a, __m128i mask)
+{
+    __m128i result;
+    __asm__("pshufb %1, %0"
+            : "=x" (result)
+            : "xm" (mask), "0" (a));
+    return result;
+}
+
+#endif /* !PIPE_ARCH_SSSE3 */
+
+
 #endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
 
 #endif /* U_SSE_H_ */
diff --git a/src/gallium/auxiliary/util/u_staging.c b/src/gallium/auxiliary/util/u_staging.c
new file mode 100644
index 00000000000..c5d68f8df86
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_staging.c
@@ -0,0 +1,117 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_staging.h"
+#include "pipe/p_context.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+
+static void
+util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigned height, unsigned depth, struct pipe_resource *template)
+{
+   memset(template, 0, sizeof(struct pipe_resource));
+   if(pt->target != PIPE_BUFFER && depth <= 1)
+      template->target = PIPE_TEXTURE_RECT;
+   else
+      template->target = pt->target;
+   template->format = pt->format;
+   template->width0 = width;
+   template->height0 = height;
+   template->depth0 = depth;
+   template->last_level = 0;
+   template->nr_samples = pt->nr_samples;
+   template->bind = 0;
+   template->usage = PIPE_USAGE_STAGING;
+   template->flags = 0;
+}
+
+struct util_staging_transfer *
+util_staging_transfer_init(struct pipe_context *pipe,
+           struct pipe_resource *pt,
+           struct pipe_subresource sr,
+           unsigned usage,
+           const struct pipe_box *box,
+           bool direct, struct util_staging_transfer *tx)
+{
+   struct pipe_screen *pscreen = pipe->screen;
+
+   struct pipe_resource staging_resource_template;
+
+   pipe_resource_reference(&tx->base.resource, pt);
+   tx->base.sr = sr;
+   tx->base.usage = usage;
+   tx->base.box = *box;
+
+   if (direct)
+   {
+      tx->staging_resource = pt;
+      return tx;
+   }
+
+   util_staging_resource_template(pt, box->width, box->height, box->depth, &staging_resource_template);
+   tx->staging_resource = pscreen->resource_create(pscreen, &staging_resource_template);
+   if (!tx->staging_resource)
+   {
+      pipe_resource_reference(&tx->base.resource, NULL);
+      FREE(tx);
+      return NULL;
+   }
+
+   if (usage & PIPE_TRANSFER_READ)
+   {
+      struct pipe_subresource dstsr;
+      unsigned zi;
+      dstsr.face = 0;
+      dstsr.level = 0;
+      for(zi = 0; zi < box->depth; ++zi)
+         pipe->resource_copy_region(pipe, tx->staging_resource, dstsr, 0, 0, 0, tx->base.resource, sr, box->x, box->y, box->z + zi, box->width, box->height);
+   }
+
+   return tx;
+}
+
+void
+util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx)
+{
+   struct util_staging_transfer *tx = (struct util_staging_transfer *)ptx;
+
+   if (tx->staging_resource != tx->base.resource)
+   {
+      if(tx->base.usage & PIPE_TRANSFER_WRITE) {
+         struct pipe_subresource srcsr;
+         unsigned zi;
+         srcsr.face = 0;
+         srcsr.level = 0;
+         for(zi = 0; zi < tx->base.box.depth; ++zi)
+            pipe->resource_copy_region(pipe, tx->base.resource, tx->base.sr, tx->base.box.x, tx->base.box.y, tx->base.box.z + zi, tx->staging_resource, srcsr, 0, 0, 0, tx->base.box.width, tx->base.box.height);
+      }
+
+      pipe_resource_reference(&tx->staging_resource, NULL);
+   }
+
+   pipe_resource_reference(&ptx->resource, NULL);
+   FREE(ptx);
+}
diff --git a/src/gallium/auxiliary/util/u_staging.h b/src/gallium/auxiliary/util/u_staging.h
new file mode 100644
index 00000000000..1aab78cc881
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_staging.h
@@ -0,0 +1,63 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/* Direct3D 10/11 has no concept of transfers. Applications instead
+ * create resources with a STAGING or DYNAMIC usage, copy between them
+ * and the real resource and use Map to map the STAGING/DYNAMIC resource.
+ *
+ * This util module allows to implement Gallium drivers as a Direct3D
+ * driver would be implemented: transfers allocate a resource with
+ * PIPE_USAGE_STAGING, and copy the data between it and the real resource
+ * with resource_copy_region.
+ */
+
+#ifndef U_STAGING_H
+#define U_STAGING_H
+
+#include "pipe/p_state.h"
+
+struct util_staging_transfer {
+   struct pipe_transfer base;
+
+   /* if direct, same as base.resource, otherwise the temporary staging resource */
+   struct pipe_resource *staging_resource;
+};
+
+/* user must set stride, slice_stride and offset */
+/* pt->usage == PIPE_USAGE_DYNAMIC || pt->usage == PIPE_USAGE_STAGING should be a good value to pass for direct */
+/* staging resource is currently created with PIPE_USAGE_STAGING */
+struct util_staging_transfer *
+util_staging_transfer_init(struct pipe_context *pipe,
+           struct pipe_resource *pt,
+           struct pipe_subresource sr,
+           unsigned usage,
+           const struct pipe_box *box,
+           bool direct, struct util_staging_transfer *tx);
+
+void
+util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx);
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c
index 42440d0d673..f78b6838a72 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -32,13 +32,15 @@
  */
 
 
+#include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h"
-#include "pipe/p_defines.h"
-#include "util/u_inlines.h"
 
-#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_rect.h"
 #include "util/u_surface.h"
+#include "util/u_pack_color.h"
 
 
 /**
@@ -68,7 +70,7 @@ util_create_rgba_surface(struct pipe_screen *screen,
    /* Choose surface format */
    for (i = 0; rgbaFormats[i]; i++) {
       if (screen->is_format_supported(screen, rgbaFormats[i],
-                                      target, bind, 0)) {
+                                      target, 0, bind, 0)) {
          format = rgbaFormats[i];
          break;
       }
@@ -118,70 +120,231 @@ util_destroy_rgba_surface(struct pipe_resource *texture,
 
 
 /**
- * Compare pipe_framebuffer_state objects.
- * \return TRUE if same, FALSE if different
+ * Fallback function for pipe->resource_copy_region(): maps both subresources
+ * and copies a w x h rect. Note: (X,Y)=(0,0) is always the upper-left corner.
  */
-boolean
-util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
-                             const struct pipe_framebuffer_state *src)
+void
+util_resource_copy_region(struct pipe_context *pipe,
+                          struct pipe_resource *dst,
+                          struct pipe_subresource subdst,
+                          unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                          struct pipe_resource *src,
+                          struct pipe_subresource subsrc,
+                          unsigned src_x, unsigned src_y, unsigned src_z,
+                          unsigned w, unsigned h)
 {
-   unsigned i;
+   struct pipe_transfer *src_trans, *dst_trans;
+   void *dst_map;
+   const void *src_map;
+   enum pipe_format src_format, dst_format;
+
+   assert(src && dst);
+   if (!src || !dst)
+      return;
+
+   src_format = src->format;
+   dst_format = dst->format;
+
+   src_trans = pipe_get_transfer(pipe,
+				 src,
+				 subsrc.face,
+				 subsrc.level,
+				 src_z,
+				 PIPE_TRANSFER_READ,
+				 src_x, src_y, w, h);
+
+   dst_trans = pipe_get_transfer(pipe,
+				 dst,
+				 subdst.face,
+				 subdst.level,
+				 dst_z,
+				 PIPE_TRANSFER_WRITE,
+				 dst_x, dst_y, w, h);
+
+   assert(util_format_get_blocksize(dst_format) == util_format_get_blocksize(src_format));
+   assert(util_format_get_blockwidth(dst_format) == util_format_get_blockwidth(src_format));
+   assert(util_format_get_blockheight(dst_format) == util_format_get_blockheight(src_format));
+
+   src_map = pipe->transfer_map(pipe, src_trans);
+   dst_map = pipe->transfer_map(pipe, dst_trans);
+
+   assert(src_map);
+   assert(dst_map);
+
+   if (src_map && dst_map) {
+      util_copy_rect(dst_map,
+                     dst_format,
+                     dst_trans->stride,
+                     0, 0,
+                     w, h,
+                     src_map,
+                     src_trans->stride,
+                     0,
+                     0);
+   }
 
-   if (dst->width != src->width ||
-       dst->height != src->height)
-      return FALSE;
+   pipe->transfer_unmap(pipe, src_trans);
+   pipe->transfer_unmap(pipe, dst_trans);
 
-   for (i = 0; i < Elements(src->cbufs); i++) {
-      if (dst->cbufs[i] != src->cbufs[i]) {
-         return FALSE;
-      }
-   }
+   pipe->transfer_destroy(pipe, src_trans);
+   pipe->transfer_destroy(pipe, dst_trans);
+}
 
-   if (dst->nr_cbufs != src->nr_cbufs) {
-      return FALSE;
-   }
 
-   if (dst->zsbuf != src->zsbuf) {
-      return FALSE;
-   }
 
-   return TRUE;
-}
+#define UBYTE_TO_USHORT(B) ((B) | ((B) << 8))
 
 
 /**
- * Copy framebuffer state from src to dst, updating refcounts.
+ * Fallback for pipe->clear_render_target() function.
+ * XXX this looks too hackish to be really useful.
+ * cpp > 4 looks like a gross hack at best...
+ * Plus can't use these transfer fallbacks when clearing
+ * multisampled surfaces for instance.
  */
 void
-util_copy_framebuffer_state(struct pipe_framebuffer_state *dst,
-                            const struct pipe_framebuffer_state *src)
+util_clear_render_target(struct pipe_context *pipe,
+                         struct pipe_surface *dst,
+                         const float *rgba,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height)
 {
-   unsigned i;
+   struct pipe_transfer *dst_trans;
+   void *dst_map;
+   union util_color uc;
 
-   dst->width = src->width;
-   dst->height = src->height;
+   assert(dst->texture);
+   if (!dst->texture)
+      return;
 
-   for (i = 0; i < Elements(src->cbufs); i++) {
-      pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]);
-   }
+   dst_trans = pipe_get_transfer(pipe,
+				 dst->texture,
+				 dst->face,
+				 dst->level,
+				 dst->zslice,
+				 PIPE_TRANSFER_WRITE,
+				 dstx, dsty, width, height);
 
-   dst->nr_cbufs = src->nr_cbufs;
+   dst_map = pipe->transfer_map(pipe, dst_trans);
 
-   pipe_surface_reference(&dst->zsbuf, src->zsbuf);
-}
+   assert(dst_map);
+
+   if (dst_map) {
+      assert(dst_trans->stride > 0);
 
+      util_pack_color(rgba, dst->texture->format, &uc);
+      util_fill_rect(dst_map, dst->texture->format,
+                     dst_trans->stride,
+                     0, 0, width, height, &uc);
+   }
+
+   pipe->transfer_unmap(pipe, dst_trans);
+   pipe->transfer_destroy(pipe, dst_trans);
+}
 
+/**
+ * Fallback for pipe->clear_depth_stencil() function.
+ * sw fallback doesn't look terribly useful here.
+ * Plus can't use these transfer fallbacks when clearing
+ * multisampled surfaces for instance.
+ */
 void
-util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb)
+util_clear_depth_stencil(struct pipe_context *pipe,
+                         struct pipe_surface *dst,
+                         unsigned clear_flags,
+                         double depth,
+                         unsigned stencil,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height)
 {
-   unsigned i;
-
-   for (i = 0; i < fb->nr_cbufs; i++) {
-      pipe_surface_reference(&fb->cbufs[i], NULL);
+   struct pipe_transfer *dst_trans;
+   ubyte *dst_map;
+   boolean need_rmw = FALSE; /* TRUE when only one of depth/stencil is cleared in a combined format */
+
+   if ((clear_flags & PIPE_CLEAR_DEPTHSTENCIL) &&
+       ((clear_flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) &&
+       util_format_is_depth_and_stencil(dst->format))
+      need_rmw = TRUE;
+
+   assert(dst->texture);
+   if (!dst->texture)
+      return;
+   dst_trans = pipe_get_transfer(pipe,
+                                 dst->texture,
+                                 dst->face,
+                                 dst->level,
+                                 dst->zslice,
+                                 (need_rmw ? PIPE_TRANSFER_READ_WRITE :
+                                     PIPE_TRANSFER_WRITE),
+                                 dstx, dsty, width, height);
+
+   dst_map = pipe->transfer_map(pipe, dst_trans);
+
+   assert(dst_map);
+
+   if (dst_map) {
+      unsigned dst_stride = dst_trans->stride;
+      unsigned zstencil = util_pack_z_stencil(dst->texture->format, depth, stencil);
+      unsigned i, j;
+      assert(dst_trans->stride > 0);
+
+      switch (util_format_get_blocksize(dst->format)) {
+      case 1:
+         assert(dst->format == PIPE_FORMAT_S8_USCALED);
+         if(dst_stride == width)
+            memset(dst_map, (ubyte) zstencil, height * width);
+         else {
+            for (i = 0; i < height; i++) {
+               memset(dst_map, (ubyte) zstencil, width);
+               dst_map += dst_stride;
+            }
+         }
+         break;
+      case 2:
+         assert(dst->format == PIPE_FORMAT_Z16_UNORM);
+         for (i = 0; i < height; i++) {
+            uint16_t *row = (uint16_t *)dst_map;
+            for (j = 0; j < width; j++)
+               *row++ = (uint16_t) zstencil;
+            dst_map += dst_stride;
+         }
+         break;
+      case 4:
+         if (!need_rmw) {
+            for (i = 0; i < height; i++) {
+               uint32_t *row = (uint32_t *)dst_map;
+               for (j = 0; j < width; j++)
+                  *row++ = zstencil;
+               dst_map += dst_stride;
+            }
+         }
+         else {
+            uint32_t dst_mask; /* bits of the existing pixel that are preserved */
+            if (dst->format == PIPE_FORMAT_Z24_UNORM_S8_USCALED)
+               dst_mask = 0x00ffffff; /* depth bits: Z24 is in the low 24 bits */
+            else {
+               assert(dst->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM);
+               dst_mask = 0xffffff00; /* depth bits: Z24 is in the high 24 bits */
+            }
+            if (clear_flags & PIPE_CLEAR_DEPTH)
+               dst_mask = ~dst_mask; /* clearing depth: keep the stencil bits instead */
+            for (i = 0; i < height; i++) {
+               uint32_t *row = (uint32_t *)dst_map;
+               for (j = 0; j < width; j++) {
+                  uint32_t tmp = *row & dst_mask;
+                  *row++ = tmp | (zstencil & ~dst_mask);
+               }
+               dst_map += dst_stride;
+            }
+         }
+         break;
+      case 8:
+      default:
+         assert(0);
+         break;
+      }
    }
 
-   pipe_surface_reference(&fb->zsbuf, NULL);
-
-   fb->width = fb->height = 0;
-   fb->nr_cbufs = 0;
+   pipe->transfer_unmap(pipe, dst_trans);
+   pipe->transfer_destroy(pipe, dst_trans);
 }
diff --git a/src/gallium/auxiliary/util/u_surface.h b/src/gallium/auxiliary/util/u_surface.h
index 119fcd4ce8e..6cd12af3a8b 100644
--- a/src/gallium/auxiliary/util/u_surface.h
+++ b/src/gallium/auxiliary/util/u_surface.h
@@ -33,23 +33,6 @@
 #include "pipe/p_state.h"
 
 
-/**
- * Are s1 and s2 the same surface?
- * Surfaces are basically views into textures so check if the two surfaces
- * name the same part of the same texture.
- */
-static INLINE boolean
-util_same_surface(const struct pipe_surface *s1, const struct pipe_surface *s2)
-{
-   return (s1->texture == s2->texture &&
-           s1->face == s2->face &&
-           s1->level == s2->level &&
-           s1->zslice == s2->zslice);
-}
-
-
-
-
 extern boolean
 util_create_rgba_surface(struct pipe_screen *screen,
                          uint width, uint height, uint bind,
@@ -62,17 +45,32 @@ util_destroy_rgba_surface(struct pipe_resource *texture,
                           struct pipe_surface *surface);
 
 
-extern boolean
-util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
-                             const struct pipe_framebuffer_state *src);
 
 extern void
-util_copy_framebuffer_state(struct pipe_framebuffer_state *dst,
-                            const struct pipe_framebuffer_state *src);
+util_resource_copy_region(struct pipe_context *pipe,
+                          struct pipe_resource *dst,
+                          struct pipe_subresource subdst,
+                          unsigned dst_x, unsigned dst_y, unsigned dst_z,
+                          struct pipe_resource *src,
+                          struct pipe_subresource subsrc,
+                          unsigned src_x, unsigned src_y, unsigned src_z,
+                          unsigned w, unsigned h);
 
+extern void
+util_clear_render_target(struct pipe_context *pipe,
+                         struct pipe_surface *dst,
+                         const float *rgba,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height);
 
 extern void
-util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb);
+util_clear_depth_stencil(struct pipe_context *pipe,
+                         struct pipe_surface *dst,
+                         unsigned clear_flags,
+                         double depth,
+                         unsigned stencil,
+                         unsigned dstx, unsigned dsty,
+                         unsigned width, unsigned height);
 
 
 #endif /* U_SURFACE_H */
diff --git a/src/gallium/auxiliary/util/u_surfaces.c b/src/gallium/auxiliary/util/u_surfaces.c
index 668da8c5c27..404e1219952 100644
--- a/src/gallium/auxiliary/util/u_surfaces.c
+++ b/src/gallium/auxiliary/util/u_surfaces.c
@@ -1,42 +1,50 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #include "u_surfaces.h"
 #include "util/u_hash_table.h"
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 
-/* TODO: ouch, util_hash_table should do these by default when passed a null function pointer
- * this indirect function call is quite bad
- */
-static unsigned
-hash(void *key)
-{
-   return (unsigned)(uintptr_t)key;
-}
-
-static int
-compare(void *key1, void *key2)
-{
-   return (unsigned)(uintptr_t)key1 - (unsigned)(uintptr_t)key2;
-}
-
 struct pipe_surface *
 util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags)
 {
    struct pipe_surface *ps;
-   void *key = NULL;
 
    if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
-   {	/* or 2D array */
-      if(!us->u.table)
-	 us->u.table = util_hash_table_create(hash, compare);
-      key = (void *)(((zslice + face) << 8) | level);
-      /* TODO: ouch, should have a get-reference function...
-       * also, shouldn't allocate a two-pointer structure for each item... */
-      ps = util_hash_table_get(us->u.table, key);
+   {    /* or 2D array */
+      if(!us->u.hash)
+         us->u.hash = cso_hash_create();
+
+      ps = cso_hash_iter_data(cso_hash_find(us->u.hash, ((zslice + face) << 8) | level));
    }
    else
    {
       if(!us->u.array)
-	 us->u.array = CALLOC(pt->last_level + 1, sizeof(struct pipe_surface *));
+         us->u.array = CALLOC(pt->last_level + 1, sizeof(struct pipe_surface *));
       ps = us->u.array[level];
    }
 
@@ -54,7 +62,7 @@ util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, str
    ps->offset = ~0;
 
    if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
-      util_hash_table_set(us->u.table, key, ps);
+      cso_hash_insert(us->u.hash, ((zslice + face) << 8) | level, ps);
    else
       us->u.array[level] = ps;
 
@@ -66,47 +74,44 @@ util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps)
 {
    struct pipe_resource *pt = ps->texture;
    if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
-   {	/* or 2D array */
-      void* key = (void*)(uintptr_t)(((ps->zslice + ps->face) << 8) | ps->level);
-      util_hash_table_remove(us->u.table, key);
+   {    /* or 2D array */
+      cso_hash_erase(us->u.hash, cso_hash_find(us->u.hash, ((ps->zslice + ps->face) << 8) | ps->level));
    }
    else
       us->u.array[ps->level] = 0;
 }
 
-static enum pipe_error
-util_surfaces_destroy_callback(void *key, void *value, void *data)
-{
-   void (*destroy_surface) (struct pipe_surface * ps) = data;
-   destroy_surface((struct pipe_surface *)value);
-   return PIPE_OK;
-}
-
 void
 util_surfaces_destroy(struct util_surfaces *us, struct pipe_resource *pt, void (*destroy_surface) (struct pipe_surface *))
 {
    if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
-   {	/* or 2D array */
-      if(us->u.table)
+   {    /* or 2D array */
+      if(us->u.hash)
       {
-	 util_hash_table_foreach(us->u.table, util_surfaces_destroy_callback, destroy_surface);
-	 util_hash_table_destroy(us->u.table);
-	 us->u.table = NULL;
+         struct cso_hash_iter iter;
+         iter = cso_hash_first_node(us->u.hash);
+         while (!cso_hash_iter_is_null(iter)) {
+            destroy_surface(cso_hash_iter_data(iter));
+            iter = cso_hash_iter_next(iter);
+         }
+
+         cso_hash_delete(us->u.hash);
+         us->u.hash = NULL;
       }
    }
    else
    {
       if(us->u.array)
       {
-	 unsigned i;
-	 for(i = 0; i < pt->last_level; ++i)
-	 {
-	    struct pipe_surface *ps = us->u.array[i];
-	    if(ps)
-	       destroy_surface(ps);
-	 }
-	 FREE(us->u.array);
-	 us->u.array = NULL;
+         unsigned i;
+         for(i = 0; i <= pt->last_level; ++i)
+         {
+            struct pipe_surface *ps = us->u.array[i];
+            if(ps)
+               destroy_surface(ps);
+         }
+         FREE(us->u.array);
+         us->u.array = NULL;
       }
    }
 }
diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h
index 0195bf5afba..17d8a5d3a5b 100644
--- a/src/gallium/auxiliary/util/u_surfaces.h
+++ b/src/gallium/auxiliary/util/u_surfaces.h
@@ -1,18 +1,44 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
 #ifndef U_SURFACES_H_
 #define U_SURFACES_H_
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 #include "util/u_atomic.h"
-
-struct util_hash_table;
+#include "cso_cache/cso_hash.h"
 
 struct util_surfaces
 {
    union
    {
-      struct util_hash_table *table;
+      struct cso_hash *hash;
       struct pipe_surface **array;
+      void* pv;
    } u;
 };
 
@@ -22,7 +48,7 @@ struct pipe_surface *util_surfaces_do_get(struct util_surfaces *us, unsigned sur
 static INLINE struct pipe_surface *
 util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags)
 {
-   if(likely(pt->target == PIPE_TEXTURE_2D && us->u.array))
+   if(likely((pt->target == PIPE_TEXTURE_2D || pt->target == PIPE_TEXTURE_RECT) && us->u.array))
    {
       struct pipe_surface *ps = us->u.array[level];
       if(ps)
@@ -35,12 +61,24 @@ util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct
    return util_surfaces_do_get(us, surface_struct_size, pscreen, pt, face, level, zslice, flags);
 }
 
+static INLINE struct pipe_surface *
+util_surfaces_peek(struct util_surfaces *us, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice)
+{
+   if(!us->u.pv)
+      return 0;
+
+   if(unlikely(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE))
+      return cso_hash_iter_data(cso_hash_find(us->u.hash, ((zslice + face) << 8) | level));
+   else
+      return us->u.array[level];
+}
+
 void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps);
 
 static INLINE void
 util_surfaces_detach(struct util_surfaces *us, struct pipe_surface *ps)
 {
-   if(likely(ps->texture->target == PIPE_TEXTURE_2D))
+   if(likely(ps->texture->target == PIPE_TEXTURE_2D || ps->texture->target == PIPE_TEXTURE_RECT))
    {
       us->u.array[ps->level] = 0;
       return;
diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h
index 986eee07435..558351d0ce5 100644
--- a/src/gallium/auxiliary/util/u_tile.h
+++ b/src/gallium/auxiliary/util/u_tile.h
@@ -29,7 +29,10 @@
 #define P_TILE_H
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
 
+struct pipe_context;
 struct pipe_transfer;
 
 /**
diff --git a/src/gallium/auxiliary/util/u_transfer.c b/src/gallium/auxiliary/util/u_transfer.c
index bedace3b1dc..69f6fab9504 100644
--- a/src/gallium/auxiliary/util/u_transfer.c
+++ b/src/gallium/auxiliary/util/u_transfer.c
@@ -35,12 +35,12 @@ void u_default_transfer_inline_write( struct pipe_context *pipe,
    
    util_copy_rect(map,
 		  resource->format,
-		  transfer->stride, /* bytes? */
+		  transfer->stride, /* bytes */
 		  0, 0,
 		  box->width,
 		  box->height,
 		  data,
-		  box->width,	/* bytes? texels? */
+		  stride,       /* bytes */
 		  0, 0);
 
 out:
diff --git a/src/gallium/auxiliary/util/u_transfer.h b/src/gallium/auxiliary/util/u_transfer.h
index eb07945d15f..e3a38730f21 100644
--- a/src/gallium/auxiliary/util/u_transfer.h
+++ b/src/gallium/auxiliary/util/u_transfer.h
@@ -8,6 +8,7 @@
 #include "pipe/p_state.h"
 
 struct pipe_context;
+struct winsys_handle;
 
 boolean u_default_resource_get_handle(struct pipe_screen *screen,
 				      struct pipe_resource *resource,
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 75d44432d9e..af229e61a00 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -59,6 +59,8 @@ struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
                                       unsigned usage )
 {
    struct u_upload_mgr *upload = CALLOC_STRUCT( u_upload_mgr );
+   if (!upload)
+      return NULL;
 
    upload->pipe = pipe;
    upload->default_size = default_size;
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index a124924fc80..de016df02e0 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -32,11 +32,8 @@
 #ifndef U_UPLOAD_MGR_H
 #define U_UPLOAD_MGR_H
 
-#include "pipe/p_defines.h"
-
-struct pipe_screen;
+struct pipe_context;
 struct pipe_resource;
-struct u_upload_mgr;
 
 
 struct u_upload_mgr *u_upload_create( struct pipe_context *pipe,
author	Thomas Balling Sørensen <tball@tball-laptop.(none)>	2010-10-05 12:04:08 +0200
committer	Thomas Balling Sørensen <tball@tball-laptop.(none)>	2010-10-05 12:04:08 +0200
commit	1218430e1200a08cd64b6555d3fd1fd0274ad9e5 (patch)
tree	e060fb27b8388a4bd237ca39fc20f1675c5e367c /src/gallium/auxiliary
parent	63b1525cf0a50e3d31328c3b56355a86056e4c05 (diff)
parent	bf21b7006c63c3dc47045c22d4f372dfe6c7ce67 (diff)