Merge remote branch 'origin/master' into pipe-video

Conflicts: configure.ac src/gallium/auxiliary/vl/Makefile src/gallium/auxiliary/vl/SConscript src/gallium/auxiliary/vl/vl_compositor.c src/gallium/auxiliary/vl/vl_compositor.h src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h src/gallium/drivers/nouveau/nouveau_winsys.h src/gallium/drivers/softpipe/sp_video_context.c src/gallium/include/pipe/p_video_state.h src/gallium/include/state_tracker/drm_api.h src/gallium/state_trackers/xorg/xvmc/surface.c src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.c src/gallium/winsys/drm/nouveau/drm/nouveau_drm_api.h src/gallium/winsys/drm/radeon/core/radeon_drm.c src/gallium/winsys/g3dvl/nouveau/Makefile
author: Younes Manton <[email protected]> 2010-03-07 12:47:45 -0500
committer: Younes Manton <[email protected]> 2010-03-12 01:37:49 -0500
commit: a8238bb08a95e7ea4430450c304a6bee210df1a6 (patch)
tree: 00f4e852473dc1d6a86aece436f3e5bf89d029c7 /src/gallium/auxiliary
parent: 80468464897682b8e10aeab310f20fdd7ddc6cb4 (diff)
parent: 45df4bad9fc0379f05197bee10c03fd351f24094 (diff)
247 files changed, 25341 insertions, 11955 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 5446eb68a98..ac9d50af7a9 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -1,12 +1,198 @@
-# src/gallium/auxiliary/Makefile
 TOP = ../../..
 include $(TOP)/configs/current
 
-SUBDIRS = $(GALLIUM_AUXILIARY_DIRS)
+LIBNAME = gallium
+
+C_SOURCES = \
+	cso_cache/cso_context.c \
+	cso_cache/cso_cache.c \
+	cso_cache/cso_hash.c \
+	draw/draw_context.c \
+	draw/draw_gs.c \
+	draw/draw_pipe.c \
+	draw/draw_pipe_aaline.c \
+	draw/draw_pipe_aapoint.c \
+	draw/draw_pipe_clip.c \
+	draw/draw_pipe_cull.c \
+	draw/draw_pipe_flatshade.c \
+	draw/draw_pipe_offset.c \
+	draw/draw_pipe_pstipple.c \
+	draw/draw_pipe_stipple.c \
+	draw/draw_pipe_twoside.c \
+	draw/draw_pipe_unfilled.c \
+	draw/draw_pipe_util.c \
+	draw/draw_pipe_validate.c \
+	draw/draw_pipe_vbuf.c \
+	draw/draw_pipe_wide_line.c \
+	draw/draw_pipe_wide_point.c \
+	draw/draw_pt.c \
+	draw/draw_pt_elts.c \
+	draw/draw_pt_emit.c \
+	draw/draw_pt_fetch.c \
+	draw/draw_pt_fetch_emit.c \
+	draw/draw_pt_fetch_shade_emit.c \
+	draw/draw_pt_fetch_shade_pipeline.c \
+	draw/draw_pt_post_vs.c \
+	draw/draw_pt_util.c \
+	draw/draw_pt_varray.c \
+	draw/draw_pt_vcache.c \
+	draw/draw_vertex.c \
+	draw/draw_vs.c \
+	draw/draw_vs_varient.c \
+	draw/draw_vs_aos.c \
+	draw/draw_vs_aos_io.c \
+	draw/draw_vs_aos_machine.c \
+	draw/draw_vs_exec.c \
+	draw/draw_vs_llvm.c \
+	draw/draw_vs_ppc.c \
+	draw/draw_vs_sse.c \
+	indices/u_indices_gen.c \
+	indices/u_unfilled_gen.c \
+	os/os_misc.c \
+	os/os_stream_log.c \
+	os/os_stream_stdc.c \
+	os/os_stream_str.c \
+	os/os_stream_null.c \
+	os/os_time.c \
+	pipebuffer/pb_buffer_fenced.c \
+	pipebuffer/pb_buffer_malloc.c \
+	pipebuffer/pb_bufmgr_alt.c \
+	pipebuffer/pb_bufmgr_cache.c \
+	pipebuffer/pb_bufmgr_debug.c \
+	pipebuffer/pb_bufmgr_mm.c \
+	pipebuffer/pb_bufmgr_ondemand.c \
+	pipebuffer/pb_bufmgr_pool.c \
+	pipebuffer/pb_bufmgr_slab.c \
+	pipebuffer/pb_validate.c \
+	rbug/rbug_connection.c \
+	rbug/rbug_core.c \
+	rbug/rbug_texture.c \
+	rbug/rbug_context.c \
+	rbug/rbug_shader.c \
+	rbug/rbug_demarshal.c \
+	rtasm/rtasm_cpu.c \
+	rtasm/rtasm_execmem.c \
+	rtasm/rtasm_x86sse.c \
+	rtasm/rtasm_ppc.c \
+	rtasm/rtasm_ppc_spe.c \
+	tgsi/tgsi_sanity.c \
+	tgsi/tgsi_build.c \
+	tgsi/tgsi_dump.c \
+	tgsi/tgsi_exec.c \
+	tgsi/tgsi_info.c \
+	tgsi/tgsi_iterate.c \
+	tgsi/tgsi_parse.c \
+	tgsi/tgsi_ppc.c \
+	tgsi/tgsi_scan.c \
+	tgsi/tgsi_sse2.c \
+	tgsi/tgsi_text.c \
+	tgsi/tgsi_transform.c \
+	tgsi/tgsi_ureg.c \
+	tgsi/tgsi_util.c \
+	translate/translate_generic.c \
+	translate/translate_sse.c \
+	translate/translate.c \
+	translate/translate_cache.c \
+	util/u_debug.c \
+	util/u_debug_symbol.c \
+	util/u_debug_stack.c \
+	util/u_dump_defines.c \
+	util/u_dump_state.c \
+	util/u_bitmask.c \
+	util/u_blit.c \
+	util/u_blitter.c \
+	util/u_cache.c \
+	util/u_cpu_detect.c \
+	util/u_dl.c \
+	util/u_draw_quad.c \
+	util/u_format_access.c \
+	util/u_format_table.c \
+	util/u_gen_mipmap.c \
+	util/u_handle_table.c \
+	util/u_hash_table.c \
+	util/u_hash.c \
+	util/u_keymap.c \
+	util/u_linear.c \
+	util/u_network.c \
+	util/u_math.c \
+	util/u_mm.c \
+	util/u_rect.c \
+	util/u_ringbuffer.c \
+	util/u_simple_shaders.c \
+	util/u_snprintf.c \
+	util/u_surface.c \
+	util/u_texture.c \
+	util/u_tile.c \
+	util/u_timed_winsys.c \
+	util/u_upload_mgr.c \
+	util/u_simple_screen.c \
+	vl/vl_bitstream_parser.c \
+	vl/vl_mpeg12_mc_renderer.c \
+	vl/vl_compositor.c \
+	vl/vl_csc.c
+
+GALLIVM_SOURCES = \
+        gallivm/lp_bld_alpha.c \
+        gallivm/lp_bld_arit.c \
+        gallivm/lp_bld_blend_aos.c \
+        gallivm/lp_bld_blend_logicop.c \
+        gallivm/lp_bld_blend_soa.c \
+        gallivm/lp_bld_const.c \
+        gallivm/lp_bld_conv.c \
+        gallivm/lp_bld_debug.c \
+        gallivm/lp_bld_depth.c \
+        gallivm/lp_bld_flow.c \
+        gallivm/lp_bld_format_aos.c \
+        gallivm/lp_bld_format_query.c \
+        gallivm/lp_bld_format_soa.c \
+        gallivm/lp_bld_interp.c \
+        gallivm/lp_bld_intr.c \
+        gallivm/lp_bld_logic.c \
+        gallivm/lp_bld_pack.c \
+        gallivm/lp_bld_sample.c \
+        gallivm/lp_bld_sample_soa.c \
+        gallivm/lp_bld_struct.c \
+        gallivm/lp_bld_swizzle.c \
+        gallivm/lp_bld_tgsi_soa.c \
+        gallivm/lp_bld_type.c
+
+GALLIVM_CPP_SOURCES = \
+        gallivm/lp_bld_init.cpp
+
+GENERATED_SOURCES = \
+	indices/u_indices_gen.c \
+	indices/u_unfilled_gen.c \
+	util/u_format_access.c \
+	util/u_format_pack.h \
+	util/u_format_table.c
+
+
+ifeq ($(MESA_LLVM),1)
+C_SOURCES += \
+	$(GALLIVM_SOURCES)
+CPP_SOURCES += \
+	$(GALLIVM_CPP_SOURCES)
+endif
+
+
+LIBRARY_DEFINES += -D__STDC_CONSTANT_MACROS
+
+
+include ../Makefile.template
+
+
+indices/u_indices_gen.c: indices/u_indices_gen.py
+	python $< > $@
+
+indices/u_unfilled_gen.c: indices/u_unfilled_gen.py
+	python $< > $@
+
+util/u_format_table.c: util/u_format_table.py util/u_format_parse.py util/u_format.csv
+	python util/u_format_table.py util/u_format.csv > $@
+
+util/u_format_pack.h: util/u_format_pack.py util/u_format_parse.py util/u_format.csv
+	python util/u_format_pack.py util/u_format.csv > $@
+
+util/u_format_access.c: util/u_format_access.py util/u_format_parse.py util/u_format.csv
+	python util/u_format_access.py util/u_format.csv > $@
 
-default install clean:
-	@for dir in $(SUBDIRS) ; do \
-		if [ -d $$dir ] ; then \
-			(cd $$dir && $(MAKE) $@) || exit 1; \
-		fi \
-	done
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
new file mode 100644
index 00000000000..f0f3e783a3b
--- /dev/null
+++ b/src/gallium/auxiliary/SConscript
@@ -0,0 +1,209 @@
+Import('*')
+
+from sys import executable as python_cmd
+
+env.Append(CPPPATH = [
+    'indices',
+    'util',
+])
+
+env.CodeGenerate(
+    target = 'indices/u_indices_gen.c', 
+    script = 'indices/u_indices_gen.py', 
+    source = [],
+    command = python_cmd + ' $SCRIPT > $TARGET'
+)
+
+env.CodeGenerate(
+    target = 'indices/u_unfilled_gen.c', 
+    script = 'indices/u_unfilled_gen.py', 
+    source = [],
+    command = python_cmd + ' $SCRIPT > $TARGET'
+)
+
+env.CodeGenerate(
+    target = 'util/u_format_table.c',
+    script = 'util/u_format_table.py',
+    source = ['util/u_format.csv'],
+    command = 'python $SCRIPT $SOURCE > $TARGET'
+)
+
+env.CodeGenerate(
+    target = File('util/u_format_pack.h').srcnode(),
+    script = 'util/u_format_pack.py',
+    source = ['util/u_format.csv'],
+    command = 'python $SCRIPT $SOURCE > $TARGET'
+)
+
+env.CodeGenerate(
+    target = 'util/u_format_access.c',
+    script = 'util/u_format_access.py',
+    source = ['util/u_format.csv'],
+    command = 'python $SCRIPT $SOURCE > $TARGET'
+)
+
+source = [
+    'cso_cache/cso_context.c',
+    'cso_cache/cso_cache.c',
+    'cso_cache/cso_hash.c',
+    'draw/draw_context.c',
+    'draw/draw_pipe.c',
+    'draw/draw_pipe_aaline.c',
+    'draw/draw_pipe_aapoint.c',
+    'draw/draw_pipe_clip.c',
+    'draw/draw_pipe_cull.c',
+    'draw/draw_pipe_flatshade.c',
+    'draw/draw_pipe_offset.c',
+    'draw/draw_pipe_pstipple.c',
+    'draw/draw_pipe_stipple.c',
+    'draw/draw_pipe_twoside.c',
+    'draw/draw_pipe_unfilled.c',
+    'draw/draw_pipe_util.c',
+    'draw/draw_pipe_validate.c',
+    'draw/draw_pipe_vbuf.c',
+    'draw/draw_pipe_wide_line.c',
+    'draw/draw_pipe_wide_point.c',
+    'draw/draw_pt.c',
+    'draw/draw_pt_elts.c',
+    'draw/draw_pt_emit.c',
+    'draw/draw_pt_fetch.c',
+    'draw/draw_pt_fetch_emit.c',
+    'draw/draw_pt_fetch_shade_emit.c',
+    'draw/draw_pt_fetch_shade_pipeline.c',
+    'draw/draw_pt_post_vs.c',
+    'draw/draw_pt_util.c',
+    'draw/draw_pt_varray.c',
+    'draw/draw_pt_vcache.c',
+    'draw/draw_vertex.c',
+    'draw/draw_vs.c',
+    'draw/draw_vs_aos.c',
+    'draw/draw_vs_aos_io.c',
+    'draw/draw_vs_aos_machine.c',
+    'draw/draw_vs_exec.c',
+    'draw/draw_vs_llvm.c',
+    'draw/draw_vs_ppc.c',
+    'draw/draw_vs_sse.c',
+    'draw/draw_vs_varient.c',
+    'draw/draw_gs.c',
+    #'indices/u_indices.c',
+    #'indices/u_unfilled_indices.c',
+    'indices/u_indices_gen.c',
+    'indices/u_unfilled_gen.c',
+    'os/os_misc.c',
+    'os/os_stream_log.c',
+    'os/os_stream_stdc.c',
+    'os/os_stream_str.c',
+    'os/os_stream_null.c',
+    'os/os_time.c',
+    'pipebuffer/pb_buffer_fenced.c',
+    'pipebuffer/pb_buffer_malloc.c',
+    'pipebuffer/pb_bufmgr_alt.c',
+    'pipebuffer/pb_bufmgr_cache.c',
+    'pipebuffer/pb_bufmgr_debug.c',
+    'pipebuffer/pb_bufmgr_mm.c',
+    'pipebuffer/pb_bufmgr_ondemand.c',
+    'pipebuffer/pb_bufmgr_pool.c',
+    'pipebuffer/pb_bufmgr_slab.c',
+    'pipebuffer/pb_validate.c',
+    'rbug/rbug_core.c',
+    'rbug/rbug_shader.c',
+    'rbug/rbug_context.c',
+    'rbug/rbug_texture.c',
+    'rbug/rbug_demarshal.c',
+    'rbug/rbug_connection.c',
+    'rtasm/rtasm_cpu.c',
+    'rtasm/rtasm_execmem.c',
+    'rtasm/rtasm_x86sse.c',
+    'rtasm/rtasm_ppc.c',
+    'rtasm/rtasm_ppc_spe.c',
+    'tgsi/tgsi_build.c',
+    'tgsi/tgsi_dump.c',
+    'tgsi/tgsi_exec.c',
+    'tgsi/tgsi_info.c',
+    'tgsi/tgsi_iterate.c',
+    'tgsi/tgsi_parse.c',
+    'tgsi/tgsi_sanity.c',
+    'tgsi/tgsi_scan.c',
+    'tgsi/tgsi_ppc.c',
+    'tgsi/tgsi_sse2.c',
+    'tgsi/tgsi_text.c',
+    'tgsi/tgsi_transform.c',
+    'tgsi/tgsi_ureg.c',
+    'tgsi/tgsi_util.c',
+    'translate/translate_generic.c',
+    'translate/translate_sse.c',
+    'translate/translate.c',
+    'translate/translate_cache.c',
+    'util/u_bitmask.c',
+    'util/u_blit.c',
+    'util/u_blitter.c',
+    'util/u_cache.c',
+    'util/u_cpu_detect.c',
+    'util/u_debug.c',
+    'util/u_debug_memory.c',
+    'util/u_debug_stack.c',
+    'util/u_debug_symbol.c',
+    'util/u_dump_defines.c',
+    'util/u_dump_state.c',
+    'util/u_dl.c',
+    'util/u_draw_quad.c',
+    'util/u_format_access.c',
+    'util/u_format_table.c',
+    'util/u_gen_mipmap.c',
+    'util/u_handle_table.c',
+    'util/u_hash.c',
+    'util/u_hash_table.c',
+    'util/u_keymap.c',
+    'util/u_network.c',
+    'util/u_math.c',
+    'util/u_mm.c',
+    'util/u_rect.c',
+    'util/u_ringbuffer.c',
+    'util/u_simple_shaders.c',
+    'util/u_snprintf.c',
+    'util/u_surface.c',
+    'util/u_texture.c',
+    'util/u_tile.c',
+    'util/u_timed_winsys.c',
+    'util/u_upload_mgr.c',
+    'util/u_simple_screen.c',
+    'vl/vl_bitstream_parser.c',
+    'vl/vl_mpeg12_mc_renderer.c',
+    'vl/vl_compositor.c',
+    'vl/vl_csc.c',
+]
+
+if drawllvm:
+    source += [
+    'gallivm/lp_bld_alpha.c',
+    'gallivm/lp_bld_arit.c',
+    'gallivm/lp_bld_blend_aos.c',
+    'gallivm/lp_bld_blend_logicop.c',
+    'gallivm/lp_bld_blend_soa.c',
+    'gallivm/lp_bld_const.c',
+    'gallivm/lp_bld_conv.c',
+    'gallivm/lp_bld_debug.c',
+    'gallivm/lp_bld_depth.c',
+    'gallivm/lp_bld_flow.c',
+    'gallivm/lp_bld_format_aos.c',
+    'gallivm/lp_bld_format_query.c',
+    'gallivm/lp_bld_format_soa.c',
+    'gallivm/lp_bld_interp.c',
+    'gallivm/lp_bld_intr.c',
+    'gallivm/lp_bld_logic.c',
+    'gallivm/lp_bld_init.cpp',
+    'gallivm/lp_bld_pack.c',
+    'gallivm/lp_bld_sample.c',
+    'gallivm/lp_bld_sample_soa.c',
+    'gallivm/lp_bld_struct.c',
+    'gallivm/lp_bld_swizzle.c',
+    'gallivm/lp_bld_tgsi_soa.c',
+    'gallivm/lp_bld_type.c',
+    ]
+
+gallium = env.ConvenienceLibrary(
+    target = 'gallium',
+    source = source,
+)
+
+Export('gallium')
diff --git a/src/gallium/auxiliary/cso_cache/Makefile b/src/gallium/auxiliary/cso_cache/Makefile
deleted file mode 100644
index 8726afcd949..00000000000
--- a/src/gallium/auxiliary/cso_cache/Makefile
+++ /dev/null
@@ -1,11 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = cso_cache
-
-C_SOURCES = \
-	cso_context.c \
-	cso_cache.c \
-	cso_hash.c
-
-include ../../Makefile.template
diff --git a/src/gallium/auxiliary/cso_cache/SConscript b/src/gallium/auxiliary/cso_cache/SConscript
deleted file mode 100644
index 651e68a191a..00000000000
--- a/src/gallium/auxiliary/cso_cache/SConscript
+++ /dev/null
@@ -1,11 +0,0 @@
-Import('*')
-
-cso_cache = env.ConvenienceLibrary(
-	target = 'cso_cache',
-	source = [
-		'cso_context.c',
-		'cso_cache.c',
-		'cso_hash.c',
-	])
-
-auxiliaries.insert(0, cso_cache)
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
index e6dce3f0e5b..a6a07e72c2f 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -113,26 +113,6 @@ static struct cso_hash *_cso_hash_for_type(struct cso_cache *sc, enum cso_cache_
    return hash;
 }
 
-static int _cso_size_for_type(enum cso_cache_type type)
-{
-   switch(type) {
-   case CSO_BLEND:
-      return sizeof(struct pipe_blend_state);
-   case CSO_SAMPLER:
-      return sizeof(struct pipe_sampler_state);
-   case CSO_DEPTH_STENCIL_ALPHA:
-      return sizeof(struct pipe_depth_stencil_alpha_state);
-   case CSO_RASTERIZER:
-      return sizeof(struct pipe_rasterizer_state);
-   case CSO_FRAGMENT_SHADER:
-      return sizeof(struct pipe_shader_state);
-   case CSO_VERTEX_SHADER:
-      return sizeof(struct pipe_shader_state);
-   }
-   return 0;
-}
-
-
 static void delete_blend_state(void *state, void *data)
 {
    struct cso_blend *cso = (struct cso_blend *)state;
@@ -282,10 +262,9 @@ void *cso_hash_find_data_from_template( struct cso_hash *hash,
 
 struct cso_hash_iter cso_find_state_template(struct cso_cache *sc,
                                              unsigned hash_key, enum cso_cache_type type,
-                                             void *templ)
+                                             void *templ, unsigned size)
 {
    struct cso_hash_iter iter = cso_find_state(sc, hash_key, type);
-   int size = _cso_size_for_type(type);
    while (!cso_hash_iter_is_null(iter)) {
       void *iter_data = cso_hash_iter_data(iter);
       if (!memcmp(iter_data, templ, size))
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.h b/src/gallium/auxiliary/cso_cache/cso_cache.h
index 6b5c230e8f2..eea60b940bb 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.h
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.h
@@ -160,7 +160,7 @@ struct cso_hash_iter cso_find_state(struct cso_cache *sc,
                                     unsigned hash_key, enum cso_cache_type type);
 struct cso_hash_iter cso_find_state_template(struct cso_cache *sc,
                                              unsigned hash_key, enum cso_cache_type type,
-                                             void *templ);
+                                             void *templ, unsigned size);
 void cso_for_each_state(struct cso_cache *sc, enum cso_cache_type type,
                         cso_state_callback func, void *user_data);
 void * cso_take_state(struct cso_cache *sc, unsigned hash_key,
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 80bd0c91db0..a7335c340ca 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -36,12 +36,14 @@
   */
 
 #include "pipe/p_state.h"
+#include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "tgsi/tgsi_parse.h"
 
 #include "cso_cache/cso_context.h"
 #include "cso_cache/cso_cache.h"
 #include "cso_cache/cso_hash.h"
+#include "cso_context.h"
 
 struct cso_context {
    struct pipe_context *pipe;
@@ -85,12 +87,16 @@ struct cso_context {
    void *blend, *blend_saved;
    void *depth_stencil, *depth_stencil_saved;
    void *rasterizer, *rasterizer_saved;
-   void *fragment_shader, *fragment_shader_saved;
-   void *vertex_shader, *vertex_shader_saved;
+   void *fragment_shader, *fragment_shader_saved, *geometry_shader;
+   void *vertex_shader, *vertex_shader_saved, *geometry_shader_saved;
+
+   struct pipe_clip_state clip;
+   struct pipe_clip_state clip_saved;
 
    struct pipe_framebuffer_state fb, fb_saved;
    struct pipe_viewport_state vp, vp_saved;
    struct pipe_blend_color blend_color;
+   struct pipe_stencil_ref stencil_ref, stencil_ref_saved;
 };
 
 
@@ -309,18 +315,21 @@ void cso_destroy_context( struct cso_context *ctx )
 enum pipe_error cso_set_blend(struct cso_context *ctx,
                               const struct pipe_blend_state *templ)
 {
-   unsigned hash_key = cso_construct_key((void*)templ, sizeof(struct pipe_blend_state));
-   struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
-                                                       hash_key, CSO_BLEND,
-                                                       (void*)templ);
+   unsigned key_size, hash_key;
+   struct cso_hash_iter iter;
    void *handle;
 
+   key_size = templ->independent_blend_enable ? sizeof(struct pipe_blend_state) :
+              (char *)&(templ->rt[1]) - (char *)templ;
+   hash_key = cso_construct_key((void*)templ, key_size);
+   iter = cso_find_state_template(ctx->cache, hash_key, CSO_BLEND, (void*)templ, key_size);
+
    if (cso_hash_iter_is_null(iter)) {
       struct cso_blend *cso = MALLOC(sizeof(struct cso_blend));
       if (!cso)
          return PIPE_ERROR_OUT_OF_MEMORY;
 
-      memcpy(&cso->state, templ, sizeof(*templ));
+      memcpy(&cso->state, templ, key_size);
       cso->data = ctx->pipe->create_blend_state(ctx->pipe, &cso->state);
       cso->delete_state = (cso_state_callback)ctx->pipe->delete_blend_state;
       cso->context = ctx->pipe;
@@ -368,10 +377,11 @@ enum pipe_error cso_single_sampler(struct cso_context *ctx,
    void *handle = NULL;
 
    if (templ != NULL) {
-      unsigned hash_key = cso_construct_key((void*)templ, sizeof(struct pipe_sampler_state));
+      unsigned key_size = sizeof(struct pipe_sampler_state);
+      unsigned hash_key = cso_construct_key((void*)templ, key_size);
       struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
                                                           hash_key, CSO_SAMPLER,
-                                                          (void*)templ);
+                                                          (void*)templ, key_size);
 
       if (cso_hash_iter_is_null(iter)) {
          struct cso_sampler *cso = MALLOC(sizeof(struct cso_sampler));
@@ -408,10 +418,11 @@ cso_single_vertex_sampler(struct cso_context *ctx,
    void *handle = NULL;
 
    if (templ != NULL) {
-      unsigned hash_key = cso_construct_key((void*)templ, sizeof(struct pipe_sampler_state));
+      unsigned key_size = sizeof(struct pipe_sampler_state);
+      unsigned hash_key = cso_construct_key((void*)templ, key_size);
       struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
                                                           hash_key, CSO_SAMPLER,
-                                                          (void*)templ);
+                                                          (void*)templ, key_size);
 
       if (cso_hash_iter_is_null(iter)) {
          struct cso_sampler *cso = MALLOC(sizeof(struct cso_sampler));
@@ -538,6 +549,38 @@ void cso_restore_samplers(struct cso_context *ctx)
    cso_single_sampler_done( ctx );
 }
 
+/*
+ * Bind templates[0..nr-1] as vertex samplers and clear every higher slot.
+ * An error does not stop the loops, so as many samplers as possible are
+ * set; the last error encountered (or PIPE_OK) is returned.
+ */
+enum pipe_error cso_set_vertex_samplers(struct cso_context *ctx,
+                                        unsigned nr,
+                                        const struct pipe_sampler_state **templates)
+{
+   unsigned i;
+   enum pipe_error temp, error = PIPE_OK;
+
+   /* TODO: fastpath
+    */
+
+   for (i = 0; i < nr; i++) {
+      temp = cso_single_vertex_sampler( ctx, i, templates[i] );
+      if (temp != PIPE_OK)
+         error = temp;
+   }
+   /* Clear leftover vertex slots: bound by the vertex count, not the fragment nr_samplers. */
+   for ( ; i < ctx->nr_vertex_samplers; i++) {
+      temp = cso_single_vertex_sampler( ctx, i, NULL );
+      if (temp != PIPE_OK)
+         error = temp;
+   }
+
+   cso_single_vertex_sampler_done( ctx );
+
+   return error;
+}
+
 void
 cso_save_vertex_samplers(struct cso_context *ctx)
 {
@@ -665,12 +708,12 @@ cso_restore_vertex_sampler_textures(struct cso_context *ctx)
 enum pipe_error cso_set_depth_stencil_alpha(struct cso_context *ctx,
                                             const struct pipe_depth_stencil_alpha_state *templ)
 {
-   unsigned hash_key = cso_construct_key((void*)templ,
-                                         sizeof(struct pipe_depth_stencil_alpha_state));
+   unsigned key_size = sizeof(struct pipe_depth_stencil_alpha_state);
+   unsigned hash_key = cso_construct_key((void*)templ, key_size);
    struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
                                                        hash_key, 
-						       CSO_DEPTH_STENCIL_ALPHA,
-                                                       (void*)templ);
+                                                       CSO_DEPTH_STENCIL_ALPHA,
+                                                       (void*)templ, key_size);
    void *handle;
 
    if (cso_hash_iter_is_null(iter)) {
@@ -722,11 +765,11 @@ void cso_restore_depth_stencil_alpha(struct cso_context *ctx)
 enum pipe_error cso_set_rasterizer(struct cso_context *ctx,
                                    const struct pipe_rasterizer_state *templ)
 {
-   unsigned hash_key = cso_construct_key((void*)templ,
-                                         sizeof(struct pipe_rasterizer_state));
+   unsigned key_size = sizeof(struct pipe_rasterizer_state);
+   unsigned hash_key = cso_construct_key((void*)templ, key_size);
    struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
                                                        hash_key, CSO_RASTERIZER,
-                                                       (void*)templ);
+                                                       (void*)templ, key_size);
    void *handle = NULL;
 
    if (cso_hash_iter_is_null(iter)) {
@@ -808,7 +851,8 @@ enum pipe_error cso_set_fragment_shader(struct cso_context *ctx,
    struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
                                                        hash_key, 
                                                        CSO_FRAGMENT_SHADER,
-                                                       (void*)tokens);
+                                                       (void*)tokens,
+                                                       sizeof(*templ)); /* XXX correct? tokens_size? */
    void *handle = NULL;
 
    if (cso_hash_iter_is_null(iter)) {
@@ -887,7 +931,8 @@ enum pipe_error cso_set_vertex_shader(struct cso_context *ctx,
                                          sizeof(struct pipe_shader_state));
    struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
                                                        hash_key, CSO_VERTEX_SHADER,
-                                                       (void*)templ);
+                                                       (void*)templ,
+                                                       sizeof(*templ));
    void *handle = NULL;
 
    if (cso_hash_iter_is_null(iter)) {
@@ -1016,8 +1061,6 @@ void cso_restore_viewport(struct cso_context *ctx)
 }
 
 
-
-
 enum pipe_error cso_set_blend_color(struct cso_context *ctx,
                                     const struct pipe_blend_color *bc)
 {
@@ -1027,3 +1070,113 @@ enum pipe_error cso_set_blend_color(struct cso_context *ctx,
    }
    return PIPE_OK;
 }
+
+enum pipe_error cso_set_stencil_ref(struct cso_context *ctx,
+                                    const struct pipe_stencil_ref *sr)
+{  /* Set the stencil reference state through the context; always returns PIPE_OK. */
+   if (memcmp(&ctx->stencil_ref, sr, sizeof(ctx->stencil_ref))) {  /* skip the driver call when the cached copy is byte-identical */
+      ctx->stencil_ref = *sr;
+      ctx->pipe->set_stencil_ref(ctx->pipe, sr);
+   }
+   return PIPE_OK;
+}
+
+void cso_save_stencil_ref(struct cso_context *ctx)
+{  /* Snapshot the cached stencil ref into stencil_ref_saved for cso_restore_stencil_ref(). */
+   ctx->stencil_ref_saved = ctx->stencil_ref;
+}
+
+
+void cso_restore_stencil_ref(struct cso_context *ctx)
+{  /* Put back the stencil ref stored by cso_save_stencil_ref(). */
+   if (memcmp(&ctx->stencil_ref, &ctx->stencil_ref_saved, sizeof(ctx->stencil_ref))) {  /* driver is only called if the value changed since the save */
+      ctx->stencil_ref = ctx->stencil_ref_saved;
+      ctx->pipe->set_stencil_ref(ctx->pipe, &ctx->stencil_ref);
+   }
+}
+
+enum pipe_error cso_set_geometry_shader_handle(struct cso_context *ctx,
+                                               void *handle)
+{  /* Bind a geometry shader handle through the context; always returns PIPE_OK. */
+   if (ctx->geometry_shader != handle) {  /* rebinding the current handle is a no-op */
+      ctx->geometry_shader = handle;
+      ctx->pipe->bind_gs_state(ctx->pipe, handle);
+   }
+   return PIPE_OK;
+}
+
+void cso_delete_geometry_shader(struct cso_context *ctx, void *handle)
+{  /* Delete a geometry shader via the pipe, unbinding it first if it is the current one. */
+    if (handle == ctx->geometry_shader) {
+      /* unbind before deleting */
+      ctx->pipe->bind_gs_state(ctx->pipe, NULL);
+      ctx->geometry_shader = NULL;
+   }
+   ctx->pipe->delete_gs_state(ctx->pipe, handle);
+}
+
+void cso_save_geometry_shader(struct cso_context *ctx)
+{  /* Remember the currently bound geometry shader handle for a later restore. */
+   assert(!ctx->geometry_shader_saved);  /* trips if a non-NULL handle is still saved */
+   ctx->geometry_shader_saved = ctx->geometry_shader;
+}
+
+void cso_restore_geometry_shader(struct cso_context *ctx)
+{  /* Rebind the handle stored by cso_save_geometry_shader() and clear the saved slot. */
+   if (ctx->geometry_shader_saved != ctx->geometry_shader) {  /* only call the driver when the binding differs */
+      ctx->pipe->bind_gs_state(ctx->pipe, ctx->geometry_shader_saved);
+      ctx->geometry_shader = ctx->geometry_shader_saved;
+   }
+   ctx->geometry_shader_saved = NULL;
+}
+
+
+/* clip state */
+
+static INLINE void
+clip_state_cpy(struct pipe_clip_state *dst,
+               const struct pipe_clip_state *src)
+{  /* Copy src->nr and the first src->nr ucp entries; later dst entries are left untouched. */
+   dst->nr = src->nr;
+   if (src->nr) {
+      memcpy(dst->ucp, src->ucp, src->nr * sizeof(src->ucp[0]));
+   }
+}
+
+static INLINE int
+clip_state_cmp(const struct pipe_clip_state *a,
+               const struct pipe_clip_state *b)
+{  /* Return 0 when nr and the first nr ucp entries match, non-zero otherwise. */
+   if (a->nr != b->nr) {
+      return 1;
+   }
+   if (a->nr) {
+      return memcmp(a->ucp, b->ucp, a->nr * sizeof(a->ucp[0]));  /* entries past nr are ignored */
+   }
+   return 0;
+}
+
+void
+cso_set_clip(struct cso_context *ctx,
+             const struct pipe_clip_state *clip)
+{  /* Update the cached clip state and call the driver only if it differs from the cache. */
+   if (clip_state_cmp(&ctx->clip, clip)) {
+      clip_state_cpy(&ctx->clip, clip);
+      ctx->pipe->set_clip_state(ctx->pipe, clip);
+   }
+}
+
+void
+cso_save_clip(struct cso_context *ctx)
+{  /* Copy the cached clip state into clip_saved for cso_restore_clip(). */
+   clip_state_cpy(&ctx->clip_saved, &ctx->clip);
+}
+
+void
+cso_restore_clip(struct cso_context *ctx)
+{  /* Reinstate the clip state stored by cso_save_clip(). */
+   if (clip_state_cmp(&ctx->clip, &ctx->clip_saved)) {  /* nothing to do if the state is unchanged since the save */
+      clip_state_cpy(&ctx->clip, &ctx->clip_saved);
+      ctx->pipe->set_clip_state(ctx->pipe, &ctx->clip_saved);
+   }
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index e5b92177cfd..251a9a644f8 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -84,6 +84,10 @@ enum pipe_error cso_single_sampler( struct cso_context *cso,
 
 void cso_single_sampler_done( struct cso_context *cso );
 
+enum pipe_error cso_set_vertex_samplers(struct cso_context *cso,
+                                        unsigned count,
+                                        const struct pipe_sampler_state **states);
+
 void
 cso_save_vertex_samplers(struct cso_context *cso);
 
@@ -146,6 +150,13 @@ void cso_save_vertex_shader(struct cso_context *cso);
 void cso_restore_vertex_shader(struct cso_context *cso);
 
 
+enum pipe_error cso_set_geometry_shader_handle(struct cso_context *ctx,
+                                               void *handle);
+void cso_delete_geometry_shader(struct cso_context *ctx, void *handle);
+void cso_save_geometry_shader(struct cso_context *cso);
+void cso_restore_geometry_shader(struct cso_context *cso);
+
+
 
 enum pipe_error cso_set_framebuffer(struct cso_context *cso,
                                     const struct pipe_framebuffer_state *fb);
@@ -163,6 +174,25 @@ enum pipe_error cso_set_blend_color(struct cso_context *cso,
                                     const struct pipe_blend_color *bc);
 
 
+enum pipe_error cso_set_stencil_ref(struct cso_context *cso,
+                                    const struct pipe_stencil_ref *sr);
+void cso_save_stencil_ref(struct cso_context *cso);
+void cso_restore_stencil_ref(struct cso_context *cso);
+
+
+/* clip state */
+
+void
+cso_set_clip(struct cso_context *cso,
+             const struct pipe_clip_state *clip);
+
+void
+cso_save_clip(struct cso_context *cso);
+
+void
+cso_restore_clip(struct cso_context *cso);
+
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
deleted file mode 100644
index 5041dcc072b..00000000000
--- a/src/gallium/auxiliary/draw/Makefile
+++ /dev/null
@@ -1,46 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = draw
-
-C_SOURCES = \
-	draw_context.c \
-	draw_pipe.c \
-	draw_pipe_aaline.c \
-	draw_pipe_aapoint.c \
-	draw_pipe_clip.c \
-	draw_pipe_cull.c \
-	draw_pipe_flatshade.c \
-	draw_pipe_offset.c \
-	draw_pipe_pstipple.c \
-	draw_pipe_stipple.c \
-	draw_pipe_twoside.c \
-	draw_pipe_unfilled.c \
-	draw_pipe_util.c \
-	draw_pipe_validate.c \
-	draw_pipe_vbuf.c \
-	draw_pipe_wide_line.c \
-	draw_pipe_wide_point.c \
-	draw_pt.c \
-	draw_pt_elts.c \
-	draw_pt_emit.c \
-	draw_pt_fetch.c \
-	draw_pt_fetch_emit.c \
-	draw_pt_fetch_shade_emit.c \
-	draw_pt_fetch_shade_pipeline.c \
-	draw_pt_post_vs.c \
-        draw_pt_util.c \
-        draw_pt_varray.c \
-	draw_pt_vcache.c \
-	draw_vertex.c \
-	draw_vs.c \
-	draw_vs_varient.c \
-	draw_vs_aos.c \
-	draw_vs_aos_io.c \
-	draw_vs_aos_machine.c \
-	draw_vs_exec.c \
-	draw_vs_llvm.c \
-	draw_vs_ppc.c  \
-	draw_vs_sse.c 
-
-include ../../Makefile.template
diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript
deleted file mode 100644
index 5f05aa324a5..00000000000
--- a/src/gallium/auxiliary/draw/SConscript
+++ /dev/null
@@ -1,46 +0,0 @@
-Import('*')
-
-draw = env.ConvenienceLibrary(
-	target = 'draw',
-	source = [
-		'draw_context.c',
-		'draw_pipe.c',
-		'draw_pipe_aaline.c',
-		'draw_pipe_aapoint.c',
-		'draw_pipe_clip.c',
-		'draw_pipe_cull.c',
-		'draw_pipe_flatshade.c',
-		'draw_pipe_offset.c',
-		'draw_pipe_pstipple.c',
-		'draw_pipe_stipple.c',
-		'draw_pipe_twoside.c',
-		'draw_pipe_unfilled.c',
-		'draw_pipe_util.c',
-		'draw_pipe_validate.c',
-		'draw_pipe_vbuf.c',
-		'draw_pipe_wide_line.c',
-		'draw_pipe_wide_point.c',
-		'draw_pt.c',
-		'draw_pt_elts.c',
-		'draw_pt_emit.c',
-		'draw_pt_fetch.c',
-		'draw_pt_fetch_emit.c',
-		'draw_pt_fetch_shade_emit.c',
-		'draw_pt_fetch_shade_pipeline.c',
-		'draw_pt_post_vs.c',
-		'draw_pt_util.c',
-		'draw_pt_varray.c',
-		'draw_pt_vcache.c',
-		'draw_vertex.c',
-		'draw_vs.c',
-		'draw_vs_aos.c',
-		'draw_vs_aos_io.c',
-		'draw_vs_aos_machine.c',
-		'draw_vs_exec.c',
-		'draw_vs_llvm.c',
-		'draw_vs_ppc.c',
-		'draw_vs_sse.c',
-		'draw_vs_varient.c'
-	])
-
-auxiliaries.insert(0, draw)
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index a4f1fcddc1a..bb0988543f5 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -34,10 +34,8 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "draw_context.h"
-#include "draw_vbuf.h"
 #include "draw_vs.h"
-#include "draw_pt.h"
-#include "draw_pipe.h"
+#include "draw_gs.h"
 
 
 struct draw_context *draw_create( void )
@@ -67,6 +65,9 @@ struct draw_context *draw_create( void )
    if (!draw_vs_init( draw ))
       goto fail;
 
+   if (!draw_gs_init( draw ))
+      goto fail;
+
    return draw;
 
 fail:
@@ -91,6 +92,7 @@ void draw_destroy( struct draw_context *draw )
    draw_pipeline_destroy( draw );
    draw_pt_destroy( draw );
    draw_vs_destroy( draw );
+   draw_gs_destroy( draw );
 
    FREE( draw );
 }
@@ -126,9 +128,7 @@ void draw_set_rasterizer_state( struct draw_context *draw,
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
 
    draw->rasterizer = raster;
-   draw->bypass_clipping =
-      ((draw->rasterizer && draw->rasterizer->bypass_vs_clip_and_viewport) ||
-       draw->driver.bypass_clipping);
+   draw->bypass_clipping = draw->driver.bypass_clipping;
 }
 
 
@@ -138,9 +138,7 @@ void draw_set_driver_clipping( struct draw_context *draw,
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
 
    draw->driver.bypass_clipping = bypass_clipping;
-   draw->bypass_clipping =
-      ((draw->rasterizer && draw->rasterizer->bypass_vs_clip_and_viewport) ||
-       draw->driver.bypass_clipping);
+   draw->bypass_clipping = draw->driver.bypass_clipping;
 }
 
 
@@ -231,11 +229,22 @@ draw_set_mapped_vertex_buffer(struct draw_context *draw,
 
 void
 draw_set_mapped_constant_buffer(struct draw_context *draw,
-                                const void *buffer, 
+                                unsigned shader_type,
+                                unsigned slot,
+                                const void *buffer,
                                 unsigned size )
 {
-   draw->pt.user.constants = buffer;
-   draw_vs_set_constants( draw, (const float (*)[4])buffer, size );
+   debug_assert(shader_type == PIPE_SHADER_VERTEX ||
+                shader_type == PIPE_SHADER_GEOMETRY);
+   debug_assert(slot < PIPE_MAX_CONSTANT_BUFFERS);
+
+   if (shader_type == PIPE_SHADER_VERTEX) {
+      draw->pt.user.vs_constants[slot] = buffer;
+      draw_vs_set_constants(draw, slot, buffer, size);
+   } else if (shader_type == PIPE_SHADER_GEOMETRY) {
+      draw->pt.user.gs_constants[slot] = buffer;
+      draw_gs_set_constants(draw, slot, buffer, size);
+   }
 }
 
 
@@ -298,7 +307,7 @@ draw_set_force_passthrough( struct draw_context *draw, boolean enable )
  * a post-transformed vertex.
  *
  * With this function, drivers that use the draw module should have no reason
- * to track the current vertex shader.
+ * to track the current vertex/geometry shader.
  *
  * Note that the draw module may sometimes generate vertices with extra
  * attributes (such as texcoords for AA lines).  The driver can call this
@@ -309,43 +318,64 @@ draw_set_force_passthrough( struct draw_context *draw, boolean enable )
  * work for the drivers.
  */
 int
-draw_find_vs_output(const struct draw_context *draw,
-                    uint semantic_name, uint semantic_index)
+draw_find_shader_output(const struct draw_context *draw,
+                        uint semantic_name, uint semantic_index)
 {
    const struct draw_vertex_shader *vs = draw->vs.vertex_shader;
+   const struct draw_geometry_shader *gs = draw->gs.geometry_shader;
    uint i;
-   for (i = 0; i < vs->info.num_outputs; i++) {
-      if (vs->info.output_semantic_name[i] == semantic_name &&
-          vs->info.output_semantic_index[i] == semantic_index)
+   const struct tgsi_shader_info *info = &vs->info;
+
+   if (gs)
+      info = &gs->info;
+
+   for (i = 0; i < info->num_outputs; i++) {
+      if (info->output_semantic_name[i] == semantic_name &&
+          info->output_semantic_index[i] == semantic_index)
          return i;
    }
 
    /* XXX there may be more than one extra vertex attrib.
     * For example, simulated gl_FragCoord and gl_PointCoord.
     */
-   if (draw->extra_vp_outputs.semantic_name == semantic_name &&
-       draw->extra_vp_outputs.semantic_index == semantic_index) {
-      return draw->extra_vp_outputs.slot;
+   if (draw->extra_shader_outputs.semantic_name == semantic_name &&
+       draw->extra_shader_outputs.semantic_index == semantic_index) {
+      return draw->extra_shader_outputs.slot;
    }
+
    return 0;
 }
 
 
 /**
- * Return number of vertex shader outputs.
+ * Return total number of the shader outputs.  This function is similar to
+ * draw_current_shader_outputs() but this function also counts any extra
+ * vertex/geometry output attributes that may be filled in by some draw
+ * stages (such as AA point, AA line).
+ *
+ * If a geometry shader is present, its outputs are counted;
+ * otherwise the vertex shader's outputs are counted.
  */
 uint
-draw_num_vs_outputs(const struct draw_context *draw)
+draw_num_shader_outputs(const struct draw_context *draw)
 {
    uint count = draw->vs.vertex_shader->info.num_outputs;
-   if (draw->extra_vp_outputs.slot > 0)
+
+   /* If a geometry shader is present, its outputs go to the
+    * driver, else the vertex shader's outputs.
+    */
+   if (draw->gs.geometry_shader)
+      count = draw->gs.geometry_shader->info.num_outputs;
+
+   if (draw->extra_shader_outputs.slot > 0)
       count++;
    return count;
 }
 
 
 /**
- * Provide TGSI sampler objects for vertex shaders that use texture fetches.
+ * Provide TGSI sampler objects for vertex/geometry shaders that use
+ * texture fetches.
  * This might only be used by software drivers for the time being.
  */
 void
@@ -355,6 +385,8 @@ draw_texture_samplers(struct draw_context *draw,
 {
    draw->vs.num_samplers = num_samplers;
    draw->vs.samplers = samplers;
+   draw->gs.num_samplers = num_samplers;
+   draw->gs.samplers = samplers;
 }
 
 
@@ -366,13 +398,6 @@ void draw_set_render( struct draw_context *draw,
    draw->render = render;
 }
 
-void draw_set_edgeflags( struct draw_context *draw,
-                         const unsigned *edgeflag )
-{
-   draw->pt.user.edgeflag = edgeflag;
-}
-
-
 
 
 /**
@@ -428,3 +453,31 @@ void draw_do_flush( struct draw_context *draw, unsigned flags )
       draw->flushing = FALSE;
    }
 }
+
+
+/**
+ * Return the number of output attributes of the last enabled shader
+ * stage: the geometry shader's count when one is bound, otherwise the
+ * vertex shader's.  Extra attributes added by draw stages are not counted.
+ * \sa draw_num_shader_outputs
+ */
+uint
+draw_current_shader_outputs(const struct draw_context *draw)
+{
+   const boolean have_gs = draw->gs.geometry_shader != NULL;
+
+   return have_gs ? draw->gs.num_gs_outputs : draw->vs.num_vs_outputs;
+}
+
+
+/**
+ * Return the output slot holding the vertex position, taken from the
+ * geometry shader state when one is bound, else from the vertex shader.
+ */
+uint
+draw_current_shader_position_output(const struct draw_context *draw)
+{
+   const boolean have_gs = draw->gs.geometry_shader != NULL;
+
+   return have_gs ? draw->gs.position_output : draw->vs.position_output;
+}
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index d529e4e9a27..acd81b9712d 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -45,6 +45,7 @@ struct pipe_context;
 struct draw_context;
 struct draw_stage;
 struct draw_vertex_shader;
+struct draw_geometry_shader;
 struct tgsi_sampler;
 
 
@@ -85,11 +86,11 @@ draw_install_pstipple_stage(struct draw_context *draw, struct pipe_context *pipe
 
 
 int
-draw_find_vs_output(const struct draw_context *draw,
-                    uint semantic_name, uint semantic_index);
+draw_find_shader_output(const struct draw_context *draw,
+                        uint semantic_name, uint semantic_index);
 
 uint
-draw_num_vs_outputs(const struct draw_context *draw);
+draw_num_shader_outputs(const struct draw_context *draw);
 
 
 void
@@ -112,6 +113,17 @@ void draw_delete_vertex_shader(struct draw_context *draw,
                                struct draw_vertex_shader *dvs);
 
 
+/*
+ * Geometry shader functions
+ */
+struct draw_geometry_shader *
+draw_create_geometry_shader(struct draw_context *draw,
+                            const struct pipe_shader_state *shader);
+void draw_bind_geometry_shader(struct draw_context *draw,
+                               struct draw_geometry_shader *dvs);
+void draw_delete_geometry_shader(struct draw_context *draw,
+                                 struct draw_geometry_shader *dvs);
+
 
 /*
  * Vertex data functions
@@ -139,12 +151,12 @@ void draw_set_mapped_element_buffer( struct draw_context *draw,
 void draw_set_mapped_vertex_buffer(struct draw_context *draw,
                                    unsigned attr, const void *buffer);
 
-void draw_set_mapped_constant_buffer(struct draw_context *draw,
-                                     const void *buffer,
-                                     unsigned size );
-
-void draw_set_edgeflags( struct draw_context *draw,
-                         const unsigned *edgeflag );
+void
+draw_set_mapped_constant_buffer(struct draw_context *draw,
+                                unsigned shader_type,
+                                unsigned slot,
+                                const void *buffer,
+                                unsigned size);
 
 
 /***********************************************************************
@@ -154,6 +166,14 @@ void draw_set_edgeflags( struct draw_context *draw,
 void draw_arrays(struct draw_context *draw, unsigned prim,
 		 unsigned start, unsigned count);
 
+void
+draw_arrays_instanced(struct draw_context *draw,
+                      unsigned mode,
+                      unsigned start,
+                      unsigned count,
+                      unsigned startInstance,
+                      unsigned instanceCount);
+
 void draw_flush(struct draw_context *draw);
 
 
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
new file mode 100644
index 00000000000..7069aa6b181
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -0,0 +1,351 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMWare Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "draw_gs.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_exec.h"
+
+#include "pipe/p_shader_tokens.h"
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#define MAX_PRIM_VERTICES 6
+/* fixme: move it from here */
+#define MAX_PRIMITIVES 64
+
+/** Create the TGSI machine and its Primitives array; FALSE on failure. */
+boolean draw_gs_init( struct draw_context *draw )
+{
+   const size_t prim_bytes = MAX_PRIMITIVES * sizeof(struct tgsi_exec_vector);
+   struct tgsi_exec_machine *mach = tgsi_exec_machine_create();
+
+   draw->gs.machine = mach;
+   if (mach == NULL)
+      return FALSE;
+
+   mach->Primitives = align_malloc(prim_bytes, 16);
+   if (mach->Primitives == NULL)
+      return FALSE;
+   memset(mach->Primitives, 0, prim_bytes);
+   return TRUE;
+}
+
+/** Free the resources created by draw_gs_init(), if any. */
+void draw_gs_destroy( struct draw_context *draw )
+{
+   struct tgsi_exec_machine *mach = draw->gs.machine;
+   if (mach) {
+      align_free(mach->Primitives);
+      tgsi_exec_machine_destroy(mach);
+   }
+}
+
+void
+draw_gs_set_constants(struct draw_context *draw,
+                      unsigned slot,
+                      const void *constants,
+                      unsigned size)
+{  /* currently a no-op: every argument is ignored */
+}
+
+
+/**
+ * Create the draw module's private geometry shader object for \p state.
+ * The TGSI tokens are duplicated, so the shader does not keep a reference
+ * to state->tokens.
+ * \return the new shader, or NULL if an allocation fails
+ */
+struct draw_geometry_shader *
+draw_create_geometry_shader(struct draw_context *draw,
+                            const struct pipe_shader_state *state)
+{
+   struct draw_geometry_shader *gs = CALLOC_STRUCT(draw_geometry_shader);
+   uint i;
+
+   if (!gs)
+      return NULL;
+
+   gs->state = *state;
+   gs->state.tokens = tgsi_dup_tokens(state->tokens);
+   if (!gs->state.tokens) {
+      FREE(gs);
+      return NULL;
+   }
+
+   tgsi_scan_shader(state->tokens, &gs->info);
+   gs->machine = draw->gs.machine;
+
+   /* defaults, overridden below by properties the shader declares */
+   gs->input_primitive = PIPE_PRIM_TRIANGLES;
+   gs->output_primitive = PIPE_PRIM_TRIANGLE_STRIP;
+   gs->max_output_vertices = 32;
+
+   for (i = 0; i < gs->info.num_properties; ++i) {
+      const unsigned name = gs->info.properties[i].name;
+      const unsigned value = gs->info.properties[i].data[0];
+
+      if (name == TGSI_PROPERTY_GS_INPUT_PRIM)
+         gs->input_primitive = value;
+      else if (name == TGSI_PROPERTY_GS_OUTPUT_PRIM)
+         gs->output_primitive = value;
+      else if (name == TGSI_PROPERTY_GS_MAX_VERTICES)
+         gs->max_output_vertices = value;
+   }
+
+   /* remember which output holds POSITION[0]; keeps its CALLOC'ed value if none */
+   for (i = 0; i < gs->info.num_outputs; i++) {
+      if (gs->info.output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
+          gs->info.output_semantic_index[i] == 0)
+         gs->position_output = i;
+   }
+
+   return gs;
+}
+
+/** Make \p dgs the current geometry shader; passing NULL unbinds. */
+void draw_bind_geometry_shader(struct draw_context *draw,
+                               struct draw_geometry_shader *dgs)
+{
+   draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
+
+   draw->gs.geometry_shader = dgs;
+   if (!dgs) {
+      /* nothing bound: position_output is left as it was */
+      draw->gs.num_gs_outputs = 0;
+      return;
+   }
+   draw->gs.num_gs_outputs = dgs->info.num_outputs;
+   draw->gs.position_output = dgs->position_output;
+   draw_geometry_shader_prepare(dgs, draw);
+}
+
+void draw_delete_geometry_shader(struct draw_context *draw,
+                                 struct draw_geometry_shader *dgs)
+{
+   if (dgs) draw_geometry_shader_delete(dgs); /* also frees the token copy */
+}
+
+/**
+ * Number of vertices that make up one geometry shader input primitive
+ * of type \p prim.  Asserts and returns 0 for an unhandled type.
+ */
+static INLINE int num_vertices_for_prim(int prim)
+{
+   switch (prim) {
+   case PIPE_PRIM_POINTS:
+      return 1;
+   case PIPE_PRIM_LINES:
+   case PIPE_PRIM_LINE_LOOP:
+   case PIPE_PRIM_LINE_STRIP:
+      return 2;
+   case PIPE_PRIM_TRIANGLES:
+   case PIPE_PRIM_TRIANGLE_STRIP:
+   case PIPE_PRIM_TRIANGLE_FAN:
+      return 3;
+   case PIPE_PRIM_LINES_ADJACENCY:
+   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      return 4;
+   case PIPE_PRIM_TRIANGLES_ADJACENCY:
+   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      return 6;
+   default:
+      assert(!"Bad geometry shader input");
+      return 0;
+   }
+}
+
+/**
+ * Load geometry shader inputs into the TGSI machine for \p num_primitives
+ * primitives, the first of which is primitive number \p start_primitive.
+ * Primitive j of the batch is written to channel j of each input register.
+ *
+ * \param input_ptr            vertex data of primitive 0
+ * \param input_vertex_stride  distance between input vertices, in bytes
+ * \param inputs_from_vs       not used by this function
+ */
+static void draw_fetch_geometry_input(struct draw_geometry_shader *shader,
+                                      int start_primitive,
+                                      int num_primitives,
+                                      const float (*input_ptr)[4],
+                                      unsigned input_vertex_stride,
+                                      unsigned inputs_from_vs)
+{
+   struct tgsi_exec_machine *machine = shader->machine;
+   unsigned slot, vs_slot, k, j;
+   unsigned num_vertices = num_vertices_for_prim(shader->input_primitive);
+   int idx = 0;
+
+   for (slot = 0, vs_slot = 0; slot < shader->info.num_inputs; slot++) {
+      if (shader->info.input_semantic_name[slot] ==
+          TGSI_SEMANTIC_PRIMID) {
+         /* the primitive number is replicated into x, y, z and w */
+         for (j = 0; j < num_primitives; ++j) {
+            machine->Inputs[idx].xyzw[0].f[j] = (float)start_primitive + j;
+            machine->Inputs[idx].xyzw[1].f[j] = (float)start_primitive + j;
+            machine->Inputs[idx].xyzw[2].f[j] = (float)start_primitive + j;
+            machine->Inputs[idx].xyzw[3].f[j] = (float)start_primitive + j;
+         }
+         ++idx;
+      } else {
+         for (j = 0; j < num_primitives; ++j) {
+            /* First vertex of primitive (start_primitive + j).  The offset
+             * has to include start_primitive: draw_geometry_shader_run()
+             * passes the same input_ptr for every primitive, so using j
+             * alone would fetch primitive 0 over and over.
+             */
+            const float (*prim_ptr)[4] = (const float (*)[4])(
+               (const char *)input_ptr +
+               ((start_primitive + j) * num_vertices * input_vertex_stride));
+
+            for (k = 0; k < num_vertices; ++k) {
+               const float (*input)[4] = (const float (*)[4])(
+                  (const char *)prim_ptr + (k * input_vertex_stride));
+               /* each vertex gets its own block of input registers */
+               const unsigned vidx = k * TGSI_EXEC_MAX_INPUT_ATTRIBS + slot;
+
+               assert(!util_is_inf_or_nan(input[vs_slot][0]));
+               assert(!util_is_inf_or_nan(input[vs_slot][1]));
+               assert(!util_is_inf_or_nan(input[vs_slot][2]));
+               assert(!util_is_inf_or_nan(input[vs_slot][3]));
+               machine->Inputs[vidx].xyzw[0].f[j] = input[vs_slot][0];
+               machine->Inputs[vidx].xyzw[1].f[j] = input[vs_slot][1];
+               machine->Inputs[vidx].xyzw[2].f[j] = input[vs_slot][2];
+               machine->Inputs[vidx].xyzw[3].f[j] = input[vs_slot][3];
+            }
+         }
+         ++vs_slot;
+         idx += num_vertices;
+      }
+   }
+}
+
+static INLINE void
+draw_geometry_fetch_outputs(struct draw_geometry_shader *shader,
+                            int num_primitives,
+                            float (*output)[4],
+                            unsigned vertex_size)
+{
+   struct tgsi_exec_machine *machine = shader->machine;
+   unsigned prim_idx, j, slot;
+
+   /* Unswizzle all output results: copy every output slot of every emitted
+    * vertex from the machine's per-channel registers to the output buffer,
+    * which advances by vertex_size bytes per vertex.
+    * FIXME: handle all the primitives produced by the gs, not just the
+    * first one (Primitives[0] is used as the vertex count for every prim):
+    *   prim_count = mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]; */
+   for (prim_idx = 0; prim_idx < num_primitives; ++prim_idx) {
+      unsigned num_verts_per_prim = machine->Primitives[0];
+      for (j = 0; j < num_verts_per_prim; j++) {
+         int idx = (prim_idx * num_verts_per_prim + j) *
+                   shader->info.num_outputs;   /* first register of this vertex */
+#ifdef DEBUG_OUTPUTS
+         debug_printf("%d) Output vert:\n", idx);
+#endif
+         for (slot = 0; slot < shader->info.num_outputs; slot++) {
+            output[slot][0] = machine->Outputs[idx + slot].xyzw[0].f[prim_idx];
+            output[slot][1] = machine->Outputs[idx + slot].xyzw[1].f[prim_idx];
+            output[slot][2] = machine->Outputs[idx + slot].xyzw[2].f[prim_idx];
+            output[slot][3] = machine->Outputs[idx + slot].xyzw[3].f[prim_idx];
+#ifdef DEBUG_OUTPUTS
+            debug_printf("\t%d: %f %f %f %f\n", slot,
+                         output[slot][0],
+                         output[slot][1],
+                         output[slot][2],
+                         output[slot][3]);
+#endif
+            debug_assert(!util_is_inf_or_nan(output[slot][0]));   /* only x is checked */
+         }
+         output = (float (*)[4])((char *)output + vertex_size);   /* next vertex; local copy only */
+      }
+   }
+}
+
+/**
+ * Execute \p shader over \p count input vertices, feeding the interpreter
+ * one input primitive per run.
+ * NOTE(review): \p output is handed to draw_geometry_fetch_outputs() by
+ * value, so every primitive is written from the same start address;
+ * confirm that this is intended.
+ */
+void draw_geometry_shader_run(struct draw_geometry_shader *shader,
+                              const float (*input)[4],
+                              float (*output)[4],
+                              const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+                              unsigned count,
+                              unsigned input_stride,
+                              unsigned vertex_size)
+{
+   const unsigned batch = 1;   /* input primitives per interpreter run */
+   struct tgsi_exec_machine *machine = shader->machine;
+   unsigned verts_per_prim = num_vertices_for_prim(shader->input_primitive);
+   unsigned prim_count = count / verts_per_prim;
+   unsigned vs_inputs = 0;
+   unsigned n;
+
+   for (n = 0; n < PIPE_MAX_CONSTANT_BUFFERS; n++)
+      machine->Consts[n] = constants[n];
+
+   /* count the inputs that are not the primitive id */
+   for (n = 0; n < shader->info.num_inputs; ++n) {
+      if (shader->info.input_semantic_name[n] != TGSI_SEMANTIC_PRIMID)
+         ++vs_inputs;
+   }
+
+   for (n = 0; n < prim_count; ++n) {
+      draw_fetch_geometry_input(shader, n, batch, input,
+                                input_stride, vs_inputs);
+
+      /* enable one execution channel per primitive in the batch */
+      tgsi_set_exec_mask(machine, 1, batch > 1, batch > 2, batch > 3);
+
+      tgsi_exec_machine_run(machine);
+      draw_geometry_fetch_outputs(shader, batch, output, vertex_size);
+   }
+}
+
+void draw_geometry_shader_delete(struct draw_geometry_shader *shader)
+{
+   FREE((void*) shader->state.tokens);   /* the tgsi_dup_tokens() copy */
+   FREE(shader);
+}
+
+/** Bind \p shader's tokens to its TGSI machine unless already bound. */
+void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
+                                  struct draw_context *draw)
+{
+   struct tgsi_exec_machine *mach = shader->machine;
+   if (mach->Tokens == shader->state.tokens)
+      return;
+   tgsi_exec_machine_bind_shader(mach, shader->state.tokens,
+                                 draw->gs.num_samplers, draw->gs.samplers);
+}
diff --git a/src/gallium/auxiliary/draw/draw_gs.h b/src/gallium/auxiliary/draw/draw_gs.h
new file mode 100644
index 00000000000..d8eb2103433
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_gs.h
@@ -0,0 +1,76 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMWare Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef DRAW_GS_H
+#define DRAW_GS_H
+
+#include "draw_context.h"
+#include "draw_private.h"
+
+
+#define MAX_TGSI_PRIMITIVES 4
+
+struct draw_context;
+
+/**
+ * Private version of the compiled geometry shader
+ */
+struct draw_geometry_shader {
+   struct draw_context *draw;
+
+   struct tgsi_exec_machine *machine;   /* copied from draw->gs.machine at creation */
+
+   /* This member will disappear shortly:*/
+   struct pipe_shader_state state;      /* state.tokens is a private duplicate */
+
+   struct tgsi_shader_info info;        /* filled in by tgsi_scan_shader() */
+   unsigned position_output;            /* output slot of POSITION[0] */
+
+   unsigned max_output_vertices;        /* GS_MAX_VERTICES property, default 32 */
+   unsigned input_primitive;            /* PIPE_PRIM_x, default TRIANGLES */
+   unsigned output_primitive;           /* PIPE_PRIM_x, default TRIANGLE_STRIP */
+
+   /* Extracted from shader:
+    */
+   const float (*immediates)[4];
+};
+
+void draw_geometry_shader_run(struct draw_geometry_shader *shader,
+                              const float (*input)[4],
+                              float (*output)[4],
+                              const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
+                              unsigned count,
+                              unsigned input_stride,
+                              unsigned output_stride);
+
+void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
+                                  struct draw_context *draw);
+
+void draw_geometry_shader_delete(struct draw_geometry_shader *shader);
+
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 1c6d657297c..83dc1a35f4c 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -32,6 +32,7 @@
 
 #include "draw/draw_private.h"
 #include "draw/draw_pipe.h"
+#include "util/u_debug.h"
 
 
 
@@ -106,10 +107,9 @@ void draw_pipeline_destroy( struct draw_context *draw )
 
 
 
-
-
-
-
+/**
+ * Build primitive to render a point with vertex at v0.
+ */
 static void do_point( struct draw_context *draw,
 		      const char *v0 )
 {
@@ -123,6 +123,10 @@ static void do_point( struct draw_context *draw,
 }
 
 
+/**
+ * Build primitive to render a line with vertices at v0, v1.
+ * \param flags  bitmask of DRAW_PIPE_EDGE_x, DRAW_PIPE_RESET_STIPPLE
+ */
 static void do_line( struct draw_context *draw,
                      ushort flags,
 		     const char *v0,
@@ -139,6 +143,10 @@ static void do_line( struct draw_context *draw,
 }
 
 
+/**
+ * Build primitive to render a triangle with vertices at v0, v1, v2.
+ * \param flags  bitmask of DRAW_PIPE_EDGE_x, DRAW_PIPE_RESET_STIPPLE
+ */
 static void do_triangle( struct draw_context *draw,
                          ushort flags,
 			 char *v0,
@@ -157,7 +165,10 @@ static void do_triangle( struct draw_context *draw,
 }
 
 
-
+/*
+ * Set up macros for draw_pt_decompose.h template code.
+ * This code uses vertex indexes / elements.
+ */
 #define QUAD(i0,i1,i2,i3)                       \
    do_triangle( draw,                           \
                 ( DRAW_PIPE_RESET_STIPPLE |     \
@@ -175,16 +186,16 @@ static void do_triangle( struct draw_context *draw,
 
 #define TRIANGLE(flags,i0,i1,i2)                                        \
    do_triangle( draw,                                                   \
-                elts[i0],  /* flags */                          \
+                elts[i0],  /* flags */                                  \
                 verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK),     \
-                verts + stride * elts[i1],                              \
-                verts + stride * elts[i2])
+                verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK),     \
+                verts + stride * (elts[i2] & ~DRAW_PIPE_FLAG_MASK) );
 
 #define LINE(flags,i0,i1)                                       \
    do_line( draw,                                               \
-            elts[i0],                                   \
+            elts[i0],                                           \
             verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK), \
-            verts + stride * elts[i1])
+            verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK) );
 
 #define POINT(i0)                               \
    do_point( draw,                              \
@@ -213,7 +224,9 @@ static void do_triangle( struct draw_context *draw,
 
 
 
-/* Code to run the pipeline on a fairly arbitary collection of vertices.
+/**
+ * Code to run the pipeline on a fairly arbitrary collection of vertices.
+ * For drawing indexed primitives.
  *
  * Vertex headers must be pre-initialized with the
  * UNDEFINED_VERTEX_ID, this code will cause that id to become
@@ -243,6 +256,12 @@ void draw_pipeline_run( struct draw_context *draw,
    draw->pipeline.vertex_count = 0;
 }
 
+
+
+/*
+ * Set up macros for draw_pt_decompose.h template code.
+ * This code is for non-indexed rendering (no elts).
+ */
 #define QUAD(i0,i1,i2,i3)                                        \
    do_triangle( draw,                                            \
                 ( DRAW_PIPE_RESET_STIPPLE |                      \
@@ -293,6 +312,10 @@ void draw_pipeline_run( struct draw_context *draw,
 
 #include "draw_pt_decompose.h"
 
+
+/*
+ * For drawing non-indexed primitives.
+ */
 void draw_pipeline_run_linear( struct draw_context *draw,
                                unsigned prim,
                                struct vertex_header *vertices,
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 23d8b609e17..8f6ca15dfa2 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -35,6 +35,9 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_inlines.h"
+
+#include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
@@ -46,6 +49,10 @@
 #include "draw_pipe.h"
 
 
+/** Approx number of new tokens for instructions in aa_transform_inst() */
+#define NUM_NEW_TOKENS 50
+
+
 /**
  * Max texture level for the alpha texture used for antialiasing
  */
@@ -176,12 +183,7 @@ aa_transform_decl(struct tgsi_transform_context *ctx,
 static int
 free_bit(uint bitfield)
 {
-   int i;
-   for (i = 0; i < 32; i++) {
-      if ((bitfield & (1 << i)) == 0)
-         return i;
-   }
-   return -1;
+   return ffs(~bitfield) - 1;
 }
 
 
@@ -340,11 +342,10 @@ generate_aaline_fs(struct aaline_stage *aaline)
    const struct pipe_shader_state *orig_fs = &aaline->fs->state;
    struct pipe_shader_state aaline_fs;
    struct aa_transform_context transform;
-
-#define MAX 1000
+   const uint newLen = tgsi_num_tokens(orig_fs->tokens) + NUM_NEW_TOKENS;
 
    aaline_fs = *orig_fs; /* copy to init */
-   aaline_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX);
+   aaline_fs.tokens = tgsi_alloc_tokens(newLen);
    if (aaline_fs.tokens == NULL)
       return FALSE;
 
@@ -360,7 +361,7 @@ generate_aaline_fs(struct aaline_stage *aaline)
 
    tgsi_transform_shader(orig_fs->tokens,
                          (struct tgsi_token *) aaline_fs.tokens,
-                         MAX, &transform.base);
+                         newLen, &transform.base);
 
 #if 0 /* DEBUG */
    tgsi_dump(orig_fs->tokens, 0);
@@ -658,13 +659,13 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
    }
 
    /* update vertex attrib info */
-   aaline->tex_slot = draw->vs.num_vs_outputs;
-   aaline->pos_slot = draw->vs.position_output;
+   aaline->tex_slot = draw_current_shader_outputs(draw);
+   aaline->pos_slot = draw_current_shader_position_output(draw);;
 
    /* advertise the extra post-transformed vertex attribute */
-   draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
-   draw->extra_vp_outputs.semantic_index = aaline->fs->generic_attrib;
-   draw->extra_vp_outputs.slot = aaline->tex_slot;
+   draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
+   draw->extra_shader_outputs.semantic_index = aaline->fs->generic_attrib;
+   draw->extra_shader_outputs.slot = aaline->tex_slot;
 
    /* how many samplers? */
    /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
@@ -705,7 +706,7 @@ aaline_flush(struct draw_stage *stage, unsigned flags)
                                        aaline->state.texture);
    draw->suspend_flushing = FALSE;
 
-   draw->extra_vp_outputs.slot = 0;
+   draw->extra_shader_outputs.slot = 0;
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index 75130a8fb0e..9f9fb4312c1 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -53,6 +53,10 @@
 #include "draw_pipe.h"
 
 
+/** Approx number of new tokens for instructions in aa_transform_inst() */
+#define NUM_NEW_TOKENS 200
+
+
 /*
  * Enabling NORMALIZE might give _slightly_ better results.
  * Basically, it controls whether we compute distance as d=sqrt(x*x+y*y) or
@@ -81,16 +85,19 @@ struct aapoint_stage
 {
    struct draw_stage stage;
 
-   int psize_slot;
+   /** half of pipe_rasterizer_state::point_size */
    float radius;
 
+   /** vertex attrib slot containing point size */
+   int psize_slot;
+
    /** this is the vertex attrib slot for the new texcoords */
    uint tex_slot;
+
+   /** vertex attrib slot containing position */
    uint pos_slot;
 
-   /*
-    * Currently bound state
-    */
+   /** Currently bound fragment shader */
    struct aapoint_fragment_shader *fs;
 
    /*
@@ -491,11 +498,10 @@ generate_aapoint_fs(struct aapoint_stage *aapoint)
    const struct pipe_shader_state *orig_fs = &aapoint->fs->state;
    struct pipe_shader_state aapoint_fs;
    struct aa_transform_context transform;
-
-#define MAX 1000
+   const uint newLen = tgsi_num_tokens(orig_fs->tokens) + NUM_NEW_TOKENS;
 
    aapoint_fs = *orig_fs; /* copy to init */
-   aapoint_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX);
+   aapoint_fs.tokens = tgsi_alloc_tokens(newLen);
    if (aapoint_fs.tokens == NULL)
       return FALSE;
 
@@ -511,7 +517,7 @@ generate_aapoint_fs(struct aapoint_stage *aapoint)
 
    tgsi_transform_shader(orig_fs->tokens,
                          (struct tgsi_token *) aapoint_fs.tokens,
-                         MAX, &transform.base);
+                         newLen, &transform.base);
 
 #if 0 /* DEBUG */
    printf("draw_aapoint, orig shader:\n");
@@ -575,8 +581,8 @@ aapoint_point(struct draw_stage *stage, struct prim_header *header)
    const struct aapoint_stage *aapoint = aapoint_stage(stage);
    struct prim_header tri;
    struct vertex_header *v[4];
-   uint texPos = aapoint->tex_slot;
-   uint pos_slot = aapoint->pos_slot;
+   const uint tex_slot = aapoint->tex_slot;
+   const uint pos_slot = aapoint->pos_slot;
    float radius, *pos, *tex;
    uint i;
    float k;
@@ -643,16 +649,16 @@ aapoint_point(struct draw_stage *stage, struct prim_header *header)
    pos[1] += radius;
 
    /* new texcoords */
-   tex = v[0]->data[texPos];
+   tex = v[0]->data[tex_slot];
    ASSIGN_4V(tex, -1, -1, k, 1);
 
-   tex = v[1]->data[texPos];
+   tex = v[1]->data[tex_slot];
    ASSIGN_4V(tex,  1, -1, k, 1);
 
-   tex = v[2]->data[texPos];
+   tex = v[2]->data[tex_slot];
    ASSIGN_4V(tex,  1,  1, k, 1);
 
-   tex = v[3]->data[texPos];
+   tex = v[3]->data[tex_slot];
    ASSIGN_4V(tex, -1,  1, k, 1);
 
    /* emit 2 tris for the quad strip */
@@ -687,14 +693,14 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header)
    bind_aapoint_fragment_shader(aapoint);
 
    /* update vertex attrib info */
-   aapoint->tex_slot = draw->vs.num_vs_outputs;
+   aapoint->tex_slot = draw_current_shader_outputs(draw);
    assert(aapoint->tex_slot > 0); /* output[0] is vertex pos */
 
-   aapoint->pos_slot = draw->vs.position_output;
+   aapoint->pos_slot = draw_current_shader_position_output(draw);
 
-   draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
-   draw->extra_vp_outputs.semantic_index = aapoint->fs->generic_attrib;
-   draw->extra_vp_outputs.slot = aapoint->tex_slot;
+   draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
+   draw->extra_shader_outputs.semantic_index = aapoint->fs->generic_attrib;
+   draw->extra_shader_outputs.slot = aapoint->tex_slot;
 
    /* find psize slot in post-transform vertex */
    aapoint->psize_slot = -1;
@@ -731,7 +737,7 @@ aapoint_flush(struct draw_stage *stage, unsigned flags)
    aapoint->driver_bind_fs_state(pipe, aapoint->fs->driver_fs);
    draw->suspend_flushing = FALSE;
 
-   draw->extra_vp_outputs.slot = 0;
+   draw->extra_shader_outputs.slot = 0;
 }
 
 
@@ -858,7 +864,7 @@ draw_install_aapoint_stage(struct draw_context *draw,
     */
    aapoint = draw_aapoint_stage( draw );
    if (aapoint == NULL)
-      goto fail;
+      return FALSE;
 
    aapoint->pipe = pipe;
 
@@ -875,10 +881,4 @@ draw_install_aapoint_stage(struct draw_context *draw,
    draw->pipeline.aapoint = &aapoint->stage;
 
    return TRUE;
-
- fail:
-   if (aapoint)
-      aapoint->stage.destroy( &aapoint->stage );
-
-   return FALSE;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 0670268a196..51a6115ebf5 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -55,7 +55,7 @@
 
 
 
-struct clipper {
+struct clip_stage {
    struct draw_stage stage;      /**< base class */
 
    /* Basically duplicate some of the flatshading logic here:
@@ -70,9 +70,9 @@ struct clipper {
 
 /* This is a bit confusing:
  */
-static INLINE struct clipper *clipper_stage( struct draw_stage *stage )
+static INLINE struct clip_stage *clip_stage( struct draw_stage *stage )
 {
-   return (struct clipper *)stage;
+   return (struct clip_stage *)stage;
 }
 
 
@@ -92,11 +92,12 @@ static void interp_attr( float *fdst,
    fdst[3] = LINTERP( t, fout[3], fin[3] );
 }
 
+
 static void copy_colors( struct draw_stage *stage,
 			 struct vertex_header *dst,
 			 const struct vertex_header *src )
 {
-   const struct clipper *clipper = clipper_stage(stage);
+   const struct clip_stage *clipper = clip_stage(stage);
    uint i;
    for (i = 0; i < clipper->num_color_attribs; i++) {
       const uint attr = clipper->color_attribs[i];
@@ -108,14 +109,14 @@ static void copy_colors( struct draw_stage *stage,
 
 /* Interpolate between two vertices to produce a third.  
  */
-static void interp( const struct clipper *clip,
+static void interp( const struct clip_stage *clip,
 		    struct vertex_header *dst,
 		    float t,
 		    const struct vertex_header *out, 
 		    const struct vertex_header *in )
 {
-   const unsigned nr_attrs = clip->stage.draw->vs.num_vs_outputs;
-   const unsigned pos_attr = clip->stage.draw->vs.position_output;
+   const unsigned nr_attrs = draw_current_shader_outputs(clip->stage.draw);
+   const unsigned pos_attr = draw_current_shader_position_output(clip->stage.draw);
    unsigned j;
 
    /* Vertex header.
@@ -179,7 +180,7 @@ static void emit_poly( struct draw_stage *stage,
       header.v[2] = inlist[0];	/* keep in v[2] for flatshading */
 
       if (i == n-1)
-        header.flags |= edge_last;
+         header.flags |= edge_last;
 
       if (0) {
          const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;
@@ -200,13 +201,14 @@ static void emit_poly( struct draw_stage *stage,
    }
 }
 
+
 static INLINE float
 dot4(const float *a, const float *b)
 {
-   return (a[0]*b[0] +
-           a[1]*b[1] +
-           a[2]*b[2] +
-           a[3]*b[3]);
+   return (a[0] * b[0] +
+           a[1] * b[1] +
+           a[2] * b[2] +
+           a[3] * b[3]);
 }
 
 
@@ -217,7 +219,7 @@ do_clip_tri( struct draw_stage *stage,
 	     struct prim_header *header,
 	     unsigned clipmask )
 {
-   struct clipper *clipper = clipper_stage( stage );
+   struct clip_stage *clipper = clip_stage( stage );
    struct vertex_header *a[MAX_CLIPPED_VERTICES];
    struct vertex_header *b[MAX_CLIPPED_VERTICES];
    struct vertex_header **inlist = a;
@@ -280,6 +282,7 @@ do_clip_tri( struct draw_stage *stage,
 	 dp_prev = dp;
       }
 
+      /* swap in/out lists */
       {
 	 struct vertex_header **tmp = inlist;
 	 inlist = outlist;
@@ -291,15 +294,11 @@ do_clip_tri( struct draw_stage *stage,
    /* If flat-shading, copy color to new provoking vertex.
     */
    if (clipper->flat && inlist[0] != header->v[2]) {
-      if (1) {
-	 inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
-      }
+      inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
 
       copy_colors(stage, inlist[0], header->v[2]);
    }
 
-
-
    /* Emit the polygon as triangles to the setup stage:
     */
    if (n >= 3)
@@ -314,7 +313,7 @@ do_clip_line( struct draw_stage *stage,
 	      struct prim_header *header,
 	      unsigned clipmask )
 {
-   const struct clipper *clipper = clipper_stage( stage );
+   const struct clip_stage *clipper = clip_stage( stage );
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
    const float *pos0 = v0->clip;
@@ -416,13 +415,14 @@ clip_tri( struct draw_stage *stage,
    }
 }
 
+
 /* Update state.  Could further delay this until we hit the first
  * primitive that really requires clipping.
  */
 static void 
 clip_init_state( struct draw_stage *stage )
 {
-   struct clipper *clipper = clipper_stage( stage );
+   struct clip_stage *clipper = clip_stage( stage );
 
    clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
 
@@ -488,7 +488,7 @@ static void clip_destroy( struct draw_stage *stage )
  */
 struct draw_stage *draw_clip_stage( struct draw_context *draw )
 {
-   struct clipper *clipper = CALLOC_STRUCT(clipper);
+   struct clip_stage *clipper = CALLOC_STRUCT(clip_stage);
    if (clipper == NULL)
       goto fail;
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c
index 0a70483858c..dc66c65a56c 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -50,19 +50,17 @@ static INLINE struct cull_stage *cull_stage( struct draw_stage *stage )
 }
 
 
-
-
 static void cull_tri( struct draw_stage *stage,
 		      struct prim_header *header )
 {
-   const unsigned pos = stage->draw->vs.position_output;
+   const unsigned pos = draw_current_shader_position_output(stage->draw);
 
    /* Window coords: */
    const float *v0 = header->v[0]->data[pos];
    const float *v1 = header->v[1]->data[pos];
    const float *v2 = header->v[2]->data[pos];
 
-   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   /* edge vectors: e = v0 - v2, f = v1 - v2 */
    const float ex = v0[0] - v2[0];
    const float ey = v0[1] - v2[1];
    const float fx = v1[0] - v2[0];
@@ -72,7 +70,7 @@ static void cull_tri( struct draw_stage *stage,
    header->det = ex * fy - ey * fx;
 
    if (header->det != 0) {
-      /* if (det < 0 then Z points toward camera and triangle is 
+      /* if det < 0 then Z points toward the camera and the triangle is 
        * counter-clockwise winding.
        */
       unsigned winding = (header->det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW;
@@ -84,6 +82,7 @@ static void cull_tri( struct draw_stage *stage,
    }
 }
 
+
 static void cull_first_tri( struct draw_stage *stage, 
 			    struct prim_header *header )
 {
@@ -96,13 +95,13 @@ static void cull_first_tri( struct draw_stage *stage,
 }
 
 
-
 static void cull_flush( struct draw_stage *stage, unsigned flags )
 {
    stage->tri = cull_first_tri;
    stage->next->flush( stage->next, flags );
 }
 
+
 static void cull_reset_stipple_counter( struct draw_stage *stage )
 {
    stage->next->reset_stipple_counter( stage->next );
@@ -140,7 +139,7 @@ struct draw_stage *draw_cull_stage( struct draw_context *draw )
 
    return &cull->stage;
 
- fail:
+fail:
    if (cull)
       cull->stage.destroy( &cull->stage );
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c
index 40798a5d6e7..8e321946ced 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_offset.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -63,7 +63,7 @@ static INLINE struct offset_stage *offset_stage( struct draw_stage *stage )
 static void do_offset_tri( struct draw_stage *stage,
 			   struct prim_header *header )
 {
-   const unsigned pos = stage->draw->vs.position_output;
+   const unsigned pos = draw_current_shader_position_output(stage->draw);
    struct offset_stage *offset = offset_stage(stage);   
    float inv_det = 1.0f / header->det;
 
@@ -161,7 +161,7 @@ struct draw_stage *draw_offset_stage( struct draw_context *draw )
 {
    struct offset_stage *offset = CALLOC_STRUCT(offset_stage);
    if (offset == NULL)
-      goto fail;
+      return NULL;
 
    draw_alloc_temp_verts( &offset->stage, 3 );
 
@@ -176,10 +176,4 @@ struct draw_stage *draw_offset_stage( struct draw_context *draw )
    offset->stage.destroy = offset_destroy;
 
    return &offset->stage;
-
- fail:
-   if (offset)
-      offset->stage.destroy( &offset->stage );
-
-   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index bcb860da2e0..d0d99aa331a 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -37,7 +37,9 @@
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_inlines.h"
 
+#include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
@@ -48,6 +50,9 @@
 #include "draw_pipe.h"
 
 
+/** Approx number of new tokens for instructions in pstip_transform_inst() */
+#define NUM_NEW_TOKENS 50
+
 
 /**
  * Subclass of pipe_shader_state to carry extra fragment shader info.
@@ -170,12 +175,7 @@ pstip_transform_immed(struct tgsi_transform_context *ctx,
 static int
 free_bit(uint bitfield)
 {
-   int i;
-   for (i = 0; i < 32; i++) {
-      if ((bitfield & (1 << i)) == 0)
-         return i;
-   }
-   return -1;
+   return ffs(~bitfield) - 1;
 }
 
 
@@ -331,11 +331,10 @@ generate_pstip_fs(struct pstip_stage *pstip)
    /*struct draw_context *draw = pstip->stage.draw;*/
    struct pipe_shader_state pstip_fs;
    struct pstip_transform_context transform;
-
-#define MAX 1000
+   const uint newLen = tgsi_num_tokens(orig_fs->tokens) + NUM_NEW_TOKENS;
 
    pstip_fs = *orig_fs; /* copy to init */
-   pstip_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX);
+   pstip_fs.tokens = tgsi_alloc_tokens(newLen);
    if (pstip_fs.tokens == NULL)
       return FALSE;
 
@@ -350,7 +349,7 @@ generate_pstip_fs(struct pstip_stage *pstip)
 
    tgsi_transform_shader(orig_fs->tokens,
                          (struct tgsi_token *) pstip_fs.tokens,
-                         MAX, &transform.base);
+                         newLen, &transform.base);
 
 #if 0 /* DEBUG */
    tgsi_dump(orig_fs->tokens, 0);
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 6e921bac278..70fbab9ea76 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -73,7 +73,8 @@ screen_interp( struct draw_context *draw,
                const struct vertex_header *v1 )
 {
    uint attr;
-   for (attr = 0; attr < draw->vs.num_vs_outputs; attr++) {
+   const uint num_outputs = draw_current_shader_outputs(draw);
+   for (attr = 0; attr < num_outputs; attr++) {
       const float *val0 = v0->data[attr];
       const float *val1 = v1->data[attr];
       float *newv = dst->data[attr];
@@ -121,7 +122,7 @@ stipple_line(struct draw_stage *stage, struct prim_header *header)
    struct stipple_stage *stipple = stipple_stage(stage);
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
-   const unsigned pos = stage->draw->vs.position_output;
+   const unsigned pos = draw_current_shader_position_output(stage->draw);
    const float *pos0 = v0->data[pos];
    const float *pos1 = v1->data[pos];
    float start = 0;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c
index bea90e50d30..153097e543e 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_validate.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c
@@ -105,7 +105,7 @@ draw_need_pipeline(const struct draw_context *draw,
          return TRUE;
 
       /* point sprites */
-      if (rasterizer->point_sprite && draw->pipeline.point_sprite)
+      if (rasterizer->sprite_coord_enable && draw->pipeline.point_sprite)
          return TRUE;
    }
 
@@ -151,8 +151,8 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
 {
    struct draw_context *draw = stage->draw;
    struct draw_stage *next = draw->pipeline.rasterize;
-   int need_det = 0;
-   int precalc_flat = 0;
+   boolean need_det = FALSE;
+   boolean precalc_flat = FALSE;
    boolean wide_lines, wide_points;
 
    /* Set the validate's next stage to the rasterize stage, so that it
@@ -165,7 +165,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
                  && !draw->rasterizer->line_smooth);
 
    /* drawing large points? */
-   if (draw->rasterizer->point_sprite && draw->pipeline.point_sprite)
+   if (draw->rasterizer->sprite_coord_enable && draw->pipeline.point_sprite)
       wide_points = TRUE;
    else if (draw->rasterizer->point_smooth && draw->pipeline.aapoint)
       wide_points = FALSE;
@@ -194,10 +194,10 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    if (wide_lines) {
       draw->pipeline.wide_line->next = next;
       next = draw->pipeline.wide_line;
-      precalc_flat = 1;
+      precalc_flat = TRUE;
    }
 
-   if (wide_points || draw->rasterizer->point_sprite) {
+   if (wide_points || draw->rasterizer->sprite_coord_enable) {
       draw->pipeline.wide_point->next = next;
       next = draw->pipeline.wide_point;
    }
@@ -205,7 +205,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    if (draw->rasterizer->line_stipple_enable && draw->pipeline.line_stipple) {
       draw->pipeline.stipple->next = next;
       next = draw->pipeline.stipple;
-      precalc_flat = 1;		/* only needed for lines really */
+      precalc_flat = TRUE;		/* only needed for lines really */
    }
 
    if (draw->rasterizer->poly_stipple_enable
@@ -218,8 +218,8 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
        draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) {
       draw->pipeline.unfilled->next = next;
       next = draw->pipeline.unfilled;
-      precalc_flat = 1;		/* only needed for triangles really */
-      need_det = 1;
+      precalc_flat = TRUE;		/* only needed for triangles really */
+      need_det = TRUE;
    }
 
    if (draw->rasterizer->flatshade && precalc_flat) {
@@ -231,13 +231,13 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
        draw->rasterizer->offset_ccw) {
       draw->pipeline.offset->next = next;
       next = draw->pipeline.offset;
-      need_det = 1;
+      need_det = TRUE;
    }
 
    if (draw->rasterizer->light_twoside) {
       draw->pipeline.twoside->next = next;
       next = draw->pipeline.twoside;
-      need_det = 1;
+      need_det = TRUE;
    }
 
    /* Always run the cull stage as we calculate determinant there
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index 1a5269c0de9..27099579618 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -138,7 +138,7 @@ emit_vertex( struct vbuf_stage *vbuf,
       /* Note: we really do want data[0] here, not data[pos]: 
        */
       vbuf->translate->set_buffer(vbuf->translate, 0, vertex->data[0], 0);
-      vbuf->translate->run(vbuf->translate, 0, 1, vbuf->vertex_ptr);
+      vbuf->translate->run(vbuf->translate, 0, 1, 0, vbuf->vertex_ptr);
 
       if (0) draw_dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr);
       
@@ -262,7 +262,7 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
 	 src_offset = 0;
 	 break;
       case EMIT_4UB:
-	 output_format = PIPE_FORMAT_B8G8R8A8_UNORM;
+	 output_format = PIPE_FORMAT_A8R8G8B8_UNORM;
 	 emit_sz = 4 * sizeof(ubyte);
          break;
       default:
@@ -271,10 +271,12 @@ vbuf_start_prim( struct vbuf_stage *vbuf, uint prim )
 	 emit_sz = 0;
 	 break;
       }
-      
+
+      hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       hw_key.element[i].input_buffer = src_buffer;
       hw_key.element[i].input_offset = src_offset;
+      hw_key.element[i].instance_divisor = 0;
       hw_key.element[i].output_format = output_format;
       hw_key.element[i].output_offset = dst_offset;
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
index f32cbef983d..3073c870825 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
@@ -59,7 +59,7 @@ static void wideline_line( struct draw_stage *stage,
                            struct prim_header *header )
 {
    /*const struct wideline_stage *wide = wideline_stage(stage);*/
-   const unsigned pos = stage->draw->vs.position_output;
+   const unsigned pos = draw_current_shader_position_output(stage->draw);
    const float half_width = 0.5f * stage->draw->rasterizer->line_width;
 
    struct prim_header tri;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index 7d76a7dbf39..fdabce7d443 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -64,15 +64,14 @@ struct widepoint_stage {
    struct draw_stage stage;
 
    float half_point_size;
-   float point_size_min;
-   float point_size_max;
 
    float xbias;
    float ybias;
 
    uint texcoord_slot[PIPE_MAX_SHADER_OUTPUTS];
-   uint texcoord_mode[PIPE_MAX_SHADER_OUTPUTS];
+   uint texcoord_enable[PIPE_MAX_SHADER_OUTPUTS];
    uint num_texcoords;
+   uint texcoord_mode;
 
    int psize_slot;
 
@@ -98,10 +97,10 @@ static void set_texcoords(const struct widepoint_stage *wide,
 {
    uint i;
    for (i = 0; i < wide->num_texcoords; i++) {
-      if (wide->texcoord_mode[i] != PIPE_SPRITE_COORD_NONE) {
+      if (wide->texcoord_enable[i]) {
          uint j = wide->texcoord_slot[i];
          v->data[j][0] = tc[0];
-         if (wide->texcoord_mode[i] == PIPE_SPRITE_COORD_LOWER_LEFT)
+         if (wide->texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
             v->data[j][1] = 1.0f - tc[1];
          else
             v->data[j][1] = tc[1];
@@ -112,7 +111,7 @@ static void set_texcoords(const struct widepoint_stage *wide,
 
    if (wide->point_coord_fs_input >= 0) {
       /* put gl_PointCoord into the extra vertex slot */
-      uint slot = wide->stage.draw->extra_vp_outputs.slot;
+      uint slot = wide->stage.draw->extra_shader_outputs.slot;
       v->data[slot][0] = tc[0];
       v->data[slot][1] = tc[1];
       v->data[slot][2] = 0.0F;
@@ -129,9 +128,10 @@ static void set_texcoords(const struct widepoint_stage *wide,
 static void widepoint_point( struct draw_stage *stage,
                              struct prim_header *header )
 {
+   /* XXX should take point_quad_rasterization into account? */
    const struct widepoint_stage *wide = widepoint_stage(stage);
-   const unsigned pos = stage->draw->vs.position_output;
-   const boolean sprite = (boolean) stage->draw->rasterizer->point_sprite;
+   const unsigned pos = draw_current_shader_position_output(stage->draw);
+   const boolean sprite = stage->draw->rasterizer->sprite_coord_enable != 0;
    float half_size;
    float left_adj, right_adj, bot_adj, top_adj;
 
@@ -151,13 +151,6 @@ static void widepoint_point( struct draw_stage *stage,
    /* point size is either per-vertex or fixed size */
    if (wide->psize_slot >= 0) {
       half_size = header->v[0]->data[wide->psize_slot][0];
-
-      /* XXX: temporary -- do this in the vertex shader??
-       */
-      half_size = CLAMP(half_size,
-                        wide->point_size_min,
-                        wide->point_size_max);
-      
       half_size *= 0.5f; 
    }
    else {
@@ -222,8 +215,6 @@ static void widepoint_first_point( struct draw_stage *stage,
    struct draw_context *draw = stage->draw;
 
    wide->half_point_size = 0.5f * draw->rasterizer->point_size;
-   wide->point_size_min = draw->rasterizer->point_size_min;
-   wide->point_size_max = draw->rasterizer->point_size_max;
    wide->xbias = 0.0;
    wide->ybias = 0.0;
 
@@ -233,21 +224,22 @@ static void widepoint_first_point( struct draw_stage *stage,
 
    /* XXX we won't know the real size if it's computed by the vertex shader! */
    if ((draw->rasterizer->point_size > draw->pipeline.wide_point_threshold) ||
-       (draw->rasterizer->point_sprite && draw->pipeline.point_sprite)) {
+       (draw->rasterizer->sprite_coord_enable && draw->pipeline.point_sprite)) {
       stage->point = widepoint_point;
    }
    else {
       stage->point = draw_pipe_passthrough_point;
    }
 
-   if (draw->rasterizer->point_sprite) {
+   if (draw->rasterizer->sprite_coord_enable) {
       /* find vertex shader texcoord outputs */
       const struct draw_vertex_shader *vs = draw->vs.vertex_shader;
       uint i, j = 0;
+      wide->texcoord_mode = draw->rasterizer->sprite_coord_mode;
       for (i = 0; i < vs->info.num_outputs; i++) {
          if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) {
             wide->texcoord_slot[j] = i;
-            wide->texcoord_mode[j] = draw->rasterizer->sprite_coord_mode[j];
+            wide->texcoord_enable[j] = (draw->rasterizer->sprite_coord_enable >> j) & 1;
             j++;
          }
       }
@@ -257,13 +249,13 @@ static void widepoint_first_point( struct draw_stage *stage,
       wide->point_coord_fs_input = find_pntc_input_attrib(draw);
 
       /* setup extra vp output (point coord implemented as a texcoord) */
-      draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
-      draw->extra_vp_outputs.semantic_index = 0;
-      draw->extra_vp_outputs.slot = draw->vs.num_vs_outputs;
+      draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
+      draw->extra_shader_outputs.semantic_index = 0;
+      draw->extra_shader_outputs.slot = draw_current_shader_outputs(draw);
    }
    else {
       wide->point_coord_fs_input = -1;
-      draw->extra_vp_outputs.slot = 0;
+      draw->extra_shader_outputs.slot = 0;
    }
 
    wide->psize_slot = -1;
@@ -287,7 +279,7 @@ static void widepoint_flush( struct draw_stage *stage, unsigned flags )
 {
    stage->point = widepoint_first_point;
    stage->next->flush( stage->next, flags );
-   stage->draw->extra_vp_outputs.slot = 0;
+   stage->draw->extra_shader_outputs.slot = 0;
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 41fcb16a0a5..1e6e01af9e2 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -48,8 +48,6 @@
 
 
 struct pipe_context;
-struct gallivm_prog;
-struct gallivm_cpu_engine;
 struct draw_vertex_shader;
 struct draw_context;
 struct draw_stage;
@@ -142,8 +140,6 @@ struct draw_context
 
       /* user-space vertex data, buffers */
       struct {
-         const unsigned *edgeflag;
-
          /** vertex element/index buffer (ex: glDrawElements) */
          const void *elts;
          /** bytes per index (0, 1, 2 or 4) */
@@ -154,8 +150,9 @@ struct draw_context
          /** vertex arrays */
          const void *vbuffer[PIPE_MAX_ATTRIBS];
          
-         /** constant buffer (for vertex shader) */
-         const void *constants;
+         /** constant buffers, one array per stage (vertex / geometry shader) */
+         const void *vs_constants[PIPE_MAX_CONSTANT_BUFFERS];
+         const void *gs_constants[PIPE_MAX_CONSTANT_BUFFERS];
       } user;
 
       boolean test_fse;         /* enable FSE even though its not correct (eg for softpipe) */
@@ -173,6 +170,8 @@ struct draw_context
 
    boolean force_passthrough; /**< never clip or shade */
 
+   boolean dump_vs;
+
    double mrd;  /**< minimum resolvable depth value, for polygon offset */
 
    /* pipe state that we need: */
@@ -184,6 +183,7 @@ struct draw_context
       struct draw_vertex_shader *vertex_shader;
       uint num_vs_outputs;  /**< convenience, from vertex_shader */
       uint position_output;
+      uint edgeflag_output;
 
       /** TGSI program interpreter runtime state */
       struct tgsi_exec_machine *machine;
@@ -191,19 +191,15 @@ struct draw_context
       uint num_samplers;
       struct tgsi_sampler **samplers;
 
-      /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private.
-       */
-      struct gallivm_cpu_engine *engine;   
-
       /* Here's another one:
        */
       struct aos_machine *aos_machine; 
 
 
-      const float (*aligned_constants)[4];
+      const void *aligned_constants[PIPE_MAX_CONSTANT_BUFFERS];
 
-      const float (*aligned_constant_storage)[4];
-      unsigned const_storage_size;
+      const void *aligned_constant_storage[PIPE_MAX_CONSTANT_BUFFERS];
+      unsigned const_storage_size[PIPE_MAX_CONSTANT_BUFFERS];
 
 
       struct translate *fetch;
@@ -212,6 +208,18 @@ struct draw_context
       struct translate_cache *emit_cache;
    } vs;
 
+   struct {
+      struct draw_geometry_shader *geometry_shader;
+      uint num_gs_outputs;  /**< convenience, from geometry_shader */
+      uint position_output;
+
+      /** TGSI program interpreter runtime state */
+      struct tgsi_exec_machine *machine;
+
+      uint num_samplers;
+      struct tgsi_sampler **samplers;
+   } gs;
+
    /* Clip derived state:
     */
    float plane[12][4];
@@ -223,10 +231,12 @@ struct draw_context
       uint semantic_name;
       uint semantic_index;
       int slot;
-   } extra_vp_outputs;
+   } extra_shader_outputs;
 
    unsigned reduced_prim;
 
+   unsigned instance_id;
+
    void *driver_private;
 };
 
@@ -240,12 +250,32 @@ void draw_vs_destroy( struct draw_context *draw );
 void draw_vs_set_viewport( struct draw_context *, 
                            const struct pipe_viewport_state * );
 
-void draw_vs_set_constants( struct draw_context *,
-                            const float (*constants)[4],
-                            unsigned size );
+void
+draw_vs_set_constants(struct draw_context *,
+                      unsigned slot,
+                      const void *constants,
+                      unsigned size);
+
+
 
+/*******************************************************************************
+ * Geometry shading code:
+ */
+boolean draw_gs_init( struct draw_context *draw );
+
+void
+draw_gs_set_constants(struct draw_context *,
+                      unsigned slot,
+                      const void *constants,
+                      unsigned size);
 
+void draw_gs_destroy( struct draw_context *draw );
 
+/*******************************************************************************
+ * Common shading code:
+ */
+uint draw_current_shader_outputs(const struct draw_context *draw);
+uint draw_current_shader_position_output(const struct draw_context *draw);
 
 /*******************************************************************************
  * Vertex processing (was passthrough) code:
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 4865a2d8542..6d90a6c42fd 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -33,7 +33,6 @@
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
-#include "draw/draw_vs.h"
 #include "tgsi/tgsi_dump.h"
 #include "util/u_math.h"
 #include "util/u_prim.h"
@@ -88,9 +87,7 @@ draw_pt_arrays(struct draw_context *draw,
          opt |= PT_CLIPTEST;
       }
       
-      if (!draw->rasterizer->bypass_vs_clip_and_viewport) {
-         opt |= PT_SHADE;
-      }
+      opt |= PT_SHADE;
    }
       
    if (opt == 0) 
@@ -280,25 +277,38 @@ void
 draw_arrays(struct draw_context *draw, unsigned prim,
             unsigned start, unsigned count)
 {
-   unsigned reduced_prim = u_reduced_prim(prim);
+   draw_arrays_instanced(draw, prim, start, count, 0, 1);
+}
+
+void
+draw_arrays_instanced(struct draw_context *draw,
+                      unsigned mode,
+                      unsigned start,
+                      unsigned count,
+                      unsigned startInstance,
+                      unsigned instanceCount)
+{
+   unsigned reduced_prim = u_reduced_prim(mode);
+   unsigned instance;
+
    if (reduced_prim != draw->reduced_prim) {
-      draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+      draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE);
       draw->reduced_prim = reduced_prim;
    }
 
    if (0)
-      draw_print_arrays(draw, prim, start, MIN2(count, 20));
+      draw_print_arrays(draw, mode, start, MIN2(count, 20));
 
 #if 0
    {
       int i;
-      debug_printf("draw_arrays(prim=%u start=%u count=%u):\n",
-                   prim, start, count);
+      debug_printf("draw_arrays(mode=%u start=%u count=%u):\n",
+                   mode, start, count);
       tgsi_dump(draw->vs.vertex_shader->state.tokens, 0);
       debug_printf("Elements:\n");
       for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
          debug_printf("  format=%s comps=%u\n",
-                      pf_name(draw->pt.vertex_element[i].src_format),
+                      util_format_name(draw->pt.vertex_element[i].src_format),
                       draw->pt.vertex_element[i].nr_components);
       }
       debug_printf("Buffers:\n");
@@ -311,15 +321,8 @@ draw_arrays(struct draw_context *draw, unsigned prim,
    }
 #endif
 
-   /* drawing done here: */
-   draw_pt_arrays(draw, prim, start, count);
-}
-
-boolean draw_pt_get_edgeflag( struct draw_context *draw,
-                              unsigned idx )
-{
-   if (draw->pt.user.edgeflag)
-      return (draw->pt.user.edgeflag[idx/32] & (1 << (idx%32))) != 0;
-   else
-      return 1;
+   for (instance = 0; instance < instanceCount; instance++) {
+      draw->instance_id = instance + startInstance;
+      draw_pt_arrays(draw, mode, start, count);
+   }
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index 7a17a9fb6b2..d5e0d92a605 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -149,11 +149,6 @@ struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw );
 struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit(struct draw_context *draw);
 
 
-/* More helpers:
- */
-boolean draw_pt_get_edgeflag( struct draw_context *draw,
-                              unsigned idx );
-
 
 /*******************************************************************************
  * HW vertex emit:
@@ -188,7 +183,8 @@ struct pt_emit *draw_pt_emit_create( struct draw_context *draw );
 struct pt_fetch;
 void draw_pt_fetch_prepare( struct pt_fetch *fetch,
                             unsigned vertex_input_count,
-			    unsigned vertex_size );
+                            unsigned vertex_size,
+                            unsigned instance_id_index );
 
 void draw_pt_fetch_run( struct pt_fetch *fetch,
 			const unsigned *elts,
@@ -217,7 +213,8 @@ boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
 void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
 			      boolean bypass_clipping,
 			      boolean bypass_viewport,
-			      boolean opengl );
+			      boolean opengl,
+			      boolean need_edgeflags );
 
 struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw );
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 064e16c295c..ae357b51226 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -112,7 +112,7 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
 	 src_offset = 0;
 	 break;
       case EMIT_4UB:
-	 output_format = PIPE_FORMAT_B8G8R8A8_UNORM;
+	 output_format = PIPE_FORMAT_A8R8G8B8_UNORM;
 	 emit_sz = 4 * sizeof(ubyte);
          break;
       default:
@@ -121,10 +121,12 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
 	 emit_sz = 0;
 	 break;
       }
-      
+
+      hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       hw_key.element[i].input_buffer = src_buffer;
       hw_key.element[i].input_offset = src_offset;
+      hw_key.element[i].instance_divisor = 0;
       hw_key.element[i].output_format = output_format;
       hw_key.element[i].output_offset = dst_offset;
 
@@ -204,6 +206,7 @@ void draw_pt_emit( struct pt_emit *emit,
    translate->run( translate,
 		   0, 
 		   vertex_count,
+                   draw->instance_id,
 		   hw_verts );
 
    render->unmap_vertices( render, 
@@ -263,6 +266,7 @@ void draw_pt_emit_linear(struct pt_emit *emit,
    translate->run(translate,
                   0,
                   count,
+                  draw->instance_id,
                   hw_verts);
 
    if (0) {
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index 65c3a34c347..252be5053e4 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -30,7 +30,6 @@
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
-#include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
@@ -42,11 +41,11 @@ struct pt_fetch {
    struct translate *translate;
 
    unsigned vertex_size;
-   boolean need_edgeflags;
 
    struct translate_cache *cache;
 };
 
+
 /* Perform the fetch from API vertex elements & vertex buffers, to a
  * contiguous set of float[4] attributes as required for the
  * vertex_shader->run_linear() method.
@@ -58,12 +57,14 @@ struct pt_fetch {
  */
 void draw_pt_fetch_prepare( struct pt_fetch *fetch,
                             unsigned vs_input_count,
-			    unsigned vertex_size )
+                            unsigned vertex_size,
+                            unsigned instance_id_index )
 {
    struct draw_context *draw = fetch->draw;
    unsigned nr_inputs;
-   unsigned i, nr = 0;
+   unsigned i, nr = 0, ei = 0;
    unsigned dst_offset = 0;
+   unsigned num_extra_inputs = 0;
    struct translate_key key;
 
    fetch->vertex_size = vertex_size;
@@ -78,9 +79,11 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
    {
       /* Need to set header->vertex_id = 0xffff somehow.
        */
+      key.element[nr].type = TRANSLATE_ELEMENT_NORMAL;
       key.element[nr].input_format = PIPE_FORMAT_R32_FLOAT;
       key.element[nr].input_buffer = draw->pt.nr_vertex_buffers;
       key.element[nr].input_offset = 0;
+      key.element[nr].instance_divisor = 0;
       key.element[nr].output_format = PIPE_FORMAT_R32_FLOAT;
       key.element[nr].output_offset = dst_offset;
       dst_offset += 1 * sizeof(float);
@@ -91,19 +94,36 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
        */
       dst_offset += 4 * sizeof(float);
    }
-      
-   assert( draw->pt.nr_vertex_elements >= vs_input_count );
 
-   nr_inputs = MIN2( vs_input_count, draw->pt.nr_vertex_elements );
+   if (instance_id_index != ~0) {
+      num_extra_inputs++;
+   }
+
+   assert(draw->pt.nr_vertex_elements + num_extra_inputs >= vs_input_count);
+
+   nr_inputs = MIN2(vs_input_count, draw->pt.nr_vertex_elements + num_extra_inputs);
 
    for (i = 0; i < nr_inputs; i++) {
-      key.element[nr].input_format = draw->pt.vertex_element[i].src_format;
-      key.element[nr].input_buffer = draw->pt.vertex_element[i].vertex_buffer_index;
-      key.element[nr].input_offset = draw->pt.vertex_element[i].src_offset;
-      key.element[nr].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-      key.element[nr].output_offset = dst_offset;
+      if (i == instance_id_index) {
+         key.element[nr].type = TRANSLATE_ELEMENT_INSTANCE_ID;
+         key.element[nr].input_format = PIPE_FORMAT_R32_USCALED;
+         key.element[nr].output_format = PIPE_FORMAT_R32_USCALED;
+         key.element[nr].output_offset = dst_offset;
+
+         dst_offset += sizeof(uint);
+      } else {
+         key.element[nr].type = TRANSLATE_ELEMENT_NORMAL;
+         key.element[nr].input_format = draw->pt.vertex_element[ei].src_format;
+         key.element[nr].input_buffer = draw->pt.vertex_element[ei].vertex_buffer_index;
+         key.element[nr].input_offset = draw->pt.vertex_element[ei].src_offset;
+         key.element[nr].instance_divisor = draw->pt.vertex_element[ei].instance_divisor;
+         key.element[nr].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+         key.element[nr].output_offset = dst_offset;
+
+         ei++;
+         dst_offset += 4 * sizeof(float);
+      }
 
-      dst_offset += 4 * sizeof(float);
       nr++;
    }
 
@@ -120,7 +140,12 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
       fetch->translate = translate_cache_find(fetch->cache, &key);
 
       {
-         static struct vertex_header vh = { 0, 1, 0, UNDEFINED_VERTEX_ID, { .0f, .0f, .0f, .0f } };
+         static struct vertex_header vh = { 0,
+                                            1,
+                                            0,
+                                            UNDEFINED_VERTEX_ID,
+                                            { .0f, .0f, .0f, .0f } };
+
 	 fetch->translate->set_buffer(fetch->translate,
 				      draw->pt.nr_vertex_buffers,
 				      &vh,
@@ -128,9 +153,6 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch,
       }
    }
 
-   fetch->need_edgeflags = ((draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
-                             draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) &&
-                            draw->pt.user.edgeflag);
 }
 
 
@@ -156,19 +178,9 @@ void draw_pt_fetch_run( struct pt_fetch *fetch,
    translate->run_elts( translate,
 			elts, 
 			count,
+                        draw->instance_id,
 			verts );
 
-   /* Edgeflags are hard to fit into a translate program, populate
-    * them separately if required.  In the setup above they are
-    * defaulted to one, so only need this if there is reason to change
-    * that default:
-    */
-   if (fetch->need_edgeflags) {
-      for (i = 0; i < count; i++) {
-         struct vertex_header *vh = (struct vertex_header *)(verts + i * fetch->vertex_size);
-         vh->edgeflag = draw_pt_get_edgeflag( draw, elts[i] );
-      }
-   }
 }
 
 
@@ -192,19 +204,8 @@ void draw_pt_fetch_run_linear( struct pt_fetch *fetch,
    translate->run( translate,
                    start,
                    count,
+                   draw->instance_id,
                    verts );
-
-   /* Edgeflags are hard to fit into a translate program, populate
-    * them separately if required.  In the setup above they are
-    * defaulted to one, so only need this if there is reason to change
-    * that default:
-    */
-   if (fetch->need_edgeflags) {
-      for (i = 0; i < count; i++) {
-         struct vertex_header *vh = (struct vertex_header *)(verts + i * fetch->vertex_size);
-         vh->edgeflag = draw_pt_get_edgeflag( draw, start + i );
-      }
-   }
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index e7fe6b3b768..2a604470e9a 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -166,9 +166,11 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
 	 continue;
       }
 
+      key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       key.element[i].input_format = input_format;
       key.element[i].input_buffer = input_buffer;
       key.element[i].input_offset = input_offset;
+      key.element[i].instance_divisor = src->instance_divisor;
       key.element[i].output_format = output_format;
       key.element[i].output_offset = dst_offset;
       
@@ -256,6 +258,7 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
    feme->translate->run_elts( feme->translate, 
 			      fetch_elts,
 			      fetch_count,
+                              draw->instance_id,
 			      hw_verts );
 
    if (0) {
@@ -314,6 +317,7 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
    feme->translate->run( feme->translate,
                          start,
                          count,
+                         draw->instance_id,
                          hw_verts );
 
    if (0) {
@@ -374,6 +378,7 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
    feme->translate->run( feme->translate,
                          start,
                          count,
+                         draw->instance_id,
                          hw_verts );
 
    draw->render->unmap_vertices( draw->render, 0, (ushort)(count - 1) );
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index 734c05f0688..1aecb510777 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -40,7 +40,6 @@
 #include "draw/draw_pt.h"
 #include "draw/draw_vs.h"
 
-#include "translate/translate.h"
 
 struct fetch_shade_emit;
 
@@ -101,8 +100,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    fse->key.nr_elements = MAX2(fse->key.nr_outputs,     /* outputs - translate to hw format */
                                fse->key.nr_inputs);     /* inputs - fetch from api format */
 
-   fse->key.viewport = (!draw->rasterizer->bypass_vs_clip_and_viewport &&
-                        !draw->identity_viewport);
+   fse->key.viewport = !draw->identity_viewport;
    fse->key.clip = !draw->bypass_clipping;
    fse->key.const_vbuffers = 0;
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index df6c265b7ec..da5106463a7 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -32,7 +32,7 @@
 #include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
 #include "draw/draw_vs.h"
-#include "translate/translate.h"
+#include "draw/draw_gs.h"
 
 
 struct fetch_pipeline_middle_end {
@@ -58,6 +58,8 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
    struct draw_vertex_shader *vs = draw->vs.vertex_shader;
+   unsigned i;
+   unsigned instance_id_index = ~0;
 
    /* Add one to num_outputs because the pipeline occasionally tags on
     * an additional texcoord, eg for AA lines.
@@ -65,6 +67,15 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
    unsigned nr = MAX2( vs->info.num_inputs,
 		       vs->info.num_outputs + 1 );
 
+   /* Scan for instanceID system value.
+    */
+   for (i = 0; i < vs->info.num_inputs; i++) {
+      if (vs->info.input_semantic_name[i] == TGSI_SEMANTIC_INSTANCEID) {
+         instance_id_index = i;
+         break;
+      }
+   }
+
    fpme->prim = prim;
    fpme->opt = opt;
 
@@ -78,16 +89,16 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
 
    draw_pt_fetch_prepare( fpme->fetch, 
                           vs->info.num_inputs,
-			  fpme->vertex_size );
+                          fpme->vertex_size,
+                          instance_id_index );
    /* XXX: it's not really gl rasterization rules we care about here,
     * but gl vs dx9 clip spaces.
     */
    draw_pt_post_vs_prepare( fpme->post_vs,
 			    (boolean)draw->bypass_clipping,
-			    (boolean)(draw->identity_viewport ||
-                                      draw->rasterizer->bypass_vs_clip_and_viewport),
-			    (boolean)draw->rasterizer->gl_rasterization_rules );
-			    
+			    (boolean)draw->identity_viewport,
+			    (boolean)draw->rasterizer->gl_rasterization_rules,
+			    (draw->vs.edgeflag_output ? true : false) );    
 
    if (!(opt & PT_PIPELINE)) {
       draw_pt_emit_prepare( fpme->emit, 
@@ -119,7 +130,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *shader = draw->vs.vertex_shader;
+   struct draw_vertex_shader *vshader = draw->vs.vertex_shader;
+   struct draw_geometry_shader *gshader = draw->gs.geometry_shader;
    unsigned opt = fpme->opt;
    unsigned alloc_count = align( fetch_count, 4 );
 
@@ -141,19 +153,25 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
 		      (char *)pipeline_verts );
 
    /* Run the shader, note that this overwrites the data[] parts of
-    * the pipeline verts.  If there is no shader, eg if
-    * bypass_vs_clip_and_viewport, then the inputs == outputs, and are
-    * already in the correct place.
+    * the pipeline verts.
     */
    if (opt & PT_SHADE)
    {
-      shader->run_linear(shader, 
-			 (const float (*)[4])pipeline_verts->data,
-			 (      float (*)[4])pipeline_verts->data,
-			 (const float (*)[4])draw->pt.user.constants,
-			 fetch_count,
-			 fpme->vertex_size,
-			 fpme->vertex_size);
+      vshader->run_linear(vshader,
+                          (const float (*)[4])pipeline_verts->data,
+                          (      float (*)[4])pipeline_verts->data,
+                          draw->pt.user.vs_constants,
+                          fetch_count,
+                          fpme->vertex_size,
+                          fpme->vertex_size);
+      if (gshader)
+         draw_geometry_shader_run(gshader,
+                                  (const float (*)[4])pipeline_verts->data,
+                                  (      float (*)[4])pipeline_verts->data,
+                                  draw->pt.user.gs_constants,
+                                  fetch_count,
+                                  fpme->vertex_size,
+                                  fpme->vertex_size);
    }
 
    if (draw_pt_post_vs_run( fpme->post_vs,
@@ -196,6 +214,7 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
    struct draw_vertex_shader *shader = draw->vs.vertex_shader;
+   struct draw_geometry_shader *geometry_shader = draw->gs.geometry_shader;
    unsigned opt = fpme->opt;
    unsigned alloc_count = align( count, 4 );
 
@@ -217,19 +236,26 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
                              (char *)pipeline_verts );
 
    /* Run the shader, note that this overwrites the data[] parts of
-    * the pipeline verts.  If there is no shader, ie if
-    * bypass_vs_clip_and_viewport, then the inputs == outputs, and are
-    * already in the correct place.
+    * the pipeline verts.
     */
    if (opt & PT_SHADE)
    {
       shader->run_linear(shader,
 			 (const float (*)[4])pipeline_verts->data,
 			 (      float (*)[4])pipeline_verts->data,
-			 (const float (*)[4])draw->pt.user.constants,
+                         draw->pt.user.vs_constants,
 			 count,
 			 fpme->vertex_size,
 			 fpme->vertex_size);
+
+      if (geometry_shader)
+         draw_geometry_shader_run(geometry_shader,
+                                  (const float (*)[4])pipeline_verts->data,
+                                  (      float (*)[4])pipeline_verts->data,
+                                  draw->pt.user.gs_constants,
+                                  count,
+                                  fpme->vertex_size,
+                                  fpme->vertex_size);
    }
 
    if (draw_pt_post_vs_run( fpme->post_vs,
@@ -270,6 +296,7 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
    struct draw_vertex_shader *shader = draw->vs.vertex_shader;
+   struct draw_geometry_shader *geometry_shader = draw->gs.geometry_shader;
    unsigned opt = fpme->opt;
    unsigned alloc_count = align( count, 4 );
 
@@ -287,19 +314,26 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle
                              (char *)pipeline_verts );
 
    /* Run the shader, note that this overwrites the data[] parts of
-    * the pipeline verts.  If there is no shader, ie if
-    * bypass_vs_clip_and_viewport, then the inputs == outputs, and are
-    * already in the correct place.
+    * the pipeline verts.
     */
    if (opt & PT_SHADE)
    {
       shader->run_linear(shader,
 			 (const float (*)[4])pipeline_verts->data,
 			 (      float (*)[4])pipeline_verts->data,
-			 (const float (*)[4])draw->pt.user.constants,
+                         draw->pt.user.vs_constants,
 			 count,
 			 fpme->vertex_size,
 			 fpme->vertex_size);
+
+      if (geometry_shader)
+         draw_geometry_shader_run(geometry_shader,
+                                  (const float (*)[4])pipeline_verts->data,
+                                  (      float (*)[4])pipeline_verts->data,
+                                  draw->pt.user.gs_constants,
+                                  count,
+                                  fpme->vertex_size,
+                                  fpme->vertex_size);
    }
 
    if (draw_pt_post_vs_run( fpme->post_vs,
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index 6c1cb48e8b8..9728d5c2bdf 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -30,7 +30,6 @@
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
-#include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
 
 struct pt_post_vs {
@@ -100,7 +99,7 @@ static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
    struct vertex_header *out = vertices;
    const float *scale = pvs->draw->viewport.scale;
    const float *trans = pvs->draw->viewport.translate;
-   const unsigned pos = pvs->draw->vs.position_output;
+   const unsigned pos = draw_current_shader_position_output(pvs->draw);
    unsigned clipped = 0;
    unsigned j;
 
@@ -147,6 +146,39 @@ static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
 
 
 
+/* As above plus edgeflags
+ */
+static boolean 
+post_vs_cliptest_viewport_gl_edgeflag(struct pt_post_vs *pvs,
+                                      struct vertex_header *vertices,
+                                      unsigned count,
+                                      unsigned stride )
+{
+   unsigned j;
+   boolean needpipe;
+
+   needpipe = post_vs_cliptest_viewport_gl( pvs, vertices, count, stride);
+
+   /* If present, copy edgeflag VS output into vertex header.
+    * Otherwise, leave header as is.
+    */
+   if (pvs->draw->vs.edgeflag_output) {
+      struct vertex_header *out = vertices;
+      int ef = pvs->draw->vs.edgeflag_output;
+
+      for (j = 0; j < count; j++) {
+         const float *edgeflag = out->data[ef];
+         out->edgeflag = !(edgeflag[0] != 1.0f);
+         needpipe |= !out->edgeflag;
+         out = (struct vertex_header *)( (char *)out + stride );
+      }
+   }
+   return needpipe;
+}
+
+
+
+
 /* If bypass_clipping is set, skip cliptest and rhw divide.
  */
 static boolean post_vs_viewport( struct pt_post_vs *pvs,
@@ -157,7 +189,7 @@ static boolean post_vs_viewport( struct pt_post_vs *pvs,
    struct vertex_header *out = vertices;
    const float *scale = pvs->draw->viewport.scale;
    const float *trans = pvs->draw->viewport.translate;
-   const unsigned pos = pvs->draw->vs.position_output;
+   const unsigned pos = draw_current_shader_position_output(pvs->draw);
    unsigned j;
 
    if (0) debug_printf("%s\n", __FUNCTION__);
@@ -201,17 +233,29 @@ boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
 void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
 			      boolean bypass_clipping,
 			      boolean bypass_viewport,
-			      boolean opengl )
+			      boolean opengl,
+			      boolean need_edgeflags )
 {
-   if (bypass_clipping) {
-      if (bypass_viewport)
-	 pvs->run = post_vs_none;
-      else
-	 pvs->run = post_vs_viewport;
+   if (!need_edgeflags) {
+      if (bypass_clipping) {
+         if (bypass_viewport)
+            pvs->run = post_vs_none;
+         else
+            pvs->run = post_vs_viewport;
+      }
+      else {
+         /* if (opengl) */
+         pvs->run = post_vs_cliptest_viewport_gl;
+      }
    }
    else {
-      /* if (opengl) */
-      pvs->run = post_vs_cliptest_viewport_gl;
+      /* If we need to copy edgeflags to the vertex header, it should
+       * mean we're running the primitive pipeline.  Hence the bypass
+       * flags should be false.
+       */
+      assert(!bypass_clipping);
+      assert(!bypass_viewport);
+      pvs->run = post_vs_cliptest_viewport_gl_edgeflag;
    }
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c
index b61fa291436..3236d38e6ab 100644
--- a/src/gallium/auxiliary/draw/draw_pt_util.c
+++ b/src/gallium/auxiliary/draw/draw_pt_util.c
@@ -33,6 +33,7 @@
 #include "draw/draw_context.h"
 #include "draw/draw_private.h"
 #include "draw/draw_pt.h"
+#include "util/u_debug.h"
 
 void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
 {
@@ -50,16 +51,32 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       *first = 2;
       *incr = 1;
       break;
+   case PIPE_PRIM_LINES_ADJACENCY:
+      *first = 4;
+      *incr = 2;
+      break;
+   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      *first = 4;
+      *incr = 1;
+      break;
    case PIPE_PRIM_TRIANGLES:
       *first = 3;
       *incr = 3;
       break;
+   case PIPE_PRIM_TRIANGLES_ADJACENCY:
+      *first = 6;
+      *incr = 3;
+      break;
    case PIPE_PRIM_TRIANGLE_STRIP:
    case PIPE_PRIM_TRIANGLE_FAN:
    case PIPE_PRIM_POLYGON:
       *first = 3;
       *incr = 1;
       break;
+   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      *first = 6;
+      *incr = 1;
+      break;
    case PIPE_PRIM_QUADS:
       *first = 4;
       *incr = 4;
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
index 010c7a18a7c..f0aec5febab 100644
--- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
+++ b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
@@ -36,6 +36,10 @@ static void FUNC(struct draw_pt_front_end *frontend,
    case PIPE_PRIM_TRIANGLE_STRIP:
    case PIPE_PRIM_QUADS:
    case PIPE_PRIM_QUAD_STRIP:
+   case PIPE_PRIM_LINES_ADJACENCY:
+   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+   case PIPE_PRIM_TRIANGLES_ADJACENCY:
+   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
       for (j = 0; j < count;) {
          unsigned remaining = count - j;
          unsigned nr = trim( MIN2(varray->driver_fetch_max, remaining), first, incr );
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index 554f4ac3c18..8c3c7befbc7 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -39,7 +39,9 @@
 #define DRAW_VERTEX_H
 
 
+#include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
+#include "util/u_debug.h"
 
 
 /**
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 790e89ed820..90858380221 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -43,29 +43,33 @@
 #include "translate/translate.h"
 #include "translate/translate_cache.h"
 
+#include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_exec.h"
 
 
 
-
-void draw_vs_set_constants( struct draw_context *draw,
-                            const float (*constants)[4],
-                            unsigned size )
+void
+draw_vs_set_constants(struct draw_context *draw,
+                      unsigned slot,
+                      const void *constants,
+                      unsigned size)
 {
    if (((uintptr_t)constants) & 0xf) {
-      if (size > draw->vs.const_storage_size) {
-         if (draw->vs.aligned_constant_storage)
-            align_free((void *)draw->vs.aligned_constant_storage);
-         draw->vs.aligned_constant_storage = align_malloc( size, 16 );
+      if (size > draw->vs.const_storage_size[slot]) {
+         if (draw->vs.aligned_constant_storage[slot]) {
+            align_free((void *)draw->vs.aligned_constant_storage[slot]);
+         }
+         draw->vs.aligned_constant_storage[slot] = align_malloc(size, 16);
       }
-      memcpy( (void*)draw->vs.aligned_constant_storage,
-              constants, 
-              size );
-      constants = draw->vs.aligned_constant_storage;
+      assert(constants);
+      memcpy((void *)draw->vs.aligned_constant_storage[slot],
+             constants,
+             size);
+      constants = draw->vs.aligned_constant_storage[slot];
    }
-      
-   draw->vs.aligned_constants = constants;
-   draw_vs_aos_machine_constants( draw->vs.aos_machine, constants );
+
+   draw->vs.aligned_constants[slot] = constants;
+   draw_vs_aos_machine_constants(draw->vs.aos_machine, slot, constants);
 }
 
 
@@ -83,6 +87,10 @@ draw_create_vertex_shader(struct draw_context *draw,
 {
    struct draw_vertex_shader *vs;
 
+   if (draw->dump_vs) {
+      tgsi_dump(shader->tokens, 0);
+   }
+
    vs = draw_create_vs_llvm( draw, shader );
    if (!vs) {
       vs = draw_create_vs_sse( draw, shader );
@@ -101,6 +109,9 @@ draw_create_vertex_shader(struct draw_context *draw,
          if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_POSITION &&
              vs->info.output_semantic_index[i] == 0)
             vs->position_output = i;
+         else if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_EDGEFLAG &&
+             vs->info.output_semantic_index[i] == 0)
+            vs->edgeflag_output = i;
       }
    }
 
@@ -120,6 +131,7 @@ draw_bind_vertex_shader(struct draw_context *draw,
       draw->vs.vertex_shader = dvs;
       draw->vs.num_vs_outputs = dvs->info.num_outputs;
       draw->vs.position_output = dvs->position_output;
+      draw->vs.edgeflag_output = dvs->edgeflag_output;
       dvs->prepare( dvs, draw );
    }
    else {
@@ -148,6 +160,8 @@ draw_delete_vertex_shader(struct draw_context *draw,
 boolean 
 draw_vs_init( struct draw_context *draw )
 {
+   draw->dump_vs = debug_get_bool_option("GALLIUM_DUMP_VS", FALSE);
+
    draw->vs.machine = tgsi_exec_machine_create();
    if (!draw->vs.machine)
       return FALSE;
@@ -172,6 +186,8 @@ draw_vs_init( struct draw_context *draw )
 void
 draw_vs_destroy( struct draw_context *draw )
 {
+   uint i;
+
    if (draw->vs.fetch_cache)
       translate_cache_destroy(draw->vs.fetch_cache);
 
@@ -181,8 +197,11 @@ draw_vs_destroy( struct draw_context *draw )
    if (draw->vs.aos_machine)
       draw_vs_aos_machine_destroy(draw->vs.aos_machine);
 
-   if (draw->vs.aligned_constant_storage)
-      align_free((void*)draw->vs.aligned_constant_storage);
+   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+      if (draw->vs.aligned_constant_storage[i]) {
+         align_free((void *)draw->vs.aligned_constant_storage[i]);
+      }
+   }
 
    tgsi_exec_machine_destroy(draw->vs.machine);
 }
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 89ae158751a..d095c9bad1d 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -43,6 +43,7 @@ struct draw_varient_input
    enum pipe_format format;
    unsigned buffer;
    unsigned offset; 
+   unsigned instance_divisor;
 };
 
 struct draw_varient_output
@@ -107,6 +108,7 @@ struct draw_vertex_shader {
 
    struct tgsi_shader_info info;
    unsigned position_output;
+   unsigned edgeflag_output;
 
    /* Extracted from shader:
     */
@@ -130,7 +132,7 @@ struct draw_vertex_shader {
    void (*run_linear)( struct draw_vertex_shader *shader,
 		       const float (*input)[4],
 		       float (*output)[4],
-		       const float (*constants)[4],
+                      const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
 		       unsigned count,
 		       unsigned input_stride,
 		       unsigned output_stride );
@@ -210,8 +212,10 @@ static INLINE int draw_vs_varient_key_compare( const struct draw_vs_varient_key
 struct aos_machine *draw_vs_aos_machine( void );
 void draw_vs_aos_machine_destroy( struct aos_machine *machine );
 
-void draw_vs_aos_machine_constants( struct aos_machine *machine,
-                                    const float (*constants)[4] );
+void
+draw_vs_aos_machine_constants(struct aos_machine *machine,
+                              unsigned slot,
+                              const void *constants);
 
 void draw_vs_aos_machine_viewport( struct aos_machine *machine,
                                    const struct pipe_viewport_state *viewport );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 1aaae4ab7a4..e7121f36541 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -2114,11 +2114,14 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
+   unsigned i;
 
    if (0) debug_printf("%s %d\n", __FUNCTION__, count);
 
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
-   machine->constants = vaos->draw->vs.aligned_constants;
+   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+      machine->constants[i] = vaos->draw->vs.aligned_constants[i];
+   }
    machine->immediates = vaos->base.vs->immediates;
    machine->buffer = vaos->buffer;
 
@@ -2135,12 +2138,15 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
+   unsigned i;
 
    if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, 
                        vaos->base.key.const_vbuffers);
 
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
-   machine->constants = vaos->draw->vs.aligned_constants;
+   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+      machine->constants[i] = vaos->draw->vs.aligned_constants[i];
+   }
    machine->immediates = vaos->base.vs->immediates;
    machine->buffer = vaos->buffer;
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 2cf72ddf7b1..1911242f825 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -122,7 +122,7 @@ struct aos_machine {
    ushort fpucntl;              /* one of FPU_* above */
 
    const float (*immediates)[4];     /* points to shader data */
-   const float (*constants)[4];      /* points to draw data */
+   const void *constants[PIPE_MAX_CONSTANT_BUFFERS]; /* points to draw data */
 
    const struct aos_buffer *buffer; /* points to ? */
 };
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index a6eb37d1280..ece1ddde0cb 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -191,7 +191,7 @@ static boolean load_input( struct aos_compilation *cp,
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
       emit_load_R32G32B32A32(cp, dataXMM, src);
       break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       emit_load_R8G8B8A8_UNORM(cp, dataXMM, src);
       emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
       break;
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
index 3240e3745dd..0eda414ee6a 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
@@ -219,10 +219,12 @@ static void PIPE_CDECL populate_lut( struct aos_machine *machine,
 }
 
 
-void draw_vs_aos_machine_constants( struct aos_machine *machine,
-                                    const float (*constants)[4] )
+void
+draw_vs_aos_machine_constants(struct aos_machine *machine,
+                              unsigned slot,
+                              const void *constants)
 {
-   machine->constants = constants;
+   machine->constants[slot] = constants;
 
    {
       unsigned i;
@@ -307,8 +309,10 @@ void draw_vs_aos_machine_viewport( struct aos_machine *machine,
 {
 }
 
-void draw_vs_aos_machine_constants( struct aos_machine *machine,
-                                    const float (*constants)[4] )
+void
+draw_vs_aos_machine_constants(struct aos_machine *machine,
+                              unsigned slot,
+                              const void *constants)
 {
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 41cc8026131..7deca2b69d9 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -85,7 +85,7 @@ static void
 vs_exec_run_linear( struct draw_vertex_shader *shader,
 		    const float (*input)[4],
 		    float (*output)[4],
-		    const float (*constants)[4],
+                   const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
 		    unsigned count,
 		    unsigned input_stride,
 		    unsigned output_stride )
@@ -95,7 +95,9 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
    unsigned int i, j;
    unsigned slot;
 
-   machine->Consts = constants;
+   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+      machine->Consts[i] = constants[i];
+   }
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
       unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index b3535c0e48e..5f7a645f5d8 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -42,11 +42,8 @@
 
 #ifdef MESA_LLVM
 
-#include "gallivm/gallivm.h"
-
 struct draw_llvm_vertex_shader {
    struct draw_vertex_shader base;
-   struct gallivm_prog *llvm_prog;
    struct tgsi_exec_machine *machine;
 };
 
@@ -58,23 +55,17 @@ vs_llvm_prepare( struct draw_vertex_shader *base,
 }
 
 
-
-
 static void
 vs_llvm_run_linear( struct draw_vertex_shader *base,
 		   const float (*input)[4],
 		   float (*output)[4],
-		   const float (*constants)[4],
+                   const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
 		   unsigned count,
 		   unsigned input_stride,
 		   unsigned output_stride )
 {
    struct draw_llvm_vertex_shader *shader =
       (struct draw_llvm_vertex_shader *)base;
-
-   gallivm_cpu_vs_exec(shader->llvm_prog, shader->machine,
-                       input, base->info.num_inputs, output, base->info.num_outputs,
-                       constants, count, input_stride, output_stride);
 }
 
 
@@ -121,27 +112,6 @@ draw_create_vs_llvm(struct draw_context *draw,
    vs->base.delete = vs_llvm_delete;
    vs->machine = draw->vs.machine;
 
-   {
-      struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS);
-      gallivm_ir_set_layout(ir, GALLIVM_SOA);
-      gallivm_ir_set_components(ir, 4);
-      gallivm_ir_fill_from_tgsi(ir, vs->base.state.tokens);
-      vs->llvm_prog = gallivm_ir_compile(ir);
-      gallivm_ir_delete(ir);
-   }
-
-   draw->vs.engine = gallivm_global_cpu_engine();
-
-   /* XXX: Why are there two versions of this?  Shouldn't creating the
-    *      engine be a separate operation to compiling a shader?
-    */
-   if (!draw->vs.engine) {
-      draw->vs.engine = gallivm_cpu_engine_create(vs->llvm_prog);
-   }
-   else {
-      gallivm_cpu_jit_compile(draw->vs.engine, vs->llvm_prog);
-   }
-
    return &vs->base;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index ad184bd696d..d869eecec5e 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -85,7 +85,7 @@ static void
 vs_ppc_run_linear( struct draw_vertex_shader *base,
 		   const float (*input)[4],
 		   float (*output)[4],
-		   const float (*constants)[4],
+                  const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
 		   unsigned count,
 		   unsigned input_stride,
 		   unsigned output_stride )
@@ -98,9 +98,9 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
    /* loop over verts */
    for (i = 0; i < count; i += MAX_VERTICES) {
       const uint max_vertices = MIN2(MAX_VERTICES, count - i);
-      float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB;
-      float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB;
-      float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB;
+      PIPE_ALIGN_VAR(16) float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4];
+      PIPE_ALIGN_VAR(16) float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4];
+      PIPE_ALIGN_VAR(16) float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4];
       uint attr;
 
       /* convert (up to) four input verts to SoA format */
@@ -125,7 +125,7 @@ vs_ppc_run_linear( struct draw_vertex_shader *base,
        */
       shader->func(inputs_soa, outputs_soa, temps_soa,
 		   (float (*)[4]) shader->base.immediates,
-		   (float (*)[4]) constants,
+                   (const float (*)[4])constants[0],
                    ppc_builtin_constants);
 
       /* convert (up to) four output verts from SoA back to AoS format */
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 702051387ac..54e6423388f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -83,7 +83,7 @@ static void
 vs_sse_run_linear( struct draw_vertex_shader *base,
 		   const float (*input)[4],
 		   float (*output)[4],
-		   const float (*constants)[4],
+                  const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
 		   unsigned count,
 		   unsigned input_stride,
 		   unsigned output_stride )
@@ -112,7 +112,7 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
       /* run compiled shader
        */
       shader->func(machine,
-		   constants,
+                   (const float (*)[4])constants[0],
 		   shader->base.immediates,
                    input,
                    base->info.num_inputs,
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index 7ee567d4789..5ed706cb4ff 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -38,7 +38,6 @@
 #include "draw/draw_vertex.h"
 #include "draw/draw_vs.h"
 #include "translate/translate.h"
-#include "translate/translate_cache.h"
 
 /* A first pass at incorporating vertex fetch/emit functionality into 
  */
@@ -142,16 +141,18 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_varient *varient,
    vsvg->fetch->run_elts( vsvg->fetch, 
                           elts,
                           count,
+                          vsvg->draw->instance_id,
                           temp_buffer );
 
    vsvg->base.vs->run_linear( vsvg->base.vs, 
                               temp_buffer,
                               temp_buffer,
-                              (const float (*)[4])vsvg->base.vs->draw->pt.user.constants,
+                             vsvg->base.vs->draw->pt.user.vs_constants,
                               count,
                               temp_vertex_stride, 
                               temp_vertex_stride);
 
+   /* FIXME: geometry shading? */
 
    if (vsvg->base.key.clip) {
       /* not really handling clipping, just do the rhw so we can
@@ -180,6 +181,7 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_varient *varient,
 
    vsvg->emit->run( vsvg->emit,
                     0, count,
+                    vsvg->draw->instance_id,
                     output_buffer );
 
    FREE(temp_buffer);
@@ -202,12 +204,13 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_varient *varient,
    vsvg->fetch->run( vsvg->fetch, 
                      start,
                      count,
+                     vsvg->draw->instance_id,
                      temp_buffer );
 
    vsvg->base.vs->run_linear( vsvg->base.vs, 
                               temp_buffer,
                               temp_buffer,
-                              (const float (*)[4])vsvg->base.vs->draw->pt.user.constants,
+                             vsvg->base.vs->draw->pt.user.vs_constants,
                               count,
                               temp_vertex_stride, 
                               temp_vertex_stride);
@@ -238,6 +241,7 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_varient *varient,
    
    vsvg->emit->run( vsvg->emit,
                     0, count,
+                    vsvg->draw->instance_id,
                     output_buffer );
 
    FREE(temp_buffer);
@@ -280,9 +284,11 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
    fetch.nr_elements = key->nr_inputs;
    fetch.output_stride = vsvg->temp_vertex_stride;
    for (i = 0; i < key->nr_inputs; i++) {
+      fetch.element[i].type = TRANSLATE_ELEMENT_NORMAL;
       fetch.element[i].input_format = key->element[i].in.format;
       fetch.element[i].input_buffer = key->element[i].in.buffer;
       fetch.element[i].input_offset = key->element[i].in.offset;
+      fetch.element[i].instance_divisor = 0;
       fetch.element[i].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       fetch.element[i].output_offset = i * 4 * sizeof(float);
       assert(fetch.element[i].output_offset < fetch.output_stride);
@@ -294,17 +300,21 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
    for (i = 0; i < key->nr_outputs; i++) {
       if (key->element[i].out.format != EMIT_1F_PSIZE)
       {      
+         emit.element[i].type = TRANSLATE_ELEMENT_NORMAL;
          emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
          emit.element[i].input_buffer = 0;
          emit.element[i].input_offset = key->element[i].out.vs_output * 4 * sizeof(float);
+         emit.element[i].instance_divisor = 0;
          emit.element[i].output_format = draw_translate_vinfo_format(key->element[i].out.format);
          emit.element[i].output_offset = key->element[i].out.offset;
          assert(emit.element[i].input_offset <= fetch.output_stride);
       }
       else {
+         emit.element[i].type = TRANSLATE_ELEMENT_NORMAL;
          emit.element[i].input_format = PIPE_FORMAT_R32_FLOAT;
          emit.element[i].input_buffer = 1;
          emit.element[i].input_offset = 0;
+         emit.element[i].instance_divisor = 0;
          emit.element[i].output_format = PIPE_FORMAT_R32_FLOAT;
          emit.element[i].output_offset = key->element[i].out.offset;
       }
diff --git a/src/gallium/auxiliary/gallivm/Makefile b/src/gallium/auxiliary/gallivm/Makefile
deleted file mode 100644
index 5a96d94ec37..00000000000
--- a/src/gallium/auxiliary/gallivm/Makefile
+++ /dev/null
@@ -1,92 +0,0 @@
-# -*-makefile-*-
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = gallivm
-
-
-GALLIVM_SOURCES = \
-        gallivm.cpp  \
-        gallivm_cpu.cpp \
-        instructions.cpp  \
-        loweringpass.cpp \
-        tgsitollvm.cpp \
-        storage.cpp \
-        storagesoa.cpp \
-        instructionssoa.cpp
-
-INC_SOURCES = gallivm_builtins.cpp gallivmsoabuiltins.cpp
-
-CPP_SOURCES = \
-	$(GALLIVM_SOURCES)
-
-C_SOURCES =
-ASM_SOURCES =
-
-OBJECTS = $(C_SOURCES:.c=.o) \
-          $(CPP_SOURCES:.cpp=.o) \
-	  $(ASM_SOURCES:.S=.o)
-
-### Include directories
-INCLUDES = \
-	-I. \
-	-I$(TOP)/src/gallium/drivers \
-	-I$(TOP)/src/gallium/auxiliary \
-	-I$(TOP)/src/gallium/include \
-	-I$(TOP)/src/mesa \
-	-I$(TOP)/include
-
-
-##### RULES #####
-
-.c.o:
-	$(CC) -c $(INCLUDES) $(LLVM_CFLAGS) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@
-
-.cpp.o:
-	$(CXX) -c $(INCLUDES) $(LLVM_CXXFLAGS) $(CXXFLAGS) $(DRIVER_DEFINES) $< -o $@
-
-.S.o:
-	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES)  $< -o $@
-
-##### TARGETS #####
-
-default:: depend symlinks $(LIBNAME)
-
-
-$(LIBNAME): $(OBJECTS) Makefile
-	$(TOP)/bin/mklib -o $@ -static $(OBJECTS)
-
-
-depend: $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) $(INC_SOURCES)
-	rm -f depend
-	touch depend
-	$(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) $(CPP_SOURCES) \
-		$(ASM_SOURCES) $(INC_SOURCES) 2> /dev/null
-
-
-gallivm_builtins.cpp: llvm_builtins.c
-	clang --emit-llvm < $< |llvm-as|opt -std-compile-opts > temp1.bin
-	(echo "static const unsigned char llvm_builtins_data[] = {"; od -txC temp1.bin | sed -e "s/^[0-9]*//" -e s"/ \([0-9a-f][0-9a-f]\)/0x\1,/g" -e"\$$d" | sed -e"\$$s/,$$/,0x00};/") >$@
-	rm temp1.bin
-
-gallivmsoabuiltins.cpp: soabuiltins.c
-	clang --emit-llvm < $< |llvm-as|opt -std-compile-opts > temp2.bin
-	(echo "static const unsigned char soabuiltins_data[] = {"; od -txC temp2.bin | sed -e "s/^[0-9]*//" -e s"/ \([0-9a-f][0-9a-f]\)/0x\1,/g" -e"\$$d" | sed -e"\$$s/,$$/,0x00};/") >$@
-	rm temp2.bin
-
-# Emacs tags
-tags:
-	etags `find . -name \*.[ch]` `find ../include`
-
-
-# Remove .o and backup files
-clean:
-	-rm -f *.o */*.o *~ *.so *~ server/*.o
-	-rm -f depend depend.bak
-	-rm -f gallivm_builtins.cpp
-	-rm -f gallivmsoabuiltins.cpp
-
-symlinks:
-
-
-include depend
diff --git a/src/gallium/auxiliary/gallivm/SConscript b/src/gallium/auxiliary/gallivm/SConscript
deleted file mode 100644
index c0aa51b90a9..00000000000
--- a/src/gallium/auxiliary/gallivm/SConscript
+++ /dev/null
@@ -1,16 +0,0 @@
-Import('*')
-
-gallivm = env.ConvenienceLibrary(
-	target = 'gallivm',
-	source = [
-        'gallivm.cpp',
-        'gallivm_cpu.cpp',
-        'instructions.cpp',
-        'loweringpass.cpp',
-        'tgsitollvm.cpp',
-        'storage.cpp',
-        'storagesoa.cpp',
-        'instructionssoa.cpp',
-	])
-
-auxiliaries.insert(0, gallivm)
diff --git a/src/gallium/auxiliary/gallivm/gallivm.cpp b/src/gallium/auxiliary/gallivm/gallivm.cpp
deleted file mode 100644
index f4af5cc8ad5..00000000000
--- a/src/gallium/auxiliary/gallivm/gallivm.cpp
+++ /dev/null
@@ -1,332 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Zack Rusin [email protected]
-  */
-#ifdef MESA_LLVM
-
-#include "gallivm.h"
-#include "gallivm_p.h"
-
-#include "instructions.h"
-#include "loweringpass.h"
-#include "storage.h"
-#include "tgsitollvm.h"
-
-#include "pipe/p_context.h"
-#include "pipe/p_shader_tokens.h"
-
-#include "tgsi/tgsi_exec.h"
-#include "tgsi/tgsi_dump.h"
-
-#include <llvm/Module.h>
-#include <llvm/CallingConv.h>
-#include <llvm/Constants.h>
-#include <llvm/DerivedTypes.h>
-#include <llvm/Instructions.h>
-#include <llvm/ModuleProvider.h>
-#include <llvm/Pass.h>
-#include <llvm/PassManager.h>
-#include <llvm/Attributes.h>
-#include <llvm/Support/PatternMatch.h>
-#include <llvm/ExecutionEngine/JIT.h>
-#include <llvm/ExecutionEngine/Interpreter.h>
-#include <llvm/ExecutionEngine/GenericValue.h>
-#include <llvm/Support/MemoryBuffer.h>
-#include <llvm/LinkAllPasses.h>
-#include <llvm/Analysis/Verifier.h>
-#include <llvm/Analysis/LoopPass.h>
-#include <llvm/Target/TargetData.h>
-#include <llvm/Bitcode/ReaderWriter.h>
-#include <llvm/Transforms/Utils/Cloning.h>
-
-#include <sstream>
-#include <fstream>
-#include <iostream>
-
-static int GLOBAL_ID = 0;
-
-using namespace llvm;
-
-static inline
-void AddStandardCompilePasses(PassManager &PM)
-{
-   PM.add(new LoweringPass());
-   PM.add(createVerifierPass());                  // Verify that input is correct
-
-   PM.add(createLowerSetJmpPass());          // Lower llvm.setjmp/.longjmp
-
-   //PM.add(createStripSymbolsPass(true));
-
-   PM.add(createRaiseAllocationsPass());     // call %malloc -> malloc inst
-   PM.add(createCFGSimplificationPass());    // Clean up disgusting code
-   PM.add(createPromoteMemoryToRegisterPass());// Kill useless allocas
-   PM.add(createGlobalOptimizerPass());      // Optimize out global vars
-   PM.add(createGlobalDCEPass());            // Remove unused fns and globs
-   PM.add(createIPConstantPropagationPass());// IP Constant Propagation
-   PM.add(createDeadArgEliminationPass());   // Dead argument elimination
-   PM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE
-   PM.add(createCFGSimplificationPass());    // Clean up after IPCP & DAE
-
-   PM.add(createPruneEHPass());              // Remove dead EH info
-
-   PM.add(createFunctionInliningPass());   // Inline small functions
-   PM.add(createArgumentPromotionPass());    // Scalarize uninlined fn args
-
-   PM.add(createTailDuplicationPass());      // Simplify cfg by copying code
-   PM.add(createInstructionCombiningPass()); // Cleanup for scalarrepl.
-   PM.add(createCFGSimplificationPass());    // Merge & remove BBs
-   PM.add(createScalarReplAggregatesPass()); // Break up aggregate allocas
-   PM.add(createInstructionCombiningPass()); // Combine silly seq's
-   PM.add(createCondPropagationPass());      // Propagate conditionals
-
-   PM.add(createTailCallEliminationPass());  // Eliminate tail calls
-   PM.add(createCFGSimplificationPass());    // Merge & remove BBs
-   PM.add(createReassociatePass());          // Reassociate expressions
-   PM.add(createLoopRotatePass());
-   PM.add(createLICMPass());                 // Hoist loop invariants
-   PM.add(createLoopUnswitchPass());         // Unswitch loops.
-   PM.add(createLoopIndexSplitPass());       // Index split loops.
-   PM.add(createInstructionCombiningPass()); // Clean up after LICM/reassoc
-   PM.add(createIndVarSimplifyPass());       // Canonicalize indvars
-   PM.add(createLoopUnrollPass());           // Unroll small loops
-   PM.add(createInstructionCombiningPass()); // Clean up after the unroller
-   PM.add(createGVNPass());                  // Remove redundancies
-   PM.add(createSCCPPass());                 // Constant prop with SCCP
-
-   // Run instcombine after redundancy elimination to exploit opportunities
-   // opened up by them.
-   PM.add(createInstructionCombiningPass());
-   PM.add(createCondPropagationPass());      // Propagate conditionals
-
-   PM.add(createDeadStoreEliminationPass()); // Delete dead stores
-   PM.add(createAggressiveDCEPass());        // SSA based 'Aggressive DCE'
-   PM.add(createCFGSimplificationPass());    // Merge & remove BBs
-   PM.add(createSimplifyLibCallsPass());     // Library Call Optimizations
-   PM.add(createDeadTypeEliminationPass());  // Eliminate dead types
-   PM.add(createConstantMergePass());        // Merge dup global constants
-}
-
-void gallivm_prog_delete(struct gallivm_prog *prog)
-{
-   delete prog->module;
-   prog->module = 0;
-   prog->function = 0;
-   free(prog);
-}
-
-static inline void
-constant_interpolation(float (*inputs)[16][4],
-                       const struct tgsi_interp_coef *coefs,
-                       unsigned attrib,
-                       unsigned chan)
-{
-   unsigned i;
-
-   for (i = 0; i < QUAD_SIZE; ++i) {
-      inputs[i][attrib][chan] = coefs[attrib].a0[chan];
-   }
-}
-
-static inline void
-linear_interpolation(float (*inputs)[16][4],
-                     const struct tgsi_interp_coef *coefs,
-                     unsigned attrib,
-                     unsigned chan)
-{
-   unsigned i;
-
-   for( i = 0; i < QUAD_SIZE; i++ ) {
-      const float x = inputs[i][0][0];
-      const float y = inputs[i][0][1];
-
-      inputs[i][attrib][chan] =
-         coefs[attrib].a0[chan] +
-         coefs[attrib].dadx[chan] * x +
-         coefs[attrib].dady[chan] * y;
-   }
-}
-
-static inline void
-perspective_interpolation(float (*inputs)[16][4],
-                          const struct tgsi_interp_coef *coefs,
-                          unsigned attrib,
-                          unsigned chan )
-{
-   unsigned i;
-
-   for( i = 0; i < QUAD_SIZE; i++ ) {
-      const float x = inputs[i][0][0];
-      const float y = inputs[i][0][1];
-      /* WPOS.w here is really 1/w */
-      const float w = 1.0f / inputs[i][0][3];
-      assert(inputs[i][0][3] != 0.0);
-
-      inputs[i][attrib][chan] =
-         (coefs[attrib].a0[chan] +
-          coefs[attrib].dadx[chan] * x +
-          coefs[attrib].dady[chan] * y) * w;
-   }
-}
-
-void gallivm_ir_dump(struct gallivm_ir *ir, const char *file_prefix)
-{
-   if (!ir || !ir->module)
-      return;
-
-   if (file_prefix) {
-      std::ostringstream stream;
-      stream << file_prefix;
-      stream << ir->id;
-      stream << ".ll";
-      std::string name = stream.str();
-      std::ofstream out(name.c_str());
-      if (!out) {
-         std::cerr<<"Can't open file : "<<stream.str()<<std::endl;;
-         return;
-      }
-      out << (*ir->module);
-      out.close();
-   } else {
-      const llvm::Module::FunctionListType &funcs = ir->module->getFunctionList();
-      llvm::Module::FunctionListType::const_iterator itr;
-      std::cout<<"; ---------- Start shader "<<ir->id<<std::endl;
-      for (itr = funcs.begin(); itr != funcs.end(); ++itr) {
-         const llvm::Function &func = (*itr);
-         std::string name = func.getName();
-         const llvm::Function *found = 0;
-         if (name.find("vs_shader") != std::string::npos ||
-             name.find("fs_shader") != std::string::npos ||
-             name.find("function") != std::string::npos)
-            found = &func;
-         if (found) {
-            std::cout<<*found<<std::endl;
-         }
-      }
-      std::cout<<"; ---------- End shader "<<ir->id<<std::endl;
-   }
-}
-
-
-void gallivm_prog_inputs_interpolate(struct gallivm_prog *prog,
-                                     float (*inputs)[16][4],
-                                     const struct tgsi_interp_coef *coef)
-{
-   for (int i = 0; i < prog->num_interp; ++i) {
-      const gallivm_interpolate &interp = prog->interpolators[i];
-      switch (interp.type) {
-      case TGSI_INTERPOLATE_CONSTANT:
-         constant_interpolation(inputs, coef, interp.attrib, interp.chan);
-         break;
-
-      case TGSI_INTERPOLATE_LINEAR:
-         linear_interpolation(inputs, coef, interp.attrib, interp.chan);
-         break;
-
-      case TGSI_INTERPOLATE_PERSPECTIVE:
-         perspective_interpolation(inputs, coef, interp.attrib, interp.chan);
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-}
-
-
-struct gallivm_ir * gallivm_ir_new(enum gallivm_shader_type type)
-{
-   struct gallivm_ir *ir =
-      (struct gallivm_ir *)calloc(1, sizeof(struct gallivm_ir));
-   ++GLOBAL_ID;
-   ir->id   = GLOBAL_ID;
-   ir->type = type;
-
-   return ir;
-}
-
-void gallivm_ir_set_layout(struct gallivm_ir *ir,
-                           enum gallivm_vector_layout layout)
-{
-   ir->layout = layout;
-}
-
-void gallivm_ir_set_components(struct gallivm_ir *ir, int num)
-{
-   ir->num_components = num;
-}
-
-void gallivm_ir_fill_from_tgsi(struct gallivm_ir *ir,
-                               const struct tgsi_token *tokens)
-{
-   std::cout << "Creating llvm from: " <<std::endl;
-   tgsi_dump(tokens, 0);
-
-   llvm::Module *mod = tgsi_to_llvmir(ir, tokens);
-   ir->module = mod;
-   gallivm_ir_dump(ir, 0);
-}
-
-void gallivm_ir_delete(struct gallivm_ir *ir)
-{
-   delete ir->module;
-   free(ir);
-}
-
-struct gallivm_prog * gallivm_ir_compile(struct gallivm_ir *ir)
-{
-   struct gallivm_prog *prog =
-      (struct gallivm_prog *)calloc(1, sizeof(struct gallivm_prog));
-
-   std::cout << "Before optimizations:"<<std::endl;
-   ir->module->dump();
-   std::cout<<"-------------------------------"<<std::endl;
-
-   PassManager veri;
-   veri.add(createVerifierPass());
-   veri.run(*ir->module);
-   llvm::Module *mod = llvm::CloneModule(ir->module);
-   prog->num_consts = ir->num_consts;
-   memcpy(prog->interpolators, ir->interpolators, sizeof(prog->interpolators));
-   prog->num_interp = ir->num_interp;
-
-   /* Run optimization passes over it */
-   PassManager passes;
-   passes.add(new TargetData(mod));
-   AddStandardCompilePasses(passes);
-   passes.run(*mod);
-   prog->module = mod;
-
-   std::cout << "After optimizations:"<<std::endl;
-   mod->dump();
-
-   return prog;
-}
-
-#endif /* MESA_LLVM */
diff --git a/src/gallium/auxiliary/gallivm/gallivm.h b/src/gallium/auxiliary/gallivm/gallivm.h
deleted file mode 100644
index 36a64a77471..00000000000
--- a/src/gallium/auxiliary/gallivm/gallivm.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Zack Rusin [email protected]
-  */
-
-#ifndef GALLIVM_H
-#define GALLIVM_H
-
-/*
-  LLVM representation consists of two stages - layout independent
-  intermediate representation gallivm_ir and driver specific
-  gallivm_prog. TGSI is first being translated into gallivm_ir
-  after that driver can set number of options on gallivm_ir and
-  have it compiled into gallivm_prog. gallivm_prog can be either
-  executed (assuming there's LLVM JIT backend for the current
-  target) or machine code generation can be done (assuming there's
-  a LLVM code generator for thecurrent target)
- */
-#if defined __cplusplus
-extern "C" {
-#endif
-
-#include "pipe/p_state.h"
-
-#ifdef MESA_LLVM
-
-struct tgsi_token;
-
-struct gallivm_ir;
-struct gallivm_prog;
-struct gallivm_cpu_engine;
-struct tgsi_interp_coef;
-struct tgsi_sampler;
-struct tgsi_exec_vector;
-
-enum gallivm_shader_type {
-   GALLIVM_VS,
-   GALLIVM_FS
-};
-
-enum gallivm_vector_layout {
-   GALLIVM_AOS,
-   GALLIVM_SOA
-};
-
-struct gallivm_ir *gallivm_ir_new(enum gallivm_shader_type type);
-void               gallivm_ir_set_layout(struct gallivm_ir *ir,
-                                         enum gallivm_vector_layout layout);
-void               gallivm_ir_set_components(struct gallivm_ir *ir, int num);
-void               gallivm_ir_fill_from_tgsi(struct gallivm_ir *ir,
-                                             const struct tgsi_token *tokens);
-void               gallivm_ir_delete(struct gallivm_ir *ir);
-
-
-struct gallivm_prog *gallivm_ir_compile(struct gallivm_ir *ir);
-
-void gallivm_prog_inputs_interpolate(struct gallivm_prog *prog,
-                                     float (*inputs)[PIPE_MAX_SHADER_INPUTS][4],
-                                     const struct tgsi_interp_coef *coefs);
-void gallivm_prog_dump(struct gallivm_prog *prog, const char *file_prefix);
-
-
-struct gallivm_cpu_engine *gallivm_cpu_engine_create(struct gallivm_prog *prog);
-struct gallivm_cpu_engine *gallivm_global_cpu_engine();
-int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
-                        struct tgsi_exec_machine *machine,
-                        const float (*input)[4],
-                        unsigned num_inputs,
-                        float (*output)[4],
-                        unsigned num_outputs,
-                        const float (*constants)[4],
-                        unsigned count,
-                        unsigned input_stride,
-                        unsigned output_stride);
-int gallivm_cpu_fs_exec(struct gallivm_prog *prog,
-                        float x, float y,
-                        float (*dests)[PIPE_MAX_SHADER_INPUTS][4],
-                        float (*inputs)[PIPE_MAX_SHADER_INPUTS][4],
-                        float (*consts)[4],
-                        struct tgsi_sampler *samplers);
-void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *ee, struct gallivm_prog *prog);
-void gallivm_cpu_engine_delete(struct gallivm_cpu_engine *ee);
-
-
-#endif /* MESA_LLVM */
-
-#if defined __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
deleted file mode 100644
index 634bac01507..00000000000
--- a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-static const unsigned char llvm_builtins_data[] = {
-0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
-0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39,
-0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02,
-0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24,
-0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64,
-0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x03,0x00,0x00,0x00,0x0b,0x84,0xff,0xff,
-0xff,0xff,0x1f,0xc0,0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,
-0x32,0x22,0x48,0x09,0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,
-0xc6,0x05,0x42,0x52,0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,
-0x29,0x80,0x21,0x00,0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,
-0x80,0x50,0x2b,0x03,0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,
-0x14,0x01,0x80,0x11,0x80,0x22,0x88,0x00,0x13,0x30,0x7c,0xc0,0x03,0x3b,0xf8,0x05,
-0x3b,0xa0,0x83,0x36,0xa8,0x07,0x77,0x58,0x07,0x77,0x78,0x87,0x7b,0x70,0x87,0x36,
-0x60,0x87,0x74,0x70,0x87,0x7a,0xc0,0x87,0x36,0x38,0x07,0x77,0xa8,0x87,0x0d,0xf7,
-0x50,0x0e,0x6d,0x00,0x0f,0x7a,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
-0x07,0x74,0xd0,0x06,0xe9,0x10,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,
-0x78,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,0xe9,0x10,0x07,0x76,0xa0,0x07,0x71,
-0x60,0x07,0x7a,0x10,0x07,0x76,0xd0,0x06,0xe9,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,
-0x07,0x7a,0x30,0x07,0x72,0xd0,0x06,0xe9,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,
-0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x7a,
-0x30,0x07,0x72,0xd0,0x06,0xe6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
-0x07,0x74,0xd0,0x06,0xf6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,0x07,
-0x74,0xd0,0x06,0xf6,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,
-0x10,0x07,0x72,0x80,0x07,0x6d,0x10,0x0e,0x70,0xa0,0x07,0x70,0xa0,0x07,0x76,0x40,
-0x07,0x6d,0x60,0x0e,0x78,0x00,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,
-0x72,0x80,0x07,0x3a,0x0f,0x84,0x48,0x20,0x23,0x24,0x40,0x00,0x62,0x67,0x88,0x9f,
-0x19,0x92,0x24,0x00,0x10,0x04,0x00,0x00,0x00,0x43,0x92,0x04,0x08,0x00,0x00,0x00,
-0x00,0x60,0x48,0xa2,0x00,0x40,0x10,0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x02,
-0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x41,0x00,0x00,0x00,0x30,0x24,0x61,0x80,0x00,
-0x00,0x00,0x00,0x00,0x86,0x24,0x07,0x10,0x00,0x00,0x00,0x00,0xc0,0x90,0x44,0x01,
-0x80,0x20,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
-0x05,0x00,0x82,0x00,0x00,0x00,0x60,0x48,0x52,0x00,0x40,0x10,0x00,0x00,0x00,0x64,
-0x81,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
-0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x8a,0x8a,0x59,0x8b,0x43,0x50,0xd2,0x09,0x02,
-0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,0x64,0x56,0x28,0x66,
-0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,0xa8,0x64,0x06,0x95,
-0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,0x04,0x00,0x00,0x00,
-0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
-0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,0x10,0x84,0xc3,0x60,
-0x04,0x00,0x00,0x00,0xc3,0x0d,0xce,0x43,0x4c,0x37,0x3c,0x8e,0x34,0xdc,0x30,0x41,
-0xc2,0x74,0x03,0x34,0x51,0xc3,0x0d,0x4d,0x44,0x4c,0x37,0x44,0x8d,0x35,0x56,0x01,
-0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0xd6,0x10,0x00,0xe6,
-0x10,0x04,0x76,0x81,0x00,0x3e,0x30,0x0c,0x91,0x4f,0x1b,0x05,0x21,0x30,0x8f,0x6d,
-0x13,0x48,0xe0,0x03,0xc3,0x10,0xf9,0xb4,0x55,0x20,0x81,0x0f,0x0c,0x43,0xe4,0xd7,
-0x66,0x41,0x08,0xcc,0xa3,0x1f,0x40,0x41,0x34,0x53,0x84,0x99,0xc4,0x20,0x30,0x8f,
-0x61,0x10,0x02,0xb0,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x27,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,0x55,0xd6,0x1c,0x84,
-0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,0x41,0xfc,0x03,0x00,
-0x63,0x08,0x0d,0x34,0xdc,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
-0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,0x6d,0x73,0x0c,0x19,
-0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x10,0x00,0x00,0x00,0x27,0x50,0x20,0x05,
-0xd1,0x0c,0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,
-0x14,0x13,0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,
-0xba,0x35,0x0c,0x13,0xf3,0xd8,0x05,0x31,0x31,0x8f,0x6e,0x1c,0x84,0x00,0x2c,0xcb,
-0x01,0x14,0x44,0x33,0x45,0x98,0x61,0x0c,0x02,0xf3,0x00,0x00,0x00,0x00,0x00,0x00,
-0x61,0x20,0x00,0x00,0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,
-0x04,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,
-0x11,0x00,0x00,0x00,0x63,0x08,0x4d,0x64,0x16,0xc1,0xe1,0x86,0xab,0x22,0x66,0x19,
-0x02,0x01,0x1b,0x43,0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,
-0x0a,0x20,0x0b,0x34,0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x70,
-0x83,0x57,0x11,0xb3,0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,
-0x48,0xb3,0x04,0xc6,0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,
-0x63,0x08,0xcd,0x64,0x64,0x40,0x70,0xb8,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,
-0x60,0x0c,0xc1,0x99,0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,
-0x33,0x38,0xd0,0x00,0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,
-0xe0,0x70,0x03,0x1b,0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,
-0x30,0x63,0x08,0x0f,0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,
-0xa0,0x06,0x70,0x00,0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
-0x76,0x52,0x4c,0xcc,0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,
-0xc6,0x50,0x8a,0x89,0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,
-0x79,0x68,0x73,0x20,0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,
-0x9e,0xdb,0x32,0x88,0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,
-0xa7,0xb7,0x95,0x62,0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,
-0x34,0x35,0x56,0x62,0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,
-0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,
-0x24,0x01,0x63,0xec,0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,
-0xfc,0xc4,0xd0,0x90,0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,
-0xc1,0x71,0x7b,0x29,0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,
-0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,
-0x4e,0x33,0x58,0x47,0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,
-0x33,0xe1,0xbc,0xa5,0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,
-0x3a,0x40,0xc6,0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,
-0x6f,0x20,0x11,0x9c,0x66,0xf0,0x05,0xa7,0x99,0xec,0x82,0x10,0x9c,0xa6,0x32,0x93,
-0x42,0x60,0x1e,0x7b,0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,
-0xa7,0x6d,0xa4,0x98,0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,
-0x9c,0x66,0xc0,0x7b,0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,
-0x8b,0xe0,0x34,0x83,0x2f,0x38,0xcd,0x64,0xd3,0x07,0x50,0x10,0xcd,0x14,0x61,0xe6,
-0x61,0x08,0x4e,0x53,0xd5,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
-0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,
-0x1e,0xe1,0x19,0xc6,0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,
-0x15,0xc1,0x31,0x84,0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,
-0x71,0x6c,0x23,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,
-0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,
-0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,
-0x30,0x06,0x64,0xe0,0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,
-0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,
-0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,
-0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,
-0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,
-0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,
-0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,
-0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,
-0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,
-0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,
-0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,
-0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,
-0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,
-0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x26,0x41,0x08,0xc0,
-0xb2,0x18,0x45,0x21,0x00,0xcb,0xb2,0x5b,0x04,0x31,0x31,0x8f,0x6d,0x13,0xc4,0xc4,
-0x3c,0xb9,0x35,0x0c,0x15,0xb0,0x58,0x05,0x31,0x31,0x4f,0x7f,0x00,0x05,0xd1,0x4c,
-0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
-0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,
-0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,
-0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x76,0x51,0x4c,0xcc,
-0x53,0xdb,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,
-0x31,0x8f,0x6e,0x16,0xc4,0xc4,0x3c,0xbd,0x51,0x10,0x02,0xb0,0x2c,0xd6,0x30,0x54,
-0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x2c,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,
-0xdc,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xdc,0x50,0x49,0xc4,
-0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xdc,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,
-0x63,0x08,0x8d,0x33,0xdc,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,
-0x17,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,
-0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x76,
-0x41,0x4c,0xcc,0xb3,0x1f,0x81,0x11,0x11,0x13,0x15,0x35,0x37,0x90,0x2c,0x4e,0xf4,
-0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44,
-0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0,
-0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03,
-0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,
-0x23,0x19,0xc3,0xa0,0x20,0x8b,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,
-0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,
-0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,
-0x84,0x34,0x85,0x25,0x0c,0x92,0x20,0x59,0xc1,0x20,0x30,0x8f,0x2d,0x10,0x95,0x84,
-0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
deleted file mode 100644
index 1bd00a0c2a6..00000000000
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ /dev/null
@@ -1,243 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Zack Rusin [email protected]
-  */
-#ifdef MESA_LLVM
-
-#include "gallivm.h"
-#include "gallivm_p.h"
-
-#include "instructions.h"
-#include "loweringpass.h"
-#include "storage.h"
-#include "tgsitollvm.h"
-
-#include "pipe/p_context.h"
-#include "pipe/p_shader_tokens.h"
-
-#include "tgsi/tgsi_exec.h"
-#include "tgsi/tgsi_dump.h"
-
-#include "util/u_memory.h"
-#include "util/u_math.h"
-
-#include <llvm/Module.h>
-#include <llvm/CallingConv.h>
-#include <llvm/Constants.h>
-#include <llvm/DerivedTypes.h>
-#include <llvm/Instructions.h>
-#include <llvm/ModuleProvider.h>
-#include <llvm/Pass.h>
-#include <llvm/PassManager.h>
-#include <llvm/Attributes.h>
-#include <llvm/Support/PatternMatch.h>
-#include <llvm/ExecutionEngine/JIT.h>
-#include <llvm/ExecutionEngine/Interpreter.h>
-#include <llvm/ExecutionEngine/GenericValue.h>
-#include <llvm/Support/MemoryBuffer.h>
-#include <llvm/LinkAllPasses.h>
-#include <llvm/Analysis/Verifier.h>
-#include <llvm/Analysis/LoopPass.h>
-#include <llvm/Target/TargetData.h>
-#include <llvm/Bitcode/ReaderWriter.h>
-#include <llvm/Transforms/Utils/Cloning.h>
-
-#include <sstream>
-#include <fstream>
-#include <iostream>
-
-struct gallivm_cpu_engine {
-   llvm::ExecutionEngine *engine;
-};
-
-static struct gallivm_cpu_engine *CPU = 0;
-
-typedef int (*fragment_shader_runner)(float x, float y,
-                                      float (*dests)[16][4],
-                                      float (*inputs)[16][4],
-                                      int num_attribs,
-                                      float (*consts)[4], int num_consts,
-                                      struct tgsi_sampler *samplers);
-
-int gallivm_cpu_fs_exec(struct gallivm_prog *prog,
-                        float fx, float fy,
-                        float (*dests)[16][4],
-                        float (*inputs)[16][4],
-                        float (*consts)[4],
-                        struct tgsi_sampler *samplers)
-{
-   fragment_shader_runner runner = reinterpret_cast<fragment_shader_runner>(prog->function);
-   assert(runner);
-
-   return runner(fx, fy, dests, inputs, prog->num_interp,
-                 consts, prog->num_consts,
-                 samplers);
-}
-
-static inline llvm::Function *func_for_shader(struct gallivm_prog *prog)
-{
-   llvm::Module *mod = prog->module;
-   llvm::Function *func = 0;
-
-   switch (prog->type) {
-   case GALLIVM_VS:
-      func = mod->getFunction("vs_shader");
-      break;
-   case GALLIVM_FS:
-      func = mod->getFunction("fs_shader");
-      break;
-   default:
-      assert(!"Unknown shader type!");
-      break;
-   }
-   return func;
-}
-
-/*!
-  This function creates a CPU based execution engine for the given gallivm_prog.
-  gallivm_cpu_engine should be used as a singleton throughout the library. Before
-  executing gallivm_prog_exec one needs to call gallivm_cpu_jit_compile.
-  The gallivm_prog instance which is being passed to the constructor is being
-  automatically JIT compiled so one shouldn't call gallivm_cpu_jit_compile
-  with it again.
- */
-struct gallivm_cpu_engine * gallivm_cpu_engine_create(struct gallivm_prog *prog)
-{
-   struct gallivm_cpu_engine *cpu = (struct gallivm_cpu_engine *)
-                                    calloc(1, sizeof(struct gallivm_cpu_engine));
-   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
-   llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
-   llvm::ExecutionEngine *ee = llvm::ExecutionEngine::create(mp, false);
-   ee->DisableLazyCompilation();
-   cpu->engine = ee;
-
-   llvm::Function *func = func_for_shader(prog);
-
-   prog->function = ee->getPointerToFunction(func);
-   CPU = cpu;
-   return cpu;
-}
-
-
-/*!
-  This function JIT compiles the given gallivm_prog with the given cpu based execution engine.
-  The reference to the generated machine code entry point will be stored
-  in the gallivm_prog program. After executing this function one can call gallivm_prog_exec
-  in order to execute the gallivm_prog on the CPU.
- */
-void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog *prog)
-{
-   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
-   llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
-   llvm::ExecutionEngine *ee = cpu->engine;
-   assert(ee);
-   /*FIXME : why was this disabled ? we need it for pow/sqrt/... */
-   ee->DisableLazyCompilation(false);
-   ee->addModuleProvider(mp);
-
-   llvm::Function *func = func_for_shader(prog);
-   prog->function = ee->getPointerToFunction(func);
-}
-
-void gallivm_cpu_engine_delete(struct gallivm_cpu_engine *cpu)
-{
-   free(cpu);
-}
-
-struct gallivm_cpu_engine * gallivm_global_cpu_engine()
-{
-   return CPU;
-}
-
-
-typedef void (*vertex_shader_runner)(void *ainputs,
-                                     void *dests,
-                                     float (*aconsts)[4]);
-
-#define MAX_TGSI_VERTICES 4
-/*!
-  This function is used to execute the gallivm_prog in software. Before calling
-  this function the gallivm_prog has to be JIT compiled with the gallivm_cpu_jit_compile
-  function.
- */
-int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
-                        struct tgsi_exec_machine *machine,
-                        const float (*input)[4],
-                        unsigned num_inputs,
-                        float (*output)[4],
-                        unsigned num_outputs,
-                        const float (*constants)[4],
-                        unsigned count,
-                        unsigned input_stride,
-                        unsigned output_stride )
-{
-   unsigned int i, j;
-   unsigned slot;
-   vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
-   assert(runner);
-
-   for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
-      unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
-
-      /* Swizzle inputs.
-       */
-      for (j = 0; j < max_vertices; j++) {
-	 for (slot = 0; slot < num_inputs; slot++) {
-	    machine->Inputs[slot].xyzw[0].f[j] = input[slot][0];
-	    machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
-	    machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
-	    machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
-	 }
-
-	 input = (const float (*)[4])((const char *)input + input_stride);
-      }
-
-      /* run shader */
-      runner(machine->Inputs,
-             machine->Outputs,
-             (float (*)[4]) constants);
-
-      /* Unswizzle all output results
-       */
-      for (j = 0; j < max_vertices; j++) {
-         for (slot = 0; slot < num_outputs; slot++) {
-            output[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-            output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-            output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-            output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-         }
-         output = (float (*)[4])((char *)output + output_stride);
-      }
-   }
-
-   return 0;
-}
-
-#endif
diff --git a/src/gallium/auxiliary/gallivm/gallivm_p.h b/src/gallium/auxiliary/gallivm/gallivm_p.h
deleted file mode 100644
index d2c5852bdf7..00000000000
--- a/src/gallium/auxiliary/gallivm/gallivm_p.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef GALLIVM_P_H
-#define GALLIVM_P_H
-
-#ifdef MESA_LLVM
-
-#include "gallivm.h"
-#include "pipe/p_shader_tokens.h"
-#include "pipe/p_compiler.h"
-
-namespace llvm {
-   class Module;
-}
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-enum gallivm_shader_type;
-enum gallivm_vector_layout;
-
-struct gallivm_interpolate {
-   int attrib;
-   int chan;
-   int type;
-};
-
-struct gallivm_ir {
-   llvm::Module *module;
-   int id;
-   enum gallivm_shader_type type;
-   enum gallivm_vector_layout layout;
-   int num_components;
-   int   num_consts;
-
-   /* FIXME: this might not be enough for some shaders */
-   struct gallivm_interpolate interpolators[32*4];
-   int   num_interp;
-};
-
-struct gallivm_prog {
-   llvm::Module *module;
-   void *function;
-
-   int   id;
-   enum gallivm_shader_type type;
-
-   int   num_consts;
-
-   /* FIXME: this might not be enough for some shaders */
-   struct gallivm_interpolate interpolators[32*4];
-   int   num_interp;
-};
-
-static INLINE void gallivm_swizzle_components(int swizzle,
-                                              int *xc, int *yc,
-                                              int *zc, int *wc)
-{
-   int x = swizzle / 1000; swizzle -= x * 1000;
-   int y = swizzle / 100;  swizzle -= y * 100;
-   int z = swizzle / 10;   swizzle -= z * 10;
-   int w = swizzle;
-
-   if (xc) *xc = x;
-   if (yc) *yc = y;
-   if (zc) *zc = z;
-   if (wc) *wc = w;
-}
-
-static INLINE boolean gallivm_is_swizzle(int swizzle)
-{
-   const int NO_SWIZZLE = TGSI_SWIZZLE_X * 1000 + TGSI_SWIZZLE_Y * 100 +
-                          TGSI_SWIZZLE_Z * 10 + TGSI_SWIZZLE_W;
-   return swizzle != NO_SWIZZLE;
-}
-
-static INLINE int gallivm_x_swizzle(int swizzle)
-{
-   int x;
-   gallivm_swizzle_components(swizzle, &x, 0, 0, 0);
-   return x;
-}
-
-static INLINE int gallivm_y_swizzle(int swizzle)
-{
-   int y;
-   gallivm_swizzle_components(swizzle, 0, &y, 0, 0);
-   return y;
-}
-
-static INLINE int gallivm_z_swizzle(int swizzle)
-{
-   int z;
-   gallivm_swizzle_components(swizzle, 0, 0, &z, 0);
-   return z;
-}
-
-static INLINE int gallivm_w_swizzle(int swizzle)
-{
-   int w;
-   gallivm_swizzle_components(swizzle, 0, 0, 0, &w);
-   return w;
-}
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* MESA_LLVM */
-
-#endif
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
deleted file mode 100644
index ee8162efce5..00000000000
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ /dev/null
@@ -1,1193 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Zack Rusin [email protected]
-  */
-#ifdef MESA_LLVM
-
-#include "instructions.h"
-
-#include "storage.h"
-
-#include "util/u_memory.h"
-
-#include <llvm/CallingConv.h>
-#include <llvm/Constants.h>
-#include <llvm/DerivedTypes.h>
-#include <llvm/Function.h>
-#include <llvm/InstrTypes.h>
-#include <llvm/Instructions.h>
-#include <llvm/Attributes.h>
-#include <llvm/Support/MemoryBuffer.h>
-#include <llvm/Bitcode/ReaderWriter.h>
-
-#include <sstream>
-#include <fstream>
-#include <iostream>
-
-using namespace llvm;
-
-#include "gallivm_builtins.cpp"
-
-#if 0
-llvm::Value *arrayFromChannels(std::vector<llvm::Value*> &vals)
-{
-   VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
-   ArrayType *vectorArray = ArrayType::get(vectorType, 4);
-}
-#endif
-
-static inline std::string createFuncName(int label)
-{
-   std::ostringstream stream;
-   stream << "function";
-   stream << label;
-   return stream.str();
-}
-
-Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicBlock *block,
-                           Storage *storage)
-   :  m_mod(mod), m_func(func), m_builder(block), m_idx(0),
-      m_storage(storage)
-{
-   m_floatVecType = VectorType::get(Type::FloatTy, 4);
-
-   m_llvmFSqrt = 0;
-   m_llvmFAbs  = 0;
-   m_llvmPow   = 0;
-   m_llvmFloor = 0;
-   m_llvmFlog  = 0;
-   m_llvmFexp  = 0;
-   m_llvmLit  = 0;
-   m_fmtPtr = 0;
-
-   MemoryBuffer *buffer = MemoryBuffer::getMemBuffer(
-      (const char*)&llvm_builtins_data[0],
-      (const char*)&llvm_builtins_data[Elements(llvm_builtins_data)-1]);
-   m_mod = ParseBitcodeFile(buffer);
-}
-
-llvm::BasicBlock * Instructions::currentBlock() const
-{
-   return m_builder.GetInsertBlock();
-}
-
-llvm::Value * Instructions::abs(llvm::Value *in)
-{
-   std::vector<llvm::Value*> vec = extractVector(in);
-   Value *xabs  = callFAbs(vec[0]);
-   Value *yabs  = callFAbs(vec[1]);
-   Value *zabs  = callFAbs(vec[2]);
-   Value *wabs  = callFAbs(vec[3]);
-   return vectorFromVals(xabs, yabs, zabs, wabs);
-}
-
-llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
-{
-   return m_builder.CreateAdd(in1, in2, name("add"));
-}
-
-llvm::Value * Instructions::arl(llvm::Value *in)
-{
-   return floor(in);
-}
-
-void Instructions::beginLoop()
-{
-   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
-   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
-
-   m_builder.CreateBr(begin);
-   Loop loop;
-   loop.begin = begin;
-   loop.end   = end;
-   m_builder.SetInsertPoint(begin);
-   m_loopStack.push(loop);
-}
-
-void Instructions::bgnSub(unsigned label)
-{
-   llvm::Function *func = findFunction(label);
-
-   Function::arg_iterator args = func->arg_begin();
-   Value *ptr_INPUT = args++;
-   ptr_INPUT->setName("INPUT");
-   m_storage->pushArguments(ptr_INPUT);
-
-   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
-
-   m_func = func;
-   m_builder.SetInsertPoint(entry);
-}
-
-void Instructions::brk()
-{
-   assert(!m_loopStack.empty());
-   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
-   m_builder.CreateBr(m_loopStack.top().end);
-   m_builder.SetInsertPoint(unr);
-}
-
-void Instructions::cal(int label, llvm::Value *input)
-{
-   std::vector<Value*> params;
-   params.push_back(input);
-   llvm::Function *func = findFunction(label);
-
-   m_builder.CreateCall(func, params.begin(), params.end());
-}
-
-llvm::Value * Instructions::ceil(llvm::Value *in)
-{
-   std::vector<llvm::Value*> vec = extractVector(in);
-   return vectorFromVals(callCeil(vec[0]), callCeil(vec[1]),
-                         callCeil(vec[2]), callCeil(vec[3]));
-}
-
-llvm::Value * Instructions::clamp(llvm::Value *in1)
-{
-   llvm::Value *zero = constVector(0.0f, 0.0f, 0.0f, 0.0f);
-   llvm::Value *one = constVector(1.0f, 1.0f, 1.0f, 1.0f);
-   return min( max(zero, in1), one);
-}
-
-llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
-   llvm::Function *func = m_mod->getFunction("cmp");
-   assert(func);
-
-   std::vector<Value*> params;
-   params.push_back(in1);
-   params.push_back(in2);
-   params.push_back(in3);
-   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-   std::vector<llvm::Value*> vec3 = extractVector(in3);
-   Constant *half = ConstantFP::get(APFloat(0.5f));
-
-   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], half, name("xcmp"));
-   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
-                                        name("selx"));
-
-   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], half, name("ycmp"));
-   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
-                                        name("sely"));
-
-   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], half, name("zcmp"));
-   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
-                                        name("selz"));
-
-   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], half, name("wcmp"));
-   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
-                                        name("selw"));
-
-   return vectorFromVals(selx, sely, selz, selw);
-}
-
-llvm::Value * Instructions::cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-   std::vector<llvm::Value*> vec3 = extractVector(in3);
-   Constant *zero = Constant::getNullValue(Type::FloatTy);
-
-   Value *xcmp  = m_builder.CreateFCmpOGE(vec1[0], zero, name("xcmp"));
-   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
-                                        name("selx"));
-
-   Value *ycmp  = m_builder.CreateFCmpOGE(vec1[1], zero, name("ycmp"));
-   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
-                                        name("sely"));
-
-   Value *zcmp  = m_builder.CreateFCmpOGE(vec1[2], zero, name("zcmp"));
-   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
-                                        name("selz"));
-
-   Value *wcmp  = m_builder.CreateFCmpOGE(vec1[3], zero, name("wcmp"));
-   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
-                                        name("selw"));
-
-   return vectorFromVals(selx, sely, selz, selw);
-}
-
-llvm::Value * Instructions::cos(llvm::Value *in)
-{
-#if 0
-   llvm::Function *func = m_mod->getFunction("vcos");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
-   call->setTailCall(false);
-   return call;
-#else
-   std::vector<llvm::Value*> elems = extractVector(in);
-   Function *func = m_mod->getFunction("cosf");
-   assert(func);
-   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
-   cos->setCallingConv(CallingConv::C);
-   cos->setTailCall(true);
-   return vectorFromVals(cos, cos, cos, cos);
-#endif
-}
-
-llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
-{
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *y1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(1),
-                                              name("y1"));
-   Value *z1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(2),
-                                              name("z1"));
-
-   Value *x2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(0),
-                                              name("x2"));
-   Value *y2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(1),
-                                              name("y2"));
-   Value *z2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(2),
-                                              name("z2"));
-   Value *y1z2 = mul(y1, z2);
-   Value *z1y2 = mul(z1, y2);
-
-   Value *z1x2 = mul(z1, x2);
-   Value *x1z2 = mul(x1, z2);
-
-   Value *x1y2 = mul(x1, y2);
-   Value *y1x2 = mul(y1, x2);
-
-   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
-}
-
-llvm::Value * Instructions::ddx(llvm::Value *in)
-{
-   // FIXME
-   assert(0);
-}
-
-llvm::Value * Instructions::ddy(llvm::Value *in)
-{
-   // FIXME
-   assert(0);
-}
-
-llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2)
-{
-   return m_builder.CreateFDiv(in1, in2, name("div"));
-}
-
-llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
-   Value *mulRes = mul(in1, in2);
-   Value *x = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(0),
-                                                          name("extractx"));
-   Value *y = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(1),
-                                                          name("extracty"));
-   Value *z = m_builder.CreateExtractElement(in3,
-                                                          m_storage->constantInt(2),
-                                                          name("extractz"));
-   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
-   Value *dot2add = m_builder.CreateAdd(xy, z, name("dot2add"));
-   return vectorFromVals(dot2add, dot2add, dot2add, dot2add);
-}
-
-llvm::Value * Instructions::dp2(llvm::Value *in1, llvm::Value *in2)
-{
-   Value *mulRes = mul(in1, in2);
-   Value *x = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(0),
-                                                          name("extractx"));
-   Value *y = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(1),
-                                                          name("extracty"));
-   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
-   return vectorFromVals(xy, xy, xy, xy);
-}
-
-llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
-{
-   Value *mulRes = mul(in1, in2);
-   Value *x = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(0),
-                                                          name("extractx"));
-   Value *y = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(1),
-                                                          name("extracty"));
-   Value *z = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(2),
-                                                          name("extractz"));
-   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
-   Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
-   return vectorFromVals(dot3, dot3, dot3, dot3);
-}
-
-llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2)
-{
-   Value *mulRes = mul(in1, in2);
-   std::vector<llvm::Value*> vec = extractVector(mulRes);
-   Value *xy = m_builder.CreateAdd(vec[0], vec[1], name("xy"));
-   Value *xyz = m_builder.CreateAdd(xy, vec[2], name("xyz"));
-   Value *dot4 = m_builder.CreateAdd(xyz, vec[3], name("dot4"));
-   return vectorFromVals(dot4, dot4, dot4, dot4);
-}
-
-llvm::Value * Instructions::dph(llvm::Value *in1, llvm::Value *in2)
-{
-   Value *mulRes = mul(in1, in2);
-   std::vector<llvm::Value*> vec1 = extractVector(mulRes);
-   Value *xy = m_builder.CreateAdd(vec1[0], vec1[1], name("xy"));
-   Value *xyz = m_builder.CreateAdd(xy, vec1[2], name("xyz"));
-   Value *dph = m_builder.CreateAdd(xyz, vec1[3], name("dph"));
-   return vectorFromVals(dph, dph, dph, dph);
-}
-
-llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2)
-{
-   Value *y1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(1),
-                                              name("y1"));
-   Value *z = m_builder.CreateExtractElement(in1,
-                                             m_storage->constantInt(2),
-                                             name("z"));
-   Value *y2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(1),
-                                              name("y2"));
-   Value *w = m_builder.CreateExtractElement(in2,
-                                             m_storage->constantInt(3),
-                                             name("w"));
-   Value *ry = m_builder.CreateMul(y1, y2, name("tyuy"));
-   return vectorFromVals(ConstantFP::get(APFloat(1.f)),
-                         ry, z, w);
-}
-
-void Instructions::elseop()
-{
-   assert(!m_ifStack.empty());
-   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
-   m_builder.CreateBr(ifend);
-   m_builder.SetInsertPoint(m_ifStack.top());
-   currentBlock()->setName(name("ifelse"));
-   m_ifStack.pop();
-   m_ifStack.push(ifend);
-}
-
-void Instructions::endif()
-{
-   assert(!m_ifStack.empty());
-   m_builder.CreateBr(m_ifStack.top());
-   m_builder.SetInsertPoint(m_ifStack.top());
-   m_ifStack.pop();
-}
-
-void Instructions::endLoop()
-{
-   assert(!m_loopStack.empty());
-   Loop loop = m_loopStack.top();
-   m_builder.CreateBr(loop.begin);
-   loop.end->moveAfter(currentBlock());
-   m_builder.SetInsertPoint(loop.end);
-   m_loopStack.pop();
-}
-
-void Instructions::end()
-{
-   m_builder.CreateRetVoid();
-}
-
-void Instructions::endSub()
-{
-   m_func = 0;
-   m_builder.SetInsertPoint(0);
-}
-
-llvm::Value * Instructions::exp(llvm::Value *in)
-{
-   std::vector<llvm::Value*> vec = extractVector(in);
-   return vectorFromVals(callFExp(vec[0]), callFExp(vec[1]),
-                             callFExp(vec[2]), callFExp(vec[3]));
-}
-
-llvm::Value * Instructions::ex2(llvm::Value *in)
-{
-   llvm::Value *val = callPow(ConstantFP::get(APFloat(2.f)),
-                              m_builder.CreateExtractElement(
-                                 in, m_storage->constantInt(0),
-                                 name("x1")));
-   return vectorFromVals(val, val, val, val);
-}
-
-llvm::Value * Instructions::floor(llvm::Value *in)
-{
-   std::vector<llvm::Value*> vec = extractVector(in);
-   return vectorFromVals(callFloor(vec[0]), callFloor(vec[1]),
-                         callFloor(vec[2]), callFloor(vec[3]));
-}
-
-llvm::Value * Instructions::frc(llvm::Value *in)
-{
-   llvm::Value *flr = floor(in);
-   return sub(in, flr);
-}
-
-void Instructions::ifop(llvm::Value *in)
-{
-   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
-   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
-
-   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
-   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
-   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
-
-   Constant *float0 = Constant::getNullValue(Type::FloatTy);
-
-   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
-   m_builder.CreateCondBr(xcmp, ifthen, ifend);
-   //m_builder.SetInsertPoint(yblock);
-
-   m_builder.SetInsertPoint(ifthen);
-   m_ifStack.push(ifend);
-}
-
-llvm::Value * Instructions::kil(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("kil");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
-{
-   llvm::Value *m = mul(in1, in2);
-   llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
-   llvm::Value *s = sub(vec1, in1);
-   return add(m, mul(s, in3));
-}
-
-llvm::Value * Instructions::lg2(llvm::Value *in)
-{
-   std::vector<llvm::Value*> vec = extractVector(in);
-   llvm::Value *const_vec = constVector(1.442695f, 1.442695f,
-                                        1.442695f, 1.442695f);
-   return mul(vectorFromVals(callFLog(vec[0]), callFLog(vec[1]),
-                             callFLog(vec[2]), callFLog(vec[3])), const_vec);
-}
-
-llvm::Value * Instructions::lit(llvm::Value *in)
-{
-   if (!m_llvmLit) {
-      m_llvmLit = m_mod->getFunction("lit");
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::log(llvm::Value *in)
-{
-   std::vector<llvm::Value*> vec = extractVector(in);
-   return vectorFromVals(callFLog(vec[0]), callFLog(vec[1]),
-                             callFLog(vec[2]), callFLog(vec[3]));
-}
-
-llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
-{
-   Value *mulRes = mul(in1, in2);
-   return add(mulRes, in3);
-}
-
-llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
-{
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-
-   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
-                                          name("xcmp"));
-   Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
-                                        name("selx"));
-
-   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
-                                          name("ycmp"));
-   Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
-                                        name("sely"));
-
-   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
-                                          name("zcmp"));
-   Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
-                                        name("selz"));
-
-   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
-                                          name("wcmp"));
-   Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
-                                        name("selw"));
-
-   return vectorFromVals(selx, sely, selz, selw);
-}
-
-llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
-{
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-
-   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
-   Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
-                                        name("selx"));
-
-   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
-   Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
-                                        name("sely"));
-
-   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
-   Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
-                                        name("selz"));
-
-   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
-   Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
-                                        name("selw"));
-
-   return vectorFromVals(selx, sely, selz, selw);
-}
-
-llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
-{
-   return m_builder.CreateMul(in1, in2, name("mul"));
-}
-
-llvm::Value * Instructions::neg(llvm::Value *in)
-{
-   Value *neg = m_builder.CreateNeg(in, name("neg"));
-   return neg;
-}
-
-llvm::Value * Instructions::nrm(llvm::Value *in)
-{
-   llvm::Value *v = rsq(in);
-   return mul(v, in);
-}
-
-llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
-{
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *x2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(0),
-                                              name("x2"));
-   llvm::Value *val = callPow(x1, x2);
-   return vectorFromVals(val, val, val, val);
-}
-
-llvm::Value * Instructions::rcp(llvm::Value *in1)
-{
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                     x1, name("rcp"));
-   return vectorFromVals(res, res, res, res);
-}
-
-llvm::Value * Instructions::rsq(llvm::Value *in1)
-{
-   Value *x = m_builder.CreateExtractElement(in1,
-                                             m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *abs  = callFAbs(x);
-   Value *sqrt = callFSqrt(abs);
-
-   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                       sqrt,
-                                       name("rsqrt"));
-   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
-}
-
-llvm::Value * Instructions::scs(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("scs");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::seq(llvm::Value *in1, llvm::Value *in2)
-{
-   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
-   Constant *const0f = Constant::getNullValue(Type::FloatTy);
-
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-
-   Value *xcmp = m_builder.CreateFCmpOEQ(vec1[0], vec2[0], name("xcmp"));
-   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
-
-   Value *ycmp = m_builder.CreateFCmpOEQ(vec1[1], vec2[1], name("ycmp"));
-   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
-
-   Value *zcmp = m_builder.CreateFCmpOEQ(vec1[2], vec2[2], name("zcmp"));
-   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
-
-   Value *wcmp = m_builder.CreateFCmpOEQ(vec1[3], vec2[3], name("wcmp"));
-   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
-
-   return vectorFromVals(x, y, z, w);
-}
-
-llvm::Value * Instructions::sfl(llvm::Value *in1, llvm::Value *in2)
-{
-   Constant *const0f = Constant::getNullValue(Type::FloatTy);
-
-   return vectorFromVals(const0f, const0f, const0f, const0f);
-}
-
-llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
-{
-   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
-   Constant *const0f = Constant::getNullValue(Type::FloatTy);
-
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-
-   Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], vec2[0], name("xcmp"));
-   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
-
-   Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], vec2[1], name("ycmp"));
-   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
-
-   Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], vec2[2], name("zcmp"));
-   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
-
-   Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], vec2[3], name("wcmp"));
-   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
-
-   return vectorFromVals(x, y, z, w);
-}
-
-llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
-{
-   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
-   Constant *const0f = Constant::getNullValue(Type::FloatTy);
-
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-   Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp"));
-   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
-
-   Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp"));
-   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
-
-   Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp"));
-   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
-
-   Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp"));
-   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
-
-   return vectorFromVals(x, y, z, w);
-}
-
-llvm::Value * Instructions::sin(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("vsin");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::sle(llvm::Value *in1, llvm::Value *in2)
-{
-   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
-   Constant *const0f = Constant::getNullValue(Type::FloatTy);
-
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-
-   Value *xcmp = m_builder.CreateFCmpOLE(vec1[0], vec2[0], name("xcmp"));
-   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
-
-   Value *ycmp = m_builder.CreateFCmpOLE(vec1[1], vec2[1], name("ycmp"));
-   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
-
-   Value *zcmp = m_builder.CreateFCmpOLE(vec1[2], vec2[2], name("zcmp"));
-   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
-
-   Value *wcmp = m_builder.CreateFCmpOLE(vec1[3], vec2[3], name("wcmp"));
-   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
-
-   return vectorFromVals(x, y, z, w);
-}
-
-llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
-{
-   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
-   Constant *const0f = Constant::getNullValue(Type::FloatTy);
-
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-
-   Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
-   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
-
-   Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
-   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
-
-   Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
-   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
-
-   Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
-   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
-
-   return vectorFromVals(x, y, z, w);
-}
-
-llvm::Value * Instructions::sne(llvm::Value *in1, llvm::Value *in2)
-{
-   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
-   Constant *const0f = Constant::getNullValue(Type::FloatTy);
-
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-
-   Value *xcmp = m_builder.CreateFCmpONE(vec1[0], vec2[0], name("xcmp"));
-   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
-
-   Value *ycmp = m_builder.CreateFCmpONE(vec1[1], vec2[1], name("ycmp"));
-   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
-
-   Value *zcmp = m_builder.CreateFCmpONE(vec1[2], vec2[2], name("zcmp"));
-   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
-
-   Value *wcmp = m_builder.CreateFCmpONE(vec1[3], vec2[3], name("wcmp"));
-   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
-
-   return vectorFromVals(x, y, z, w);
-}
-
-llvm::Value * Instructions::str(llvm::Value *in1, llvm::Value *in2)
-{
-   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
-
-   return vectorFromVals(const1f, const1f, const1f, const1f);
-}
-
-llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
-{
-   Value *res = m_builder.CreateSub(in1, in2, name("sub"));
-   return res;
-}
-
-llvm::Value * Instructions::trunc(llvm::Value *in)
-{
-   std::vector<llvm::Value*> vec = extractVector(in);
-   Value *icastx = m_builder.CreateFPToSI(vec[0], IntegerType::get(32),
-                                          name("ftoix"));
-   Value *icasty = m_builder.CreateFPToSI(vec[1], IntegerType::get(32),
-                                          name("ftoiy"));
-   Value *icastz = m_builder.CreateFPToSI(vec[2], IntegerType::get(32),
-                                          name("ftoiz"));
-   Value *icastw = m_builder.CreateFPToSI(vec[3], IntegerType::get(32),
-                                          name("ftoiw"));
-   Value *fx = m_builder.CreateSIToFP(icastx, Type::FloatTy,
-                                      name("fx"));
-   Value *fy = m_builder.CreateSIToFP(icasty, Type::FloatTy,
-                                      name("fy"));
-   Value *fz = m_builder.CreateSIToFP(icastz, Type::FloatTy,
-                                      name("fz"));
-   Value *fw = m_builder.CreateSIToFP(icastw, Type::FloatTy,
-                                      name("fw"));
-   return vectorFromVals(fx, fy, fz, fw);
-}
-
-llvm::Value * Instructions::x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
-   std::vector<llvm::Value*> vec1 = extractVector(in1);
-   std::vector<llvm::Value*> vec2 = extractVector(in2);
-   std::vector<llvm::Value*> vec3 = extractVector(in3);
-
-   Value *x2x3 = m_builder.CreateMul( vec2[0], vec3[0], name("x2x3"));
-   Value *y2y3 = m_builder.CreateMul( vec2[1], vec3[1], name("y2y3"));
-   Value *x1px2x3 = m_builder.CreateAdd (vec1[0], x2x3, name("x1 + x2x3"));
-   Value *x1px2x3py2y3 = m_builder.CreateAdd (x1px2x3, y2y3, name("x1 + x2x3 + y2y3"));
-
-   Value *x2z3 = m_builder.CreateMul( vec2[0], vec3[2], name("x2z3"));
-   Value *y2w3 = m_builder.CreateMul( vec2[1], vec3[3], name("y2w3"));
-   Value *y1px2z3 = m_builder.CreateAdd (vec1[1], x2z3, name("y1 + x2z3"));
-   Value *y1px2z3py2w3 = m_builder.CreateAdd (y1px2z3, y2w3, name("y1 + x2z3 + y2w3"));
-
-   return vectorFromVals(x1px2x3py2y3, y1px2z3py2w3, x1px2x3py2y3, y1px2z3py2w3);
-}
-
-void Instructions::printVector(llvm::Value *val)
-{
-   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
-
-   if (!m_fmtPtr) {
-      Constant *format = ConstantArray::get(frmt, true);
-      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
-      GlobalVariable* globalFormat = new GlobalVariable(
-         /*Type=*/arrayTy,
-         /*isConstant=*/true,
-         /*Linkage=*/GlobalValue::InternalLinkage,
-         /*Initializer=*/0, // has initializer, specified below
-         /*Name=*/name(".str"),
-         m_mod);
-      globalFormat->setInitializer(format);
-
-      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
-      std::vector<Constant*> const_ptr_21_indices;
-      const_ptr_21_indices.push_back(const_int0);
-      const_ptr_21_indices.push_back(const_int0);
-      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
-                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
-   }
-
-   Function *func_printf = m_mod->getFunction("printf");
-   if (!func_printf)
-      func_printf = declarePrintf();
-   assert(func_printf);
-   std::vector<llvm::Value*> vec = extractVector(val);
-   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
-   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
-   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
-   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
-   std::vector<Value*> params;
-   params.push_back(m_fmtPtr);
-   params.push_back(dx);
-   params.push_back(dy);
-   params.push_back(dz);
-   params.push_back(dw);
-   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
-                                         name("printf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(true);
-}
-
-const char * Instructions::name(const char *prefix)
-{
-   ++m_idx;
-   snprintf(m_name, 32, "%s%d", prefix, m_idx);
-   return m_name;
-}
-
-llvm::Value * Instructions::callCeil(llvm::Value *val)
-{
-   if (!m_llvmCeil) {
-      // predeclare the intrinsic
-      std::vector<const Type*> ceilArgs;
-      ceilArgs.push_back(Type::FloatTy);
-      AttrListPtr ceilPal;
-      FunctionType* ceilType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/ceilArgs,
-         /*isVarArg=*/false);
-      m_llvmCeil = Function::Create(
-         /*Type=*/ceilType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"ceilf", m_mod);
-      m_llvmCeil->setCallingConv(CallingConv::C);
-      m_llvmCeil->setAttributes(ceilPal);
-   }
-   CallInst *call =  m_builder.CreateCall(m_llvmCeil, val,
-                                          name("ceilf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value *Instructions::callFAbs(llvm::Value *val)
-{
-   if (!m_llvmFAbs) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fabsArgs;
-      fabsArgs.push_back(Type::FloatTy);
-      AttrListPtr fabsPal;
-      FunctionType* fabsType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fabsArgs,
-         /*isVarArg=*/false);
-      m_llvmFAbs = Function::Create(
-         /*Type=*/fabsType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"fabs", m_mod);
-      m_llvmFAbs->setCallingConv(CallingConv::C);
-      m_llvmFAbs->setAttributes(fabsPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
-                                         name("fabs"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::callFExp(llvm::Value *val)
-{
-   if (!m_llvmFexp) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fexpArgs;
-      fexpArgs.push_back(Type::FloatTy);
-      AttrListPtr fexpPal;
-      FunctionType* fexpType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fexpArgs,
-         /*isVarArg=*/false);
-      m_llvmFexp = Function::Create(
-         /*Type=*/fexpType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"expf", m_mod);
-      m_llvmFexp->setCallingConv(CallingConv::C);
-      m_llvmFexp->setAttributes(fexpPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFexp, val,
-                                         name("expf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::callFLog(llvm::Value *val)
-{
-   if (!m_llvmFlog) {
-      // predeclare the intrinsic
-      std::vector<const Type*> flogArgs;
-      flogArgs.push_back(Type::FloatTy);
-      AttrListPtr flogPal;
-      FunctionType* flogType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/flogArgs,
-         /*isVarArg=*/false);
-      m_llvmFlog = Function::Create(
-         /*Type=*/flogType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"logf", m_mod);
-      m_llvmFlog->setCallingConv(CallingConv::C);
-      m_llvmFlog->setAttributes(flogPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
-                                         name("logf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::callFloor(llvm::Value *val)
-{
-   if (!m_llvmFloor) {
-      // predeclare the intrinsic
-      std::vector<const Type*> floorArgs;
-      floorArgs.push_back(Type::FloatTy);
-      AttrListPtr floorPal;
-      FunctionType* floorType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/floorArgs,
-         /*isVarArg=*/false);
-      m_llvmFloor = Function::Create(
-         /*Type=*/floorType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"floorf", m_mod);
-      m_llvmFloor->setCallingConv(CallingConv::C);
-      m_llvmFloor->setAttributes(floorPal);
-   }
-   CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
-                                          name("floorf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value *Instructions::callFSqrt(llvm::Value *val)
-{
-   if (!m_llvmFSqrt) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fsqrtArgs;
-      fsqrtArgs.push_back(Type::FloatTy);
-      AttrListPtr fsqrtPal;
-      FunctionType* fsqrtType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fsqrtArgs,
-         /*isVarArg=*/false);
-      m_llvmFSqrt = Function::Create(
-         /*Type=*/fsqrtType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.sqrt.f32", m_mod);
-      m_llvmFSqrt->setCallingConv(CallingConv::C);
-      m_llvmFSqrt->setAttributes(fsqrtPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
-                                         name("sqrt"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
-{
-   if (!m_llvmPow) {
-      // predeclare the intrinsic
-      std::vector<const Type*> powArgs;
-      powArgs.push_back(Type::FloatTy);
-      powArgs.push_back(Type::FloatTy);
-      AttrListPtr powPal;
-      FunctionType* powType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/powArgs,
-         /*isVarArg=*/false);
-      m_llvmPow = Function::Create(
-         /*Type=*/powType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.pow.f32", m_mod);
-      m_llvmPow->setCallingConv(CallingConv::C);
-      m_llvmPow->setAttributes(powPal);
-   }
-   std::vector<Value*> params;
-   params.push_back(val1);
-   params.push_back(val2);
-   CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
-                                         name("pow"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
-                                           llvm::Value *z, llvm::Value *w)
-{
-   Constant *const_vec = Constant::getNullValue(m_floatVecType);
-   Value *res = m_builder.CreateInsertElement(const_vec, x,
-                                              m_storage->constantInt(0),
-                                              name("vecx"));
-   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
-                               name("vecxy"));
-   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
-                               name("vecxyz"));
-   if (w)
-      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
-                                          name("vecxyzw"));
-   return res;
-}
-
-llvm::Value * Instructions::constVector(float x, float y, float z, float w)
-{
-   std::vector<Constant*> vec(4);
-   vec[0] = ConstantFP::get(APFloat(x));
-   vec[1] = ConstantFP::get(APFloat(y));
-   vec[2] = ConstantFP::get(APFloat(z));
-   vec[3] = ConstantFP::get(APFloat(w));
-   return ConstantVector::get(m_floatVecType, vec);
-}
-
-llvm::Function * Instructions::declarePrintf()
-{
-   std::vector<const Type*> args;
-   AttrListPtr params;
-   FunctionType* funcTy = FunctionType::get(
-      /*Result=*/IntegerType::get(32),
-      /*Params=*/args,
-      /*isVarArg=*/true);
-   Function* func_printf = Function::Create(
-      /*Type=*/funcTy,
-      /*Linkage=*/GlobalValue::ExternalLinkage,
-      /*Name=*/"printf", m_mod);
-   func_printf->setCallingConv(CallingConv::C);
-   func_printf->setAttributes(params);
-   return func_printf;
-}
-
-llvm::Function * Instructions::declareFunc(int label)
-{
-   PointerType *vecPtr = PointerType::getUnqual(m_floatVecType);
-   std::vector<const Type*> args;
-   args.push_back(vecPtr);
-   args.push_back(vecPtr);
-   args.push_back(vecPtr);
-   args.push_back(vecPtr);
-   AttrListPtr params;
-   FunctionType *funcType = FunctionType::get(
-      /*Result=*/Type::VoidTy,
-      /*Params=*/args,
-      /*isVarArg=*/false);
-   std::string name = createFuncName(label);
-   Function *func = Function::Create(
-      /*Type=*/funcType,
-      /*Linkage=*/GlobalValue::ExternalLinkage,
-      /*Name=*/name.c_str(), m_mod);
-   func->setCallingConv(CallingConv::C);
-   func->setAttributes(params);
-   return func;
-}
-
-llvm::Function * Instructions::findFunction(int label)
-{
-   llvm::Function *func = m_functions[label];
-   if (!func) {
-      func = declareFunc(label);
-      m_functions[label] = func;
-   }
-   return func;
-}
-
-std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
-{
-   std::vector<llvm::Value*> elems(4);
-   elems[0] = m_builder.CreateExtractElement(vec, m_storage->constantInt(0),
-                                             name("x"));
-   elems[1] = m_builder.CreateExtractElement(vec, m_storage->constantInt(1),
-                                             name("y"));
-   elems[2] = m_builder.CreateExtractElement(vec, m_storage->constantInt(2),
-                                             name("z"));
-   elems[3] = m_builder.CreateExtractElement(vec, m_storage->constantInt(3),
-                                             name("w"));
-   return elems;
-}
-
-
-#endif //MESA_LLVM
-
-
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
deleted file mode 100644
index e18571251ee..00000000000
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ /dev/null
@@ -1,175 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Zack Rusin [email protected]
-  */
-
-#ifndef INSTRUCTIONS_H
-#define INSTRUCTIONS_H
-
-#include <llvm/BasicBlock.h>
-#include <llvm/Module.h>
-#include <llvm/Value.h>
-#include <llvm/Support/IRBuilder.h>
-
-#include <map>
-#include <stack>
-
-namespace llvm {
-   class VectorType;
-   class Function;
-}
-
-class Storage;
-
-class Instructions
-{
-public:
-   Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicBlock *block,
-                Storage *storage);
-
-   llvm::BasicBlock *currentBlock() const;
-
-   llvm::Value *abs(llvm::Value *in1);
-   llvm::Value *add(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *arl(llvm::Value *in1);
-   void         beginLoop();
-   void         bgnSub(unsigned);
-   void         brk();
-   void         cal(int label, llvm::Value *input);
-   llvm::Value *ceil(llvm::Value *in);
-   llvm::Value *clamp(llvm::Value *in);
-   llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
-   llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
-   llvm::Value *cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
-   llvm::Value *cos(llvm::Value *in);
-   llvm::Value *cross(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *ddx(llvm::Value *in);
-   llvm::Value *ddy(llvm::Value *in);
-   llvm::Value *div(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3);
-   llvm::Value *dp2(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *dph(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *dst(llvm::Value *in1, llvm::Value *in2);
-   void         elseop();
-   void         endif();
-   void         endLoop();
-   void         end();
-   void         endSub();
-   llvm::Value *exp(llvm::Value *in);
-   llvm::Value *ex2(llvm::Value *in);
-   llvm::Value *floor(llvm::Value *in);
-   llvm::Value *frc(llvm::Value *in);
-   void         ifop(llvm::Value *in);
-   llvm::Value *kil(llvm::Value *in);
-   llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2,
-                     llvm::Value *in3);
-   llvm::Value *lg2(llvm::Value *in);
-   llvm::Value *lit(llvm::Value *in);
-   llvm::Value *log(llvm::Value *in);
-   llvm::Value *madd(llvm::Value *in1, llvm::Value *in2,
-                     llvm::Value *in3);
-   llvm::Value *max(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *mul(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *neg(llvm::Value *in);
-   llvm::Value *nrm(llvm::Value *in);
-   llvm::Value *pow(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *rcp(llvm::Value *in);
-   llvm::Value *rsq(llvm::Value *in);
-   llvm::Value *scs(llvm::Value *in);
-   llvm::Value *seq(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *sfl(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *sge(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *sin(llvm::Value *in);
-   llvm::Value *sle(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *slt(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *sne(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *str(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *sub(llvm::Value *in1, llvm::Value *in2);
-   llvm::Value *trunc(llvm::Value *in);
-   llvm::Value *x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
-
-   void printVector(llvm::Value *val);
-private:
-   const char *name(const char *prefix);
-
-   llvm::Value *callCeil(llvm::Value *val);
-   llvm::Value *callFAbs(llvm::Value *val);
-   llvm::Value *callFExp(llvm::Value *val);
-   llvm::Value *callFLog(llvm::Value *val);
-   llvm::Value *callFloor(llvm::Value *val);
-   llvm::Value *callFSqrt(llvm::Value *val);
-   llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2);
-
-   llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
-                               llvm::Value *z, llvm::Value *w=0);
-
-   llvm::Value *constVector(float x, float y, float z, float w);
-
-   llvm::Function *declarePrintf();
-   llvm::Function *declareFunc(int label);
-
-   llvm::Function *findFunction(int label);
-
-   std::vector<llvm::Value*> extractVector(llvm::Value *vec);
-private:
-   llvm::Module             *m_mod;
-   llvm::Function           *m_func;
-   char                      m_name[32];
-   llvm::IRBuilder<>         m_builder;
-   int                       m_idx;
-
-   llvm::VectorType *m_floatVecType;
-
-   llvm::Function   *m_llvmCeil;
-   llvm::Function   *m_llvmFSqrt;
-   llvm::Function   *m_llvmFAbs;
-   llvm::Function   *m_llvmPow;
-   llvm::Function   *m_llvmFloor;
-   llvm::Function   *m_llvmFlog;
-   llvm::Function   *m_llvmFexp;
-   llvm::Function   *m_llvmLit;
-
-   llvm::Constant   *m_fmtPtr;
-
-   std::stack<llvm::BasicBlock*> m_ifStack;
-   struct Loop {
-      llvm::BasicBlock *begin;
-      llvm::BasicBlock *end;
-   };
-   std::stack<Loop> m_loopStack;
-   std::map<int, llvm::Function*> m_functions;
-   Storage *m_storage;
-};
-
-#endif
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
deleted file mode 100644
index 721b7d2d833..00000000000
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ /dev/null
@@ -1,525 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include <cstdio>
-#include "instructionssoa.h"
-
-#include "storagesoa.h"
-
-#include "pipe/p_shader_tokens.h"
-#include "util/u_memory.h"
-
-#include <llvm/CallingConv.h>
-#include <llvm/Constants.h>
-#include <llvm/Module.h>
-#include <llvm/Function.h>
-#include <llvm/Instructions.h>
-#include <llvm/Transforms/Utils/Cloning.h>
-#include <llvm/Attributes.h>
-#include <llvm/Support/MemoryBuffer.h>
-#include <llvm/Bitcode/ReaderWriter.h>
-
-
-#include <iostream>
-
-
-/* disable some warnings. this file is autogenerated */
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wunused-variable"
-#endif
-using namespace llvm;
-#include "gallivmsoabuiltins.cpp"
-#if defined(__GNUC__)
-#pragma GCC diagnostic warning "-Wunused-variable"
-#endif
-
-InstructionsSoa::InstructionsSoa(llvm::Module *mod, llvm::Function *func,
-                                 llvm::BasicBlock *block, StorageSoa *storage)
-   : m_builder(block),
-     m_storage(storage),
-     m_idx(0)
-{
-   createFunctionMap();
-   createBuiltins();
-}
-
-const char * InstructionsSoa::name(const char *prefix) const
-{
-   ++m_idx;
-   snprintf(m_name, 32, "%s%d", prefix, m_idx);
-   return m_name;
-}
-
-llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y,
-                                              llvm::Value *z, llvm::Value *w)
-{
-   VectorType  *vectorType = VectorType::get(Type::FloatTy, 4);
-   Constant *constVector = Constant::getNullValue(vectorType);
-   Value *res = m_builder.CreateInsertElement(constVector, x,
-                                              m_storage->constantInt(0),
-                                              name("vecx"));
-   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
-                               name("vecxy"));
-   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
-                               name("vecxyz"));
-   if (w)
-      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
-                                          name("vecxyzw"));
-   return res;
-}
-
-void InstructionsSoa::end()
-{
-   m_builder.CreateRetVoid();
-}
-
-std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
-{
-   std::vector<llvm::Value*> res(4);
-   res[0] = m_builder.CreateExtractElement(vector,
-                                           m_storage->constantInt(0),
-                                           name("extract1X"));
-   res[1] = m_builder.CreateExtractElement(vector,
-                                           m_storage->constantInt(1),
-                                           name("extract2X"));
-   res[2] = m_builder.CreateExtractElement(vector,
-                                           m_storage->constantInt(2),
-                                           name("extract3X"));
-   res[3] = m_builder.CreateExtractElement(vector,
-                                           m_storage->constantInt(3),
-                                           name("extract4X"));
-
-   return res;
-}
-
-llvm::IRBuilder<>* InstructionsSoa::getIRBuilder()
-{
-   return &m_builder;
-}
-
-void InstructionsSoa::createFunctionMap()
-{
-   m_functionsMap[TGSI_OPCODE_ABS]   = "abs";
-   m_functionsMap[TGSI_OPCODE_DP3]   = "dp3";
-   m_functionsMap[TGSI_OPCODE_DP4]   = "dp4";
-   m_functionsMap[TGSI_OPCODE_MIN]   = "min";
-   m_functionsMap[TGSI_OPCODE_MAX]   = "max";
-   m_functionsMap[TGSI_OPCODE_POW]   = "pow";
-   m_functionsMap[TGSI_OPCODE_LIT]   = "lit";
-   m_functionsMap[TGSI_OPCODE_RSQ]   = "rsq";
-   m_functionsMap[TGSI_OPCODE_SLT]   = "slt";
-}
-
-void InstructionsSoa::createDependencies()
-{
-   {
-      std::vector<std::string> powDeps(2);
-      powDeps[0] = "powf";
-      powDeps[1] = "powvec";
-      m_builtinDependencies["pow"] = powDeps;
-   }
-   {
-      std::vector<std::string> absDeps(2);
-      absDeps[0] = "fabsf";
-      absDeps[1] = "absvec";
-      m_builtinDependencies["abs"] = absDeps;
-   }
-   {
-      std::vector<std::string> maxDeps(1);
-      maxDeps[0] = "maxvec";
-      m_builtinDependencies["max"] = maxDeps;
-   }
-   {
-      std::vector<std::string> minDeps(1);
-      minDeps[0] = "minvec";
-      m_builtinDependencies["min"] = minDeps;
-   }
-   {
-      std::vector<std::string> litDeps(4);
-      litDeps[0] = "minvec";
-      litDeps[1] = "maxvec";
-      litDeps[2] = "powf";
-      litDeps[3] = "powvec";
-      m_builtinDependencies["lit"] = litDeps;
-   }
-   {
-      std::vector<std::string> rsqDeps(4);
-      rsqDeps[0] = "sqrtf";
-      rsqDeps[1] = "sqrtvec";
-      rsqDeps[2] = "fabsf";
-      rsqDeps[3] = "absvec";
-      m_builtinDependencies["rsq"] = rsqDeps;
-   }
-}
-
-llvm::Function * InstructionsSoa::function(int op)
-{
-    if (m_functions.find(op) != m_functions.end())
-       return m_functions[op];
-
-    std::string name = m_functionsMap[op];
-
-    std::cout <<"For op = "<<op<<", func is '"<<name<<"'"<<std::endl;
-
-    std::vector<std::string> deps = m_builtinDependencies[name];
-    for (unsigned int i = 0; i < deps.size(); ++i) {
-       llvm::Function *func = m_builtins->getFunction(deps[i]);
-       std::cout <<"\tinjecting dep = '"<<func->getName()<<"'"<<std::endl;
-       injectFunction(func);
-    }
-
-    llvm::Function *originalFunc = m_builtins->getFunction(name);
-    injectFunction(originalFunc, op);
-    return m_functions[op];
-}
-
-llvm::Module * InstructionsSoa::currentModule() const
-{
-   BasicBlock *block = m_builder.GetInsertBlock();
-   if (!block || !block->getParent())
-      return 0;
-
-   return block->getParent()->getParent();
-}
-
-void InstructionsSoa::createBuiltins()
-{
-   std::string ErrMsg;
-   MemoryBuffer *buffer = MemoryBuffer::getMemBuffer(
-      (const char*)&soabuiltins_data[0],
-      (const char*)&soabuiltins_data[Elements(soabuiltins_data) - 1]);
-   m_builtins = ParseBitcodeFile(buffer, &ErrMsg);
-   std::cout<<"Builtins created at "<<m_builtins<<" ("<<ErrMsg<<")"<<std::endl;
-   assert(m_builtins);
-   createDependencies();
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> in1)
-{
-   llvm::Function *func = function(TGSI_OPCODE_ABS);
-   return callBuiltin(func, in1);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
-   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
-   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
-   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
-{
-   std::vector<llvm::Value*> res(4);
-
-   //Extract x's
-   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
-                                                    m_storage->constantInt(0),
-                                                    name("extractX"));
-   //cast it to an unsigned int
-   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
-
-   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
-   //only x is valid. the others shouldn't be necessary
-   /*
-   res[1] = Constant::getNullValue(m_floatVecType);
-   res[2] = Constant::getNullValue(m_floatVecType);
-   res[3] = Constant::getNullValue(m_floatVecType);
-   */
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_DP3);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_LIT);
-   return callBuiltin(func, in);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
-                                                const std::vector<llvm::Value*> in2,
-                                                const std::vector<llvm::Value*> in3)
-{
-   std::vector<llvm::Value*> res = mul(in1, in2);
-   return add(res, in3);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MAX);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MIN);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
-   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
-   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
-   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_POW);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_RSQ);
-   return callBuiltin(func, in);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_SLT);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
-   res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
-   res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
-   res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
-
-   return res;
-}
-
-void checkFunction(Function *func)
-{
-   for (Function::const_iterator BI = func->begin(), BE = func->end();
-        BI != BE; ++BI) {
-      const BasicBlock &BB = *BI;
-      for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
-           II != IE; ++II) {
-         const Instruction &I = *II;
-         std::cout<< "Instr = "<<I;
-         for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
-            const Value *Op = I.getOperand(op);
-            std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
-            //I->setOperand(op, V);
-  }
-      }
-   }
-}
-
-llvm::Value * InstructionsSoa::allocaTemp()
-{
-   VectorType *vector   = VectorType::get(Type::FloatTy, 4);
-   ArrayType  *vecArray = ArrayType::get(vector, 4);
-   AllocaInst *alloca = new AllocaInst(vecArray, name("tmpRes"),
-                                       m_builder.GetInsertBlock());
-
-   std::vector<Value*> indices;
-   indices.push_back(m_storage->constantInt(0));
-   indices.push_back(m_storage->constantInt(0));
-   GetElementPtrInst *getElem = GetElementPtrInst::Create(alloca,
-                                                          indices.begin(),
-                                                          indices.end(),
-                                                          name("allocaPtr"),
-                                                          m_builder.GetInsertBlock());
-   return getElem;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::allocaToResult(llvm::Value *allocaPtr)
-{
-   GetElementPtrInst *xElemPtr =  GetElementPtrInst::Create(allocaPtr,
-                                                            m_storage->constantInt(0),
-                                                            name("xPtr"),
-                                                            m_builder.GetInsertBlock());
-   GetElementPtrInst *yElemPtr =  GetElementPtrInst::Create(allocaPtr,
-                                                            m_storage->constantInt(1),
-                                                            name("yPtr"),
-                                                            m_builder.GetInsertBlock());
-   GetElementPtrInst *zElemPtr =  GetElementPtrInst::Create(allocaPtr,
-                                                            m_storage->constantInt(2),
-                                                            name("zPtr"),
-                                                            m_builder.GetInsertBlock());
-   GetElementPtrInst *wElemPtr =  GetElementPtrInst::Create(allocaPtr,
-                                                            m_storage->constantInt(3),
-                                                            name("wPtr"),
-                                                            m_builder.GetInsertBlock());
-
-   std::vector<llvm::Value*> res(4);
-   res[0] = new LoadInst(xElemPtr, name("xRes"), false, m_builder.GetInsertBlock());
-   res[1] = new LoadInst(yElemPtr, name("yRes"), false, m_builder.GetInsertBlock());
-   res[2] = new LoadInst(zElemPtr, name("zRes"), false, m_builder.GetInsertBlock());
-   res[3] = new LoadInst(wElemPtr, name("wRes"), false, m_builder.GetInsertBlock());
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::dp4(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_DP4);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std::vector<llvm::Value*> in1)
-{
-   std::vector<Value*> params;
-
-   llvm::Value *allocaPtr = allocaTemp();
-   params.push_back(allocaPtr);
-   params.push_back(in1[0]);
-   params.push_back(in1[1]);
-   params.push_back(in1[2]);
-   params.push_back(in1[3]);
-   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end());
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-
-   return allocaToResult(allocaPtr);
-}
-
-std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std::vector<llvm::Value*> in1,
-                                                 const std::vector<llvm::Value*> in2)
-{
-   std::vector<Value*> params;
-
-   llvm::Value *allocaPtr = allocaTemp();
-   params.push_back(allocaPtr);
-   params.push_back(in1[0]);
-   params.push_back(in1[1]);
-   params.push_back(in1[2]);
-   params.push_back(in1[3]);
-   params.push_back(in2[0]);
-   params.push_back(in2[1]);
-   params.push_back(in2[2]);
-   params.push_back(in2[3]);
-   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end());
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-
-   return allocaToResult(allocaPtr);
-}
-
-std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std::vector<llvm::Value*> in1,
-                                                 const std::vector<llvm::Value*> in2,
-                                                 const std::vector<llvm::Value*> in3)
-{
-   std::vector<Value*> params;
-
-   llvm::Value *allocaPtr = allocaTemp();
-   params.push_back(allocaPtr);
-   params.push_back(in1[0]);
-   params.push_back(in1[1]);
-   params.push_back(in1[2]);
-   params.push_back(in1[3]);
-   params.push_back(in2[0]);
-   params.push_back(in2[1]);
-   params.push_back(in2[2]);
-   params.push_back(in2[3]);
-   params.push_back(in3[0]);
-   params.push_back(in3[1]);
-   params.push_back(in3[2]);
-   params.push_back(in3[3]);
-   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end());
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-
-   return allocaToResult(allocaPtr);
-}
-
-void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
-{
-   assert(originalFunc);
-   std::cout << "injecting function originalFunc " <<originalFunc->getName() <<std::endl;
-   if (op != TGSI_OPCODE_LAST) {
-      /* in this case it's possible the function has been already
-       * injected as part of the dependency chain, which gets
-       * injected below */
-      llvm::Function *func = currentModule()->getFunction(originalFunc->getName());
-      if (func) {
-         m_functions[op] = func;
-         return;
-      }
-   }
-   llvm::Function *func = 0;
-   if (originalFunc->isDeclaration()) {
-      func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage,
-                              originalFunc->getName(), currentModule());
-      func->setCallingConv(CallingConv::C);
-      const AttrListPtr pal;
-      func->setAttributes(pal);
-      currentModule()->dump();
-   } else {
-      DenseMap<const Value*, Value *> val;
-      val[m_builtins->getFunction("fabsf")] = currentModule()->getFunction("fabsf");
-      val[m_builtins->getFunction("powf")] = currentModule()->getFunction("powf");
-      val[m_builtins->getFunction("sqrtf")] = currentModule()->getFunction("sqrtf");
-      func = CloneFunction(originalFunc, val);
-#if 0
-      std::cout <<" replacing "<<m_builtins->getFunction("powf")
-                <<", with " <<currentModule()->getFunction("powf")<<std::endl;
-      std::cout<<"1111-------------------------------"<<std::endl;
-      checkFunction(originalFunc);
-      std::cout<<"2222-------------------------------"<<std::endl;
-      checkFunction(func);
-      std::cout <<"XXXX = " <<val[m_builtins->getFunction("powf")]<<std::endl;
-#endif
-      currentModule()->getFunctionList().push_back(func);
-   }
-   if (op != TGSI_OPCODE_LAST) {
-      m_functions[op] = func;
-   }
-}
-
-
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
deleted file mode 100644
index d6831e0a6b9..00000000000
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef INSTRUCTIONSSOA_H
-#define INSTRUCTIONSSOA_H
-
-#include <pipe/p_shader_tokens.h>
-#include <llvm/Support/IRBuilder.h>
-
-#include <map>
-#include <vector>
-
-namespace llvm {
-   class Module;
-   class Function;
-   class BasicBlock;
-   class Value;
-}
-class StorageSoa;
-
-class InstructionsSoa
-{
-public:
-   InstructionsSoa(llvm::Module *mod, llvm::Function *func,
-                   llvm::BasicBlock *block, StorageSoa *storage);
-
-   std::vector<llvm::Value*> abs(const std::vector<llvm::Value*> in1);
-   std::vector<llvm::Value*> arl(const std::vector<llvm::Value*> in);
-   std::vector<llvm::Value*> add(const std::vector<llvm::Value*> in1,
-                                 const std::vector<llvm::Value*> in2);
-   std::vector<llvm::Value*> dp3(const std::vector<llvm::Value*> in1,
-                                 const std::vector<llvm::Value*> in2);
-   std::vector<llvm::Value*> dp4(const std::vector<llvm::Value*> in1,
-                                 const std::vector<llvm::Value*> in2);
-   std::vector<llvm::Value*> lit(const std::vector<llvm::Value*> in);
-   std::vector<llvm::Value*> madd(const std::vector<llvm::Value*> in1,
-                                  const std::vector<llvm::Value*> in2,
-                                  const std::vector<llvm::Value*> in3);
-   std::vector<llvm::Value*> max(const std::vector<llvm::Value*> in1,
-                                 const std::vector<llvm::Value*> in2);
-   std::vector<llvm::Value*> min(const std::vector<llvm::Value*> in1,
-                                 const std::vector<llvm::Value*> in2);
-   std::vector<llvm::Value*> mul(const std::vector<llvm::Value*> in1,
-                                 const std::vector<llvm::Value*> in2);
-   std::vector<llvm::Value*> pow(const std::vector<llvm::Value*> in1,
-                                 const std::vector<llvm::Value*> in2);
-   std::vector<llvm::Value*> rsq(const std::vector<llvm::Value*> in1);
-   std::vector<llvm::Value*> slt(const std::vector<llvm::Value*> in1,
-                                 const std::vector<llvm::Value*> in2);
-   std::vector<llvm::Value*> sub(const std::vector<llvm::Value*> in1,
-                                 const std::vector<llvm::Value*> in2);
-   void         end();
-
-   std::vector<llvm::Value*> extractVector(llvm::Value *vector);
-   llvm::IRBuilder<>*  getIRBuilder();
-private:
-   const char * name(const char *prefix) const;
-   llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
-                               llvm::Value *z, llvm::Value *w);
-   void createFunctionMap();
-   void createBuiltins();
-   void createDependencies();
-   llvm::Function *function(int);
-   llvm::Module *currentModule() const;
-   llvm::Value *allocaTemp();
-   std::vector<llvm::Value*> allocaToResult(llvm::Value *allocaPtr);
-   std::vector<llvm::Value*> callBuiltin(llvm::Function *func,
-                                         const std::vector<llvm::Value*> in1);
-   std::vector<llvm::Value*> callBuiltin(llvm::Function *func,
-                                         const std::vector<llvm::Value*> in1,
-                                         const std::vector<llvm::Value*> in2);
-   std::vector<llvm::Value*> callBuiltin(llvm::Function *func,
-                                         const std::vector<llvm::Value*> in1,
-                                         const std::vector<llvm::Value*> in2,
-                                         const std::vector<llvm::Value*> in3);
-   void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST);
-private:
-   llvm::IRBuilder<>  m_builder;
-   StorageSoa *m_storage;
-
-   std::map<int, std::string> m_functionsMap;
-   std::map<int, llvm::Function*> m_functions;
-   llvm::Module *m_builtins;
-   std::map<std::string, std::vector<std::string> > m_builtinDependencies;
-
-private:
-   mutable int  m_idx;
-   mutable char m_name[32];
-};
-
-
-#endif
diff --git a/src/gallium/auxiliary/gallivm/llvm_builtins.c b/src/gallium/auxiliary/gallivm/llvm_builtins.c
deleted file mode 100644
index d5a003a48b2..00000000000
--- a/src/gallium/auxiliary/gallivm/llvm_builtins.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Zack Rusin [email protected]
-  */
-typedef __attribute__(( ext_vector_type(4) )) float float4;
-
-extern float powf(float a, float b);
-
-inline float approx(float a, float b)
-{
-    if (b < -128.0f) b = -128.0f;
-    if (b > 128.0f)   b = 128.0f;
-    if (a < 0) a = 0;
-    return powf(a, b);
-}
-
-inline float4 lit(float4 tmp)
-{
-    float4 result;
-    result.x = 1.0;
-    result.w = 1.0;
-    if (tmp.x > 0) {
-        result.y = tmp.x;
-        result.z = approx(tmp.y, tmp.w);
-    } else {
-        result.y = 0;
-        result.z = 0;
-    }
-    return result;
-}
-
-inline float4 cmp(float4 tmp0, float4 tmp1, float4 tmp2)
-{
-   float4 result;
-
-   result.x = (tmp0.x < 0.0) ? tmp1.x : tmp2.x;
-   result.y = (tmp0.y < 0.0) ? tmp1.y : tmp2.y;
-   result.z = (tmp0.z < 0.0) ? tmp1.z : tmp2.z;
-   result.w = (tmp0.w < 0.0) ? tmp1.w : tmp2.w;
-
-   return result;
-}
-
-extern float cosf(float  val);
-extern float sinf(float  val);
-
-inline float4 vcos(float4 val)
-{
-   float4 result;
-   printf("VEC IN   is %f %f %f %f\n", val.x, val.y, val.z, val.w);
-   result.x = cosf(val.x);
-   result.y = cosf(val.x);
-   result.z = cosf(val.x);
-   result.w = cosf(val.x);
-   printf("VEC OUT  is %f %f %f %f\n", result.x, result.y, result.z, result.w);
-   return result;
-}
-
-inline float4 scs(float4 val)
-{
-   float4 result;
-   float tmp = val.x;
-   result.x = cosf(tmp);
-   result.y = sinf(tmp);
-   return result;
-}
-
-
-inline float4 vsin(float4 val)
-{
-   float4 result;
-   float tmp = val.x;
-   float res = sinf(tmp);
-   result.x = res;
-   result.y = res;
-   result.z = res;
-   result.w = res;
-   return result;
-}
-
-inline int kil(float4 val)
-{
-   if (val.x < 0 || val.y < 0 || val.z < 0 || val.w < 0)
-      return 1;
-   else
-      return 0;
-}
diff --git a/src/gallium/auxiliary/gallivm/loweringpass.cpp b/src/gallium/auxiliary/gallivm/loweringpass.cpp
deleted file mode 100644
index 556dbec3661..00000000000
--- a/src/gallium/auxiliary/gallivm/loweringpass.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "loweringpass.h"
-
-using namespace llvm;
-
-char LoweringPass::ID = 0;
-RegisterPass<LoweringPass> X("lowering", "Lowering Pass");
-
-LoweringPass::LoweringPass()
-   :  ModulePass((intptr_t)&ID)
-{
-}
-
-bool LoweringPass::runOnModule(Module &m)
-{
-   llvm::cerr << "Hello: " << m.getModuleIdentifier() << "\n";
-   return false;
-}
diff --git a/src/gallium/auxiliary/gallivm/loweringpass.h b/src/gallium/auxiliary/gallivm/loweringpass.h
deleted file mode 100644
index f62dcf6ba73..00000000000
--- a/src/gallium/auxiliary/gallivm/loweringpass.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef LOWERINGPASS_H
-#define LOWERINGPASS_H
-
-#include "llvm/Pass.h"
-#include "llvm/Module.h"
-
-struct LoweringPass : public llvm::ModulePass
-{
-   static char ID;
-   LoweringPass();
-
-   virtual bool runOnModule(llvm::Module &m);
-};
-
-#endif
diff --git a/src/gallium/auxiliary/util/u_debug_dump.h b/src/gallium/auxiliary/gallivm/lp_bld_alpha.c
index 19b130ad183..7245730350c 100644
--- a/src/gallium/auxiliary/util/u_debug_dump.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_alpha.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2009 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,56 +22,42 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 /**
- * @file
- * Dump data in human/machine readable format.
- * 
+ * Alpha testing to LLVM IR translation.
+ *
  * @author Jose Fonseca <[email protected]>
  */
 
-#ifndef U_DEBUG_DUMP_H_
-#define U_DEBUG_DUMP_H_
-
-
-#include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_alpha.h"
 
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-
-const char *
-debug_dump_blend_factor(unsigned value, boolean shortened);
-
-const char *
-debug_dump_blend_func(unsigned value, boolean shortened);
 
-const char *
-debug_dump_func(unsigned value, boolean shortened);
+void
+lp_build_alpha_test(LLVMBuilderRef builder,
+                    const struct pipe_alpha_state *state,
+                    struct lp_type type,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef alpha,
+                    LLVMValueRef ref)
+{
+   struct lp_build_context bld;
 
-const char *
-debug_dump_tex_target(unsigned value, boolean shortened);
+   lp_build_context_init(&bld, builder, type);
 
-const char *
-debug_dump_tex_wrap(unsigned value, boolean shortened);
+   if(state->enabled) {
+      LLVMValueRef test = lp_build_cmp(&bld, state->func, alpha, ref);
 
-const char *
-debug_dump_tex_mipfilter(unsigned value, boolean shortened);
+      lp_build_name(test, "alpha_mask");
 
-const char *
-debug_dump_tex_filter(unsigned value, boolean shortened);
-
-
-/* FIXME: Move the other debug_dump_xxx functions out of u_debug.h into here. */
-
-
-#ifdef	__cplusplus
+      lp_build_mask_update(mask, test);
+   }
 }
-#endif
-
-#endif /* U_DEBUG_H_ */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_alpha.h b/src/gallium/auxiliary/gallivm/lp_bld_alpha.h
new file mode 100644
index 00000000000..634575670db
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_alpha.h
@@ -0,0 +1,54 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Alpha testing to LLVM IR translation.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#ifndef LP_BLD_ALPHA_H
+#define LP_BLD_ALPHA_H
+
+
+#include <llvm-c/Core.h>  
+
+struct pipe_alpha_state;
+struct lp_type;
+struct lp_build_mask_context;
+
+
+void
+lp_build_alpha_test(LLVMBuilderRef builder,
+                    const struct pipe_alpha_state *state,
+                    struct lp_type type,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef alpha,
+                    LLVMValueRef ref);
+
+
+#endif /* !LP_BLD_ALPHA_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
new file mode 100644
index 00000000000..32f9e5201c5
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -0,0 +1,1420 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper arithmetic functions.
+ *
+ * LLVM IR doesn't support all basic arithmetic operations we care about (most
+ * notably min/max and saturated operations), and it is often necessary to
+ * resort to machine-specific intrinsics directly. The functions here hide all
+ * these implementation details from the other modules.
+ *
+ * We also do simple expression simplification here. The reasons are:
+ * - it is very easy given we have all necessary information readily available
+ * - LLVM optimization passes fail to simplify several vector expressions
+ * - We often know value constraints which the optimization passes have no way
+ *   of knowing, such as when source arguments are known to be in [0, 1] range.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_memory.h"
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_string.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_arit.h"
+
+
+/**
+ * Generate min(a, b)
+ * No checks for special case values of a or b = 1 or 0 are done.
+ */
+static LLVMValueRef
+lp_build_min_simple(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   const char *intrinsic = NULL;
+   LLVMValueRef cond;
+
+   /* TODO: optimize the constant case */
+
+   if(type.width * type.length == 128) {
+      if(type.floating) {
+         if(type.width == 32 && util_cpu_caps.has_sse)
+            intrinsic = "llvm.x86.sse.min.ps";
+         if(type.width == 64 && util_cpu_caps.has_sse2)
+            intrinsic = "llvm.x86.sse2.min.pd";
+      }
+      else {
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
+            intrinsic = "llvm.x86.sse2.pminu.b";
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+            intrinsic = "llvm.x86.sse41.pminsb";
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+            intrinsic = "llvm.x86.sse41.pminuw";
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
+            intrinsic = "llvm.x86.sse2.pmins.w";
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+            intrinsic = "llvm.x86.sse41.pminud";
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+            intrinsic = "llvm.x86.sse41.pminsd";
+      }
+   }
+
+   if(intrinsic)
+      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
+
+   cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
+   return lp_build_select(bld, cond, a, b);
+}
+
+
+/**
+ * Generate max(a, b)
+ * No checks for special case values of a or b = 1 or 0 are done.
+ */
+static LLVMValueRef
+lp_build_max_simple(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   const char *intrinsic = NULL;
+   LLVMValueRef cond;
+
+   /* TODO: optimize the constant case */
+
+   if(type.width * type.length == 128) {
+      if(type.floating) {
+         if(type.width == 32 && util_cpu_caps.has_sse)
+            intrinsic = "llvm.x86.sse.max.ps";
+         if(type.width == 64 && util_cpu_caps.has_sse2)
+            intrinsic = "llvm.x86.sse2.max.pd";
+      }
+      else {
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
+            intrinsic = "llvm.x86.sse2.pmaxu.b";
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
+            intrinsic = "llvm.x86.sse41.pmaxsb";
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
+            intrinsic = "llvm.x86.sse41.pmaxuw";
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
+            intrinsic = "llvm.x86.sse2.pmaxs.w";
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
+            intrinsic = "llvm.x86.sse41.pmaxud";
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
+            intrinsic = "llvm.x86.sse41.pmaxsd";
+      }
+   }
+
+   if(intrinsic)
+      return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
+
+   cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
+   return lp_build_select(bld, cond, a, b);
+}
+
+
+/**
+ * Generate 1 - a, or ~a depending on bld->type.
+ */
+LLVMValueRef
+lp_build_comp(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   if(a == bld->one)
+      return bld->zero;
+   if(a == bld->zero)
+      return bld->one;
+
+   if(type.norm && !type.floating && !type.fixed && !type.sign) {
+      if(LLVMIsConstant(a))
+         return LLVMConstNot(a);
+      else
+         return LLVMBuildNot(bld->builder, a, "");
+   }
+
+   if(LLVMIsConstant(a))
+      return LLVMConstSub(bld->one, a);
+   else
+      return LLVMBuildSub(bld->builder, bld->one, a, "");
+}
+
+
+/**
+ * Generate a + b
+ */
+LLVMValueRef
+lp_build_add(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   if(a == bld->zero)
+      return b;
+   if(b == bld->zero)
+      return a;
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+
+   if(bld->type.norm) {
+      const char *intrinsic = NULL;
+
+      if(a == bld->one || b == bld->one)
+        return bld->one;
+
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
+         !type.floating && !type.fixed) {
+         if(type.width == 8)
+            intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
+         if(type.width == 16)
+            intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
+      }
+   
+      if(intrinsic)
+         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
+   }
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b))
+      res = LLVMConstAdd(a, b);
+   else
+      res = LLVMBuildAdd(bld->builder, a, b, "");
+
+   /* clamp to ceiling of 1.0 */
+   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
+      res = lp_build_min_simple(bld, res, bld->one);
+
+   /* XXX clamp to floor of -1 or 0??? */
+
+   return res;
+}
+
+
+/**
+ * Generate a - b
+ */
+LLVMValueRef
+lp_build_sub(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   if(b == bld->zero)
+      return a;
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+   if(a == b)
+      return bld->zero;
+
+   if(bld->type.norm) {
+      const char *intrinsic = NULL;
+
+      if(b == bld->one)
+        return bld->zero;
+
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
+         !type.floating && !type.fixed) {
+         if(type.width == 8)
+            intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
+         if(type.width == 16)
+            intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
+      }
+   
+      if(intrinsic)
+         return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
+   }
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b))
+      res = LLVMConstSub(a, b);
+   else
+      res = LLVMBuildSub(bld->builder, a, b, "");
+
+   if(bld->type.norm && (bld->type.floating || bld->type.fixed))
+      res = lp_build_max_simple(bld, res, bld->zero);
+
+   return res;
+}
+
+
+/**
+ * Normalized 8bit multiplication.
+ *
+ * - alpha plus one
+ *
+ *     makes the following approximation to the division (Sree)
+ *    
+ *       a*b/255 ~= (a*(b + 1)) >> 8
+ *    
+ *     which is the fastest method that satisfies the following OpenGL criteria
+ *    
+ *       0*0 = 0 and 255*255 = 255
+ *
+ * - geometric series
+ *
+ *     takes the geometric series approximation to the division
+ *
+ *       t/255 = (t >> 8) + (t >> 16) + (t >> 24) + ...
+ *
+ *     in this case just the first two terms to fit in 16bit arithmetic
+ *
+ *       t/255 ~= (t + (t >> 8)) >> 8
+ *
+ *     note that just by itself it doesn't satisfy the OpenGL criteria, as
+ *     255*255 = 254, so the special case b = 255 must be accounted for or
+ *     roundoff must be used
+ *
+ * - geometric series plus rounding
+ *
+ *     when using a geometric series division, instead of truncating the
+ *     result use roundoff in the approximation (Jim Blinn)
+ *
+ *       t/255 ~= (t + (t >> 8) + 0x80) >> 8
+ *
+ *     achieving the exact results
+ *
+ * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995, 
+ *     ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
+ * @sa Michael Herf, The "double blend trick", May 2000, 
+ *     http://www.stereopsis.com/doubleblend.html
+ */
+static LLVMValueRef
+lp_build_mul_u8n(LLVMBuilderRef builder,
+                 struct lp_type i16_type,
+                 LLVMValueRef a, LLVMValueRef b)
+{
+   LLVMValueRef c8;
+   LLVMValueRef ab;
+
+   c8 = lp_build_int_const_scalar(i16_type, 8);
+   
+#if 0
+   
+   /* a*b/255 ~= (a*(b + 1)) >> 256 */
+   b = LLVMBuildAdd(builder, b, lp_build_int_const_scalar(i16_type, 1), "");
+   ab = LLVMBuildMul(builder, a, b, "");
+
+#else
+   
+   /* ab/255 ~= (ab + (ab >> 8) + 0x80) >> 8 */
+   ab = LLVMBuildMul(builder, a, b, "");
+   ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c8, ""), "");
+   ab = LLVMBuildAdd(builder, ab, lp_build_int_const_scalar(i16_type, 0x80), "");
+
+#endif
+   
+   ab = LLVMBuildLShr(builder, ab, c8, "");
+
+   return ab;
+}
+
+
+/**
+ * Generate a * b
+ *
+ * Zero and one operands are folded without emitting any instruction.
+ * 8bit normalized integers are unpacked to the wider type, multiplied with
+ * lp_build_mul_u8n and packed again; other normalized integer widths are
+ * not implemented. Fixed point products are shifted right by half the
+ * element width after the multiplication.
+ */
+LLVMValueRef
+lp_build_mul(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef rshift = NULL;
+   LLVMValueRef product;
+
+   /* Trivial operands */
+   if(a == bld->zero || b == bld->zero)
+      return bld->zero;
+   if(a == bld->one)
+      return b;
+   if(b == bld->one)
+      return a;
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+
+   /* Normalized integers */
+   if(type.norm && !type.floating && !type.fixed) {
+      if(type.width == 8) {
+         struct lp_type wide_type = lp_wider_type(type);
+         LLVMValueRef a_lo, a_hi;
+         LLVMValueRef b_lo, b_hi;
+         LLVMValueRef prod_lo, prod_hi;
+
+         lp_build_unpack2(bld->builder, type, wide_type, a, &a_lo, &a_hi);
+         lp_build_unpack2(bld->builder, type, wide_type, b, &b_lo, &b_hi);
+
+         /* PMULLW, PSRLW, PADDW */
+         prod_lo = lp_build_mul_u8n(bld->builder, wide_type, a_lo, b_lo);
+         prod_hi = lp_build_mul_u8n(bld->builder, wide_type, a_hi, b_hi);
+
+         return lp_build_pack2(bld->builder, wide_type, type, prod_lo, prod_hi);
+      }
+
+      /* FIXME */
+      assert(0);
+   }
+
+   /* Fixed point results need renormalizing by half the element width */
+   if(type.fixed)
+      rshift = lp_build_int_const_scalar(type, type.width/2);
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
+      product = LLVMConstMul(a, b);
+      if(rshift)
+         product = type.sign ? LLVMConstAShr(product, rshift)
+                             : LLVMConstLShr(product, rshift);
+   }
+   else {
+      product = LLVMBuildMul(bld->builder, a, b, "");
+      if(rshift)
+         product = type.sign ? LLVMBuildAShr(bld->builder, product, rshift, "")
+                             : LLVMBuildLShr(bld->builder, product, rshift, "");
+   }
+
+   return product;
+}
+
+
+/**
+ * Small vector x scalar multiplication optimization.
+ *
+ * Multiplies 'a' by the integer immediate 'b', choosing a cheaper sequence
+ * than a generic multiply where one is coded below: 0 and 1 are folded,
+ * -1 becomes a negation, 2 becomes a + a for floating point types, and
+ * powers of two become a left shift for non floating point types.
+ * Anything else is a regular lp_build_mul by a constant built from 'b'.
+ */
+LLVMValueRef
+lp_build_mul_imm(struct lp_build_context *bld,
+                 LLVMValueRef a,
+                 int b)
+{
+   LLVMValueRef factor;
+
+   if(b == 0)
+      return bld->zero;
+
+   if(b == 1)
+      return a;
+
+   if(b == -1)
+      return LLVMBuildNeg(bld->builder, a, "");
+
+   if(b == 2 && bld->type.floating)
+      return lp_build_add(bld, a, a);
+
+   if(util_is_pot(b)) {
+      /* b is a power of two, so the index of its lowest set bit is log2(b) */
+      unsigned shift = ffs(b) - 1;
+
+      if(bld->type.floating) {
+         /*
+          * The exponent trick below is disabled, so floating point types
+          * fall out of this branch into the generic multiply at the end.
+          */
+#if 0
+         /*
+          * Power of two multiplication by directly manipulating the mantissa.
+          *
+          * XXX: This might not be always faster, it will introduce a small error
+          * for multiplication by zero, and it will produce wrong results
+          * for Inf and NaN.
+          */
+         unsigned mantissa = lp_mantissa(bld->type);
+         factor = lp_build_int_const_scalar(bld->type, (unsigned long long)shift << mantissa);
+         a = LLVMBuildBitCast(bld->builder, a, lp_build_int_vec_type(bld->type), "");
+         a = LLVMBuildAdd(bld->builder, a, factor, "");
+         a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(bld->type), "");
+         return a;
+#endif
+      }
+      else {
+         /* Non floating point types: a * 2^shift == a << shift */
+         factor = lp_build_const_scalar(bld->type, shift);
+         return LLVMBuildShl(bld->builder, a, factor, "");
+      }
+   }
+
+   /* Generic case */
+   factor = lp_build_const_scalar(bld->type, (double)b);
+   return lp_build_mul(bld, a, factor);
+}
+
+
+/**
+ * Generate a / b
+ *
+ * Trivial operands are folded first (0/b, 1/b, a/0, a/1, undef). Two
+ * constants are folded with a constant floating point division. With SSE
+ * and a 4 x 32bit type the division is emitted as a multiplication by the
+ * reciprocal; otherwise a floating point division instruction is built.
+ */
+LLVMValueRef
+lp_build_div(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef rcp_b;
+
+   if(a == bld->zero) {
+      return bld->zero;
+   }
+   else if(a == bld->one) {
+      return lp_build_rcp(bld, b);
+   }
+   else if(b == bld->zero) {
+      return bld->undef;
+   }
+   else if(b == bld->one) {
+      return a;
+   }
+   else if(a == bld->undef || b == bld->undef) {
+      return bld->undef;
+   }
+
+   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
+      return LLVMConstFDiv(a, b);
+   }
+
+   if(!(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)) {
+      return LLVMBuildFDiv(bld->builder, a, b, "");
+   }
+
+   /* 4 x 32bit with SSE: a * (1/b) */
+   rcp_b = lp_build_rcp(bld, b);
+   return lp_build_mul(bld, a, rcp_b);
+}
+
+
+/**
+ * Linear interpolation.
+ *
+ * Emits v0 + x * (v1 - v0) using the context's sub/mul/add helpers.
+ *
+ * This also works for integer values with a few caveats.
+ *
+ * @sa http://www.stereopsis.com/doubleblend.html
+ */
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1)
+{
+   LLVMValueRef span;
+   LLVMValueRef scaled;
+   LLVMValueRef blended;
+
+   span    = lp_build_sub(bld, v1, v0);
+   scaled  = lp_build_mul(bld, x, span);
+   blended = lp_build_add(bld, v0, scaled);
+
+   if(bld->type.fixed) {
+      /* XXX: This step is necessary for lerping 8bit colors stored on 16bits,
+       * but it will be wrong for other uses. Basically we need a more
+       * powerful lp_type, capable of further distinguishing the values
+       * interpretation from the value storage. */
+      LLVMValueRef low_half_mask =
+         lp_build_int_const_scalar(bld->type, (1 << bld->type.width/2) - 1);
+
+      blended = LLVMBuildAnd(bld->builder, blended, low_half_mask, "");
+   }
+
+   return blended;
+}
+
+
+/**
+ * Bilinear interpolation.
+ *
+ * Interpolates v00..v01 and v10..v11 with weight x, then interpolates the
+ * two results with weight y.
+ */
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11)
+{
+   LLVMValueRef row0;
+   LLVMValueRef row1;
+
+   row0 = lp_build_lerp(bld, x, v00, v01);
+   row1 = lp_build_lerp(bld, x, v10, v11);
+
+   return lp_build_lerp(bld, y, row0, row1);
+}
+
+
+/**
+ * Generate min(a, b)
+ *
+ * Folds the special cases (undef operand, identical operands and, for
+ * normalized types, the zero and one constants) before falling back to
+ * lp_build_min_simple.
+ */
+LLVMValueRef
+lp_build_min(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   if(a == bld->undef || b == bld->undef) {
+      return bld->undef;
+   }
+
+   if(a == b) {
+      return a;
+   }
+
+   if(!bld->type.norm) {
+      return lp_build_min_simple(bld, a, b);
+   }
+
+   /* Normalized: zero is the smallest value and one the largest */
+   if(a == bld->zero || b == bld->zero) {
+      return bld->zero;
+   }
+   if(a == bld->one) {
+      return b;
+   }
+   if(b == bld->one) {
+      return a;
+   }
+
+   return lp_build_min_simple(bld, a, b);
+}
+
+
+/**
+ * Generate max(a, b)
+ *
+ * Folds the special cases (undef operand, identical operands and, for
+ * normalized types, the zero and one constants) before falling back to
+ * lp_build_max_simple.
+ */
+LLVMValueRef
+lp_build_max(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   if(a == bld->undef || b == bld->undef) {
+      return bld->undef;
+   }
+
+   if(a == b) {
+      return a;
+   }
+
+   if(!bld->type.norm) {
+      return lp_build_max_simple(bld, a, b);
+   }
+
+   /* Normalized: one is the largest value and zero the smallest */
+   if(a == bld->one || b == bld->one) {
+      return bld->one;
+   }
+   if(a == bld->zero) {
+      return b;
+   }
+   if(b == bld->zero) {
+      return a;
+   }
+
+   return lp_build_max_simple(bld, a, b);
+}
+
+
+/**
+ * Generate clamp(a, min, max)
+ *
+ * Applies the upper bound first and the lower bound second, through
+ * lp_build_min / lp_build_max so their special cases are honoured.
+ */
+LLVMValueRef
+lp_build_clamp(struct lp_build_context *bld,
+               LLVMValueRef a,
+               LLVMValueRef min,
+               LLVMValueRef max)
+{
+   LLVMValueRef upper_bounded = lp_build_min(bld, a, max);
+
+   return lp_build_max(bld, upper_bounded, min);
+}
+
+
+/**
+ * Generate abs(a)
+ *
+ * Unsigned types are returned unchanged. Floating point types have their
+ * sign bit masked out. 128bit signed integer vectors use the SSSE3 PABS
+ * intrinsics when available; everything else is max(a, -a).
+ */
+LLVMValueRef
+lp_build_abs(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+
+   if(!type.sign)
+      return a;
+
+   if(type.floating) {
+      /* Mask out the sign bit */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      unsigned long long abs_bits = ~(1ULL << (type.width - 1));
+      LLVMValueRef abs_mask = lp_build_int_const_scalar(type, ((unsigned long long) abs_bits));
+      LLVMValueRef bits;
+
+      bits = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      bits = LLVMBuildAnd(bld->builder, bits, abs_mask, "");
+      return LLVMBuildBitCast(bld->builder, bits, vec_type, "");
+   }
+
+   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
+      const char *pabs = NULL;
+
+      if(type.width == 8)
+         pabs = "llvm.x86.ssse3.pabs.b.128";
+      else if(type.width == 16)
+         pabs = "llvm.x86.ssse3.pabs.w.128";
+      else if(type.width == 32)
+         pabs = "llvm.x86.ssse3.pabs.d.128";
+
+      if(pabs)
+         return lp_build_intrinsic_unary(bld->builder, pabs, vec_type, a);
+   }
+
+   return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
+}
+
+
+/**
+ * Generate -a
+ */
+LLVMValueRef
+lp_build_negate(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   LLVMValueRef negated = LLVMBuildNeg(bld->builder, a, "");
+
+   return negated;
+}
+
+
+/**
+ * Generate sgn(a)
+ *
+ * The result is built in two steps: first the sign of a non-zero value
+ * (always one for unsigned types, the sign bit of 'a' combined with the
+ * constant one for floating point types, a compare-and-select against zero
+ * for signed integers), then a final select that substitutes zero where
+ * 'a' compares equal to zero.
+ */
+LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMValueRef cond;
+   LLVMValueRef res;
+
+   /* Handle non-zero case */
+   if(!type.sign) {
+      /* if not zero then sign must be positive */
+      res = bld->one;
+   }
+   else if(type.floating) {
+      /* Take the sign bit and add it to 1 constant */
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef one;
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      one = LLVMConstBitCast(bld->one, int_vec_type);
+      res = LLVMBuildOr(bld->builder, sign, one, "");
+      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+   }
+   else
+   {
+      LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
+      cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
+      res = lp_build_select(bld, cond, bld->one, minus_one);
+   }
+
+   /*
+    * Handle zero: keep the sign computed above for non-zero elements.
+    * Selecting bld->one here instead of res would discard it and return
+    * +1 for negative inputs.
+    */
+   cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
+   res = lp_build_select(bld, cond, bld->zero, res);
+
+   return res;
+}
+
+
+/**
+ * Set the sign of float vector 'a' according to 'sign'.
+ * If sign==0, return abs(a).
+ * If sign==1, return -abs(a);
+ * Other values for sign produce undefined results.
+ *
+ * The sign bit of 'a' is cleared and replaced by 'sign' shifted into the
+ * most significant bit position.
+ */
+LLVMValueRef
+lp_build_set_sign(struct lp_build_context *bld,
+                  LLVMValueRef a, LLVMValueRef sign)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMValueRef sign_pos = lp_build_int_const_scalar(type, type.width - 1);
+   LLVMValueRef magnitude_mask = lp_build_int_const_scalar(type,
+                             ~((unsigned long long) 1 << (type.width - 1)));
+   LLVMValueRef magnitude;
+   LLVMValueRef sign_bit;
+   LLVMValueRef combined;
+
+   assert(type.floating);
+
+   /* reinterpret 'a' as integers and drop its sign bit */
+   magnitude = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+   magnitude = LLVMBuildAnd(bld->builder, magnitude, magnitude_mask, "");
+
+   /* move the requested sign into the sign bit position */
+   sign_bit = LLVMBuildShl(bld->builder, sign, sign_pos, "");
+
+   /* merge and reinterpret as float again */
+   combined = LLVMBuildOr(bld->builder, magnitude, sign_bit, "");
+   return LLVMBuildBitCast(bld->builder, combined, vec_type, "");
+}
+
+
+/**
+ * Convert vector of int to vector of float.
+ *
+ * The input is treated as signed; the result has the context's float
+ * vector type.
+ */
+LLVMValueRef
+lp_build_int_to_float(struct lp_build_context *bld,
+                      LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef float_vec_type;
+
+   assert(type.floating);
+
+   float_vec_type = lp_build_vec_type(type);
+
+   return LLVMBuildSIToFP(bld->builder, a, float_vec_type, "");
+}
+
+
+
+/**
+ * Rounding modes accepted by lp_build_round_sse41.
+ *
+ * The numeric value is passed unchanged as the 32bit immediate operand of
+ * the llvm.x86.sse41.round.* intrinsics, so the values must not be
+ * renumbered.
+ */
+enum lp_build_round_sse41_mode
+{
+   LP_BUILD_ROUND_SSE41_NEAREST = 0,
+   LP_BUILD_ROUND_SSE41_FLOOR = 1,
+   LP_BUILD_ROUND_SSE41_CEIL = 2,
+   LP_BUILD_ROUND_SSE41_TRUNCATE = 3
+};
+
+
+/**
+ * Round 'a' with the SSE4.1 round intrinsics.
+ *
+ * Requires a floating point type filling a 128bit vector and a CPU with
+ * SSE4.1 (all asserted). 32bit elements use round.ps, 64bit elements use
+ * round.pd; any other width asserts and yields undef.
+ */
+static INLINE LLVMValueRef
+lp_build_round_sse41(struct lp_build_context *bld,
+                     LLVMValueRef a,
+                     enum lp_build_round_sse41_mode mode)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   const char *round_name;
+   LLVMValueRef mode_imm;
+
+   assert(type.floating);
+   assert(type.width*type.length == 128);
+   assert(lp_check_value(type, a));
+   assert(util_cpu_caps.has_sse4_1);
+
+   if(type.width == 32) {
+      round_name = "llvm.x86.sse41.round.ps";
+   }
+   else if(type.width == 64) {
+      round_name = "llvm.x86.sse41.round.pd";
+   }
+   else {
+      assert(0);
+      return bld->undef;
+   }
+
+   /* the rounding mode travels as an i32 immediate */
+   mode_imm = LLVMConstInt(LLVMInt32Type(), mode, 0);
+
+   return lp_build_intrinsic_binary(bld->builder, round_name, vec_type, a,
+                                    mode_imm);
+}
+
+
+/**
+ * Generate trunc(a): round toward zero, result stays floating point.
+ *
+ * Uses the SSE4.1 round intrinsic when the CPU supports it and the type
+ * fills a 128bit vector (the only size lp_build_round_sse41 accepts);
+ * otherwise converts to signed integers and back.
+ */
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   /* lp_build_round_sse41 asserts a 128bit vector, so check it up front */
+   if(util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef res;
+      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
+
+
+/**
+ * Generate round(a): round to nearest, result stays floating point.
+ *
+ * Uses the SSE4.1 round intrinsic when the CPU supports it and the type
+ * fills a 128bit vector (the only size lp_build_round_sse41 accepts);
+ * otherwise goes through lp_build_iround and converts back to float.
+ */
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   /* lp_build_round_sse41 asserts a 128bit vector, so check it up front */
+   if(util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_iround(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
+
+
+/**
+ * Generate floor(a), result stays floating point.
+ *
+ * Uses the SSE4.1 round intrinsic when the CPU supports it and the type
+ * fills a 128bit vector (the only size lp_build_round_sse41 accepts);
+ * otherwise goes through lp_build_ifloor and converts back to float.
+ */
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+
+   /* lp_build_round_sse41 asserts a 128bit vector, so check it up front */
+   if(util_cpu_caps.has_sse4_1 && type.width*type.length == 128)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_ifloor(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
+
+
+/**
+ * Generate ceil(a), result stays floating point.
+ *
+ * Uses the SSE4.1 round intrinsic when the CPU reports SSE4.1; otherwise
+ * goes through lp_build_iceil and converts back to float.
+ *
+ * NOTE(review): lp_build_round_sse41 asserts a 128bit vector; other vector
+ * sizes are not filtered out here.
+ */
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type;
+   LLVMValueRef int_ceil;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1) {
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   }
+
+   vec_type = lp_build_vec_type(type);
+   int_ceil = lp_build_iceil(bld, a);
+
+   return LLVMBuildSIToFP(bld->builder, int_ceil, vec_type, "");
+}
+
+
+/**
+ * Return fractional part of 'a' computed as a - floor(a)
+ * Typically used in texture coord arithmetic.
+ */
+LLVMValueRef
+lp_build_fract(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   LLVMValueRef whole;
+
+   assert(bld->type.floating);
+
+   whole = lp_build_floor(bld, a);
+
+   return lp_build_sub(bld, a, whole);
+}
+
+
+/**
+ * Convert to integer, through whichever rounding method that's fastest,
+ * typically truncating toward zero.
+ *
+ * Implemented as a single float to signed integer conversion.
+ */
+LLVMValueRef
+lp_build_itrunc(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef dst_type = lp_build_int_vec_type(type);
+   LLVMValueRef as_int;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   as_int = LLVMBuildFPToSI(bld->builder, a, dst_type, "");
+
+   return as_int;
+}
+
+
+/**
+ * Convert float[] to int[] with round().
+ *
+ * Uses the SSE4.1 round intrinsic when the CPU supports it and the type
+ * fills a 128bit vector (the only size lp_build_round_sse41 accepts).
+ * Otherwise adds 0.5 carrying the sign of 'a' before the truncating
+ * conversion to integer.
+ */
+LLVMValueRef
+lp_build_iround(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   /* lp_build_round_sse41 asserts a 128bit vector, so check it up front */
+   if(util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   }
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef half;
+
+      /* get sign bit */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+
+      /* sign * 0.5 */
+      half = lp_build_const_scalar(type, 0.5);
+      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
+      half = LLVMBuildOr(bld->builder, sign, half, "");
+      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
+
+      /* pushing away from zero by 0.5 makes the truncation below round */
+      res = LLVMBuildAdd(bld->builder, a, half, "");
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+
+   return res;
+}
+
+
+/**
+ * Convert float[] to int[] with floor().
+ *
+ * Uses the SSE4.1 round intrinsic when the CPU supports it and the type
+ * fills a 128bit vector (the only size lp_build_round_sse41 accepts).
+ * Otherwise negative inputs are offset by almost -1 so that the truncating
+ * conversion to integer rounds them down instead of toward zero.
+ */
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   /* lp_build_round_sse41 asserts a 128bit vector, so check it up front */
+   if(util_cpu_caps.has_sse4_1 && type.width*type.length == 128) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   }
+   else {
+      /* Bias negative values downwards before truncating */
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      unsigned mantissa = lp_mantissa(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef offset;
+
+      /* sign = a < 0 ? ~0 : 0 */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
+      lp_build_name(sign, "floor.sign");
+
+      /* offset = -0.99999(9)f */
+      offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = LLVMConstBitCast(offset, int_vec_type);
+
+      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
+      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
+      lp_build_name(offset, "floor.offset");
+
+      res = LLVMBuildAdd(bld->builder, a, offset, "");
+      lp_build_name(res, "floor.res");
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+   lp_build_name(res, "floor");
+
+   return res;
+}
+
+
+/**
+ * Convert float[] to int[] with ceil().
+ *
+ * Only implemented through the SSE4.1 round intrinsic; without SSE4.1 this
+ * asserts and converts an undefined value.
+ */
+LLVMValueRef
+lp_build_iceil(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef rounded;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(!util_cpu_caps.has_sse4_1) {
+      /* no generic fallback yet */
+      assert(0);
+      rounded = bld->undef;
+   }
+   else {
+      rounded = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   }
+
+   return LLVMBuildFPToSI(bld->builder, rounded, int_vec_type, "");
+}
+
+
+/**
+ * Generate sqrt(a)
+ *
+ * Calls the llvm.sqrt intrinsic whose name is derived from the vector
+ * length and element width of the context's type.
+ */
+LLVMValueRef
+lp_build_sqrt(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   char name[32];
+
+   /* TODO: optimize the constant case */
+
+   assert(type.floating);
+   util_snprintf(name, sizeof(name), "llvm.sqrt.v%uf%u", type.length, type.width);
+
+   return lp_build_intrinsic_unary(bld->builder, name, vec_type, a);
+}
+
+
+/**
+ * Generate 1/a
+ *
+ * The zero, one and undef constants are folded; other constants are
+ * divided at build time. With SSE and a 4 x 32bit type the RCPPS
+ * intrinsic is used, otherwise a floating point division is built.
+ */
+LLVMValueRef
+lp_build_rcp(struct lp_build_context *bld,
+             LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   if(a == bld->zero) {
+      return bld->undef;
+   }
+   else if(a == bld->one) {
+      return bld->one;
+   }
+   else if(a == bld->undef) {
+      return bld->undef;
+   }
+
+   assert(type.floating);
+
+   if(LLVMIsConstant(a)) {
+      return LLVMConstFDiv(bld->one, a);
+   }
+
+   if(!(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)) {
+      return LLVMBuildFDiv(bld->builder, bld->one, a, "");
+   }
+
+   /* FIXME: improve precision */
+   return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
+}
+
+
+/**
+ * Generate 1/sqrt(a)
+ *
+ * With SSE and a 4 x 32bit type the RSQRTPS intrinsic is used; otherwise
+ * the result is composed from lp_build_sqrt and lp_build_rcp.
+ */
+LLVMValueRef
+lp_build_rsqrt(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef root;
+
+   assert(type.floating);
+
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
+   }
+
+   root = lp_build_sqrt(bld, a);
+
+   return lp_build_rcp(bld, root);
+}
+
+
+/**
+ * Generate cos(a)
+ *
+ * Calls the llvm.cos intrinsic whose name is derived from the vector
+ * length and element width of the context's type.
+ */
+LLVMValueRef
+lp_build_cos(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   char name[32];
+
+   /* TODO: optimize the constant case */
+
+   assert(type.floating);
+   util_snprintf(name, sizeof(name), "llvm.cos.v%uf%u", type.length, type.width);
+
+   return lp_build_intrinsic_unary(bld->builder, name, vec_type, a);
+}
+
+
+/**
+ * Generate sin(a)
+ *
+ * Calls the llvm.sin intrinsic whose name is derived from the vector
+ * length and element width of the context's type.
+ */
+LLVMValueRef
+lp_build_sin(struct lp_build_context *bld,
+              LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   char name[32];
+
+   /* TODO: optimize the constant case */
+
+   assert(type.floating);
+   util_snprintf(name, sizeof(name), "llvm.sin.v%uf%u", type.length, type.width);
+
+   return lp_build_intrinsic_unary(bld->builder, name, vec_type, a);
+}
+
+
+/**
+ * Generate pow(x, y)
+ *
+ * Computed as exp2(log2(x) * y).
+ */
+LLVMValueRef
+lp_build_pow(struct lp_build_context *bld,
+             LLVMValueRef x,
+             LLVMValueRef y)
+{
+   LLVMValueRef log2_x;
+   LLVMValueRef exponent;
+
+   /* TODO: optimize the constant case */
+   if(LLVMIsConstant(x) && LLVMIsConstant(y)) {
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
+   }
+
+   log2_x = lp_build_log2(bld, x);
+   exponent = lp_build_mul(bld, log2_x, y);
+
+   return lp_build_exp2(bld, exponent);
+}
+
+
+/**
+ * Generate exp(x)
+ *
+ * Uses the identity e^x = 2^(x * log2(e)): the argument is scaled by
+ * log2(e) before being handed to lp_build_exp2. Scaling the result of
+ * exp2(x) instead would compute log2(e) * 2^x, which is not exp(x).
+ */
+LLVMValueRef
+lp_build_exp(struct lp_build_context *bld,
+             LLVMValueRef x)
+{
+   /* log2(e) = 1/log(2) */
+   LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
+
+   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x));
+}
+
+
+/**
+ * Generate log(x)
+ *
+ * Uses the identity log(x) = log(2) * log2(x), so the base 2 logarithm
+ * (not the base 2 exponential) is scaled by log(2).
+ */
+LLVMValueRef
+lp_build_log(struct lp_build_context *bld,
+             LLVMValueRef x)
+{
+   /* log(2) */
+   LLVMValueRef log2 = lp_build_const_scalar(bld->type, 0.69314718055994529);
+
+   return lp_build_mul(bld, log2, lp_build_log2(bld, x));
+}
+
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+
+/**
+ * Generate polynomial.
+ * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
+ *
+ * Evaluated with Horner's scheme, starting from the highest coefficient.
+ * An empty coefficient list yields undef.
+ */
+static LLVMValueRef
+lp_build_polynomial(struct lp_build_context *bld,
+                    LLVMValueRef x,
+                    const double *coeffs,
+                    unsigned num_coeffs)
+{
+   const struct lp_type type = bld->type;
+   LLVMValueRef acc;
+   unsigned remaining;
+
+   /* TODO: optimize the constant case */
+   if(LLVMIsConstant(x)) {
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
+   }
+
+   if(num_coeffs == 0) {
+      return bld->undef;
+   }
+
+   /* highest order coefficient seeds the accumulator */
+   acc = lp_build_const_scalar(type, coeffs[num_coeffs - 1]);
+
+   for(remaining = num_coeffs - 1; remaining > 0; --remaining) {
+      LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[remaining - 1]);
+      LLVMValueRef scaled = lp_build_mul(bld, x, acc);
+
+      acc = lp_build_add(bld, coeff, scaled);
+   }
+
+   return acc;
+}
+
+
+/**
+ * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
+ *
+ * Coefficients are listed from the constant term upwards, which is the
+ * order lp_build_polynomial expects. The set is selected at compile time
+ * by EXP_POLY_DEGREE (2 to 5); any other value is a build error.
+ */
+const double lp_build_exp2_polynomial[] = {
+#if EXP_POLY_DEGREE == 5
+   9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
+#elif EXP_POLY_DEGREE == 4
+   1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
+#elif EXP_POLY_DEGREE == 3
+   9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
+#elif EXP_POLY_DEGREE == 2
+   1.0017247, 6.5763628e-1, 3.3718944e-1
+#else
+#error
+#endif
+};
+
+
+/**
+ * Generate an approximation of 2**x and/or its building blocks.
+ *
+ * Each output pointer is optional; only the code needed for the requested
+ * outputs is emitted.
+ *
+ * @param x                input, asserted to be a 32bit floating point type
+ * @param p_exp2_int_part  if not NULL, receives 2**ipart as a float vector
+ * @param p_frac_part      if not NULL, receives x - ipart
+ * @param p_exp2           if not NULL, receives the approximation of 2**x
+ *
+ * NOTE(review): ipart is obtained with a truncating float to int
+ * conversion of x - 0.5, so the fractional part does not look confined to
+ * the [-0.5, 0.5[ range quoted for lp_build_exp2_polynomial -- confirm the
+ * intended rounding.
+ */
+void
+lp_build_exp2_approx(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef *p_exp2_int_part,
+                     LLVMValueRef *p_frac_part,
+                     LLVMValueRef *p_exp2)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef ipart = NULL;
+   LLVMValueRef fpart = NULL;
+   LLVMValueRef expipart = NULL;
+   LLVMValueRef expfpart = NULL;
+   LLVMValueRef res = NULL;
+
+   if(p_exp2_int_part || p_frac_part || p_exp2) {
+      /* TODO: optimize the constant case */
+      if(LLVMIsConstant(x))
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
+
+      assert(type.floating && type.width == 32);
+
+      /* clamp x so the biased exponent built below stays within 8 bits */
+      x = lp_build_min(bld, x, lp_build_const_scalar(type,  129.0));
+      x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
+
+      /* ipart = int(x - 0.5) */
+      ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
+      ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
+
+      /* fpart = x - ipart */
+      fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
+      fpart = LLVMBuildSub(bld->builder, x, fpart, "");
+   }
+
+   if(p_exp2_int_part || p_exp2) {
+      /* expipart = (float) (1 << ipart) */
+      /* built by writing ipart + 127 straight into the exponent field (bits 23 and up) */
+      expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
+      expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
+      expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
+   }
+
+   if(p_exp2) {
+      /* 2**x = 2**ipart * 2**fpart, the latter approximated by the polynomial */
+      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
+                                     Elements(lp_build_exp2_polynomial));
+
+      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
+   }
+
+   if(p_exp2_int_part)
+      *p_exp2_int_part = expipart;
+
+   if(p_frac_part)
+      *p_frac_part = fpart;
+
+   if(p_exp2)
+      *p_exp2 = res;
+}
+
+
+/**
+ * Generate 2**x
+ *
+ * Thin wrapper requesting only the final result from lp_build_exp2_approx.
+ */
+LLVMValueRef
+lp_build_exp2(struct lp_build_context *bld,
+              LLVMValueRef x)
+{
+   LLVMValueRef exp2_x = NULL;
+
+   lp_build_exp2_approx(bld, x, NULL, NULL, &exp2_x);
+
+   return exp2_x;
+}
+
+
+/**
+ * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
+ * These coefficients can be generated with
+ * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+ *
+ * Coefficients are listed from the constant term upwards, which is the
+ * order lp_build_polynomial expects. The set is selected at compile time
+ * by LOG_POLY_DEGREE (3 to 6); any other value is a build error.
+ */
+const double lp_build_log2_polynomial[] = {
+#if LOG_POLY_DEGREE == 6
+   3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
+#elif LOG_POLY_DEGREE == 5
+   2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
+#elif LOG_POLY_DEGREE == 4
+   2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
+#elif LOG_POLY_DEGREE == 3
+   2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
+#else
+#error
+#endif
+};
+
+
+/**
+ * Generate an approximation of log2(x) and/or its building blocks.
+ *
+ * Each output pointer is optional; only the code needed for the requested
+ * outputs is emitted.
+ *
+ * @param x             input, asserted to be a 32bit floating point type
+ * @param p_exp         if not NULL, receives the bits of x masked with
+ *                      0x7f800000, as an integer vector
+ * @param p_floor_log2  if not NULL, receives the unbiased exponent of x
+ *                      converted to float
+ * @param p_log2        if not NULL, receives the approximation of log2(x)
+ *
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+void
+lp_build_log2_approx(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef *p_exp,
+                     LLVMValueRef *p_floor_log2,
+                     LLVMValueRef *p_log2)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+
+   /* bit masks for the exponent and mantissa fields, plus the bits of 1.0 */
+   LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
+   LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
+   LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
+
+   LLVMValueRef i = NULL;
+   LLVMValueRef exp = NULL;
+   LLVMValueRef mant = NULL;
+   LLVMValueRef logexp = NULL;
+   LLVMValueRef logmant = NULL;
+   LLVMValueRef res = NULL;
+
+   if(p_exp || p_floor_log2 || p_log2) {
+      /* TODO: optimize the constant case */
+      if(LLVMIsConstant(x))
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
+
+      assert(type.floating && type.width == 32);
+
+      i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
+
+      /* exp = exponent bits of x, still in place (not shifted, not a float) */
+      exp = LLVMBuildAnd(bld->builder, i, expmask, "");
+   }
+
+   if(p_floor_log2 || p_log2) {
+      /* logexp = (float) (biased exponent - 127) */
+      logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
+      logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
+      logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
+   }
+
+   if(p_log2) {
+      /* mant = mantissa bits of x combined with the bits of 1.0, viewed as a float */
+      mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
+      mant = LLVMBuildOr(bld->builder, mant, one, "");
+      mant = LLVMBuildBitCast(bld->builder, mant, vec_type, "");
+
+      logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
+                                    Elements(lp_build_log2_polynomial));
+
+      /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
+
+      /* log2(x) = log2(mantissa) + exponent */
+      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
+   }
+
+   if(p_exp)
+      *p_exp = exp;
+
+   if(p_floor_log2)
+      *p_floor_log2 = logexp;
+
+   if(p_log2)
+      *p_log2 = res;
+}
+
+
+/**
+ * Generate log2(x)
+ *
+ * Thin wrapper requesting only the final result from lp_build_log2_approx.
+ */
+LLVMValueRef
+lp_build_log2(struct lp_build_context *bld,
+              LLVMValueRef x)
+{
+   LLVMValueRef log2_x = NULL;
+
+   lp_build_log2_approx(bld, x, NULL, NULL, &log2_x);
+
+   return log2_x;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
new file mode 100644
index 00000000000..55385e3a66a
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
@@ -0,0 +1,225 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper arithmetic functions.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#ifndef LP_BLD_ARIT_H
+#define LP_BLD_ARIT_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;
+struct lp_build_context;
+
+
+/**
+ * Complement, i.e., 1 - a.
+ */
+LLVMValueRef
+lp_build_comp(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_add(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_sub(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_mul(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_mul_imm(struct lp_build_context *bld,
+                 LLVMValueRef a,
+                 int b);
+
+LLVMValueRef
+lp_build_div(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_lerp(struct lp_build_context *bld,
+              LLVMValueRef x,
+              LLVMValueRef v0,
+              LLVMValueRef v1);
+
+/**
+ * Bilinear interpolation.
+ *
+ * Values indices are in v_{yx}.
+ */
+LLVMValueRef
+lp_build_lerp_2d(struct lp_build_context *bld,
+                 LLVMValueRef x,
+                 LLVMValueRef y,
+                 LLVMValueRef v00,
+                 LLVMValueRef v01,
+                 LLVMValueRef v10,
+                 LLVMValueRef v11);
+
+LLVMValueRef
+lp_build_min(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_max(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_clamp(struct lp_build_context *bld,
+               LLVMValueRef a,
+               LLVMValueRef min,
+               LLVMValueRef max);
+
+LLVMValueRef
+lp_build_abs(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_negate(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
+lp_build_sgn(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_set_sign(struct lp_build_context *bld,
+                  LLVMValueRef a, LLVMValueRef sign);
+
+LLVMValueRef
+lp_build_int_to_float(struct lp_build_context *bld,
+                      LLVMValueRef a);
+
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_floor(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ceil(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_trunc(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_fract(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a);
+LLVMValueRef
+lp_build_iceil(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_iround(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
+lp_build_itrunc(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
+lp_build_sqrt(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_rcp(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_rsqrt(struct lp_build_context *bld,
+               LLVMValueRef a);
+
+LLVMValueRef
+lp_build_cos(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_sin(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_pow(struct lp_build_context *bld,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+LLVMValueRef
+lp_build_exp(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_log(struct lp_build_context *bld,
+             LLVMValueRef a);
+
+LLVMValueRef
+lp_build_exp2(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+LLVMValueRef
+lp_build_log2(struct lp_build_context *bld,
+              LLVMValueRef a);
+
+void
+lp_build_exp2_approx(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef *p_exp2_int_part,
+                     LLVMValueRef *p_frac_part,
+                     LLVMValueRef *p_exp2);
+
+void
+lp_build_log2_approx(struct lp_build_context *bld,
+                     LLVMValueRef x,
+                     LLVMValueRef *p_exp,
+                     LLVMValueRef *p_floor_log2,
+                     LLVMValueRef *p_log2);
+
+#endif /* !LP_BLD_ARIT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend.h b/src/gallium/auxiliary/gallivm/lp_bld_blend.h
new file mode 100644
index 00000000000..da272e549f3
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_blend.h
@@ -0,0 +1,107 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_BLD_BLEND_H
+#define LP_BLD_BLEND_H
+
+
+/**
+ * @file
+ * LLVM IR building helpers interfaces.
+ *
+ * We use LLVM-C bindings for now. They are not documented, but follow the C++
+ * interfaces very closely, and appear to be complete enough for code
+ * generation. See
+ * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html
+ * for a standalone example.
+ */
+
+#include <llvm-c/Core.h>  
+ 
+#include "pipe/p_format.h"
+
+
+struct pipe_blend_state;
+struct lp_type;
+struct lp_build_context;
+
+
+/**
+ * Whether the blending function is commutative or not.
+ */
+boolean
+lp_build_blend_func_commutative(unsigned func);
+
+
+/**
+ * Whether the blending functions are the reverse of each other.
+ */
+boolean
+lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func);
+
+
+LLVMValueRef
+lp_build_blend_func(struct lp_build_context *bld,
+                    unsigned func,
+                    LLVMValueRef term1,
+                    LLVMValueRef term2);
+
+
+LLVMValueRef
+lp_build_blend_aos(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   LLVMValueRef src,
+                   LLVMValueRef dst,
+                   LLVMValueRef const_,
+                   unsigned alpha_swizzle);
+
+
+void
+lp_build_blend_soa(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   LLVMValueRef src[4],
+                   LLVMValueRef dst[4],
+                   LLVMValueRef const_[4],
+                   LLVMValueRef res[4]);
+
+
+/**
+ * Apply a logic op.
+ *
+ * src/dst parameters are packed values. It should work regardless the inputs
+ * are scalars, or a vector.
+ */
+LLVMValueRef
+lp_build_logicop(LLVMBuilderRef builder,
+                 unsigned logicop_func,
+                 LLVMValueRef src,
+                 LLVMValueRef dst);
+
+
+#endif /* !LP_BLD_BLEND_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c
new file mode 100644
index 00000000000..0215bb72ac6
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_blend_aos.c
@@ -0,0 +1,360 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- AoS layout.
+ *
+ * AoS blending is in general much slower than SoA, but there are some cases
+ * where it might be faster. In particular, if a pixel is rendered only once
+ * then the overhead of tiling and untiling will dominate over the speedup that
+ * SoA gives. So we might want to detect such cases and fallback to AoS in the
+ * future, but for now this function is here for historical/benchmarking
+ * purposes.
+ *
+ * Run lp_blend_test after any change to this file.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_blend.h"
+#include "lp_bld_debug.h"
+
+
+/**
+ * We may use the same values several times, so we keep them here to avoid
+ * recomputing them. Also reusing the values allows us to do simplifications
+ * that LLVM optimization passes wouldn't normally be able to do.
+ */
+struct lp_build_blend_aos_context
+{
+   struct lp_build_context base;
+   
+   LLVMValueRef src;      /* blend inputs, as passed to lp_build_blend_aos() */
+   LLVMValueRef dst;
+   LLVMValueRef const_;
+
+   LLVMValueRef inv_src;     /* lp_build_comp() of src/dst/const_, built on */
+   LLVMValueRef inv_dst;     /* first use; zero (unset) until then */
+   LLVMValueRef inv_const;
+   LLVMValueRef saturate;    /* cached lp_build_min(src, inv_dst) */
+
+   LLVMValueRef rgb_src_factor;    /* NOTE(review): these four are not */
+   LLVMValueRef alpha_src_factor;  /* referenced anywhere in this file */
+   LLVMValueRef rgb_dst_factor;
+   LLVMValueRef alpha_dst_factor;
+};
+
+
+/**
+ * Return the vector a blend factor is derived from, before any swizzling.
+ * COLOR and ALPHA flavours share one vector; complements and the saturate
+ * term are built on first use and cached in \p bld.
+ */
+static LLVMValueRef
+lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
+                                 unsigned factor,
+                                 boolean alpha)
+{
+   struct lp_build_context *base = &bld->base;
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      return base->zero;
+   case PIPE_BLENDFACTOR_ONE:
+      return base->one;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return bld->src;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return bld->dst;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return bld->const_;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      if(!bld->inv_src)
+         bld->inv_src = lp_build_comp(base, bld->src);
+      return bld->inv_src;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if(!bld->inv_dst)
+         bld->inv_dst = lp_build_comp(base, bld->dst);
+      return bld->inv_dst;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      if(!bld->inv_const)
+         bld->inv_const = lp_build_comp(base, bld->const_);
+      return bld->inv_const;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      if(alpha)
+         return base->one;
+      /* min(src, complement of dst); the complement is shared with INV_DST_x */
+      if(!bld->inv_dst)
+         bld->inv_dst = lp_build_comp(base, bld->dst);
+      if(!bld->saturate)
+         bld->saturate = lp_build_min(base, bld->src, bld->inv_dst);
+      return bld->saturate;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* TODO */
+   default:
+      assert(0);
+      return base->zero;
+   }
+}
+
+
+enum lp_build_blend_swizzle {
+   LP_BUILD_BLEND_SWIZZLE_RGBA = 0,   /* factor is used channel by channel */
+   LP_BUILD_BLEND_SWIZZLE_AAAA = 1    /* alpha channel is broadcast to all channels */
+};
+
+
+/**
+ * Whether a factor is used channel by channel (RGBA) or broadcast (AAAA).
+ */
+static enum lp_build_blend_swizzle
+lp_build_blend_factor_swizzle(unsigned factor)
+{
+   switch (factor) {
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      return LP_BUILD_BLEND_SWIZZLE_AAAA;
+   case PIPE_BLENDFACTOR_ZERO:
+   case PIPE_BLENDFACTOR_ONE:
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+   case PIPE_BLENDFACTOR_DST_COLOR:
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      break;
+   default:
+      assert(0);
+   }
+   return LP_BUILD_BLEND_SWIZZLE_RGBA;
+}
+
+
+/* Merge the rgb and alpha vectors into one AoS vector as told by rgb_swizzle;
+ * alpha_swizzle is the channel index (0..3) at which alpha is selected. */
+static LLVMValueRef
+lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld,
+                       LLVMValueRef rgb, LLVMValueRef alpha,
+                       enum lp_build_blend_swizzle rgb_swizzle,
+                       unsigned alpha_swizzle)
+{
+   if(rgb == alpha) {
+      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA)
+         return rgb;
+      if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA)
+         return lp_build_broadcast_aos(&bld->base, rgb, alpha_swizzle);
+   }
+   else if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_RGBA) {
+      boolean cond[4] = {0, 0, 0, 0};
+      assert(alpha_swizzle < 4);   /* index into the 4-entry mask below */
+      cond[alpha_swizzle] = 1;
+      return lp_build_select_aos(&bld->base, alpha, rgb, cond);
+   }
+   else if(rgb_swizzle == LP_BUILD_BLEND_SWIZZLE_AAAA) {
+      unsigned char swizzle[4];
+      unsigned chan;
+      assert(alpha_swizzle < 4);   /* index into the 4-entry table below */
+      for(chan = 0; chan < 4; ++chan)
+         swizzle[chan] = alpha_swizzle;
+      swizzle[alpha_swizzle] += 4;   /* +4: take this one from the second vector (alpha) */
+      return lp_build_swizzle2_aos(&bld->base, rgb, alpha, swizzle);
+   }
+   assert(0);
+   return bld->base.undef;
+}
+
+
+/**
+ * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml
+ */
+static LLVMValueRef
+lp_build_blend_factor(struct lp_build_blend_aos_context *bld,
+                      LLVMValueRef factor1,
+                      unsigned rgb_factor,
+                      unsigned alpha_factor,
+                      unsigned alpha_swizzle)
+{
+   /* Vectors the RGB and the alpha factor are derived from; FALSE/TRUE tells
+    * the helper which of the two flavours is being asked for. */
+   LLVMValueRef rgb_base = lp_build_blend_factor_unswizzled(bld, rgb_factor, FALSE);
+   LLVMValueRef alpha_base = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE);
+   LLVMValueRef factor2;
+
+   /* Merge both into a single factor vector, shuffled as the RGB factor needs */
+   factor2 = lp_build_blend_swizzle(bld, rgb_base, alpha_base,
+                                    lp_build_blend_factor_swizzle(rgb_factor),
+                                    alpha_swizzle);
+
+   /* The resulting term is factor1 * factor2 */
+   return lp_build_mul(&bld->base, factor1, factor2);
+}
+
+
+boolean
+lp_build_blend_func_commutative(unsigned func)
+{
+   switch (func) {
+   case PIPE_BLEND_SUBTRACT:
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return FALSE;   /* operand order matters */
+   case PIPE_BLEND_ADD:
+   case PIPE_BLEND_MIN:
+   case PIPE_BLEND_MAX:
+      break;
+   default:
+      assert(0);   /* unknown function; reported as commutative below */
+   }
+   return TRUE;
+}
+
+
+boolean
+lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func)
+{
+   /* Only SUBTRACT paired with REVERSE_SUBTRACT (either way round) qualifies */
+   const boolean sub_then_rev = rgb_func == PIPE_BLEND_SUBTRACT &&
+                                alpha_func == PIPE_BLEND_REVERSE_SUBTRACT;
+   const boolean rev_then_sub = rgb_func == PIPE_BLEND_REVERSE_SUBTRACT &&
+                                alpha_func == PIPE_BLEND_SUBTRACT;
+
+   return (sub_then_rev || rev_then_sub) ? TRUE : FALSE;
+}
+
+
+/**
+ * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml
+ */
+LLVMValueRef
+lp_build_blend_func(struct lp_build_context *bld,
+                    unsigned func,
+                    LLVMValueRef term1,
+                    LLVMValueRef term2)
+{
+   /* Only REVERSE_SUBTRACT swaps the operands; an unknown func yields zero */
+   switch (func) {
+   case PIPE_BLEND_MIN:
+      return lp_build_min(bld, term1, term2);
+   case PIPE_BLEND_MAX:
+      return lp_build_max(bld, term1, term2);
+   case PIPE_BLEND_ADD:
+      return lp_build_add(bld, term1, term2);
+   case PIPE_BLEND_SUBTRACT:
+      return lp_build_sub(bld, term1, term2);
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return lp_build_sub(bld, term2, term1);
+   default:
+      assert(0);
+      return bld->zero;
+   }
+}
+
+
+/**
+ * Emit AoS blending of \p src with \p dst using the state of render target 0.
+ *
+ * Returns \p src untouched when rt[0] has blending disabled. Independent
+ * per-target blend state and partial colormasks are asserted against.
+ */
+LLVMValueRef
+lp_build_blend_aos(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   LLVMValueRef src,
+                   LLVMValueRef dst,
+                   LLVMValueRef const_,
+                   unsigned alpha_swizzle)
+{
+   struct lp_build_blend_aos_context bld;
+   LLVMValueRef src_term, dst_term;
+   LLVMValueRef rgb, alpha;
+
+   /* FIXME */
+   assert(blend->independent_blend_enable == 0);
+   assert(blend->rt[0].colormask == 0xf);
+
+   if(!blend->rt[0].blend_enable)
+      return src;
+
+   /* It makes no sense to blend unless values are normalized */
+   assert(type.norm);
+
+   /* Setup build context; the memset leaves all cached values unset */
+   memset(&bld, 0, sizeof bld);
+   lp_build_context_init(&bld.base, builder, type);
+   bld.src = src;
+   bld.dst = dst;
+   bld.const_ = const_;
+
+   /* TODO: There are still a few optimization opportunities here. For certain
+    * combinations it is possible to reorder the operations and therefore saving
+    * some instructions. */
+
+   src_term = lp_build_blend_factor(&bld, src, blend->rt[0].rgb_src_factor,
+                                    blend->rt[0].alpha_src_factor, alpha_swizzle);
+   dst_term = lp_build_blend_factor(&bld, dst, blend->rt[0].rgb_dst_factor,
+                                    blend->rt[0].alpha_dst_factor, alpha_swizzle);
+
+   lp_build_name(src_term, "src_term");
+   lp_build_name(dst_term, "dst_term");
+
+   /* With one blend function for both RGB and alpha a single evaluation is enough */
+   rgb = lp_build_blend_func(&bld.base, blend->rt[0].rgb_func, src_term, dst_term);
+   if(blend->rt[0].rgb_func == blend->rt[0].alpha_func)
+      return rgb;
+
+   /* Separate RGB / A functions: evaluate the alpha one too, then merge */
+   alpha = lp_build_blend_func(&bld.base, blend->rt[0].alpha_func, src_term, dst_term);
+   return lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c b/src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c
new file mode 100644
index 00000000000..1eac0a5c891
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_blend_logicop.c
@@ -0,0 +1,109 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- logic ops.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+
+#include "lp_bld_blend.h"
+
+
+/**
+ * Emit the bitwise operation selected by \p logicop_func on src and dst.
+ *
+ * CLEAR and SET yield constants of \p src's LLVM type. An unknown function
+ * asserts and falls back to returning \p src unchanged.
+ */
+LLVMValueRef
+lp_build_logicop(LLVMBuilderRef builder,
+                 unsigned logicop_func,
+                 LLVMValueRef src,
+                 LLVMValueRef dst)
+{
+   LLVMTypeRef type = LLVMTypeOf(src);
+
+   switch (logicop_func) {
+   /* constants and pass-through */
+   case PIPE_LOGICOP_CLEAR:
+      return LLVMConstNull(type);
+   case PIPE_LOGICOP_SET:
+      return LLVMConstAllOnes(type);
+   case PIPE_LOGICOP_COPY:
+      return src;
+   case PIPE_LOGICOP_NOOP:
+      return dst;
+
+   /* one operand, inverted */
+   case PIPE_LOGICOP_COPY_INVERTED:
+      return LLVMBuildNot(builder, src, "");
+   case PIPE_LOGICOP_INVERT:
+      return LLVMBuildNot(builder, dst, "");
+
+   /* two operands */
+   case PIPE_LOGICOP_AND:
+      return LLVMBuildAnd(builder, src, dst, "");
+   case PIPE_LOGICOP_OR:
+      return LLVMBuildOr(builder, src, dst, "");
+   case PIPE_LOGICOP_XOR:
+      return LLVMBuildXor(builder, src, dst, "");
+
+   /* two operands, result inverted */
+   case PIPE_LOGICOP_NAND: {
+      LLVMValueRef both = LLVMBuildAnd(builder, src, dst, "");
+      return LLVMBuildNot(builder, both, "");
+   }
+   case PIPE_LOGICOP_NOR: {
+      LLVMValueRef either = LLVMBuildOr(builder, src, dst, "");
+      return LLVMBuildNot(builder, either, "");
+   }
+   case PIPE_LOGICOP_EQUIV: {
+      LLVMValueRef differ = LLVMBuildXor(builder, src, dst, "");
+      return LLVMBuildNot(builder, differ, "");
+   }
+
+   /* two operands, one of them inverted first */
+   case PIPE_LOGICOP_AND_INVERTED:
+      return LLVMBuildAnd(builder, LLVMBuildNot(builder, src, ""), dst, "");
+   case PIPE_LOGICOP_AND_REVERSE:
+      return LLVMBuildAnd(builder, src, LLVMBuildNot(builder, dst, ""), "");
+   case PIPE_LOGICOP_OR_INVERTED:
+      return LLVMBuildOr(builder, LLVMBuildNot(builder, src, ""), dst, "");
+   case PIPE_LOGICOP_OR_REVERSE:
+      return LLVMBuildOr(builder, src, LLVMBuildNot(builder, dst, ""), "");
+   default:
+      assert(0);
+      return src;
+   }
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c
new file mode 100644
index 00000000000..6d5a45db7a3
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_blend_soa.c
@@ -0,0 +1,298 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Blend LLVM IR generation -- SoA layout.
+ *
+ * Blending in SoA is much faster than AoS, especially when separate rgb/alpha
+ * factors/functions are used, since no channel masking/shuffling is necessary
+ * and we can achieve the full throughput of the SIMD operations. Furthermore
+ * the fragment shader output is also in SoA, so it fits nicely with the rest of
+ * the fragment pipeline.
+ *
+ * The drawback is that to be displayed the color buffer needs to be in AoS
+ * layout, so we need to tile/untile the color buffer before/after rendering.
+ * A color buffer like
+ *
+ *  R11 G11 B11 A11 R12 G12 B12 A12  R13 G13 B13 A13 R14 G14 B14 A14  ...
+ *  R21 G21 B21 A21 R22 G22 B22 A22  R23 G23 B23 A23 R24 G24 B24 A24  ...
+ *
+ *  R31 G31 B31 A31 R32 G32 B32 A32  R33 G33 B33 A33 R34 G34 B34 A34  ...
+ *  R41 G41 B41 A41 R42 G42 B42 A42  R43 G43 B43 A43 R44 G44 B44 A44  ...
+ *
+ *  ... ... ... ... ... ... ... ...  ... ... ... ... ... ... ... ...  ...
+ *
+ * will actually be stored in memory as
+ *
+ *  R11 R12 R21 R22 R13 R14 R23 R24 ... G11 G12 G21 G22 G13 G14 G23 G24 ... B11 B12 B21 B22 B13 B14 B23 B24 ... A11 A12 A21 A22 A13 A14 A23 A24 ...
+ *  R31 R32 R41 R42 R33 R34 R43 R44 ... G31 G32 G41 G42 G33 G34 G43 G44 ... B31 B32 B41 B42 B33 B34 B43 B44 ... A31 A32 A41 A42 A33 A34 A43 A44 ...
+ *  ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
+ *
+ * NOTE: Run lp_blend_test after any change to this file.
+ *
+ * You can also run lp_blend_test to obtain AoS vs SoA benchmarks. Invoking it
+ * as:
+ *
+ *  lp_blend_test -o blend.tsv
+ *
+ * will generate a tab-separated file with the test results and performance
+ * measurements.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_blend.h"
+
+
+/**
+ * We may use the same values several times, so we keep them here to avoid
+ * recomputing them. Also reusing the values allows us to do simplifications
+ * that LLVM optimization passes wouldn't normally be able to do.
+ */
+struct lp_build_blend_soa_context
+{
+   struct lp_build_context base;
+
+   LLVMValueRef src[4];   /* per-channel inputs copied in by lp_build_blend_soa() */
+   LLVMValueRef dst[4];
+   LLVMValueRef con[4];
+
+   LLVMValueRef inv_src[4];   /* lp_build_comp() of the above, built on first */
+   LLVMValueRef inv_dst[4];   /* use by lp_build_blend_soa_factor() */
+   LLVMValueRef inv_con[4];
+
+   LLVMValueRef src_alpha_saturate;   /* cached lp_build_min(src[3], inv_dst[3]) */
+
+   /**
+    * We store all factors in a table in order to eliminate redundant
+    * multiplications later.
+    */
+   LLVMValueRef factor[2][2][4];   /* [0][0][i]=src[i], [0][1][i]=src factor, [1][0][i]=dst[i] */
+
+   /**
+    * Table with all terms.
+    */
+   LLVMValueRef term[2][4];
+};
+
+
+/**
+ * Return the blend factor value for channel \p i (3 is the alpha channel).
+ *
+ * COLOR factors read channel \p i, ALPHA factors always read channel 3.
+ * Complements and the saturate term are built on first use and cached.
+ */
+static LLVMValueRef
+lp_build_blend_soa_factor(struct lp_build_blend_soa_context *bld,
+                          unsigned factor, unsigned i)
+{
+   struct lp_build_context *base = &bld->base;
+
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ZERO:
+      return base->zero;
+   case PIPE_BLENDFACTOR_ONE:
+      return base->one;
+
+   /* inputs used directly */
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      return bld->src[i];
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      return bld->src[3];
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      return bld->dst[i];
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      return bld->dst[3];
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      return bld->con[i];
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+      return bld->con[3];
+
+   /* complements, cached per channel */
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      if(!bld->inv_src[i])
+         bld->inv_src[i] = lp_build_comp(base, bld->src[i]);
+      return bld->inv_src[i];
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      if(!bld->inv_src[3])
+         bld->inv_src[3] = lp_build_comp(base, bld->src[3]);
+      return bld->inv_src[3];
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      if(!bld->inv_dst[i])
+         bld->inv_dst[i] = lp_build_comp(base, bld->dst[i]);
+      return bld->inv_dst[i];
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+      if(!bld->inv_dst[3])
+         bld->inv_dst[3] = lp_build_comp(base, bld->dst[3]);
+      return bld->inv_dst[3];
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      if(!bld->inv_con[i])
+         bld->inv_con[i] = lp_build_comp(base, bld->con[i]);
+      return bld->inv_con[i];
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      if(!bld->inv_con[3])
+         bld->inv_con[3] = lp_build_comp(base, bld->con[3]);
+      return bld->inv_con[3];
+
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      if(i == 3)
+         return base->one;
+      /* min(src alpha, complement of dst alpha), computed once for all of RGB */
+      if(!bld->inv_dst[3])
+         bld->inv_dst[3] = lp_build_comp(base, bld->dst[3]);
+      if(!bld->src_alpha_saturate)
+         bld->src_alpha_saturate = lp_build_min(base, bld->src[3], bld->inv_dst[3]);
+      return bld->src_alpha_saturate;
+
+   /* SRC1 factors are not implemented; they assert like an unknown factor */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* TODO */
+   default:
+      assert(0);
+      return base->zero;
+   }
+}
+
+
+/**
+ * Generate blend code in SOA mode.
+ *
+ * For every channel enabled in the colormask of render target 0 the result
+ * is, in order of precedence: the logic op of src and dst (integer types
+ * only; floating point types simply keep dst), the blend equation, or plain
+ * src when neither logic op nor blending is enabled.  Channels that are
+ * masked out keep dst.
+ *
+ * Products and blend results already emitted for an earlier channel are
+ * reused when a later channel needs exactly the same computation.
+ *
+ * \param builder  LLVM instruction builder the code is emitted with
+ * \param blend    blend state; only rt[0] is consulted
+ * \param type     type of the src/dst/con/res values
+ * \param src  src/fragment color
+ * \param dst  dst/framebuffer color
+ * \param con  constant blend color
+ * \param res  the result/output
+ */
+void
+lp_build_blend_soa(LLVMBuilderRef builder,
+                   const struct pipe_blend_state *blend,
+                   struct lp_type type,
+                   LLVMValueRef src[4],
+                   LLVMValueRef dst[4],
+                   LLVMValueRef con[4],
+                   LLVMValueRef res[4])
+{
+   struct lp_build_blend_soa_context bld;
+   unsigned i, j, k;
+
+   /* Setup build context; zeroing it leaves all cached values NULL */
+   memset(&bld, 0, sizeof bld);
+   lp_build_context_init(&bld.base, builder, type);
+   for (i = 0; i < 4; ++i) {
+      bld.src[i] = src[i];
+      bld.dst[i] = dst[i];
+      bld.con[i] = con[i];
+   }
+
+   for (i = 0; i < 4; ++i) {
+      if (blend->rt[0].colormask & (1 << i)) {
+         if (blend->logicop_enable) {
+            if(!type.floating) {
+               res[i] = lp_build_logicop(builder, blend->logicop_func, src[i], dst[i]);
+            }
+            else
+               res[i] = dst[i];
+         }
+         else if (blend->rt[0].blend_enable) {
+            /* Channels 0..2 use the RGB state, channel 3 the alpha state */
+            unsigned src_factor = i < 3 ? blend->rt[0].rgb_src_factor : blend->rt[0].alpha_src_factor;
+            unsigned dst_factor = i < 3 ? blend->rt[0].rgb_dst_factor : blend->rt[0].alpha_dst_factor;
+            unsigned func = i < 3 ? blend->rt[0].rgb_func : blend->rt[0].alpha_func;
+            boolean func_commutative = lp_build_blend_func_commutative(func);
+
+            /* It makes no sense to blend unless values are normalized */
+            assert(type.norm);
+
+            /*
+             * Compute src/dst factors.
+             */
+
+            bld.factor[0][0][i] = src[i];
+            bld.factor[0][1][i] = lp_build_blend_soa_factor(&bld, src_factor, i);
+            bld.factor[1][0][i] = dst[i];
+            bld.factor[1][1][i] = lp_build_blend_soa_factor(&bld, dst_factor, i);
+
+            /*
+             * Compute src/dst terms
+             */
+
+            for(k = 0; k < 2; ++k) {
+               /* See if this multiplication has been previously computed
+                * (in either operand order) */
+               for(j = 0; j < i; ++j) {
+                  if((bld.factor[k][0][j] == bld.factor[k][0][i] &&
+                      bld.factor[k][1][j] == bld.factor[k][1][i]) ||
+                     (bld.factor[k][0][j] == bld.factor[k][1][i] &&
+                      bld.factor[k][1][j] == bld.factor[k][0][i]))
+                     break;
+               }
+
+               if(j < i)
+                  bld.term[k][i] = bld.term[k][j];
+               else
+                  bld.term[k][i] = lp_build_mul(&bld.base, bld.factor[k][0][i], bld.factor[k][1][i]);
+            }
+
+            /*
+             * Combine terms
+             */
+
+            /*
+             * See if this function has been previously applied.
+             *
+             * Matching terms alone are not enough: channels 0..2 use
+             * rgb_func while channel 3 uses alpha_func, so the earlier
+             * result may have been produced by a different function.  Only
+             * reuse it when the same function was applied to the same terms
+             * (or to swapped terms if the function is commutative), or when
+             * lp_build_blend_func_reverse() reports the two functions as
+             * reverses of each other and the terms are swapped.
+             */
+            for(j = 0; j < i; ++j) {
+               unsigned prev_func = j < 3 ? blend->rt[0].rgb_func : blend->rt[0].alpha_func;
+               unsigned func_reverse = lp_build_blend_func_reverse(func, prev_func);
+               boolean same_func = func == prev_func;
+
+               if((same_func && !func_reverse &&
+                   bld.term[0][j] == bld.term[0][i] &&
+                   bld.term[1][j] == bld.term[1][i]) ||
+                  (((same_func && func_commutative) || func_reverse) &&
+                   bld.term[0][j] == bld.term[1][i] &&
+                   bld.term[1][j] == bld.term[0][i]))
+                  break;
+            }
+
+            if(j < i)
+               res[i] = res[j];
+            else
+               res[i] = lp_build_blend_func(&bld.base, func, bld.term[0][i], bld.term[1][i]);
+         }
+         else {
+            /* Neither logic op nor blending: pass the fragment color through */
+            res[i] = src[i];
+         }
+      }
+      else {
+         /* Channel is masked out: keep the framebuffer value */
+         res[i] = dst[i];
+      }
+   }
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c
new file mode 100644
index 00000000000..c8eaa8c3940
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c
@@ -0,0 +1,369 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper functions for constant building.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#include <float.h>
+
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+
+
+/**
+ * Number of bits that carry the magnitude of a value of the given type.
+ *
+ * For floating point types this is the number of explicitly stored mantissa
+ * (fraction) bits, i.e. not counting the implicit leading one: 23 for 32-bit
+ * and 52 for 64-bit floats.  Only floating point types are expected
+ * (asserted); the integer fall-back returns the width minus the sign bit.
+ */
+unsigned
+lp_mantissa(struct lp_type type)
+{
+   assert(type.floating);
+
+   if(type.floating) {
+      switch(type.width) {
+      case 32:
+         return 23;
+      case 64:
+         /* 52 stored fraction bits; 53 would include the hidden bit and
+          * be inconsistent with the 23 returned for 32-bit floats */
+         return 52;
+      default:
+         assert(0);
+         return 0;
+      }
+   }
+   else {
+      if(type.sign)
+         return type.width - 1;
+      else
+         return type.width;
+   }
+}
+
+
+/**
+ * Shift of the unity.
+ *
+ * Expresses the same thing as lp_const_scale(), as a shift count rather than
+ * a multiplier.
+ */
+unsigned
+lp_const_shift(struct lp_type type)
+{
+   /* Floating point values are not shifted */
+   if(type.floating)
+      return 0;
+
+   /* Fixed point: half of the bits */
+   if(type.fixed)
+      return type.width/2;
+
+   /* Unnormalized integers are not shifted either */
+   if(!type.norm)
+      return 0;
+
+   /* Normalized integers: every bit except the sign bit, if there is one */
+   if(type.sign)
+      return type.width - 1;
+
+   return type.width;
+}
+
+
+/**
+ * Offset of the unity: 1 for normalized integers that are neither floating
+ * nor fixed point, 0 for everything else.
+ */
+unsigned
+lp_const_offset(struct lp_type type)
+{
+   return (!type.floating && !type.fixed && type.norm) ? 1 : 0;
+}
+
+
+/**
+ * Scaling factor between the LLVM native value and its interpretation.
+ *
+ * Computed as (1 << lp_const_shift()) - lp_const_offset(), which is 1.0 for
+ * all floating types and unnormalized integers, and something else for the
+ * fixed point types and normalized integers.
+ */
+double
+lp_const_scale(struct lp_type type)
+{
+   const unsigned shift = lp_const_shift(type);
+   const unsigned offset = lp_const_offset(type);
+   const unsigned long long int_scale = ((unsigned long long)1 << shift) - offset;
+   const double scale = (double)int_scale;
+
+   /* The integer scale must survive the round trip through a double */
+   assert((unsigned long long)scale == int_scale);
+
+   return scale;
+}
+
+
+/**
+ * Minimum value representable by the type.
+ *
+ * Unsigned types yield 0, signed normalized types -1, floating point types
+ * -FLT_MAX/-DBL_MAX, and the remaining signed integer and fixed point types
+ * minus two to the power of their (integer) value bits.
+ */
+double
+lp_const_min(struct lp_type type)
+{
+   unsigned bits;
+
+   if(!type.sign)
+      return 0.0;
+
+   if(type.norm)
+      return -1.0;
+
+   if (type.floating) {
+      switch(type.width) {
+      case 32:
+         return -FLT_MAX;
+      case 64:
+         return -DBL_MAX;
+      default:
+         assert(0);
+         return 0.0;
+      }
+   }
+
+   if(type.fixed)
+      /* FIXME: consider the fractional bits? */
+      bits = type.width / 2 - 1;
+   else
+      bits = type.width - 1;
+
+   /*
+    * Shift as unsigned and negate the double: for 64-bit integers
+    * (long long)1 << 63 would overflow the signed type, which is undefined.
+    */
+   return -(double)((unsigned long long)1 << bits);
+}
+
+
+/**
+ * Maximum value representable by the type.
+ *
+ * Normalized types yield 1, floating point types FLT_MAX/DBL_MAX, and the
+ * remaining integer and fixed point types two to the power of their
+ * (integer, non-sign) value bits, minus one.
+ */
+double
+lp_const_max(struct lp_type type)
+{
+   unsigned bits;
+
+   if(type.norm)
+      return 1.0;
+
+   if (type.floating) {
+      switch(type.width) {
+      case 32:
+         return FLT_MAX;
+      case 64:
+         return DBL_MAX;
+      default:
+         assert(0);
+         return 0.0;
+      }
+   }
+
+   if(type.fixed)
+      bits = type.width / 2;
+   else
+      bits = type.width;
+
+   if(type.sign)
+      bits -= 1;
+
+   /*
+    * Shifting by the full width of the operand is undefined, which is what
+    * an unsigned 64-bit integer type would ask for.  All value bits set is
+    * the answer in that case.
+    */
+   if(bits >= sizeof(unsigned long long) * 8)
+      return (double)~(unsigned long long)0;
+
+   return (double)(((unsigned long long)1 << bits) - 1);
+}
+
+
+/**
+ * Epsilon of the type: FLT_EPSILON or DBL_EPSILON for floating point types,
+ * and the reciprocal of lp_const_scale() for everything else.
+ */
+double
+lp_const_eps(struct lp_type type)
+{
+   if (!type.floating)
+      return 1.0/lp_const_scale(type);
+
+   switch(type.width) {
+   case 32:
+      return FLT_EPSILON;
+   case 64:
+      return DBL_EPSILON;
+   default:
+      assert(0);
+      return 0.0;
+   }
+}
+
+
+/**
+ * Build an undefined value of the vector type that corresponds to \p type.
+ */
+LLVMValueRef
+lp_build_undef(struct lp_type type)
+{
+   return LLVMGetUndef(lp_build_vec_type(type));
+}
+               
+
+/**
+ * Build the null constant of the vector type that corresponds to \p type.
+ */
+LLVMValueRef
+lp_build_zero(struct lp_type type)
+{
+   return LLVMConstNull(lp_build_vec_type(type));
+}
+               
+
+/**
+ * Build a vector whose elements all hold the type's encoding of one:
+ * 1.0 for floats, 1 << (width/2) for fixed point, 1 for unnormalized
+ * integers, the largest positive value for signed normalized integers and
+ * all bits set for unsigned normalized integers.
+ */
+LLVMValueRef
+lp_build_one(struct lp_type type)
+{
+   LLVMTypeRef elem_type;
+   LLVMValueRef one;
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+
+   elem_type = lp_build_elem_type(type);
+
+   if(!type.floating && !type.fixed && type.norm && !type.sign) {
+      /* Special case -- 1.0 for unsigned normalized types is more easily
+       * attained by asking for a vector with all bits set */
+      return LLVMConstAllOnes(LLVMVectorType(elem_type, type.length));
+   }
+
+   if(type.floating)
+      one = LLVMConstReal(elem_type, 1.0);
+   else if(type.fixed)
+      one = LLVMConstInt(elem_type, 1LL << (type.width/2), 0);
+   else if(type.norm)
+      /* signed normalized: the unsigned case returned above */
+      one = LLVMConstInt(elem_type, (1LL << (type.width - 1)) - 1, 0);
+   else
+      one = LLVMConstInt(elem_type, 1, 0);
+
+   /* Replicate the scalar into every element */
+   for(i = 0; i < type.length; ++i)
+      elems[i] = one;
+
+   return LLVMConstVector(elems, type.length);
+}
+               
+
+/**
+ * Build a vector with all elements set to the same constant.
+ *
+ * For floating point types \p val is used as is.  For every other type it
+ * is multiplied by lp_const_scale() and rounded to the nearest integer.
+ *
+ * \param type  type of the vector to build
+ * \param val   value of each element, before scaling
+ */
+LLVMValueRef
+lp_build_const_scalar(struct lp_type type,
+                      double val)
+{
+   LLVMTypeRef elem_type = lp_build_elem_type(type);
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+
+   if(type.floating) {
+      elems[0] = LLVMConstReal(elem_type, val);
+   }
+   else {
+      double scaled = val*lp_const_scale(type);
+      unsigned long long bits;
+
+      /*
+       * Converting a negative double straight to an unsigned integer is
+       * undefined, and adding 0.5 before truncating mis-rounds negative
+       * values.  Round away from zero and go through a signed integer so
+       * that the two's complement bit pattern is what LLVM receives.
+       */
+      if(scaled < 0.0)
+         bits = (unsigned long long)(long long)(scaled - 0.5);
+      else
+         bits = (unsigned long long)(scaled + 0.5);
+
+      elems[0] = LLVMConstInt(elem_type, bits, 0);
+   }
+
+   /* Replicate the first element into the rest of the vector */
+   for(i = 1; i < type.length; ++i)
+      elems[i] = elems[0];
+
+   return LLVMConstVector(elems, type.length);
+}
+
+
+/**
+ * Build a vector with all elements set to the integer \p val, using the
+ * element type returned by lp_build_int_elem_type().  The constant is
+ * sign-extended when the type is signed.
+ */
+LLVMValueRef
+lp_build_int_const_scalar(struct lp_type type,
+                          long long val)
+{
+   const LLVMTypeRef int_elem_type = lp_build_int_elem_type(type);
+   const int sign_extend = type.sign ? 1 : 0;
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned n = 0;
+
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+
+   while(n < type.length) {
+      elems[n] = LLVMConstInt(int_elem_type, val, sign_extend);
+      ++n;
+   }
+
+   return LLVMConstVector(elems, type.length);
+}
+
+
+/**
+ * Scale a channel value and round it to the nearest integer.
+ *
+ * Negative values are rounded away from zero and converted through a signed
+ * integer, since converting a negative double directly to an unsigned
+ * integer is undefined.  Returns the two's complement bit pattern.
+ */
+static unsigned long long
+lp_const_aos_round(double val, double scale)
+{
+   double scaled = val*scale;
+
+   if(scaled < 0.0)
+      return (unsigned long long)(long long)(scaled - 0.5);
+
+   return (unsigned long long)(scaled + 0.5);
+}
+
+
+/**
+ * Build a constant vector made of repeated groups of four channels.
+ *
+ * For floating point types the channel values are used as is.  For every
+ * other type they are multiplied by lp_const_scale() and rounded to the
+ * nearest integer.
+ *
+ * \param type     type of the vector; its length must be a multiple of 4
+ * \param r,g,b,a  channel values, before scaling
+ * \param swizzle  position (0..3) of r, g, b and a within each group of
+ *                 four, or NULL for the identity order
+ */
+LLVMValueRef
+lp_build_const_aos(struct lp_type type, 
+                   double r, double g, double b, double a, 
+                   const unsigned char *swizzle)
+{
+   const unsigned char default_swizzle[4] = {0, 1, 2, 3};
+   LLVMTypeRef elem_type;
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(type.length % 4 == 0);
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+
+   elem_type = lp_build_elem_type(type);
+
+   if(swizzle == NULL)
+      swizzle = default_swizzle;
+
+   /* The swizzle indexes the first group of four elements only */
+   assert(swizzle[0] < 4 && swizzle[1] < 4 && swizzle[2] < 4 && swizzle[3] < 4);
+
+   if(type.floating) {
+      elems[swizzle[0]] = LLVMConstReal(elem_type, r);
+      elems[swizzle[1]] = LLVMConstReal(elem_type, g);
+      elems[swizzle[2]] = LLVMConstReal(elem_type, b);
+      elems[swizzle[3]] = LLVMConstReal(elem_type, a);
+   }
+   else {
+      double dscale = lp_const_scale(type);
+
+      elems[swizzle[0]] = LLVMConstInt(elem_type, lp_const_aos_round(r, dscale), 0);
+      elems[swizzle[1]] = LLVMConstInt(elem_type, lp_const_aos_round(g, dscale), 0);
+      elems[swizzle[2]] = LLVMConstInt(elem_type, lp_const_aos_round(b, dscale), 0);
+      elems[swizzle[3]] = LLVMConstInt(elem_type, lp_const_aos_round(a, dscale), 0);
+   }
+
+   /* Repeat the first group of four across the rest of the vector */
+   for(i = 4; i < type.length; ++i)
+      elems[i] = elems[i % 4];
+
+   return LLVMConstVector(elems, type.length);
+}
+
+
+/**
+ * Build a constant integer mask vector made of repeated groups of four
+ * channels: element i of every group has all bits set when cond[i] is true
+ * and is zero otherwise.
+ *
+ * \param type  supplies the element width and the vector length; the length
+ *              must be a multiple of 4
+ * \param cond  per-channel condition
+ */
+LLVMValueRef
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4])
+{
+   LLVMTypeRef elem_type = LLVMIntType(type.width);
+   LLVMValueRef masks[LP_MAX_VECTOR_LENGTH];
+   unsigned i, j;
+
+   /* Elements are written four at a time, so a length that is not a
+    * multiple of 4 would write past type.length (same requirement as
+    * lp_build_const_aos) */
+   assert(type.length % 4 == 0);
+   assert(type.length <= LP_MAX_VECTOR_LENGTH);
+
+   for(j = 0; j < type.length; j += 4)
+      for(i = 0; i < 4; ++i)
+         masks[j + i] = LLVMConstInt(elem_type, cond[i] ? ~0ULL : 0, 0);
+
+   return LLVMConstVector(masks, type.length);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h
new file mode 100644
index 00000000000..cb8e1c7b006
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h
@@ -0,0 +1,108 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for constant building.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#ifndef LP_BLD_CONST_H
+#define LP_BLD_CONST_H
+
+
+#include <llvm-c/Core.h>  
+
+#include <pipe/p_compiler.h>
+
+
+struct lp_type;
+
+
+/** Number of mantissa bits of a floating point type. */
+unsigned
+lp_mantissa(struct lp_type type);
+
+
+/** Shift of the unity: lp_const_scale() expressed as a shift count. */
+unsigned
+lp_const_shift(struct lp_type type);
+
+
+/** Offset of the unity: 1 for normalized integers, 0 otherwise. */
+unsigned
+lp_const_offset(struct lp_type type);
+
+
+/** Scaling factor between the LLVM native value and its interpretation. */
+double
+lp_const_scale(struct lp_type type);
+
+/** Minimum value representable by the type. */
+double
+lp_const_min(struct lp_type type);
+
+
+/** Maximum value representable by the type. */
+double
+lp_const_max(struct lp_type type);
+
+
+/**
+ * FLT_EPSILON/DBL_EPSILON for floating point types, 1/lp_const_scale() for
+ * the others.
+ */
+double
+lp_const_eps(struct lp_type type);
+
+
+/** Undefined value of the type's vector type. */
+LLVMValueRef
+lp_build_undef(struct lp_type type);
+
+
+/** Null constant of the type's vector type. */
+LLVMValueRef
+lp_build_zero(struct lp_type type);
+
+
+/** Vector with every element set to the type's encoding of one. */
+LLVMValueRef
+lp_build_one(struct lp_type type);
+
+
+/**
+ * Vector with every element set to val; non floating point types scale it
+ * by lp_const_scale() first.
+ */
+LLVMValueRef
+lp_build_const_scalar(struct lp_type type,
+                      double val);
+
+
+/** Vector with every element set to the integer val. */
+LLVMValueRef
+lp_build_int_const_scalar(struct lp_type type,
+                          long long val);
+
+
+/**
+ * Vector of repeated r, g, b, a groups.  swizzle gives the position of each
+ * channel within a group; NULL selects the identity order.
+ */
+LLVMValueRef
+lp_build_const_aos(struct lp_type type, 
+                   double r, double g, double b, double a, 
+                   const unsigned char *swizzle);
+
+
+/**
+ * Integer vector of repeated groups of four masks: all bits set where
+ * cond[i] is true, zero elsewhere.
+ */
+LLVMValueRef
+lp_build_const_mask_aos(struct lp_type type,
+                        const boolean cond[4]);
+
+
+#endif /* !LP_BLD_CONST_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
new file mode 100644
index 00000000000..f77cf787213
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -0,0 +1,469 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper functions for type conversions.
+ *
+ * We want to use the fastest type for a given computation whenever feasible.
+ * The other side of this is that we need to be able convert between several
+ * types accurately and efficiently.
+ *
+ * Conversion between types of different bit width is quite complex.
+ *
+ * There are a few invariants to remember in type conversions:
+ *
+ * - register width must remain constant:
+ *
+ *     src_type.width * src_type.length == dst_type.width * dst_type.length
+ *
+ * - total number of elements must remain constant:
+ *
+ *     src_type.length * num_srcs == dst_type.length * num_dsts
+ *
+ * It is not always possible to do the conversion both accurately and
+ * efficiently, usually due to lack of adequate machine instructions. In these
+ * cases it is important not to take shortcuts here and sacrifice accuracy, as
+ * these functions can be used anywhere. In the future we might have a
+ * precision parameter which can gauge the accuracy vs efficiency compromise,
+ * but for now if the data conversion between two stages happens to be the
+ * bottleneck, then most likely one should just avoid converting at all and
+ * run both stages with the same type.
+ *
+ * Make sure to run lp_test_conv unit test after any change to this file.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_debug.h"
+#include "util/u_math.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_conv.h"
+
+
+/**
+ * Special case for converting clamped IEEE-754 floats to unsigned norms.
+ *
+ * The mathematical voodoo below may seem excessive but it is actually
+ * paramount we do it this way for several reasons. First, there is no single
+ * precision FP to unsigned integer conversion Intel SSE instruction. Second,
+ * even if there was, since the FP's mantissa takes only a fraction of the
+ * register bits, the typical scale and cast approach would require double
+ * precision for accurate results, and therefore half the throughput.
+ *
+ * Although the result values can be scaled to an arbitrary bit width specified
+ * by dst_width, the actual result type will have the same width.
+ *
+ * \param builder    LLVM instruction builder the code is emitted with
+ * \param src_type   type of \p src; must be floating point (asserted)
+ * \param dst_width  bit width the normalized result is scaled to
+ * \param src        value to convert; going by the function name it is
+ *                   expected to be clamped already -- presumably to [0, 1],
+ *                   TODO confirm against callers
+ * \return the converted value, as the integer vector type of \p src_type
+ */
+LLVMValueRef
+lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
+                                        struct lp_type src_type,
+                                        unsigned dst_width,
+                                        LLVMValueRef src)
+{
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
+   LLVMValueRef res;
+   unsigned mantissa;
+   unsigned n;
+   unsigned long long ubound;
+   unsigned long long mask;
+   double scale;
+   double bias;
+
+   assert(src_type.floating);
+
+   mantissa = lp_mantissa(src_type);
+
+   /* We cannot carry more bits than the mantissa */
+   n = MIN2(mantissa, dst_width);
+
+   /* This magic coefficients will make the desired result to appear in the
+    * lowest significant bits of the mantissa:
+    *   scale = (2^n - 1) / 2^n,   bias = 2^(mantissa - n)
+    */
+   ubound = ((unsigned long long)1 << n);
+   mask = ubound - 1;
+   scale = (double)mask/ubound;
+   bias = (double)((unsigned long long)1 << (mantissa - n));
+
+   /* src*scale + bias, then reinterpret the float bits as an integer */
+   res = LLVMBuildMul(builder, src, lp_build_const_scalar(src_type, scale), "");
+   res = LLVMBuildAdd(builder, res, lp_build_const_scalar(src_type, bias), "");
+   res = LLVMBuildBitCast(builder, res, int_vec_type, "");
+
+   if(dst_width > n) {
+      /* More destination bits than the mantissa can supply: move the n
+       * result bits to the top of the dst_width range.
+       *
+       * NOTE(review): unlike the branch below nothing is masked here, so
+       * whatever the shift leaves above bit dst_width - 1 stays in the
+       * result -- confirm callers only use dst_width equal to the element
+       * width or ignore the upper bits.
+       */
+      int shift = dst_width - n;
+      res = LLVMBuildShl(builder, res, lp_build_int_const_scalar(src_type, shift), "");
+
+      /* TODO: Fill in the empty lower bits for additional precision? */
+      /* YES: this fixes progs/trivial/tri-z-eq.c.
+       * Otherwise vertex Z=1.0 values get converted to something like
+       * 0xfffffb00 and the test for equality with 0xffffffff fails.
+       */
+      /* Both fill-in variants below are compiled out */
+#if 0
+      {
+         LLVMValueRef msb;
+         msb = LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, dst_width - 1), "");
+         msb = LLVMBuildShl(builder, msb, lp_build_int_const_scalar(src_type, shift), "");
+         msb = LLVMBuildSub(builder, msb, lp_build_int_const_scalar(src_type, 1), "");
+         res = LLVMBuildOr(builder, res, msb, "");
+      }
+#elif 0
+      while(shift > 0) {
+         res = LLVMBuildOr(builder, res, LLVMBuildLShr(builder, res, lp_build_int_const_scalar(src_type, n), ""), "");
+         shift -= n;
+         n *= 2;
+      }
+#endif
+   }
+   else
+      /* Keep only the n low bits, discarding the rest of the float encoding */
+      res = LLVMBuildAnd(builder, res, lp_build_int_const_scalar(src_type, mask), "");
+
+   return res;
+}
+
+
+/**
+ * Inverse of lp_build_clamped_float_to_unsigned_norm above.
+ *
+ * The integer is placed in the low mantissa bits of the constant
+ * 2^(mantissa - n), that constant is subtracted again in floating point, and
+ * the remainder is multiplied by 2^n / (2^n - 1).
+ *
+ * \param builder    LLVM instruction builder the code is emitted with
+ * \param src_width  bit width of the normalized values held in \p src
+ * \param dst_type   floating point type to convert to
+ * \param src        integer value to convert
+ */
+LLVMValueRef
+lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
+                                unsigned src_width,
+                                struct lp_type dst_type,
+                                LLVMValueRef src)
+{
+   LLVMTypeRef float_vec_type = lp_build_vec_type(dst_type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(dst_type);
+   const unsigned mantissa = lp_mantissa(dst_type);
+   /* No more bits than the mantissa holds can be carried over */
+   const unsigned n = MIN2(mantissa, src_width);
+   const unsigned long long ubound = (unsigned long long)1 << n;
+   const double scale = (double)ubound/(ubound - 1);
+   const double bias = (double)((unsigned long long)1 << (mantissa - n));
+   LLVMValueRef bias_vec;
+   LLVMValueRef bias_bits;
+   LLVMValueRef scale_vec;
+   LLVMValueRef res = src;
+
+   /* Drop the low bits that do not fit in the mantissa */
+   if(src_width > mantissa)
+      res = LLVMBuildLShr(builder, res,
+                          lp_build_int_const_scalar(dst_type, src_width - mantissa), "");
+
+   /* OR the integer into the bit pattern of the bias ... */
+   bias_vec = lp_build_const_scalar(dst_type, bias);
+   bias_bits = LLVMBuildBitCast(builder, bias_vec, int_vec_type, "");
+   res = LLVMBuildOr(builder, res, bias_bits, "");
+
+   /* ... reinterpret as float, then remove the bias and rescale */
+   res = LLVMBuildBitCast(builder, res, float_vec_type, "");
+   res = LLVMBuildSub(builder, res, bias_vec, "");
+
+   scale_vec = lp_build_const_scalar(dst_type, scale);
+   return LLVMBuildMul(builder, res, scale_vec, "");
+}
+
+
+/**
+ * Generic type conversion.
+ *
+ * Converts num_srcs vectors of src_type into num_dsts vectors of dst_type in
+ * four steps: clamp to the destination range, scale to the narrower of the
+ * two ranges, truncate or expand the bit width, and scale to the wider range.
+ *
+ * \param builder   LLVM instruction builder the code is emitted with
+ * \param src_type  type of the source vectors
+ * \param dst_type  type of the destination vectors
+ * \param src       source vectors
+ * \param num_srcs  number of source vectors, at most LP_MAX_VECTOR_LENGTH
+ * \param dst       receives the converted vectors
+ * \param num_dsts  number of destination vectors, at most
+ *                  LP_MAX_VECTOR_LENGTH
+ *
+ * TODO: Take a precision argument, or even better, add a new precision member
+ * to the lp_type union.
+ */
+void
+lp_build_conv(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              const LLVMValueRef *src, unsigned num_srcs,
+              LLVMValueRef *dst, unsigned num_dsts)
+{
+   struct lp_type tmp_type;
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+   unsigned num_tmps;
+   unsigned i;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not loose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
+   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
+
+   /* tmp[] first holds one entry per source vector and later one per
+    * destination vector, so both counts must fit in it */
+   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
+   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);
+
+   tmp_type = src_type;
+   for(i = 0; i < num_srcs; ++i)
+      tmp[i] = src[i];
+   num_tmps = num_srcs;
+
+   /*
+    * Clamp if necessary
+    */
+
+   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
+      struct lp_build_context bld;
+      double src_min = lp_const_min(src_type);
+      double dst_min = lp_const_min(dst_type);
+      double src_max = lp_const_max(src_type);
+      double dst_max = lp_const_max(dst_type);
+      LLVMValueRef thres;
+
+      lp_build_context_init(&bld, builder, tmp_type);
+
+      if(src_min < dst_min) {
+         if(dst_min == 0.0)
+            thres = bld.zero;
+         else
+            thres = lp_build_const_scalar(src_type, dst_min);
+         for(i = 0; i < num_tmps; ++i)
+            tmp[i] = lp_build_max(&bld, tmp[i], thres);
+      }
+
+      if(src_max > dst_max) {
+         if(dst_max == 1.0)
+            thres = bld.one;
+         else
+            thres = lp_build_const_scalar(src_type, dst_max);
+         for(i = 0; i < num_tmps; ++i)
+            tmp[i] = lp_build_min(&bld, tmp[i], thres);
+      }
+   }
+
+   /*
+    * Scale to the narrowest range
+    */
+
+   if(dst_type.floating) {
+      /* Nothing to do */
+   }
+   else if(tmp_type.floating) {
+      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
+         /* float -> unsigned normalized: use the dedicated conversion */
+         for(i = 0; i < num_tmps; ++i) {
+            tmp[i] = lp_build_clamped_float_to_unsigned_norm(builder,
+                                                             tmp_type,
+                                                             dst_type.width,
+                                                             tmp[i]);
+         }
+         tmp_type.floating = FALSE;
+      }
+      else {
+         double dst_scale = lp_const_scale(dst_type);
+         LLVMTypeRef tmp_vec_type;
+
+         if (dst_scale != 1.0) {
+            LLVMValueRef scale = lp_build_const_scalar(tmp_type, dst_scale);
+            for(i = 0; i < num_tmps; ++i)
+               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
+         }
+
+         /* Use an equally sized integer for intermediate computations */
+         tmp_type.floating = FALSE;
+         tmp_vec_type = lp_build_vec_type(tmp_type);
+         for(i = 0; i < num_tmps; ++i) {
+#if 0
+            if(dst_type.sign)
+               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
+            else
+               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
+#else
+            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
+            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
+#endif
+         }
+      }
+   }
+   else {
+      unsigned src_shift = lp_const_shift(src_type);
+      unsigned dst_shift = lp_const_shift(dst_type);
+
+      /* FIXME: compensate different offsets too */
+      if(src_shift > dst_shift) {
+         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, src_shift - dst_shift);
+         for(i = 0; i < num_tmps; ++i) {
+            /* Arithmetic shift keeps the sign of signed sources */
+            if(src_type.sign)
+               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
+            else
+               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
+         }
+      }
+   }
+
+   /*
+    * Truncate or expand bit width
+    */
+
+   assert(!tmp_type.floating || tmp_type.width == dst_type.width);
+
+   if(tmp_type.width > dst_type.width) {
+      assert(num_dsts == 1);
+      tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps);
+      tmp_type.width = dst_type.width;
+      tmp_type.length = dst_type.length;
+      num_tmps = 1;
+   }
+
+   if(tmp_type.width < dst_type.width) {
+      assert(num_tmps == 1);
+      lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts);
+      tmp_type.width = dst_type.width;
+      tmp_type.length = dst_type.length;
+      num_tmps = num_dsts;
+   }
+
+   assert(tmp_type.width == dst_type.width);
+   assert(tmp_type.length == dst_type.length);
+   assert(num_tmps == num_dsts);
+
+   /*
+    * Scale to the widest range
+    */
+
+   if(src_type.floating) {
+      /* Nothing to do */
+   }
+   else if(dst_type.floating) {
+      if(!src_type.fixed && !src_type.sign && src_type.norm) {
+         /* unsigned normalized -> float: use the dedicated conversion */
+         for(i = 0; i < num_tmps; ++i) {
+            tmp[i] = lp_build_unsigned_norm_to_float(builder,
+                                                     src_type.width,
+                                                     dst_type,
+                                                     tmp[i]);
+         }
+         tmp_type.floating = TRUE;
+      }
+      else {
+         double src_scale = lp_const_scale(src_type);
+         LLVMTypeRef tmp_vec_type;
+
+         /* Switch the intermediate type to an equally sized signed float */
+         tmp_type.floating = TRUE;
+         tmp_type.sign = TRUE;
+         tmp_vec_type = lp_build_vec_type(tmp_type);
+         for(i = 0; i < num_tmps; ++i) {
+#if 0
+            if(dst_type.sign)
+               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
+            else
+               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
+#else
+            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
+            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
+#endif
+         }
+
+         if (src_scale != 1.0) {
+            LLVMValueRef scale = lp_build_const_scalar(tmp_type, 1.0/src_scale);
+            for(i = 0; i < num_tmps; ++i)
+               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
+         }
+      }
+   }
+   else {
+      unsigned src_shift = lp_const_shift(src_type);
+      unsigned dst_shift = lp_const_shift(dst_type);
+
+      /* FIXME: compensate different offsets too */
+      if(src_shift < dst_shift) {
+         LLVMValueRef shift = lp_build_int_const_scalar(tmp_type, dst_shift - src_shift);
+         for(i = 0; i < num_tmps; ++i)
+            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
+      }
+   }
+
+   for(i = 0; i < num_dsts; ++i)
+      dst[i] = tmp[i];
+}
+
+
+/**
+ * Bit mask conversion.
+ *
+ * Converts integer masks between vectors that match the given types.
+ *
+ * The mask values should be 0 or -1, i.e., all bits either set to zero or
+ * one.  Any other value will likely give unpredictable results.
+ *
+ * This is basically a very trimmed down version of lp_build_conv.
+ */
+void
+lp_build_conv_mask(LLVMBuilderRef builder,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
+                   const LLVMValueRef *src, unsigned num_srcs,
+                   LLVMValueRef *dst, unsigned num_dsts)
+{
+   unsigned i;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not loose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length * num_dsts);
+
+   /*
+    * Treat both sides as plain signed integers, dropping every other
+    * attribute of the types.
+    *
+    * We assume all values are 0 or -1
+    */
+
+   src_type.floating = FALSE;
+   dst_type.floating = FALSE;
+
+   src_type.fixed = FALSE;
+   dst_type.fixed = FALSE;
+
+   src_type.sign = TRUE;
+   dst_type.sign = TRUE;
+
+   src_type.norm = FALSE;
+   dst_type.norm = FALSE;
+
+   /* Same bit width: the masks are copied through unchanged */
+   if(src_type.width == dst_type.width) {
+      assert(num_srcs == num_dsts);
+      for(i = 0; i < num_dsts; ++i)
+         dst[i] = src[i];
+      return;
+   }
+
+   /*
+    * Truncate or expand bit width
+    */
+
+   if(src_type.width > dst_type.width) {
+      assert(num_dsts == 1);
+      dst[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs);
+   }
+   else {
+      assert(num_srcs == 1);
+      lp_build_unpack(builder, src_type, dst_type, src[0], dst, num_dsts);
+   }
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.h b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
new file mode 100644
index 00000000000..948e68fae4f
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.h
@@ -0,0 +1,73 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for type conversions.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#ifndef LP_BLD_CONV_H
+#define LP_BLD_CONV_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;
+
+
+LLVMValueRef
+lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
+                                        struct lp_type src_type,
+                                        unsigned dst_width,
+                                        LLVMValueRef src);
+
+LLVMValueRef
+lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
+                                unsigned src_width,
+                                struct lp_type dst_type,
+                                LLVMValueRef src);
+
+
+void
+lp_build_conv(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              const LLVMValueRef *srcs, unsigned num_srcs,
+              LLVMValueRef *dsts, unsigned num_dsts);
+
+void
+lp_build_conv_mask(LLVMBuilderRef builder,
+                   struct lp_type src_type,
+                   struct lp_type dst_type,
+                   const LLVMValueRef *src, unsigned num_srcs,
+                   LLVMValueRef *dst, unsigned num_dsts);
+
+#endif /* !LP_BLD_CONV_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
new file mode 100644
index 00000000000..39dfc51e503
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c
@@ -0,0 +1,132 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifdef HAVE_UDIS86
+#include <udis86.h>
+#endif
+
+#include "util/u_math.h"
+#include "util/u_debug.h"
+#include "lp_bld_debug.h"
+
+
+/**
+ * Check whether ptr is aligned to alignment bytes (asserted via util_is_pot).
+ *
+ * It is important that this check is not implemented as a macro or inlined
+ * function, as the compiler assumptions with respect to alignment of global
+ * and stack variables would often make the check a no-op, defeating the
+ * whole purpose of the exercise.
+ */
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment)
+{
+   assert(util_is_pot(alignment));
+   return ((uintptr_t)ptr & (alignment - 1)) == 0;
+}
+
+
+void
+lp_disassemble(const void* func)
+{
+#ifdef HAVE_UDIS86
+   ud_t ud_obj;
+   uint64_t max_jmp_pc;
+
+   ud_init(&ud_obj);
+
+   ud_set_input_buffer(&ud_obj, (void*)func, 0xffff);
+
+   max_jmp_pc = (uint64_t) (uintptr_t) func;
+   ud_set_pc(&ud_obj, max_jmp_pc);
+
+#ifdef PIPE_ARCH_X86
+   ud_set_mode(&ud_obj, 32);
+#endif
+#ifdef PIPE_ARCH_X86_64
+   ud_set_mode(&ud_obj, 64);
+#endif
+
+   ud_set_syntax(&ud_obj, UD_SYN_ATT);
+
+   while (ud_disassemble(&ud_obj)) {
+
+#ifdef PIPE_ARCH_X86
+      debug_printf("0x%08lx:\t", (unsigned long)ud_insn_off(&ud_obj));
+#endif
+#ifdef PIPE_ARCH_X86_64
+      debug_printf("0x%016llx:\t", (unsigned long long)ud_insn_off(&ud_obj));
+#endif
+
+#if 0
+      debug_printf("%-16s ", ud_insn_hex(&ud_obj));
+#endif
+
+      debug_printf("%s\n", ud_insn_asm(&ud_obj));
+
+      if(ud_obj.mnemonic != UD_Icall) {
+         unsigned i;
+         for(i = 0; i < 3; ++i) {
+            const struct ud_operand *op = &ud_obj.operand[i];
+            if (op->type == UD_OP_JIMM){
+               uint64_t pc = ud_obj.pc;
+
+               switch (op->size) {
+               case 8:
+                  pc += op->lval.sbyte;
+                  break;
+               case 16:
+                  pc += op->lval.sword;
+                  break;
+               case 32:
+                  pc += op->lval.sdword;
+                  break;
+               default:
+                  break;
+               }
+               if(pc > max_jmp_pc)
+                  max_jmp_pc = pc;
+            }
+         }
+      }
+
+      if ((ud_insn_off(&ud_obj) >= max_jmp_pc && ud_obj.mnemonic == UD_Iret) ||
+           ud_obj.mnemonic == UD_Iinvalid)
+         break;
+   }
+
+#if 0
+   /* Print GDB command, useful to verify udis86 output */
+   debug_printf("disassemble %p %p\n", func, (void*)(uintptr_t)ud_obj.pc);
+#endif
+
+   debug_printf("\n");
+#else
+   (void)func;
+#endif
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
index d91cd35b3b7..583e6132b4b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump_c.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ *
+ * Copyright 2009 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,40 +10,55 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
-#ifndef TGSI_DUMP_C_H
-#define TGSI_DUMP_C_H
 
-#include "pipe/p_shader_tokens.h"
+#ifndef LP_BLD_DEBUG_H
+#define LP_BLD_DEBUG_H
 
-#if defined __cplusplus
-extern "C" {
+
+#include <llvm-c/Core.h>
+
+#include "pipe/p_compiler.h"
+#include "util/u_string.h"
+
+
+static INLINE void
+lp_build_name(LLVMValueRef val, const char *format, ...)
+{
+#ifdef DEBUG
+   char name[32];
+   va_list ap;
+   va_start(ap, format);
+   util_vsnprintf(name, sizeof name, format, ap);
+   va_end(ap);
+   LLVMSetValueName(val, name);
+#else
+   (void)val;
+   (void)format;
 #endif
+}
+
+
+boolean
+lp_check_alignment(const void *ptr, unsigned alignment);
 
-#define TGSI_DUMP_C_IGNORED 1
-#define TGSI_DUMP_C_DEFAULT 2
 
 void
-tgsi_dump_c(
-   const struct tgsi_token *tokens,
-   uint flags );
+lp_disassemble(const void* func);
 
-#if defined __cplusplus
-}
-#endif
 
-#endif /* TGSI_DUMP_C_H */
+#endif /* !LP_BLD_DEBUG_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_depth.c b/src/gallium/auxiliary/gallivm/lp_bld_depth.c
new file mode 100644
index 00000000000..f08f8eb6d8b
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_depth.c
@@ -0,0 +1,213 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Depth/stencil testing to LLVM IR translation.
+ *
+ * To be done accurately/efficiently the depth/stencil test must be done with
+ * the same type/format of the depth/stencil buffer, which implies massaging
+ * the incoming depths to fit into place. Using a more straightforward
+ * type/format for depth/stencil values internally and only convert when
+ * flushing would avoid this, but it would most likely result in depth fighting
+ * artifacts.
+ *
+ * We are free to use a different pixel layout though. Since our basic
+ * processing unit is a quad (2x2 pixel block) we store the depth/stencil
+ * values tiled, a quad at a time. That is, a depth buffer containing
+ *
+ *  Z11 Z12 Z13 Z14 ...
+ *  Z21 Z22 Z23 Z24 ...
+ *  Z31 Z32 Z33 Z34 ...
+ *  Z41 Z42 Z43 Z44 ...
+ *  ... ... ... ... ...
+ *
+ * will actually be stored in memory as
+ *
+ *  Z11 Z12 Z21 Z22 Z13 Z14 Z23 Z24 ...
+ *  Z31 Z32 Z41 Z42 Z33 Z34 Z43 Z44 ...
+ *  ... ... ... ... ... ... ... ... ...
+ *
+ * FIXME: Code generate stencil test
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_depth.h"
+
+
+/**
+ * Return a type appropriate for depth/stencil testing.
+ */
+struct lp_type
+lp_depth_type(const struct util_format_description *format_desc,
+              unsigned length)
+{
+   struct lp_type type;
+   unsigned swizzle;
+
+   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+
+   swizzle = format_desc->swizzle[0];
+   assert(swizzle < 4);
+
+   memset(&type, 0, sizeof type);
+   type.width = format_desc->block.bits;
+
+   if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
+      type.floating = TRUE;
+      assert(swizzle == 0);
+      assert(format_desc->channel[swizzle].size == format_desc->block.bits);
+   }
+   else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+      assert(format_desc->block.bits <= 32);
+      if(format_desc->channel[swizzle].normalized)
+         type.norm = TRUE;
+   }
+   else
+      assert(0);
+
+   assert(type.width <= length);
+   type.length = length / type.width;
+
+   return type;
+}
+
+
+/**
+ * Depth test.
+ */
+void
+lp_build_depth_test(LLVMBuilderRef builder,
+                    const struct pipe_depth_state *state,
+                    struct lp_type type,
+                    const struct util_format_description *format_desc,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef src,
+                    LLVMValueRef dst_ptr)
+{
+   struct lp_build_context bld;
+   unsigned z_swizzle;
+   LLVMValueRef dst;
+   LLVMValueRef z_bitmask = NULL;
+   LLVMValueRef test;
+
+   if(!state->enabled)
+      return;
+
+   assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+
+   z_swizzle = format_desc->swizzle[0];
+   if(z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
+      return;
+
+   /* Sanity checking */
+   assert(z_swizzle < 4);
+   assert(format_desc->block.bits == type.width);
+   if(type.floating) {
+      assert(z_swizzle == 0);
+      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT);
+      assert(format_desc->channel[z_swizzle].size == format_desc->block.bits);
+   }
+   else {
+      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
+      assert(format_desc->channel[z_swizzle].normalized);
+      assert(!type.fixed);
+      assert(!type.sign);
+      assert(type.norm);
+   }
+
+   /* Setup build context */
+   lp_build_context_init(&bld, builder, type);
+
+   dst = LLVMBuildLoad(builder, dst_ptr, "");
+
+   lp_build_name(dst, "zsbuf");
+
+   /* Align the source depth bits with the destination's, and mask out any
+    * stencil or padding bits from both */
+   if(format_desc->channel[z_swizzle].size == format_desc->block.bits) {
+      assert(z_swizzle == 0);
+      /* nothing to do */
+   }
+   else {
+      unsigned padding_left;
+      unsigned padding_right;
+      unsigned chan;
+
+      assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
+      assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
+      assert(format_desc->channel[z_swizzle].size <= format_desc->block.bits);
+      assert(format_desc->channel[z_swizzle].normalized);
+
+      padding_right = 0;
+      for(chan = 0; chan < z_swizzle; ++chan)
+         padding_right += format_desc->channel[chan].size;
+      padding_left = format_desc->block.bits -
+                     (padding_right + format_desc->channel[z_swizzle].size);
+
+      if(padding_left || padding_right) {
+         const unsigned long long mask_left = ((unsigned long long)1 << (format_desc->block.bits - padding_left)) - 1;
+         const unsigned long long mask_right = ((unsigned long long)1 << (padding_right)) - 1;
+         z_bitmask = lp_build_int_const_scalar(type, mask_left ^ mask_right);
+      }
+
+      if(padding_left)
+         src = LLVMBuildLShr(builder, src, lp_build_int_const_scalar(type, padding_left), "");
+      if(padding_right)
+         src = LLVMBuildAnd(builder, src, z_bitmask, "");
+      if(padding_left || padding_right)
+         dst = LLVMBuildAnd(builder, dst, z_bitmask, "");
+   }
+
+   lp_build_name(dst, "zsbuf.z");
+
+   test = lp_build_cmp(&bld, state->func, src, dst);
+   lp_build_mask_update(mask, test);
+
+   if(state->writemask) {
+      if(z_bitmask)
+         z_bitmask = LLVMBuildAnd(builder, mask->value, z_bitmask, "");
+      else
+         z_bitmask = mask->value;
+
+      dst = lp_build_select(&bld, z_bitmask, src, dst);
+      LLVMBuildStore(builder, dst, dst_ptr);
+   }
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_depth.h b/src/gallium/auxiliary/gallivm/lp_bld_depth.h
new file mode 100644
index 00000000000..79d6981bb51
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_depth.h
@@ -0,0 +1,63 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * Depth/stencil testing to LLVM IR translation.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#ifndef LP_BLD_DEPTH_H
+#define LP_BLD_DEPTH_H
+
+
+#include <llvm-c/Core.h>  
+
+ 
+struct pipe_depth_state;
+struct util_format_description;
+struct lp_type;
+struct lp_build_mask_context;
+
+
+struct lp_type
+lp_depth_type(const struct util_format_description *format_desc,
+              unsigned length);
+
+
+void
+lp_build_depth_test(LLVMBuilderRef builder,
+                    const struct pipe_depth_state *state,
+                    struct lp_type type,
+                    const struct util_format_description *format_desc,
+                    struct lp_build_mask_context *mask,
+                    LLVMValueRef src,
+                    LLVMValueRef dst_ptr);
+
+
+#endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
new file mode 100644
index 00000000000..bc831389085
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c
@@ -0,0 +1,757 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * LLVM control flow build helpers.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_flow.h"
+
+
+#define LP_BUILD_FLOW_MAX_VARIABLES 32
+#define LP_BUILD_FLOW_MAX_DEPTH 32
+
+/**
+ * Enumeration of all possible flow constructs.
+ */
+enum lp_build_flow_construct_kind {
+   LP_BUILD_FLOW_SCOPE,
+   LP_BUILD_FLOW_SKIP,
+   LP_BUILD_FLOW_IF
+};
+
+
+/**
+ * Variable declaration scope.
+ */
+struct lp_build_flow_scope
+{
+   /** Number of variables declared in this scope */
+   unsigned num_variables;
+};
+
+
+/**
+ * Early exit. Useful to skip to the end of a function or block when
+ * the execution mask becomes zero or when there is an error condition.
+ */
+struct lp_build_flow_skip
+{
+   /** Block to skip to */
+   LLVMBasicBlockRef block;
+
+   /** Number of variables declared at the beginning */
+   unsigned num_variables;
+
+   LLVMValueRef *phi;  /**< array [num_variables] */
+};
+
+
+/**
+ * if/else/endif.
+ */
+struct lp_build_flow_if
+{
+   unsigned num_variables;
+
+   LLVMValueRef *phi;  /**< array [num_variables] */
+
+   LLVMValueRef condition;
+   LLVMBasicBlockRef entry_block, true_block, false_block, merge_block;
+};
+
+
+/**
+ * Union of all possible flow constructs' data
+ */
+union lp_build_flow_construct_data
+{
+   struct lp_build_flow_scope scope;
+   struct lp_build_flow_skip skip;
+   struct lp_build_flow_if ifthen;
+};
+
+
+/**
+ * Element of the flow construct stack.
+ */
+struct lp_build_flow_construct
+{
+   enum lp_build_flow_construct_kind kind;
+   union lp_build_flow_construct_data data;
+};
+
+
+/**
+ * All necessary data to generate LLVM control flow constructs.
+ *
+ * Besides keeping track of the control flow construct themselves we also
+ * need to keep track of variables in order to generate SSA Phi values.
+ */
+struct lp_build_flow_context
+{
+   LLVMBuilderRef builder;
+
+   /**
+    * Control flow stack.
+    */
+   struct lp_build_flow_construct constructs[LP_BUILD_FLOW_MAX_DEPTH];
+   unsigned num_constructs;
+
+   /**
+    * Variable stack
+    */
+   LLVMValueRef *variables[LP_BUILD_FLOW_MAX_VARIABLES];
+   unsigned num_variables;
+};
+
+
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder)
+{
+   struct lp_build_flow_context *flow;
+
+   flow = CALLOC_STRUCT(lp_build_flow_context);
+   if(!flow)
+      return NULL;
+
+   flow->builder = builder;
+
+   return flow;
+}
+
+
+void
+lp_build_flow_destroy(struct lp_build_flow_context *flow)
+{
+   assert(flow->num_constructs == 0);
+   assert(flow->num_variables == 0);
+   FREE(flow);
+}
+
+
+/**
+ * Begin/push a new flow control construct (variable scope, skip block or
+ * if/else). Returns NULL when the construct stack is already full.
+ */
+static union lp_build_flow_construct_data *
+lp_build_flow_push(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs < LP_BUILD_FLOW_MAX_DEPTH);
+   if(flow->num_constructs >= LP_BUILD_FLOW_MAX_DEPTH)
+      return NULL;
+
+   flow->constructs[flow->num_constructs].kind = kind;
+   return &flow->constructs[flow->num_constructs++].data;
+}
+
+
+/**
+ * Return the current/top flow control construct on the stack.
+ * \param kind  the expected type of the top-most construct
+ */
+static union lp_build_flow_construct_data *
+lp_build_flow_peek(struct lp_build_flow_context *flow,
+                   enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs);
+   if(!flow->num_constructs)
+      return NULL;
+
+   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
+   if(flow->constructs[flow->num_constructs - 1].kind != kind)
+      return NULL;
+
+   return &flow->constructs[flow->num_constructs - 1].data;
+}
+
+
+/**
+ * End/pop the current/top flow control construct on the stack.
+ * \param kind  the expected type of the top-most construct
+ */
+static union lp_build_flow_construct_data *
+lp_build_flow_pop(struct lp_build_flow_context *flow,
+                  enum lp_build_flow_construct_kind kind)
+{
+   assert(flow->num_constructs);
+   if(!flow->num_constructs)
+      return NULL;
+
+   assert(flow->constructs[flow->num_constructs - 1].kind == kind);
+   if(flow->constructs[flow->num_constructs - 1].kind != kind)
+      return NULL;
+
+   return &flow->constructs[--flow->num_constructs].data;
+}
+
+
+/**
+ * Begin a variable scope.
+ *
+ * Pushes a scope construct whose variable count starts at zero.
+ */
+void
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_scope *scope;
+
+   scope = &lp_build_flow_push(flow, LP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
+
+   scope->num_variables = 0;
+}
+
+
+/**
+ * Declare a variable.
+ *
+ * A variable is a named entity which can have different LLVMValueRef's at
+ * different points of the program. This is relevant for control flow because
+ * when there are multiple branches to the same location we need to replace
+ * the variable's value with a Phi function as explained in
+ * http://en.wikipedia.org/wiki/Static_single_assignment_form .
+ *
+ * We keep track of variables by keeping around a pointer to the location
+ * that holds their current value.
+ *
+ * There are a few cautions to observe:
+ *
+ * - Variable's value must not be NULL. If there is no initial value then
+ *   LLVMGetUndef() should be used.
+ *
+ * - Variable's value must be kept up-to-date. If the variable is going to be
+ *   modified by a function then a pointer should be passed so that its value
+ *   is accurate. Failure to do this will cause some of the variables'
+ *   transient values to be lost, leading to wrong results.
+ *
+ * - A program should be written from top to bottom, by always appending
+ *   instructions to the bottom with a single LLVMBuilderRef. Inserting and/or
+ *   modifying existing statements will most likely lead to wrong results.
+ *
+ */
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable)
+{
+   struct lp_build_flow_scope *scope;
+
+   scope = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
+
+   assert(*variable);
+   if(!*variable)
+      return;
+
+   assert(flow->num_variables < LP_BUILD_FLOW_MAX_VARIABLES);
+   if(flow->num_variables >= LP_BUILD_FLOW_MAX_VARIABLES)
+      return;
+
+   flow->variables[flow->num_variables++] = variable;
+   ++scope->num_variables;
+}
+
+
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_scope *scope;
+
+   scope = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SCOPE)->scope;
+   if(!scope)
+      return;
+
+   assert(flow->num_variables >= scope->num_variables);
+   if(flow->num_variables < scope->num_variables) {
+      flow->num_variables = 0;
+      return;
+   }
+
+   flow->num_variables -= scope->num_variables;
+}
+
+
+/**
+ * Note: this function has no dependencies on the flow code and could
+ * be used elsewhere.
+ */
+static LLVMBasicBlockRef
+lp_build_insert_new_block(LLVMBuilderRef builder, const char *name)
+{
+   LLVMBasicBlockRef current_block;
+   LLVMBasicBlockRef next_block;
+   LLVMBasicBlockRef new_block;
+
+   /* get current basic block */
+   current_block = LLVMGetInsertBlock(builder);
+
+   /* check if there's another block after this one */
+   next_block = LLVMGetNextBasicBlock(current_block);
+   if (next_block) {
+      /* insert the new block before the next block */
+      new_block = LLVMInsertBasicBlock(next_block, name);
+   }
+   else {
+      /* append new block after current block */
+      LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
+      new_block = LLVMAppendBasicBlock(function, name);
+   }
+
+   return new_block;
+}
+
+
+static LLVMBasicBlockRef
+lp_build_flow_insert_block(struct lp_build_flow_context *flow)
+{
+   return lp_build_insert_new_block(flow->builder, "");
+}
+
+
+/**
+ * Begin a "skip" block.  Inside this block we can test a condition and
+ * skip to the end of the block if the condition is true.
+ */
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_skip *skip;
+   LLVMBuilderRef builder;
+   unsigned i;
+
+   skip = &lp_build_flow_push(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
+
+   /* create the basic block that skips will branch to */
+   skip->block = lp_build_flow_insert_block(flow);
+
+   skip->num_variables = flow->num_variables;
+   if(!skip->num_variables) {
+      skip->phi = NULL;
+      return;
+   }
+
+   /* Allocate a Phi node for each variable in this skip scope */
+   skip->phi = MALLOC(skip->num_variables * sizeof *skip->phi);
+   if(!skip->phi) {
+      skip->num_variables = 0;
+      return;
+   }
+
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, skip->block);
+
+   /* create a Phi node for each variable */
+   for(i = 0; i < skip->num_variables; ++i)
+      skip->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
+
+   LLVMDisposeBuilder(builder);
+}
+
+
+/**
+ * Insert code to test a condition and branch to the end of the current
+ * skip block if the condition is true.
+ */
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond)
+{
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   LLVMBasicBlockRef new_block;
+   unsigned i;
+
+   skip = &lp_build_flow_peek(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
+
+   current_block = LLVMGetInsertBlock(flow->builder);
+
+   new_block = lp_build_flow_insert_block(flow);
+
+   /* for each variable, update the Phi node with a (variable, block) pair */
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+   }
+
+   /* if cond is true, goto skip->block, else goto new_block */
+   LLVMBuildCondBr(flow->builder, cond, skip->block, new_block);
+
+   LLVMPositionBuilderAtEnd(flow->builder, new_block);
+}
+
+
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow)
+{
+   struct lp_build_flow_skip *skip;
+   LLVMBasicBlockRef current_block;
+   unsigned i;
+
+   skip = &lp_build_flow_pop(flow, LP_BUILD_FLOW_SKIP)->skip;
+   if(!skip)
+      return;
+
+   current_block = LLVMGetInsertBlock(flow->builder);
+
+   /* add (variable, block) tuples to the phi nodes */
+   for(i = 0; i < skip->num_variables; ++i) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(skip->phi[i], flow->variables[i], &current_block, 1);
+      *flow->variables[i] = skip->phi[i];
+   }
+
+   /* goto block */
+   LLVMBuildBr(flow->builder, skip->block);
+   LLVMPositionBuilderAtEnd(flow->builder, skip->block);
+
+   FREE(skip->phi);
+}
+
+/**
+ * Emit a test of the mask against zero and, when the mask is all zeros,
+ * break out of the enclosing skip block (see lp_build_flow_skip_cond_break).
+ */
+static void
+lp_build_mask_check(struct lp_build_mask_context *mask)
+{
+   LLVMBuilderRef builder = mask->flow->builder;
+   LLVMValueRef cond;
+
+   /* cond = (mask == 0), comparing the mask bitcast to the integer reg_type */
+   cond = LLVMBuildICmp(builder,
+                        LLVMIntEQ,
+                        LLVMBuildBitCast(builder, mask->value, mask->reg_type, ""),
+                        LLVMConstNull(mask->reg_type),
+                        "");
+
+   /* if cond, goto end of block */
+   lp_build_flow_skip_cond_break(mask->flow, cond);
+}
+
+
+/**
+ * Begin a section of code which is predicated on a mask.
+ * \param mask  the mask context, initialized here
+ * \param flow  the flow context
+ * \param type  the type of the mask
+ * \param value  initial value of the mask
+ */
+void
+lp_build_mask_begin(struct lp_build_mask_context *mask,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
+                    LLVMValueRef value)
+{
+   memset(mask, 0, sizeof *mask);
+   /* reg_type: a single integer of type.width * type.length bits, used to test the mask as a whole */
+   mask->flow = flow;
+   mask->reg_type = LLVMIntType(type.width * type.length);
+   mask->value = value;
+   /* declare mask->value in a fresh flow scope, then open the skip block */
+   lp_build_flow_scope_begin(flow);
+   lp_build_flow_scope_declare(flow, &mask->value);
+   lp_build_flow_skip_begin(flow);
+   /* emit the first zero test, against the initial mask */
+   lp_build_mask_check(mask);
+}
+
+
+/**
+ * Update boolean mask with given value (bitwise AND), then re-test it for zero.
+ * Typically used to update the quad's pixel alive/killed mask
+ * after depth testing, alpha testing, TGSI_OPCODE_KIL, etc.
+ */
+void
+lp_build_mask_update(struct lp_build_mask_context *mask,
+                     LLVMValueRef value)
+{
+   mask->value = LLVMBuildAnd( mask->flow->builder, mask->value, value, "");
+   /* the mask may just have become all zeros: emit the early-out test again */
+   lp_build_mask_check(mask);
+}
+
+/**
+ * End section of code which is predicated on a mask: closes the skip block
+ * and the flow scope opened by lp_build_mask_begin(), then returns mask->value.
+ */
+LLVMValueRef
+lp_build_mask_end(struct lp_build_mask_context *mask)
+{
+   lp_build_flow_skip_end(mask->flow);
+   lp_build_flow_scope_end(mask->flow);
+   return mask->value;
+}
+
+/** Start a loop: branch into a new "loop" block and create the counter phi,
+ *  seeded with `start` from the block that was current on entry. */
+void
+lp_build_loop_begin(LLVMBuilderRef builder,
+                    LLVMValueRef start,
+                    struct lp_build_loop_state *state)
+{
+   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+   LLVMValueRef function = LLVMGetBasicBlockParent(block);
+
+   state->block = LLVMAppendBasicBlock(function, "loop");
+   /* terminate the current block with a jump into the loop block */
+   LLVMBuildBr(builder, state->block);
+
+   LLVMPositionBuilderAtEnd(builder, state->block);
+   /* the counter is a phi; its second input is added by lp_build_loop_end() */
+   state->counter = LLVMBuildPhi(builder, LLVMTypeOf(start), "");
+
+   LLVMAddIncoming(state->counter, &start, &block, 1);
+
+}
+
+/** Close a loop begun with lp_build_loop_begin(): advance the counter and emit the exit test. */
+void
+lp_build_loop_end(LLVMBuilderRef builder,
+                  LLVMValueRef end,
+                  LLVMValueRef step,
+                  struct lp_build_loop_state *state)
+{
+   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+   LLVMValueRef function = LLVMGetBasicBlockParent(block);
+   LLVMValueRef next;
+   LLVMValueRef cond;
+   LLVMBasicBlockRef after_block;
+   /* a NULL step means "advance by one" (constant of the same type as end) */
+   if (!step)
+      step = LLVMConstInt(LLVMTypeOf(end), 1, 0);
+
+   next = LLVMBuildAdd(builder, state->counter, step, "");
+
+   cond = LLVMBuildICmp(builder, LLVMIntNE, next, end, "");
+
+   after_block = LLVMAppendBasicBlock(function, "");
+   /* NOTE(review): goes to after_block when next != end, back to the loop otherwise -- confirm */
+   LLVMBuildCondBr(builder, cond, after_block, state->block);
+   /* second input of the counter phi: the advanced value, coming from this block */
+   LLVMAddIncoming(state->counter, &next, &block, 1);
+   /* code following the loop is emitted into after_block */
+   LLVMPositionBuilderAtEnd(builder, after_block);
+}
+
+
+
+/*
+  Example of if/then/else building:
+
+     int x;
+     if (cond) {
+        x = 1 + 2;
+     }
+     else {
+        x = 2 + 3;
+     }
+
+  Is built with:
+
+     LLVMValueRef x = LLVMGetUndef();  // or something else
+
+     flow = lp_build_flow_create(builder);
+
+        lp_build_flow_scope_begin(flow);
+
+           // x needs a phi node
+           lp_build_flow_scope_declare(flow, &x);
+
+           lp_build_if(ctx, flow, builder, cond);
+              x = LLVMAdd(1, 2);
+           lp_build_else(ctx);
+              x = LLVMAdd(2, 3);
+           lp_build_endif(ctx);
+
+        lp_build_flow_scope_end(flow);
+
+     lp_build_flow_destroy(flow);
+ */
+
+/**
+ * Begin an if/else/endif construct: pushes an LP_BUILD_FLOW_IF entry, creates
+ * the merge block with one phi per flow variable, and leaves the builder in
+ * the new true block.  The branch instructions are added by lp_build_endif().
+ */
+void
+lp_build_if(struct lp_build_if_state *ctx,
+            struct lp_build_flow_context *flow,
+            LLVMBuilderRef builder,
+            LLVMValueRef condition)
+{
+   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+   struct lp_build_flow_if *ifthen;
+   unsigned i;
+
+   memset(ctx, 0, sizeof(*ctx));
+   ctx->builder = builder;
+   ctx->flow = flow;
+
+   /* push/create new scope */
+   ifthen = &lp_build_flow_push(flow, LP_BUILD_FLOW_IF)->ifthen;
+   assert(ifthen);
+
+   ifthen->num_variables = flow->num_variables;
+   ifthen->condition = condition;
+   ifthen->entry_block = block;
+
+   /* One phi slot per variable.  A zero-byte request (no variables) may
+    * yield NULL from a malloc-style MALLOC, so NULL alone is not a failure. */
+   ifthen->phi = MALLOC(ifthen->num_variables * sizeof(*ifthen->phi));
+   if (!ifthen->phi && ifthen->num_variables > 0) {
+      ifthen->num_variables = 0;
+      return;
+   }
+
+   /* create endif/merge basic block for the phi functions */
+   ifthen->merge_block = lp_build_insert_new_block(builder, "endif-block");
+   LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
+
+   /* create a phi node for each variable */
+   for (i = 0; i < flow->num_variables; i++) {
+      ifthen->phi[i] = LLVMBuildPhi(builder, LLVMTypeOf(*flow->variables[i]), "");
+      /* add the initial value of the var from the entry block */
+      LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->entry_block, 1);
+   }
+
+   /* create/insert true_block before merge_block */
+   ifthen->true_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-true-block");
+
+   /* successive code goes into the true block */
+   LLVMPositionBuilderAtEnd(builder, ifthen->true_block);
+}
+
+/**
+ * Begin the else-part of a conditional: the variables' current values become
+ * phi inputs tagged with true_block, then building moves to a new false block.
+ */
+void
+lp_build_else(struct lp_build_if_state *ctx)
+{
+   struct lp_build_flow_context *flow = ctx->flow;
+   struct lp_build_flow_if *ifthen;
+   unsigned i;
+
+   ifthen = &lp_build_flow_peek(flow, LP_BUILD_FLOW_IF)->ifthen;
+   assert(ifthen);
+
+   /* for each variable, add its value at the end of the then-part as the phi input from true_block */
+   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->merge_block);
+   for (i = 0; i < flow->num_variables; i++) {
+      assert(*flow->variables[i]);
+      LLVMAddIncoming(ifthen->phi[i], flow->variables[i], &ifthen->true_block, 1);
+   }
+
+   /* create/insert false_block before the merge block */
+   ifthen->false_block = LLVMInsertBasicBlock(ifthen->merge_block, "if-false-block");
+
+   /* successive code goes into the else block */
+   LLVMPositionBuilderAtEnd(ctx->builder, ifthen->false_block);
+}
+
+
+/**
+ * Finish an if/else/endif construct.
+ *
+ * Pops the LP_BUILD_FLOW_IF entry, feeds the phi nodes of the merge block
+ * with the variable values from the last branch built (the else-part when
+ * there is one, the then-part otherwise), rebinds every flow variable to
+ * its phi, and finally emits the branch instructions that were left out
+ * while the branches were being built.
+ */
+void
+lp_build_endif(struct lp_build_if_state *ctx)
+{
+   LLVMBuilderRef builder = ctx->builder;
+   struct lp_build_flow_context *flow = ctx->flow;
+   struct lp_build_flow_if *ifthen;
+   LLVMBasicBlockRef *last_block;
+   LLVMBasicBlockRef otherwise;
+   unsigned var;
+
+   ifthen = &lp_build_flow_pop(flow, LP_BUILD_FLOW_IF)->ifthen;
+   assert(ifthen);
+
+   if (ifthen->false_block) {
+      /* we have an else clause: it was built last, and it is where the
+       * entry block goes when the condition is false */
+      last_block = &ifthen->false_block;
+      otherwise = ifthen->false_block;
+   }
+   else {
+      /* no else clause: a false condition goes straight to the merge block */
+      last_block = &ifthen->true_block;
+      otherwise = ifthen->merge_block;
+   }
+
+   LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
+   for (var = 0; var < flow->num_variables; var++) {
+      LLVMValueRef *slot = flow->variables[var];
+      assert(*slot);
+      /* update the Phi node with a (variable, block) pair */
+      LLVMAddIncoming(ifthen->phi[var], slot, last_block, 1);
+
+      /* replace the variable ref with the phi function */
+      *slot = ifthen->phi[var];
+   }
+
+   FREE(ifthen->phi);
+
+   /*
+    * Now patch in the various branch instructions.
+    */
+
+   /* conditional branch at the end of entry_block */
+   LLVMPositionBuilderAtEnd(builder, ifthen->entry_block);
+   LLVMBuildCondBr(builder, ifthen->condition, ifthen->true_block, otherwise);
+
+   /* unconditional branch from the true block to the merge block */
+   LLVMPositionBuilderAtEnd(builder, ifthen->true_block);
+   LLVMBuildBr(builder, ifthen->merge_block);
+
+   if (ifthen->false_block) {
+      /* ... and likewise from the false block */
+      LLVMPositionBuilderAtEnd(builder, ifthen->false_block);
+      LLVMBuildBr(builder, ifthen->merge_block);
+   }
+
+   /* resume building code at the end of the merge block */
+   LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
new file mode 100644
index 00000000000..4c225a0d4f9
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h
@@ -0,0 +1,151 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * LLVM control flow build helpers.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#ifndef LP_BLD_FLOW_H
+#define LP_BLD_FLOW_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;
+
+
+struct lp_build_flow_context;
+
+
+struct lp_build_flow_context *
+lp_build_flow_create(LLVMBuilderRef builder);
+
+void
+lp_build_flow_destroy(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_scope_declare(struct lp_build_flow_context *flow,
+                            LLVMValueRef *variable);
+
+void
+lp_build_flow_scope_end(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_begin(struct lp_build_flow_context *flow);
+
+void
+lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow,
+                              LLVMValueRef cond);
+
+void
+lp_build_flow_skip_end(struct lp_build_flow_context *flow);
+
+/** State of a mask-predicated code section; filled in by lp_build_mask_begin(). */
+struct lp_build_mask_context
+{
+   struct lp_build_flow_context *flow;
+   /** integer type of type.width * type.length bits, used to compare the mask with zero */
+   LLVMTypeRef reg_type;
+   /** current value of the mask */
+   LLVMValueRef value;
+};
+
+
+void
+lp_build_mask_begin(struct lp_build_mask_context *mask,
+                    struct lp_build_flow_context *flow,
+                    struct lp_type type,
+                    LLVMValueRef value);
+
+/**
+ * Bitwise AND the mask with the given value, if a previous mask was set.
+ */
+void
+lp_build_mask_update(struct lp_build_mask_context *mask,
+                     LLVMValueRef value);
+
+LLVMValueRef
+lp_build_mask_end(struct lp_build_mask_context *mask);
+
+
+/**
+ * LLVM's IR doesn't represent for-loops directly: building one means
+ * creating code blocks, branches and a phi node for the counter, which
+ * takes a fair amount of code.  This is the state shared by
+ * lp_build_loop_begin() and lp_build_loop_end().
+ * @sa http://www.llvm.org/docs/tutorial/LangImpl5.html#for
+ */
+struct lp_build_loop_state
+{
+  LLVMBasicBlockRef block;
+  LLVMValueRef counter;
+};
+
+
+void
+lp_build_loop_begin(LLVMBuilderRef builder,
+                    LLVMValueRef start,
+                    struct lp_build_loop_state *state);
+
+
+void
+lp_build_loop_end(LLVMBuilderRef builder,
+                  LLVMValueRef end,
+                  LLVMValueRef step,
+                  struct lp_build_loop_state *state);
+
+
+
+/** Filled in by lp_build_if(); the same object is then given to lp_build_else() and lp_build_endif(). */
+struct lp_build_if_state
+{
+   LLVMBuilderRef builder;
+   struct lp_build_flow_context *flow;
+};
+
+
+void
+lp_build_if(struct lp_build_if_state *ctx,
+            struct lp_build_flow_context *flow,
+            LLVMBuilderRef builder,
+            LLVMValueRef condition);
+
+void
+lp_build_else(struct lp_build_if_state *ctx);
+
+void
+lp_build_endif(struct lp_build_if_state *ctx);
+              
+
+
+#endif /* !LP_BLD_FLOW_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
new file mode 100644
index 00000000000..970bee379f5
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -0,0 +1,83 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef LP_BLD_FORMAT_H
+#define LP_BLD_FORMAT_H
+
+
+/**
+ * @file
+ * Pixel format helpers.
+ */
+
+#include <llvm-c/Core.h>  
+
+#include "pipe/p_format.h"
+
+struct util_format_description;
+struct lp_type;
+
+
+boolean
+lp_format_is_rgba8(const struct util_format_description *desc);
+
+
+void
+lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
+                            struct lp_type type,
+                            const LLVMValueRef *unswizzled,
+                            LLVMValueRef *swizzled);
+
+
+LLVMValueRef
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         const struct util_format_description *desc,
+                         LLVMValueRef packed);
+
+
+LLVMValueRef
+lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
+                          const struct util_format_description *desc,
+                          struct lp_type type,
+                          LLVMValueRef packed);
+
+
+LLVMValueRef
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       const struct util_format_description *desc,
+                       LLVMValueRef rgba);
+
+
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba);
+
+
+#endif /* !LP_BLD_FORMAT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
new file mode 100644
index 00000000000..a07f7418f2c
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -0,0 +1,383 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * AoS pixel format manipulation.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_cpu_detect.h"
+#include "util/u_format.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_format.h"
+
+
+/**
+ * Unpack a single pixel into its RGBA components.
+ *
+ * @param desc    plain format with one pixel per block, at most 32 bits wide.
+ * @param packed  the pixel as an integer; zero-extended here when narrower than 32 bits.
+ * @return RGBA in a 4 floats vector.
+ *
+ * XXX: This is mostly for reference and testing -- operating a single pixel at
+ * a time is rarely if ever needed.
+ */
+LLVMValueRef
+lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
+                         const struct util_format_description *desc,
+                         LLVMValueRef packed)
+{
+   LLVMValueRef shifted, casted, scaled, masked;
+   LLVMValueRef shifts[4];
+   LLVMValueRef masks[4];
+   LLVMValueRef scales[4];
+   LLVMValueRef swizzles[4];
+   LLVMValueRef aux[4];
+   bool normalized;
+   int empty_channel;
+   unsigned shift;
+   unsigned i;
+
+   /* FIXME: Support more formats */
+   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
+   assert(desc->block.width == 1);
+   assert(desc->block.height == 1);
+   assert(desc->block.bits <= 32);
+
+   /* Do the intermediate integer computations with 32bit integers since it
+    * matches floating point size */
+   if (desc->block.bits < 32)
+      packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), "");
+
+   /* Broadcast the packed value to all four channels */
+   packed = LLVMBuildInsertElement(builder,
+                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
+                                   packed,
+                                   LLVMConstNull(LLVMInt32Type()),
+                                   "");
+   packed = LLVMBuildShuffleVector(builder,
+                                   packed,
+                                   LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)),
+                                   LLVMConstNull(LLVMVectorType(LLVMInt32Type(), 4)),
+                                   "");
+
+   /* Initialize vector constants */
+   normalized = FALSE;
+   empty_channel = -1;
+   shift = 0;
+   for (i = 0; i < 4; ++i) {
+      unsigned bits = desc->channel[i].size;
+
+      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
+         shifts[i] = LLVMGetUndef(LLVMInt32Type());
+         masks[i] = LLVMConstNull(LLVMInt32Type());
+         scales[i] =  LLVMConstNull(LLVMFloatType());
+         empty_channel = i;
+      }
+      else {
+         unsigned mask;
+
+         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
+         assert(bits < 32);
+         mask = (1u << bits) - 1; /* unsigned: bits == 31 must not overflow int */
+
+         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
+         masks[i] = LLVMConstInt(LLVMInt32Type(), mask, 0);
+
+         if (desc->channel[i].normalized) {
+            scales[i] = LLVMConstReal(LLVMFloatType(), 1.0/mask);
+            normalized = TRUE;
+         }
+         else
+            scales[i] =  LLVMConstReal(LLVMFloatType(), 1.0);
+      }
+
+      shift += bits;
+   }
+
+   shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
+   masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
+   /* UIToFP can't be expressed in SSE2 */
+   casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
+
+   if (normalized)
+      scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), "");
+   else
+      scaled = casted;
+
+   for (i = 0; i < 4; ++i)
+      aux[i] = LLVMGetUndef(LLVMFloatType());
+
+   /* shuffle indices 0-3 pick from scaled, index 4 picks aux[0] (the constant one) */
+   for (i = 0; i < 4; ++i) {
+      enum util_format_swizzle swizzle = desc->swizzle[i];
+
+      switch (swizzle) {
+      case UTIL_FORMAT_SWIZZLE_X:
+      case UTIL_FORMAT_SWIZZLE_Y:
+      case UTIL_FORMAT_SWIZZLE_Z:
+      case UTIL_FORMAT_SWIZZLE_W:
+         swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0);
+         break;
+      case UTIL_FORMAT_SWIZZLE_0:
+         assert(empty_channel >= 0);
+         swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0);
+         break;
+      case UTIL_FORMAT_SWIZZLE_1:
+         swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0);
+         aux[0] = LLVMConstReal(LLVMFloatType(), 1.0);
+         break;
+      case UTIL_FORMAT_SWIZZLE_NONE:
+      default:
+         swizzles[i] = LLVMGetUndef(LLVMInt32Type());
+         assert(0);
+         break;
+      }
+   }
+
+   return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4), LLVMConstVector(swizzles, 4), "");
+}
+
+
+/**
+ * Take a vector with packed pixels and unpack into a rgba8 vector.
+ *
+ * Formats with bit depth smaller than 32bits are accepted, but they must be
+ * padded to 32bits.  Only formats whose channels are all 8 bits are implemented.
+ */
+LLVMValueRef
+lp_build_unpack_rgba8_aos(LLVMBuilderRef builder,
+                          const struct util_format_description *desc,
+                          struct lp_type type,
+                          LLVMValueRef packed)
+{
+   struct lp_build_context bld;
+   bool rgba8;
+   LLVMValueRef res;
+   unsigned i;
+
+   lp_build_context_init(&bld, builder, type);
+
+   /* FIXME: Support more formats */
+   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
+   assert(desc->block.width == 1);
+   assert(desc->block.height == 1);
+   assert(desc->block.bits <= 32);
+   /* the result type must be 8-bit normalized integers, in multiples of four */
+   assert(!type.floating);
+   assert(!type.fixed);
+   assert(type.norm);
+   assert(type.width == 8);
+   assert(type.length % 4 == 0);
+   /* every channel that is present (non-zero size) must be exactly one byte */
+   rgba8 = TRUE;
+   for(i = 0; i < 4; ++i) {
+      assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
+             desc->channel[i].type == UTIL_FORMAT_TYPE_VOID);
+      if(desc->channel[i].size != 0 && desc->channel[i].size != 8)
+         rgba8 = FALSE;
+   }
+
+   if(rgba8) {
+      /*
+       * The pixel is already in a rgba8 format variant. All it is necessary
+       * is to swizzle the channels.
+       */
+
+      unsigned char swizzles[4];
+      boolean zeros[4]; /* bitwise AND mask */
+      boolean ones[4]; /* bitwise OR mask */
+      boolean swizzles_needed = FALSE;
+      boolean zeros_needed = FALSE;
+      boolean ones_needed = FALSE;
+
+      for(i = 0; i < 4; ++i) {
+         enum util_format_swizzle swizzle = desc->swizzle[i];
+
+         /* Initialize with the no-op case */
+         swizzles[i] = util_cpu_caps.little_endian ? 3 - i : i;
+         zeros[i] = TRUE;
+         ones[i] = FALSE;
+
+         switch (swizzle) {
+         case UTIL_FORMAT_SWIZZLE_X:
+         case UTIL_FORMAT_SWIZZLE_Y:
+         case UTIL_FORMAT_SWIZZLE_Z:
+         case UTIL_FORMAT_SWIZZLE_W:
+            if(swizzle != swizzles[i]) {
+               swizzles[i] = swizzle;
+               swizzles_needed = TRUE;
+            }
+            break;
+         case UTIL_FORMAT_SWIZZLE_0:
+            zeros[i] = FALSE;
+            zeros_needed = TRUE;
+            break;
+         case UTIL_FORMAT_SWIZZLE_1:
+            ones[i] = TRUE;
+            ones_needed = TRUE;
+            break;
+         case UTIL_FORMAT_SWIZZLE_NONE:
+            assert(0);
+            break;
+         }
+      }
+
+      res = packed;
+
+      if(swizzles_needed)
+         res = lp_build_swizzle1_aos(&bld, res, swizzles);
+
+      if(zeros_needed) {
+         /* Mask out zero channels */
+         LLVMValueRef mask = lp_build_const_mask_aos(type, zeros);
+         res = LLVMBuildAnd(builder, res, mask, "");
+      }
+
+      if(ones_needed) {
+         /* Or one channels */
+         LLVMValueRef mask = lp_build_const_mask_aos(type, ones);
+         res = LLVMBuildOr(builder, res, mask, "");
+      }
+   }
+   else {
+      /* FIXME */
+      assert(0);
+      res = lp_build_undef(type);
+   }
+
+   return res;
+}
+
+
+/**
+ * Pack a single pixel.
+ * @param desc plain, one-pixel-per-block format; channels are unsigned or void.
+ * @param rgba 4 float vector with the unpacked components.
+ * @return the packed pixel, truncated to desc->block.bits bits when below 32.
+ * XXX: This is mostly for reference and testing -- operating a single pixel at
+ * a time is rarely if ever needed.
+ */
+LLVMValueRef
+lp_build_pack_rgba_aos(LLVMBuilderRef builder,
+                       const struct util_format_description *desc,
+                       LLVMValueRef rgba)
+{
+   LLVMTypeRef type;
+   LLVMValueRef packed = NULL;
+   LLVMValueRef swizzles[4];
+   LLVMValueRef shifted, casted, scaled, unswizzled;
+   LLVMValueRef shifts[4];
+   LLVMValueRef scales[4];
+   bool normalized;
+   unsigned shift;
+   unsigned i, j;
+
+   assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
+   assert(desc->block.width == 1);
+   assert(desc->block.height == 1);
+
+   type = LLVMIntType(desc->block.bits);
+
+   /* Unswizzle the color components into the source vector. */
+   for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j) {
+         if (desc->swizzle[j] == i)
+            break;
+      }
+      if (j < 4)
+         swizzles[i] = LLVMConstInt(LLVMInt32Type(), j, 0);
+      else
+         swizzles[i] = LLVMGetUndef(LLVMInt32Type());
+   }
+
+   unswizzled = LLVMBuildShuffleVector(builder, rgba,
+                                       LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4)),
+                                       LLVMConstVector(swizzles, 4), "");
+
+   normalized = FALSE;
+   shift = 0;
+   for (i = 0; i < 4; ++i) {
+      unsigned bits = desc->channel[i].size;
+
+      if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID) {
+         shifts[i] = LLVMGetUndef(LLVMInt32Type());
+         scales[i] =  LLVMGetUndef(LLVMFloatType());
+      }
+      else {
+         unsigned mask;
+
+         assert(desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED);
+         assert(bits < 32);
+         mask = (1u << bits) - 1; /* unsigned: bits == 31 must not overflow int */
+
+         shifts[i] = LLVMConstInt(LLVMInt32Type(), shift, 0);
+
+         if (desc->channel[i].normalized) {
+            scales[i] = LLVMConstReal(LLVMFloatType(), mask);
+            normalized = TRUE;
+         }
+         else
+            scales[i] =  LLVMConstReal(LLVMFloatType(), 1.0);
+      }
+
+      shift += bits;
+   }
+
+   if (normalized)
+      scaled = LLVMBuildMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
+   else
+      scaled = unswizzled;
+
+   casted = LLVMBuildFPToSI(builder, scaled, LLVMVectorType(LLVMInt32Type(), 4), "");
+
+   shifted = LLVMBuildShl(builder, casted, LLVMConstVector(shifts, 4), "");
+
+   /* Bitwise or all components */
+   for (i = 0; i < 4; ++i) {
+      if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
+         LLVMValueRef component = LLVMBuildExtractElement(builder, shifted, LLVMConstInt(LLVMInt32Type(), i, 0), "");
+         if (packed)
+            packed = LLVMBuildOr(builder, packed, component, "");
+         else
+            packed = component;
+      }
+   }
+
+   if (!packed)
+      packed = LLVMGetUndef(LLVMInt32Type());
+
+   if (desc->block.bits < 32)
+      packed = LLVMBuildTrunc(builder, packed, type, "");
+
+   return packed;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_query.c b/src/gallium/auxiliary/gallivm/lp_bld_format_query.c
new file mode 100644
index 00000000000..f3832d07ff9
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_query.c
@@ -0,0 +1,72 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Utility functions to make assertions about formats.
+ *
+ * This module centralizes most of logic used when determining what algorithm
+ * is most suitable (i.e., most efficient yet correct) for a given format.
+ *
+ * It might be possible to move some of these functions to u_format module,
+ * but since tiny differences in the format may render it more/less
+ * appropriate to a given algorithm it is impossible to make any long term
+ * guarantee about the semantics of these functions.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_format.h"
+
+
+/**
+ * Whether the format packs each pixel as four 8-bit channels in a 32-bit block.
+ */
+boolean
+lp_format_is_rgba8(const struct util_format_description *desc)
+{
+   unsigned c;
+
+   /* exactly one pixel per block, and that block is 32 bits wide */
+   if(!(desc->block.width == 1 && desc->block.height == 1 &&
+        desc->block.bits == 32))
+      return FALSE;
+
+   for(c = 0; c < 4; ++c) {
+      const unsigned kind = desc->channel[c].type;
+      /* every channel: 8 bits wide, and unsigned, signed or void */
+      if(desc->channel[c].size != 8 ||
+         !(kind == UTIL_FORMAT_TYPE_UNSIGNED ||
+           kind == UTIL_FORMAT_TYPE_SIGNED || kind == UTIL_FORMAT_TYPE_VOID))
+         return FALSE;
+   }
+
+   return TRUE;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
new file mode 100644
index 00000000000..abb27e4c328
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -0,0 +1,147 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_format.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_format.h"
+
+
+/**
+ * Select the value of one destination channel according to a swizzle.
+ *
+ * X/Y/Z/W pick the matching entry of @p unswizzled, 0 and 1 produce the
+ * corresponding constants of @p type, and NONE produces an undefined value.
+ * Any other swizzle asserts and also yields an undefined value.
+ */
+static LLVMValueRef
+lp_build_format_swizzle_chan_soa(struct lp_type type,
+                                 const LLVMValueRef *unswizzled,
+                                 enum util_format_swizzle swizzle)
+{
+   LLVMValueRef result;
+
+   switch (swizzle) {
+   case UTIL_FORMAT_SWIZZLE_X:
+   case UTIL_FORMAT_SWIZZLE_Y:
+   case UTIL_FORMAT_SWIZZLE_Z:
+   case UTIL_FORMAT_SWIZZLE_W:
+      /* The swizzle value doubles as the index of the source channel. */
+      result = unswizzled[swizzle];
+      break;
+   case UTIL_FORMAT_SWIZZLE_0:
+      result = lp_build_zero(type);
+      break;
+   case UTIL_FORMAT_SWIZZLE_1:
+      result = lp_build_one(type);
+      break;
+   case UTIL_FORMAT_SWIZZLE_NONE:
+      result = lp_build_undef(type);
+      break;
+   default:
+      /* Unexpected swizzle: complain in debug builds, return undef anyway. */
+      assert(0);
+      result = lp_build_undef(type);
+      break;
+   }
+
+   return result;
+}
+
+
+/**
+ * Apply the swizzle of @p format_desc to four unswizzled channel values.
+ *
+ * For formats in the ZS colorspace the value selected by the first swizzle
+ * entry is replicated into the first three outputs and the fourth output is
+ * set to one. For every other colorspace each output channel is selected
+ * independently by its own swizzle entry.
+ *
+ * @param format_desc  description of the format whose swizzle is applied
+ * @param type         type of the channel values
+ * @param unswizzled   the four input channel values
+ * @param swizzled     receives the four output channel values
+ */
+void
+lp_build_format_swizzle_soa(const struct util_format_description *format_desc,
+                            struct lp_type type,
+                            const LLVMValueRef *unswizzled,
+                            LLVMValueRef *swizzled)
+{
+   enum util_format_swizzle sel;
+   unsigned i;
+
+   if(format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) {
+      for (i = 0; i < 4; i++) {
+         sel = format_desc->swizzle[i];
+         swizzled[i] = lp_build_format_swizzle_chan_soa(type, unswizzled, sel);
+      }
+      return;
+   }
+
+   /* Depth/stencil: broadcast the depth value to x, y and z; w is one. */
+   sel = format_desc->swizzle[0];
+   {
+      LLVMValueRef z = lp_build_format_swizzle_chan_soa(type, unswizzled, sel);
+      swizzled[0] = z;
+      swizzled[1] = z;
+      swizzled[2] = z;
+   }
+   swizzled[3] = lp_build_one(type);
+}
+
+
+/**
+ * Unpack the channels of a vector of packed pixels into SoA vectors.
+ *
+ * Channels are laid out from the least significant bit upwards: channel 0
+ * starts at bit 0 and every following channel starts where the previous one
+ * stopped. Each unsigned channel is isolated with a logical shift right and
+ * a mask, converted to floating point (rescaled when the channel is
+ * normalized), and the four results are finally run through the format
+ * swizzle.
+ *
+ * Only plain, one-pixel-per-block formats of at most 32 bits are accepted,
+ * and only unsigned channels with a floating point @p type are implemented;
+ * other channel types produce undefined values.
+ *
+ * @param builder      LLVM builder used to emit the instructions
+ * @param format_desc  description of the packed format
+ * @param type         type of the unpacked vectors
+ * @param packed       vector holding the packed pixels
+ * @param rgba         receives the four swizzled channel vectors
+ */
+void
+lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
+                         const struct util_format_description *format_desc,
+                         struct lp_type type,
+                         LLVMValueRef packed,
+                         LLVMValueRef *rgba)
+{
+   LLVMValueRef inputs[4];
+   unsigned start;
+   unsigned chan;
+
+   /* FIXME: Support more formats */
+   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
+   assert(format_desc->block.width == 1);
+   assert(format_desc->block.height == 1);
+   assert(format_desc->block.bits <= 32);
+
+   /* Decode the input vector components */
+   start = 0;
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned width = format_desc->channel[chan].size;
+      unsigned stop = start + width;
+      LLVMValueRef input;
+
+      input = packed;
+
+      switch(format_desc->channel[chan].type) {
+      case UTIL_FORMAT_TYPE_VOID:
+         input = NULL;
+         break;
+
+      case UTIL_FORMAT_TYPE_UNSIGNED:
+         if(type.floating) {
+            /* Move the channel down to bit 0 ... */
+            if(start)
+               input = LLVMBuildLShr(builder, input, lp_build_int_const_scalar(type, start), "");
+            /* ... and drop the bits of any channel stored above it. */
+            if(stop < format_desc->block.bits) {
+               unsigned mask = ((unsigned long long)1 << width) - 1;
+               input = LLVMBuildAnd(builder, input, lp_build_int_const_scalar(type, mask), "");
+            }
+
+            if(format_desc->channel[chan].normalized)
+               input = lp_build_unsigned_norm_to_float(builder, width, type, input);
+            else
+               /*
+                * The value is an unsigned integer at this point, so it must
+                * be converted integer -> float (not float -> integer).
+                */
+               input = LLVMBuildUIToFP(builder, input, lp_build_vec_type(type), "");
+         }
+         else {
+            /* FIXME */
+            assert(0);
+            input = lp_build_undef(type);
+         }
+         break;
+
+      default:
+         /* Channel type not handled yet */
+         input = lp_build_undef(type);
+         break;
+      }
+
+      inputs[chan] = input;
+
+      start = stop;
+   }
+
+   lp_build_format_swizzle_soa(format_desc, type, inputs, rgba);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.cpp b/src/gallium/auxiliary/gallivm/lp_bld_init.cpp
new file mode 100644
index 00000000000..067397a520b
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.cpp
@@ -0,0 +1,69 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include <llvm/Config/config.h>
+#include <llvm/Target/TargetSelect.h>
+#include <llvm/Target/TargetOptions.h>
+
+#include "pipe/p_config.h"
+
+#include "lp_bld_init.h"
+
+
+extern "C" void LLVMLinkInJIT();
+
+
+/**
+ * Perform the LLVM initialization needed before generating code.
+ *
+ * On 32-bit x86 Windows builds the LLVM stack alignment options are
+ * overridden first; then the native target is initialized and
+ * LLVMLinkInJIT() is called.
+ */
+extern "C" void
+lp_build_init(void)
+{
+#if defined(PIPE_OS_WINDOWS) && defined(PIPE_ARCH_X86)
+   /*
+    * This is mis-detected on some hardware / software combinations.
+    */
+   llvm::StackAlignment = 4;
+   llvm::RealignStack = true;
+#endif
+
+   /* Same as LLVMInitializeNativeTarget(); */
+   llvm::InitializeNativeTarget();
+
+   LLVMLinkInJIT();
+}
+
+
+/*
+ * Hack to allow the linking of release LLVM static libraries on a debug build.
+ *
+ * For MSVC debug builds this supplies an empty definition of
+ * _invalid_parameter_noinfo(); presumably the release-built libraries
+ * reference that symbol and it is otherwise unresolved -- see the link below.
+ *
+ * See also:
+ * - http://social.msdn.microsoft.com/Forums/en-US/vclanguage/thread/7234ea2b-0042-42ed-b4e2-5d8644dfb57d
+ */
+#if defined(_MSC_VER) && defined(_DEBUG)
+#include <crtdefs.h>
+extern "C" _CRTIMP void __cdecl
+_invalid_parameter_noinfo(void) {}
+#endif
diff --git a/src/gallium/auxiliary/util/u_format.c b/src/gallium/auxiliary/gallivm/lp_bld_init.h
index 98ea13b60b5..07f50d1c433 100644
--- a/src/gallium/auxiliary/util/u_format.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 Vmware, Inc.
+ * Copyright 2009 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -26,21 +26,22 @@
  **************************************************************************/
 
 
-#include "u_format.h"
+#ifndef LP_BLD_INIT_H
+#define LP_BLD_INIT_H
 
 
-const struct util_format_description *
-util_format_description(enum pipe_format format)
-{
-   const struct util_format_description *desc = util_format_description_table;
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-   while(TRUE) {
-      if(desc->format == format)
-         return desc;
 
-      if(desc->format == PIPE_FORMAT_NONE)
-         return NULL;
+void
+lp_build_init(void);
 
-      ++desc;
-   };
+
+#ifdef __cplusplus
 }
+#endif
+
+
+#endif /* !LP_BLD_INIT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_interp.c b/src/gallium/auxiliary/gallivm/lp_bld_interp.c
new file mode 100644
index 00000000000..2fc894017d8
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_interp.c
@@ -0,0 +1,408 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Position and shader input interpolation.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#include "pipe/p_shader_tokens.h"
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "tgsi/tgsi_parse.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_interp.h"
+
+
+/*
+ * The shader JIT function operates on blocks of quads.
+ * Each block has 2x2 quads and each quad has 2x2 pixels.
+ *
+ * We iterate over the quads in order 0, 1, 2, 3:
+ *
+ * #################
+ * #   |   #   |   #
+ * #---0---#---1---#
+ * #   |   #   |   #
+ * #################
+ * #   |   #   |   #
+ * #---2---#---3---#
+ * #   |   #   |   #
+ * #################
+ *
+ * Within each quad, we have four pixels which are represented in SOA
+ * order:
+ *
+ * #########
+ * # 0 | 1 #
+ * #---+---#
+ * # 2 | 3 #
+ * #########
+ *
+ * So the green channel (for example) of the four pixels is stored in
+ * a single vector register: {g0, g1, g2, g3}.
+ */
+
+
+/**
+ * Give an attribute value a readable name.
+ *
+ * Attribute 0 is named "pos.<chan><suffix>"; attribute N > 0 is named
+ * "input<N-1>.<chan><suffix>", where <chan> is one of x, y, z, w.
+ */
+static void
+attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
+{
+   if(attrib != 0) {
+      /* Slots above 0 are shader inputs, numbered from zero. */
+      lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
+      return;
+   }
+
+   lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
+}
+
+
+/**
+ * Initialize the bld->a0, dadx, dady fields.  This involves fetching
+ * those values from the arrays which are passed into the JIT function.
+ *
+ * For every attribute channel enabled in bld->mask, the scalar at index
+ * attrib*NUM_CHANNELS + chan is loaded from the relevant arrays and
+ * broadcast to a vector.  Constant attributes only get a0; linear and
+ * perspective attributes also get dadx and dady.  Coefficients that are not
+ * loaded are stored as NULL.
+ *
+ * @param bld       interpolation context; num_attribs, mask and mode must
+ *                  already be set
+ * @param a0_ptr    pointer to the array of a0 coefficients
+ * @param dadx_ptr  pointer to the array of dadx coefficients
+ * @param dady_ptr  pointer to the array of dady coefficients
+ */
+static void
+coeffs_init(struct lp_build_interp_soa_context *bld,
+            LLVMValueRef a0_ptr,
+            LLVMValueRef dadx_ptr,
+            LLVMValueRef dady_ptr)
+{
+   LLVMBuilderRef builder = bld->base.builder;
+   unsigned attrib;
+   unsigned chan;
+
+   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      unsigned mask = bld->mask[attrib];
+      unsigned mode = bld->mode[attrib];
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         if(mask & (1 << chan)) {
+            /* Same element index is used for all three coefficient arrays */
+            LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), attrib*NUM_CHANNELS + chan, 0);
+            LLVMValueRef a0 = NULL;
+            LLVMValueRef dadx = NULL;
+            LLVMValueRef dady = NULL;
+
+            switch( mode ) {
+            case TGSI_INTERPOLATE_PERSPECTIVE:
+               /* fall-through */
+
+            case TGSI_INTERPOLATE_LINEAR:
+               dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
+               dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
+               dadx = lp_build_broadcast_scalar(&bld->base, dadx);
+               dady = lp_build_broadcast_scalar(&bld->base, dady);
+               attrib_name(dadx, attrib, chan, ".dadx");
+               attrib_name(dady, attrib, chan, ".dady");
+               /* fall-through: the gradients are used together with a0 */
+
+            case TGSI_INTERPOLATE_CONSTANT:
+               a0 = LLVMBuildLoad(builder, LLVMBuildGEP(builder, a0_ptr, &index, 1, ""), "");
+               a0 = lp_build_broadcast_scalar(&bld->base, a0);
+               attrib_name(a0, attrib, chan, ".a0");
+               break;
+
+            default:
+               /* Unknown interpolation mode: all coefficients stay NULL */
+               assert(0);
+               break;
+            }
+
+            bld->a0  [attrib][chan] = a0;
+            bld->dadx[attrib][chan] = dadx;
+            bld->dady[attrib][chan] = dady;
+         }
+      }
+   }
+}
+
+
+/**
+ * Emit LLVM code to compute the fragment shader input attribute values.
+ * For example, for a color input, we'll compute red, green, blue and alpha
+ * values for the four pixels in a quad.
+ * Recall that we're operating on 4-element vectors so each arithmetic
+ * operation is operating on the four pixels in a quad.
+ *
+ * Each enabled channel is evaluated as a0 + x*dadx + y*dady (just a0 for
+ * constant attributes), where x and y are bld->pos[0] and bld->pos[1].
+ * That value is saved in bld->attribs_pre; perspective attributes are then
+ * multiplied by 1/w, with w taken from bld->pos[3], before being stored in
+ * bld->attribs.
+ */
+static void
+attribs_init(struct lp_build_interp_soa_context *bld)
+{
+   LLVMValueRef x = bld->pos[0];
+   LLVMValueRef y = bld->pos[1];
+   /* 1/w, emitted at most once, the first time a perspective channel needs it */
+   LLVMValueRef oow = NULL;
+   unsigned attrib;
+   unsigned chan;
+
+   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      unsigned mask = bld->mask[attrib];
+      unsigned mode = bld->mode[attrib];
+      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         if(mask & (1 << chan)) {
+            LLVMValueRef a0   = bld->a0  [attrib][chan];
+            LLVMValueRef dadx = bld->dadx[attrib][chan];
+            LLVMValueRef dady = bld->dady[attrib][chan];
+            LLVMValueRef res;
+
+            res = a0;
+
+            if (mode != TGSI_INTERPOLATE_CONSTANT) {
+               /* res = res + x * dadx */
+               res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, x, dadx));
+               /* res = res + y * dady */
+               res = lp_build_add(&bld->base, res, lp_build_mul(&bld->base, y, dady));
+            }
+
+            /* Keep the value of the attribute before perspective divide for faster updates */
+            bld->attribs_pre[attrib][chan] = res;
+
+            if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
+               /* w is read from the position slot, which must not itself be perspective */
+               LLVMValueRef w = bld->pos[3];
+               assert(attrib != 0);
+               if(!oow)
+                  oow = lp_build_rcp(&bld->base, w);
+               res = lp_build_mul(&bld->base, res, oow);
+            }
+
+            attrib_name(res, attrib, chan, "");
+
+            bld->attribs[attrib][chan] = res;
+         }
+      }
+   }
+}
+
+
+/**
+ * Increment the shader input attribute values.
+ * This is called when we move from one quad to the next.
+ *
+ * Constant attributes are left untouched.  For the others, each enabled
+ * channel is recomputed from bld->attribs_pre by adding dadx twice for
+ * quads 1 and 3 and dady twice for quads 2 and 3; perspective attributes
+ * are then multiplied by 1/w using the current bld->pos[3].
+ *
+ * @param bld         interpolation context
+ * @param quad_index  index of the quad to move to; must be less than 4
+ */
+static void
+attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
+{
+   /* 1/w, emitted at most once per call */
+   LLVMValueRef oow = NULL;
+   unsigned attrib;
+   unsigned chan;
+
+   assert(quad_index < 4);
+
+   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+      unsigned mask = bld->mask[attrib];
+      unsigned mode = bld->mode[attrib];
+
+      if (mode != TGSI_INTERPOLATE_CONSTANT) {
+         for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+            if(mask & (1 << chan)) {
+               LLVMValueRef dadx = bld->dadx[attrib][chan];
+               LLVMValueRef dady = bld->dady[attrib][chan];
+               LLVMValueRef res;
+
+               /* Start from the pre-divide value saved by attribs_init() */
+               res = bld->attribs_pre[attrib][chan];
+
+               if (quad_index == 1 || quad_index == 3) {
+                  /* top-right or bottom-right quad */
+                  /* build res = res + dadx + dadx */
+                  res = lp_build_add(&bld->base, res, dadx);
+                  res = lp_build_add(&bld->base, res, dadx);
+               }
+
+               if (quad_index == 2 || quad_index == 3) {
+                  /* bottom-left or bottom-right quad */
+                  /* build res = res + dady + dady */
+                  res = lp_build_add(&bld->base, res, dady);
+                  res = lp_build_add(&bld->base, res, dady);
+               }
+
+               /*
+                * attribs_pre is deliberately not written back (store left
+                * disabled below), so the offsets above are always relative
+                * to the value saved by attribs_init().
+                */
+               //XXX bld->attribs_pre[attrib][chan] = res;
+
+               if (mode == TGSI_INTERPOLATE_PERSPECTIVE) {
+                  LLVMValueRef w = bld->pos[3];
+                  assert(attrib != 0);
+                  if(!oow)
+                     oow = lp_build_rcp(&bld->base, w);
+                  res = lp_build_mul(&bld->base, res, oow);
+               }
+
+               attrib_name(res, attrib, chan, "");
+
+               bld->attribs[attrib][chan] = res;
+            }
+         }
+      }
+   }
+}
+
+
+/**
+ * Store the initial position vectors.
+ *
+ * @param bld  interpolation context
+ * @param x0   x coordinate value, stored as channel 0 of the position
+ * @param y0   y coordinate value, stored as channel 1 of the position
+ */
+static void
+pos_init(struct lp_build_interp_soa_context *bld,
+         LLVMValueRef x0,
+         LLVMValueRef y0)
+{
+   LLVMValueRef *position = bld->attribs[0];
+
+   lp_build_name(x0, "pos.x");
+   position[0] = x0;
+
+   lp_build_name(y0, "pos.y");
+   position[1] = y0;
+}
+
+
+/**
+ * Move the stored quad position to another quad of the block.
+ *
+ * The step is applied on top of the position currently held in
+ * bld->attribs[0], so the result depends on the position left there by the
+ * previous call.
+ *
+ * @param bld         interpolation context
+ * @param quad_index  index of the quad being moved to
+ */
+static void
+pos_update(struct lp_build_interp_soa_context *bld, int quad_index)
+{
+   const int xstep = 2, ystep = 2;
+   LLVMValueRef pos_x = bld->attribs[0][0];
+   LLVMValueRef pos_y = bld->attribs[0][1];
+
+   switch (quad_index) {
+   case 1:
+   case 3:
+      /* top-right or bottom-right quad in block: x += xstep */
+      pos_x = lp_build_add(&bld->base, pos_x,
+                           lp_build_const_scalar(bld->base.type, xstep));
+      break;
+
+   case 2:
+      /* bottom-left quad in block: y += ystep, then x -= xstep */
+      pos_y = lp_build_add(&bld->base, pos_y,
+                           lp_build_const_scalar(bld->base.type, ystep));
+      pos_x = lp_build_sub(&bld->base, pos_x,
+                           lp_build_const_scalar(bld->base.type, xstep));
+      break;
+
+   default:
+      /* any other index leaves the position unchanged */
+      break;
+   }
+
+   lp_build_name(pos_x, "pos.x");
+   lp_build_name(pos_y, "pos.y");
+
+   bld->attribs[0][0] = pos_x;
+   bld->attribs[0][1] = pos_y;
+}
+
+
+/**
+ * Initialize fragment shader input attribute info.
+ *
+ * Clears the context, registers the position as attribute 0 (linear, Z and
+ * W channels only), scans the shader tokens for input declarations to fill
+ * in the mask and interpolation mode of attributes 1 and up, and then emits
+ * the code that loads the coefficients and computes the initial values.
+ *
+ * @param bld       context to initialize
+ * @param tokens    TGSI tokens of the shader whose inputs are interpolated
+ * @param flatshade when set, inputs with COLOR semantic are interpolated as
+ *                  constants regardless of their declaration
+ * @param builder   LLVM builder used to emit the instructions
+ * @param type      type of the interpolated vectors
+ * @param a0_ptr    pointer to the array of a0 coefficients
+ * @param dadx_ptr  pointer to the array of dadx coefficients
+ * @param dady_ptr  pointer to the array of dady coefficients
+ * @param x0        initial x position value
+ * @param y0        initial y position value
+ */
+void
+lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
+                         const struct tgsi_token *tokens,
+                         boolean flatshade,
+                         LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef a0_ptr,
+                         LLVMValueRef dadx_ptr,
+                         LLVMValueRef dady_ptr,
+                         LLVMValueRef x0,
+                         LLVMValueRef y0)
+{
+   struct tgsi_parse_context parse;
+   struct tgsi_full_declaration *decl;
+
+   memset(bld, 0, sizeof *bld);
+
+   lp_build_context_init(&bld->base, builder, type);
+
+   /* For convenience */
+   bld->pos = bld->attribs[0];
+   bld->inputs = (const LLVMValueRef (*)[NUM_CHANNELS]) bld->attribs[1];
+
+   /* Position */
+   bld->num_attribs = 1;
+   bld->mask[0] = TGSI_WRITEMASK_ZW;
+   bld->mode[0] = TGSI_INTERPOLATE_LINEAR;
+
+   /* Inputs */
+   tgsi_parse_init( &parse, tokens );
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         decl = &parse.FullToken.FullDeclaration;
+         if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+            unsigned first, last, mask;
+            unsigned attrib;
+
+            first = decl->Range.First;
+            last = decl->Range.Last;
+            mask = decl->Declaration.UsageMask;
+
+            /*
+             * Input i lives in slot 1 + i, after the position.
+             * NOTE(review): last is not checked against the size of the
+             * mask/mode arrays (1 + PIPE_MAX_SHADER_INPUTS) -- confirm the
+             * tokens can never declare a larger index.
+             */
+            for( attrib = first; attrib <= last; ++attrib ) {
+               bld->mask[1 + attrib] = mask;
+
+               /* XXX: have mesa set INTERP_CONSTANT in the fragment
+                * shader.
+                */
+               if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR &&
+                   flatshade)
+                  bld->mode[1 + attrib] = TGSI_INTERPOLATE_CONSTANT;
+               else
+                  bld->mode[1 + attrib] = decl->Declaration.Interpolate;
+            }
+
+            /* Grow the attribute count to cover the highest declared input */
+            bld->num_attribs = MAX2(bld->num_attribs, 1 + last + 1);
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         /* Nothing to gather from these */
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+   tgsi_parse_free( &parse );
+
+   coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
+
+   /* The position must be in place before attribs_init() reads x and y */
+   pos_init(bld, x0, y0);
+
+   attribs_init(bld);
+}
+
+
+/**
+ * Advance the position and inputs to the given quad within the block.
+ *
+ * The position is updated first, then the attribute values.
+ *
+ * @param bld         interpolation context set up by
+ *                    lp_build_interp_soa_init()
+ * @param quad_index  index of the quad to move to; must be less than 4
+ */
+void
+lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
+                           int quad_index)
+{
+   assert(quad_index < 4);
+
+   pos_update(bld, quad_index);
+
+   attribs_update(bld, quad_index);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_interp.h b/src/gallium/auxiliary/gallivm/lp_bld_interp.h
new file mode 100644
index 00000000000..ca958cdf343
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_interp.h
@@ -0,0 +1,96 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Position and shader input interpolation.
+ *
+ * Special attention is given to the interpolation of side by side quads.
+ * Multiplications are made only for the first quad. Interpolation of
+ * inputs for posterior quads are done exclusively with additions, and
+ * perspective divide if necessary.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#ifndef LP_BLD_INTERP_H
+#define LP_BLD_INTERP_H
+
+
+#include <llvm-c/Core.h>
+
+#include "tgsi/tgsi_exec.h"
+
+#include "lp_bld_type.h"
+
+
+struct tgsi_token;
+
+
+/**
+ * State used to interpolate the position and the shader inputs.
+ *
+ * In every per-attribute array, slot 0 is the position and slot 1 + i is
+ * shader input i.
+ */
+struct lp_build_interp_soa_context
+{
+   struct lp_build_context base;
+
+   /* Number of slots in use, including the position slot */
+   unsigned num_attribs;
+   /* Per attribute: bit (1 << chan) is set for each channel to interpolate */
+   unsigned mask[1 + PIPE_MAX_SHADER_INPUTS];
+   /* Per attribute: interpolation mode (TGSI_INTERPOLATE_x) */
+   unsigned mode[1 + PIPE_MAX_SHADER_INPUTS];
+
+   /* Interpolation coefficients, one vector per attribute channel */
+   LLVMValueRef a0  [1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   LLVMValueRef dadx[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+   LLVMValueRef dady[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+
+   /* Attribute values before perspective divide */
+   LLVMValueRef attribs_pre[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+
+   /* Current attribute values */
+   LLVMValueRef attribs[1 + PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS];
+
+   /*
+    * Convenience pointers. Callers may access this one.
+    *
+    * pos points at attribs[0] and inputs at attribs[1].
+    */
+   const LLVMValueRef *pos;
+   const LLVMValueRef (*inputs)[NUM_CHANNELS];
+};
+
+
+/**
+ * Initialize the interpolation context and emit the code computing the
+ * initial position and shader input values.
+ *
+ * @param bld       context to initialize
+ * @param tokens    TGSI tokens of the shader whose inputs are interpolated
+ * @param flatshade whether flat shading is enabled
+ * @param builder   LLVM builder used to emit the instructions
+ * @param type      type of the interpolated vectors
+ * @param a0_ptr    pointer to the array of a0 coefficients
+ * @param dadx_ptr  pointer to the array of dadx coefficients
+ * @param dady_ptr  pointer to the array of dady coefficients
+ * @param x0        initial x position value
+ * @param y0        initial y position value
+ */
+void
+lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
+                         const struct tgsi_token *tokens,
+                         boolean flatshade,
+                         LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef a0_ptr,
+                         LLVMValueRef dadx_ptr,
+                         LLVMValueRef dady_ptr,
+                         LLVMValueRef x0,
+                         LLVMValueRef y0);
+
+/**
+ * Advance the position and inputs to the given quad within the block.
+ *
+ * @param bld         interpolation context
+ * @param quad_index  index of the quad to move to
+ */
+void
+lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
+                           int quad_index);
+
+
+#endif /* LP_BLD_INTERP_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.c b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
new file mode 100644
index 00000000000..9895749d568
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.c
@@ -0,0 +1,192 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helpers for emitting intrinsic calls.
+ *
+ * LLVM vanilla IR doesn't represent all basic arithmetic operations we care
+ * about, and it is often necessary to resort to target-specific intrinsics for
+ * performance or convenience.
+ *
+ * Ideally we would like to stay away from target specific intrinsics and
+ * move all the instruction selection logic into upstream LLVM where it belongs.
+ *
+ * These functions are also used for calling C functions provided by us from
+ * generated LLVM code.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_debug.h"
+
+#include "lp_bld_intr.h"
+
+
+/**
+ * Add an external, C calling convention function declaration to a module.
+ *
+ * The name must not already exist in the module.  Names that start with
+ * "llvm." are additionally expected to map to an LLVM intrinsic ID.
+ *
+ * @param module     module receiving the declaration
+ * @param name       name of the function
+ * @param ret_type   return type
+ * @param arg_types  argument types
+ * @param num_args   number of entries in @p arg_types
+ * @return the declared function
+ */
+LLVMValueRef
+lp_declare_intrinsic(LLVMModuleRef module,
+                     const char *name,
+                     LLVMTypeRef ret_type,
+                     LLVMTypeRef *arg_types,
+                     unsigned num_args)
+{
+   static const char llvm_prefix[] = "llvm.";
+   LLVMValueRef decl;
+   unsigned i;
+
+   assert(!LLVMGetNamedFunction(module, name));
+
+   decl = LLVMAddFunction(module, name,
+                          LLVMFunctionType(ret_type, arg_types, num_args, 0));
+
+   LLVMSetFunctionCallConv(decl, LLVMCCallConv);
+   LLVMSetLinkage(decl, LLVMExternalLinkage);
+
+   assert(LLVMIsDeclaration(decl));
+
+   /*
+    * Walk the prefix; the scan stops at the first mismatch, which includes
+    * the terminator of a name shorter than the prefix.
+    */
+   for(i = 0; llvm_prefix[i] != '\0' && name[i] == llvm_prefix[i]; ++i)
+      ;
+
+   if(llvm_prefix[i] == '\0')
+      assert(LLVMGetIntrinsicID(decl));
+
+   return decl;
+}
+
+
+/**
+ * Emit a call to a named function, declaring it on first use.
+ *
+ * The function is looked up in the module that contains the builder's
+ * insertion point.  If it is missing, it is declared with the argument
+ * types taken from @p args.
+ *
+ * @param builder   LLVM builder positioned where the call is emitted
+ * @param name      name of the function
+ * @param ret_type  return type, used when the function must be declared
+ * @param args      call arguments
+ * @param num_args  number of entries in @p args
+ * @return the call instruction
+ */
+LLVMValueRef
+lp_build_intrinsic(LLVMBuilderRef builder,
+                   const char *name,
+                   LLVMTypeRef ret_type,
+                   LLVMValueRef *args,
+                   unsigned num_args)
+{
+   LLVMBasicBlockRef block = LLVMGetInsertBlock(builder);
+   LLVMValueRef enclosing = LLVMGetBasicBlockParent(block);
+   LLVMModuleRef module = LLVMGetGlobalParent(enclosing);
+   LLVMValueRef callee = LLVMGetNamedFunction(module, name);
+
+   if(!callee) {
+      LLVMTypeRef types[LP_MAX_FUNC_ARGS];
+      unsigned n;
+
+      assert(num_args <= LP_MAX_FUNC_ARGS);
+
+      /* Derive the signature from the actual arguments. */
+      for(n = 0; n < num_args; ++n) {
+         assert(args[n]);
+         types[n] = LLVMTypeOf(args[n]);
+      }
+
+      callee = lp_declare_intrinsic(module, name, ret_type, types, num_args);
+   }
+
+   return LLVMBuildCall(builder, callee, args, num_args, "");
+}
+
+
+/**
+ * Emit a call to a named function taking a single argument.
+ *
+ * @return the call instruction
+ */
+LLVMValueRef
+lp_build_intrinsic_unary(LLVMBuilderRef builder,
+                         const char *name,
+                         LLVMTypeRef ret_type,
+                         LLVMValueRef a)
+{
+   LLVMValueRef operands[1];
+
+   operands[0] = a;
+
+   return lp_build_intrinsic(builder, name, ret_type, operands, 1);
+}
+
+
+/**
+ * Emit a call to a named function taking two arguments, @p a then @p b.
+ *
+ * @return the call instruction
+ */
+LLVMValueRef
+lp_build_intrinsic_binary(LLVMBuilderRef builder,
+                          const char *name,
+                          LLVMTypeRef ret_type,
+                          LLVMValueRef a,
+                          LLVMValueRef b)
+{
+   LLVMValueRef operands[2];
+
+   operands[1] = b;
+   operands[0] = a;
+
+   return lp_build_intrinsic(builder, name, ret_type, operands, 2);
+}
+
+
+/**
+ * Apply a scalar function to every element of vector arguments.
+ *
+ * For each element index, the corresponding element of every argument is
+ * extracted, the named function is called on those scalars, and the result
+ * is inserted at the same index of the returned vector.
+ *
+ * @param builder   LLVM builder used to emit the instructions
+ * @param name      name of the scalar function
+ * @param ret_type  vector type of the result
+ * @param args      vector arguments
+ * @param num_args  number of entries in @p args
+ * @return the assembled result vector
+ */
+LLVMValueRef
+lp_build_intrinsic_map(LLVMBuilderRef builder,
+                       const char *name,
+                       LLVMTypeRef ret_type,
+                       LLVMValueRef *args,
+                       unsigned num_args)
+{
+   LLVMTypeRef scalar_type = LLVMGetElementType(ret_type);
+   unsigned length = LLVMGetVectorSize(ret_type);
+   LLVMValueRef result;
+   unsigned elem;
+
+   assert(num_args <= LP_MAX_FUNC_ARGS);
+
+   result = LLVMGetUndef(ret_type);
+
+   for(elem = 0; elem < length; ++elem) {
+      LLVMValueRef lane = LLVMConstInt(LLVMInt32Type(), elem, 0);
+      LLVMValueRef scalars[LP_MAX_FUNC_ARGS];
+      LLVMValueRef value;
+      unsigned arg;
+
+      /* Gather this lane of every argument ... */
+      for(arg = 0; arg < num_args; ++arg)
+         scalars[arg] = LLVMBuildExtractElement(builder, args[arg], lane, "");
+
+      /* ... call the scalar function and store its result in the lane. */
+      value = lp_build_intrinsic(builder, name, scalar_type, scalars, num_args);
+      result = LLVMBuildInsertElement(builder, result, value, lane, "");
+   }
+
+   return result;
+}
+
+
+/**
+ * Apply a one-argument scalar function to every element of a vector.
+ *
+ * @return the assembled result vector
+ */
+LLVMValueRef
+lp_build_intrinsic_map_unary(LLVMBuilderRef builder,
+                             const char *name,
+                             LLVMTypeRef ret_type,
+                             LLVMValueRef a)
+{
+   LLVMValueRef operands[1];
+
+   operands[0] = a;
+
+   return lp_build_intrinsic_map(builder, name, ret_type, operands, 1);
+}
+
+
+/**
+ * Apply a two-argument scalar function to every element pair of two
+ * vectors, @p a then @p b.
+ *
+ * @return the assembled result vector
+ */
+LLVMValueRef
+lp_build_intrinsic_map_binary(LLVMBuilderRef builder,
+                              const char *name,
+                              LLVMTypeRef ret_type,
+                              LLVMValueRef a,
+                              LLVMValueRef b)
+{
+   LLVMValueRef operands[2];
+
+   operands[1] = b;
+   operands[0] = a;
+
+   return lp_build_intrinsic_map(builder, name, ret_type, operands, 2);
+}
+
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_intr.h b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
new file mode 100644
index 00000000000..f813f27074b
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_intr.h
@@ -0,0 +1,102 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for calling intrinsics.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#ifndef LP_BLD_INTR_H
+#define LP_BLD_INTR_H
+
+
+#include <llvm-c/Core.h>  
+
+
+/**
+ * Max number of arguments in an intrinsic.
+ */
+#define LP_MAX_FUNC_ARGS 32
+
+
+LLVMValueRef
+lp_declare_intrinsic(LLVMModuleRef module,
+                     const char *name,
+                     LLVMTypeRef ret_type,
+                     LLVMTypeRef *arg_types,
+                     unsigned num_args);
+
+LLVMValueRef
+lp_build_intrinsic(LLVMBuilderRef builder,
+                   const char *name,
+                   LLVMTypeRef ret_type,
+                   LLVMValueRef *args,
+                   unsigned num_args);
+
+
+LLVMValueRef
+lp_build_intrinsic_unary(LLVMBuilderRef builder,
+                         const char *name,
+                         LLVMTypeRef ret_type,
+                         LLVMValueRef a);
+
+
+LLVMValueRef
+lp_build_intrinsic_binary(LLVMBuilderRef builder,
+                          const char *name,
+                          LLVMTypeRef ret_type,
+                          LLVMValueRef a,
+                          LLVMValueRef b);
+
+
+LLVMValueRef
+lp_build_intrinsic_map(LLVMBuilderRef builder,
+                       const char *name,
+                       LLVMTypeRef ret_type,
+                       LLVMValueRef *args,
+                       unsigned num_args);
+
+
+LLVMValueRef
+lp_build_intrinsic_map_unary(LLVMBuilderRef builder,
+                             const char *name,
+                             LLVMTypeRef ret_type,
+                             LLVMValueRef a);
+
+
+LLVMValueRef
+lp_build_intrinsic_map_binary(LLVMBuilderRef builder,
+                              const char *name,
+                              LLVMTypeRef ret_type,
+                              LLVMValueRef a,
+                              LLVMValueRef b);
+
+
+#endif /* !LP_BLD_INTR_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
new file mode 100644
index 00000000000..2726747eaea
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -0,0 +1,438 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for logical operations.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_cpu_detect.h"
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_logic.h"
+
+
+/**
+ * Build code to compare two values 'a' and 'b' of 'type' using the given func.
+ * \param type  describes the element type and vector length of 'a' and 'b'
+ * \param func  one of PIPE_FUNC_x
+ * The result values will be 0 for false or ~0 for true.
+ */
+LLVMValueRef
+lp_build_compare(LLVMBuilderRef builder,
+                 const struct lp_type type,
+                 unsigned func,
+                 LLVMValueRef a,
+                 LLVMValueRef b)
+{
+   LLVMTypeRef vec_type = lp_build_vec_type(type);
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
+   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
+   LLVMValueRef cond;
+   LLVMValueRef res;
+   unsigned i;
+
+   assert(func >= PIPE_FUNC_NEVER);
+   assert(func <= PIPE_FUNC_ALWAYS);
+
+   if(func == PIPE_FUNC_NEVER)
+      return zeros;
+   if(func == PIPE_FUNC_ALWAYS)
+      return ones;
+
+   /* TODO: optimize the constant case */
+
+   /* XXX: It is not clear if we should use the ordered or unordered operators */
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   if(type.width * type.length == 128) {
+      if(type.floating && util_cpu_caps.has_sse) {
+         /* float[4] comparison */
+         LLVMValueRef args[3];
+         unsigned cc;
+         boolean swap;
+
+         swap = FALSE;
+         switch(func) {
+         case PIPE_FUNC_EQUAL:
+            cc = 0;
+            break;
+         case PIPE_FUNC_NOTEQUAL:
+            cc = 4;
+            break;
+         case PIPE_FUNC_LESS:
+            cc = 1;
+            break;
+         case PIPE_FUNC_LEQUAL:
+            cc = 2;
+            break;
+         case PIPE_FUNC_GREATER:
+            cc = 1;
+            swap = TRUE;
+            break;
+         case PIPE_FUNC_GEQUAL:
+            cc = 2;
+            swap = TRUE;
+            break;
+         default:
+            assert(0);
+            return lp_build_undef(type);
+         }
+
+         if(swap) {
+            args[0] = b;
+            args[1] = a;
+         }
+         else {
+            args[0] = a;
+            args[1] = b;
+         }
+
+         args[2] = LLVMConstInt(LLVMInt8Type(), cc, 0);
+         res = lp_build_intrinsic(builder,
+                                  "llvm.x86.sse.cmp.ps",
+                                  vec_type,
+                                  args, 3);
+         res = LLVMBuildBitCast(builder, res, int_vec_type, "");
+         return res;
+      }
+      else if(util_cpu_caps.has_sse2) {
+         /* int[4] comparison */
+         static const struct {
+            unsigned swap:1;
+            unsigned eq:1;
+            unsigned gt:1;
+            unsigned not:1;
+         } table[] = {
+            {0, 0, 0, 1}, /* PIPE_FUNC_NEVER */
+            {1, 0, 1, 0}, /* PIPE_FUNC_LESS */
+            {0, 1, 0, 0}, /* PIPE_FUNC_EQUAL */
+            {0, 0, 1, 1}, /* PIPE_FUNC_LEQUAL */
+            {0, 0, 1, 0}, /* PIPE_FUNC_GREATER */
+            {0, 1, 0, 1}, /* PIPE_FUNC_NOTEQUAL */
+            {1, 0, 1, 1}, /* PIPE_FUNC_GEQUAL */
+            {0, 0, 0, 0}  /* PIPE_FUNC_ALWAYS */
+         };
+         const char *pcmpeq;
+         const char *pcmpgt;
+         LLVMValueRef args[2];
+
+         switch (type.width) {
+         case 8:
+            pcmpeq = "llvm.x86.sse2.pcmpeq.b";
+            pcmpgt = "llvm.x86.sse2.pcmpgt.b";
+            break;
+         case 16:
+            pcmpeq = "llvm.x86.sse2.pcmpeq.w";
+            pcmpgt = "llvm.x86.sse2.pcmpgt.w";
+            break;
+         case 32:
+            pcmpeq = "llvm.x86.sse2.pcmpeq.d";
+            pcmpgt = "llvm.x86.sse2.pcmpgt.d";
+            break;
+         default:
+            assert(0);
+            return lp_build_undef(type);
+         }
+
+         /* PCMPGTB/PCMPGTW/PCMPGTD all compare *signed* integers; there are
+          * no unsigned variants. For unsigned types flip the sign bit of both
+          * operands, which maps the unsigned ordering onto the signed one.
+          * Signed types (bytes included) must be compared as they are.
+          */
+         if(table[func].gt && !type.sign) {
+            LLVMValueRef msb = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+            a = LLVMBuildXor(builder, a, msb, "");
+            b = LLVMBuildXor(builder, b, msb, "");
+         }
+
+         if(table[func].swap) {
+            args[0] = b;
+            args[1] = a;
+         }
+         else {
+            args[0] = a;
+            args[1] = b;
+         }
+
+         if(table[func].eq)
+            res = lp_build_intrinsic(builder, pcmpeq, vec_type, args, 2);
+         else if (table[func].gt)
+            res = lp_build_intrinsic(builder, pcmpgt, vec_type, args, 2);
+         else
+            res = LLVMConstNull(vec_type);
+
+         if(table[func].not)
+            res = LLVMBuildNot(builder, res, "");
+
+         return res;
+      }
+   }
+#endif
+
+   if(type.floating) {
+      LLVMRealPredicate op;
+      switch(func) {
+      case PIPE_FUNC_NEVER:
+         op = LLVMRealPredicateFalse;
+         break;
+      case PIPE_FUNC_ALWAYS:
+         op = LLVMRealPredicateTrue;
+         break;
+      case PIPE_FUNC_EQUAL:
+         op = LLVMRealUEQ;
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         op = LLVMRealUNE;
+         break;
+      case PIPE_FUNC_LESS:
+         op = LLVMRealULT;
+         break;
+      case PIPE_FUNC_LEQUAL:
+         op = LLVMRealULE;
+         break;
+      case PIPE_FUNC_GREATER:
+         op = LLVMRealUGT;
+         break;
+      case PIPE_FUNC_GEQUAL:
+         op = LLVMRealUGE;
+         break;
+      default:
+         assert(0);
+         return lp_build_undef(type);
+      }
+
+#if 0
+      /* XXX: Although valid IR, no LLVM target currently support this */
+      cond = LLVMBuildFCmp(builder, op, a, b, "");
+      res = LLVMBuildSelect(builder, cond, ones, zeros, "");
+#else
+      debug_printf("%s: warning: using slow element-wise vector comparison\n",
+                   __FUNCTION__);
+      res = LLVMGetUndef(int_vec_type);
+      for(i = 0; i < type.length; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         cond = LLVMBuildFCmp(builder, op,
+                              LLVMBuildExtractElement(builder, a, index, ""),
+                              LLVMBuildExtractElement(builder, b, index, ""),
+                              "");
+         cond = LLVMBuildSelect(builder, cond,
+                                LLVMConstExtractElement(ones, index),
+                                LLVMConstExtractElement(zeros, index),
+                                "");
+         res = LLVMBuildInsertElement(builder, res, cond, index, "");
+      }
+#endif
+   }
+   else {
+      LLVMIntPredicate op;
+      switch(func) {
+      case PIPE_FUNC_EQUAL:
+         op = LLVMIntEQ;
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         op = LLVMIntNE;
+         break;
+      case PIPE_FUNC_LESS:
+         op = type.sign ? LLVMIntSLT : LLVMIntULT;
+         break;
+      case PIPE_FUNC_LEQUAL:
+         op = type.sign ? LLVMIntSLE : LLVMIntULE;
+         break;
+      case PIPE_FUNC_GREATER:
+         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
+         break;
+      case PIPE_FUNC_GEQUAL:
+         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
+         break;
+      default:
+         assert(0);
+         return lp_build_undef(type);
+      }
+
+#if 0
+      /* XXX: Although valid IR, no LLVM target currently support this */
+      cond = LLVMBuildICmp(builder, op, a, b, "");
+      res = LLVMBuildSelect(builder, cond, ones, zeros, "");
+#else
+      debug_printf("%s: warning: using slow element-wise int vector comparison\n",
+                   __FUNCTION__);
+      res = LLVMGetUndef(int_vec_type);
+      for(i = 0; i < type.length; ++i) {
+         LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+         cond = LLVMBuildICmp(builder, op,
+                              LLVMBuildExtractElement(builder, a, index, ""),
+                              LLVMBuildExtractElement(builder, b, index, ""),
+                              "");
+         cond = LLVMBuildSelect(builder, cond,
+                                LLVMConstExtractElement(ones, index),
+                                LLVMConstExtractElement(zeros, index),
+                                "");
+         res = LLVMBuildInsertElement(builder, res, cond, index, "");
+      }
+#endif
+   }
+
+   return res;
+}
+
+
+
+/**
+ * Build code to compare two values 'a' and 'b' using the given func.
+ * \param func  one of PIPE_FUNC_x
+ * The result values will be 0 for false or ~0 for true.
+ */
+LLVMValueRef
+lp_build_cmp(struct lp_build_context *bld,
+             unsigned func,
+             LLVMValueRef a,
+             LLVMValueRef b)
+{
+   return lp_build_compare(bld->builder, bld->type, func, a, b);
+}
+
+
+/**
+ * Return mask ? a : b;
+ */
+LLVMValueRef
+lp_build_select(struct lp_build_context *bld,
+                LLVMValueRef mask,
+                LLVMValueRef a,
+                LLVMValueRef b)
+{
+   struct lp_type type = bld->type;
+   LLVMValueRef res;
+
+   if(a == b)
+      return a;
+
+   if(type.floating) {
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      b = LLVMBuildBitCast(bld->builder, b, int_vec_type, "");
+   }
+
+   a = LLVMBuildAnd(bld->builder, a, mask, "");
+
+   /* This often gets translated to PANDN, but sometimes the NOT is
+    * pre-computed and stored in another constant. The best strategy depends
+    * on available registers, so it is not a big deal -- hopefully LLVM does
+    * the right decision attending the rest of the program.
+    */
+   b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), "");
+
+   res = LLVMBuildOr(bld->builder, a, b, "");
+
+   if(type.floating) {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
+   }
+
+   return res;
+}
+
+
+LLVMValueRef
+lp_build_select_aos(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef b,
+                    const boolean cond[4])
+{
+   const struct lp_type type = bld->type;
+   const unsigned n = type.length;
+   unsigned i, j;
+
+   if(a == b)
+      return a;
+   if(cond[0] && cond[1] && cond[2] && cond[3])
+      return a;
+   if(!cond[0] && !cond[1] && !cond[2] && !cond[3])
+      return b;
+   if(a == bld->undef || b == bld->undef)
+      return bld->undef;
+
+   /*
+    * There are three major ways of accomplishing this:
+    * - with a shuffle,
+    * - with a select,
+    * - or with a bit mask.
+    * Select isn't supported for vector types yet, so only the other two are
+    * used. The cut-over between them (n <= 4) is empirical and might need to
+    * be adjusted.
+    */
+   if (n <= 4) {
+      /*
+       * Shuffle.
+       */
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+      for(j = 0; j < n; j += 4)
+         for(i = 0; i < 4; ++i)
+            shuffles[j + i] = LLVMConstInt(elem_type, (cond[i] ? 0 : n) + j + i, 0);
+
+      return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), "");
+   }
+   else {
+#if 0
+      /* XXX: Unfortunately select of vectors do not work */
+      /* Use a select */
+      LLVMTypeRef elem_type = LLVMInt1Type();
+      LLVMValueRef cond[LP_MAX_VECTOR_LENGTH];
+
+      for(j = 0; j < n; j += 4)
+         for(i = 0; i < 4; ++i)
+            cond[j + i] = LLVMConstInt(elem_type, cond[i] ? 1 : 0, 0);
+
+      return LLVMBuildSelect(bld->builder, LLVMConstVector(cond, n), a, b, "");
+#else
+      LLVMValueRef mask = lp_build_const_mask_aos(type, cond);
+      return lp_build_select(bld, mask, a, b);
+#endif
+   }
+}
+
+LLVMValueRef
+lp_build_alloca(struct lp_build_context *bld)
+{
+   const struct lp_type type = bld->type;
+
+   if (type.length > 1) { /*vector*/
+      return LLVMBuildAlloca(bld->builder, lp_build_vec_type(type), "");
+   } else { /*scalar*/
+      return LLVMBuildAlloca(bld->builder, lp_build_elem_type(type), "");
+   }
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
new file mode 100644
index 00000000000..a399ebf39ef
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h
@@ -0,0 +1,82 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for logical operations.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#ifndef LP_BLD_LOGIC_H
+#define LP_BLD_LOGIC_H
+
+
+#include <llvm-c/Core.h>  
+
+#include "pipe/p_defines.h" /* For PIPE_FUNC_xxx */
+
+
+struct lp_type;
+struct lp_build_context;
+
+
+LLVMValueRef
+lp_build_compare(LLVMBuilderRef builder,
+                 const struct lp_type type,
+                 unsigned func,
+                 LLVMValueRef a,
+                 LLVMValueRef b);
+
+
+/**
+ * @param func is one of PIPE_FUNC_xxx
+ */
+LLVMValueRef
+lp_build_cmp(struct lp_build_context *bld,
+             unsigned func,
+             LLVMValueRef a,
+             LLVMValueRef b);
+
+
+LLVMValueRef
+lp_build_select(struct lp_build_context *bld,
+                LLVMValueRef mask,
+                LLVMValueRef a,
+                LLVMValueRef b);
+
+LLVMValueRef
+lp_build_select_aos(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef b,
+                    const boolean cond[4]);
+
+LLVMValueRef
+lp_build_alloca(struct lp_build_context *bld);
+
+#endif /* !LP_BLD_LOGIC_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
new file mode 100644
index 00000000000..bc360ad77ad
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -0,0 +1,418 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper functions for packing/unpacking.
+ *
+ * Pack/unpacking is necessary for conversion between types of different
+ * bit width.
+ *
+ * They are also commonly used when a computation needs higher
+ * precision for the intermediate values. For example, if one needs the
+ * function:
+ *
+ *   c = compute(a, b);
+ *
+ * to use more precision for intermediate results then one should implement it
+ * as:
+ *
+ *   LLVMValueRef
+ *   compute(LLVMBuilderRef builder, struct lp_type type, LLVMValueRef a, LLVMValueRef b)
+ *   {
+ *      struct lp_type wide_type = lp_wider_type(type);
+ *      LLVMValueRef al, ah, bl, bh, cl, ch, c;
+ *
+ *      lp_build_unpack2(builder, type, wide_type, a, &al, &ah);
+ *      lp_build_unpack2(builder, type, wide_type, b, &bl, &bh);
+ *
+ *      cl = compute_half(al, bl);
+ *      ch = compute_half(ah, bh);
+ *
+ *      c = lp_build_pack2(builder, wide_type, type, cl, ch);
+ *
+ *      return c;
+ *   }
+ *
+ * where compute_half() would do the computation for half the elements with
+ * twice the precision.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_cpu_detect.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_intr.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_pack.h"
+
+
+/**
+ * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
+ */
+static LLVMValueRef
+lp_build_const_unpack_shuffle(unsigned n, unsigned lo_hi)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i, j;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+   assert(lo_hi < 2);
+
+   /* TODO: cache results in a static table */
+
+   for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
+      elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
+      elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
+   }
+
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Build shuffle vectors that match PACKxx instructions.
+ */
+static LLVMValueRef
+lp_build_const_pack_shuffle(unsigned n)
+{
+   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   assert(n <= LP_MAX_VECTOR_LENGTH);
+
+   /* TODO: cache results in a static table */
+
+   for(i = 0; i < n; ++i)
+      elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0);
+
+   return LLVMConstVector(elems, n);
+}
+
+
+/**
+ * Interleave vector elements.
+ *
+ * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions.
+ */
+LLVMValueRef
+lp_build_interleave2(LLVMBuilderRef builder,
+                     struct lp_type type,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     unsigned lo_hi)
+{
+   LLVMValueRef shuffle;
+
+   shuffle = lp_build_const_unpack_shuffle(type.length, lo_hi);
+
+   return LLVMBuildShuffleVector(builder, a, b, shuffle, "");
+}
+
+
+/**
+ * Double the bit width.
+ *
+ * This will only change the number of bits the values are represented, not the
+ * values themselves.
+ */
+void
+lp_build_unpack2(LLVMBuilderRef builder,
+                 struct lp_type src_type,
+                 struct lp_type dst_type,
+                 LLVMValueRef src,
+                 LLVMValueRef *dst_lo,
+                 LLVMValueRef *dst_hi)
+{
+   LLVMValueRef msb;
+   LLVMTypeRef dst_vec_type;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(dst_type.width == src_type.width * 2);
+   assert(dst_type.length * 2 == src_type.length);
+
+   if(dst_type.sign && src_type.sign) {
+      /* Replicate the sign bit in the most significant bits */
+      msb = LLVMBuildAShr(builder, src, lp_build_int_const_scalar(src_type, src_type.width - 1), "");
+   }
+   else
+      /* Most significant bits always zero */
+      msb = lp_build_zero(src_type);
+
+   /* Interleave bits */
+   if(util_cpu_caps.little_endian) {
+      *dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0);
+      *dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1);
+   }
+   else {
+      *dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0);
+      *dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1);
+   }
+
+   /* Cast the result into the new type (twice as wide) */
+
+   dst_vec_type = lp_build_vec_type(dst_type);
+
+   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
+   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
+}
+
+
+/**
+ * Expand the bit width.
+ *
+ * Only the number of bits used to represent the values changes, not the values
+ * themselves. The num_dsts resulting vectors are written to dst[].
+ */
+void
+lp_build_unpack(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef src,
+                LLVMValueRef *dst, unsigned num_dsts)
+{
+   unsigned num_tmps;
+   unsigned i;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length == dst_type.length * num_dsts);
+
+   num_tmps = 1;
+   dst[0] = src;
+
+   while(src_type.width < dst_type.width) {
+      struct lp_type tmp_type = src_type;
+
+      tmp_type.width *= 2;
+      tmp_type.length /= 2;
+
+      for(i = num_tmps; i--; ) {
+         lp_build_unpack2(builder, src_type, tmp_type, dst[i], &dst[2*i + 0], &dst[2*i + 1]);
+      }
+
+      src_type = tmp_type;
+
+      num_tmps *= 2;
+   }
+
+   assert(num_tmps == num_dsts);
+}
+
+
+/**
+ * Non-interleaved pack.
+ *
+ * This will move values as
+ *
+ *   lo =   __ l0 __ l1 __ l2 __..  __ ln
+ *   hi =   __ h0 __ h1 __ h2 __..  __ hn
+ *   res =  l0 l1 l2 .. ln h0 h1 h2 .. hn
+ *
+ * This will only change the number of bits the values are represented, not the
+ * values themselves.
+ *
+ * It is assumed the values are already clamped into the destination type range.
+ * Values outside that range will produce undefined results. Use
+ * lp_build_packs2 instead when the inputs may need saturating.
+ *
+ * \return a vector of dst_type holding the narrowed lo values then the hi ones.
+ */
+LLVMValueRef
+lp_build_pack2(LLVMBuilderRef builder,
+               struct lp_type src_type,
+               struct lp_type dst_type,
+               LLVMValueRef lo,
+               LLVMValueRef hi)
+{
+   LLVMTypeRef src_vec_type = lp_build_vec_type(src_type);
+   LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
+   LLVMValueRef shuffle;
+   LLVMValueRef res = NULL;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(src_type.width == dst_type.width * 2);
+   assert(src_type.length * 2 == dst_type.length);
+
+   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
+      switch(src_type.width) {
+      case 32:
+         if(dst_type.sign) {
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
+         }
+         else if(util_cpu_caps.has_sse4_1) {
+            /* PACKUSDW is the only intrinsic with a consistent signature */
+            return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
+         }
+         /* else: SSE2 has no unsigned dword pack; res stays NULL and the
+          * generic shuffle below is used instead. */
+         break;
+
+      case 16:
+         if(dst_type.sign)
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packsswb.128", src_vec_type, lo, hi);
+         else
+            res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packuswb.128", src_vec_type, lo, hi);
+         break;
+
+      default:
+         assert(0);
+         return LLVMGetUndef(dst_vec_type);
+         break;
+      }
+
+      if(res) {
+         res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+         return res;
+      }
+   }
+
+   /* Generic path: view each source as a vector of narrow elements and keep
+    * the even-indexed ones. */
+   lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
+   hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
+
+   shuffle = lp_build_const_pack_shuffle(dst_type.length);
+
+   res = LLVMBuildShuffleVector(builder, lo, hi, shuffle, "");
+
+   return res;
+}
+
+
+
+/**
+ * Non-interleaved pack and saturate.
+ *
+ * Same as lp_build_pack2 but will saturate values so that they fit into the
+ * destination type.
+ */
+LLVMValueRef
+lp_build_packs2(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef lo,
+                LLVMValueRef hi)
+{
+   boolean clamp;
+
+   assert(!src_type.floating);
+   assert(!dst_type.floating);
+   assert(src_type.sign == dst_type.sign);
+   assert(src_type.width == dst_type.width * 2);
+   assert(src_type.length * 2 == dst_type.length);
+
+   clamp = TRUE;
+
+   /* All X86 SSE non-interleaved pack instructions take signed inputs and
+    * saturate them, so no need to clamp for those cases. */
+   if(util_cpu_caps.has_sse2 &&
+      src_type.width * src_type.length == 128 &&
+      src_type.sign)
+      clamp = FALSE;
+
+   if(clamp) {
+      struct lp_build_context bld;
+      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
+      LLVMValueRef dst_max = lp_build_int_const_scalar(src_type, ((unsigned long long)1 << dst_bits) - 1);
+      lp_build_context_init(&bld, builder, src_type);
+      lo = lp_build_min(&bld, lo, dst_max);
+      hi = lp_build_min(&bld, hi, dst_max);
+      /* FIXME: What about lower bound? */
+   }
+
+   return lp_build_pack2(builder, src_type, dst_type, lo, hi);
+}
+
+
+/**
+ * Truncate the bit width, packing num_srcs vectors of src_type into a single
+ * vector of dst_type by halving the element width one step at a time.
+ * When 'clamped' is false the saturating lp_build_packs2 is used for each step.
+ * TODO: Handle saturation consistently.
+ */
+LLVMValueRef
+lp_build_pack(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              boolean clamped,
+              const LLVMValueRef *src, unsigned num_srcs)
+{
+   LLVMValueRef (*pack2)(LLVMBuilderRef builder,
+                         struct lp_type src_type,
+                         struct lp_type dst_type,
+                         LLVMValueRef lo,
+                         LLVMValueRef hi);
+   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+
+   /* Register width must remain constant */
+   assert(src_type.width * src_type.length == dst_type.width * dst_type.length);
+
+   /* We must not lose or gain channels. Only precision */
+   assert(src_type.length * num_srcs == dst_type.length);
+
+   if(clamped)
+      pack2 = &lp_build_pack2;
+   else
+      pack2 = &lp_build_packs2;
+
+   for(i = 0; i < num_srcs; ++i)
+      tmp[i] = src[i];
+
+   while(src_type.width > dst_type.width) {
+      struct lp_type tmp_type = src_type;
+
+      tmp_type.width /= 2;
+      tmp_type.length *= 2;
+
+      /* Take in consideration the sign changes only in the last step */
+      if(tmp_type.width == dst_type.width)
+         tmp_type.sign = dst_type.sign;
+
+      num_srcs /= 2;
+
+      for(i = 0; i < num_srcs; ++i)
+         tmp[i] = pack2(builder, src_type, tmp_type, tmp[2*i + 0], tmp[2*i + 1]);
+
+      src_type = tmp_type;
+   }
+
+   assert(num_srcs == 1);
+
+   return tmp[0];
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
new file mode 100644
index 00000000000..fb2a34984a4
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -0,0 +1,95 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for packing/unpacking conversions.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#ifndef LP_BLD_PACK_H
+#define LP_BLD_PACK_H
+
+
+#include <llvm-c/Core.h>  
+
+/* Declared only; this header does not define struct lp_type. */
+struct lp_type;
+
+
+LLVMValueRef
+lp_build_interleave2(LLVMBuilderRef builder,
+                     struct lp_type type,
+                     LLVMValueRef a,
+                     LLVMValueRef b,
+                     unsigned lo_hi);
+
+
+void
+lp_build_unpack2(LLVMBuilderRef builder,
+                 struct lp_type src_type,
+                 struct lp_type dst_type,
+                 LLVMValueRef src,
+                 LLVMValueRef *dst_lo,
+                 LLVMValueRef *dst_hi);
+
+
+void
+lp_build_unpack(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef src,
+                LLVMValueRef *dst, unsigned num_dsts);
+
+/* In lp_bld_pack.c this may clamp lo/hi to dst_max before deferring to lp_build_pack2. */
+LLVMValueRef
+lp_build_packs2(LLVMBuilderRef builder,
+                struct lp_type src_type,
+                struct lp_type dst_type,
+                LLVMValueRef lo,
+                LLVMValueRef hi);
+
+
+LLVMValueRef
+lp_build_pack2(LLVMBuilderRef builder,
+               struct lp_type src_type,
+               struct lp_type dst_type,
+               LLVMValueRef lo,
+               LLVMValueRef hi);
+
+/* Reduces num_srcs vectors of src_type to one dst_type vector by repeated pairwise packing. */
+LLVMValueRef
+lp_build_pack(LLVMBuilderRef builder,
+              struct lp_type src_type,
+              struct lp_type dst_type,
+              boolean clamped,
+              const LLVMValueRef *src, unsigned num_srcs);
+
+
+#endif /* !LP_BLD_PACK_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
new file mode 100644
index 00000000000..55ac2e94363
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -0,0 +1,198 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling -- common code.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_type.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+
+
+/**
+ * Fill an lp_sampler_static_state from the gallium sampler and texture
+ * state (the former is considered to be static and the latter dynamic).
+ * The object is zeroed first and left all-zero if either input is NULL.
+ */
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler)
+{
+   memset(state, 0, sizeof *state);
+   /* Without both inputs the state stays all-zero. */
+   if(!texture)
+      return;
+
+   if(!sampler)
+      return;
+   /* pipe_texture's state */
+   state->format            = texture->format;
+   state->target            = texture->target;
+   state->pot_width         = util_is_pot(texture->width0);
+   state->pot_height        = util_is_pot(texture->height0);
+   state->pot_depth         = util_is_pot(texture->depth0);
+   /* pipe_sampler_state's state */
+   state->wrap_s            = sampler->wrap_s;
+   state->wrap_t            = sampler->wrap_t;
+   state->wrap_r            = sampler->wrap_r;
+   state->min_img_filter    = sampler->min_img_filter;
+   state->min_mip_filter    = sampler->min_mip_filter;
+   state->mag_img_filter    = sampler->mag_img_filter;
+   state->compare_mode      = sampler->compare_mode;
+   state->border_color[0]   = sampler->border_color[0];
+   state->border_color[1]   = sampler->border_color[1];
+   state->border_color[2]   = sampler->border_color[2];
+   state->border_color[3]   = sampler->border_color[3];
+   if(sampler->compare_mode != PIPE_TEX_COMPARE_NONE) {
+      state->compare_func      = sampler->compare_func; /* otherwise left at 0 by the memset */
+   }
+   state->normalized_coords = sampler->normalized_coords;
+}
+
+
+/**
+ * Gather elements from scattered positions in memory into a single vector.
+ * The result has `length` integer elements of dst_width bits each.
+ * @param src_width src element width in bits; asserted to be <= dst_width
+ * @param dst_width result element width (source is zero-extended to fit)
+ * @param length number of elements in offsets and in the result
+ * @param base_ptr base pointer, should be an i8 pointer type.
+ * @param offsets vector with offsets, each applied to base_ptr with a GEP
+ */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets)
+{
+   LLVMTypeRef src_type = LLVMIntType(src_width);
+   LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0);
+   LLVMTypeRef dst_elem_type = LLVMIntType(dst_width);
+   LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length);
+   LLVMValueRef res;
+   unsigned i;
+   /* One scalar load per element, inserted into an initially undef vector. */
+   res = LLVMGetUndef(dst_vec_type);
+   for(i = 0; i < length; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef elem_offset;
+      LLVMValueRef elem_ptr;
+      LLVMValueRef elem;
+      /* GEP base_ptr by offsets[i], then view the address as a src_width-bit int pointer */
+      elem_offset = LLVMBuildExtractElement(builder, offsets, index, "");
+      elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, "");
+      elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, "");
+      elem = LLVMBuildLoad(builder, elem_ptr, "");
+      /* The Trunc branch below can only trigger where the assert is compiled out. */
+      assert(src_width <= dst_width);
+      if(src_width > dst_width)
+         elem = LLVMBuildTrunc(builder, elem, dst_elem_type, "");
+      if(src_width < dst_width)
+         elem = LLVMBuildZExt(builder, elem, dst_elem_type, "");
+
+      res = LLVMBuildInsertElement(builder, res, elem, index, "");
+   }
+
+   return res;
+}
+
+
+/**
+ * Compute the offset of a pixel (x is scaled by block.bits/8, y by y_stride).
+ * UTIL_FORMAT_COLORSPACE_ZS formats use a 2x2-interleaved addressing instead.
+ * x, y, y_stride are vectors; data_ptr is accepted but not referenced here.
+ */
+LLVMValueRef
+lp_build_sample_offset(struct lp_build_context *bld,
+                       const struct util_format_description *format_desc,
+                       LLVMValueRef x,
+                       LLVMValueRef y,
+                       LLVMValueRef y_stride,
+                       LLVMValueRef data_ptr)
+{
+   LLVMValueRef x_stride;
+   LLVMValueRef offset;
+   /* x_stride = size of one format block in bytes */
+   x_stride = lp_build_const_scalar(bld->type, format_desc->block.bits/8);
+
+   if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
+      LLVMValueRef x_lo, x_hi;
+      LLVMValueRef y_lo, y_hi;
+      LLVMValueRef x_stride_lo, x_stride_hi;
+      LLVMValueRef y_stride_lo, y_stride_hi;
+      LLVMValueRef x_offset_lo, x_offset_hi;
+      LLVMValueRef y_offset_lo, y_offset_hi;
+      LLVMValueRef offset_lo, offset_hi;
+      /* Split coords: bit 0 picks the texel inside a 2x2 quad, the rest picks the quad. */
+      x_lo = LLVMBuildAnd(bld->builder, x, bld->one, "");
+      y_lo = LLVMBuildAnd(bld->builder, y, bld->one, "");
+
+      x_hi = LLVMBuildLShr(bld->builder, x, bld->one, "");
+      y_hi = LLVMBuildLShr(bld->builder, y, bld->one, "");
+      /* Inside a quad: x advances bits/8 bytes, y advances 2*bits/8 bytes. */
+      x_stride_lo = x_stride;
+      y_stride_lo = lp_build_const_scalar(bld->type, 2*format_desc->block.bits/8);
+      /* Between quads: x advances 4*bits/8 bytes, y advances two rows (y_stride << 1). */
+      x_stride_hi = lp_build_const_scalar(bld->type, 4*format_desc->block.bits/8);
+      y_stride_hi = LLVMBuildShl(bld->builder, y_stride, bld->one, "");
+
+      x_offset_lo = lp_build_mul(bld, x_lo, x_stride_lo);
+      y_offset_lo = lp_build_mul(bld, y_lo, y_stride_lo);
+      offset_lo = lp_build_add(bld, x_offset_lo, y_offset_lo);
+
+      x_offset_hi = lp_build_mul(bld, x_hi, x_stride_hi);
+      y_offset_hi = lp_build_mul(bld, y_hi, y_stride_hi);
+      offset_hi = lp_build_add(bld, x_offset_hi, y_offset_hi);
+
+      offset = lp_build_add(bld, offset_hi, offset_lo);
+   }
+   else {
+      LLVMValueRef x_offset;
+      LLVMValueRef y_offset;
+      /* Linear layout: offset = x * x_stride + y * y_stride */
+      x_offset = lp_build_mul(bld, x, x_stride);
+      y_offset = lp_build_mul(bld, y, y_stride);
+
+      offset = lp_build_add(bld, x_offset, y_offset);
+   }
+
+   return offset;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
new file mode 100644
index 00000000000..a791d886127
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -0,0 +1,155 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#ifndef LP_BLD_SAMPLE_H
+#define LP_BLD_SAMPLE_H
+
+
+#include <llvm-c/Core.h>
+
+struct pipe_texture;
+struct pipe_sampler_state;
+struct util_format_description;
+struct lp_type;
+struct lp_build_context;
+
+
+/**
+ * Sampler static state.
+ * Filled in by lp_sampler_static_state().
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are embedded in the generated code.
+ */
+struct lp_sampler_static_state
+{
+   /* pipe_texture's state */
+   enum pipe_format format;
+   unsigned target:2;
+   unsigned pot_width:1;   /* util_is_pot(width0) */
+   unsigned pot_height:1;  /* util_is_pot(height0) */
+   unsigned pot_depth:1;   /* util_is_pot(depth0) */
+
+   /* pipe_sampler_state's state */
+   unsigned wrap_s:3;      /* PIPE_TEX_WRAP_x */
+   unsigned wrap_t:3;      /* PIPE_TEX_WRAP_x */
+   unsigned wrap_r:3;      /* PIPE_TEX_WRAP_x */
+   unsigned min_img_filter:2;
+   unsigned min_mip_filter:2;
+   unsigned mag_img_filter:2;
+   unsigned compare_mode:1;
+   unsigned compare_func:3; /* only copied when compare_mode != PIPE_TEX_COMPARE_NONE */
+   unsigned normalized_coords:1;
+   float border_color[4];  /* substituted for out-of-range texels by the SoA texel fetch */
+};
+
+
+/**
+ * Sampler dynamic state.
+ *
+ * These are the bits of state from pipe_texture and pipe_sampler_state that
+ * are computed at runtime.
+ *
+ * These are obtained through callbacks, as we don't want to tie the texture
+ * sampling code generation logic to any particular texture layout or pipe
+ * driver.
+ */
+struct lp_sampler_dynamic_state
+{
+   /* Each callback takes a builder and a unit number and returns an LLVMValueRef. */
+   /** Obtain the base texture width. */
+   LLVMValueRef
+   (*width)( struct lp_sampler_dynamic_state *state,
+             LLVMBuilderRef builder,
+             unsigned unit);
+
+   /** Obtain the base texture height. */
+   LLVMValueRef
+   (*height)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+   /** Presumably the texture row stride -- TODO confirm against the driver. */
+   LLVMValueRef
+   (*stride)( struct lp_sampler_dynamic_state *state,
+              LLVMBuilderRef builder,
+              unsigned unit);
+   /** Presumably the base pointer of the texture data -- TODO confirm. */
+   LLVMValueRef
+   (*data_ptr)( struct lp_sampler_dynamic_state *state,
+                LLVMBuilderRef builder,
+                unsigned unit);
+
+};
+
+
+/**
+ * Derive the sampler static state (zeroed if texture or sampler is NULL).
+ */
+void
+lp_sampler_static_state(struct lp_sampler_static_state *state,
+                        const struct pipe_texture *texture,
+                        const struct pipe_sampler_state *sampler);
+
+/** Gather `length` elements from base_ptr + offsets into one vector of dst_width-bit ints. */
+LLVMValueRef
+lp_build_gather(LLVMBuilderRef builder,
+                unsigned length,
+                unsigned src_width,
+                unsigned dst_width,
+                LLVMValueRef base_ptr,
+                LLVMValueRef offsets);
+
+/** Compute per-pixel offsets from x, y and y_stride for the given format. */
+LLVMValueRef
+lp_build_sample_offset(struct lp_build_context *bld,
+                       const struct util_format_description *format_desc,
+                       LLVMValueRef x,
+                       LLVMValueRef y,
+                       LLVMValueRef y_stride,
+                       LLVMValueRef data_ptr);
+
+
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type fp_type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel);
+
+
+
+#endif /* LP_BLD_SAMPLE_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
new file mode 100644
index 00000000000..fe41d5ee493
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -0,0 +1,1091 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture sampling -- SoA.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_debug.h"
+#include "util/u_dump.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_format.h"
+#include "util/u_cpu_detect.h"
+#include "lp_bld_debug.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_conv.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_pack.h"
+#include "lp_bld_format.h"
+#include "lp_bld_sample.h"
+
+
+/**
+ * Keep all information for sampling code generation in a single place.
+ */
+struct lp_build_sample_context
+{
+   LLVMBuilderRef builder;
+   /** State embedded in the generated code (wrap modes, border color, ...) */
+   const struct lp_sampler_static_state *static_state;
+   /** Callbacks for the state that is only obtained at runtime */
+   struct lp_sampler_dynamic_state *dynamic_state;
+   /** Format description used for offset computation and texel unpacking */
+   const struct util_format_description *format_desc;
+
+   /** Incoming coordinates type and build context */
+   struct lp_type coord_type;
+   struct lp_build_context coord_bld;
+
+   /** Unsigned integer coordinates */
+   struct lp_type uint_coord_type;
+   struct lp_build_context uint_coord_bld;
+
+   /** Signed integer coordinates */
+   struct lp_type int_coord_type;
+   struct lp_build_context int_coord_bld;
+
+   /** Output texels type and build context */
+   struct lp_type texel_type;
+   struct lp_build_context texel_bld;
+};
+
+
+/**
+ * Does the given texture wrap mode (PIPE_TEX_WRAP_x) allow sampling the
+ * texture border color?  XXX maybe move this into gallium util code.
+ */
+static boolean
+wrap_mode_uses_border_color(unsigned mode)
+{
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return FALSE;  /* repeat and *_TO_EDGE modes never use the border */
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return TRUE;   /* plain CLAMP and *_TO_BORDER modes may use the border */
+   default:
+      assert(0 && "unexpected wrap mode");
+      return FALSE;  /* reached only if the assert does not abort */
+   }
+}
+
+
+
+/**
+ * Gen code to fetch a texel from a texture at int coords (x, y).
+ * Texels outside [0,width) x [0,height) are replaced by the static border
+ * color, per axis, only when that axis' wrap mode can sample the border.
+ * The result, texel, will be:
+ *   texel[0] = red values, texel[1] = green values,
+ *   texel[2] = blue values, texel[3] = alpha values
+ */
+static void
+lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
+                          LLVMValueRef width,
+                          LLVMValueRef height,
+                          LLVMValueRef x,
+                          LLVMValueRef y,
+                          LLVMValueRef y_stride,
+                          LLVMValueRef data_ptr,
+                          LLVMValueRef *texel)
+{
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef offset;
+   LLVMValueRef packed;
+   LLVMValueRef use_border = NULL;
+
+   /* use_border = x < 0 || x >= width || y < 0 || y >= height */
+   if (wrap_mode_uses_border_color(bld->static_state->wrap_s)) {
+      LLVMValueRef b1, b2;
+      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
+      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
+      use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2");
+   }
+
+   if (wrap_mode_uses_border_color(bld->static_state->wrap_t)) {
+      LLVMValueRef b1, b2;
+      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
+      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
+      if (use_border) {
+         use_border = LLVMBuildOr(bld->builder, use_border, b1, "ub_or_b1");
+         use_border = LLVMBuildOr(bld->builder, use_border, b2, "ub_or_b2");
+      }
+      else {
+         use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2");
+      }
+   }
+
+   /*
+    * Note: if we find an app which frequently samples the texture border
+    * we might want to implement a true conditional here to avoid sampling
+    * the texture whenever possible (since that's quite a bit of code).
+    * Ex:
+    *   if (use_border) {
+    *      texel = border_color;
+    *   }
+    *   else {
+    *      texel = sample_texture(coord);
+    *   }
+    * As it is now, we always sample the texture, then selectively replace
+    * the texel color results with the border color.
+    */
+
+   /* convert x,y coords to linear offset from start of texture, in bytes */
+   offset = lp_build_sample_offset(&bld->uint_coord_bld,
+                                   bld->format_desc,
+                                   x, y, y_stride,
+                                   data_ptr);
+   /* only 1x1 blocks that fit in one texel_type element can be gathered */
+   assert(bld->format_desc->block.width == 1);
+   assert(bld->format_desc->block.height == 1);
+   assert(bld->format_desc->block.bits <= bld->texel_type.width);
+
+   /* gather the texels from the texture */
+   packed = lp_build_gather(bld->builder,
+                            bld->texel_type.length,
+                            bld->format_desc->block.bits,
+                            bld->texel_type.width,
+                            data_ptr, offset);
+
+   /* convert texels to float rgba */
+   lp_build_unpack_rgba_soa(bld->builder,
+                            bld->format_desc,
+                            bld->texel_type,
+                            packed, texel);
+   /* use_border stays NULL when neither wrap mode can reach the border */
+   if (use_border) {
+      /* select texel color or border color depending on use_border */
+      int chan;
+      for (chan = 0; chan < 4; chan++) {
+         LLVMValueRef border_chan =
+            lp_build_const_scalar(bld->texel_type,
+                                  bld->static_state->border_color[chan]);
+         texel[chan] = lp_build_select(&bld->texel_bld, use_border,
+                                       border_chan, texel[chan]);
+      }
+   }
+}
+
+
+static LLVMValueRef
+lp_build_sample_packed(struct lp_build_sample_context *bld,
+                       LLVMValueRef x,
+                       LLVMValueRef y,
+                       LLVMValueRef y_stride,
+                       LLVMValueRef data_ptr)
+{
+   LLVMValueRef offset;
+   /* Gather the still-packed texels at (x, y); no border handling, no unpacking. */
+   offset = lp_build_sample_offset(&bld->uint_coord_bld,
+                                   bld->format_desc,
+                                   x, y, y_stride,
+                                   data_ptr);
+   /* only 1x1 blocks that fit in one texel_type element can be gathered */
+   assert(bld->format_desc->block.width == 1);
+   assert(bld->format_desc->block.height == 1);
+   assert(bld->format_desc->block.bits <= bld->texel_type.width);
+
+   return lp_build_gather(bld->builder,
+                          bld->texel_type.length,
+                          bld->format_desc->block.bits,
+                          bld->texel_type.width,
+                          data_ptr, offset);
+}
+
+
+/**
+ * Helper to compute the mirror function for the PIPE_TEX_WRAP_MIRROR_* modes.
+ */
+static LLVMValueRef
+lp_build_coord_mirror(struct lp_build_sample_context *bld,
+                      LLVMValueRef coord)
+{
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef fract, flr, isOdd;
+   /* Overall: set_sign(fract(coord), isOdd) + (float)isOdd, with isOdd = ifloor(coord) & 1 */
+   /* fract = coord - floor(coord) */
+   fract = lp_build_sub(coord_bld, coord, lp_build_floor(coord_bld, coord));
+
+   /* flr = ifloor(coord); */
+   flr = lp_build_ifloor(coord_bld, coord);
+
+   /* isOdd = flr & 1 */
+   isOdd = LLVMBuildAnd(bld->builder, flr, int_coord_bld->one, "");
+
+   /* make coord positive or negative depending on isOdd */
+   coord = lp_build_set_sign(coord_bld, fract, isOdd);
+
+   /* convert isOdd to float */
+   isOdd = lp_build_int_to_float(coord_bld, isOdd);
+
+   /* add isOdd to coord */
+   coord = lp_build_add(coord_bld, coord, isOdd);
+
+   return coord;
+}
+
+
+/**
+ * We only support a few wrap modes in lp_build_sample_wrap_int() at this time.
+ * Return TRUE for REPEAT, CLAMP and CLAMP_TO_EDGE; FALSE for every other mode.
+ */
+static boolean
+is_simple_wrap_mode(unsigned mode)
+{
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return TRUE;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:  /* listed explicitly, but treated like default */
+   default:
+      return FALSE;
+   }
+}
+
+
+/**
+ * Build LLVM code for texture wrap mode, for scaled integer texcoords; returns the wrapped coord.
+ * \param coord  the incoming texcoord (s,t,r or q) scaled to the texture size
+ * \param length  the texture size along one dimension
+ * \param is_pot  if TRUE, length is a power of two
+ * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ */
+static LLVMValueRef
+lp_build_sample_wrap_int(struct lp_build_sample_context *bld,
+                         LLVMValueRef coord,
+                         LLVMValueRef length,
+                         boolean is_pot,
+                         unsigned wrap_mode)
+{
+   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   LLVMValueRef length_minus_one;
+
+   length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+   /* REPEAT wraps; the CLAMP variants clamp to [0, length-1]; MIRROR modes are unimplemented. */
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      if(is_pot)
+         coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, "");
+      else
+         /* Signed remainder won't give the right results for negative
+          * dividends but unsigned remainder does.*/
+         coord = LLVMBuildURem(bld->builder, coord, length, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
+      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      /* FIXME: mirror modes are not implemented; warn and fall back to an unsigned clamp */
+      _debug_printf("llvmpipe: failed to translate texture wrap mode %s\n",
+                    util_dump_tex_wrap(wrap_mode, TRUE));
+      coord = lp_build_max(uint_coord_bld, coord, uint_coord_bld->zero);
+      coord = lp_build_min(uint_coord_bld, coord, length_minus_one);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   return coord;
+}
+
+
+/**
+ * Build LLVM code for texture wrap mode for linear filtering.
+ * \param x0_out  returns first integer texcoord
+ * \param x1_out  returns second integer texcoord
+ * \param weight_out  returns linear interpolation weight
+ */
+static void
+lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
+                            LLVMValueRef coord,
+                            LLVMValueRef length,
+                            boolean is_pot,
+                            unsigned wrap_mode,
+                            LLVMValueRef *x0_out,
+                            LLVMValueRef *x1_out,
+                            LLVMValueRef *weight_out)
+{
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
+   LLVMValueRef two = lp_build_const_scalar(coord_bld->type, 2.0);
+   LLVMValueRef half = lp_build_const_scalar(coord_bld->type, 0.5);
+   LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
+   LLVMValueRef length_minus_one;
+   LLVMValueRef length_f_minus_one;
+   LLVMValueRef coord0, coord1, weight;
+
+   /* XXX check for normalized vs. unnormalized coords */
+
+   length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+   length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      /* mul by size and subtract 0.5 */
+      coord = lp_build_mul(coord_bld, coord, length_f);
+      coord = lp_build_sub(coord_bld, coord, half);
+      /* convert to int */
+      coord0 = lp_build_ifloor(coord_bld, coord);
+      coord1 = lp_build_add(uint_coord_bld, coord0, uint_coord_bld->one);
+      /* compute lerp weight */
+      weight = lp_build_fract(coord_bld, coord);
+      /* repeat wrap */
+      if (is_pot) {
+         coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, "");
+         coord1 = LLVMBuildAnd(bld->builder, coord1, length_minus_one, "");
+      }
+      else {
+         /* Signed remainder won't give the right results for negative
+          * dividends but unsigned remainder does.*/
+         coord0 = LLVMBuildURem(bld->builder, coord0, length, "");
+         coord1 = LLVMBuildURem(bld->builder, coord1, length, "");
+      }
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+      coord = lp_build_mul(coord_bld, coord, length_f);
+      weight = lp_build_fract(coord_bld, coord);
+      coord0 = lp_build_clamp(coord_bld, coord, coord_bld->zero,
+                              length_f_minus_one);
+      coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
+      coord1 = lp_build_clamp(coord_bld, coord1, coord_bld->zero,
+                              length_f_minus_one);
+      coord0 = lp_build_ifloor(coord_bld, coord0);
+      coord1 = lp_build_ifloor(coord_bld, coord1);
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      /* clamp to [0,1] */
+      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, coord_bld->one);
+      /* mul by tex size and subtract 0.5 */
+      coord = lp_build_mul(coord_bld, coord, length_f);
+      coord = lp_build_sub(coord_bld, coord, half);
+      /* compute lerp weight */
+      weight = lp_build_fract(coord_bld, coord);
+      /* coord0 = floor(coord); */
+      coord0 = lp_build_ifloor(coord_bld, coord);
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+      /* coord0 = max(coord0, 0) */
+      coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
+      /* coord1 = min(coord1, length-1) */
+      coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      {
+         LLVMValueRef min, max;
+         /* min = -1.0 / (2 * length) */
+         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
+         min = lp_build_negate(coord_bld, min);
+         /* max = 1.0 - min */
+         max = lp_build_sub(coord_bld, coord_bld->one, min);
+         /* coord = clamp(coord, min, max) */
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         /* scale coord to length (and sub 0.5?) */
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         coord = lp_build_sub(coord_bld, coord, half);
+         /* compute lerp weight */
+         weight = lp_build_fract(coord_bld, coord);
+         /* convert to int */
+         coord0 = lp_build_ifloor(coord_bld, coord);
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+      }
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      /* compute mirror function */
+      coord = lp_build_coord_mirror(bld, coord);
+
+      /* scale coord to length */
+      coord = lp_build_mul(coord_bld, coord, length_f);
+      coord = lp_build_sub(coord_bld, coord, half);
+
+      /* compute lerp weight */
+      weight = lp_build_fract(coord_bld, coord);
+
+      /* convert to int coords */
+      coord0 = lp_build_ifloor(coord_bld, coord);
+      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+
+      /* coord0 = max(coord0, 0) */
+      coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
+      /* coord1 = min(coord1, length-1) */
+      coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      {
+         LLVMValueRef min, max;
+         /* min = 1.0 / (2 * length) */
+         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
+         /* max = 1.0 - min */
+         max = lp_build_sub(coord_bld, coord_bld->one, min);
+
+         coord = lp_build_abs(coord_bld, coord);
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         if(0)coord = lp_build_sub(coord_bld, coord, half);
+         weight = lp_build_fract(coord_bld, coord);
+         coord0 = lp_build_ifloor(coord_bld, coord);
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+      }
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      {
+         LLVMValueRef min, max;
+         /* min = 1.0 / (2 * length) */
+         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
+         /* max = 1.0 - min */
+         max = lp_build_sub(coord_bld, coord_bld->one, min);
+
+         coord = lp_build_abs(coord_bld, coord);
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         coord = lp_build_sub(coord_bld, coord, half);
+         weight = lp_build_fract(coord_bld, coord);
+         coord0 = lp_build_ifloor(coord_bld, coord);
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+      }
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      {
+         LLVMValueRef min, max;
+         /* min = -1.0 / (2 * length) */
+         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
+         min = lp_build_negate(coord_bld, min);
+         /* max = 1.0 - min */
+         max = lp_build_sub(coord_bld, coord_bld->one, min);
+
+         coord = lp_build_abs(coord_bld, coord);
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         coord = lp_build_sub(coord_bld, coord, half);
+         weight = lp_build_fract(coord_bld, coord);
+         coord0 = lp_build_ifloor(coord_bld, coord);
+         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
+      }
+      break;
+
+   default:
+      assert(0);
+   }
+
+   *x0_out = coord0;
+   *x1_out = coord1;
+   *weight_out = weight;
+}
+
+
+/**
+ * Build LLVM code applying a texture wrap mode for nearest filtering.
+ * \param coord  the incoming texcoord (nominally in [0,1])
+ * \param length  the texture size along one dimension, as int
+ * \param is_pot  if TRUE, length is a power of two
+ * \param wrap_mode  one of PIPE_TEX_WRAP_x
+ * \return  the wrapped integer texel coord, or NULL for an unknown wrap_mode */
+static LLVMValueRef
+lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
+                             LLVMValueRef coord,
+                             LLVMValueRef length,
+                             boolean is_pot,
+                             unsigned wrap_mode)
+{
+   struct lp_build_context *coord_bld = &bld->coord_bld;
+   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
+   struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld;
+   LLVMValueRef two = lp_build_const_scalar(coord_bld->type, 2.0);
+   LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
+   LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one);
+   LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one);
+   LLVMValueRef icoord = NULL;   /* defined result even if assert(0) below is compiled out */
+
+   /* XXX check for normalized vs. unnormalized coords */
+
+   switch(wrap_mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      coord = lp_build_mul(coord_bld, coord, length_f);
+      icoord = lp_build_ifloor(coord_bld, coord);
+      if (is_pot)
+         icoord = LLVMBuildAnd(bld->builder, icoord, length_minus_one, "");
+      else
+         /* NOTE(review): URem sees a negative icoord as a huge unsigned value, so
+          * this matches REPEAT only if 2^32 % length == 0 -- check NPOT sizes. */
+         icoord = LLVMBuildURem(bld->builder, icoord, length, "");
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP:
+      /* mul by size */
+      coord = lp_build_mul(coord_bld, coord, length_f);
+      /* floor */
+      icoord = lp_build_ifloor(coord_bld, coord);
+      /* clamp to [0, size-1].  Note: int coord builder type */
+      icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
+                              length_minus_one);
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      {
+         LLVMValueRef min, max;
+         /* min = 1.0 / (2 * length) */
+         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
+         /* max = length - min */
+         max = lp_build_sub(coord_bld, length_f, min);
+         /* scale coord to length */
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         /* coord = clamp(coord, min, max) */
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         icoord = lp_build_ifloor(coord_bld, coord);
+      }
+      break;
+
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */
+      {
+         LLVMValueRef min, max;
+         /* min = -1.0 / (2 * length) */
+         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
+         min = lp_build_negate(coord_bld, min);
+         /* max = length - min */
+         max = lp_build_sub(coord_bld, length_f, min);
+         /* scale coord to length */
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         /* coord = clamp(coord, min, max) */
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         icoord = lp_build_ifloor(coord_bld, coord);
+      }
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      {
+         LLVMValueRef min, max;
+         /* min = 1.0 / (2 * length) */
+         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
+         /* max = length - min */
+         max = lp_build_sub(coord_bld, length_f, min);
+
+         /* compute mirror function */
+         coord = lp_build_coord_mirror(bld, coord);
+
+         /* scale coord to length */
+         coord = lp_build_mul(coord_bld, coord, length_f);
+
+         /* coord = clamp(coord, min, max) */
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         icoord = lp_build_ifloor(coord_bld, coord);
+      }
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      coord = lp_build_abs(coord_bld, coord);
+      coord = lp_build_mul(coord_bld, coord, length_f);
+      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f_minus_one);
+      icoord = lp_build_ifloor(coord_bld, coord);
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      {
+         LLVMValueRef min, max;
+         /* min = 1.0 / (2 * length) */
+         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
+         /* max = length - min */
+         max = lp_build_sub(coord_bld, length_f, min);
+
+         coord = lp_build_abs(coord_bld, coord);
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         icoord = lp_build_ifloor(coord_bld, coord);
+      }
+      break;
+
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      {
+         LLVMValueRef min, max;
+         /* min = -1.0 / (2 * length) */
+         min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f));
+         min = lp_build_negate(coord_bld, min);
+         /* max = length - min */
+         max = lp_build_sub(coord_bld, length_f, min);
+
+         coord = lp_build_abs(coord_bld, coord);
+         coord = lp_build_mul(coord_bld, coord, length_f);
+         coord = lp_build_clamp(coord_bld, coord, min, max);
+         icoord = lp_build_ifloor(coord_bld, coord);
+      }
+      break;
+
+   default:
+      assert(0);
+   }
+
+   return icoord;
+}
+
+
+/**
+ * Nearest-filter fetch from a 2D texture: wrap s/t to texel indices, then read.
+ */
+static void
+lp_build_sample_2d_nearest_soa(struct lp_build_sample_context *bld,
+                               LLVMValueRef s,
+                               LLVMValueRef t,
+                               LLVMValueRef width,
+                               LLVMValueRef height,
+                               LLVMValueRef stride,
+                               LLVMValueRef data_ptr,
+                               LLVMValueRef *texel)
+{
+   LLVMValueRef col, row;
+
+   col = lp_build_sample_wrap_nearest(bld, s, width,
+                                      bld->static_state->pot_width,
+                                      bld->static_state->wrap_s);
+   lp_build_name(col, "tex.x.wrapped");
+
+   row = lp_build_sample_wrap_nearest(bld, t, height,
+                                      bld->static_state->pot_height,
+                                      bld->static_state->wrap_t);
+   lp_build_name(row, "tex.y.wrapped");
+
+   lp_build_sample_texel_soa(bld, width, height, col, row, stride, data_ptr, texel);
+}
+
+
+/**
+ * Bilinear-filter fetch from a 2D texture: four taps blended per channel.
+ */
+static void
+lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef width,
+                              LLVMValueRef height,
+                              LLVMValueRef stride,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef *texel)
+{
+   LLVMValueRef frac_s, frac_t;
+   LLVMValueRef xs[2];
+   LLVMValueRef ys[2];
+   LLVMValueRef taps[2][2][4];   /* [row][column][channel] */
+   unsigned row, col, chan;
+
+   lp_build_sample_wrap_linear(bld, s, width, bld->static_state->pot_width,
+                               bld->static_state->wrap_s, &xs[0], &xs[1], &frac_s);
+   lp_build_sample_wrap_linear(bld, t, height, bld->static_state->pot_height,
+                               bld->static_state->wrap_t, &ys[0], &ys[1], &frac_t);
+
+   /* fetch the 2x2 footprint, row by row */
+   for(row = 0; row < 2; ++row)
+      for(col = 0; col < 2; ++col)
+         lp_build_sample_texel_soa(bld, width, height, xs[col], ys[row],
+                                   stride, data_ptr, taps[row][col]);
+
+   /* TODO: Don't interpolate missing channels */
+   for(chan = 0; chan < 4; ++chan) {
+      texel[chan] = lp_build_lerp_2d(&bld->texel_bld,
+                                     frac_s, frac_t,
+                                     taps[0][0][chan],
+                                     taps[0][1][chan],
+                                     taps[1][0][chan],
+                                     taps[1][1][chan]);
+   }
+}
+
+
+static void
+lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder,
+                          struct lp_type dst_type,
+                          LLVMValueRef packed,
+                          LLVMValueRef *rgba)
+{
+   /* Split each packed element into four 8-bit channels, converted to float. */
+   const LLVMValueRef byte_mask = lp_build_int_const_scalar(dst_type, 0xff);
+   unsigned chan;
+
+   /* One channel per iteration, lowest byte first. */
+   for (chan = 0; chan < 4; ++chan) {
+      const unsigned shift = chan*8;
+      LLVMValueRef bits = packed;
+
+      /* move the channel down to bits 0..7 (channel 0 is already there) */
+      if(shift != 0)
+         bits = LLVMBuildLShr(builder, bits,
+                              lp_build_int_const_scalar(dst_type, shift), "");
+
+      /* the channel ending at bit 32 is left unmasked (assumes 32-bit elements) */
+      if(shift + 8 < 32)
+         bits = LLVMBuildAnd(builder, bits, byte_mask, "");
+
+      /* 8-bit unsigned normalized -> float */
+      rgba[chan] = lp_build_unsigned_norm_to_float(builder, 8, dst_type, bits);
+   }
+}
+
+/* Bilinear 2D sample, AoS path: packed 32-bit texels are fetched and blended in 8.8 fixed point. */
+static void
+lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
+                              LLVMValueRef s,
+                              LLVMValueRef t,
+                              LLVMValueRef width,
+                              LLVMValueRef height,
+                              LLVMValueRef stride,
+                              LLVMValueRef data_ptr,
+                              LLVMValueRef *texel)
+{
+   LLVMBuilderRef builder = bld->builder;
+   struct lp_build_context i32, h16, u8n;
+   LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type;
+   LLVMValueRef i32_c8, i32_c128, i32_c255;
+   LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi;
+   LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi;
+   LLVMValueRef x0, x1;
+   LLVMValueRef y0, y1;
+   LLVMValueRef neighbors[2][2];
+   LLVMValueRef neighbors_lo[2][2];
+   LLVMValueRef neighbors_hi[2][2];
+   LLVMValueRef packed, packed_lo, packed_hi;
+   LLVMValueRef unswizzled[4];
+
+   lp_build_context_init(&i32, builder, lp_type_int(32));
+   lp_build_context_init(&h16, builder, lp_type_ufixed(16));
+   lp_build_context_init(&u8n, builder, lp_type_unorm(8));
+
+   i32_vec_type = lp_build_vec_type(i32.type);
+   h16_vec_type = lp_build_vec_type(h16.type);
+   u8n_vec_type = lp_build_vec_type(u8n.type);
+
+   if (bld->static_state->normalized_coords) {   /* scale s/t by the texture size */
+      LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type);
+      LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width, coord_vec_type, "");
+      LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height, coord_vec_type, "");
+      s = lp_build_mul(&bld->coord_bld, s, fp_width);
+      t = lp_build_mul(&bld->coord_bld, t, fp_height);
+   }
+
+   /* scale coords by 256 (8 fractional bits) */
+   s = lp_build_mul_imm(&bld->coord_bld, s, 256);
+   t = lp_build_mul_imm(&bld->coord_bld, t, 256);
+
+   /* convert float to int */
+   s = LLVMBuildFPToSI(builder, s, i32_vec_type, "");
+   t = LLVMBuildFPToSI(builder, t, i32_vec_type, "");
+
+   /* subtract 0.5 (add -128) */
+   i32_c128 = lp_build_int_const_scalar(i32.type, -128);   /* holds -128 despite its name */
+   s = LLVMBuildAdd(builder, s, i32_c128, "");
+   t = LLVMBuildAdd(builder, t, i32_c128, "");
+
+   /* compute floor (shift right 8) */
+   i32_c8 = lp_build_int_const_scalar(i32.type, 8);
+   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
+   t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
+
+   /* compute fractional part (AND with 0xff) */
+   i32_c255 = lp_build_int_const_scalar(i32.type, 255);
+   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
+   t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
+
+   x0 = s_ipart;
+   y0 = t_ipart;
+
+   x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one);
+   y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one);
+
+   x0 = lp_build_sample_wrap_int(bld, x0, width,  bld->static_state->pot_width,
+                                 bld->static_state->wrap_s);
+   y0 = lp_build_sample_wrap_int(bld, y0, height, bld->static_state->pot_height,
+                                 bld->static_state->wrap_t);
+
+   x1 = lp_build_sample_wrap_int(bld, x1, width,  bld->static_state->pot_width,
+                                 bld->static_state->wrap_s);
+   y1 = lp_build_sample_wrap_int(bld, y1, height, bld->static_state->pot_height,
+                                 bld->static_state->wrap_t);
+
+   /*
+    * Transform 4 x i32 in
+    *
+    *   s_fpart = {s0, s1, s2, s3}
+    *
+    * into 8 x i16
+    *
+    *   s_fpart = {00, s0, 00, s1, 00, s2, 00, s3}
+    *
+    * into two 8 x i16
+    *
+    *   s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1}
+    *   s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3}
+    *
+    * and likewise for t_fpart. There is no risk of losing precision here
+    * since the fractional parts only use the lower 8bits.
+    */
+
+   s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, "");
+   t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, "");
+
+   {
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH];
+      LLVMValueRef shuffle_lo;
+      LLVMValueRef shuffle_hi;
+      unsigned i, j;
+
+      for(j = 0; j < h16.type.length; j += 4) {
+         unsigned subindex = util_cpu_caps.little_endian ? 0 : 1;   /* i16 half of each i32 holding the fraction */
+         LLVMValueRef index;
+
+         index = LLVMConstInt(elem_type, j/2 + subindex, 0);
+         for(i = 0; i < 4; ++i)
+            shuffles_lo[j + i] = index;
+
+         index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0);
+         for(i = 0; i < 4; ++i)
+            shuffles_hi[j + i] = index;
+      }
+
+      shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length);
+      shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length);
+
+      s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_lo, "");
+      t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_lo, "");
+      s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_hi, "");
+      t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, "");
+   }
+
+   /*
+    * Fetch the pixels as 4 x 32bit (rgba order might differ):
+    *
+    *   rgba0 rgba1 rgba2 rgba3
+    *
+    * bit cast them into 16 x u8
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * unpack them into two 8 x i16:
+    *
+    *   r0 g0 b0 a0 r1 g1 b1 a1
+    *   r2 g2 b2 a2 r3 g3 b3 a3
+    *
+    * The higher 8 bits of the resulting elements will be zero.
+    */
+
+   neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_ptr);
+   neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_ptr);
+   neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_ptr);
+   neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_ptr);
+
+   neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, "");
+   neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, "");
+   neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, "");
+   neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, "");
+
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]);
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]);
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]);
+   lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]);
+
+   /*
+    * Linear interpolate with 8.8 fixed point.
+    */
+
+   packed_lo = lp_build_lerp_2d(&h16,
+                                s_fpart_lo, t_fpart_lo,
+                                neighbors_lo[0][0],
+                                neighbors_lo[0][1],
+                                neighbors_lo[1][0],
+                                neighbors_lo[1][1]);
+
+   packed_hi = lp_build_lerp_2d(&h16,
+                                s_fpart_hi, t_fpart_hi,
+                                neighbors_hi[0][0],
+                                neighbors_hi[0][1],
+                                neighbors_hi[1][0],
+                                neighbors_hi[1][1]);
+
+   packed = lp_build_pack2(builder, h16.type, u8n.type, packed_lo, packed_hi);
+
+   /*
+    * Convert to SoA and swizzle.
+    */
+
+   packed = LLVMBuildBitCast(builder, packed, i32_vec_type, "");
+
+   lp_build_rgba8_to_f32_soa(bld->builder,
+                             bld->texel_type,
+                             packed, unswizzled);
+
+   lp_build_format_swizzle_soa(bld->format_desc,
+                               bld->texel_type, unswizzled,
+                               texel);
+}
+
+
+static void
+lp_build_sample_compare(struct lp_build_sample_context *bld,
+                        LLVMValueRef p,
+                        LLVMValueRef *texel)
+{
+   /* Shadow compare: overwrite 'texel' with the result of testing p against it. */
+   struct lp_build_context *tb = &bld->texel_bld;
+   LLVMValueRef sum = NULL;
+   unsigned chan;
+
+   /* samplers without a compare mode keep the fetched texel untouched */
+   if(bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE)
+      return;
+
+   /* TODO: Compare before swizzling, to avoid redundant computations */
+   for(chan = 0; chan < 4; ++chan) {
+      /* 1.0 where the comparison passes for this channel, 0.0 elsewhere */
+      LLVMValueRef pass = lp_build_cmp(tb, bld->static_state->compare_func,
+                                       p, texel[chan]);
+      pass = lp_build_select(tb, pass, tb->one, tb->zero);
+      sum = sum ? lp_build_add(tb, sum, pass) : pass;
+   }
+
+   /* mean of the four per-channel results */
+   assert(sum);
+   sum = lp_build_mul(tb, sum, lp_build_const_scalar(tb->type, 0.25));
+
+   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
+   texel[0] = sum;
+   texel[1] = sum;
+   texel[2] = sum;
+   texel[3] = tb->one;
+}
+
+
+/**
+ * Build texture sampling code for sampler 'unit'; coords[0..2] are read as s, t, p.
+ * 'texel' will return a vector of four LLVMValueRefs corresponding to
+ * R, G, B, A.  Note: num_coords and lodbias are not used by this function.
+ */
+void
+lp_build_sample_soa(LLVMBuilderRef builder,
+                    const struct lp_sampler_static_state *static_state,
+                    struct lp_sampler_dynamic_state *dynamic_state,
+                    struct lp_type type,
+                    unsigned unit,
+                    unsigned num_coords,
+                    const LLVMValueRef *coords,
+                    LLVMValueRef lodbias,
+                    LLVMValueRef *texel)
+{
+   struct lp_build_sample_context bld;
+   LLVMValueRef width;
+   LLVMValueRef height;
+   LLVMValueRef stride;
+   LLVMValueRef data_ptr;
+   LLVMValueRef s;
+   LLVMValueRef t;
+   LLVMValueRef p;
+
+   /* Setup our build context: coords and texels both use 'type' */
+   memset(&bld, 0, sizeof bld);
+   bld.builder = builder;
+   bld.static_state = static_state;
+   bld.dynamic_state = dynamic_state;
+   bld.format_desc = util_format_description(static_state->format);
+   bld.coord_type = type;
+   bld.uint_coord_type = lp_uint_type(type);
+   bld.int_coord_type = lp_int_type(type);
+   bld.texel_type = type;
+   lp_build_context_init(&bld.coord_bld, builder, bld.coord_type);
+   lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type);
+   lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type);
+   lp_build_context_init(&bld.texel_bld, builder, bld.texel_type);
+
+   /* Get the dynamic state (values supplied by the caller's callbacks) */
+   width = dynamic_state->width(dynamic_state, builder, unit);
+   height = dynamic_state->height(dynamic_state, builder, unit);
+   stride = dynamic_state->stride(dynamic_state, builder, unit);
+   data_ptr = dynamic_state->data_ptr(dynamic_state, builder, unit);
+
+   s = coords[0];
+   t = coords[1];
+   p = coords[2];   /* compare reference; read regardless of num_coords */
+
+   width = lp_build_broadcast_scalar(&bld.uint_coord_bld, width);
+   height = lp_build_broadcast_scalar(&bld.uint_coord_bld, height);
+   stride = lp_build_broadcast_scalar(&bld.uint_coord_bld, stride);
+
+   if(static_state->target == PIPE_TEXTURE_1D)
+      t = bld.coord_bld.zero;   /* 1D textures ignore the incoming t */
+
+   switch (static_state->min_img_filter) {
+   case PIPE_TEX_FILTER_NEAREST:
+      lp_build_sample_2d_nearest_soa(&bld, s, t, width, height,
+                                     stride, data_ptr, texel);
+      break;
+   case PIPE_TEX_FILTER_LINEAR:
+      if(lp_format_is_rgba8(bld.format_desc) &&
+         is_simple_wrap_mode(static_state->wrap_s) &&
+         is_simple_wrap_mode(static_state->wrap_t))
+         lp_build_sample_2d_linear_aos(&bld, s, t, width, height,
+                                       stride, data_ptr, texel);
+      else
+         lp_build_sample_2d_linear_soa(&bld, s, t, width, height,
+                                       stride, data_ptr, texel);
+      break;
+   default:
+      assert(0);
+   }
+
+   /* FIXME: respect static_state->min_mip_filter */;
+   /* FIXME: respect static_state->mag_img_filter */;
+
+   lp_build_sample_compare(&bld, p, texel);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.c b/src/gallium/auxiliary/gallivm/lp_bld_struct.c
new file mode 100644
index 00000000000..3998ac374fe
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.c
@@ -0,0 +1,72 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/**
+ * @file
+ * Helper functions for manipulating structures.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+
+#include "lp_bld_debug.h"
+#include "lp_bld_struct.h"
+
+
+LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder,
+                        LLVMValueRef ptr,
+                        unsigned member,
+                        const char *name)
+{
+   LLVMTypeRef i32 = LLVMInt32Type();
+   LLVMValueRef path[2], field_ptr;
+   path[0] = LLVMConstInt(i32, 0, 0);        /* step through ptr itself */
+   path[1] = LLVMConstInt(i32, member, 0);   /* then select the member */
+   field_ptr = LLVMBuildGEP(builder, ptr, path, Elements(path), "");
+   lp_build_name(field_ptr, "%s.%s_ptr", LLVMGetValueName(ptr), name);
+   return field_ptr;
+}
+
+
+LLVMValueRef
+lp_build_struct_get(LLVMBuilderRef builder,
+                    LLVMValueRef ptr,
+                    unsigned member,
+                    const char *name)
+{
+   /* address the field first, then load through that address */
+   LLVMValueRef field_ptr = lp_build_struct_get_ptr(builder, ptr, member, name);
+   LLVMValueRef value = LLVMBuildLoad(builder, field_ptr, "");
+
+   lp_build_name(value, "%s.%s", LLVMGetValueName(ptr), name);
+   return value;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.h b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
new file mode 100644
index 00000000000..740392f5611
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.h
@@ -0,0 +1,75 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for manipulating structures.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#ifndef LP_BLD_STRUCT_H
+#define LP_BLD_STRUCT_H
+
+
+#include <llvm-c/Core.h>  
+#include <llvm-c/Target.h>
+
+#include "util/u_debug.h"
+#include "util/u_memory.h"
+
+/* Assert that LLVM's ABI size of _ltype on _ltarget equals sizeof(_ctype). */
+#define LP_CHECK_STRUCT_SIZE(_ctype, _ltarget, _ltype) \
+      assert(LLVMABISizeOfType(_ltarget, _ltype) == \
+             sizeof(_ctype))
+/* Assert that LLVM places element _lindex of _ltype at offsetof(_ctype, _cmember). */
+#define LP_CHECK_MEMBER_OFFSET(_ctype, _cmember, _ltarget, _ltype, _lindex) \
+      assert(LLVMOffsetOfElement(_ltarget, _ltype, _lindex) == \
+             offsetof(_ctype, _cmember))
+
+
+/**
+ * Build a GEP to field number 'member' of the struct 'ptr' points to; 'name' labels the result.
+ */
+LLVMValueRef
+lp_build_struct_get_ptr(LLVMBuilderRef builder,
+                        LLVMValueRef ptr,
+                        unsigned member,
+                        const char *name);
+
+/**
+ * Load field number 'member' of the struct 'ptr' points to; 'name' labels the loaded value.
+ */
+LLVMValueRef
+lp_build_struct_get(LLVMBuilderRef builder,
+                    LLVMValueRef ptr,
+                    unsigned member,
+                    const char *name);
+
+
+#endif /* !LP_BLD_STRUCT_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
new file mode 100644
index 00000000000..64e81f7b1fe
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c
@@ -0,0 +1,239 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for swizzling/shuffling.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+
+
+/**
+ * Build a vector of type vec_type with every element set to scalar.
+ *
+ * Starts from an undefined vector and emits one insertelement per lane.
+ */
+LLVMValueRef
+lp_build_broadcast(LLVMBuilderRef builder,
+                   LLVMTypeRef vec_type,
+                   LLVMValueRef scalar)
+{
+   const unsigned length = LLVMGetVectorSize(vec_type);
+   LLVMValueRef vec = LLVMGetUndef(vec_type);
+   unsigned lane = 0;
+
+   while(lane < length) {
+      LLVMValueRef lane_index = LLVMConstInt(LLVMInt32Type(), lane, 0);
+      vec = LLVMBuildInsertElement(builder, vec, scalar, lane_index, "");
+      ++lane;
+   }
+
+   return vec;
+}
+
+
+/**
+ * Build a vector of the build context's type with every element set to
+ * scalar, starting from bld->undef and inserting one lane at a time.
+ */
+LLVMValueRef
+lp_build_broadcast_scalar(struct lp_build_context *bld,
+                          LLVMValueRef scalar)
+{
+   const struct lp_type vec_type = bld->type;
+   LLVMValueRef vec = bld->undef;
+   unsigned lane = 0;
+
+   while(lane < vec_type.length) {
+      LLVMValueRef lane_index = LLVMConstInt(LLVMInt32Type(), lane, 0);
+      vec = LLVMBuildInsertElement(bld->builder, vec, scalar, lane_index, "");
+      ++lane;
+   }
+
+   return vec;
+}
+
+
+/**
+ * Broadcast one channel of every XYZW group of a into all four channels of
+ * that group.
+ *
+ * If a is the context's undef, zero or one value it is returned unchanged.
+ * Vectors of up to four elements use a shuffle; longer vectors mask out the
+ * selected channel and replicate it with two shift-and-or steps performed on
+ * elements four times as wide.
+ *
+ * @param channel  channel to broadcast, in the [0,4[ range
+ */
+LLVMValueRef
+lp_build_broadcast_aos(struct lp_build_context *bld,
+                       LLVMValueRef a,
+                       unsigned channel)
+{
+   const struct lp_type type = bld->type;
+   const unsigned n = type.length;
+   unsigned i, j;
+
+   /* channel indexes the four-entry cond[] and shifts[] tables below */
+   assert(channel < 4);
+
+   if(a == bld->undef || a == bld->zero || a == bld->one)
+      return a;
+
+   /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing
+    * using shuffles here actually causes worse results. More investigation is
+    * needed. */
+   if (n <= 4) {
+      /*
+       * Shuffle.
+       */
+      LLVMTypeRef elem_type = LLVMInt32Type();
+      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
+
+      for(j = 0; j < n; j += 4)
+         for(i = 0; i < 4; ++i)
+            shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0);
+
+      return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(shuffles, n), "");
+   }
+   else {
+      /*
+       * Bit mask and recursive shifts
+       *
+       *   XYZW XYZW .... XYZW  <= input
+       *   0Y00 0Y00 .... 0Y00
+       *   YY00 YY00 .... YY00
+       *   YYYY YYYY .... YYYY  <= output
+       */
+      struct lp_type type4 = type;
+      /*
+       * Shift amounts, in channels, for the two replication steps. The sign
+       * selects the direction, so the table must use a signed type; plain
+       * char may be unsigned on some targets.
+       */
+      const int shifts[4][2] = {
+         { 1,  2},
+         {-1,  2},
+         { 1, -2},
+         {-1, -2}
+      };
+      boolean cond[4];
+
+      memset(cond, 0, sizeof cond);
+      cond[channel] = 1;
+
+      /* keep only the selected channel of every group */
+      a = LLVMBuildAnd(bld->builder, a, lp_build_const_mask_aos(type, cond), "");
+
+      /* view each XYZW group as a single element four times as wide */
+      type4.width *= 4;
+      type4.length /= 4;
+
+      a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type4), "");
+
+      for(i = 0; i < 2; ++i) {
+         LLVMValueRef tmp = NULL;
+         int shift = shifts[channel][i];
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+         shift = -shift;
+#endif
+
+         if(shift > 0)
+            tmp = LLVMBuildLShr(bld->builder, a, lp_build_int_const_scalar(type4, shift*type.width), "");
+         if(shift < 0)
+            tmp = LLVMBuildShl(bld->builder, a, lp_build_int_const_scalar(type4, -shift*type.width), "");
+
+         assert(tmp);
+         if(tmp)
+            a = LLVMBuildOr(bld->builder, a, tmp, "");
+      }
+
+      return LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type), "");
+   }
+}
+
+
+/**
+ * Swizzle the channels of every XYZW group of a.
+ *
+ * The context's undef, zero and one values are returned unchanged, and a
+ * swizzle that selects the same channel four times is delegated to
+ * lp_build_broadcast_aos(). Everything else becomes a single shuffle.
+ */
+LLVMValueRef
+lp_build_swizzle1_aos(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      const unsigned char swizzle[4])
+{
+   const unsigned length = bld->type.length;
+   LLVMValueRef indices[LP_MAX_VECTOR_LENGTH];
+   LLVMTypeRef index_type;
+   unsigned base, chan;
+
+   if(a == bld->undef || a == bld->zero || a == bld->one)
+      return a;
+
+   if(swizzle[0] == swizzle[1] && swizzle[1] == swizzle[2] && swizzle[2] == swizzle[3])
+      return lp_build_broadcast_aos(bld, a, swizzle[0]);
+
+   /* same per-group selection, offset by the start of each group */
+   index_type = LLVMInt32Type();
+   for(base = 0; base < length; base += 4) {
+      for(chan = 0; chan < 4; ++chan) {
+         indices[base + chan] = LLVMConstInt(index_type, base + swizzle[chan], 0);
+      }
+   }
+
+   return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(indices, length), "");
+}
+
+
+/**
+ * Swizzle the channels of every XYZW group, picking from two vectors.
+ *
+ * Entries below 4 select a channel of a, entries of 4 and above select
+ * channel (entry % 4) of b. Cheaper forms are tried first: a single-vector
+ * swizzle when only a is referenced or when a and b are the same value, and
+ * lp_build_select_aos() when every channel stays in its own position.
+ */
+LLVMValueRef
+lp_build_swizzle2_aos(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      LLVMValueRef b,
+                      const unsigned char swizzle[4])
+{
+   const unsigned length = bld->type.length;
+   unsigned refs_b = 0;     /* entries that select from b */
+   unsigned in_place = 0;   /* entries that keep their own channel position */
+   unsigned base, chan;
+
+   for(chan = 0; chan < 4; ++chan) {
+      if(swizzle[chan] >= 4)
+         ++refs_b;
+      if(swizzle[chan] % 4 == chan)
+         ++in_place;
+   }
+
+   if(refs_b == 0)
+      return lp_build_swizzle1_aos(bld, a, swizzle);
+
+   if(a == b) {
+      unsigned char folded[4];
+      for(chan = 0; chan < 4; ++chan)
+         folded[chan] = swizzle[chan] % 4;
+      return lp_build_swizzle1_aos(bld, a, folded);
+   }
+
+   if(in_place == 4) {
+      boolean cond[4];
+      for(chan = 0; chan < 4; ++chan)
+         cond[chan] = swizzle[chan] / 4;
+      return lp_build_select_aos(bld, a, b, cond);
+   }
+
+   {
+      /*
+       * Shuffle. Indices of length and above address the second operand.
+       */
+      LLVMTypeRef index_type = LLVMInt32Type();
+      LLVMValueRef indices[LP_MAX_VECTOR_LENGTH];
+
+      for(base = 0; base < length; base += 4) {
+         for(chan = 0; chan < 4; ++chan) {
+            indices[base + chan] = LLVMConstInt(index_type, base + (swizzle[chan] % 4) + (swizzle[chan] / 4 * length), 0);
+         }
+      }
+
+      return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(indices, length), "");
+   }
+}
+
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
new file mode 100644
index 00000000000..b9472127a63
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h
@@ -0,0 +1,91 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Helper functions for swizzling/shuffling.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#ifndef LP_BLD_SWIZZLE_H
+#define LP_BLD_SWIZZLE_H
+
+
+#include <llvm-c/Core.h>  
+
+
+struct lp_type;
+struct lp_build_context;
+
+
+/**
+ * Build a vector of type vec_type with all elements set to scalar.
+ */
+LLVMValueRef
+lp_build_broadcast(LLVMBuilderRef builder,
+                   LLVMTypeRef vec_type,
+                   LLVMValueRef scalar);
+
+
+/**
+ * Build a vector of the build context's type with all elements set to
+ * scalar.
+ */
+LLVMValueRef
+lp_build_broadcast_scalar(struct lp_build_context *bld,
+                          LLVMValueRef scalar);
+
+
+/**
+ * Broadcast one channel of a vector composed of arrays of XYZW structures into
+ * all four channels.
+ */
+LLVMValueRef
+lp_build_broadcast_aos(struct lp_build_context *bld,
+                       LLVMValueRef a,
+                       unsigned channel);
+
+
+/**
+ * Swizzle a vector consisting of an array of XYZW structs.
+ *
+ * @param swizzle  each entry is in the [0,4[ range.
+ */
+LLVMValueRef
+lp_build_swizzle1_aos(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      const unsigned char swizzle[4]);
+
+
+/**
+ * Swizzle two vectors, each consisting of an array of XYZW structs.
+ *
+ * @param swizzle  each entry is in the [0,8[ range. Values in the [4,8[ range refer to b.
+ */
+LLVMValueRef
+lp_build_swizzle2_aos(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      LLVMValueRef b,
+                      const unsigned char swizzle[4]);
+
+
+#endif /* !LP_BLD_SWIZZLE_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
new file mode 100644
index 00000000000..eddb7a83fa2
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -0,0 +1,84 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * TGSI to LLVM IR translation.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#ifndef LP_BLD_TGSI_H
+#define LP_BLD_TGSI_H
+
+#include <llvm-c/Core.h>
+
+
+struct tgsi_token;
+struct lp_type;
+struct lp_build_context;
+struct lp_build_mask_context;
+
+
+/**
+ * Sampler code generation interface.
+ *
+ * Although texture sampling is a requirement for TGSI translation, it is
+ * a very different problem with several different approaches to it. This
+ * structure establishes an interface for texture sampling code generation, so
+ * that we can easily use different texture sampling strategies.
+ */
+struct lp_build_sampler_soa
+{
+   /*
+    * NOTE(review): presumably releases the sampler generator; no caller is
+    * visible in the translation code reviewed here -- confirm.
+    */
+   void
+   (*destroy)( struct lp_build_sampler_soa *sampler );
+
+   /*
+    * Generate the code for one texel fetch.
+    *
+    * emit_tex() in lp_bld_tgsi_soa.c calls this with its builder and vector
+    * type, the texture unit, the number of coordinates the texture target
+    * uses, an array of three coordinate values (entries past num_coords are
+    * undef), the LOD bias value (zero when no bias applies), and the
+    * caller-supplied texel pointer.
+    */
+   void
+   (*emit_fetch_texel)( struct lp_build_sampler_soa *sampler,
+                        LLVMBuilderRef builder,
+                        struct lp_type type,
+                        unsigned unit,
+                        unsigned num_coords,
+                        const LLVMValueRef *coords,
+                        LLVMValueRef lodbias,
+                        LLVMValueRef *texel);
+};
+
+
+/**
+ * TGSI to LLVM IR translation entry point (SoA form).
+ *
+ * NOTE(review): the definition is not visible here. The parameters
+ * presumably populate the identically named fields of
+ * struct lp_build_tgsi_soa_context in lp_bld_tgsi_soa.c (consts_ptr, pos,
+ * inputs, outputs, sampler, mask) -- confirm against the definition.
+ */
+void
+lp_build_tgsi_soa(LLVMBuilderRef builder,
+                  const struct tgsi_token *tokens,
+                  struct lp_type type,
+                  struct lp_build_mask_context *mask,
+                  LLVMValueRef consts_ptr,
+                  const LLVMValueRef *pos,
+                  const LLVMValueRef (*inputs)[4],
+                  LLVMValueRef (*outputs)[4],
+                  struct lp_build_sampler_soa *sampler);
+
+
+#endif /* LP_BLD_TGSI_H */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
new file mode 100644
index 00000000000..5f2c2a54ee9
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -0,0 +1,1595 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * TGSI to LLVM IR translation -- SoA.
+ *
+ * @author Jose Fonseca <[email protected]>
+ *
+ * Based on tgsi_sse2.c code written by Michal Krol, Keith Whitwell,
+ * Brian Paul, and others.
+ */
+
+#include "pipe/p_config.h"
+#include "pipe/p_shader_tokens.h"
+#include "util/u_debug.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "tgsi/tgsi_info.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_exec.h"
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+#include "lp_bld_arit.h"
+#include "lp_bld_logic.h"
+#include "lp_bld_swizzle.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_tgsi.h"
+#include "lp_bld_debug.h"
+
+
+/* Number of rows in the temps[] and immediates[] tables of the context. */
+#define LP_MAX_TEMPS 256
+#define LP_MAX_IMMEDIATES 256
+
+
+/* Loop header iterating CHAN over [0, NUM_CHANNELS[. */
+#define FOR_EACH_CHANNEL( CHAN )\
+   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
+
+/* Non-zero when bit CHAN is set in the write mask of destination 0. */
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST)->Dst[0].Register.WriteMask & (1 << (CHAN)))
+
+/* Bare `if` header; the statement following the macro becomes its body. */
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+/*
+ * Expands to a `for` immediately followed by an `if`, so the statement after
+ * the macro runs once per channel enabled in destination 0's write mask.
+ */
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+
+/* Channel indices. */
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+/* Element positions within a quad, used by the swizzle_* tables below. */
+#define QUAD_TOP_LEFT     0
+#define QUAD_TOP_RIGHT    1
+#define QUAD_BOTTOM_LEFT  2
+#define QUAD_BOTTOM_RIGHT 3
+
+/* Capacity of lp_exec_mask::cond_stack. */
+#define LP_TGSI_MAX_NESTING 16
+
+/**
+ * Execution mask state, used by lp_exec_mask_store() to predicate register
+ * stores.
+ */
+struct lp_exec_mask {
+   /* build context supplying the builder and the vector type */
+   struct lp_build_context *bld;
+
+   /* when FALSE, lp_exec_mask_store() stores unconditionally */
+   boolean has_mask;
+
+   /* lp_build_int_vec_type() of bld->type; the masks are cast to this type */
+   LLVMTypeRef int_vec_type;
+
+   /* cond_mask values saved by lp_exec_mask_cond_push() */
+   LLVMValueRef cond_stack[LP_TGSI_MAX_NESTING];
+   /* number of entries in use in cond_stack */
+   int cond_stack_size;
+   /* mask of the innermost conditional currently being emitted */
+   LLVMValueRef cond_mask;
+
+   /* mask applied to stores; copied from cond_mask by lp_exec_mask_update() */
+   LLVMValueRef exec_mask;
+};
+
+/**
+ * State shared by the emit_* helpers while translating one shader.
+ */
+struct lp_build_tgsi_soa_context
+{
+   struct lp_build_context base;
+
+   /* base pointer emit_fetch() indexes with Index*4 + channel for constants */
+   LLVMValueRef consts_ptr;
+   const LLVMValueRef *pos;
+   /* input values, indexed [register][channel] and used directly */
+   const LLVMValueRef (*inputs)[NUM_CHANNELS];
+   /* output storage pointers, allocated by emit_declaration() */
+   LLVMValueRef (*outputs)[NUM_CHANNELS];
+
+   /* texture sampling code generator used by emit_tex() */
+   struct lp_build_sampler_soa *sampler;
+
+   /* immediate values, indexed [register][channel] and used directly */
+   LLVMValueRef immediates[LP_MAX_IMMEDIATES][NUM_CHANNELS];
+   /* temporary storage pointers, allocated by emit_declaration() */
+   LLVMValueRef temps[LP_MAX_TEMPS][NUM_CHANNELS];
+
+   /* mask updated by emit_kil() */
+   struct lp_build_mask_context *mask;
+   /* execution mask applied to register stores */
+   struct lp_exec_mask exec_mask;
+};
+
+/*
+ * Quad swizzles passed to lp_build_swizzle1_aos() by emit_ddx() and
+ * emit_ddy(). Each table replicates one column or one row of the quad.
+ */
+
+/* left element of each row: used by emit_ddx() */
+static const unsigned char
+swizzle_left[4] = {
+   QUAD_TOP_LEFT,     QUAD_TOP_LEFT,
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_LEFT
+};
+
+/* right element of each row: used by emit_ddx() */
+static const unsigned char
+swizzle_right[4] = {
+   QUAD_TOP_RIGHT,    QUAD_TOP_RIGHT,
+   QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT
+};
+
+/* top element of each column: used by emit_ddy() */
+static const unsigned char
+swizzle_top[4] = {
+   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT,
+   QUAD_TOP_LEFT,     QUAD_TOP_RIGHT
+};
+
+/* bottom element of each column: used by emit_ddy() */
+static const unsigned char
+swizzle_bottom[4] = {
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT,
+   QUAD_BOTTOM_LEFT,  QUAD_BOTTOM_RIGHT
+};
+
+/**
+ * Initialize an execution mask for the given build context.
+ *
+ * The mask starts with no active conditional, so stores are unmasked.
+ * cond_mask and exec_mask start as all ones (every channel enabled) so that
+ * the value saved by the first lp_exec_mask_cond_push() and restored by the
+ * matching lp_exec_mask_cond_pop() is well defined.
+ */
+static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld)
+{
+   mask->bld = bld;
+   mask->has_mask = FALSE;
+   mask->cond_stack_size = 0;
+
+   mask->int_vec_type = lp_build_int_vec_type(mask->bld->type);
+
+   mask->cond_mask = LLVMConstAllOnes(mask->int_vec_type);
+   mask->exec_mask = mask->cond_mask;
+}
+
+/**
+ * Recompute the derived state after cond_mask or the stack depth changed.
+ *
+ * exec_mask follows cond_mask, and has_mask tracks whether at least one
+ * conditional is active. has_mask must drop back to FALSE once the last
+ * conditional has been popped; otherwise every later store would keep
+ * selecting with whatever value was restored from the bottom of the stack.
+ */
+static void lp_exec_mask_update(struct lp_exec_mask *mask)
+{
+   mask->exec_mask = mask->cond_mask;
+   mask->has_mask = (mask->cond_stack_size > 0) ? TRUE : FALSE;
+}
+
+/**
+ * Enter a conditional whose condition is val.
+ *
+ * The current cond_mask is saved on the stack and replaced by val, cast to
+ * the integer vector type. When another conditional is already active the
+ * new mask is ANDed with the enclosing one, so channels disabled by the
+ * outer conditional stay disabled inside the inner one.
+ */
+static void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
+                                   LLVMValueRef val)
+{
+   LLVMValueRef enclosing_mask = mask->cond_mask;
+
+   /* cond_stack holds LP_TGSI_MAX_NESTING entries */
+   assert(mask->cond_stack_size < LP_TGSI_MAX_NESTING);
+   mask->cond_stack[mask->cond_stack_size++] = enclosing_mask;
+
+   mask->cond_mask = LLVMBuildBitCast(mask->bld->builder, val,
+                                      mask->int_vec_type, "");
+
+   /* depth > 1 after the push means enclosing_mask belongs to a conditional */
+   if (mask->cond_stack_size > 1)
+      mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
+                                     enclosing_mask,
+                                     mask->cond_mask, "");
+
+   lp_exec_mask_update(mask);
+}
+
+/**
+ * Switch the innermost conditional to its opposite branch.
+ *
+ * The new cond_mask is the inverse of the current one, restricted to the
+ * mask saved when the conditional was entered. For the outermost
+ * conditional the saved entry carries no condition, so all ones is used
+ * instead.
+ */
+static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask)
+{
+   LLVMValueRef prev_mask;
+   LLVMValueRef inv_mask;
+
+   /* must be inside a conditional; also keeps the stack index below >= 0 */
+   assert(mask->cond_stack_size > 0);
+
+   inv_mask = LLVMBuildNot(mask->bld->builder, mask->cond_mask, "");
+
+   if (mask->cond_stack_size > 1) {
+      prev_mask = mask->cond_stack[mask->cond_stack_size - 1];
+   } else {
+      /* means that we didn't have any mask before and that
+       * we were fully enabled */
+      prev_mask = LLVMConstAllOnes(mask->int_vec_type);
+   }
+
+   mask->cond_mask = LLVMBuildAnd(mask->bld->builder,
+                                  inv_mask,
+                                  prev_mask, "");
+   lp_exec_mask_update(mask);
+}
+
+/**
+ * Leave the innermost conditional, restoring the cond_mask that was saved
+ * by the matching lp_exec_mask_cond_push().
+ */
+static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask)
+{
+   /* a pop without a matching push would index cond_stack at -1 */
+   assert(mask->cond_stack_size > 0);
+
+   mask->cond_mask = mask->cond_stack[--mask->cond_stack_size];
+   lp_exec_mask_update(mask);
+}
+
+/**
+ * Store val through the pointer dst, honouring the execution mask.
+ *
+ * While a mask is active the current contents of dst are loaded and
+ * lp_build_select() chooses between val and those contents using exec_mask;
+ * otherwise val is stored directly.
+ */
+static void lp_exec_mask_store(struct lp_exec_mask *mask,
+                               LLVMValueRef val,
+                               LLVMValueRef dst)
+{
+   LLVMValueRef stored = val;
+
+   if (mask->has_mask) {
+      LLVMValueRef current = LLVMBuildLoad(mask->bld->builder, dst, "");
+
+      stored = lp_build_select(mask->bld,
+                               mask->exec_mask,
+                               val, current);
+   }
+
+   LLVMBuildStore(mask->bld->builder, stored, dst);
+}
+
+
+/**
+ * Horizontal difference within a quad: the right element of each row minus
+ * the left element of the same row.
+ */
+static LLVMValueRef
+emit_ddx(struct lp_build_tgsi_soa_context *bld,
+         LLVMValueRef src)
+{
+   struct lp_build_context *base = &bld->base;
+   LLVMValueRef left  = lp_build_swizzle1_aos(base, src, swizzle_left);
+   LLVMValueRef right = lp_build_swizzle1_aos(base, src, swizzle_right);
+
+   return lp_build_sub(base, right, left);
+}
+
+
+/**
+ * Vertical difference within a quad: the top element of each column minus
+ * the bottom element of the same column.
+ */
+static LLVMValueRef
+emit_ddy(struct lp_build_tgsi_soa_context *bld,
+         LLVMValueRef src)
+{
+   struct lp_build_context *base = &bld->base;
+   LLVMValueRef top    = lp_build_swizzle1_aos(base, src, swizzle_top);
+   LLVMValueRef bottom = lp_build_swizzle1_aos(base, src, swizzle_bottom);
+
+   return lp_build_sub(base, top, bottom);
+}
+
+
+/**
+ * Register fetch.
+ *
+ * Returns the value of one channel of source operand `index` of inst, after
+ * applying the operand's swizzle and sign mode.
+ *
+ * Constants are loaded from consts_ptr and broadcast to a vector,
+ * immediates and inputs are taken from the context tables, and temporaries
+ * are loaded from their storage. Unsupported register files or swizzles
+ * assert and yield undef.
+ *
+ * @param index       source operand number within inst
+ * @param chan_index  destination channel being computed; the operand's
+ *                    swizzle maps it to the source channel actually read
+ */
+static LLVMValueRef
+emit_fetch(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   const unsigned chan_index )
+{
+   const struct tgsi_full_src_register *reg = &inst->Src[index];
+   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
+   LLVMValueRef res;
+
+   switch (swizzle) {
+   case TGSI_SWIZZLE_X:
+   case TGSI_SWIZZLE_Y:
+   case TGSI_SWIZZLE_Z:
+   case TGSI_SWIZZLE_W:
+
+      switch (reg->Register.File) {
+      case TGSI_FILE_CONSTANT: {
+         /* constants are laid out as four consecutive scalars per register */
+         LLVMValueRef const_index = LLVMConstInt(LLVMInt32Type(), reg->Register.Index*4 + swizzle, 0);
+         LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, &const_index, 1, "");
+         LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, "");
+         res = lp_build_broadcast_scalar(&bld->base, scalar);
+         break;
+      }
+
+      case TGSI_FILE_IMMEDIATE:
+         res = bld->immediates[reg->Register.Index][swizzle];
+         assert(res);
+         break;
+
+      case TGSI_FILE_INPUT:
+         res = bld->inputs[reg->Register.Index][swizzle];
+         assert(res);
+         break;
+
+      case TGSI_FILE_TEMPORARY: {
+         /*
+          * Check the storage pointer before building the load: a load
+          * cannot be built from a missing pointer, so testing its result
+          * afterwards would come too late.
+          * NOTE(review): assumes slots of undeclared temporaries are NULL --
+          * confirm the context is zeroed by the caller.
+          */
+         LLVMValueRef temp_ptr = bld->temps[reg->Register.Index][swizzle];
+         if(!temp_ptr)
+            return bld->base.undef;
+         res = LLVMBuildLoad(bld->base.builder, temp_ptr, "");
+         break;
+      }
+
+      default:
+         assert( 0 );
+         return bld->base.undef;
+      }
+      break;
+
+   default:
+      assert( 0 );
+      return bld->base.undef;
+   }
+
+   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      res = lp_build_abs( &bld->base, res );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      /* TODO: Use bitwise OR for floating point */
+      res = lp_build_abs( &bld->base, res );
+      res = LLVMBuildNeg( bld->base.builder, res, "" );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      res = LLVMBuildNeg( bld->base.builder, res, "" );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+
+   return res;
+}
+
+
+/**
+ * Register fetch with derivatives.
+ *
+ * Fetches the operand channel once and reports, for every non-NULL output
+ * pointer, the fetched value (res) and its emit_ddx() / emit_ddy()
+ * differences (ddx, ddy).
+ */
+static void
+emit_fetch_deriv(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   const unsigned chan_index,
+   LLVMValueRef *res,
+   LLVMValueRef *ddx,
+   LLVMValueRef *ddy)
+{
+   LLVMValueRef value = emit_fetch(bld, inst, index, chan_index);
+
+   if(res != NULL)
+      *res = value;
+
+   /* TODO: use interpolation coeffs for inputs */
+
+   if(ddx != NULL)
+      *ddx = emit_ddx(bld, value);
+
+   if(ddy != NULL)
+      *ddy = emit_ddy(bld, value);
+}
+
+
+/**
+ * Register store.
+ *
+ * Applies the instruction's saturate mode to value, then writes it to one
+ * channel of destination operand `index` through lp_exec_mask_store(). Only
+ * OUTPUT and TEMPORARY destinations are implemented; other register files
+ * assert and store nothing.
+ */
+static void
+emit_store(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   unsigned index,
+   unsigned chan_index,
+   LLVMValueRef value)
+{
+   const struct tgsi_full_dst_register *reg = &inst->Dst[index];
+   LLVMValueRef lower_bound = NULL;
+   LLVMValueRef dst_ptr;
+
+   /* both saturate modes clamp to an upper bound of one */
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      lower_bound = bld->base.zero;
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      lower_bound = lp_build_const_scalar(bld->base.type, -1.0);
+      break;
+
+   default:
+      assert(0);
+   }
+
+   if(lower_bound) {
+      value = lp_build_max(&bld->base, value, lower_bound);
+      value = lp_build_min(&bld->base, value, bld->base.one);
+   }
+
+   switch( reg->Register.File ) {
+   case TGSI_FILE_OUTPUT:
+      dst_ptr = bld->outputs[reg->Register.Index][chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      dst_ptr = bld->temps[reg->Register.Index][chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      /* FIXME */
+      assert(0);
+      return;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   lp_exec_mask_store(&bld->exec_mask, value, dst_ptr);
+}
+
+
+/**
+ * High-level instruction translators.
+ */
+
+
+/**
+ * Emit a texture fetch through the sampler code generator.
+ *
+ * Coordinates come from source operand 0 and the texture unit from the
+ * register index of source operand 1. Coordinate slots beyond what the
+ * texture target needs are passed as undef.
+ *
+ * @param apply_lodbias  when set, the W channel of source operand 0 is
+ *                       passed as the LOD bias; otherwise zero is passed
+ * @param projected      when set, every coordinate is multiplied by the
+ *                       reciprocal of the W channel of source operand 0
+ * @param texel          forwarded to the sampler's emit_fetch_texel hook
+ */
+static void
+emit_tex( struct lp_build_tgsi_soa_context *bld,
+          const struct tgsi_full_instruction *inst,
+          boolean apply_lodbias,
+          boolean projected,
+          LLVMValueRef *texel)
+{
+   const uint unit = inst->Src[1].Register.Index;
+   LLVMValueRef coords[3];
+   LLVMValueRef lodbias;
+   LLVMValueRef oow = NULL;
+   unsigned num_coords;
+   unsigned i;
+
+   switch (inst->Texture.Texture) {
+   case TGSI_TEXTURE_1D:
+      num_coords = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      num_coords = 2;
+      break;
+   case TGSI_TEXTURE_SHADOW1D:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+      num_coords = 3;
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   lodbias = apply_lodbias ? emit_fetch( bld, inst, 0, CHAN_W ) : bld->base.zero;
+
+   /* one over w, computed once and shared by all coordinates */
+   if (projected)
+      oow = lp_build_rcp(&bld->base, emit_fetch( bld, inst, 0, CHAN_W ));
+
+   for (i = 0; i < 3; i++) {
+      if (i >= num_coords) {
+         coords[i] = bld->base.undef;
+         continue;
+      }
+
+      coords[i] = emit_fetch( bld, inst, 0, i );
+      if (projected)
+         coords[i] = lp_build_mul(&bld->base, coords[i], oow);
+   }
+
+   bld->sampler->emit_fetch_texel(bld->sampler,
+                                  bld->base.builder,
+                                  bld->base.type,
+                                  unit, num_coords, coords, lodbias,
+                                  texel);
+}
+
+
+/**
+ * Update the context's mask from source operand 0.
+ *
+ * Each distinct source component selected by the operand's swizzle is
+ * fetched once and compared with PIPE_FUNC_GEQUAL against zero. The
+ * per-component results are ANDed together and handed to
+ * lp_build_mask_update().
+ */
+static void
+emit_kil(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst )
+{
+   const struct tgsi_full_src_register *reg = &inst->Src[0];
+   LLVMValueRef terms[NUM_CHANNELS];
+   LLVMValueRef mask = NULL;
+   unsigned chan_index;
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      terms[chan_index] = NULL;
+   }
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      /* Unswizzle channel */
+      const unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
+
+      assert(swizzle < NUM_CHANNELS);
+
+      /* Fetch each source component only the first time it is selected. */
+      if( terms[swizzle] == NULL ) {
+         /* TODO: change the comparison operator instead of setting the sign */
+         terms[swizzle] = emit_fetch(bld, inst, 0, chan_index );
+      }
+   }
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      LLVMValueRef chan_mask;
+
+      if(terms[chan_index] == NULL)
+         continue;
+
+      chan_mask = lp_build_cmp(&bld->base, PIPE_FUNC_GEQUAL, terms[chan_index], bld->base.zero);
+
+      mask = mask ? LLVMBuildAnd(bld->base.builder, mask, chan_mask, "") : chan_mask;
+   }
+
+   if(mask != NULL)
+      lp_build_mask_update(bld->mask, mask);
+}
+
+
+/**
+ * Report whether any source or destination operand of inst addresses the
+ * temporary register file indirectly.
+ *
+ * @return TRUE if such an operand exists, FALSE otherwise
+ */
+static boolean
+indirect_temp_reference(const struct tgsi_full_instruction *inst)
+{
+   uint reg_index;
+
+   for (reg_index = 0; reg_index < inst->Instruction.NumSrcRegs; reg_index++) {
+      const struct tgsi_full_src_register *src = &inst->Src[reg_index];
+
+      if (src->Register.Indirect &&
+          src->Register.File == TGSI_FILE_TEMPORARY)
+         return TRUE;
+   }
+
+   for (reg_index = 0; reg_index < inst->Instruction.NumDstRegs; reg_index++) {
+      const struct tgsi_full_dst_register *dst = &inst->Dst[reg_index];
+
+      if (dst->Register.Indirect &&
+          dst->Register.File == TGSI_FILE_TEMPORARY)
+         return TRUE;
+   }
+
+   return FALSE;
+}
+
+/**
+ * Allocate storage for the registers named by a declaration.
+ *
+ * For TEMPORARY and OUTPUT declarations one lp_build_alloca() slot is
+ * created per channel of every register in the declared range. Other
+ * register files need no storage.
+ *
+ * @return TRUE on success, FALSE if a temporary index does not fit in the
+ *         LP_MAX_TEMPS rows of bld->temps
+ */
+static int
+emit_declaration(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_declaration *decl)
+{
+   const unsigned first = decl->Range.First;
+   const unsigned last = decl->Range.Last;
+   unsigned idx, i;
+
+   for (idx = first; idx <= last; ++idx) {
+      switch (decl->Declaration.File) {
+      case TGSI_FILE_TEMPORARY:
+         /* refuse to write past the end of the fixed-size temps table */
+         if (idx >= LP_MAX_TEMPS)
+            return FALSE;
+         for (i = 0; i < NUM_CHANNELS; i++)
+            bld->temps[idx][i] = lp_build_alloca(&bld->base);
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            bld->outputs[idx][i] = lp_build_alloca(&bld->base);
+         break;
+
+      default:
+         /* don't need to declare other vars */
+         break;
+      }
+   }
+
+   return TRUE;
+}
+
+static int
+emit_instruction(
+   struct lp_build_tgsi_soa_context *bld,
+   const struct tgsi_full_instruction *inst,
+   const struct tgsi_opcode_info *info)
+{
+   unsigned chan_index;
+   LLVMValueRef src0, src1, src2;
+   LLVMValueRef tmp0, tmp1, tmp2;
+   LLVMValueRef tmp3 = NULL;
+   LLVMValueRef tmp4 = NULL;
+   LLVMValueRef tmp5 = NULL;
+   LLVMValueRef tmp6 = NULL;
+   LLVMValueRef tmp7 = NULL;
+   LLVMValueRef res;
+   LLVMValueRef dst0[NUM_CHANNELS];
+
+   /* we can't handle indirect addressing into temp register file yet */
+   if (indirect_temp_reference(inst))
+      return FALSE;
+
+   assert(info->num_dst <= 1);
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.undef;
+      }
+   }
+
+   switch (inst->Instruction.Opcode) {
+#if 0
+   case TGSI_OPCODE_ARL:
+      /* FIXME */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         emit_flr(bld, 0, 0);
+         emit_f2it( bld, 0 );
+         dst0[chan_index] = tmp0;
+      }
+      break;
+#endif
+
+   case TGSI_OPCODE_MOV:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = emit_fetch( bld, inst, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ) {
+         dst0[CHAN_X] = bld->base.one;
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
+         src0 = emit_fetch( bld, inst, 0, CHAN_X );
+         dst0[CHAN_Y] = lp_build_max( &bld->base, src0, bld->base.zero);
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
+         /* XMM[1] = SrcReg[0].yyyy */
+         tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
+         /* XMM[1] = max(XMM[1], 0) */
+         tmp1 = lp_build_max( &bld->base, tmp1, bld->base.zero);
+         /* XMM[2] = SrcReg[0].wwww */
+         tmp2 = emit_fetch( bld, inst, 0, CHAN_W );
+         tmp1 = lp_build_pow( &bld->base, tmp1, tmp2);
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+         tmp2 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, tmp0, bld->base.zero);
+         dst0[CHAN_Z] = lp_build_select(&bld->base, tmp2, tmp1, bld->base.zero);
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) ) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      src0 = emit_fetch( bld, inst, 0, CHAN_X );
+      res = lp_build_rcp(&bld->base, src0);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = res;
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      src0 = emit_fetch( bld, inst, 0, CHAN_X );
+      src0 = lp_build_abs(&bld->base, src0);
+      res = lp_build_rsqrt(&bld->base, src0);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = res;
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
+         LLVMValueRef *p_exp2_int_part = NULL;
+         LLVMValueRef *p_frac_part = NULL;
+         LLVMValueRef *p_exp2 = NULL;
+
+         src0 = emit_fetch( bld, inst, 0, CHAN_X );
+
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
+            p_exp2_int_part = &tmp0;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
+            p_frac_part = &tmp1;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
+            p_exp2 = &tmp2;
+
+         lp_build_exp2_approx(&bld->base, src0, p_exp2_int_part, p_frac_part, p_exp2);
+
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
+            dst0[CHAN_X] = tmp0;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
+            dst0[CHAN_Y] = tmp1;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
+            dst0[CHAN_Z] = tmp2;
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_LOG:
+      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z )) {
+         LLVMValueRef *p_floor_log2 = NULL;
+         LLVMValueRef *p_exp = NULL;
+         LLVMValueRef *p_log2 = NULL;
+
+         src0 = emit_fetch( bld, inst, 0, CHAN_X );
+         src0 = lp_build_abs( &bld->base, src0 );
+
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
+            p_floor_log2 = &tmp0;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ))
+            p_exp = &tmp1;
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
+            p_log2 = &tmp2;
+
+         lp_build_log2_approx(&bld->base, src0, p_exp, p_floor_log2, p_log2);
+
+         /* dst.x = floor(lg2(abs(src.x))) */
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ))
+            dst0[CHAN_X] = tmp0;
+         /* dst.y = abs(src)/ex2(floor(lg2(abs(src.x)))) */
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y )) {
+            dst0[CHAN_Y] = lp_build_div( &bld->base, src0, tmp1);
+         }
+         /* dst.z = lg2(abs(src.x)) */
+         if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ))
+            dst0[CHAN_Z] = tmp2;
+      }
+      /* dst.w = 1.0 */
+      if (IS_DST0_CHANNEL_ENABLED( inst, CHAN_W )) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_mul(&bld->base, src0, src1);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_add(&bld->base, src0, src1);
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_DP4:
+   /* TGSI_OPCODE_DOT4 */
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_W );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_W );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
+         dst0[CHAN_X] = bld->base.one;
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
+         tmp1 = emit_fetch( bld, inst, 1, CHAN_Y );
+         dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp0, tmp1);
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
+         dst0[CHAN_Z] = emit_fetch( bld, inst, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
+         dst0[CHAN_W] = emit_fetch( bld, inst, 1, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_min( &bld->base, src0, src1 );
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_max( &bld->base, src0, src1 );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GEQUAL, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         tmp1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
+         tmp0 = lp_build_add( &bld->base, tmp0, tmp2);
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         tmp1 = emit_fetch( bld, inst, 1, chan_index );
+         dst0[chan_index] = lp_build_sub( &bld->base, tmp0, tmp1);
+      }
+      break;
+
+   case TGSI_OPCODE_LRP:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_sub( &bld->base, src1, src2 );
+         tmp0 = lp_build_mul( &bld->base, src0, tmp0 );
+         dst0[chan_index] = lp_build_add( &bld->base, tmp0, src2 );
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp1 = lp_build_const_scalar(bld->base.type, 0.5);
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src2, tmp1);
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src0, src1 );
+      }
+      break;
+
+   case TGSI_OPCODE_DP2A:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
+      tmp1 = emit_fetch( bld, inst, 2, CHAN_X );  /* xmm1 = src[2].x */
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
+      }
+      break;
+
+   case TGSI_OPCODE_FRC:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         tmp0 = lp_build_floor(&bld->base, src0);
+         tmp0 = lp_build_sub(&bld->base, src0, tmp0);
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_max(&bld->base, tmp0, src1);
+         tmp0 = lp_build_min(&bld->base, tmp0, src2);
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_FLR:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_floor(&bld->base, tmp0);
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_round(&bld->base, tmp0);
+      }
+      break;
+
+   case TGSI_OPCODE_EX2: {
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp0 = lp_build_exp2( &bld->base, tmp0);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+   }
+
+   case TGSI_OPCODE_LG2:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp0 = lp_build_log2( &bld->base, tmp0);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_POW:
+      src0 = emit_fetch( bld, inst, 0, CHAN_X );
+      src1 = emit_fetch( bld, inst, 1, CHAN_X );
+      res = lp_build_pow( &bld->base, src0, src1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = res;
+      }
+      break;
+
+   case TGSI_OPCODE_XPD:
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ) {
+         tmp1 = emit_fetch( bld, inst, 1, CHAN_Z );
+         tmp3 = emit_fetch( bld, inst, 0, CHAN_Z );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_Y );
+         tmp4 = emit_fetch( bld, inst, 1, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
+         tmp2 = tmp0;
+         tmp2 = lp_build_mul( &bld->base, tmp2, tmp1);
+         tmp5 = tmp3;
+         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
+         tmp2 = lp_build_sub( &bld->base, tmp2, tmp5);
+         dst0[CHAN_X] = tmp2;
+      }
+      if( IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) ) {
+         tmp2 = emit_fetch( bld, inst, 1, CHAN_X );
+         tmp5 = emit_fetch( bld, inst, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
+         tmp3 = lp_build_mul( &bld->base, tmp3, tmp2);
+         tmp1 = lp_build_mul( &bld->base, tmp1, tmp5);
+         tmp3 = lp_build_sub( &bld->base, tmp3, tmp1);
+         dst0[CHAN_Y] = tmp3;
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
+         tmp5 = lp_build_mul( &bld->base, tmp5, tmp4);
+         tmp0 = lp_build_mul( &bld->base, tmp0, tmp2);
+         tmp5 = lp_build_sub( &bld->base, tmp5, tmp0);
+         dst0[CHAN_Z] = tmp5;
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_ABS:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_abs( &bld->base, tmp0 );
+      }
+      break;
+
+   case TGSI_OPCODE_RCC:
+      /* deprecated? */
+      assert(0);
+      return 0;
+
+   case TGSI_OPCODE_DPH:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Z );
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Z );
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_W );
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp0 = lp_build_cos( &bld->base, tmp0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, &dst0[chan_index], NULL);
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_fetch_deriv( bld, inst, 0, chan_index, NULL, NULL, &dst0[chan_index]);
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      /* predicated kill */
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* conditional kill */
+      emit_kil( bld, inst );
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_EQUAL, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.zero;
+      }
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_GREATER, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+      tmp0 = lp_build_sin( &bld->base, tmp0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LEQUAL, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_NOTEQUAL, src0, src1 );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, bld->base.one, bld->base.zero );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* XXX what about dst0 writemask? */
+      emit_tex( bld, inst, FALSE, FALSE, dst0 );
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      /* deprecated */
+      assert (0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_X2D:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARA:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+#if 0
+   case TGSI_OPCODE_ARR:
+      /* FIXME */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         emit_rnd( bld, 0, 0 );
+         emit_f2it( bld, 0 );
+         dst0[chan_index] = tmp0;
+      }
+      break;
+#endif
+
+   case TGSI_OPCODE_BRA:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RET:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_END:
+      break;
+
+   case TGSI_OPCODE_SSG:
+   /* TGSI_OPCODE_SGN */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_sgn( &bld->base, tmp0 );
+      }
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         src0 = emit_fetch( bld, inst, 0, chan_index );
+         src1 = emit_fetch( bld, inst, 1, chan_index );
+         src2 = emit_fetch( bld, inst, 2, chan_index );
+         tmp0 = lp_build_cmp( &bld->base, PIPE_FUNC_LESS, src0, bld->base.zero );
+         dst0[chan_index] = lp_build_select( &bld->base, tmp0, src1, src2);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_X ) {
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+         dst0[CHAN_X] = lp_build_cos( &bld->base, tmp0 );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Y ) {
+         tmp0 = emit_fetch( bld, inst, 0, CHAN_X );
+         dst0[CHAN_Y] = lp_build_sin( &bld->base, tmp0 );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_Z ) {
+         dst0[CHAN_Z] = bld->base.zero;
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( inst, CHAN_W ) {
+         dst0[CHAN_W] = bld->base.one;
+      }
+      break;
+
+   case TGSI_OPCODE_TXB:
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
+      break;
+
+   case TGSI_OPCODE_NRM:
+      /* fall-through */
+   case TGSI_OPCODE_NRM4:
+      /* 3 or 4-component normalization */
+      {
+         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
+
+         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) ||
+             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y) ||
+             IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z) ||
+             (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 4)) {
+
+            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
+
+            /* xmm4 = src.x */
+            /* xmm0 = src.x * src.x */
+            tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
+               tmp4 = tmp0;
+            }
+            tmp0 = lp_build_mul( &bld->base, tmp0, tmp0);
+
+            /* xmm5 = src.y */
+            /* xmm0 = xmm0 + src.y * src.y */
+            tmp1 = emit_fetch(bld, inst, 0, CHAN_Y);
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
+               tmp5 = tmp1;
+            }
+            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
+            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+
+            /* xmm6 = src.z */
+            /* xmm0 = xmm0 + src.z * src.z */
+            tmp1 = emit_fetch(bld, inst, 0, CHAN_Z);
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
+               tmp6 = tmp1;
+            }
+            tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
+            tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+
+            if (dims == 4) {
+               /* xmm7 = src.w */
+               /* xmm0 = xmm0 + src.w * src.w */
+               tmp1 = emit_fetch(bld, inst, 0, CHAN_W);
+               if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W)) {
+                  tmp7 = tmp1;
+               }
+               tmp1 = lp_build_mul( &bld->base, tmp1, tmp1);
+               tmp0 = lp_build_add( &bld->base, tmp0, tmp1);
+            }
+
+            /* xmm1 = 1 / sqrt(xmm0) */
+            tmp1 = lp_build_rsqrt( &bld->base, tmp0);
+
+            /* dst.x = xmm1 * src.x */
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X)) {
+               dst0[CHAN_X] = lp_build_mul( &bld->base, tmp4, tmp1);
+            }
+
+            /* dst.y = xmm1 * src.y */
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Y)) {
+               dst0[CHAN_Y] = lp_build_mul( &bld->base, tmp5, tmp1);
+            }
+
+            /* dst.z = xmm1 * src.z */
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_Z)) {
+               dst0[CHAN_Z] = lp_build_mul( &bld->base, tmp6, tmp1);
+            }
+
+            /* dst.w = xmm1 * src.w */
+            if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_X) && dims == 4) {
+               dst0[CHAN_W] = lp_build_mul( &bld->base, tmp7, tmp1);
+            }
+         }
+
+         /* dst.w = 1.0 */
+         if (IS_DST0_CHANNEL_ENABLED(inst, CHAN_W) && dims == 3) {
+            dst0[CHAN_W] = bld->base.one;
+         }
+      }
+      break;
+
+   case TGSI_OPCODE_DIV:
+      /* deprecated */
+      assert( 0 );
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DP2:
+      tmp0 = emit_fetch( bld, inst, 0, CHAN_X );  /* xmm0 = src[0].x */
+      tmp1 = emit_fetch( bld, inst, 1, CHAN_X );  /* xmm1 = src[1].x */
+      tmp0 = lp_build_mul( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 * xmm1 */
+      tmp1 = emit_fetch( bld, inst, 0, CHAN_Y );  /* xmm1 = src[0].y */
+      tmp2 = emit_fetch( bld, inst, 1, CHAN_Y );  /* xmm2 = src[1].y */
+      tmp1 = lp_build_mul( &bld->base, tmp1, tmp2);              /* xmm1 = xmm1 * xmm2 */
+      tmp0 = lp_build_add( &bld->base, tmp0, tmp1);              /* xmm0 = xmm0 + xmm1 */
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         dst0[chan_index] = tmp0;  /* dest[ch] = xmm0 */
+      }
+      break;
+
+   case TGSI_OPCODE_TXL:
+      emit_tex( bld, inst, TRUE, FALSE, dst0 );
+      break;
+
+   case TGSI_OPCODE_TXP:
+      emit_tex( bld, inst, FALSE, TRUE, dst0 );
+      break;
+      
+   case TGSI_OPCODE_BRK:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_IF:
+      tmp0 = emit_fetch(bld, inst, 0, CHAN_X);
+      lp_exec_mask_cond_push(&bld->exec_mask, tmp0);
+      break;
+
+   case TGSI_OPCODE_BGNFOR:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_REP:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      lp_exec_mask_cond_invert(&bld->exec_mask);
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      lp_exec_mask_cond_pop(&bld->exec_mask);
+      break;
+
+   case TGSI_OPCODE_ENDFOR:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+      /* deprecated */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PUSHA:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_POPA:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_ceil(&bld->base, tmp0);
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NOT:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         tmp0 = emit_fetch( bld, inst, 0, chan_index );
+         dst0[chan_index] = lp_build_trunc(&bld->base, tmp0);
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ISHR:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_AND:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_OR:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_MOD:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_XOR:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SAD:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXF:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      /* deprecated? */
+      assert(0);
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* FIXME */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      return 0;
+   }
+   
+   if(info->num_dst) {
+      FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
+         emit_store( bld, inst, 0, chan_index, dst0[chan_index]);
+      }
+   }
+
+   return 1;
+}
+
+/** Walk a TGSI token stream and emit LLVM code for each declaration, instruction and immediate. */
+void
+lp_build_tgsi_soa(LLVMBuilderRef builder,             /* handed to lp_build_context_init() */
+                  const struct tgsi_token *tokens,    /* token stream that is parsed below */
+                  struct lp_type type,                /* used for the build context and for immediate constants */
+                  struct lp_build_mask_context *mask, /* stored in bld.mask */
+                  LLVMValueRef consts_ptr,            /* stored in bld.consts_ptr */
+                  const LLVMValueRef *pos,            /* stored in bld.pos */
+                  const LLVMValueRef (*inputs)[NUM_CHANNELS],  /* stored in bld.inputs */
+                  LLVMValueRef (*outputs)[NUM_CHANNELS],       /* stored in bld.outputs */
+                  struct lp_build_sampler_soa *sampler)        /* stored in bld.sampler */
+{
+   struct lp_build_tgsi_soa_context bld;
+   struct tgsi_parse_context parse;
+   uint num_immediates = 0;   /* index of the next bld.immediates[] row to fill */
+   unsigned i;
+
+   /* Set up the build context: zero it, then record the caller-supplied values */
+   memset(&bld, 0, sizeof bld);
+   lp_build_context_init(&bld.base, builder, type);
+   bld.mask = mask;
+   bld.pos = pos;
+   bld.inputs = inputs;
+   bld.outputs = outputs;
+   bld.consts_ptr = consts_ptr;
+   bld.sampler = sampler;
+
+   lp_exec_mask_init(&bld.exec_mask, &bld.base);   /* exec_mask is what the IF/ELSE/ENDIF opcodes push, invert and pop */
+
+   tgsi_parse_init( &parse, tokens );
+
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );   /* the token just parsed is read from parse.FullToken below */
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         /* Inputs already interpolated; a failed declaration is only logged and parsing continues */
+         {
+            if (!emit_declaration( &bld, &parse.FullToken.FullDeclaration ))
+               _debug_printf("warning: failed to define LLVM variable\n");
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         {
+            unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
+            const struct tgsi_opcode_info *info = tgsi_get_opcode_info(opcode);
+            if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, info ))   /* 0 means not translated: warn and keep going */
+               _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
+                             info ? info->mnemonic : "<invalid>");   /* the message copes with info == NULL */
+         }
+
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* Copy the immediate values into the next immediates[] slot; components not supplied become undef */
+         {
+            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;   /* number of values carried by this immediate */
+            assert(size <= 4);   /* NOTE(review): the only guard on the component index below -- confirm assert stays active in release builds */
+            assert(num_immediates < LP_MAX_IMMEDIATES);   /* NOTE(review): likewise the only guard on the row index */
+            for( i = 0; i < size; ++i )
+               bld.immediates[num_immediates][i] =
+                  lp_build_const_scalar(type, parse.FullToken.FullImmediate.u[i].Float);   /* each value is read through its Float member */
+            for( i = size; i < 4; ++i )
+               bld.immediates[num_immediates][i] = bld.base.undef;
+            num_immediates++;
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_PROPERTY:   /* property tokens are accepted and ignored */
+         break;
+
+      default:   /* any other token type is treated as a programming error */
+         assert( 0 );
+      }
+   }
+
+   tgsi_parse_free( &parse );
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c
new file mode 100644
index 00000000000..c327ba045a6
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c
@@ -0,0 +1,242 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_debug.h"
+
+#include "lp_bld_type.h"
+#include "lp_bld_const.h"
+
+
+LLVMTypeRef
+lp_build_elem_type(struct lp_type type)
+{
+   if (type.floating) {
+      switch(type.width) {
+      case 32:
+         return LLVMFloatType();
+         break;
+      case 64:
+         return LLVMDoubleType();
+         break;
+      default:
+         assert(0);
+         return LLVMFloatType();
+      }
+   }
+   else {
+      return LLVMIntType(type.width);
+   }
+}
+
+
+LLVMTypeRef
+lp_build_vec_type(struct lp_type type)
+{
+   LLVMTypeRef elem_type = lp_build_elem_type(type);
+   return LLVMVectorType(elem_type, type.length);
+}
+
+
+/**
+ * Counterpart of lp_build_elem_type() above: checks whether elem_type has
+ * the LLVM type kind (and, for integers, the bit width) that type describes.
+ * XXX: It might be easier and more efficient to just recreate the type and
+ * check for identity.
+ */
+boolean
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type) 
+{
+   LLVMTypeKind elem_kind;
+
+   assert(elem_type);
+   if(!elem_type)
+      return FALSE;
+
+   elem_kind = LLVMGetTypeKind(elem_type);
+
+   if (type.floating) {
+      switch(type.width) {
+      case 32:
+         if(elem_kind != LLVMFloatTypeKind)
+            return FALSE;
+         break;
+      case 64:
+         if(elem_kind != LLVMDoubleTypeKind)
+            return FALSE;
+         break;
+      default:
+         assert(0);
+         return FALSE;
+      }
+   }
+   else {
+      if(elem_kind != LLVMIntegerTypeKind)
+         return FALSE;
+
+      if(LLVMGetIntTypeWidth(elem_type) != type.width)
+         return FALSE;
+   }
+
+   return TRUE; 
+}
+
+
+boolean
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type) 
+{
+   LLVMTypeRef elem_type;
+
+   assert(vec_type);
+   if(!vec_type)
+      return FALSE;
+
+   if(LLVMGetTypeKind(vec_type) != LLVMVectorTypeKind)
+      return FALSE;
+
+   if(LLVMGetVectorSize(vec_type) != type.length)
+      return FALSE;
+
+   elem_type = LLVMGetElementType(vec_type);
+
+   return lp_check_elem_type(type, elem_type);
+}
+
+
+boolean
+lp_check_value(struct lp_type type, LLVMValueRef val) 
+{
+   LLVMTypeRef vec_type;
+
+   assert(val);
+   if(!val)
+      return FALSE;
+
+   vec_type = LLVMTypeOf(val);
+
+   return lp_check_vec_type(type, vec_type);
+}
+
+
+LLVMTypeRef
+lp_build_int_elem_type(struct lp_type type)
+{
+   return LLVMIntType(type.width);
+}
+
+
+LLVMTypeRef
+lp_build_int_vec_type(struct lp_type type)
+{
+   LLVMTypeRef elem_type = lp_build_int_elem_type(type);
+   return LLVMVectorType(elem_type, type.length);
+}
+
+
+/**
+ * Build int32[4] vector type
+ */
+LLVMTypeRef
+lp_build_int32_vec4_type(void)
+{
+   struct lp_type t;
+   LLVMTypeRef type;
+
+   memset(&t, 0, sizeof(t));
+   t.floating = FALSE; /* integer values, not floating point */
+   t.sign = TRUE;      /* values are signed */
+   t.norm = FALSE;     /* values are not limited to [0,1] or [-1,1] */
+   t.width = 32;       /* 32-bit int */
+   t.length = 4;       /* 4 elements per vector */
+
+   type = lp_build_int_elem_type(t);
+   return LLVMVectorType(type, t.length);
+}
+
+
+/**
+ * Create unsigned integer type variation of given type.
+ */
+struct lp_type
+lp_uint_type(struct lp_type type)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = type.width;
+   res_type.length = type.length;
+
+   return res_type;
+}
+
+
+/**
+ * Create signed integer type variation of given type.
+ */
+struct lp_type
+lp_int_type(struct lp_type type)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = type.width;
+   res_type.length = type.length;
+   res_type.sign = 1;
+
+   return res_type;
+}
+
+
+/**
+ * Return the type with twice the bit width (hence half the number of elements).
+ */
+struct lp_type
+lp_wider_type(struct lp_type type)
+{
+   struct lp_type res_type;
+
+   memcpy(&res_type, &type, sizeof res_type);
+   res_type.width *= 2;
+   res_type.length /= 2;
+
+   assert(res_type.length);
+
+   return res_type;
+}
+
+
+void
+lp_build_context_init(struct lp_build_context *bld,
+                      LLVMBuilderRef builder,
+                      struct lp_type type)
+{
+   bld->builder = builder;
+   bld->type = type;
+   bld->undef = lp_build_undef(type);
+   bld->zero = lp_build_zero(type);
+   bld->one = lp_build_one(type);
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h
new file mode 100644
index 00000000000..16946cc28a2
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h
@@ -0,0 +1,277 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Convenient representation of SIMD types.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#ifndef LP_BLD_TYPE_H
+#define LP_BLD_TYPE_H
+
+
+#include <llvm-c/Core.h>  
+
+#include <pipe/p_compiler.h>
+
+
+/**
+ * Native SIMD register width, in bits.
+ *
+ * 128 for all architectures we care about.
+ */
+#define LP_NATIVE_VECTOR_WIDTH 128
+
+/**
+ * Several functions can only cope with vectors of length up to this value.
+ * You may need to increase that value if you want to represent bigger vectors.
+ */
+#define LP_MAX_VECTOR_LENGTH 16
+
+
+/**
+ * The LLVM type system can't conveniently express all the things we care about
+ * on the types used for intermediate computations, such as signed vs unsigned,
+ * normalized values, or fixed point.
+ */
+struct lp_type {
+   /**
+    * Floating-point. Cannot be used with fixed. Integer types have both
+    * this flag and fixed set to zero.
+    */
+   unsigned floating:1;
+
+   /**
+    * Fixed-point. Cannot be used with floating. Integer types have both
+    * this flag and floating set to zero.
+    */
+   unsigned fixed:1;
+
+   /**
+    * Whether it can represent negative values or not.
+    *
+    * If this is not set for floating point, it means that all values are
+    * assumed to be positive.
+    */
+   unsigned sign:1;
+
+   /**
+    * Whether values are normalized to fit [0, 1] interval, or [-1, 1]
+    * interval for signed types.
+    *
+    * For integer types it means the representable integer range should be
+    * interpreted as the interval above.
+    *
+    * For floating and fixed point formats it means the values should be
+    * clamped to the interval above.
+    */
+   unsigned norm:1;
+
+   /**
+    * Element width.
+    *
+    * For fixed point values, the fixed point is assumed to be at half the
+    * width.
+    */
+   unsigned width:14;
+
+   /**
+    * Vector length.
+    *
+    * width*length should be a power of two greater than or equal to eight.
+    *
+    * @sa LP_MAX_VECTOR_LENGTH
+    */
+   unsigned length:14;
+};
+
+
+/**
+ * We need most of the information here in order to correctly and efficiently
+ * translate an arithmetic operation into LLVM IR. Putting it here avoids the
+ * trouble of passing it as parameters.
+ */
+struct lp_build_context
+{
+   LLVMBuilderRef builder;
+
+   /**
+    * This not only describes the input/output LLVM types, but also whether
+    * to normalize/clamp the results.
+    */
+   struct lp_type type;
+
+   /** Same as lp_build_undef(type) */
+   LLVMValueRef undef;
+
+   /** Same as lp_build_zero(type) */
+   LLVMValueRef zero;
+
+   /** Same as lp_build_one(type) */
+   LLVMValueRef one;
+};
+
+
+static INLINE struct lp_type
+lp_type_float(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.floating = TRUE;
+   res_type.sign = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_int(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.sign = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_uint(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_unorm(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.norm = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_fixed(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.sign = TRUE;
+   res_type.fixed = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+static INLINE struct lp_type
+lp_type_ufixed(unsigned width)
+{
+   struct lp_type res_type;
+
+   memset(&res_type, 0, sizeof res_type);
+   res_type.fixed = TRUE;
+   res_type.width = width;
+   res_type.length = LP_NATIVE_VECTOR_WIDTH / width;
+
+   return res_type;
+}
+
+
+LLVMTypeRef
+lp_build_elem_type(struct lp_type type);
+
+
+LLVMTypeRef
+lp_build_vec_type(struct lp_type type);
+
+
+boolean
+lp_check_elem_type(struct lp_type type, LLVMTypeRef elem_type);
+
+
+boolean
+lp_check_vec_type(struct lp_type type, LLVMTypeRef vec_type);
+
+
+boolean
+lp_check_value(struct lp_type type, LLVMValueRef val);
+
+
+LLVMTypeRef
+lp_build_int_elem_type(struct lp_type type);
+
+
+LLVMTypeRef
+lp_build_int_vec_type(struct lp_type type);
+
+
+LLVMTypeRef
+lp_build_int32_vec4_type(void);
+
+
+struct lp_type
+lp_uint_type(struct lp_type type);
+
+
+struct lp_type
+lp_int_type(struct lp_type type);
+
+
+struct lp_type
+lp_wider_type(struct lp_type type);
+
+
+void
+lp_build_context_init(struct lp_build_context *bld,
+                      LLVMBuilderRef builder,
+                      struct lp_type type);
+
+
+#endif /* !LP_BLD_TYPE_H */
diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c
deleted file mode 100644
index cb85e1734ec..00000000000
--- a/src/gallium/auxiliary/gallivm/soabuiltins.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * This file is compiled with clang into the LLVM bitcode
-  *
-  * Authors:
-  *   Zack Rusin [email protected]
-  */
-typedef __attribute__(( ext_vector_type(4) )) float float4;
-
-
-extern float fabsf(float val);
-
-/* helpers */
-
-float4 absvec(float4 vec)
-{
-   float4 res;
-   res.x = fabsf(vec.x);
-   res.y = fabsf(vec.y);
-   res.z = fabsf(vec.z);
-   res.w = fabsf(vec.w);
-
-   return res;
-}
-
-float4 maxvec(float4 a, float4 b)
-{
-   return (float4){(a.x > b.x) ? a.x : b.x,
-         (a.y > b.y) ? a.y : b.y,
-         (a.z > b.z) ? a.z : b.z,
-         (a.w > b.w) ? a.w : b.w};
-}
-
-float4 minvec(float4 a, float4 b)
-{
-   return (float4){(a.x < b.x) ? a.x : b.x,
-         (a.y < b.y) ? a.y : b.y,
-         (a.z < b.z) ? a.z : b.z,
-         (a.w < b.w) ? a.w : b.w};
-}
-
-extern float powf(float num, float p);
-extern float sqrtf(float x);
-
-float4 powvec(float4 vec, float4 q)
-{
-   float4 p;
-   p.x = powf(vec.x, q.x);
-   p.y = powf(vec.y, q.y);
-   p.z = powf(vec.z, q.z);
-   p.w = powf(vec.w, q.w);
-   return p;
-}
-
-float4 sqrtvec(float4 vec)
-{
-   float4 p;
-   p.x = sqrtf(vec.x);
-   p.y = sqrtf(vec.y);
-   p.z = sqrtf(vec.z);
-   p.w = sqrtf(vec.w);
-   return p;
-}
-
-float4 sltvec(float4 v1, float4 v2)
-{
-   float4 p;
-   p.x = (v1.x < v2.x) ? 1.0 : 0.0;
-   p.y = (v1.y < v2.y) ? 1.0 : 0.0;
-   p.z = (v1.z < v2.z) ? 1.0 : 0.0;
-   p.w = (v1.w < v2.w) ? 1.0 : 0.0;
-   return p;
-}
-
-
-/* instructions */
-
-void abs(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
-{
-   res[0] = absvec(tmp0x);
-   res[1] = absvec(tmp0y);
-   res[2] = absvec(tmp0z);
-   res[3] = absvec(tmp0w);
-}
-
-void dp3(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
-{
-   float4 dot = (tmp0x * tmp1x) + (tmp0y * tmp1y) +
-                (tmp0z * tmp1z);
-
-   res[0] = dot;
-   res[1] = dot;
-   res[2] = dot;
-   res[3] = dot;
-}
-
-void dp4(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
-{
-   float4 dot = (tmp0x * tmp1x) + (tmp0y * tmp1y) +
-                (tmp0z * tmp1z) + (tmp0w * tmp1w);
-
-   res[0] = dot;
-   res[1] = dot;
-   res[2] = dot;
-   res[3] = dot;
-}
-
-void lit(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
-{
-   const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
-   const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
-   const float4 plus128 = (float4) {128.f,  128.f,  128.f,  128.f};
-
-   res[0] = (float4){1.0, 1.0, 1.0, 1.0};
-   if (tmp0x.x > 0) {
-      float4 tmpy = maxvec(tmp0y, zerovec);
-      float4 tmpw = minvec(tmp0w, plus128);
-      tmpw = maxvec(tmpw, min128);
-      res[1] = tmp0x;
-      res[2] = powvec(tmpy, tmpw);
-   } else {
-      res[1] = zerovec;
-      res[2] = zerovec;
-   }
-   res[3] = (float4){1.0, 1.0, 1.0, 1.0};
-}
-
-void min(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
-{
-   res[0] = minvec(tmp0x, tmp1x);
-   res[1] = minvec(tmp0y, tmp1y);
-   res[2] = minvec(tmp0z, tmp1z);
-   res[3] = minvec(tmp0w, tmp1w);
-}
-
-
-void max(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
-{
-   res[0] = maxvec(tmp0x, tmp1x);
-   res[1] = maxvec(tmp0y, tmp1y);
-   res[2] = maxvec(tmp0z, tmp1z);
-   res[3] = maxvec(tmp0w, tmp1w);
-}
-
-void pow(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
-{
-   res[0] = powvec(tmp0x, tmp1x);
-   res[1] = res[0];
-   res[2] = res[0];
-   res[3] = res[0];
-}
-
-void rsq(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
-{
-   const float4 onevec = (float4) {1., 1., 1., 1.};
-   res[0] = onevec/sqrtvec(absvec(tmp0x));
-   res[1] = onevec/sqrtvec(absvec(tmp0y));
-   res[2] = onevec/sqrtvec(absvec(tmp0z));
-   res[3] = onevec/sqrtvec(absvec(tmp0w));
-}
-
-void slt(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
-{
-   res[0] = sltvec(tmp0x, tmp1x);
-   res[1] = sltvec(tmp0y, tmp1y);
-   res[2] = sltvec(tmp0z, tmp1z);
-   res[3] = sltvec(tmp0w, tmp1w);
-}
-
diff --git a/src/gallium/auxiliary/gallivm/storage.cpp b/src/gallium/auxiliary/gallivm/storage.cpp
deleted file mode 100644
index 73df24c9769..00000000000
--- a/src/gallium/auxiliary/gallivm/storage.cpp
+++ /dev/null
@@ -1,364 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Zack Rusin [email protected]
-  */
-#ifdef MESA_LLVM
-
-#include "storage.h"
-
-#include "gallivm_p.h"
-
-#include "pipe/p_shader_tokens.h"
-#include <llvm/BasicBlock.h>
-#include <llvm/Module.h>
-#include <llvm/Value.h>
-
-#include <llvm/CallingConv.h>
-#include <llvm/Constants.h>
-#include <llvm/DerivedTypes.h>
-#include <llvm/InstrTypes.h>
-#include <llvm/Instructions.h>
-
-using namespace llvm;
-
-Storage::Storage(llvm::BasicBlock *block, llvm::Value *input)
-   : m_block(block),
-     m_INPUT(input),
-     m_addrs(32),
-     m_idx(0)
-{
-   m_floatVecType = VectorType::get(Type::FloatTy, 4);
-   m_intVecType   = VectorType::get(IntegerType::get(32), 4);
-
-   m_undefFloatVec = UndefValue::get(m_floatVecType);
-   m_undefIntVec   = UndefValue::get(m_intVecType);
-   m_extSwizzleVec = 0;
-
-   m_numConsts = 0;
-}
-
-//can only build vectors with all members in the [0, 9] range
-llvm::Constant *Storage::shuffleMask(int vec)
-{
-   if (!m_extSwizzleVec) {
-      std::vector<Constant*> elems;
-      elems.push_back(ConstantFP::get(APFloat(0.f)));
-      elems.push_back(ConstantFP::get(APFloat(1.f)));
-      elems.push_back(ConstantFP::get(APFloat(0.f)));
-      elems.push_back(ConstantFP::get(APFloat(1.f)));
-      m_extSwizzleVec = ConstantVector::get(m_floatVecType, elems);
-   }
-
-   if (m_intVecs.find(vec) != m_intVecs.end()) {
-      return m_intVecs[vec];
-   }
-   int origVec = vec;
-   Constant* const_vec = 0;
-   if (origVec == 0) {
-      const_vec = Constant::getNullValue(m_intVecType);
-   } else {
-      int x = gallivm_x_swizzle(vec);
-      int y = gallivm_y_swizzle(vec);
-      int z = gallivm_z_swizzle(vec);
-      int w = gallivm_w_swizzle(vec);
-      std::vector<Constant*> elems;
-      elems.push_back(constantInt(x));
-      elems.push_back(constantInt(y));
-      elems.push_back(constantInt(z));
-      elems.push_back(constantInt(w));
-      const_vec = ConstantVector::get(m_intVecType, elems);
-   }
-
-   m_intVecs[origVec] = const_vec;
-   return const_vec;
-}
-
-llvm::ConstantInt *Storage::constantInt(int idx)
-{
-   if (m_constInts.find(idx) != m_constInts.end()) {
-      return m_constInts[idx];
-   }
-   ConstantInt *const_int = ConstantInt::get(APInt(32,  idx));
-   m_constInts[idx] = const_int;
-   return const_int;
-}
-
-llvm::Value *Storage::inputElement(int idx, llvm::Value *indIdx)
-{
-   Value *val = element(InputsArg, idx, indIdx);
-   LoadInst *load = new LoadInst(val, name("input"), false, m_block);
-   load->setAlignment(8);
-
-   return load;
-}
-
-llvm::Value *Storage::constElement(int idx, llvm::Value *indIdx)
-{
-   m_numConsts = ((idx + 1) > m_numConsts) ? (idx + 1) : m_numConsts;
-
-   Value *elem = element(ConstsArg, idx, indIdx);
-   LoadInst *load = new LoadInst(elem, name("const"), false, m_block);
-   load->setAlignment(8);
-   return load;
-}
-
-llvm::Value *Storage::shuffleVector(llvm::Value *vec, int shuffle)
-{
-   Constant *mask = shuffleMask(shuffle);
-   ShuffleVectorInst *res =
-      new ShuffleVectorInst(vec, m_extSwizzleVec, mask,
-                            name("shuffle"), m_block);
-   return res;
-}
-
-
-llvm::Value *Storage::tempElement(int idx, llvm::Value *indIdx)
-{
-   Value *elem = element(TempsArg, idx, indIdx);
-
-   LoadInst *load = new LoadInst(elem, name("temp"), false, m_block);
-   load->setAlignment(8);
-
-   return load;
-}
-
-void Storage::setTempElement(int idx, llvm::Value *val, int mask)
-{
-   if (mask != TGSI_WRITEMASK_XYZW) {
-      llvm::Value *templ = 0;
-      if (m_tempWriteMap[idx])
-         templ = tempElement(idx);
-      val = maskWrite(val, mask, templ);
-   }
-   Value *elem = element(TempsArg, idx);
-   StoreInst *st = new StoreInst(val, elem, false, m_block);
-   st->setAlignment(8);
-   m_tempWriteMap[idx] = true;
-}
-
-void Storage::setOutputElement(int dstIdx, llvm::Value *val, int mask)
-{
-   if (mask != TGSI_WRITEMASK_XYZW) {
-      llvm::Value *templ = 0;
-      if (m_destWriteMap[dstIdx])
-         templ = outputElement(dstIdx);
-      val = maskWrite(val, mask, templ);
-   }
-
-   Value *elem = element(DestsArg, dstIdx);
-   StoreInst *st = new StoreInst(val, elem, false, m_block);
-   st->setAlignment(8);
-   m_destWriteMap[dstIdx] = true;
-}
-
-llvm::Value *Storage::maskWrite(llvm::Value *src, int mask, llvm::Value *templ)
-{
-   llvm::Value *dst = templ;
-   if (!dst)
-      dst = Constant::getNullValue(m_floatVecType);
-   if ((mask & TGSI_WRITEMASK_X)) {
-      llvm::Value *x = new ExtractElementInst(src, unsigned(0),
-                                              name("x"), m_block);
-      dst = InsertElementInst::Create(dst, x, unsigned(0),
-                                      name("dstx"), m_block);
-   }
-   if ((mask & TGSI_WRITEMASK_Y)) {
-      llvm::Value *y = new ExtractElementInst(src, unsigned(1),
-                                              name("y"), m_block);
-      dst = InsertElementInst::Create(dst, y, unsigned(1),
-                                      name("dsty"), m_block);
-   }
-   if ((mask & TGSI_WRITEMASK_Z)) {
-      llvm::Value *z = new ExtractElementInst(src, unsigned(2),
-                                              name("z"), m_block);
-      dst = InsertElementInst::Create(dst, z, unsigned(2),
-                                      name("dstz"), m_block);
-   }
-   if ((mask & TGSI_WRITEMASK_W)) {
-      llvm::Value *w = new ExtractElementInst(src, unsigned(3),
-                                              name("w"), m_block);
-      dst = InsertElementInst::Create(dst, w, unsigned(3),
-                                      name("dstw"), m_block);
-   }
-   return dst;
-}
-
-const char * Storage::name(const char *prefix)
-{
-   ++m_idx;
-   snprintf(m_name, 32, "%s%d", prefix, m_idx);
-   return m_name;
-}
-
-int Storage::numConsts() const
-{
-   return m_numConsts;
-}
-
-llvm::Value * Storage::addrElement(int idx) const
-{
-   Value *ret = m_addrs[idx];
-   if (!ret)
-      return m_undefFloatVec;
-   return ret;
-}
-
-void Storage::setAddrElement(int idx, llvm::Value *val, int mask)
-{
-   if (mask != TGSI_WRITEMASK_XYZW) {
-      llvm::Value *templ = m_addrs[idx];
-      val = maskWrite(val, mask, templ);
-   }
-   m_addrs[idx] = val;
-}
-
-llvm::Value * Storage::extractIndex(llvm::Value *vec)
-{
-   llvm::Value *x = new ExtractElementInst(vec, unsigned(0),
-                                           name("x"), m_block);
-   return new FPToSIInst(x, IntegerType::get(32), name("intidx"), m_block);
-}
-
-void Storage::setCurrentBlock(llvm::BasicBlock *block)
-{
-   m_block = block;
-}
-
-llvm::Value * Storage::outputElement(int idx, llvm::Value *indIdx)
-{
-   Value *elem = element(DestsArg, idx, indIdx);
-   LoadInst *load = new LoadInst(elem, name("output"), false, m_block);
-   load->setAlignment(8);
-
-   return load;
-}
-
-llvm::Value * Storage::inputPtr() const
-{
-   return m_INPUT;
-}
-
-void Storage::pushArguments(llvm::Value *input)
-{
-   m_argStack.push(m_INPUT);
-
-   m_INPUT = input;
-}
-
-void Storage::popArguments()
-{
-   m_INPUT = m_argStack.top();
-   m_argStack.pop();
-}
-
-void Storage::pushTemps()
-{
-   m_extSwizzleVec = 0;
-}
-
-void Storage::popTemps()
-{
-}
-
-llvm::Value * Storage::immediateElement(int idx)
-{
-   return m_immediates[idx];
-}
-
-void Storage::addImmediate(float *val)
-{
-   std::vector<Constant*> vec(4);
-   vec[0] = ConstantFP::get(APFloat(val[0]));
-   vec[1] = ConstantFP::get(APFloat(val[1]));
-   vec[2] = ConstantFP::get(APFloat(val[2]));
-   vec[3] = ConstantFP::get(APFloat(val[3]));
-   m_immediates.push_back(ConstantVector::get(m_floatVecType, vec));
-}
-
-
-llvm::Value * Storage::elemPtr(Args arg)
-{
-   std::vector<Value*> indices;
-   indices.push_back(constantInt(0));
-   indices.push_back(constantInt(static_cast<int>(arg)));
-   GetElementPtrInst *getElem = GetElementPtrInst::Create(m_INPUT,
-                                                          indices.begin(),
-                                                          indices.end(),
-                                                          name("input_ptr"),
-                                                          m_block);
-   return new LoadInst(getElem, name("input_field"), false, m_block);
-}
-
-llvm::Value * Storage::elemIdx(llvm::Value *ptr, int idx,
-                               llvm::Value *indIdx )
-{
-   GetElementPtrInst *getElem = 0;
-
-   if (indIdx) {
-      getElem = GetElementPtrInst::Create(ptr,
-                                      BinaryOperator::Create(Instruction::Add,
-                                                             indIdx,
-                                                             constantInt(idx),
-                                                             name("add"),
-                                                             m_block),
-                                      name("field"),
-                                      m_block);
-   } else {
-      getElem = GetElementPtrInst::Create(ptr,
-                                      constantInt(idx),
-                                      name("field"),
-                                      m_block);
-   }
-   return getElem;
-}
-
-llvm::Value * Storage::element(Args arg, int idx, llvm::Value *indIdx )
-{
-   Value *val = elemPtr(arg);
-   return elemIdx(val, idx, indIdx);
-}
-
-void Storage::setKilElement(llvm::Value *val)
-{
-   std::vector<Value*> indices;
-   indices.push_back(constantInt(0));
-   indices.push_back(constantInt(static_cast<int>(KilArg)));
-   GetElementPtrInst *elem = GetElementPtrInst::Create(m_INPUT,
-                                                   indices.begin(),
-                                                   indices.end(),
-                                                   name("kil_ptr"),
-                                                   m_block);
-   StoreInst *st = new StoreInst(val, elem, false, m_block);
-   st->setAlignment(8);
-}
-
-#endif //MESA_LLVM
-
-
diff --git a/src/gallium/auxiliary/gallivm/storage.h b/src/gallium/auxiliary/gallivm/storage.h
deleted file mode 100644
index 8574f7554e3..00000000000
--- a/src/gallium/auxiliary/gallivm/storage.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Zack Rusin [email protected]
-  */
-
-#ifndef STORAGE_H
-#define STORAGE_H
-
-#include <map>
-#include <set>
-#include <stack>
-#include <vector>
-
-namespace llvm {
-   class BasicBlock;
-   class Constant;
-   class ConstantInt;
-   class LoadInst;
-   class Value;
-   class VectorType;
-}
-
-class Storage
-{
-public:
-   Storage(llvm::BasicBlock *block,
-           llvm::Value *input);
-
-   llvm::Value *inputPtr() const;
-
-   void setCurrentBlock(llvm::BasicBlock *block);
-
-   llvm::ConstantInt *constantInt(int);
-   llvm::Constant *shuffleMask(int vec);
-   llvm::Value *inputElement(int idx, llvm::Value *indIdx =0);
-   llvm::Value *constElement(int idx, llvm::Value *indIdx =0);
-   llvm::Value *outputElement(int idx, llvm::Value *indIdx =0);
-   llvm::Value *tempElement(int idx, llvm::Value *indIdx =0);
-   llvm::Value *immediateElement(int idx);
-
-   void setOutputElement(int dstIdx, llvm::Value *val, int mask);
-   void setTempElement(int idx, llvm::Value *val, int mask);
-
-   llvm::Value *addrElement(int idx) const;
-   void setAddrElement(int idx, llvm::Value *val, int mask);
-
-   void setKilElement(llvm::Value *val);
-
-   llvm::Value *shuffleVector(llvm::Value *vec, int shuffle);
-
-   llvm::Value *extractIndex(llvm::Value *vec);
-
-   int numConsts() const;
-
-   void pushArguments(llvm::Value *input);
-   void popArguments();
-   void pushTemps();
-   void popTemps();
-
-   void addImmediate(float *val);
-
-private:
-   llvm::Value *maskWrite(llvm::Value *src, int mask, llvm::Value *templ);
-   const char *name(const char *prefix);
-
-   enum Args {
-      DestsArg   = 0,
-      InputsArg  = 1,
-      TempsArg   = 2,
-      ConstsArg  = 3,
-      KilArg     = 4
-   };
-   llvm::Value *elemPtr(Args arg);
-   llvm::Value *elemIdx(llvm::Value *ptr, int idx,
-                        llvm::Value *indIdx = 0);
-   llvm::Value *element(Args arg, int idx, llvm::Value *indIdx = 0);
-
-private:
-   llvm::BasicBlock *m_block;
-   llvm::Value *m_INPUT;
-
-   std::map<int, llvm::ConstantInt*> m_constInts;
-   std::map<int, llvm::Constant*>    m_intVecs;
-   std::vector<llvm::Value*>         m_addrs;
-   std::vector<llvm::Constant*>      m_immediates;
-
-   llvm::VectorType *m_floatVecType;
-   llvm::VectorType *m_intVecType;
-
-   char        m_name[32];
-   int         m_idx;
-
-   int         m_numConsts;
-
-   std::map<int, bool > m_destWriteMap;
-   std::map<int, bool > m_tempWriteMap;
-
-   llvm::Value      *m_undefFloatVec;
-   llvm::Value      *m_undefIntVec;
-   llvm::Value      *m_extSwizzleVec;
-
-   std::stack<llvm::Value*> m_argStack;
-   std::stack<std::vector<llvm::Value*> > m_tempStack;
-};
-
-#endif
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
deleted file mode 100644
index 4984ce985c6..00000000000
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ /dev/null
@@ -1,438 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "storagesoa.h"
-
-#include "gallivm_p.h"
-
-#include "pipe/p_shader_tokens.h"
-#include "util/u_debug.h"
-
-#include <llvm/BasicBlock.h>
-#include <llvm/Module.h>
-#include <llvm/Value.h>
-
-#include <llvm/CallingConv.h>
-#include <llvm/Constants.h>
-#include <llvm/DerivedTypes.h>
-#include <llvm/InstrTypes.h>
-#include <llvm/Instructions.h>
-
-using namespace llvm;
-
-
-StorageSoa::StorageSoa(llvm::BasicBlock *block,
-                       llvm::Value *input,
-                       llvm::Value *output,
-                       llvm::Value *consts)
-   : m_block(block),
-     m_input(input),
-     m_output(output),
-     m_consts(consts),
-     m_immediates(0),
-     m_idx(0)
-{
-}
-
-void StorageSoa::addImmediate(float *vec)
-{
-   std::vector<float> vals(4);
-   vals[0] = vec[0];
-   vals[1] = vec[1];
-   vals[2] = vec[2];
-   vals[3] = vec[3];
-   m_immediatesToFlush.push_back(vals);
-}
-
-void StorageSoa::declareImmediates()
-{
-   if (m_immediatesToFlush.empty())
-      return;
-
-   VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
-   ArrayType  *vectorChannels = ArrayType::get(vectorType, 4);
-   ArrayType  *arrayType = ArrayType::get(vectorChannels, m_immediatesToFlush.size());
-
-   m_immediates = new GlobalVariable(
-      /*Type=*/arrayType,
-      /*isConstant=*/false,
-      /*Linkage=*/GlobalValue::ExternalLinkage,
-      /*Initializer=*/0, // has initializer, specified below
-      /*Name=*/name("immediates"),
-      currentModule());
-
-   std::vector<Constant*> arrayVals;
-   for (unsigned int i = 0; i < m_immediatesToFlush.size(); ++i) {
-      std::vector<float> vec = m_immediatesToFlush[i];
-      std::vector<float> vals(4);
-      std::vector<Constant*> channelArray;
-
-      vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
-      llvm::Constant *xChannel = createConstGlobalVector(vals);
-
-      vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
-      llvm::Constant *yChannel = createConstGlobalVector(vals);
-
-      vals[0] = vec[2]; vals[1] = vec[2]; vals[2] = vec[2]; vals[3] = vec[2];
-      llvm::Constant *zChannel = createConstGlobalVector(vals);
-
-      vals[0] = vec[3]; vals[1] = vec[3]; vals[2] = vec[3]; vals[3] = vec[3];
-      llvm::Constant *wChannel = createConstGlobalVector(vals);
-      channelArray.push_back(xChannel);
-      channelArray.push_back(yChannel);
-      channelArray.push_back(zChannel);
-      channelArray.push_back(wChannel);
-      Constant *constChannels = ConstantArray::get(vectorChannels,
-                                                   channelArray);
-      arrayVals.push_back(constChannels);
-   }
-   Constant *constArray = ConstantArray::get(arrayType, arrayVals);
-   m_immediates->setInitializer(constArray);
-
-   m_immediatesToFlush.clear();
-}
-
-llvm::Value *StorageSoa::addrElement(int idx) const
-{
-   std::map<int, llvm::Value*>::const_iterator itr = m_addresses.find(idx);
-   if (itr == m_addresses.end()) {
-      debug_printf("Trying to access invalid shader 'address'\n");
-      return 0;
-   }
-   llvm::Value * res = (*itr).second;
-
-   res = new LoadInst(res, name("addr"), false, m_block);
-
-   return res;
-}
-
-std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = element(m_input, idx, 0);
-   res[1] = element(m_input, idx, 1);
-   res[2] = element(m_input, idx, 2);
-   res[3] = element(m_input, idx, 3);
-
-   return res;
-}
-
-llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc)
-{
-   std::vector<llvm::Value*> x(4);
-   x[0] = m_builder->CreateExtractElement(vector,
-                                           constantInt(cc),
-                                           name("x"));
-
-   VectorType  *vectorType = VectorType::get(Type::FloatTy, 4);
-   Constant *constVector = Constant::getNullValue(vectorType);
-   Value *res = m_builder->CreateInsertElement(constVector, x[0],
-                                              constantInt(0),
-                                              name("vecx"));
-   res = m_builder->CreateInsertElement(res, x[0], constantInt(1),
-                               name("vecxx"));
-   res = m_builder->CreateInsertElement(res, x[0], constantInt(2),
-                               name("vecxxx"));
-   res = m_builder->CreateInsertElement(res, x[0], constantInt(3),
-                               name("vecxxxx"));
-   return res;
-}
-
-std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx)
-{
-   llvm::Value* res;
-   std::vector<llvm::Value*> res2(4);
-   llvm::Value *xChannel;
-
-   xChannel = elementPointer(m_consts, idx, 0);
-
-   res = alignedArrayLoad(xChannel);
-
-   res2[0]=unpackConstElement(m_builder, res,0);
-   res2[1]=unpackConstElement(m_builder, res,1);
-   res2[2]=unpackConstElement(m_builder, res,2);
-   res2[3]=unpackConstElement(m_builder, res,3);
-
-   return res2;
-}
-
-std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = element(m_output, idx, 0);
-   res[1] = element(m_output, idx, 1);
-   res[2] = element(m_output, idx, 2);
-   res[3] = element(m_output, idx, 3);
-
-   return res;
-}
-
-std::vector<llvm::Value*> StorageSoa::tempElement(llvm::IRBuilder<>* m_builder, int idx)
-{
-   std::vector<llvm::Value*> res(4);
-   llvm::Value *temp = m_temps[idx];
-
-   res[0] = element(temp, constantInt(0), 0);
-   res[1] = element(temp, constantInt(0), 1);
-   res[2] = element(temp, constantInt(0), 2);
-   res[3] = element(temp, constantInt(0), 3);
-
-   return res;
-}
-
-std::vector<llvm::Value*> StorageSoa::immediateElement(llvm::Value *idx)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = element(m_immediates, idx, 0);
-   res[1] = element(m_immediates, idx, 1);
-   res[2] = element(m_immediates, idx, 2);
-   res[3] = element(m_immediates, idx, 3);
-
-   return res;
-}
-
-llvm::Value * StorageSoa::elementPointer(llvm::Value *ptr, llvm::Value *index,
-                                         int channel) const
-{
-   std::vector<Value*> indices;
-   if (m_immediates == ptr)
-      indices.push_back(constantInt(0));
-   indices.push_back(index);
-   indices.push_back(constantInt(channel));
-
-   GetElementPtrInst *getElem = GetElementPtrInst::Create(ptr,
-                                                          indices.begin(),
-                                                          indices.end(),
-                                                          name("ptr"),
-                                                          m_block);
-   return getElem;
-}
-
-llvm::Value * StorageSoa::element(llvm::Value *ptr, llvm::Value *index,
-                                  int channel) const
-{
-   llvm::Value *res = elementPointer(ptr, index, channel);
-   LoadInst *load = new LoadInst(res, name("element"), false, m_block);
-   //load->setAlignment(8);
-   return load;
-}
-
-const char * StorageSoa::name(const char *prefix) const
-{
-   ++m_idx;
-   snprintf(m_name, 32, "%s%d", prefix, m_idx);
-   return m_name;
-}
-
-llvm::ConstantInt * StorageSoa::constantInt(int idx) const
-{
-   if (m_constInts.find(idx) != m_constInts.end()) {
-      return m_constInts[idx];
-   }
-   ConstantInt *constInt = ConstantInt::get(APInt(32,  idx));
-   m_constInts[idx] = constInt;
-   return constInt;
-}
-
-llvm::Value *StorageSoa::alignedArrayLoad(llvm::Value *val)
-{
-   VectorType  *vectorType = VectorType::get(Type::FloatTy, 4);
-   PointerType *vectorPtr  = PointerType::get(vectorType, 0);
-
-   CastInst *cast = new BitCastInst(val, vectorPtr, name("toVector"), m_block);
-   LoadInst *load = new LoadInst(cast, name("alignLoad"), false, m_block);
-   load->setAlignment(8);
-   return load;
-}
-
-llvm::Module * StorageSoa::currentModule() const
-{
-    if (!m_block || !m_block->getParent())
-       return 0;
-
-    return m_block->getParent()->getParent();
-}
-
-llvm::Constant * StorageSoa::createConstGlobalFloat(const float val)
-{
-   Constant*c = ConstantFP::get(APFloat(val));
-   return c;
-}
-
-llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec)
-{
-   VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
-   std::vector<Constant*> immValues;
-   ConstantFP *constx = ConstantFP::get(APFloat(vec[0]));
-   ConstantFP *consty = ConstantFP::get(APFloat(vec[1]));
-   ConstantFP *constz = ConstantFP::get(APFloat(vec[2]));
-   ConstantFP *constw = ConstantFP::get(APFloat(vec[3]));
-   immValues.push_back(constx);
-   immValues.push_back(consty);
-   immValues.push_back(constz);
-   immValues.push_back(constw);
-   Constant  *constVector = ConstantVector::get(vectorType, immValues);
-
-   return constVector;
-}
-
-std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle,
-                                           llvm::IRBuilder<>* m_builder,llvm::Value *indIdx)
-{
-   std::vector<llvm::Value*> val(4);
-
-   //if we have an indirect index, always use that
-   //   if not use the integer offset to create one
-   llvm::Value *realIndex = 0;
-   if (indIdx)
-      realIndex = indIdx;
-   else
-      realIndex = constantInt(idx);
-   debug_printf("XXXXXXXXX realIdx = %p, indIdx = %p\n", realIndex, indIdx);
-
-   switch(type) {
-   case TGSI_FILE_INPUT:
-      val = inputElement(realIndex);
-      break;
-   case TGSI_FILE_OUTPUT:
-      val = outputElement(realIndex);
-      break;
-   case TGSI_FILE_TEMPORARY:
-      val = tempElement(m_builder, idx);
-      break;
-   case TGSI_FILE_CONSTANT:
-      val = constElement(m_builder, realIndex);
-      break;
-   case TGSI_FILE_IMMEDIATE:
-      val = immediateElement(realIndex);
-      break;
-   case TGSI_FILE_ADDRESS:
-      debug_printf("Address not handled in the load phase!\n");
-      assert(0);
-      break;
-   default:
-      debug_printf("Unknown load!\n");
-      assert(0);
-      break;
-   }
-   if (!gallivm_is_swizzle(swizzle))
-      return val;
-
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = val[gallivm_x_swizzle(swizzle)];
-   res[1] = val[gallivm_y_swizzle(swizzle)];
-   res[2] = val[gallivm_z_swizzle(swizzle)];
-   res[3] = val[gallivm_w_swizzle(swizzle)];
-   return res;
-}
-
-llvm::Value * StorageSoa::allocaTemp(llvm::IRBuilder<>* m_builder)
-{
-   VectorType *vector   = VectorType::get(Type::FloatTy, 4);
-   ArrayType  *vecArray = ArrayType::get(vector, 4);
-   AllocaInst *alloca = new AllocaInst(vecArray, "temp",
-                                       m_builder->GetInsertBlock());
-
-   return alloca;
-}
-
-
-void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
-                       int mask, llvm::IRBuilder<>* m_builder)
-{
-   llvm::Value *out = 0;
-   llvm::Value *realIndex = 0;
-   switch(type) {
-   case TGSI_FILE_OUTPUT:
-      out = m_output;
-      realIndex = constantInt(idx);
-      break;
-   case TGSI_FILE_TEMPORARY:
-      // if that temp doesn't already exist, alloca it
-      if (m_temps.find(idx) == m_temps.end())
-         m_temps[idx] = allocaTemp(m_builder);
-
-      out = m_temps[idx];
-
-      realIndex = constantInt(0);
-      break;
-   case TGSI_FILE_INPUT:
-      out = m_input;
-      realIndex = constantInt(idx);
-      break;
-   case TGSI_FILE_ADDRESS: {
-      llvm::Value *addr = m_addresses[idx];
-      if (!addr) {
-         addAddress(idx);
-         addr = m_addresses[idx];
-         assert(addr);
-      }
-      new StoreInst(val[0], addr, false, m_block);
-      return;
-      break;
-   }
-   default:
-      debug_printf("Can't save output of this type: %d !\n", type);
-      assert(0);
-      break;
-   }
-   if ((mask & TGSI_WRITEMASK_X)) {
-      llvm::Value *xChannel = elementPointer(out, realIndex, 0);
-      new StoreInst(val[0], xChannel, false, m_block);
-   }
-   if ((mask & TGSI_WRITEMASK_Y)) {
-      llvm::Value *yChannel = elementPointer(out, realIndex, 1);
-      new StoreInst(val[1], yChannel, false, m_block);
-   }
-   if ((mask & TGSI_WRITEMASK_Z)) {
-      llvm::Value *zChannel = elementPointer(out, realIndex, 2);
-      new StoreInst(val[2], zChannel, false, m_block);
-   }
-   if ((mask & TGSI_WRITEMASK_W)) {
-      llvm::Value *wChannel = elementPointer(out, realIndex, 3);
-      new StoreInst(val[3], wChannel, false, m_block);
-   }
-}
-
-void StorageSoa::addAddress(int idx)
-{
-   GlobalVariable *val = new GlobalVariable(
-      /*Type=*/IntegerType::get(32),
-      /*isConstant=*/false,
-      /*Linkage=*/GlobalValue::ExternalLinkage,
-      /*Initializer=*/0, // has initializer, specified below
-      /*Name=*/name("address"),
-      currentModule());
-   val->setInitializer(Constant::getNullValue(IntegerType::get(32)));
-
-   debug_printf("adding to %d\n", idx);
-   m_addresses[idx] = val;
-}
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
deleted file mode 100644
index 56886f85e7a..00000000000
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#ifndef STORAGESOA_H
-#define STORAGESOA_H
-
-#include <pipe/p_shader_tokens.h>
-#include <llvm/Support/IRBuilder.h>
-
-#include <vector>
-#include <list>
-#include <map>
-
-namespace llvm {
-   class BasicBlock;
-   class Constant;
-   class ConstantInt;
-   class GlobalVariable;
-   class LoadInst;
-   class Value;
-   class VectorType;
-   class Module;
-}
-
-class StorageSoa
-{
-public:
-   StorageSoa(llvm::BasicBlock *block,
-              llvm::Value *input,
-              llvm::Value *output,
-              llvm::Value *consts);
-
-
-   std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, 
-                                  llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
-   void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
-              int mask, llvm::IRBuilder<>* m_builder);
-
-   void addImmediate(float *vec);
-   void declareImmediates();
-
-   void addAddress(int idx);
-
-   llvm::Value  * addrElement(int idx) const;
-
-   llvm::ConstantInt *constantInt(int) const;
-private:
-   llvm::Value *elementPointer(llvm::Value *ptr, llvm::Value *indIdx,
-                               int channel) const;
-   llvm::Value *element(llvm::Value *ptr, llvm::Value *idx,
-                        int channel) const;
-   const char *name(const char *prefix) const;
-   llvm::Value  *alignedArrayLoad(llvm::Value *val);
-   llvm::Module *currentModule() const;
-   llvm::Constant  *createConstGlobalFloat(const float val);
-   llvm::Constant  *createConstGlobalVector(const std::vector<float> &vec);
-
-   std::vector<llvm::Value*> inputElement(llvm::Value *indIdx);
-   llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
-   std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
-   std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
-   std::vector<llvm::Value*> tempElement(llvm::IRBuilder<>* m_builder, int idx);
-   std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
-private:
-   llvm::BasicBlock *m_block;
-
-   llvm::Value *m_input;
-   llvm::Value *m_output;
-   llvm::Value *m_consts;
-   std::map<int, llvm::Value*> m_temps;
-   llvm::GlobalVariable *m_immediates;
-
-   std::map<int, llvm::Value*> m_addresses;
-
-   std::vector<std::vector<float> > m_immediatesToFlush;
-   llvm::Value * allocaTemp(llvm::IRBuilder<>* m_builder);
-
-   mutable std::map<int, llvm::ConstantInt*> m_constInts;
-   mutable char        m_name[32];
-   mutable int         m_idx;
-};
-
-#endif
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
deleted file mode 100644
index 5cafe8c3f0c..00000000000
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ /dev/null
@@ -1,1136 +0,0 @@
-#include "tgsitollvm.h"
-
-#include "gallivm.h"
-#include "gallivm_p.h"
-
-#include "storage.h"
-#include "instructions.h"
-#include "storagesoa.h"
-#include "instructionssoa.h"
-
-#include "pipe/p_shader_tokens.h"
-
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_exec.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_build.h"
-#include "tgsi/tgsi_dump.h"
-
-
-#include <llvm/Module.h>
-#include <llvm/CallingConv.h>
-#include <llvm/Constants.h>
-#include <llvm/DerivedTypes.h>
-#include <llvm/Instructions.h>
-#include <llvm/ModuleProvider.h>
-#include <llvm/Pass.h>
-#include <llvm/PassManager.h>
-#include <llvm/Attributes.h>
-#include <llvm/Support/PatternMatch.h>
-#include <llvm/ExecutionEngine/JIT.h>
-#include <llvm/ExecutionEngine/Interpreter.h>
-#include <llvm/ExecutionEngine/GenericValue.h>
-#include <llvm/Support/MemoryBuffer.h>
-#include <llvm/LinkAllPasses.h>
-#include <llvm/Analysis/Verifier.h>
-#include <llvm/Analysis/LoopPass.h>
-#include <llvm/Target/TargetData.h>
-#include <llvm/Bitcode/ReaderWriter.h>
-#include <llvm/Transforms/Utils/Cloning.h>
-
-
-#include <sstream>
-#include <fstream>
-#include <iostream>
-
-using namespace llvm;
-
-static inline FunctionType *vertexShaderFunctionType()
-{
-   //Function takes three arguments,
-   // the calling code has to make sure the types it will
-   // pass are castable to the following:
-   // [4 x <4 x float>] inputs,
-   // [4 x <4 x float>] output,
-   // [4 x [1 x float]] consts,
-
-   std::vector<const Type*> funcArgs;
-   VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
-   ArrayType *vectorArray = ArrayType::get(vectorType, 4);
-   PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0);
-
-   ArrayType   *floatArray     = ArrayType::get(Type::FloatTy, 4);
-   ArrayType   *constsArray    = ArrayType::get(floatArray, 1);
-   PointerType *constsArrayPtr = PointerType::get(constsArray, 0);
-
-   funcArgs.push_back(vectorArrayPtr);//inputs
-   funcArgs.push_back(vectorArrayPtr);//output
-   funcArgs.push_back(constsArrayPtr);//consts
-
-   FunctionType *functionType = FunctionType::get(
-      /*Result=*/Type::VoidTy,
-      /*Params=*/funcArgs,
-      /*isVarArg=*/false);
-
-   return functionType;
-}
-
-static inline void
-add_interpolator(struct gallivm_ir *ir,
-                 struct gallivm_interpolate *interp)
-{
-   ir->interpolators[ir->num_interp] = *interp;
-   ++ir->num_interp;
-}
-
-static void
-translate_declaration(struct gallivm_ir *prog,
-                      llvm::Module *module,
-                      Storage *storage,
-                      struct tgsi_full_declaration *decl,
-                      struct tgsi_full_declaration *fd)
-{
-   if (decl->Declaration.File == TGSI_FILE_INPUT) {
-      unsigned first, last, mask;
-      uint interp_method;
-
-      first = decl->Range.First;
-      last = decl->Range.Last;
-      mask = decl->Declaration.UsageMask;
-
-      /* Do not touch WPOS.xy */
-      if (first == 0) {
-         mask &= ~TGSI_WRITEMASK_XY;
-         if (mask == TGSI_WRITEMASK_NONE) {
-            first++;
-            if (first > last) {
-               return;
-            }
-         }
-      }
-
-      interp_method = decl->Declaration.Interpolate;
-
-      if (mask == TGSI_WRITEMASK_XYZW) {
-         unsigned i, j;
-
-         for (i = first; i <= last; i++) {
-            for (j = 0; j < NUM_CHANNELS; j++) {
-               //interp( mach, i, j );
-               struct gallivm_interpolate interp;
-               interp.type = interp_method;
-               interp.attrib = i;
-               interp.chan = j;
-               add_interpolator(prog, &interp);
-            }
-         }
-      } else {
-         unsigned i, j;
-         for( j = 0; j < NUM_CHANNELS; j++ ) {
-            if( mask & (1 << j) ) {
-               for( i = first; i <= last; i++ ) {
-                  struct gallivm_interpolate interp;
-                  interp.type = interp_method;
-                  interp.attrib = i;
-                  interp.chan = j;
-                  add_interpolator(prog, &interp);
-               }
-            }
-         }
-      }
-   }
-}
-
-static void
-translate_declarationir(struct gallivm_ir *,
-                      llvm::Module *,
-                      StorageSoa *storage,
-                      struct tgsi_full_declaration *decl,
-                      struct tgsi_full_declaration *)
-{
-   if (decl->Declaration.File == TGSI_FILE_ADDRESS) {
-      int idx = decl->Range.First;
-      storage->addAddress(idx);
-   }
-}
-
-static void
-translate_immediate(Storage *storage,
-                    struct tgsi_full_immediate *imm)
-{
-   float vec[4];
-   int i;
-   assert( imm->Immediate.NrTokens <= 4 + 1 );
-   for (i = 0; i < imm->Immediate.NrTokens - 1; ++i) {
-      switch (imm->Immediate.DataType) {
-      case TGSI_IMM_FLOAT32:
-         vec[i] = imm->u[i].Float;
-         break;
-      default:
-         assert(0);
-      }
-   }
-   storage->addImmediate(vec);
-}
-
-
-static void
-translate_immediateir(StorageSoa *storage,
-                      struct tgsi_full_immediate *imm)
-{
-   float vec[4];
-   int i;
-   assert( imm->Immediate.NrTokens <= 4 + 1 );
-   for (i = 0; i < imm->Immediate.NrTokens - 1; ++i) {
-      switch (imm->Immediate.DataType) {
-      case TGSI_IMM_FLOAT32:
-         vec[i] = imm->u[i].Float;
-         break;
-      default:
-         assert(0);
-      }
-   }
-   storage->addImmediate(vec);
-}
-
-static inline int
-swizzleInt(struct tgsi_full_src_register *src)
-{
-   int swizzle = 0;
-   int start = 1000;
-
-   for (int k = 0; k < 4; ++k) {
-      swizzle += tgsi_util_get_full_src_register_extswizzle(src, k) * start;
-      start /= 10;
-   }
-   return swizzle;
-}
-
-static inline llvm::Value *
-swizzleVector(llvm::Value *val, struct tgsi_full_src_register *src,
-              Storage *storage)
-{
-   int swizzle = swizzleInt(src);
-
-   if (gallivm_is_swizzle(swizzle)) {
-      /*fprintf(stderr, "XXXXXXXX swizzle = %d\n", swizzle);*/
-      val = storage->shuffleVector(val, swizzle);
-   }
-   return val;
-}
-
-static void
-translate_instruction(llvm::Module *module,
-                      Storage *storage,
-                      Instructions *instr,
-                      struct tgsi_full_instruction *inst,
-                      struct tgsi_full_instruction *fi,
-                      unsigned instno)
-{
-   llvm::Value *inputs[4];
-   inputs[0] = 0;
-   inputs[1] = 0;
-   inputs[2] = 0;
-   inputs[3] = 0;
-
-   for (int i = 0; i < inst->Instruction.NumSrcRegs; ++i) {
-      struct tgsi_full_src_register *src = &inst->Src[i];
-      llvm::Value *val = 0;
-      llvm::Value *indIdx = 0;
-
-      if (src->Register.Indirect) {
-         indIdx = storage->addrElement(src->Indirect.Index);
-         indIdx = storage->extractIndex(indIdx);
-      }
-      if (src->Register.File == TGSI_FILE_CONSTANT) {
-         val = storage->constElement(src->Register.Index, indIdx);
-      } else if (src->Register.File == TGSI_FILE_INPUT) {
-         val = storage->inputElement(src->Register.Index, indIdx);
-      } else if (src->Register.File == TGSI_FILE_TEMPORARY) {
-         val = storage->tempElement(src->Register.Index);
-      } else if (src->Register.File == TGSI_FILE_OUTPUT) {
-         val = storage->outputElement(src->Register.Index, indIdx);
-      } else if (src->Register.File == TGSI_FILE_IMMEDIATE) {
-         val = storage->immediateElement(src->Register.Index);
-      } else {
-         fprintf(stderr, "ERROR: not supported llvm source %d\n", src->Register.File);
-         return;
-      }
-
-      inputs[i] = swizzleVector(val, src, storage);
-   }
-
-   /*if (inputs[0])
-     instr->printVector(inputs[0]);
-     if (inputs[1])
-     instr->printVector(inputs[1]);*/
-   llvm::Value *out = 0;
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ARL: {
-      out = instr->arl(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_MOV: {
-      out = inputs[0];
-   }
-      break;
-   case TGSI_OPCODE_LIT: {
-      out = instr->lit(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_RCP: {
-      out = instr->rcp(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_RSQ: {
-      out = instr->rsq(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_EXP: {
-      out = instr->exp(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_LOG: {
-      out = instr->log(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_MUL: {
-      out = instr->mul(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_ADD: {
-      out = instr->add(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DP3: {
-      out = instr->dp3(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DP4: {
-      out = instr->dp4(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DST: {
-      out = instr->dst(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_MIN: {
-      out = instr->min(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_MAX: {
-      out = instr->max(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SLT: {
-      out = instr->slt(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SGE: {
-      out = instr->sge(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_MAD: {
-      out = instr->madd(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_SUB: {
-      out = instr->sub(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_LRP: {
-      out = instr->lerp(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_CND: {
-      out = instr->cnd(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_CND0: {
-      out = instr->cnd0(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_DP2A: {
-      out = instr->dot2add(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_FRC: {
-      out = instr->frc(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_CLAMP: {
-      out = instr->clamp(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_FLR: {
-      out = instr->floor(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_ROUND:
-      break;
-   case TGSI_OPCODE_EX2: {
-      out = instr->ex2(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_LG2: {
-      out = instr->lg2(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_POW: {
-      out = instr->pow(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_XPD: {
-      out = instr->cross(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_ABS: {
-      out = instr->abs(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_RCC:
-      break;
-   case TGSI_OPCODE_DPH: {
-      out = instr->dph(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_COS: {
-      out = instr->cos(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_DDX: {
-      out = instr->ddx(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_DDY: {
-      out = instr->ddy(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_KILP:
-      break;
-   case TGSI_OPCODE_PK2H:
-      break;
-   case TGSI_OPCODE_PK2US:
-      break;
-   case TGSI_OPCODE_PK4B:
-      break;
-   case TGSI_OPCODE_PK4UB:
-      break;
-   case TGSI_OPCODE_RFL:
-      break;
-   case TGSI_OPCODE_SEQ: {
-      out = instr->seq(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SFL: {
-      out = instr->sfl(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SGT: {
-      out = instr->sgt(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SIN: {
-      out = instr->sin(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_SLE: {
-      out = instr->sle(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SNE: {
-      out = instr->sne(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_STR: {
-      out = instr->str(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_TEX:
-      break;
-   case TGSI_OPCODE_TXD:
-      break;
-   case TGSI_OPCODE_UP2H:
-      break;
-   case TGSI_OPCODE_UP2US:
-      break;
-   case TGSI_OPCODE_UP4B:
-      break;
-   case TGSI_OPCODE_UP4UB:
-      break;
-   case TGSI_OPCODE_X2D: {
-      out = instr->x2d(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_ARA:
-      break;
-   case TGSI_OPCODE_ARR:
-      break;
-   case TGSI_OPCODE_BRA:
-      break;
-   case TGSI_OPCODE_CAL: {
-      instr->cal(inst->InstructionExtLabel.Label, storage->inputPtr());
-      return;
-   }
-      break;
-   case TGSI_OPCODE_RET: {
-      instr->end();
-      return;
-   }
-      break;
-   case TGSI_OPCODE_SSG:
-      break;
-   case TGSI_OPCODE_CMP: {
-      out = instr->cmp(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_SCS: {
-      out = instr->scs(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_TXB:
-      break;
-   case TGSI_OPCODE_NRM4:
-   case TGSI_OPCODE_NRM: {
-      out = instr->nrm(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_DIV: {
-      out = instr->div(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DP2: {
-      out = instr->dp2(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_TXL:
-      break;
-   case TGSI_OPCODE_BRK: {
-      instr->brk();
-      return;
-   }
-      break;
-   case TGSI_OPCODE_IF: {
-      instr->ifop(inputs[0]);
-      storage->setCurrentBlock(instr->currentBlock());
-      return;  //just update the state
-   }
-      break;
-   case TGSI_OPCODE_BGNFOR:
-      break;
-   case TGSI_OPCODE_REP:
-      break;
-   case TGSI_OPCODE_ELSE: {
-      instr->elseop();
-      storage->setCurrentBlock(instr->currentBlock());
-      return; //only state update
-   }
-      break;
-   case TGSI_OPCODE_ENDIF: {
-      instr->endif();
-      storage->setCurrentBlock(instr->currentBlock());
-      return; //just update the state
-   }
-      break;
-   case TGSI_OPCODE_ENDFOR:
-      break;
-   case TGSI_OPCODE_ENDREP:
-      break;
-   case TGSI_OPCODE_PUSHA:
-      break;
-   case TGSI_OPCODE_POPA:
-      break;
-   case TGSI_OPCODE_CEIL:
-      break;
-   case TGSI_OPCODE_I2F:
-      break;
-   case TGSI_OPCODE_NOT:
-      break;
-   case TGSI_OPCODE_TRUNC: {
-      out = instr->trunc(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_SHL:
-      break;
-   case TGSI_OPCODE_SHR:
-      break;
-   case TGSI_OPCODE_AND:
-      break;
-   case TGSI_OPCODE_OR:
-      break;
-   case TGSI_OPCODE_MOD:
-      break;
-   case TGSI_OPCODE_XOR:
-      break;
-   case TGSI_OPCODE_SAD:
-      break;
-   case TGSI_OPCODE_TXF:
-      break;
-   case TGSI_OPCODE_TXQ:
-      break;
-   case TGSI_OPCODE_CONT:
-      break;
-   case TGSI_OPCODE_EMIT:
-      break;
-   case TGSI_OPCODE_ENDPRIM:
-      break;
-   case TGSI_OPCODE_BGNLOOP: {
-      instr->beginLoop();
-      storage->setCurrentBlock(instr->currentBlock());
-      return;
-   }
-      break;
-   case TGSI_OPCODE_BGNSUB: {
-      instr->bgnSub(instno);
-      storage->setCurrentBlock(instr->currentBlock());
-      storage->pushTemps();
-      return;
-   }
-      break;
-   case TGSI_OPCODE_ENDLOOP: {
-      instr->endLoop();
-      storage->setCurrentBlock(instr->currentBlock());
-      return;
-   }
-      break;
-   case TGSI_OPCODE_ENDSUB: {
-      instr->endSub();
-      storage->setCurrentBlock(instr->currentBlock());
-      storage->popArguments();
-      storage->popTemps();
-      return;
-   }
-      break;
-   case TGSI_OPCODE_NOISE1:
-      break;
-   case TGSI_OPCODE_NOISE2:
-      break;
-   case TGSI_OPCODE_NOISE3:
-      break;
-   case TGSI_OPCODE_NOISE4:
-      break;
-   case TGSI_OPCODE_NOP:
-      break;
-   case TGSI_OPCODE_CALLNZ:
-      break;
-   case TGSI_OPCODE_IFC:
-      break;
-   case TGSI_OPCODE_BREAKC:
-      break;
-   case TGSI_OPCODE_KIL: {
-      out = instr->kil(inputs[0]);
-      storage->setKilElement(out);
-      return;
-   }
-      break;
-   case TGSI_OPCODE_END:
-      instr->end();
-      return;
-      break;
-   default:
-      fprintf(stderr, "ERROR: Unknown opcode %d\n",
-              inst->Instruction.Opcode);
-      assert(0);
-      break;
-   }
-
-   if (!out) {
-      fprintf(stderr, "ERROR: unsupported opcode %d\n",
-              inst->Instruction.Opcode);
-      assert(!"Unsupported opcode");
-   }
-
-   /* # not sure if we need this */
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-   case TGSI_SAT_ZERO_ONE:
-      /*TXT( "_SAT" );*/
-      break;
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      /*TXT( "_SAT[-1,1]" );*/
-      break;
-   default:
-      assert( 0 );
-   }
-
-   /* store results  */
-   for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
-      struct tgsi_full_dst_register *dst = &inst->Dst[i];
-
-      if (dst->Register.File == TGSI_FILE_OUTPUT) {
-         storage->setOutputElement(dst->Register.Index, out, dst->Register.WriteMask);
-      } else if (dst->Register.File == TGSI_FILE_TEMPORARY) {
-         storage->setTempElement(dst->Register.Index, out, dst->Register.WriteMask);
-      } else if (dst->Register.File == TGSI_FILE_ADDRESS) {
-         storage->setAddrElement(dst->Register.Index, out, dst->Register.WriteMask);
-      } else {
-         fprintf(stderr, "ERROR: unsupported LLVM destination!");
-         assert(!"wrong destination");
-      }
-   }
-}
-
-
-static void
-translate_instructionir(llvm::Module *module,
-                        StorageSoa *storage,
-                        InstructionsSoa *instr,
-                        struct tgsi_full_instruction *inst,
-                        struct tgsi_full_instruction *fi,
-                        unsigned instno)
-{
-   std::vector< std::vector<llvm::Value*> > inputs(inst->Instruction.NumSrcRegs);
-
-   for (int i = 0; i < inst->Instruction.NumSrcRegs; ++i) {
-      struct tgsi_full_src_register *src = &inst->Src[i];
-      std::vector<llvm::Value*> val;
-      llvm::Value *indIdx = 0;
-      int swizzle = swizzleInt(src);
-
-      if (src->Register.Indirect) {
-         indIdx = storage->addrElement(src->Indirect.Index);
-      }
-      val = storage->load((enum tgsi_file_type)src->Register.File,
-                          src->Register.Index, swizzle, instr->getIRBuilder(), indIdx);
-
-      inputs[i] = val;
-   }
-
-   std::vector<llvm::Value*> out(4);
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ARL: {
-      out = instr->arl(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_MOV: {
-      out = inputs[0];
-   }
-      break;
-   case TGSI_OPCODE_LIT: {
-      out = instr->lit(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_RCP: {
-   }
-      break;
-   case TGSI_OPCODE_RSQ: {
-      out = instr->rsq(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_EXP:
-      break;
-   case TGSI_OPCODE_LOG:
-      break;
-   case TGSI_OPCODE_MUL: {
-      out = instr->mul(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_ADD: {
-      out = instr->add(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DP3: {
-      out = instr->dp3(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DP4: {
-      out = instr->dp4(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_DST: {
-   }
-      break;
-   case TGSI_OPCODE_MIN: {
-      out = instr->min(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_MAX: {
-      out = instr->max(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SLT: {
-      out = instr->slt(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_SGE: {
-   }
-      break;
-   case TGSI_OPCODE_MAD: {
-      out = instr->madd(inputs[0], inputs[1], inputs[2]);
-   }
-      break;
-   case TGSI_OPCODE_SUB: {
-      out = instr->sub(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_LRP: {
-   }
-      break;
-   case TGSI_OPCODE_CND:
-      break;
-   case TGSI_OPCODE_CND0:
-      break;
-   case TGSI_OPCODE_DP2A:
-      break;
-   case TGSI_OPCODE_FRC: {
-   }
-      break;
-   case TGSI_OPCODE_CLAMP:
-      break;
-   case TGSI_OPCODE_FLR: {
-   }
-      break;
-   case TGSI_OPCODE_ROUND:
-      break;
-   case TGSI_OPCODE_EX2: {
-   }
-      break;
-   case TGSI_OPCODE_LG2: {
-   }
-      break;
-   case TGSI_OPCODE_POW: {
-      out = instr->pow(inputs[0], inputs[1]);
-   }
-      break;
-   case TGSI_OPCODE_XPD: {
-   }
-      break;
-   case TGSI_OPCODE_ABS: {
-      out = instr->abs(inputs[0]);
-   }
-      break;
-   case TGSI_OPCODE_RCC:
-      break;
-   case TGSI_OPCODE_DPH: {
-   }
-      break;
-   case TGSI_OPCODE_COS: {
-   }
-      break;
-   case TGSI_OPCODE_DDX:
-      break;
-   case TGSI_OPCODE_DDY:
-      break;
-   case TGSI_OPCODE_KILP:
-      break;
-   case TGSI_OPCODE_PK2H:
-      break;
-   case TGSI_OPCODE_PK2US:
-      break;
-   case TGSI_OPCODE_PK4B:
-      break;
-   case TGSI_OPCODE_PK4UB:
-      break;
-   case TGSI_OPCODE_RFL:
-      break;
-   case TGSI_OPCODE_SEQ:
-      break;
-   case TGSI_OPCODE_SFL:
-      break;
-   case TGSI_OPCODE_SGT: {
-   }
-      break;
-   case TGSI_OPCODE_SIN: {
-   }
-      break;
-   case TGSI_OPCODE_SLE:
-      break;
-   case TGSI_OPCODE_SNE:
-      break;
-   case TGSI_OPCODE_STR:
-      break;
-   case TGSI_OPCODE_TEX:
-      break;
-   case TGSI_OPCODE_TXD:
-      break;
-   case TGSI_OPCODE_UP2H:
-      break;
-   case TGSI_OPCODE_UP2US:
-      break;
-   case TGSI_OPCODE_UP4B:
-      break;
-   case TGSI_OPCODE_UP4UB:
-      break;
-   case TGSI_OPCODE_X2D:
-      break;
-   case TGSI_OPCODE_ARA:
-      break;
-   case TGSI_OPCODE_ARR:
-      break;
-   case TGSI_OPCODE_BRA:
-      break;
-   case TGSI_OPCODE_CAL: {
-   }
-      break;
-   case TGSI_OPCODE_RET: {
-   }
-      break;
-   case TGSI_OPCODE_SSG:
-      break;
-   case TGSI_OPCODE_CMP: {
-   }
-      break;
-   case TGSI_OPCODE_SCS: {
-   }
-      break;
-   case TGSI_OPCODE_TXB:
-      break;
-   case TGSI_OPCODE_NRM:
-      break;
-   case TGSI_OPCODE_DIV:
-      break;
-   case TGSI_OPCODE_DP2:
-      break;
-   case TGSI_OPCODE_TXL:
-      break;
-   case TGSI_OPCODE_BRK: {
-   }
-      break;
-   case TGSI_OPCODE_IF: {
-   }
-      break;
-   case TGSI_OPCODE_BGNFOR:
-      break;
-   case TGSI_OPCODE_REP:
-      break;
-   case TGSI_OPCODE_ELSE: {
-   }
-      break;
-   case TGSI_OPCODE_ENDIF: {
-   }
-      break;
-   case TGSI_OPCODE_ENDFOR:
-      break;
-   case TGSI_OPCODE_ENDREP:
-      break;
-   case TGSI_OPCODE_PUSHA:
-      break;
-   case TGSI_OPCODE_POPA:
-      break;
-   case TGSI_OPCODE_CEIL:
-      break;
-   case TGSI_OPCODE_I2F:
-      break;
-   case TGSI_OPCODE_NOT:
-      break;
-   case TGSI_OPCODE_TRUNC: {
-   }
-      break;
-   case TGSI_OPCODE_SHL:
-      break;
-   case TGSI_OPCODE_SHR:
-      break;
-   case TGSI_OPCODE_AND:
-      break;
-   case TGSI_OPCODE_OR:
-      break;
-   case TGSI_OPCODE_MOD:
-      break;
-   case TGSI_OPCODE_XOR:
-      break;
-   case TGSI_OPCODE_SAD:
-      break;
-   case TGSI_OPCODE_TXF:
-      break;
-   case TGSI_OPCODE_TXQ:
-      break;
-   case TGSI_OPCODE_CONT:
-      break;
-   case TGSI_OPCODE_EMIT:
-      break;
-   case TGSI_OPCODE_ENDPRIM:
-      break;
-   case TGSI_OPCODE_BGNLOOP: {
-   }
-      break;
-   case TGSI_OPCODE_BGNSUB: {
-   }
-      break;
-   case TGSI_OPCODE_ENDLOOP: {
-   }
-      break;
-   case TGSI_OPCODE_ENDSUB: {
-   }
-      break;
-   case TGSI_OPCODE_NOISE1:
-      break;
-   case TGSI_OPCODE_NOISE2:
-      break;
-   case TGSI_OPCODE_NOISE3:
-      break;
-   case TGSI_OPCODE_NOISE4:
-      break;
-   case TGSI_OPCODE_NOP:
-      break;
-   case TGSI_OPCODE_NRM4:
-      break;
-   case TGSI_OPCODE_CALLNZ:
-      break;
-   case TGSI_OPCODE_IFC:
-      break;
-   case TGSI_OPCODE_BREAKC:
-      break;
-   case TGSI_OPCODE_KIL: {
-   }
-      break;
-   case TGSI_OPCODE_END:
-      instr->end();
-      return;
-      break;
-   default:
-      fprintf(stderr, "ERROR: Unknown opcode %d\n",
-              inst->Instruction.Opcode);
-      assert(0);
-      break;
-   }
-
-   if (!out[0]) {
-      fprintf(stderr, "ERROR: unsupported opcode %d\n",
-              inst->Instruction.Opcode);
-      assert(!"Unsupported opcode");
-   }
-
-   /* store results  */
-   for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
-      struct tgsi_full_dst_register *dst = &inst->Dst[i];
-      storage->store((enum tgsi_file_type)dst->Register.File,
-                     dst->Register.Index, out, dst->Register.WriteMask,
-		     instr->getIRBuilder() );
-   }
-}
-
-llvm::Module *
-tgsi_to_llvm(struct gallivm_ir *ir, const struct tgsi_token *tokens)
-{
-   llvm::Module *mod = new Module("shader");
-   struct tgsi_parse_context parse;
-   struct tgsi_full_instruction fi;
-   struct tgsi_full_declaration fd;
-   unsigned instno = 0;
-   Function* shader = mod->getFunction("execute_shader");
-   std::ostringstream stream;
-   if (ir->type == GALLIVM_VS) {
-      stream << "vs_shader";
-   } else {
-      stream << "fs_shader";
-   }
-   stream << ir->id;
-   std::string func_name = stream.str();
-   shader->setName(func_name.c_str());
-
-   Function::arg_iterator args = shader->arg_begin();
-   Value *ptr_INPUT = args++;
-   ptr_INPUT->setName("input");
-
-   BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0);
-
-   tgsi_parse_init(&parse, tokens);
-
-   fi = tgsi_default_full_instruction();
-   fd = tgsi_default_full_declaration();
-   Storage storage(label_entry, ptr_INPUT);
-   Instructions instr(mod, shader, label_entry, &storage);
-   while(!tgsi_parse_end_of_tokens(&parse)) {
-      tgsi_parse_token(&parse);
-
-      switch (parse.FullToken.Token.Type) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         translate_declaration(ir, mod, &storage,
-                               &parse.FullToken.FullDeclaration,
-                               &fd);
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         translate_immediate(&storage,
-                             &parse.FullToken.FullImmediate);
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         translate_instruction(mod, &storage, &instr,
-                               &parse.FullToken.FullInstruction,
-                               &fi, instno);
-         ++instno;
-         break;
-
-      default:
-         assert(0);
-      }
-   }
-
-   tgsi_parse_free(&parse);
-
-   ir->num_consts = storage.numConsts();
-   return mod;
-}
-
-llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
-                              const struct tgsi_token *tokens)
-{
-   llvm::Module *mod = new Module("shader");
-   struct tgsi_parse_context parse;
-   struct tgsi_full_instruction fi;
-   struct tgsi_full_declaration fd;
-   unsigned instno = 0;
-   std::ostringstream stream;
-   if (ir->type == GALLIVM_VS) {
-      stream << "vs_shader";
-   } else {
-      stream << "fs_shader";
-   }
-   //stream << ir->id;
-   std::string func_name = stream.str();
-   Function *shader = llvm::cast<Function>(mod->getOrInsertFunction(
-                                              func_name.c_str(),
-                                              vertexShaderFunctionType()));
-
-   Function::arg_iterator args = shader->arg_begin();
-   Value *input = args++;
-   input->setName("inputs");
-   Value *output = args++;
-   output->setName("outputs");
-   Value *consts = args++;
-   consts->setName("consts");
-
-   BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0);
-
-   tgsi_parse_init(&parse, tokens);
-
-   fi = tgsi_default_full_instruction();
-   fd = tgsi_default_full_declaration();
-
-   StorageSoa storage(label_entry, input, output, consts);
-   InstructionsSoa instr(mod, shader, label_entry, &storage);
-
-   while(!tgsi_parse_end_of_tokens(&parse)) {
-      tgsi_parse_token(&parse);
-
-      switch (parse.FullToken.Token.Type) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         translate_declarationir(ir, mod, &storage,
-                                 &parse.FullToken.FullDeclaration,
-                                 &fd);
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         translate_immediateir(&storage,
-                             &parse.FullToken.FullImmediate);
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         storage.declareImmediates();
-         translate_instructionir(mod, &storage, &instr,
-                                 &parse.FullToken.FullInstruction,
-                                 &fi, instno);
-         ++instno;
-         break;
-
-      default:
-         assert(0);
-      }
-   }
-
-   tgsi_parse_free(&parse);
-
-   return mod;
-}
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.h b/src/gallium/auxiliary/gallivm/tgsitollvm.h
deleted file mode 100644
index 7ada04d6299..00000000000
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef TGSITOLLVM_H
-#define TGSITOLLVM_H
-
-
-namespace llvm {
-   class Module;
-}
-
-struct gallivm_ir;
-struct tgsi_token;
-
-
-llvm::Module * tgsi_to_llvm(struct gallivm_ir *ir,
-                            const struct tgsi_token *tokens);
-
-
-llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
-                              const struct tgsi_token *tokens);
-
-#endif
diff --git a/src/gallium/auxiliary/indices/Makefile b/src/gallium/auxiliary/indices/Makefile
deleted file mode 100644
index f2ebc3f410a..00000000000
--- a/src/gallium/auxiliary/indices/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = indices
-
-C_SOURCES = \
-	u_indices_gen.c \
-	u_unfilled_gen.c 
-
-include ../../Makefile.template
-
-u_indices_gen.c: u_indices_gen.py
-	python $< > $@
-
-u_unfilled_gen.c: u_unfilled_gen.py
-	python $< > $@
diff --git a/src/gallium/auxiliary/indices/SConscript b/src/gallium/auxiliary/indices/SConscript
deleted file mode 100644
index 712e215534f..00000000000
--- a/src/gallium/auxiliary/indices/SConscript
+++ /dev/null
@@ -1,28 +0,0 @@
-Import('*')
-
-from sys import executable as python_cmd
-
-env.CodeGenerate(
-	target = 'u_indices_gen.c', 
-	script = 'u_indices_gen.py', 
-	source = [],
-	command = python_cmd + ' $SCRIPT > $TARGET'
-)
-
-env.CodeGenerate(
-	target = 'u_unfilled_gen.c', 
-	script = 'u_unfilled_gen.py', 
-	source = [],
-	command = python_cmd + ' $SCRIPT > $TARGET'
-)
-
-indices = env.ConvenienceLibrary(
-	target = 'indices',
-	source = [
-#               'u_indices.c',
-#               'u_unfilled_indices.c',
-               'u_indices_gen.c',
-               'u_unfilled_gen.c',
-	])
-
-auxiliaries.insert(0, indices)
diff --git a/src/gallium/auxiliary/os/os_memory.h b/src/gallium/auxiliary/os/os_memory.h
new file mode 100644
index 00000000000..556662d35e1
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_memory.h
@@ -0,0 +1,84 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Vmware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/*
+ * OS memory management abstractions
+ */
+
+
+#ifndef _OS_MEMORY_H_
+#define _OS_MEMORY_H_
+
+
+#include "pipe/p_config.h"
+#include "pipe/p_compiler.h"
+
+
+#if defined(PIPE_OS_EMBEDDED)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void *
+os_malloc(size_t size);
+
+void *
+os_calloc(size_t count, size_t size);
+
+void
+os_free(void *ptr);
+
+void *
+os_realloc(void *ptr, size_t old_size, size_t new_size);
+
+void *
+os_malloc_aligned(size_t size, size_t alignment);
+
+void
+os_free_aligned(void *ptr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#elif defined(PIPE_OS_WINDOWS) && defined(DEBUG) && !defined(DEBUG_MEMORY_IMPLEMENTATION)
+
+#  include "os_memory_debug.h"
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+
+#  include "os_memory_win32k.h"
+
+#else
+
+#  include "os_memory_stdc.h"
+
+#endif
+
+#endif /* _OS_MEMORY_H_ */
diff --git a/src/gallium/auxiliary/os/os_memory_aligned.h b/src/gallium/auxiliary/os/os_memory_aligned.h
new file mode 100644
index 00000000000..72c5cf65b66
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_memory_aligned.h
@@ -0,0 +1,72 @@
+/**************************************************************************
+ * 
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/*
+ * Memory alignment wrappers.
+ */
+
+
+#ifndef _OS_MEMORY_H_
+#error "Must not be included directly. Include os_memory.h instead"
+#endif
+
+
+#include "pipe/p_compiler.h"
+
+
+/**
+ * Return size bytes aligned to alignment (presumably a power of two: the mask below relies on it), or NULL if os_malloc() fails.
+ */
+static INLINE void *
+os_malloc_aligned(size_t size, size_t alignment)
+{
+   char *ptr, *buf;
+
+   ptr = (char *) os_malloc(size + alignment + sizeof(void *)); /* extra room for alignment padding plus one saved pointer */
+   if (!ptr)
+      return NULL;
+
+   buf = (char *)(((uintptr_t)ptr + sizeof(void *) + alignment - 1) & ~((uintptr_t)(alignment - 1)));
+   *(char **)(buf - sizeof(void *)) = ptr; /* stash the real allocation address just below buf */
+
+   return buf;
+}
+
+
+/**
+ * Free memory returned by os_malloc_aligned(); a NULL ptr is ignored.
+ */
+static INLINE void
+os_free_aligned(void *ptr)
+{
+   if (ptr) {
+      void **cubbyHole = (void **) ((char *) ptr - sizeof(void *)); /* slot directly below ptr, one pointer wide */
+      void *realAddr = *cubbyHole; /* address stored in that slot; this is what gets passed to os_free() */
+      os_free(realAddr);
+   }
+}
diff --git a/src/gallium/auxiliary/os/os_memory_debug.h b/src/gallium/auxiliary/os/os_memory_debug.h
new file mode 100644
index 00000000000..c664be9aad5
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_memory_debug.h
@@ -0,0 +1,83 @@
+/**************************************************************************
+ * 
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/*
+ * Debugging wrappers for OS memory management abstractions.
+ */
+
+
+#ifndef _OS_MEMORY_H_
+#error "Must not be included directly. Include os_memory.h instead"
+#endif
+
+
+#include "pipe/p_compiler.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+void *
+debug_malloc(const char *file, unsigned line, const char *function,
+             size_t size);
+
+void *
+debug_calloc(const char *file, unsigned line, const char *function,
+             size_t count, size_t size );
+
+void
+debug_free(const char *file, unsigned line, const char *function,
+           void *ptr);
+
+void *
+debug_realloc(const char *file, unsigned line, const char *function,
+              void *old_ptr, size_t old_size, size_t new_size );
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#ifndef DEBUG_MEMORY_IMPLEMENTATION
+
+#define os_malloc( _size ) \
+   debug_malloc( __FILE__, __LINE__, __FUNCTION__, _size )
+#define os_calloc( _count, _size ) \
+   debug_calloc(__FILE__, __LINE__, __FUNCTION__, _count, _size )
+#define os_free( _ptr ) \
+   debug_free( __FILE__, __LINE__, __FUNCTION__,  _ptr )
+#define os_realloc( _ptr, _old_size, _new_size ) \
+   debug_realloc( __FILE__, __LINE__, __FUNCTION__,  _ptr, _old_size, _new_size )
+
+/* TODO: wrap os_malloc_aligned() and os_free_aligned() too */
+#include "os_memory_aligned.h"
+
+#endif /* !DEBUG_MEMORY_IMPLEMENTATION */
diff --git a/src/gallium/auxiliary/os/os_memory_stdc.h b/src/gallium/auxiliary/os/os_memory_stdc.h
new file mode 100644
index 00000000000..806e5363568
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_memory_stdc.h
@@ -0,0 +1,76 @@
+/**************************************************************************
+ * 
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/*
+ * OS memory management abstractions for the standard C library.
+ */
+
+
+#ifndef _OS_MEMORY_H_
+#error "Must not be included directly. Include os_memory.h instead"
+#endif
+
+#include <stdlib.h>
+
+#include "pipe/p_compiler.h"
+
+
+#define os_malloc(_size)  malloc(_size)
+#define os_calloc(_count, _size )  calloc(_count, _size )
+#define os_free(_ptr)  free(_ptr)
+
+#define os_realloc( _old_ptr, _old_size, _new_size) \
+   realloc(_old_ptr, _new_size + 0*(_old_size))
+
+
+#if defined(HAVE_POSIX_MEMALIGN)
+
+static INLINE void *
+os_malloc_aligned(size_t size, size_t alignment)
+{ /* Aligned allocation through posix_memalign(); returns NULL when it reports an error. */
+   void *ptr;
+   alignment = (alignment + sizeof(void*) - 1) & ~(sizeof(void*) - 1); /* round alignment up to a multiple of sizeof(void*) */
+   if(posix_memalign(&ptr, alignment, size) != 0)
+      return NULL; /* non-zero result: ptr is not used */
+   return ptr;
+}
+
+#define os_free_aligned(_ptr) free(_ptr)
+
+#elif defined(PIPE_OS_WINDOWS)
+
+#include <malloc.h>
+
+#define os_malloc_aligned(_size, _align) _aligned_malloc(_size, _align)
+#define os_free_aligned(_ptr) _aligned_free(_ptr)
+
+#else
+
+#include "os_memory_aligned.h"
+
+#endif
diff --git a/src/gallium/auxiliary/os/os_memory_win32k.h b/src/gallium/auxiliary/os/os_memory_win32k.h
new file mode 100644
index 00000000000..d56d6908722
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_memory_win32k.h
@@ -0,0 +1,123 @@
+/**************************************************************************
+ * 
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/*
+ * OS memory management abstractions for Windows kernel.
+ */
+
+
+#ifndef _OS_MEMORY_H_
+#error "Must not be included directly. Include os_memory.h instead"
+#endif
+
+
+#include "pipe/p_compiler.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
+
+/*
+ * Display driver allocator entry points, declared here by hand rather than
+ * through a system header.
+ */
+void * __stdcall
+EngAllocMem(unsigned long Flags,
+            unsigned long MemSize,
+            unsigned long Tag);
+
+void __stdcall
+EngFreeMem(void *Mem);
+
+/*
+ * All allocations carry the 'D3AG' tag. os_calloc passes Flags = 1 where
+ * os_malloc passes 0 -- presumably the zero-fill flag; confirm against the
+ * EngAllocMem documentation. _os_free is the raw release used by os_free().
+ */
+#define os_malloc(_size) EngAllocMem(0, _size, 'D3AG')
+#define os_calloc(_count, _size) EngAllocMem(1, (_count)*(_size), 'D3AG')
+#define _os_free(_ptr) EngFreeMem(_ptr)
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+
+/*
+ * Miniport allocator entry points, declared here by hand rather than
+ * through a system header.
+ */
+void *
+ExAllocatePool(unsigned long PoolType,
+               size_t NumberOfBytes);
+
+void 
+ExFreePool(void *P);
+
+/*
+ * os_malloc always requests PoolType 0 (NOTE(review): presumably the
+ * non-paged pool -- confirm). _os_free is the raw release used by os_free().
+ */
+#define os_malloc(_size) ExAllocatePool(0, _size)
+#define _os_free(_ptr) ExFreePool(_ptr)
+
+/**
+ * Allocate a zero-filled array of count elements of size bytes each.
+ *
+ * Returns NULL if the allocation fails or if count * size does not fit in
+ * an unsigned (which would otherwise yield an undersized buffer).
+ */
+static INLINE void *
+os_calloc(unsigned count, unsigned size)
+{
+   unsigned total;
+   void *ptr;
+
+   /* Reject requests whose byte count would wrap around. */
+   if (size != 0 && count > ~0u / size) {
+      return NULL;
+   }
+
+   total = count * size;
+   ptr = os_malloc(total);
+   if (ptr) {
+      memset(ptr, 0, total);
+   }
+   return ptr;
+}
+
+#else
+
+#error "Unsupported subsystem"
+
+#endif
+
+
+/**
+ * Release memory obtained from os_malloc()/os_calloc().
+ *
+ * A NULL pointer is ignored and never handed to the underlying free routine.
+ */
+static INLINE void
+os_free( void *ptr )
+{
+   if (!ptr)
+      return;
+
+   _os_free(ptr);
+}
+
+
+/**
+ * Resize an allocation by allocating a new block, copying and freeing.
+ *
+ * @param old_ptr   existing block, may be NULL
+ * @param old_size  number of valid bytes in old_ptr
+ * @param new_size  requested size; 0 frees old_ptr and returns NULL
+ *
+ * @return the new block, or NULL on failure. As with the standard realloc(),
+ * the old block is left untouched when the new allocation fails, so the
+ * caller may keep using it.
+ */
+static INLINE void *
+os_realloc(void *old_ptr, unsigned old_size, unsigned new_size)
+{
+   void *new_ptr;
+   unsigned copy_size;
+
+   if (new_size == 0) {
+      os_free(old_ptr);
+      return NULL;
+   }
+
+   new_ptr = os_malloc( new_size );
+   if (!new_ptr) {
+      /* Allocation failed: do not free the caller's data. */
+      return NULL;
+   }
+
+   copy_size = old_size < new_size ? old_size : new_size;
+   if (old_ptr && copy_size) {
+      memcpy(new_ptr, old_ptr, copy_size);
+   }
+
+   os_free(old_ptr);
+
+   return new_ptr;
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#include "os_memory_aligned.h"
diff --git a/src/gallium/auxiliary/os/os_misc.c b/src/gallium/auxiliary/os/os_misc.c
new file mode 100644
index 00000000000..384988017b7
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_misc.c
@@ -0,0 +1,188 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2010 Vmware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "os_misc.h"
+
+#include <stdarg.h>
+
+
+#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
+
+#include <windows.h>
+#include <winddi.h>
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+
+#include <stdio.h> 
+#include <stdlib.h> 
+#include <windows.h> 
+#include <types.h> 
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN      // Exclude rarely-used stuff from Windows headers
+#endif
+#include <windows.h>
+#include <stdio.h>
+
+#else
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#endif
+
+
+#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
+/*
+ * Variadic convenience wrapper: packs the arguments into a va_list and
+ * forwards them to EngDebugPrint() with an empty first (prefix) argument.
+ */
+static INLINE void 
+_EngDebugPrint(const char *format, ...)
+{
+   va_list ap;
+   va_start(ap, format);
+   EngDebugPrint("", (PCHAR)format, ap);
+   va_end(ap);
+}
+#endif
+
+
+/**
+ * Output a message through the platform's debug/log channel.
+ *
+ * The message is always emitted verbatim; it is never interpreted as a
+ * printf-style format string.
+ */
+void
+os_log_message(const char *message)
+{
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
+   _EngDebugPrint("%s", message);
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+   OutputDebugStringA(message);
+   if(GetConsoleWindow() && !IsDebuggerPresent()) {
+      fflush(stdout);
+      fputs(message, stderr);
+      fflush(stderr);
+   }
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+   wchar_t *wide_message;
+   long wide_str_len;
+   /* Message is ascii - needs to be converted to wchar_t for printing */
+   wide_str_len = MultiByteToWideChar(CP_ACP, 0, message, -1, NULL, 0);
+   if (wide_str_len <= 0)
+      return;
+   wide_message = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t));
+   if (wide_message) {
+      if (MultiByteToWideChar(CP_ACP, 0, message, -1,
+            wide_message, wide_str_len) > 0) {
+         /* Print through an explicit "%s" so that '%' characters in the
+          * message are not treated as conversion specifiers. */
+         NKDbgPrintfW(L"%s", wide_message);
+      }
+      free(wide_message);
+   }
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+   /* TODO */
+#else /* !PIPE_SUBSYSTEM_WINDOWS */
+   fflush(stdout);
+   fputs(message, stderr);
+#endif
+}
+
+
+#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
+/*
+ * Scan [start, end) for character c; a NULL end means "no upper bound".
+ * The scan gives up (returning NULL) at the first character whose value is
+ * below 32 that is not c itself.
+ */
+static const char *
+find(const char *start, const char *end, char c)
+{
+   const char *cur = start;
+
+   while(end == NULL || cur != end) {
+      if(*cur == c)
+         return cur;
+      if(*cur < 32)
+         return NULL;
+      ++cur;
+   }
+
+   return NULL;
+}
+
+/*
+ * Return non-zero when the range [start, end) is exactly equal to the
+ * NUL-terminated string s, zero otherwise.
+ */
+static int
+compare(const char *start, const char *end, const char *s)
+{
+   const char *lhs = start;
+   const char *rhs = s;
+
+   while(lhs != end && *rhs != '\0') {
+      if(*lhs != *rhs)
+         return 0;
+      ++lhs;
+      ++rhs;
+   }
+
+   /* Equal only if both the range and the string were fully consumed. */
+   return lhs == end && *rhs == '\0';
+}
+
+/*
+ * Copy the range [start, end) into dst as a NUL-terminated string, writing
+ * at most n bytes including the terminator (longer input is truncated).
+ * With n == 0 nothing is written at all.
+ */
+static void
+copy(char *dst, const char *start, const char *end, size_t n)
+{
+   const char *p;
+   char *q;
+
+   /* n - 1 below would wrap around for an empty destination. */
+   if(n == 0)
+      return;
+
+   for(p = start, q = dst, n = n - 1; p != end && n; ++p, ++q, --n)
+      *q = *p;
+   *q = '\0';
+}
+#endif
+
+
+/*
+ * Look up a configuration option by name.
+ *
+ * - Windows display subsystem, DEBUG builds: parses "name=value" lines from
+ *   c:\gallium.cfg and returns the value in a function-static buffer, so the
+ *   result is overwritten by the next successful lookup.
+ * - Windows display release builds, Windows CE, miniport: always NULL.
+ * - Everything else: getenv(name).
+ */
+const char *
+os_get_option(const char *name)
+{
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
+   /* EngMapFile creates the file if it does not exist, so it must either be
+    * disabled on release versions (or put in a less conspicuous place). */
+#ifdef DEBUG
+   const char *result = NULL;
+   ULONG_PTR iFile = 0;
+   const void *pMap = NULL;
+   const char *sol, *eol, *sep;
+   static char output[1024];
+   
+   pMap = EngMapFile(L"\\??\\c:\\gallium.cfg", 0, &iFile);
+   if(pMap) {
+      sol = (const char *)pMap;
+      while(1) {
+	 /* TODO: handle LF line endings */
+	 /* NOTE(review): the end-of-line search has no upper bound; it relies
+	  * on find() stopping at a control character -- confirm that the
+	  * mapping is always terminated that way. */
+	 eol = find(sol, NULL, '\r');
+	 if(!eol || eol == sol)
+	    break;
+	 sep = find(sol, eol, '=');
+	 if(!sep)
+	    break;
+	 if(compare(sol, sep, name)) {
+	    copy(output, sep + 1, eol, sizeof(output));
+	    result = output;
+	    break;
+	 }
+	 /* skip the two-character CR LF line terminator */
+	 sol = eol + 2;
+      }
+      EngUnmapFile(iFile);
+   }
+   return result;
+#else
+   return NULL;
+#endif
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) 
+   /* TODO: implement */
+   return NULL;
+#else
+   return getenv(name);
+#endif
+}
+
diff --git a/src/gallium/auxiliary/os/os_misc.h b/src/gallium/auxiliary/os/os_misc.h
new file mode 100644
index 00000000000..d59f9819fec
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_misc.h
@@ -0,0 +1,99 @@
+/**************************************************************************
+ *
+ * Copyright 2010 Vmware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/*
+ * Miscellaneous OS services.
+ */
+
+
+#ifndef _OS_MISC_H_
+#define _OS_MISC_H_
+
+
+#include "pipe/p_compiler.h"
+
+
+#if defined(PIPE_OS_UNIX)
+#  include <signal.h> /* for kill() */
+#  include <unistd.h> /* getpid() */
+#endif
+
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * Trap into the debugger.
+ *
+ * Selection order: int3 on x86/x86-64 with GCC, __debugbreak() with MSVC,
+ * SIGTRAP sent to the current process on Unix, an externally provided
+ * function on embedded targets, and abort() as the last resort.
+ * NOTE(review): the abort() fallback needs <stdlib.h>, which this header
+ * does not include itself -- presumably supplied via p_compiler.h; confirm.
+ */
+#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && defined(PIPE_CC_GCC)
+#  define os_break() __asm("int3")
+#elif defined(PIPE_CC_MSVC)
+#  define os_break()  __debugbreak()
+#elif defined(PIPE_OS_UNIX)
+#  define os_break() kill(getpid(), SIGTRAP)
+#elif defined(PIPE_OS_EMBEDDED)
+void os_break(void);
+#else
+#  define os_break() abort()
+#endif
+
+
+/*
+ * Abort the program.
+ *
+ * In DEBUG builds and on the Windows display/miniport subsystems this maps
+ * to os_break(), i.e. it traps into the debugger instead of terminating.
+ * Embedded targets provide their own function; everything else uses abort().
+ */
+#if defined(DEBUG) || defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+#  define os_abort() os_break()
+#elif defined(PIPE_OS_EMBEDDED)
+void os_abort(void);
+#else
+#  define os_abort() abort()
+#endif
+
+
+/*
+ * Output a message. Message should preferably end in a newline.
+ *
+ * The message is a plain NUL-terminated string, not a format string.
+ */
+void
+os_log_message(const char *message);
+
+
+/*
+ * Get an option. Should return NULL if specified option is not set.
+ *
+ * The returned string is not owned by the caller and must not be freed.
+ */
+const char *
+os_get_option(const char *name);
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+
+#endif /* _OS_MISC_H_ */
diff --git a/src/gallium/auxiliary/os/os_stream.h b/src/gallium/auxiliary/os/os_stream.h
new file mode 100644
index 00000000000..693a0621e2d
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_stream.h
@@ -0,0 +1,122 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Cross-platform sequential access stream abstraction.
+ */
+
+#ifndef _OS_STREAM_H_
+#define _OS_STREAM_H_
+
+
+#include "pipe/p_compiler.h"
+
+
+/**
+ * OS stream (FILE, socket, etc) abstraction.
+ *
+ * A stream is a table of function pointers; concrete implementations embed
+ * this struct as their first member and cast back inside the callbacks.
+ */
+struct os_stream
+{
+   /** Release the stream and whatever resources it holds. */
+   void
+   (*close)(struct os_stream *stream);
+
+   /** Write size bytes from data; returns TRUE on success. */
+   boolean
+   (*write)(struct os_stream *stream, const void *data, size_t size);
+
+   /** Push any buffered output to the underlying sink. */
+   void
+   (*flush)(struct os_stream *stream);
+};
+
+
+/**
+ * Close a stream through its close callback. A NULL stream is ignored.
+ */
+static INLINE void
+os_stream_close(struct os_stream *stream)
+{
+   if (stream)
+      stream->close(stream);
+}
+
+
+/**
+ * Write size bytes from data to the stream.
+ *
+ * Returns FALSE for a NULL stream, otherwise the write callback's result.
+ */
+static INLINE boolean
+os_stream_write(struct os_stream *stream, const void *data, size_t size)
+{
+   return stream ? stream->write(stream, data, size) : FALSE;
+}
+
+
+/**
+ * Write a NUL-terminated string (without its terminator) to the stream.
+ *
+ * Returns FALSE for a NULL stream, otherwise the write callback's result.
+ * The length is computed by hand, so no string library call is required.
+ */
+static INLINE boolean
+os_stream_write_str(struct os_stream *stream, const char *str)
+{
+   size_t len = 0;
+
+   if (!stream)
+      return FALSE;
+
+   while (str[len] != '\0')
+      ++len;
+
+   return stream->write(stream, str, len);
+}
+
+
+/**
+ * Flush a stream through its flush callback.
+ *
+ * A NULL stream is ignored, matching os_stream_close() and
+ * os_stream_write().
+ */
+static INLINE void
+os_stream_flush(struct os_stream *stream)
+{
+   if (!stream)
+      return;
+
+   stream->flush(stream);
+}
+
+
+/** Create a stream that writes to the named file; NULL on failure. */
+struct os_stream *
+os_file_stream_create(const char *filename);
+
+
+/** Return a stream that accepts and discards everything written to it. */
+struct os_stream *
+os_null_stream_create(void);
+
+
+/** Stream that forwards everything written to it to os_log_message(). */
+extern struct os_stream *
+os_log_stream;
+
+
+/** Create a stream that accumulates written data in a growable buffer. */
+struct os_stream *
+os_str_stream_create(size_t initial_size);
+
+
+/**
+ * NUL-terminate and return the string accumulated so far. The buffer is
+ * still owned by the stream.
+ */
+const char *
+os_str_stream_get(struct os_stream *stream);
+
+/**
+ * NUL-terminate and return the accumulated string, then destroy the stream
+ * object only. The string itself is not freed and is handed to the caller.
+ */
+char *
+os_str_stream_get_and_close(struct os_stream *stream);
+
+
+/* The display subsystem has no file streams: file output is discarded. */
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
+#define os_file_stream_create(_filename) os_null_stream_create()
+#endif
+
+
+#endif /* _OS_STREAM_H_ */
diff --git a/src/gallium/auxiliary/os/os_stream_log.c b/src/gallium/auxiliary/os/os_stream_log.c
new file mode 100644
index 00000000000..7cc2028a22c
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_stream_log.c
@@ -0,0 +1,81 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Debug logging stream implementation.
+ */
+
+#include "os_memory.h"
+#include "os_misc.h"
+#include "os_stream.h"
+
+
+/**
+ * Close callback of the log stream: nothing to release.
+ */
+static void
+os_log_stream_close(struct os_stream *stream)
+{
+   (void) stream; /* unused */
+}
+
+
+/**
+ * Write callback of the log stream.
+ *
+ * The data is copied into a temporary NUL-terminated buffer, because
+ * os_log_message() expects a C string, and then logged. Returns FALSE only
+ * when the temporary buffer cannot be allocated.
+ */
+static boolean
+os_log_stream_write(struct os_stream *stream, const void *data, size_t size)
+{
+   char *buffer = os_malloc(size + 1);
+
+   if (buffer == NULL)
+      return FALSE;
+
+   memcpy(buffer, data, size);
+   buffer[size] = '\0';
+
+   os_log_message(buffer);
+   os_free(buffer);
+
+   return TRUE;
+}
+
+
+/**
+ * Flush callback of the log stream: nothing is buffered, so nothing to do.
+ */
+static void
+os_log_stream_flush(struct os_stream *stream)
+{
+   (void) stream; /* unused */
+}
+
+
+/* Callback table of the log stream; order matches struct os_stream. */
+static struct os_stream
+os_log_stream_struct = {
+   &os_log_stream_close,
+   &os_log_stream_write,
+   &os_log_stream_flush
+};
+
+
+/* Public handle to the statically allocated log stream. */
+struct os_stream *
+os_log_stream = &os_log_stream_struct;
diff --git a/src/gallium/auxiliary/util/u_stream.h b/src/gallium/auxiliary/os/os_stream_null.c
index a9d0f0121a6..128c4e8f0e0 100644
--- a/src/gallium/auxiliary/util/u_stream.h
+++ b/src/gallium/auxiliary/os/os_stream_null.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2008-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -27,35 +27,46 @@
 
 /**
  * @file
- * Cross-platform sequential access stream abstraction.
+ * Null stream implementation.
  */
 
-#ifndef U_STREAM_H
-#define U_STREAM_H
+#include "os_memory.h"
+#include "os_stream.h"
 
 
-#include "pipe/p_compiler.h"
+/** Close callback of the null stream: the stream is static, nothing to free. */
+static void
+os_null_stream_close(struct os_stream *stream)
+{
+   (void) stream; /* unused */
+}
 
 
-struct util_stream;
+/** Write callback of the null stream: discards the data, reports success. */
+static boolean
+os_null_stream_write(struct os_stream *stream, const void *data, size_t size)
+{
+   (void) stream;
+   (void) data;
+   (void) size;
+
+   return TRUE;
+}
 
 
-/**
- * Create a stream
- * @param filename relative or absolute path (necessary for windows)  
- * @param optional maximum file size (0 for a growable size).
- */
-struct util_stream *
-util_stream_create(const char *filename, size_t max_size);
-
-boolean
-util_stream_write(struct util_stream *stream, const void *data, size_t size);
+/** Flush callback of the null stream: nothing to flush. */
+static void
+os_null_stream_flush(struct os_stream *stream)
+{
+   (void) stream; /* unused */
+}
 
-void
-util_stream_flush(struct util_stream *stream);
 
-void
-util_stream_close(struct util_stream *stream);
+/* The single, statically allocated null stream; order matches struct os_stream. */
+static struct os_stream
+os_null_stream = {
+   &os_null_stream_close,
+   &os_null_stream_write,
+   &os_null_stream_flush
+};
 
 
-#endif /* U_STREAM_H */
+/**
+ * Return the shared null stream. No allocation takes place; every call
+ * yields the same static object.
+ */
+struct os_stream *
+os_null_stream_create(void)
+{
+   return &os_null_stream;
+}
diff --git a/src/gallium/auxiliary/util/u_stream_stdc.c b/src/gallium/auxiliary/os/os_stream_stdc.c
index 5cd05b29047..9e7ed711076 100644
--- a/src/gallium/auxiliary/util/u_stream_stdc.c
+++ b/src/gallium/auxiliary/os/os_stream_stdc.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2008-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -18,7 +18,7 @@
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -32,74 +32,81 @@
 
 #include "pipe/p_config.h"
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_HAIKU)
+#if defined(PIPE_OS_UNIX) || defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 
+#include <stdlib.h>
 #include <stdio.h>
 
-#include "util/u_memory.h"
+#include "os_stream.h"
 
-#include "u_stream.h"
 
-
-struct util_stream 
+struct os_stdc_stream
 {
+   struct os_stream base;
+
    FILE *file;
 };
 
 
-struct util_stream *
-util_stream_create(const char *filename, size_t max_size)
+static INLINE struct os_stdc_stream *
+os_stdc_stream(struct os_stream *stream)
 {
-   struct util_stream *stream;
-   
-   (void)max_size;
-   
-   stream = CALLOC_STRUCT(util_stream);
-   if(!stream)
-      goto error1;
-   
-   stream->file = fopen(filename, "w");
-   if(!stream->file)
-      goto error2;
-   
-   return stream;
-   
-error2:
-   FREE(stream);
-error1:
-   return NULL;
+   return (struct os_stdc_stream *)stream;
 }
 
 
-boolean
-util_stream_write(struct util_stream *stream, const void *data, size_t size)
+/** Close callback: closes the FILE and frees the stream object. */
+static void
+os_stdc_stream_close(struct os_stream *_stream)
 {
-   if(!stream)
-      return FALSE;
-   
+   struct os_stdc_stream *stream = os_stdc_stream(_stream);
+
+   fclose(stream->file);
+
+   free(stream);
+}
+
+
+/**
+ * Write callback: writes size bytes from data to the FILE.
+ *
+ * fwrite() returns the number of items written, so the data is written as
+ * size items of one byte each and the result compared against size.
+ */
+static boolean
+os_stdc_stream_write(struct os_stream *_stream, const void *data, size_t size)
+{
+   struct os_stdc_stream *stream = os_stdc_stream(_stream);
+
-   return fwrite(data, size, 1, stream->file) == size ? TRUE : FALSE;
+   return fwrite(data, 1, size, stream->file) == size ? TRUE : FALSE;
 }
 
 
-void
-util_stream_flush(struct util_stream *stream) 
+/** Flush callback: flushes the underlying FILE. */
+static void
+os_stdc_stream_flush(struct os_stream *_stream)
 {
-   if(!stream)
-      return;
-   
+   struct os_stdc_stream *stream = os_stdc_stream(_stream);
+
    fflush(stream->file);
 }
 
 
-void
-util_stream_close(struct util_stream *stream) 
+/**
+ * Create a stream backed by a C library FILE opened with mode "w".
+ *
+ * Returns NULL if either the stream object cannot be allocated or the file
+ * cannot be opened; on success the stream is released through its close
+ * callback.
+ */
+struct os_stream *
+os_file_stream_create(const char *filename)
 {
+   struct os_stdc_stream *stream;
+
+   stream = (struct os_stdc_stream *)calloc(1, sizeof(*stream));
    if(!stream)
-      return;
-   
-   fclose(stream->file);
+      goto no_stream;
 
-   FREE(stream);
+   stream->base.close = &os_stdc_stream_close;
+   stream->base.write = &os_stdc_stream_write;
+   stream->base.flush = &os_stdc_stream_flush;
+
+   stream->file = fopen(filename, "w");
+   if(!stream->file)
+      goto no_file;
+
+   return &stream->base;
+
+no_file:
+   free(stream);
+no_stream:
+   return NULL;
 }
 
 
diff --git a/src/gallium/auxiliary/os/os_stream_str.c b/src/gallium/auxiliary/os/os_stream_str.c
new file mode 100644
index 00000000000..b5c7270d2ae
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_stream_str.c
@@ -0,0 +1,166 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Malloc string stream implementation.
+ */
+
+#include "pipe/p_config.h"
+
+#include "os_memory.h"
+#include "os_stream.h"
+
+
+/*
+ * String stream: a struct os_stream that appends written data to a heap
+ * buffer.
+ */
+struct os_str_stream
+{
+   /* must stay first: callbacks cast struct os_stream* back to this type */
+   struct os_stream base;
+
+   /* heap buffer holding the data written so far */
+   char *str;
+
+   /* allocated size of str, in bytes */
+   size_t size;
+   /* number of bytes written into str (terminator not counted) */
+   size_t written;
+};
+
+
+/** Downcast from the generic stream to the string stream that embeds it. */
+static INLINE struct os_str_stream *
+os_str_stream(struct os_stream *stream)
+{
+   struct os_str_stream *str_stream = (struct os_str_stream *)stream;
+
+   return str_stream;
+}
+
+
+/** Close callback: frees the string buffer and then the stream object. */
+static void
+os_str_stream_close(struct os_stream *_stream)
+{
+   struct os_str_stream *str_stream = os_str_stream(_stream);
+   char *buffer = str_stream->str;
+
+   os_free(buffer);
+   os_free(str_stream);
+}
+
+
+/**
+ * Write callback: append size bytes from data to the string buffer.
+ *
+ * The buffer is grown by doubling until it can hold the data plus one byte
+ * reserved for the NUL terminator. If it cannot be grown (allocation failure
+ * or size overflow), as much data as still fits is appended and FALSE is
+ * returned.
+ */
+static boolean
+os_str_stream_write(struct os_stream *_stream, const void *data, size_t size)
+{
+   struct os_str_stream *stream = os_str_stream(_stream);
+   size_t capacity_left;
+   boolean ret = TRUE;
+
+   /* Bytes that still fit while keeping one byte for the terminator. */
+   capacity_left = stream->size > stream->written ?
+                   stream->size - stream->written - 1 : 0;
+
+   if (size > capacity_left) {
+      size_t minimum_size = stream->written + size + 1;
+      /* Start from 1 so that doubling makes progress on an empty buffer. */
+      size_t new_size = stream->size ? stream->size : 1;
+      char * new_str = NULL;
+
+      /* minimum_size <= written means the sum above wrapped around. */
+      if (minimum_size > stream->written) {
+         while (new_size < minimum_size) {
+            if (new_size > (size_t)-1 / 2) {
+               /* Doubling would overflow: ask for exactly what is needed. */
+               new_size = minimum_size;
+               break;
+            }
+            new_size *= 2;
+         }
+
+         new_str = os_realloc(stream->str, stream->size, new_size);
+      }
+
+      if (new_str) {
+         stream->str = new_str;
+         stream->size = new_size;
+      }
+      else {
+         size = capacity_left;
+         ret = FALSE;
+      }
+   }
+
+   if (size) {
+      memcpy(stream->str + stream->written, data, size);
+      stream->written += size;
+   }
+
+   return ret;
+}
+
+
+/** Flush callback: data is already in memory, nothing to do. */
+static void
+os_str_stream_flush(struct os_stream *stream)
+{
+   (void) stream; /* unused */
+}
+
+
+/**
+ * Create a string stream with an initial buffer of size bytes.
+ *
+ * A size of 0 is bumped to 1 so that the buffer can always hold the NUL
+ * terminator written by os_str_stream_get()/os_str_stream_get_and_close().
+ *
+ * Returns NULL if either the stream object or the buffer cannot be
+ * allocated.
+ */
+struct os_stream *
+os_str_stream_create(size_t size)
+{
+   struct os_str_stream *stream;
+
+   if(size == 0)
+      size = 1;
+
+   stream = (struct os_str_stream *)os_calloc(1, sizeof(*stream));
+   if(!stream)
+      goto no_stream;
+
+   stream->base.close = &os_str_stream_close;
+   stream->base.write = &os_str_stream_write;
+   stream->base.flush = &os_str_stream_flush;
+
+   stream->str = os_malloc(size);
+   if(!stream->str)
+      goto no_str;
+
+   stream->size = size;
+
+   return &stream->base;
+
+no_str:
+   os_free(stream);
+no_stream:
+   return NULL;
+}
+
+
+/**
+ * NUL-terminate the accumulated data and return it.
+ *
+ * The buffer remains owned by the stream. Returns NULL for a NULL stream.
+ */
+const char *
+os_str_stream_get(struct os_stream *_stream)
+{
+   struct os_str_stream *str_stream = os_str_stream(_stream);
+   char *buffer;
+
+   if (str_stream == NULL)
+      return NULL;
+
+   buffer = str_stream->str;
+   buffer[str_stream->written] = '\0';
+
+   return buffer;
+}
+
+
+/**
+ * NUL-terminate the accumulated data, free the stream object and return the
+ * string.
+ *
+ * Only the stream object is freed here; the returned buffer is not.
+ * Returns NULL for a NULL stream.
+ */
+char *
+os_str_stream_get_and_close(struct os_stream *_stream)
+{
+   struct os_str_stream *str_stream = os_str_stream(_stream);
+   char *result;
+
+   if (str_stream == NULL)
+      return NULL;
+
+   result = str_stream->str;
+   result[str_stream->written] = '\0';
+
+   os_free(str_stream);
+
+   return result;
+}
diff --git a/src/gallium/auxiliary/os/os_thread.h b/src/gallium/auxiliary/os/os_thread.h
new file mode 100644
index 00000000000..a04df4106f2
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_thread.h
@@ -0,0 +1,443 @@
+/**************************************************************************
+ * 
+ * Copyright 1999-2006 Brian Paul
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * @file
+ * 
+ * Thread, mutex, condition variable, barrier, semaphore and
+ * thread-specific data functions.
+ */
+
+
+#ifndef OS_THREAD_H_
+#define OS_THREAD_H_
+
+
+#include "pipe/p_compiler.h"
+#include "util/u_debug.h" /* for assert */
+
+
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+
+#include <pthread.h> /* POSIX threads headers */
+#include <stdio.h> /* for perror() */
+
+#define PIPE_THREAD_HAVE_CONDVAR
+
+/* pipe_thread
+ */
+typedef pthread_t pipe_thread;
+
+#define PIPE_THREAD_ROUTINE( name, param ) \
+   void *name( void *param )
+
+static INLINE pipe_thread pipe_thread_create( void *(* routine)( void *), void *param )
+{
+   pipe_thread handle;
+   const int err = pthread_create( &handle, NULL, routine, param );
+   /* a zero handle is how callers are told that no thread was started */
+   return err ? 0 : handle;
+}
+
+static INLINE int pipe_thread_wait( pipe_thread thread )
+{
+   return pthread_join( thread, NULL ); /* NULL: the thread's exit value is not collected */
+}
+
+static INLINE int pipe_thread_destroy( pipe_thread thread )
+{
+   return pthread_detach( thread ); /* detaches rather than joins: does not wait for the thread */
+}
+
+
+/* pipe_mutex: a pthread mutex; the macros take the mutex object itself
+ * (not a pointer to it) and apply '&' internally. */
+typedef pthread_mutex_t pipe_mutex;
+
+#define pipe_static_mutex(mutex) \
+   static pipe_mutex mutex = PTHREAD_MUTEX_INITIALIZER
+
+#define pipe_mutex_init(mutex) \
+   (void) pthread_mutex_init(&(mutex), NULL)
+
+#define pipe_mutex_destroy(mutex) \
+   pthread_mutex_destroy(&(mutex))
+
+#define pipe_mutex_lock(mutex) \
+   (void) pthread_mutex_lock(&(mutex))
+
+#define pipe_mutex_unlock(mutex) \
+   (void) pthread_mutex_unlock(&(mutex))
+
+
+/* pipe_condvar: a pthread condition variable; the macros take the objects
+ * themselves (not pointers to them) and apply '&' internally. */
+typedef pthread_cond_t pipe_condvar;
+
+#define pipe_static_condvar(cond) \
+   static pipe_condvar cond = PTHREAD_COND_INITIALIZER
+
+#define pipe_condvar_init(cond) \
+   pthread_cond_init(&(cond), NULL)
+
+#define pipe_condvar_destroy(cond) \
+   pthread_cond_destroy(&(cond))
+
+#define pipe_condvar_wait(cond, mutex) \
+   pthread_cond_wait(&(cond), &(mutex))
+
+#define pipe_condvar_signal(cond) \
+   pthread_cond_signal(&(cond))
+
+#define pipe_condvar_broadcast(cond) \
+   pthread_cond_broadcast(&(cond))
+
+
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+
+#include <windows.h>
+
+/* pipe_thread
+ */
+typedef HANDLE pipe_thread;
+
+#define PIPE_THREAD_ROUTINE( name, param ) \
+   void * WINAPI name( void *param )
+
+static INLINE pipe_thread pipe_thread_create( void *(WINAPI * routine)( void *), void *param )
+{
+   DWORD id; /* receives the new thread's id, which is not used afterwards */
+   return CreateThread( NULL, 0, (LPTHREAD_START_ROUTINE) routine, param, 0, &id );
+}
+
+/* Wait without timeout on the thread handle; 0 on success, -1 otherwise. */
+static INLINE int pipe_thread_wait( pipe_thread thread )
+{
+   const DWORD status = WaitForSingleObject( thread, INFINITE );
+   return (status == WAIT_OBJECT_0) ? 0 : -1;
+}
+
+/* Close the thread handle; 0 on success, -1 if CloseHandle() fails. */
+static INLINE int pipe_thread_destroy( pipe_thread thread )
+{
+   const BOOL closed = CloseHandle( thread );
+   return closed ? 0 : -1;
+}
+
+
+/* pipe_mutex: a Win32 critical section; the macros take the object itself
+ * (not a pointer to it) and apply '&' internally. */
+typedef CRITICAL_SECTION pipe_mutex;
+
+#define pipe_static_mutex(mutex) \
+   /*static*/ pipe_mutex mutex = {0,0,0,0,0,0}
+
+#define pipe_mutex_init(mutex) \
+   InitializeCriticalSection(&(mutex))
+
+#define pipe_mutex_destroy(mutex) \
+   DeleteCriticalSection(&(mutex))
+
+#define pipe_mutex_lock(mutex) \
+   EnterCriticalSection(&(mutex))
+
+#define pipe_mutex_unlock(mutex) \
+   LeaveCriticalSection(&(mutex))
+
+
+/* pipe_condvar (XXX FIX THIS): placeholder only -- every operation below,
+ * including pipe_condvar_wait, is a no-op that merely evaluates its arguments. */
+typedef unsigned pipe_condvar;
+
+#define pipe_condvar_init(cond) \
+   (void) (cond)
+
+#define pipe_condvar_destroy(cond) \
+   (void) (cond)
+
+#define pipe_condvar_wait(cond, mutex) \
+   ((void) (cond), (void) (mutex))
+
+#define pipe_condvar_signal(cond) \
+   (void) (cond)
+
+#define pipe_condvar_broadcast(cond) \
+   (void) (cond)
+
+
+#else
+
+/** Dummy definitions: no thread is ever started (create returns 0) and wait/destroy return -1. */
+
+typedef unsigned pipe_thread;
+
+#define PIPE_THREAD_ROUTINE( name, param ) \
+   void * name( void *param )
+
+static INLINE pipe_thread pipe_thread_create( void *(* routine)( void *), void *param )
+{
+   return 0;
+}
+
+static INLINE int pipe_thread_wait( pipe_thread thread )
+{
+   return -1;
+}
+
+static INLINE int pipe_thread_destroy( pipe_thread thread )
+{
+   return -1;
+}
+
+typedef unsigned pipe_mutex;
+typedef unsigned pipe_condvar;
+/* Apart from the pipe_static_* declarations, the macros below do nothing: each is a (void) cast of its first argument. */
+#define pipe_static_mutex(mutex) \
+   static pipe_mutex mutex = 0
+
+#define pipe_mutex_init(mutex) \
+   (void) mutex
+
+#define pipe_mutex_destroy(mutex) \
+   (void) mutex
+
+#define pipe_mutex_lock(mutex) \
+   (void) mutex
+
+#define pipe_mutex_unlock(mutex) \
+   (void) mutex
+
+#define pipe_static_condvar(condvar) \
+   static unsigned condvar = 0
+
+#define pipe_condvar_init(condvar) \
+   (void) condvar
+
+#define pipe_condvar_destroy(condvar) \
+   (void) condvar
+
+#define pipe_condvar_wait(condvar, mutex) \
+   (void) condvar
+
+#define pipe_condvar_signal(condvar) \
+   (void) condvar
+
+#define pipe_condvar_broadcast(condvar) \
+   (void) condvar
+
+
+#endif  /* PIPE_OS_? */
+
+
+/*
+ * pipe_barrier
+ */
+
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_HAIKU)
+
+typedef pthread_barrier_t pipe_barrier;
+
+static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
+{
+   pthread_barrier_init(barrier, NULL, count); /* NULL attributes; return value ignored */
+}
+
+static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
+{
+   pthread_barrier_destroy(barrier); /* return value ignored */
+}
+
+static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
+{
+   pthread_barrier_wait(barrier); /* return value ignored */
+}
+
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+
+/* XXX FIX THIS: barriers are stubs here -- init/destroy do nothing and wait asserts. */
+typedef unsigned pipe_barrier;
+
+static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
+{
+   /* XXX we could implement barriers with a mutex and condition var */
+}
+
+static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
+{
+}
+
+static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
+{
+   assert(0);
+}
+
+
+#else
+
+typedef unsigned pipe_barrier;
+/* No barrier implementation in this branch: init, destroy and wait all assert. */
+static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count)
+{
+   /* XXX we could implement barriers with a mutex and condition var */
+   assert(0);
+}
+
+static INLINE void pipe_barrier_destroy(pipe_barrier *barrier)
+{
+   assert(0);
+}
+
+static INLINE void pipe_barrier_wait(pipe_barrier *barrier)
+{
+   assert(0);
+}
+
+
+#endif
+
+
+/*
+ * Semaphores
+ */
+
+typedef struct
+{
+   pipe_mutex mutex;   /* held around every access to counter */
+   pipe_condvar cond;  /* signalled each time counter is incremented */
+   int counter;        /* pipe_semaphore_wait() proceeds only while this is > 0 */
+} pipe_semaphore;
+
+
+static INLINE void
+pipe_semaphore_init(pipe_semaphore *sema, int init_val)
+{
+   pipe_mutex_init(sema->mutex);
+   pipe_condvar_init(sema->cond);
+   sema->counter = init_val; /* starting count; waits block while it is <= 0 */
+}
+
+static INLINE void
+pipe_semaphore_destroy(pipe_semaphore *sema)
+{
+   pipe_mutex_destroy(sema->mutex);
+   pipe_condvar_destroy(sema->cond); /* the counter is a plain int and needs no teardown */
+}
+
+/** Increment the semaphore counter under the mutex and signal the condition variable. */
+static INLINE void
+pipe_semaphore_signal(pipe_semaphore *sema)
+{
+   pipe_mutex_lock(sema->mutex);
+   sema->counter++;
+   pipe_condvar_signal(sema->cond);
+   pipe_mutex_unlock(sema->mutex);
+}
+
+/** Wait for semaphore counter to be greater than zero, then decrement it. */
+static INLINE void
+pipe_semaphore_wait(pipe_semaphore *sema)
+{
+   pipe_mutex_lock(sema->mutex);
+   while (sema->counter <= 0) {
+      pipe_condvar_wait(sema->cond, sema->mutex); /* NOTE: a no-op where pipe_condvar is only a stub */
+   }
+   sema->counter--;
+   pipe_mutex_unlock(sema->mutex);
+}
+
+
+
+/*
+ * Thread-specific data.
+ */
+
+typedef struct {
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+   pthread_key_t key;
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+   DWORD key;
+#endif
+   int initMagic; /* set to PIPE_TSD_INIT_MAGIC by pipe_tsd_init(); checked by pipe_tsd_get/set */
+} pipe_tsd;
+
+
+#define PIPE_TSD_INIT_MAGIC 0xff8adc98
+
+
+static INLINE void
+pipe_tsd_init(pipe_tsd *tsd)
+{
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+   if (pthread_key_create(&tsd->key, NULL/*free*/) != 0) {
+      perror("pthread_key_create(): failed to allocate key for thread specific data");
+      exit(-1); /* key creation failure is treated as fatal */
+   }
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+   assert(0); /* no Windows implementation in this branch */
+#endif
+   tsd->initMagic = PIPE_TSD_INIT_MAGIC; /* lets pipe_tsd_get/set skip re-initialisation */
+}
+
+/* Return the calling thread's value for this slot, initialising the slot on
+ * first use.  NOTE(review): the first-use check itself is unsynchronised. */
+static INLINE void *
+pipe_tsd_get(pipe_tsd *tsd)
+{
+   if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC)
+      pipe_tsd_init(tsd);
+
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+   return pthread_getspecific(tsd->key);
+#else
+   /* thread-specific data is only implemented on top of pthreads */
+   assert(0);
+   return NULL;
+#endif
+}
+
+/* Store `value` as the calling thread's value for this slot, initialising the
+ * slot on first use; on pthread platforms a failure is fatal (perror + exit). */
+static INLINE void
+pipe_tsd_set(pipe_tsd *tsd, void *value)
+{
+   if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
+      pipe_tsd_init(tsd);
+   }
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+   if (pthread_setspecific(tsd->key, value) != 0) {
+      perror("pthread_setspecific() failed");
+      exit(-1);
+   }
+#else
+   assert(0);
+#endif
+}
+
+
+
+#endif /* OS_THREAD_H_ */
diff --git a/src/gallium/auxiliary/os/os_time.c b/src/gallium/auxiliary/os/os_time.c
new file mode 100644
index 00000000000..6259142bec0
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_time.c
@@ -0,0 +1,128 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * OS independent time-manipulation functions.
+ * 
+ * @author Jose Fonseca <[email protected]>
+ */
+
+
+#include "pipe/p_config.h"
+
+#if !defined(PIPE_OS_EMBEDDED)
+
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
+#  include <sys/time.h> /* timeval */
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
+#  include <windows.h>
+#  include <winddi.h>
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+#  include <windows.h>
+extern VOID KeQuerySystemTime(PLARGE_INTEGER);
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+#  include <windows.h>
+#else
+#  error Unsupported OS
+#endif
+
+#include "os_time.h"
+
+
+/* Current time in microseconds, counted from an unspecified base. */
+int64_t
+os_time_get(void)
+{
+#if defined(PIPE_OS_UNIX)
+   struct timeval tv;
+   gettimeofday(&tv, NULL);
+   return tv.tv_usec + tv.tv_sec*1000000LL;
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
+   /* Scale whole seconds and the remainder separately: counter*1000000 would
+    * overflow int64 once the raw counter exceeds roughly 9.2e12 ticks. */
+   static LONGLONG frequency;
+   LONGLONG counter;
+   if(!frequency)
+      EngQueryPerformanceFrequency(&frequency);
+   EngQueryPerformanceCounter(&counter);
+   return (counter/frequency)*INT64_C(1000000) + ((counter%frequency)*INT64_C(1000000))/frequency;
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+   /* Whole seconds and remainder are scaled separately to avoid int64 overflow. */
+   static LARGE_INTEGER frequency;
+   LARGE_INTEGER counter;
+   if(!frequency.QuadPart)
+      QueryPerformanceFrequency(&frequency);
+   QueryPerformanceCounter(&counter);
+   return (counter.QuadPart/frequency.QuadPart)*INT64_C(1000000) + ((counter.QuadPart%frequency.QuadPart)*INT64_C(1000000))/frequency.QuadPart;
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+   /* Updated every 10 milliseconds, measured in units of 100 nanoseconds.
+    * http://msdn.microsoft.com/en-us/library/ms801642.aspx */
+   LARGE_INTEGER counter;
+   KeQuerySystemTime(&counter);
+   return counter.QuadPart/10;
+
+#endif
+}
+
+
+#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
+
+/* Spin on the performance counter until usecs microseconds have gone by. */
+void
+os_time_sleep(int64_t usecs)
+{
+   static LONGLONG frequency;
+   LONGLONG start, now, deadline;
+
+   EngQueryPerformanceCounter(&start);
+   if(!frequency)
+      EngQueryPerformanceFrequency(&frequency);
+   /* tick count is rounded up so the wait is never shorter than requested */
+   deadline = start + (usecs * frequency + 999999LL)/1000000LL;
+
+   /* os_time_timeout() also copes with a deadline that wrapped around */
+   do {
+      EngQueryPerformanceCounter(&now);
+   } while(!os_time_timeout(start, deadline, now));
+}
+
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+
+void
+os_time_sleep(int64_t usecs)
+{
+   Sleep((usecs + 999) / 1000); /* Sleep() takes milliseconds: round the microsecond count up */
+}
+
+#endif
+
+
+#endif /* !PIPE_OS_EMBEDDED */
diff --git a/src/gallium/auxiliary/os/os_time.h b/src/gallium/auxiliary/os/os_time.h
new file mode 100644
index 00000000000..5b55c1b3747
--- /dev/null
+++ b/src/gallium/auxiliary/os/os_time.h
@@ -0,0 +1,92 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * OS independent time-manipulation functions.
+ * 
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#ifndef _OS_TIME_H_
+#define _OS_TIME_H_
+
+
+#include "pipe/p_config.h"
+
+#if defined(PIPE_OS_UNIX)
+#  include <unistd.h> /* usleep */
+#endif
+
+#include "pipe/p_compiler.h"
+
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * Get the current time in microseconds from an unknown base.
+ */
+int64_t
+os_time_get(void);
+
+
+/*
+ * Sleep.
+ */
+#if defined(PIPE_OS_UNIX)
+#define os_time_sleep(_usecs) usleep(_usecs)
+#else
+void
+os_time_sleep(int64_t usecs);
+#endif
+
+
+/*
+ * Helper function for detecting time outs, taking in account overflow.
+ *
+ * Returns true if the current time has elapsed beyond the specified interval.
+ */
+static INLINE boolean
+os_time_timeout(int64_t start, int64_t end, int64_t curr)
+{
+   /* The wait is pending while curr lies inside [start, end); when
+    * end < start the interval has wrapped around the int64_t range. */
+   const boolean wrapped = end < start;
+   const boolean pending = wrapped ? (start <= curr || curr < end)
+                                   : (start <= curr && curr < end);
+   return !pending;
+}
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _OS_TIME_H_ */
diff --git a/src/gallium/auxiliary/pipebuffer/Makefile b/src/gallium/auxiliary/pipebuffer/Makefile
index 1c00ba8d986..21d25d24748 100644
--- a/src/gallium/auxiliary/pipebuffer/Makefile
+++ b/src/gallium/auxiliary/pipebuffer/Makefile
@@ -9,7 +9,6 @@ C_SOURCES = \
 	pb_bufmgr_alt.c \
 	pb_bufmgr_cache.c \
 	pb_bufmgr_debug.c \
-	pb_bufmgr_fenced.c \
 	pb_bufmgr_mm.c \
 	pb_bufmgr_ondemand.c \
 	pb_bufmgr_pool.c \
diff --git a/src/gallium/auxiliary/pipebuffer/SConscript b/src/gallium/auxiliary/pipebuffer/SConscript
index 8e9f06abe45..a074a554717 100644
--- a/src/gallium/auxiliary/pipebuffer/SConscript
+++ b/src/gallium/auxiliary/pipebuffer/SConscript
@@ -8,7 +8,6 @@ pipebuffer = env.ConvenienceLibrary(
 		'pb_bufmgr_alt.c',
 		'pb_bufmgr_cache.c',
 		'pb_bufmgr_debug.c',
-		'pb_bufmgr_fenced.c',
 		'pb_bufmgr_mm.c',
 		'pb_bufmgr_ondemand.c',
 		'pb_bufmgr_pool.c',
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 4ef372233f0..34b1b77df40 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -46,6 +46,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
+#include "util/u_inlines.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 
@@ -237,8 +238,9 @@ pb_reference(struct pb_buffer **dst,
 {
    struct pb_buffer *old = *dst;
 
-   if (pipe_reference((struct pipe_reference**)dst, &src->base.reference))
+   if (pipe_reference(&(*dst)->base.reference, &src->base.reference))
       pb_destroy( old );
+   *dst = src;
 }
 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 2f973684f67..d97f749b6ed 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2007-2009 VMware, Inc.
+ * Copyright 2007-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -28,9 +28,9 @@
 /**
  * \file
  * Implementation of fenced buffers.
- * 
- * \author Jose Fonseca <jrfonseca-at-tungstengraphics-dot-com>
- * \author Thomas Hellström <thomas-at-tungstengraphics-dot-com>
+ *
+ * \author Jose Fonseca <jfonseca-at-vmware-dot-com>
+ * \author Thomas Hellström <thellstrom-at-vmware-dot-com>
  */
 
 
@@ -44,12 +44,13 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_defines.h"
 #include "util/u_debug.h"
-#include "pipe/p_thread.h"
+#include "os/os_thread.h"
 #include "util/u_memory.h"
 #include "util/u_double_list.h"
 
 #include "pb_buffer.h"
 #include "pb_buffer_fenced.h"
+#include "pb_bufmgr.h"
 
 
 
@@ -59,48 +60,79 @@
 #define SUPER(__derived) (&(__derived)->base)
 
 
-struct fenced_buffer_list
+struct fenced_manager
 {
-   pipe_mutex mutex;
-   
+   struct pb_manager base;
+   struct pb_manager *provider;
    struct pb_fence_ops *ops;
-   
-   pb_size numDelayed;
-   struct list_head delayed;
-   
-#ifdef DEBUG
-   pb_size numUnfenced;
+
+   /**
+    * Maximum buffer size that can be safely allocated.
+    */
+   pb_size max_buffer_size;
+
+   /**
+    * Maximum cpu memory we can allocate before we start waiting for the
+    * GPU to idle.
+    */
+   pb_size max_cpu_total_size;
+
+   /**
+    * Following members are mutable and protected by this mutex.
+    */
+   pipe_mutex mutex;
+
+   /**
+    * Fenced buffer list.
+    *
+    * All fenced buffers are placed in this list, ordered from the oldest
+    * fence to the newest fence.
+    */
+   struct list_head fenced;
+   pb_size num_fenced;
+
    struct list_head unfenced;
-#endif
+   pb_size num_unfenced;
+
+   /**
+    * How much temporary CPU memory is being used to hold unvalidated buffers.
+    */
+   pb_size cpu_total_size;
 };
 
 
 /**
+ * Fenced buffer.
+ *
  * Wrapper around a pipe buffer which adds fencing and reference counting.
  */
 struct fenced_buffer
 {
-   /* 
+   /*
     * Immutable members.
     */
 
    struct pb_buffer base;
-   struct pb_buffer *buffer;
-   struct fenced_buffer_list *list;
+   struct fenced_manager *mgr;
 
-   /**
-    * Protected by fenced_buffer_list::mutex
+   /*
+    * Following members are mutable and protected by fenced_manager::mutex.
     */
+
    struct list_head head;
 
    /**
-    * Following members are mutable and protected by this mutex.
-    * 
-    * You may lock this mutex alone, or lock it with fenced_buffer_list::mutex
-    * held, but in order to prevent deadlocks you must never lock 
-    * fenced_buffer_list::mutex with this mutex held.
+    * Buffer with storage.
     */
-   pipe_mutex mutex;
+   struct pb_buffer *buffer;
+   pb_size size;
+   struct pb_desc desc;
+
+   /**
+    * Temporary CPU storage data. Used when there isn't enough GPU memory to
+    * store the buffer.
+    */
+   void *data;
 
    /**
     * A bitmask of PIPE_BUFFER_USAGE_CPU/GPU_READ/WRITE describing the current
@@ -109,12 +141,22 @@ struct fenced_buffer
    unsigned flags;
 
    unsigned mapcount;
+
    struct pb_validate *vl;
    unsigned validation_flags;
+
    struct pipe_fence_handle *fence;
 };
 
 
+static INLINE struct fenced_manager *
+fenced_manager(struct pb_manager *mgr)
+{
+   assert(mgr);
+   return (struct fenced_manager *)mgr; /* downcast: struct fenced_manager begins with its pb_manager base */
+}
+
+
 static INLINE struct fenced_buffer *
 fenced_buffer(struct pb_buffer *buf)
 {
@@ -123,81 +165,172 @@ fenced_buffer(struct pb_buffer *buf)
 }
 
 
+static void
+fenced_buffer_destroy_cpu_storage_locked(struct fenced_buffer *fenced_buf);
+
+static enum pipe_error
+fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
+                                        struct fenced_buffer *fenced_buf);
+
+static void
+fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf);
+
+static enum pipe_error
+fenced_buffer_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
+                                        struct fenced_buffer *fenced_buf,
+                                        boolean wait);
+
+static enum pipe_error
+fenced_buffer_copy_storage_to_gpu_locked(struct fenced_buffer *fenced_buf);
+
+static enum pipe_error
+fenced_buffer_copy_storage_to_cpu_locked(struct fenced_buffer *fenced_buf);
+
+
+/**
+ * Dump the unfenced list, then the fenced list, via debug_printf (DEBUG
+ * builds only; otherwise a no-op).
+ * Useful to understand failures to allocate buffers.
+ */
+static void
+fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
+{
+#ifdef DEBUG
+   struct pb_fence_ops *ops = fenced_mgr->ops;
+   struct list_head *curr, *next;
+   struct fenced_buffer *fenced_buf;
+
+   debug_printf("%10s %7s %8s %7s %10s %s\n",
+                "buffer", "size", "refcount", "storage", "fence", "signalled");
+
+   curr = fenced_mgr->unfenced.next;
+   next = curr->next;
+   while(curr != &fenced_mgr->unfenced) {
+      fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
+      assert(!fenced_buf->fence);
+      debug_printf("%10p %7u %8u %7s\n",
+                   (void *) fenced_buf,
+                   fenced_buf->base.base.size,
+                   p_atomic_read(&fenced_buf->base.base.reference.count),
+                   fenced_buf->buffer ? "gpu" : (fenced_buf->data ? "cpu" : "none"));
+      curr = next;
+      next = curr->next;
+   }
+
+   curr = fenced_mgr->fenced.next;
+   next = curr->next;
+   while(curr != &fenced_mgr->fenced) {
+      int signaled;
+      fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
+      assert(fenced_buf->buffer);
+      signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
+      debug_printf("%10p %7u %8u %7s %10p %s\n",
+                   (void *) fenced_buf,
+                   fenced_buf->base.base.size,
+                   p_atomic_read(&fenced_buf->base.base.reference.count),
+                   "gpu",
+                   (void *) fenced_buf->fence,
+                   signaled == 0 ? "y" : "n");
+      curr = next;
+      next = curr->next;
+   }
+#else
+   (void)fenced_mgr;
+#endif
+}
+
+
+static INLINE void
+fenced_buffer_destroy_locked(struct fenced_manager *fenced_mgr,
+                             struct fenced_buffer *fenced_buf)
+{
+   assert(!pipe_is_referenced(&fenced_buf->base.base.reference));
+   /* Expects an unreferenced, unfenced buffer: unlink it and drop it from the unfenced count. */
+   assert(!fenced_buf->fence);
+   assert(fenced_buf->head.prev);
+   assert(fenced_buf->head.next);
+   LIST_DEL(&fenced_buf->head);
+   assert(fenced_mgr->num_unfenced);
+   --fenced_mgr->num_unfenced;
+
+   fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
+   fenced_buffer_destroy_cpu_storage_locked(fenced_buf);
+
+   FREE(fenced_buf);
+}
+
+
 /**
  * Add the buffer to the fenced list.
- * 
- * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this 
- * order before calling this function.
- * 
+ *
  * Reference count should be incremented before calling this function.
  */
 static INLINE void
-fenced_buffer_add_locked(struct fenced_buffer_list *fenced_list, 
+fenced_buffer_add_locked(struct fenced_manager *fenced_mgr,
                          struct fenced_buffer *fenced_buf)
 {
    assert(pipe_is_referenced(&fenced_buf->base.base.reference));
    assert(fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
    assert(fenced_buf->fence);
 
-   /* TODO: Move the reference count increment here */
-   
-#ifdef DEBUG
+   p_atomic_inc(&fenced_buf->base.base.reference.count);
+
    LIST_DEL(&fenced_buf->head);
-   assert(fenced_list->numUnfenced);
-   --fenced_list->numUnfenced;
-#endif
-   LIST_ADDTAIL(&fenced_buf->head, &fenced_list->delayed);
-   ++fenced_list->numDelayed;
+   assert(fenced_mgr->num_unfenced);
+   --fenced_mgr->num_unfenced;
+   LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->fenced);
+   ++fenced_mgr->num_fenced;
 }
 
 
 /**
- * Remove the buffer from the fenced list.
- * 
- * fenced_buffer_list::mutex and fenced_buffer::mutex must be held, in this 
- * order before calling this function.
- * 
- * Reference count should be decremented after calling this function.
+ * Remove the buffer from the fenced list, and potentially destroy the buffer
+ * if the reference count reaches zero.
+ *
+ * Returns TRUE if the buffer was destroyed.
  */
-static INLINE void
-fenced_buffer_remove_locked(struct fenced_buffer_list *fenced_list,
+static INLINE boolean
+fenced_buffer_remove_locked(struct fenced_manager *fenced_mgr,
                             struct fenced_buffer *fenced_buf)
 {
-   struct pb_fence_ops *ops = fenced_list->ops;
+   struct pb_fence_ops *ops = fenced_mgr->ops;
 
    assert(fenced_buf->fence);
-   assert(fenced_buf->list == fenced_list);
-   
+   assert(fenced_buf->mgr == fenced_mgr);
+
    ops->fence_reference(ops, &fenced_buf->fence, NULL);
    fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE;
-   
+
    assert(fenced_buf->head.prev);
    assert(fenced_buf->head.next);
-   
+
    LIST_DEL(&fenced_buf->head);
-   assert(fenced_list->numDelayed);
-   --fenced_list->numDelayed;
-   
-#ifdef DEBUG
-   LIST_ADDTAIL(&fenced_buf->head, &fenced_list->unfenced);
-   ++fenced_list->numUnfenced;
-#endif
-   
-   /* TODO: Move the reference count decrement and destruction here */
+   assert(fenced_mgr->num_fenced);
+   --fenced_mgr->num_fenced;
+
+   LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->unfenced);
+   ++fenced_mgr->num_unfenced;
+
+   if (p_atomic_dec_zero(&fenced_buf->base.base.reference.count)) {
+      fenced_buffer_destroy_locked(fenced_mgr, fenced_buf);
+      return TRUE;
+   }
+
+   return FALSE;
 }
 
 
 /**
  * Wait for the fence to expire, and remove it from the fenced list.
- * 
- * fenced_buffer::mutex must be held. fenced_buffer_list::mutex must not be 
- * held -- it will
+ *
+ * This function will release and re-acquire the mutex, so any copy of mutable
+ * state must be discarded after calling it.
  */
 static INLINE enum pipe_error
-fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
-                              struct fenced_buffer *fenced_buf)
+fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
+                            struct fenced_buffer *fenced_buf)
 {
-   struct pb_fence_ops *ops = fenced_list->ops;
+   struct pb_fence_ops *ops = fenced_mgr->ops;
    enum pipe_error ret = PIPE_ERROR;
 
 #if 0
@@ -207,19 +340,42 @@ fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
    assert(pipe_is_referenced(&fenced_buf->base.base.reference));
    assert(fenced_buf->fence);
 
-   /* Acquire the global lock */
-   pipe_mutex_unlock(fenced_buf->mutex);
-   pipe_mutex_lock(fenced_list->mutex);
-   pipe_mutex_lock(fenced_buf->mutex);
-
    if(fenced_buf->fence) {
-      if(ops->fence_finish(ops, fenced_buf->fence, 0) == 0) {
-         /* Remove from the fenced list */
-         /* TODO: remove consequents */
-         fenced_buffer_remove_locked(fenced_list, fenced_buf);
-         
-         p_atomic_dec(&fenced_buf->base.base.reference.count);
-         assert(pipe_is_referenced(&fenced_buf->base.base.reference));
+      struct pipe_fence_handle *fence = NULL;
+      int finished;
+      boolean proceed;
+
+      ops->fence_reference(ops, &fence, fenced_buf->fence);
+
+      pipe_mutex_unlock(fenced_mgr->mutex);
+
+      /* Wait on our own reference: fenced_buf->fence may change once unlocked. */
+      finished = ops->fence_finish(ops, fence, 0);
+      pipe_mutex_lock(fenced_mgr->mutex);
+
+      assert(pipe_is_referenced(&fenced_buf->base.base.reference));
+
+      /*
+       * Only proceed if the fence object didn't change in the meanwhile.
+       * Otherwise assume the work has been already carried out by another
+       * thread that re-acquired the lock before us.
+       */
+      proceed = fence == fenced_buf->fence ? TRUE : FALSE;
+
+      ops->fence_reference(ops, &fence, NULL);
+
+      if(proceed && finished == 0) {
+         /*
+          * Remove from the fenced list
+          */
+
+         boolean destroyed;
+
+         destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+
+         /* TODO: remove consequents buffers with the same fence? */
+
+         assert(!destroyed);
 
          fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE;
 
@@ -227,129 +383,350 @@ fenced_buffer_finish_locked(struct fenced_buffer_list *fenced_list,
       }
    }
 
-   pipe_mutex_unlock(fenced_list->mutex);
-
    return ret;
 }
 
 
 /**
- * Free as many fenced buffers from the list head as possible. 
+ * Remove as many fenced buffers from the fenced list as possible.
+ *
+ * Returns TRUE if at least one buffer was removed.
  */
-static void
-fenced_buffer_list_check_free_locked(struct fenced_buffer_list *fenced_list, 
-                               int wait)
+static boolean
+fenced_manager_check_signalled_locked(struct fenced_manager *fenced_mgr,
+                                      boolean wait)
 {
-   struct pb_fence_ops *ops = fenced_list->ops;
+   struct pb_fence_ops *ops = fenced_mgr->ops;
    struct list_head *curr, *next;
    struct fenced_buffer *fenced_buf;
    struct pipe_fence_handle *prev_fence = NULL;
+   boolean ret = FALSE;
 
-   curr = fenced_list->delayed.next;
+   curr = fenced_mgr->fenced.next;
    next = curr->next;
-   while(curr != &fenced_list->delayed) {
+   while(curr != &fenced_mgr->fenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
 
-      pipe_mutex_lock(fenced_buf->mutex);
-
       if(fenced_buf->fence != prev_fence) {
 	 int signaled;
-	 if (wait)
+
+	 if (wait) {
 	    signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
-	 else
+
+	    /*
+	     * Don't return just now. Instead preemptively check if the
+	     * following buffers' fences already expired, without further waits.
+	     */
+	    wait = FALSE;
+	 }
+	 else {
 	    signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
+	 }
+
 	 if (signaled != 0) {
-            pipe_mutex_unlock(fenced_buf->mutex);
-	    break;
+	    return ret;
          }
+
 	 prev_fence = fenced_buf->fence;
       }
       else {
+         /* This buffer's fence object is identical to the previous buffer's
+          * fence object, so no need to check the fence again.
+          */
 	 assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
       }
 
-      fenced_buffer_remove_locked(fenced_list, fenced_buf);
-      pipe_mutex_unlock(fenced_buf->mutex);
+      fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+
+      ret = TRUE;
+
+      curr = next;
+      next = curr->next;
+   }
+
+   return ret;
+}
+
+
+/**
+ * Try to free some GPU memory by backing it up into CPU memory.
+ *
+ * Returns TRUE if at least one buffer was freed.
+ */
+static boolean
+fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr)
+{
+   struct list_head *curr, *next;
+   struct fenced_buffer *fenced_buf;
+
+   curr = fenced_mgr->unfenced.next;
+   next = curr->next;
+   while(curr != &fenced_mgr->unfenced) {
+      fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
 
-      pb_reference((struct pb_buffer **)&fenced_buf, NULL);
+      /*
+       * We can only move storage if the buffer is not mapped and not
+       * validated.
+       */
+      if(fenced_buf->buffer &&
+         !fenced_buf->mapcount &&
+         !fenced_buf->vl) {
+         enum pipe_error ret;
+
+         ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
+         if(ret == PIPE_OK) {
+            ret = fenced_buffer_copy_storage_to_cpu_locked(fenced_buf);
+            if(ret == PIPE_OK) {
+               fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
+               return TRUE;
+            }
+            fenced_buffer_destroy_cpu_storage_locked(fenced_buf);
+         }
+      }
 
-      curr = next; 
+      curr = next;
       next = curr->next;
    }
+
+   return FALSE;
+}
+
+
+/**
+ * Free this buffer's CPU storage, if any, and deduct it from cpu_total_size.
+ */
+static void
+fenced_buffer_destroy_cpu_storage_locked(struct fenced_buffer *fenced_buf)
+{
+   if(fenced_buf->data) {
+      align_free(fenced_buf->data);
+      fenced_buf->data = NULL;
+      assert(fenced_buf->mgr->cpu_total_size >= fenced_buf->size);
+      fenced_buf->mgr->cpu_total_size -= fenced_buf->size;
+   }
+}
+
+
+/**
+ * Create CPU storage for this buffer, within the manager's CPU memory budget.
+ */
+static enum pipe_error
+fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
+                                        struct fenced_buffer *fenced_buf)
+{
+   assert(!fenced_buf->data);
+   if(fenced_buf->data)
+      return PIPE_OK; /* already has CPU storage */
+   /* Refuse to exceed the manager-wide cap on CPU storage. */
+   if (fenced_mgr->cpu_total_size + fenced_buf->size > fenced_mgr->max_cpu_total_size)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   fenced_buf->data = align_malloc(fenced_buf->size, fenced_buf->desc.alignment);
+   if(!fenced_buf->data)
+      return PIPE_ERROR_OUT_OF_MEMORY;
+
+   fenced_mgr->cpu_total_size += fenced_buf->size;
+
+   return PIPE_OK;
+}
+
+
+/**
+ * Destroy the GPU storage, if any, by releasing our reference to it.
+ */
+static void
+fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf)
+{
+   if(fenced_buf->buffer) {
+      pb_reference(&fenced_buf->buffer, NULL);
+   }
+}
+
+
+/**
+ * Try to create GPU storage for this buffer.
+ *
+ * Shorthand around pb_manager::create_buffer, used by
+ * fenced_buffer_create_gpu_storage_locked(). Returns TRUE on success.
+ */
+static INLINE boolean
+fenced_buffer_try_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
+                                            struct fenced_buffer *fenced_buf)
+{
+   struct pb_manager *provider = fenced_mgr->provider;
+
+   assert(!fenced_buf->buffer);
+
+   fenced_buf->buffer = provider->create_buffer(fenced_mgr->provider,
+                                                fenced_buf->size,
+                                                &fenced_buf->desc);
+   return fenced_buf->buffer ? TRUE : FALSE;
+}
+
+
+/**
+ * Create GPU storage for this buffer; blocks on fences only when wait is TRUE.
+ */
+static enum pipe_error
+fenced_buffer_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
+                                        struct fenced_buffer *fenced_buf,
+                                        boolean wait)
+{
+   assert(!fenced_buf->buffer);
+
+   /*
+    * Check for signaled buffers before trying to allocate.
+    */
+   fenced_manager_check_signalled_locked(fenced_mgr, FALSE);
+
+   fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
+
+   /*
+    * Keep trying while there is some sort of progress:
+    * - fences are expiring,
+    * - or buffers are being swapped out from GPU memory into CPU memory.
+    */
+   while(!fenced_buf->buffer &&
+         (fenced_manager_check_signalled_locked(fenced_mgr, FALSE) ||
+          fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
+      fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
+   }
+
+   if(!fenced_buf->buffer && wait) {
+      /*
+       * Same as before, but this time around, wait to free buffers if
+       * necessary.
+       */
+      while(!fenced_buf->buffer &&
+            (fenced_manager_check_signalled_locked(fenced_mgr, TRUE) ||
+             fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
+         fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
+      }
+   }
+
+   if(!fenced_buf->buffer) {
+      if(0) /* disabled debugging aid */
+         fenced_manager_dump_locked(fenced_mgr);
+
+      /* give up: no GPU storage could be obtained */
+      return PIPE_ERROR_OUT_OF_MEMORY;
+   }
+
+   return PIPE_OK;
+}
+
+
+static enum pipe_error
+fenced_buffer_copy_storage_to_gpu_locked(struct fenced_buffer *fenced_buf)
+{
+   uint8_t *map;
+
+   assert(fenced_buf->data);
+   assert(fenced_buf->buffer);
+   /* Map the GPU buffer for writing and upload the CPU copy into it. */
+   map = pb_map(fenced_buf->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+   if(!map)
+      return PIPE_ERROR;
+
+   memcpy(map, fenced_buf->data, fenced_buf->size);
+
+   pb_unmap(fenced_buf->buffer);
+
+   return PIPE_OK;
+}
+
+
+static enum pipe_error
+fenced_buffer_copy_storage_to_cpu_locked(struct fenced_buffer *fenced_buf)
+{
+   const uint8_t *map;
+
+   assert(fenced_buf->data);
+   assert(fenced_buf->buffer);
+   /* Map the GPU buffer for reading and download it into the CPU copy. */
+   map = pb_map(fenced_buf->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+   if(!map)
+      return PIPE_ERROR;
+
+   memcpy(fenced_buf->data, map, fenced_buf->size);
+
+   pb_unmap(fenced_buf->buffer);
+
+   return PIPE_OK;
 }
 
 
 static void
 fenced_buffer_destroy(struct pb_buffer *buf)
 {
-   struct fenced_buffer *fenced_buf = fenced_buffer(buf);   
-   struct fenced_buffer_list *fenced_list = fenced_buf->list;
+   struct fenced_buffer *fenced_buf = fenced_buffer(buf);
+   struct fenced_manager *fenced_mgr = fenced_buf->mgr;
 
    assert(!pipe_is_referenced(&fenced_buf->base.base.reference));
-   assert(!fenced_buf->fence);
 
-#ifdef DEBUG
-   pipe_mutex_lock(fenced_list->mutex);
-   assert(fenced_buf->head.prev);
-   assert(fenced_buf->head.next);
-   LIST_DEL(&fenced_buf->head);
-   assert(fenced_list->numUnfenced);
-   --fenced_list->numUnfenced;
-   pipe_mutex_unlock(fenced_list->mutex);
-#else
-   (void)fenced_list;
-#endif
+   pipe_mutex_lock(fenced_mgr->mutex);
 
-   pb_reference(&fenced_buf->buffer, NULL);
+   fenced_buffer_destroy_locked(fenced_mgr, fenced_buf);
 
-   pipe_mutex_destroy(fenced_buf->mutex);
-   FREE(fenced_buf);
+   pipe_mutex_unlock(fenced_mgr->mutex);
 }
 
 
 static void *
-fenced_buffer_map(struct pb_buffer *buf, 
+fenced_buffer_map(struct pb_buffer *buf,
                   unsigned flags)
 {
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);
-   struct fenced_buffer_list *fenced_list = fenced_buf->list;
-   struct pb_fence_ops *ops = fenced_list->ops;
+   struct fenced_manager *fenced_mgr = fenced_buf->mgr;
+   struct pb_fence_ops *ops = fenced_mgr->ops;
    void *map = NULL;
 
-   pipe_mutex_lock(fenced_buf->mutex);
+   pipe_mutex_lock(fenced_mgr->mutex);
 
    assert(!(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE));
-   
-   /* Serialize writes */
-   if((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_WRITE) ||
-      ((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ) && (flags & PIPE_BUFFER_USAGE_CPU_WRITE))) {
+
+   /*
+    * Serialize writes.
+    */
+   while((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_WRITE) ||
+         ((fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ) &&
+          (flags & PIPE_BUFFER_USAGE_CPU_WRITE))) {
+
+      /* 
+       * Don't wait for the GPU to finish accessing it, if blocking is forbidden.
+       */
       if((flags & PIPE_BUFFER_USAGE_DONTBLOCK) &&
-          ops->fence_signalled(ops, fenced_buf->fence, 0) == 0) {
-         /* Don't wait for the GPU to finish writing */
-         goto finish;
+          ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
+         goto done;
+      }
+
+      if (flags & PIPE_BUFFER_USAGE_UNSYNCHRONIZED) {
+         break;
       }
 
-      /* Wait for the GPU to finish writing */
-      fenced_buffer_finish_locked(fenced_list, fenced_buf);
+      /*
+       * Wait for the GPU to finish accessing. This will release and re-acquire
+       * the mutex, so all copies of mutable state must be discarded.
+       */
+      fenced_buffer_finish_locked(fenced_mgr, fenced_buf);
    }
 
-#if 0
-   /* Check for CPU write access (read is OK) */
-   if(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) {
-      /* this is legal -- just for debugging */
-      debug_warning("concurrent CPU writes");
+   if(fenced_buf->buffer) {
+      map = pb_map(fenced_buf->buffer, flags);
    }
-#endif
-   
-   map = pb_map(fenced_buf->buffer, flags);
+   else {
+      assert(fenced_buf->data);
+      map = fenced_buf->data;
+   }
+
    if(map) {
       ++fenced_buf->mapcount;
       fenced_buf->flags |= flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE;
    }
 
-finish:
-   pipe_mutex_unlock(fenced_buf->mutex);
-   
+done:
+   pipe_mutex_unlock(fenced_mgr->mutex);
+
    return map;
 }
 
@@ -358,18 +735,20 @@ static void
 fenced_buffer_unmap(struct pb_buffer *buf)
 {
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);
-   
-   pipe_mutex_lock(fenced_buf->mutex);
-   
+   struct fenced_manager *fenced_mgr = fenced_buf->mgr;
+
+   pipe_mutex_lock(fenced_mgr->mutex);
+
    assert(fenced_buf->mapcount);
    if(fenced_buf->mapcount) {
-      pb_unmap(fenced_buf->buffer);
+      if (fenced_buf->buffer)
+         pb_unmap(fenced_buf->buffer);
       --fenced_buf->mapcount;
       if(!fenced_buf->mapcount)
 	 fenced_buf->flags &= ~PIPE_BUFFER_USAGE_CPU_READ_WRITE;
    }
-   
-   pipe_mutex_unlock(fenced_buf->mutex);
+
+   pipe_mutex_unlock(fenced_mgr->mutex);
 }
 
 
@@ -379,56 +758,70 @@ fenced_buffer_validate(struct pb_buffer *buf,
                        unsigned flags)
 {
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);
+   struct fenced_manager *fenced_mgr = fenced_buf->mgr;
    enum pipe_error ret;
-   
-   pipe_mutex_lock(fenced_buf->mutex);
+
+   pipe_mutex_lock(fenced_mgr->mutex);
 
    if(!vl) {
       /* invalidate */
       fenced_buf->vl = NULL;
       fenced_buf->validation_flags = 0;
       ret = PIPE_OK;
-      goto finish;
+      goto done;
    }
-   
+
    assert(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE);
    assert(!(flags & ~PIPE_BUFFER_USAGE_GPU_READ_WRITE));
    flags &= PIPE_BUFFER_USAGE_GPU_READ_WRITE;
 
-   /* Buffer cannot be validated in two different lists */ 
+   /* Buffer cannot be validated in two different lists */
    if(fenced_buf->vl && fenced_buf->vl != vl) {
       ret = PIPE_ERROR_RETRY;
-      goto finish;
-   }
-   
-#if 0
-   /* Do not validate if buffer is still mapped */
-   if(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) {
-      /* TODO: wait for the thread that mapped the buffer to unmap it */
-      ret = PIPE_ERROR_RETRY;
-      goto finish;
+      goto done;
    }
-   /* Final sanity checking */
-   assert(!(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE));
-   assert(!fenced_buf->mapcount);
-#endif
 
    if(fenced_buf->vl == vl &&
       (fenced_buf->validation_flags & flags) == flags) {
       /* Nothing to do -- buffer already validated */
       ret = PIPE_OK;
-      goto finish;
+      goto done;
    }
-   
+
+   /*
+    * Create and update GPU storage.
+    */
+   if(!fenced_buf->buffer) {
+      assert(!fenced_buf->mapcount);
+
+      ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
+      if(ret != PIPE_OK) {
+         goto done;
+      }
+
+      ret = fenced_buffer_copy_storage_to_gpu_locked(fenced_buf);
+      if(ret != PIPE_OK) {
+         fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
+         goto done;
+      }
+
+      if(fenced_buf->mapcount) {
+         debug_printf("warning: validating a buffer while it is still mapped\n");
+      }
+      else {
+         fenced_buffer_destroy_cpu_storage_locked(fenced_buf);
+      }
+   }
+
    ret = pb_validate(fenced_buf->buffer, vl, flags);
    if (ret != PIPE_OK)
-      goto finish;
-   
+      goto done;
+
    fenced_buf->vl = vl;
    fenced_buf->validation_flags |= flags;
-   
-finish:
-   pipe_mutex_unlock(fenced_buf->mutex);
+
+done:
+   pipe_mutex_unlock(fenced_mgr->mutex);
 
    return ret;
 }
@@ -438,43 +831,37 @@ static void
 fenced_buffer_fence(struct pb_buffer *buf,
                     struct pipe_fence_handle *fence)
 {
-   struct fenced_buffer *fenced_buf;
-   struct fenced_buffer_list *fenced_list;
-   struct pb_fence_ops *ops;
-
-   fenced_buf = fenced_buffer(buf);
-   fenced_list = fenced_buf->list;
-   ops = fenced_list->ops;
+   struct fenced_buffer *fenced_buf = fenced_buffer(buf);
+   struct fenced_manager *fenced_mgr = fenced_buf->mgr;
+   struct pb_fence_ops *ops = fenced_mgr->ops;
 
-   pipe_mutex_lock(fenced_list->mutex);
-   pipe_mutex_lock(fenced_buf->mutex);
+   pipe_mutex_lock(fenced_mgr->mutex);
 
    assert(pipe_is_referenced(&fenced_buf->base.base.reference));
+   assert(fenced_buf->buffer);
 
    if(fence != fenced_buf->fence) {
       assert(fenced_buf->vl);
       assert(fenced_buf->validation_flags);
-      
+
       if (fenced_buf->fence) {
-         fenced_buffer_remove_locked(fenced_list, fenced_buf);
-         p_atomic_dec(&fenced_buf->base.base.reference.count);
-         assert(pipe_is_referenced(&fenced_buf->base.base.reference));
+         boolean destroyed;
+         destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+         assert(!destroyed);
       }
       if (fence) {
          ops->fence_reference(ops, &fenced_buf->fence, fence);
          fenced_buf->flags |= fenced_buf->validation_flags;
-         p_atomic_inc(&fenced_buf->base.base.reference.count);
-         fenced_buffer_add_locked(fenced_list, fenced_buf);
+         fenced_buffer_add_locked(fenced_mgr, fenced_buf);
       }
 
       pb_fence(fenced_buf->buffer, fence);
-   
+
       fenced_buf->vl = NULL;
       fenced_buf->validation_flags = 0;
    }
 
-   pipe_mutex_unlock(fenced_buf->mutex);
-   pipe_mutex_unlock(fenced_list->mutex);
+   pipe_mutex_unlock(fenced_mgr->mutex);
 }
 
 
@@ -484,12 +871,29 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,
                               pb_size *offset)
 {
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);
-   /* NOTE: accesses immutable members only -- mutex not necessary */
-   pb_get_base_buffer(fenced_buf->buffer, base_buf, offset);
+   struct fenced_manager *fenced_mgr = fenced_buf->mgr;
+
+   pipe_mutex_lock(fenced_mgr->mutex);
+
+   /*
+    * This should only be called when the buffer is validated. Typically
+    * when processing relocations.
+    */
+   assert(fenced_buf->vl);
+   assert(fenced_buf->buffer);
+
+   if(fenced_buf->buffer)
+      pb_get_base_buffer(fenced_buf->buffer, base_buf, offset);
+   else {
+      *base_buf = buf;
+      *offset = 0;
+   }
+
+   pipe_mutex_unlock(fenced_mgr->mutex);
 }
 
 
-static const struct pb_vtbl 
+static const struct pb_vtbl
 fenced_buffer_vtbl = {
       fenced_buffer_destroy,
       fenced_buffer_map,
@@ -500,154 +904,166 @@ fenced_buffer_vtbl = {
 };
 
 
-struct pb_buffer *
-fenced_buffer_create(struct fenced_buffer_list *fenced_list, 
-                     struct pb_buffer *buffer)
+/**
+ * Create a fenced buffer, backed by GPU storage when available or CPU storage otherwise.
+ */
+static struct pb_buffer *
+fenced_bufmgr_create_buffer(struct pb_manager *mgr,
+                            pb_size size,
+                            const struct pb_desc *desc)
 {
-   struct fenced_buffer *buf;
-   
-   if(!buffer)
-      return NULL;
-   
-   buf = CALLOC_STRUCT(fenced_buffer);
-   if(!buf) {
-      pb_reference(&buffer, NULL);
-      return NULL;
+   struct fenced_manager *fenced_mgr = fenced_manager(mgr);
+   struct fenced_buffer *fenced_buf;
+   enum pipe_error ret;
+
+   /*
+    * Don't stall the GPU, waste time evicting buffers, or waste memory
+    * trying to create a buffer that will most likely never fit into the
+    * graphics aperture.
+    */
+   if(size > fenced_mgr->max_buffer_size) {
+      goto no_buffer;
    }
-   
-   pipe_reference_init(&buf->base.base.reference, 1);
-   buf->base.base.alignment = buffer->base.alignment;
-   buf->base.base.usage = buffer->base.usage;
-   buf->base.base.size = buffer->base.size;
-   
-   buf->base.vtbl = &fenced_buffer_vtbl;
-   buf->buffer = buffer;
-   buf->list = fenced_list;
-   
-   pipe_mutex_init(buf->mutex);
 
-#ifdef DEBUG
-   pipe_mutex_lock(fenced_list->mutex);
-   LIST_ADDTAIL(&buf->head, &fenced_list->unfenced);
-   ++fenced_list->numUnfenced;
-   pipe_mutex_unlock(fenced_list->mutex);
-#endif
+   fenced_buf = CALLOC_STRUCT(fenced_buffer);
+   if(!fenced_buf)
+      goto no_buffer;
 
-   return &buf->base;
-}
+   pipe_reference_init(&fenced_buf->base.base.reference, 1);
+   fenced_buf->base.base.alignment = desc->alignment;
+   fenced_buf->base.base.usage = desc->usage;
+   fenced_buf->base.base.size = size;
+   fenced_buf->size = size;
+   fenced_buf->desc = *desc;
 
+   fenced_buf->base.vtbl = &fenced_buffer_vtbl;
+   fenced_buf->mgr = fenced_mgr;
 
-struct fenced_buffer_list *
-fenced_buffer_list_create(struct pb_fence_ops *ops) 
-{
-   struct fenced_buffer_list *fenced_list;
+   pipe_mutex_lock(fenced_mgr->mutex);
 
-   fenced_list = CALLOC_STRUCT(fenced_buffer_list);
-   if (!fenced_list)
-      return NULL;
+   /*
+    * Try to create GPU storage without stalling.
+    */
+   ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, FALSE);
 
-   fenced_list->ops = ops;
+   /*
+    * Attempt to use CPU memory to avoid stalling the GPU.
+    */
+   if(ret != PIPE_OK) {
+      ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
+   }
 
-   LIST_INITHEAD(&fenced_list->delayed);
-   fenced_list->numDelayed = 0;
-   
-#ifdef DEBUG
-   LIST_INITHEAD(&fenced_list->unfenced);
-   fenced_list->numUnfenced = 0;
-#endif
+   /*
+    * Create GPU storage, waiting for some to be available.
+    */
+   if(ret != PIPE_OK) {
+      ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
+   }
 
-   pipe_mutex_init(fenced_list->mutex);
+   /*
+    * Give up.
+    */
+   if(ret != PIPE_OK) {
+      goto no_storage;
+   }
 
-   return fenced_list;
-}
+   assert(fenced_buf->buffer || fenced_buf->data);
 
+   LIST_ADDTAIL(&fenced_buf->head, &fenced_mgr->unfenced);
+   ++fenced_mgr->num_unfenced;
+   pipe_mutex_unlock(fenced_mgr->mutex);
 
-void
-fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, 
-                              int wait)
-{
-   pipe_mutex_lock(fenced_list->mutex);
-   fenced_buffer_list_check_free_locked(fenced_list, wait);
-   pipe_mutex_unlock(fenced_list->mutex);
+   return &fenced_buf->base;
+
+no_storage:
+   pipe_mutex_unlock(fenced_mgr->mutex);
+   FREE(fenced_buf);
+no_buffer:
+   return NULL;
 }
 
 
-#ifdef DEBUG
-void
-fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list)
+static void
+fenced_bufmgr_flush(struct pb_manager *mgr)
 {
-   struct pb_fence_ops *ops = fenced_list->ops;
-   struct list_head *curr, *next;
-   struct fenced_buffer *fenced_buf;
+   struct fenced_manager *fenced_mgr = fenced_manager(mgr);
 
-   pipe_mutex_lock(fenced_list->mutex);
+   pipe_mutex_lock(fenced_mgr->mutex);
+   while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+      ;
+   pipe_mutex_unlock(fenced_mgr->mutex);
 
-   debug_printf("%10s %7s %7s %10s %s\n",
-                "buffer", "size", "refcount", "fence", "signalled");
-   
-   curr = fenced_list->unfenced.next;
-   next = curr->next;
-   while(curr != &fenced_list->unfenced) {
-      fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
-      pipe_mutex_lock(fenced_buf->mutex);
-      assert(!fenced_buf->fence);
-      debug_printf("%10p %7u %7u\n",
-                   (void *) fenced_buf,
-                   fenced_buf->base.base.size,
-                   p_atomic_read(&fenced_buf->base.base.reference.count));
-      pipe_mutex_unlock(fenced_buf->mutex);
-      curr = next; 
-      next = curr->next;
-   }
-   
-   curr = fenced_list->delayed.next;
-   next = curr->next;
-   while(curr != &fenced_list->delayed) {
-      int signaled;
-      fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
-      pipe_mutex_lock(fenced_buf->mutex);
-      signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
-      debug_printf("%10p %7u %7u %10p %s\n",
-                   (void *) fenced_buf,
-                   fenced_buf->base.base.size,
-                   p_atomic_read(&fenced_buf->base.base.reference.count),
-                   (void *) fenced_buf->fence,
-                   signaled == 0 ? "y" : "n");
-      pipe_mutex_unlock(fenced_buf->mutex);
-      curr = next; 
-      next = curr->next;
-   }
-   
-   pipe_mutex_unlock(fenced_list->mutex);
+   assert(fenced_mgr->provider->flush);
+   if(fenced_mgr->provider->flush)
+      fenced_mgr->provider->flush(fenced_mgr->provider);
 }
-#endif
 
 
-void
-fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list)
+static void
+fenced_bufmgr_destroy(struct pb_manager *mgr)
 {
-   pipe_mutex_lock(fenced_list->mutex);
+   struct fenced_manager *fenced_mgr = fenced_manager(mgr);
+
+   pipe_mutex_lock(fenced_mgr->mutex);
 
    /* Wait on outstanding fences */
-   while (fenced_list->numDelayed) {
-      pipe_mutex_unlock(fenced_list->mutex);
+   while (fenced_mgr->num_fenced) {
+      pipe_mutex_unlock(fenced_mgr->mutex);
 #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
       sched_yield();
 #endif
-      pipe_mutex_lock(fenced_list->mutex);
-      fenced_buffer_list_check_free_locked(fenced_list, 1);
+      pipe_mutex_lock(fenced_mgr->mutex);
+      while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+         ;
    }
 
 #ifdef DEBUG
-   /*assert(!fenced_list->numUnfenced);*/
+   /*assert(!fenced_mgr->num_unfenced);*/
 #endif
-      
-   pipe_mutex_unlock(fenced_list->mutex);
-   pipe_mutex_destroy(fenced_list->mutex);
-   
-   fenced_list->ops->destroy(fenced_list->ops);
-   
-   FREE(fenced_list);
+
+   pipe_mutex_unlock(fenced_mgr->mutex);
+   pipe_mutex_destroy(fenced_mgr->mutex);
+
+   if(fenced_mgr->provider)
+      fenced_mgr->provider->destroy(fenced_mgr->provider);
+
+   fenced_mgr->ops->destroy(fenced_mgr->ops);
+
+   FREE(fenced_mgr);
 }
 
 
+struct pb_manager *
+fenced_bufmgr_create(struct pb_manager *provider,
+                     struct pb_fence_ops *ops,
+                     pb_size max_buffer_size,
+                     pb_size max_cpu_total_size)
+{
+   struct fenced_manager *fenced_mgr;
+   /* A provider is required; ops is stored as-is without a NULL check. */
+   if(!provider)
+      return NULL;
+
+   fenced_mgr = CALLOC_STRUCT(fenced_manager);
+   if (!fenced_mgr)
+      return NULL;
+   /* Hook up the pb_manager entry points implemented in this file. */
+   fenced_mgr->base.destroy = fenced_bufmgr_destroy;
+   fenced_mgr->base.create_buffer = fenced_bufmgr_create_buffer;
+   fenced_mgr->base.flush = fenced_bufmgr_flush;
+   /* Limits: largest single buffer, and total CPU-side backing storage. */
+   fenced_mgr->provider = provider;
+   fenced_mgr->ops = ops;
+   fenced_mgr->max_buffer_size = max_buffer_size;
+   fenced_mgr->max_cpu_total_size = max_cpu_total_size;
+
+   LIST_INITHEAD(&fenced_mgr->fenced);
+   fenced_mgr->num_fenced = 0;
+
+   LIST_INITHEAD(&fenced_mgr->unfenced);
+   fenced_mgr->num_unfenced = 0;
+
+   pipe_mutex_init(fenced_mgr->mutex);
+
+   return &fenced_mgr->base;
+}
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
index 034ca1e024a..0372f81d0a1 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
@@ -98,43 +98,6 @@ struct pb_fence_ops
 };
 
 
-/**
- * Create a fenced buffer list.
- * 
- * See also fenced_bufmgr_create for a more convenient way to use this.
- */
-struct fenced_buffer_list *
-fenced_buffer_list_create(struct pb_fence_ops *ops);
-
-
-/**
- * Walk the fenced buffer list to check and free signalled buffers.
- */ 
-void
-fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, 
-                              int wait);
-
-
-#ifdef DEBUG
-void
-fenced_buffer_list_dump(struct fenced_buffer_list *fenced_list);
-#endif
-
-
-void
-fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list);
-
-
-/**
- * Wrap a buffer in a fenced buffer.
- * 
- * NOTE: this will not increase the buffer reference count.
- */
-struct pb_buffer *
-fenced_buffer_create(struct fenced_buffer_list *fenced, 
-                     struct pb_buffer *buffer);
-
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
index 8c8d7130781..06669917ff6 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -175,7 +175,9 @@ struct pb_fence_ops;
  */
 struct pb_manager *
 fenced_bufmgr_create(struct pb_manager *provider,
-                     struct pb_fence_ops *ops);
+                     struct pb_fence_ops *ops,
+                     pb_size max_buffer_size,
+                     pb_size max_cpu_total_size);
 
 
 struct pb_manager *
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 57d1ede45a4..86f9266c95f 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -36,7 +36,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_thread.h"
+#include "os/os_thread.h"
 #include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_time.h"
@@ -65,7 +65,7 @@ struct pb_cache_buffer
    struct pb_cache_manager *mgr;
 
    /** Caching time interval */
-   struct util_time start, end;
+   int64_t start, end;
 
    struct list_head head;
 };
@@ -126,16 +126,16 @@ _pb_cache_buffer_list_check_free(struct pb_cache_manager *mgr)
 {
    struct list_head *curr, *next;
    struct pb_cache_buffer *buf;
-   struct util_time now;
+   int64_t now;
    
-   util_time_get(&now);
+   now = os_time_get();
    
    curr = mgr->delayed.next;
    next = curr->next;
    while(curr != &mgr->delayed) {
       buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
 
-      if(!util_time_timeout(&buf->start, &buf->end, &now))
+      if(!os_time_timeout(buf->start, buf->end, now))
 	 break;
 	 
       _pb_cache_buffer_destroy(buf);
@@ -157,8 +157,8 @@ pb_cache_buffer_destroy(struct pb_buffer *_buf)
    
    _pb_cache_buffer_list_check_free(mgr);
    
-   util_time_get(&buf->start);
-   util_time_add(&buf->start, mgr->usecs, &buf->end);
+   buf->start = os_time_get();
+   buf->end = buf->start + mgr->usecs;
    LIST_ADDTAIL(&buf->head, &mgr->delayed);
    ++mgr->numDelayed;
    pipe_mutex_unlock(mgr->mutex);
@@ -253,7 +253,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    struct pb_cache_buffer *buf;
    struct pb_cache_buffer *curr_buf;
    struct list_head *curr, *next;
-   struct util_time now;
+   int64_t now;
    
    pipe_mutex_lock(mgr->mutex);
 
@@ -262,12 +262,12 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    next = curr->next;
    
    /* search in the expired buffers, freeing them in the process */
-   util_time_get(&now);
+   now = os_time_get();
    while(curr != &mgr->delayed) {
       curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
       if(!buf && pb_cache_is_buffer_compat(curr_buf, size, desc))
 	 buf = curr_buf;
-      else if(util_time_timeout(&curr_buf->start, &curr_buf->end, &now))
+      else if(os_time_timeout(curr_buf->start, curr_buf->end, now))
 	 _pb_cache_buffer_destroy(curr_buf);
       else
          /* This buffer (and all hereafter) are still hot in cache */
@@ -294,7 +294,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
       LIST_DEL(&buf->head);
       pipe_mutex_unlock(mgr->mutex);
       /* Increase refcount */
-      pb_reference((struct pb_buffer**)&buf, &buf->base);
+      pipe_reference_init(&buf->base.base.reference, 1);
       return &buf->base;
    }
    
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 6e3214ca9c9..a5dbded2bce 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -35,7 +35,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_thread.h"
+#include "os/os_thread.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_double_list.h"
@@ -179,7 +179,9 @@ pb_debug_buffer_check(struct pb_debug_buffer *buf)
 {
    uint8_t *map;
    
-   map = pb_map(buf->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+   map = pb_map(buf->buffer,
+                PIPE_BUFFER_USAGE_CPU_READ |
+                PIPE_BUFFER_USAGE_UNSYNCHRONIZED);
    assert(map);
    if(map) {
       boolean underflow, overflow;
@@ -349,7 +351,7 @@ pb_debug_manager_dump(struct pb_debug_manager *mgr)
    while(curr != &mgr->list) {
       buf = LIST_ENTRY(struct pb_debug_buffer, curr, head);
 
-      debug_printf("buffer = %p\n", buf);
+      debug_printf("buffer = %p\n", (void *) buf);
       debug_printf("    .size = 0x%x\n", buf->base.base.size);
       debug_backtrace_dump(buf->create_backtrace, PB_DEBUG_CREATE_BACKTRACE);
       
@@ -371,6 +373,9 @@ pb_debug_manager_create_buffer(struct pb_manager *_mgr,
    struct pb_desc real_desc;
    pb_size real_size;
    
+   assert(size);
+   assert(desc->alignment);
+
    buf = CALLOC_STRUCT(pb_debug_buffer);
    if(!buf)
       return NULL;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
deleted file mode 100644
index 97dd1427fda..00000000000
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * 
- **************************************************************************/
-
-/**
- * \file
- * A buffer manager that wraps buffers in fenced buffers.
- * 
- * \author Jose Fonseca <[email protected]>
- */
-
-
-#include "util/u_debug.h"
-#include "util/u_memory.h"
-
-#include "pb_buffer.h"
-#include "pb_buffer_fenced.h"
-#include "pb_bufmgr.h"
-
-
-struct fenced_pb_manager
-{
-   struct pb_manager base;
-
-   struct pb_manager *provider;
-   
-   struct fenced_buffer_list *fenced_list;
-};
-
-
-static INLINE struct fenced_pb_manager *
-fenced_pb_manager(struct pb_manager *mgr)
-{
-   assert(mgr);
-   return (struct fenced_pb_manager *)mgr;
-}
-
-
-static struct pb_buffer *
-fenced_bufmgr_create_buffer(struct pb_manager *mgr, 
-                            pb_size size,
-                            const struct pb_desc *desc)
-{
-   struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr);
-   struct pb_buffer *buf;
-   struct pb_buffer *fenced_buf;
-
-   /* check for free buffers before allocating new ones */
-   fenced_buffer_list_check_free(fenced_mgr->fenced_list, 0);
-   
-   buf = fenced_mgr->provider->create_buffer(fenced_mgr->provider, size, desc);
-   if(!buf) {
-      /* try harder to get a buffer */
-      fenced_buffer_list_check_free(fenced_mgr->fenced_list, 1);
-      
-      buf = fenced_mgr->provider->create_buffer(fenced_mgr->provider, size, desc);
-      if(!buf) {
-#if 0
-         fenced_buffer_list_dump(fenced_mgr->fenced_list);
-#endif
-         
-         /* give up */
-         return NULL;
-      }
-   }
-   
-   fenced_buf = fenced_buffer_create(fenced_mgr->fenced_list, buf);
-   if(!fenced_buf) {
-      pb_reference(&buf, NULL);
-   }
-   
-   return fenced_buf;
-}
-
-
-static void
-fenced_bufmgr_flush(struct pb_manager *mgr)
-{
-   struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr);
-
-   fenced_buffer_list_check_free(fenced_mgr->fenced_list, TRUE);
-
-   assert(fenced_mgr->provider->flush);
-   if(fenced_mgr->provider->flush)
-      fenced_mgr->provider->flush(fenced_mgr->provider);
-}
-
-
-static void
-fenced_bufmgr_destroy(struct pb_manager *mgr)
-{
-   struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr);
-
-   fenced_buffer_list_destroy(fenced_mgr->fenced_list);
-
-   if(fenced_mgr->provider)
-      fenced_mgr->provider->destroy(fenced_mgr->provider);
-   
-   FREE(fenced_mgr);
-}
-
-
-struct pb_manager *
-fenced_bufmgr_create(struct pb_manager *provider, 
-                     struct pb_fence_ops *ops) 
-{
-   struct fenced_pb_manager *fenced_mgr;
-
-   if(!provider)
-      return NULL;
-   
-   fenced_mgr = CALLOC_STRUCT(fenced_pb_manager);
-   if (!fenced_mgr)
-      return NULL;
-
-   fenced_mgr->base.destroy = fenced_bufmgr_destroy;
-   fenced_mgr->base.create_buffer = fenced_bufmgr_create_buffer;
-   fenced_mgr->base.flush = fenced_bufmgr_flush;
-
-   fenced_mgr->provider = provider;
-   fenced_mgr->fenced_list = fenced_buffer_list_create(ops);
-   if(!fenced_mgr->fenced_list) {
-      FREE(fenced_mgr);
-      return NULL;
-   }
-      
-   return &fenced_mgr->base;
-}
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index 6400fc5b0a3..63195715d68 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -35,7 +35,7 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_debug.h"
-#include "pipe/p_thread.h"
+#include "os/os_thread.h"
 #include "util/u_memory.h"
 #include "util/u_double_list.h"
 #include "util/u_mm.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index 7fd65ed2261..fea234ae8c7 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -37,7 +37,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_thread.h"
+#include "os/os_thread.h"
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
 #include "util/u_double_list.h"
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index d21910d0bf0..24e2820f881 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -38,7 +38,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_thread.h"
+#include "os/os_thread.h"
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
 #include "util/u_double_list.h"
@@ -483,11 +483,15 @@ pb_slab_range_manager_create_buffer(struct pb_manager *_mgr,
 {
    struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr);
    pb_size bufSize;
+   pb_size reqSize = size;
    unsigned i;
 
+   if(desc->alignment > reqSize)
+	   reqSize = desc->alignment;
+
    bufSize = mgr->minBufSize;
    for (i = 0; i < mgr->numBuckets; ++i) {
-      if(bufSize >= size)
+      if(bufSize >= reqSize)
 	 return mgr->buckets[i]->create_buffer(mgr->buckets[i], size, desc);
       bufSize *= 2;
    }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.c b/src/gallium/auxiliary/pipebuffer/pb_validate.c
index ce40c0cf0e6..903afc749d3 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_validate.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_validate.c
@@ -39,7 +39,6 @@
 #include "util/u_debug.h"
 
 #include "pb_buffer.h"
-#include "pb_buffer_fenced.h"
 #include "pb_validate.h"
 
 
diff --git a/src/gallium/auxiliary/rbug/Makefile b/src/gallium/auxiliary/rbug/Makefile
deleted file mode 100644
index cd12e8468fc..00000000000
--- a/src/gallium/auxiliary/rbug/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = rbug
-
-C_SOURCES = \
-	rbug_connection.c \
-	rbug_core.c \
-	rbug_texture.c \
-	rbug_context.c \
-	rbug_shader.c \
-	rbug_demarshal.c
-
-include ../../Makefile.template
diff --git a/src/gallium/auxiliary/rbug/SConscript b/src/gallium/auxiliary/rbug/SConscript
deleted file mode 100644
index 4a9afb45d3c..00000000000
--- a/src/gallium/auxiliary/rbug/SConscript
+++ /dev/null
@@ -1,14 +0,0 @@
-Import('*')
-
-rbug = env.ConvenienceLibrary(
-	target = 'rbug',
-	source = [
-		'rbug_core.c',
-		'rbug_shader.c',
-		'rbug_context.c',
-		'rbug_texture.c',
-		'rbug_demarshal.c',
-		'rbug_connection.c',
-	])
-
-auxiliaries.insert(0, rbug)
diff --git a/src/gallium/auxiliary/rbug/rbug_connection.c b/src/gallium/auxiliary/rbug/rbug_connection.c
index 52acb700af9..ae4e27f9f6b 100644
--- a/src/gallium/auxiliary/rbug/rbug_connection.c
+++ b/src/gallium/auxiliary/rbug/rbug_connection.c
@@ -87,6 +87,7 @@ rbug_get_message(struct rbug_connection *c, uint32_t *serial)
    if (!data) {
       return NULL;
    }
+   data->opcode = 0;
 
    do {
       uint8_t *ptr = ((uint8_t*)data) + read;
diff --git a/src/gallium/auxiliary/rbug/rbug_context.h b/src/gallium/auxiliary/rbug/rbug_context.h
index da61c2365b0..03126d6b123 100644
--- a/src/gallium/auxiliary/rbug/rbug_context.h
+++ b/src/gallium/auxiliary/rbug/rbug_context.h
@@ -46,7 +46,7 @@ typedef enum
 	RBUG_BLOCK_BEFORE = 1,
 	RBUG_BLOCK_AFTER = 2,
 	RBUG_BLOCK_RULE = 4,
-	RBUG_BLOCK_MASK = 7,
+	RBUG_BLOCK_MASK = 7
 } rbug_block_t;
 
 struct rbug_proto_context_list
diff --git a/src/gallium/auxiliary/rbug/rbug_proto.h b/src/gallium/auxiliary/rbug/rbug_proto.h
index d273be0166d..4f3eb75dc4d 100644
--- a/src/gallium/auxiliary/rbug/rbug_proto.h
+++ b/src/gallium/auxiliary/rbug/rbug_proto.h
@@ -65,7 +65,7 @@ enum rbug_opcode
 	RBUG_OP_SHADER_DISABLE = 770,
 	RBUG_OP_SHADER_REPLACE = 771,
 	RBUG_OP_SHADER_LIST_REPLY = -768,
-	RBUG_OP_SHADER_INFO_REPLY = -769,
+	RBUG_OP_SHADER_INFO_REPLY = -769
 };
 
 /**
diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile
deleted file mode 100644
index ab8ea464c6e..00000000000
--- a/src/gallium/auxiliary/rtasm/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = rtasm
-
-C_SOURCES = \
-	rtasm_cpu.c \
-	rtasm_execmem.c \
-	rtasm_x86sse.c \
-	rtasm_ppc.c \
-	rtasm_ppc_spe.c
-
-include ../../Makefile.template
diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript
deleted file mode 100644
index eb48368accb..00000000000
--- a/src/gallium/auxiliary/rtasm/SConscript
+++ /dev/null
@@ -1,13 +0,0 @@
-Import('*')
-
-rtasm = env.ConvenienceLibrary(
-	target = 'rtasm',
-	source = [
-		'rtasm_cpu.c',
-		'rtasm_execmem.c',
-		'rtasm_x86sse.c',
-		'rtasm_ppc.c',
-		'rtasm_ppc_spe.c',
-	])
-
-auxiliaries.insert(0, rtasm)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 01811d50114..65d5ce795be 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -32,7 +32,7 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_thread.h"
+#include "os/os_thread.h"
 #include "util/u_memory.h"
 
 #include "rtasm_execmem.h"
@@ -41,6 +41,12 @@
 #define MAP_ANONYMOUS MAP_ANON
 #endif
 
+#if defined(PIPE_OS_WINDOWS)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN 1
+#endif
+#include <windows.h>
+#endif
 
 #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
 
@@ -52,7 +58,7 @@
 
 #include <unistd.h>
 #include <sys/mman.h>
-#include "pipe/p_thread.h"
+#include "os/os_thread.h"
 #include "util/u_mm.h"
 
 #define EXEC_HEAP_SIZE (10*1024*1024)
@@ -118,7 +124,29 @@ rtasm_exec_free(void *addr)
 }
 
 
-#else /* PIPE_OS_LINUX || PIPE_OS_BSD || PIPE_OS_SOLARIS */
+#elif defined(PIPE_OS_WINDOWS)
+
+
+/*
+ * Avoid Data Execution Prevention: commit pages that are requested with
+ * PAGE_EXECUTE_READWRITE.  VirtualAlloc returns NULL when it fails.
+ */
+void *
+rtasm_exec_malloc(size_t size)
+{
+   return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
+}
+
+/* Release a region returned by VirtualAlloc; with MEM_RELEASE the size argument must be 0. */
+void
+rtasm_exec_free(void *addr)
+{
+   VirtualFree(addr, 0, MEM_RELEASE);
+}
+
+
+#else
+
 
 /*
  * Just use regular memory.
@@ -138,4 +166,4 @@ rtasm_exec_free(void *addr)
 }
 
 
-#endif /* PIPE_OS_LINUX || PIPE_OS_BSD || PIPE_OS_SOLARIS */
+#endif
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 1acf3c373eb..f675427d987 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -673,6 +673,13 @@ void x86_and( struct x86_function *p,
    emit_op_modrm( p, 0x23, 0x21, dst, src );
 }
 
+void x86_div( struct x86_function *p,
+              struct x86_reg src )
+{  /* Emit the x86 DIV instruction (opcode "F7 /6") taking src as its operand. */
+   assert(src.file == file_REG32 && src.mod == mod_REG);   /* only a direct 32-bit register is accepted */
+   emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src);   /* reg index 6 presumably becomes the /6 opcode extension -- confirm in emit_op_modrm */
+}
+
 
 
 /***********************************************************************
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 731a6517968..f7612d416a0 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -244,6 +244,7 @@ void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_sahf( struct x86_function *p );
+void x86_div( struct x86_function *p, struct x86_reg src );
 
 
 void x86_cdecl_caller_push_regs( struct x86_function *p );
diff --git a/src/gallium/auxiliary/sct/Makefile b/src/gallium/auxiliary/sct/Makefile
deleted file mode 100644
index a7d111b6891..00000000000
--- a/src/gallium/auxiliary/sct/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = sct
-
-C_SOURCES = \
-	sct.c
-
-include ../../Makefile.template
diff --git a/src/gallium/auxiliary/sct/SConscript b/src/gallium/auxiliary/sct/SConscript
deleted file mode 100644
index 76927d973f8..00000000000
--- a/src/gallium/auxiliary/sct/SConscript
+++ /dev/null
@@ -1,9 +0,0 @@
-Import('*')
-
-sct = env.ConvenienceLibrary(
-	target = 'sct',
-	source = [
-		'sct.c'
-	])
-
-auxiliaries.insert(0, sct)
diff --git a/src/gallium/auxiliary/sct/sct.c b/src/gallium/auxiliary/sct/sct.c
deleted file mode 100644
index 722d2b7e66e..00000000000
--- a/src/gallium/auxiliary/sct/sct.c
+++ /dev/null
@@ -1,453 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#include "util/u_memory.h"
-#include "pipe/p_state.h"
-#include "sct.h"
-
-
-struct texture_list
-{
-   struct pipe_texture *texture;
-   struct texture_list *next;
-};
-
-
-
-#define MAX_SURFACES  ((PIPE_MAX_COLOR_BUFS) + 1)
-
-struct sct_context
-{
-   const struct pipe_context *context;
-
-   /** surfaces the context is drawing into */
-   struct pipe_surface *surfaces[MAX_SURFACES];
-
-   /** currently bound textures */
-   struct pipe_texture *textures[PIPE_MAX_SAMPLERS];
-
-   /** previously bound textures, used but not flushed */
-   struct texture_list *textures_used;
-
-   boolean needs_flush;
-
-   struct sct_context *next;
-};
-
-
-
-struct sct_surface
-{
-   const struct pipe_surface *surface;
-
-   /** list of contexts drawing to this surface */
-   struct sct_context_list *contexts;
-
-   struct sct_surface *next;
-};
-
-
-
-/**
- * Find the surface_info for the given pipe_surface
- */
-static struct sct_surface *
-find_surface_info(struct surface_context_tracker *sct,
-                  const struct pipe_surface *surface)
-{
-   struct sct_surface *si;
-   for (si = sct->surfaces; si; si = si->next)
-      if (si->surface == surface)
-         return si;
-   return NULL;
-}
-
-
-/**
- * As above, but create new surface_info if surface is new.
- */
-static struct sct_surface *
-find_create_surface_info(struct surface_context_tracker *sct,
-                         const struct pipe_surface *surface)
-{
-   struct sct_surface *si = find_surface_info(sct, surface);
-   if (si)
-      return si;
-
-   /* alloc new */
-   si = CALLOC_STRUCT(sct_surface);
-   if (si) {
-      si->surface = surface;
-
-      /* insert at head */
-      si->next = sct->surfaces;
-      sct->surfaces = si;
-   }
-
-   return si;
-}
-
-
-/**
- * Find a context_info for the given context.
- */
-static struct sct_context *
-find_context_info(struct surface_context_tracker *sct,
-                  const struct pipe_context *context)
-{
-   struct sct_context *ci;
-   for (ci = sct->contexts; ci; ci = ci->next)
-      if (ci->context == context)
-         return ci;
-   return NULL;
-}
-
-
-/**
- * As above, but create new context_info if context is new.
- */
-static struct sct_context *
-find_create_context_info(struct surface_context_tracker *sct,
-                         const struct pipe_context *context)
-{
-   struct sct_context *ci = find_context_info(sct, context);
-   if (ci)
-      return ci;
-
-   /* alloc new */
-   ci = CALLOC_STRUCT(sct_context);
-   if (ci) {
-      ci->context = context;
-
-      /* insert at head */
-      ci->next = sct->contexts;
-      sct->contexts = ci;
-   }
-
-   return ci;
-}
-
-
-/**
- * Is the context already bound to the surface?
- */
-static boolean
-find_surface_context(const struct sct_surface *si,
-                     const struct pipe_context *context)
-{
-   const struct sct_context_list *cl;
-   for (cl = si->contexts; cl; cl = cl->next) {
-      if (cl->context == context) {
-         return TRUE;
-      }
-   }
-   return FALSE;
-}
-
-
-/**
- * Add a context to the list of contexts associated with a surface.
- */
-static void
-add_context_to_surface(struct sct_surface *si,
-                       const struct pipe_context *context)
-{
-   struct sct_context_list *cl = CALLOC_STRUCT(sct_context_list);
-   if (cl) {
-      cl->context = context;
-      /* insert at head of list of contexts */
-      cl->next = si->contexts;
-      si->contexts = cl;
-   }
-}
-
-
-/**
- * Remove a context from the list of contexts associated with a surface.
- */
-static void
-remove_context_from_surface(struct sct_surface *si,
-                            const struct pipe_context *context)
-{
-   struct sct_context_list *prev = NULL, *curr, *next;
-
-   for (curr = si->contexts; curr; curr = next) {
-      if (curr->context == context) {
-         /* remove */
-         if (prev)
-            prev->next = curr->next;
-         else
-            si->contexts = curr->next;
-         next = curr->next;
-         FREE(curr);
-      }
-      else {
-         prev = curr;
-         next = curr->next;
-      }
-   }
-}
-
-
-/**
- * Unbind context from surface.
- */
-static void
-unbind_context_surface(struct surface_context_tracker *sct,
-                       struct pipe_context *context,
-                       struct pipe_surface *surface)
-{
-   struct sct_surface *si = find_surface_info(sct, surface);
-   if (si) {
-      remove_context_from_surface(si, context);
-   }
-}
-
-
-/**
- * Bind context to a set of surfaces (color + Z).
- * Like MakeCurrent().
- */
-void
-sct_bind_surfaces(struct surface_context_tracker *sct,
-                  struct pipe_context *context,
-                  uint num_surf,
-                  struct pipe_surface **surfaces)
-{
-   struct sct_context *ci = find_create_context_info(sct, context);
-   uint i;
-
-   if (!ci) {
-      return; /* out of memory */
-   }
-
-   /* unbind currently bound surfaces */
-   for (i = 0; i < MAX_SURFACES; i++) {
-      if (ci->surfaces[i]) {
-         unbind_context_surface(sct, context, ci->surfaces[i]);
-      }
-   }
-
-   /* bind new surfaces */
-   for (i = 0; i < num_surf; i++) {
-      struct sct_surface *si = find_create_surface_info(sct, surfaces[i]);
-      if (!find_surface_context(si, context)) {
-         add_context_to_surface(si, context);
-      }
-   }
-}
-
-
-/**
- * Return list of contexts bound to a surface.
- */
-const struct sct_context_list *
-sct_get_surface_contexts(struct surface_context_tracker *sct,
-                         const struct pipe_surface *surface)
-{
-   const struct sct_surface *si = find_surface_info(sct, surface);
-   return si->contexts;
-}
-
-
-
-static boolean
-find_texture(const struct sct_context *ci,
-             const struct pipe_texture *texture)
-{
-   const struct texture_list *tl;
-
-   for (tl = ci->textures_used; tl; tl = tl->next) {
-      if (tl->texture == texture) {
-         return TRUE;
-      }
-   }
-   return FALSE;
-}
-
-
-/**
- * Add the given texture to the context's list of used textures.
- */
-static void
-add_texture_used(struct sct_context *ci,
-                 struct pipe_texture *texture)
-{
-   if (!find_texture(ci, texture)) {
-      /* add to list */
-      struct texture_list *tl = CALLOC_STRUCT(texture_list);
-      if (tl) {
-         pipe_texture_reference(&tl->texture, texture);
-         /* insert at head */
-         tl->next = ci->textures_used;
-         ci->textures_used = tl;
-      }
-   }
-}
-
-
-/**
- * Bind a texture to a rendering context.
- */
-void
-sct_bind_texture(struct surface_context_tracker *sct,
-                 struct pipe_context *context,
-                 uint unit,
-                 struct pipe_texture *tex)
-{
-   struct sct_context *ci = find_context_info(sct, context);
-
-   if (ci->textures[unit] != tex) {
-      /* put texture on the 'used' list */
-      add_texture_used(ci, tex);
-      /* bind new */
-      pipe_texture_reference(&ci->textures[unit], tex);
-   }
-}
-
-
-/**
- * Check if the given texture has been used by the rendering context
- * since the last call to sct_flush_textures().
- */
-boolean
-sct_is_texture_used(struct surface_context_tracker *sct,
-                    const struct pipe_context *context,
-                    const struct pipe_texture *texture)
-{
-   const struct sct_context *ci = find_context_info(sct, context);
-   return find_texture(ci, texture);
-}
-
-
-/**
- * To be called when the image contents of a texture are changed, such
- * as for gl[Copy]TexSubImage().
- * XXX this may not be needed
- */
-void
-sct_update_texture(struct pipe_texture *tex)
-{
-
-}
-
-
-/**
- * When a scene is flushed/rendered we can release the list of
- * used textures.
- */
-void
-sct_flush_textures(struct surface_context_tracker *sct,
-                   struct pipe_context *context)
-{
-   struct sct_context *ci = find_context_info(sct, context);
-   struct texture_list *tl, *next;
-   uint i;
-
-   for (tl = ci->textures_used; tl; tl = next) {
-      next = tl->next;
-      pipe_texture_reference(&tl->texture, NULL);
-      FREE(tl);
-   }
-   ci->textures_used = NULL;
-
-   /* put the currently bound textures on the 'used' list */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      add_texture_used(ci, ci->textures[i]);
-   }
-}
-
-
-
-void
-sct_destroy_context(struct surface_context_tracker *sct,
-                    struct pipe_context *context)
-{
-   /* XXX should we require an unbinding first? */
-   {
-      struct sct_surface *si;
-      for (si = sct->surfaces; si; si = si->next) {
-         remove_context_from_surface(si, context);
-      }
-   }
-
-   /* remove context from context_info list */
-   {
-      struct sct_context *ci, *next, *prev = NULL;
-      for (ci = sct->contexts; ci; ci = next) {
-         next = ci->next;
-         if (ci->context == context) {
-            if (prev)
-               prev->next = ci->next;
-            else
-               sct->contexts = ci->next;
-            FREE(ci);
-         }
-         else {
-            prev = ci;
-         }
-      }
-   }
-
-}
-
-
-void
-sct_destroy_surface(struct surface_context_tracker *sct,
-                    struct pipe_surface *surface)
-{
-   if (1) {
-      /* debug/sanity: no context should be bound to surface */
-      struct sct_context *ci;
-      uint i;
-      for (ci = sct->contexts; ci; ci = ci->next) {
-         for (i = 0; i < MAX_SURFACES; i++) {
-            assert(ci->surfaces[i] != surface);
-         }
-      }
-   }
-
-   /* remove surface from sct_surface list */
-   {
-      struct sct_surface *si, *next, *prev = NULL;
-      for (si = sct->surfaces; si; si = next) {
-         next = si->next;
-         if (si->surface == surface) {
-            /* unlink */
-            if (prev)
-               prev->next = si->next;
-            else
-               sct->surfaces = si->next;
-            FREE(si);
-         }
-         else {
-            prev = si;
-         }
-      }
-   }
-}
diff --git a/src/gallium/auxiliary/sct/sct.h b/src/gallium/auxiliary/sct/sct.h
deleted file mode 100644
index cf7c4d3bdfd..00000000000
--- a/src/gallium/auxiliary/sct/sct.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Surface/Context Tracking
- *
- * For some drivers, we need to monitor the binding between contexts and
- * surfaces/textures.
- * This code may evolve quite a bit...
- */
-
-
-#ifndef SCT_H
-#define SCT_H
-
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-
-struct pipe_context;
-struct pipe_surface;
-
-struct sct_context;
-struct sct_surface;
-
-
-/**
- * Per-device info, basically
- */
-struct surface_context_tracker
-{
-   struct sct_context *contexts;
-   struct sct_surface *surfaces;
-};
-
-
-
-/**
- * Simple linked list of contexts
- */
-struct sct_context_list
-{
-   const struct pipe_context *context;
-   struct sct_context_list *next;
-};
-
-
-
-extern void
-sct_bind_surfaces(struct surface_context_tracker *sct,
-                  struct pipe_context *context,
-                  uint num_surf,
-                  struct pipe_surface **surfaces);
-
-
-extern void
-sct_bind_texture(struct surface_context_tracker *sct,
-                 struct pipe_context *context,
-                 uint unit,
-                 struct pipe_texture *texture);
-
-
-extern void
-sct_update_texture(struct pipe_texture *tex);
-
-
-extern boolean
-sct_is_texture_used(struct surface_context_tracker *sct,
-                    const struct pipe_context *context,
-                    const struct pipe_texture *texture);
-
-extern void
-sct_flush_textures(struct surface_context_tracker *sct,
-                   struct pipe_context *context);
-
-
-extern const struct sct_context_list *
-sct_get_surface_contexts(struct surface_context_tracker *sct,
-                         const struct pipe_surface *surf);
-
-
-extern void
-sct_destroy_context(struct surface_context_tracker *sct,
-                    struct pipe_context *context);
-
-
-extern void
-sct_destroy_surface(struct surface_context_tracker *sct,
-                    struct pipe_surface *surface);
-
-
-
-#ifdef	__cplusplus
-}
-#endif
-
-#endif /* SCT_H */
diff --git a/src/gallium/auxiliary/sct/usage.c b/src/gallium/auxiliary/sct/usage.c
deleted file mode 100644
index 6227f199628..00000000000
--- a/src/gallium/auxiliary/sct/usage.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/* surface / context tracking */
-
-
-/*
-
-context A:
-  render to texture T
-
-context B:
-  texture from T
-
------------------------
-
-flush surface:
-  which contexts are bound to the surface?
-
------------------------
-
-glTexSubImage():
-  which contexts need to be flushed?
-
- */
-
-
-/*
-
-in MakeCurrent():
-
-  call sct_bind_surfaces(context, list of surfaces) to update the
-  dependencies between context and surfaces
-
-
-in SurfaceFlush(), or whatever it is in D3D:
-
-  call sct_get_surface_contexts(surface) to get a list of contexts
-  which are currently bound to the surface.
-
-
-
-in BindTexture():
-
-  call sct_bind_texture(context, texture) to indicate that the texture
-  is used in the scene.
-
-
-in glTexSubImage() or RenderToTexture():
-
-  call sct_is_texture_used(context, texture) to determine if the texture
-  has been used in the scene, but the scene's not flushed.  If TRUE is
-  returned it means the scene has to be rendered/flushed before the contents
-  of the texture can be changed.
-
-
-in psb_scene_flush/terminate():
-
-  call sct_flush_textures(context) to tell the SCT that the textures which
-  were used in the scene can be released.
-
-
-
-*/
diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
deleted file mode 100644
index 5f0a580b096..00000000000
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = tgsi
-
-C_SOURCES = \
-	tgsi_sanity.c \
-	tgsi_build.c \
-	tgsi_dump.c \
-	tgsi_exec.c \
-	tgsi_info.c \
-	tgsi_iterate.c \
-	tgsi_parse.c \
-	tgsi_ppc.c \
-	tgsi_scan.c \
-	tgsi_sse2.c \
-	tgsi_text.c \
-	tgsi_transform.c \
-	tgsi_ureg.c \
-	tgsi_util.c
-
-include ../../Makefile.template
diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript
deleted file mode 100644
index b6bc2924f06..00000000000
--- a/src/gallium/auxiliary/tgsi/SConscript
+++ /dev/null
@@ -1,23 +0,0 @@
-Import('*')
-
-tgsi = env.ConvenienceLibrary(
-	target = 'tgsi',
-	source = [
-		'tgsi_build.c',
-		'tgsi_dump.c',
-		'tgsi_dump_c.c',
-		'tgsi_exec.c',
-		'tgsi_info.c',
-		'tgsi_iterate.c',
-		'tgsi_parse.c',
-		'tgsi_sanity.c',
-		'tgsi_scan.c',
-		'tgsi_ppc.c',
-		'tgsi_sse2.c',
-		'tgsi_text.c',
-		'tgsi_transform.c',
-		'tgsi_ureg.c',
-		'tgsi_util.c',
-	])
-
-auxiliaries.insert(0, tgsi)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index d75ab1b3ff9..0890078cd05 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -103,10 +103,11 @@ tgsi_default_declaration( void )
    declaration.File = TGSI_FILE_NULL;
    declaration.UsageMask = TGSI_WRITEMASK_XYZW;
    declaration.Interpolate = TGSI_INTERPOLATE_CONSTANT;
+   declaration.Dimension = 0;
    declaration.Semantic = 0;
    declaration.Centroid = 0;
    declaration.Invariant = 0;
-   declaration.Padding = 0;
+   declaration.CylindricalWrap = 0;
 
    return declaration;
 }
@@ -116,9 +117,11 @@ tgsi_build_declaration(
    unsigned file,
    unsigned usage_mask,
    unsigned interpolate,
+   unsigned dimension,
    unsigned semantic,
    unsigned centroid,
    unsigned invariant,
+   unsigned cylindrical_wrap,
    struct tgsi_header *header )
 {
    struct tgsi_declaration declaration;
@@ -130,9 +133,11 @@ tgsi_build_declaration(
    declaration.File = file;
    declaration.UsageMask = usage_mask;
    declaration.Interpolate = interpolate;
+   declaration.Dimension = dimension;
    declaration.Semantic = semantic;
    declaration.Centroid = centroid;
    declaration.Invariant = invariant;
+   declaration.CylindricalWrap = cylindrical_wrap;
 
    header_bodysize_grow( header );
 
@@ -183,9 +188,11 @@ tgsi_build_full_declaration(
       full_decl->Declaration.File,
       full_decl->Declaration.UsageMask,
       full_decl->Declaration.Interpolate,
+      full_decl->Declaration.Dimension,
       full_decl->Declaration.Semantic,
       full_decl->Declaration.Centroid,
       full_decl->Declaration.Invariant,
+      full_decl->Declaration.CylindricalWrap,
       header );
 
    if (maxsize <= size)
@@ -199,6 +206,20 @@ tgsi_build_full_declaration(
       declaration,
       header );
 
+   if (full_decl->Declaration.Dimension) {
+      struct tgsi_declaration_dimension *dd;
+
+      if (maxsize <= size) {
+         return 0;
+      }
+      dd = (struct tgsi_declaration_dimension *)&tokens[size];
+      size++;
+
+      *dd = tgsi_build_declaration_dimension(full_decl->Dim.Index2D,
+                                             declaration,
+                                             header);
+   }
+
    if( full_decl->Declaration.Semantic ) {
       struct tgsi_declaration_semantic *ds;
 
@@ -249,6 +270,34 @@ tgsi_build_declaration_range(
    return declaration_range;
 }
 
+/* Return a dimension token with Index2D and Padding both cleared. */
+struct tgsi_declaration_dimension
+tgsi_default_declaration_dimension(void)
+{
+   struct tgsi_declaration_dimension dim_token;
+
+   dim_token.Padding = 0;
+   dim_token.Index2D = 0;
+   return dim_token;
+}
+
+/* Build the dimension token holding index_2d (asserted <= 0xFFFF) and
+ * call declaration_grow() for the owning declaration and header. */
+struct tgsi_declaration_dimension
+tgsi_build_declaration_dimension(unsigned index_2d,
+                                 struct tgsi_declaration *declaration,
+                                 struct tgsi_header *header)
+{
+   struct tgsi_declaration_dimension dim_token;
+
+   assert(index_2d <= 0xFFFF);
+   dim_token = tgsi_default_declaration_dimension();
+   dim_token.Index2D = index_2d;
+   declaration_grow(declaration, header);
+
+   return dim_token;
+}
+
 struct tgsi_declaration_semantic
 tgsi_default_declaration_semantic( void )
 {
@@ -399,7 +448,7 @@ tgsi_default_instruction( void )
    struct tgsi_instruction instruction;
 
    instruction.Type = TGSI_TOKEN_TYPE_INSTRUCTION;
-   instruction.NrTokens = 1;
+   instruction.NrTokens = 0;
    instruction.Opcode = TGSI_OPCODE_MOV;
    instruction.Saturate = TGSI_SAT_NONE;
    instruction.Predicate = 0;
@@ -942,3 +991,107 @@ tgsi_default_full_dst_register( void )
    return full_dst_register;
 }
 
+/* Default property token: GS_INPUT_PRIM name, one token, no data yet. */
+struct tgsi_property
+tgsi_default_property( void )
+{
+   struct tgsi_property prop;
+
+   prop.Padding = 0;
+   prop.PropertyName = TGSI_PROPERTY_GS_INPUT_PRIM;
+   prop.NrTokens = 1;   /* counts this token itself */
+   prop.Type = TGSI_TOKEN_TYPE_PROPERTY;
+   return prop;
+}
+
+/* Make a property token named property_name and call
+ * header_bodysize_grow() on header for the new token. */
+struct tgsi_property
+tgsi_build_property(unsigned property_name,
+                    struct tgsi_header *header)
+{
+   struct tgsi_property prop = tgsi_default_property();
+
+   prop.PropertyName = property_name;
+   header_bodysize_grow( header );
+
+   return prop;
+}
+
+
+/* Default full property: default header token, all data words zeroed. */
+struct tgsi_full_property
+tgsi_default_full_property( void )
+{
+   struct tgsi_full_property  full_property;
+
+   full_property.Property  = tgsi_default_property();
+   /* Size taken from the array itself rather than a hard-coded 8. */
+   memset(full_property.u, 0, sizeof(full_property.u));
+   return full_property;
+}
+
+/* Bump property->NrTokens (asserted < 0xFF beforehand), then call
+ * header_bodysize_grow() on header. */
+static void
+property_grow(
+   struct tgsi_property *property,
+   struct tgsi_header *header )
+{
+   assert( property->NrTokens < 0xFF );
+   property->NrTokens += 1;
+   header_bodysize_grow( header );
+}
+
+/* Wrap value in a property data token; property_grow() is called so the
+ * owning property and the header account for the extra token. */
+struct tgsi_property_data
+tgsi_build_property_data(
+   unsigned value,
+   struct tgsi_property *property,
+   struct tgsi_header *header )
+{
+   struct tgsi_property_data data_token;
+
+   property_grow( property, header );
+   data_token.Data = value;
+   return data_token;
+}
+
+/*
+ * Emit a full property (header token plus its data words) into tokens[].
+ * Returns the number of tokens written, or 0 when maxsize is too small.
+ */
+unsigned
+tgsi_build_full_property(
+   const struct tgsi_full_property *full_prop,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize )
+{
+   unsigned size = 0, i;
+   struct tgsi_property *property;
+
+   if( maxsize <= size )
+      return 0;
+   property = (struct tgsi_property *) &tokens[size];
+   size++;
+
+   *property = tgsi_build_property(
+      full_prop->Property.PropertyName, header );
+   assert( full_prop->Property.NrTokens <= 8 + 1 );
+
+   /* "i + 1 < NrTokens" rather than "i < NrTokens - 1": with NrTokens == 0
+    * the subtraction wraps against the unsigned i and overruns u[]. */
+   for( i = 0; i + 1 < full_prop->Property.NrTokens; i++ ) {
+      struct tgsi_property_data *data;
+      if( maxsize <= size )
+         return  0;
+      data = (struct tgsi_property_data *) &tokens[size];
+      size++;
+      *data = tgsi_build_property_data(
+         full_prop->u[i].Data, property, header );
+   }
+
+   return size;
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h
index ffea786770c..13d7f5272d6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.h
@@ -64,9 +64,11 @@ tgsi_build_declaration(
    unsigned file,
    unsigned usage_mask,
    unsigned interpolate,
+   unsigned dimension,
    unsigned semantic,
    unsigned centroid,
    unsigned invariant,
+   unsigned cylindrical_wrap,
    struct tgsi_header *header );
 
 struct tgsi_full_declaration
@@ -89,6 +91,14 @@ tgsi_build_declaration_range(
    struct tgsi_declaration *declaration,
    struct tgsi_header *header );
 
+struct tgsi_declaration_dimension
+tgsi_default_declaration_dimension(void);
+
+struct tgsi_declaration_dimension
+tgsi_build_declaration_dimension(unsigned index_2d,
+                                 struct tgsi_declaration *declaration,
+                                 struct tgsi_header *header);
+
 struct tgsi_declaration_semantic
 tgsi_default_declaration_semantic( void );
 
@@ -127,6 +137,34 @@ tgsi_build_full_immediate(
    unsigned maxsize );
 
 /*
+ * properties
+ */
+
+struct tgsi_property
+tgsi_default_property( void );
+
+struct tgsi_property
+tgsi_build_property(
+   unsigned property_name,
+   struct tgsi_header *header );
+
+struct tgsi_full_property
+tgsi_default_full_property( void );
+
+struct tgsi_property_data
+tgsi_build_property_data(
+   unsigned value,
+   struct tgsi_property *property,
+   struct tgsi_header *header );
+
+unsigned
+tgsi_build_full_property(
+   const struct tgsi_full_property *full_prop,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize );
+
+/*
  * instruction
  */
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index d09ab925656..57031419f8e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -101,7 +101,8 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "ADDR",
    "IMM",
    "LOOP",
-   "PRED"
+   "PRED",
+   "SV"
 };
 
 static const char *interpolate_names[] =
@@ -120,12 +121,17 @@ static const char *semantic_names[] =
    "PSIZE",
    "GENERIC",
    "NORMAL",
-   "FACE"
+   "FACE",
+   "EDGEFLAG",
+   "PRIM_ID",
+   "INSTANCEID"
 };
 
 static const char *immediate_type_names[] =
 {
-   "FLT32"
+   "FLT32",
+   "UINT32",
+   "INT32"
 };
 
 static const char *swizzle_names[] =
@@ -149,24 +155,87 @@ static const char *texture_names[] =
    "SHADOWRECT"
 };
 
+static const char *property_names[] =
+{
+   "GS_INPUT_PRIMITIVE",
+   "GS_OUTPUT_PRIMITIVE",
+   "GS_MAX_OUTPUT_VERTICES",
+   "FS_COORD_ORIGIN",
+   "FS_COORD_PIXEL_CENTER"
+};
+
+static const char *primitive_names[] =
+{
+   "POINTS",
+   "LINES",
+   "LINE_LOOP",
+   "LINE_STRIP",
+   "TRIANGLES",
+   "TRIANGLE_STRIP",
+   "TRIANGLE_FAN",
+   "QUADS",
+   "QUAD_STRIP",
+   "POLYGON"
+};
+
+static const char *fs_coord_origin_names[] =
+{
+   "UPPER_LEFT",
+   "LOWER_LEFT"
+};
+
+static const char *fs_coord_pixel_center_names[] =
+{
+   "HALF_INTEGER",
+   "INTEGER"
+};
+
 
 static void
-_dump_register(
+_dump_register_dst(
    struct dump_ctx *ctx,
    uint file,
-   int first,
-   int last )
+   int index)
 {
    ENM( file, file_names );
+
    CHR( '[' );
-   SID( first );
-   if (first != last) {
-      TXT( ".." );
-      SID( last );
-   }
+   SID( index );
    CHR( ']' );
 }
 
+
+/* Print a source operand: FILE, an optional [dim] when Dimension is set,
+ * then [index] or, when Indirect is set, [ADDRFILE[n].swz+index]. */
+static void
+_dump_register_src(
+   struct dump_ctx *ctx,
+   const struct tgsi_full_src_register *src )
+{
+   ENM(src->Register.File, file_names);
+   if (src->Register.Dimension) {
+      CHR('[');
+      SID(src->Dimension.Index);
+      CHR(']');
+   }
+
+   CHR( '[' );
+   if (!src->Register.Indirect) {
+      SID( src->Register.Index );
+   } else {
+      ENM( src->Indirect.File, file_names );
+      CHR( '[' );
+      SID( src->Indirect.Index );
+      TXT( "]." );
+      ENM( src->Indirect.SwizzleX, swizzle_names );
+      if (src->Register.Index > 0)
+         CHR( '+' );
+      if (src->Register.Index != 0)
+         SID( src->Register.Index );
+   }
+   CHR( ']' );
+}
+
 static void
 _dump_register_ind(
    struct dump_ctx *ctx,
@@ -221,11 +290,28 @@ iter_declaration(
 
    TXT( "DCL " );
 
-   _dump_register(
-      ctx,
-      decl->Declaration.File,
-      decl->Range.First,
-      decl->Range.Last );
+   ENM(decl->Declaration.File, file_names);
+
+   /* all geometry shader inputs are two dimensional */
+   if (decl->Declaration.File == TGSI_FILE_INPUT &&
+       iter->processor.Processor == TGSI_PROCESSOR_GEOMETRY) {
+      TXT("[]");
+   }
+
+   if (decl->Declaration.Dimension) {
+      CHR('[');
+      SID(decl->Dim.Index2D);
+      CHR(']');
+   }
+
+   CHR('[');
+   SID(decl->Range.First);
+   if (decl->Range.First != decl->Range.Last) {
+      TXT("..");
+      SID(decl->Range.Last);
+   }
+   CHR(']');
+
    _dump_writemask(
       ctx,
       decl->Declaration.UsageMask );
@@ -256,6 +342,22 @@ iter_declaration(
       TXT( ", INVARIANT" );
    }
 
+   if (decl->Declaration.CylindricalWrap) {
+      TXT(", CYLWRAP_");
+      if (decl->Declaration.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_X) {
+         CHR('X');
+      }
+      if (decl->Declaration.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_Y) {
+         CHR('Y');
+      }
+      if (decl->Declaration.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_Z) {
+         CHR('Z');
+      }
+      if (decl->Declaration.CylindricalWrap & TGSI_CYLINDRICAL_WRAP_W) {
+         CHR('W');
+      }
+   }
+
    EOL();
 
    return TRUE;
@@ -273,6 +375,56 @@ tgsi_dump_declaration(
 }
 
 static boolean
+iter_property(
+   struct tgsi_iterate_context *iter,
+   struct tgsi_full_property *prop )
+{
+   /* Dump "PROPERTY <name> <v0>, <v1>, ..." and a newline; returns TRUE. */
+   struct dump_ctx *ctx = (struct dump_ctx *)iter;
+   const int num_values = prop->Property.NrTokens - 1;
+   int i;
+
+   assert(Elements(property_names) == TGSI_PROPERTY_COUNT);
+   TXT( "PROPERTY " );
+   ENM(prop->Property.PropertyName, property_names);
+   if (num_values > 0)
+      TXT(" ");
+
+   for (i = 0; i < num_values; ++i) {
+      switch (prop->Property.PropertyName) {
+      case TGSI_PROPERTY_GS_INPUT_PRIM:
+      case TGSI_PROPERTY_GS_OUTPUT_PRIM:
+         ENM(prop->u[i].Data, primitive_names);
+         break;
+      case TGSI_PROPERTY_FS_COORD_ORIGIN:
+         ENM(prop->u[i].Data, fs_coord_origin_names);
+         break;
+      case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER:
+         ENM(prop->u[i].Data, fs_coord_pixel_center_names);
+         break;
+      default:
+         /* Data is assigned from an unsigned, so print it as one. */
+         UID( prop->u[i].Data );
+         break;
+      }
+      if (i + 1 < num_values)
+         TXT( ", " );
+   }
+   EOL();
+   return TRUE;
+}
+
+/* Dump one property token, printing through dump_ctx_printf. */
+void tgsi_dump_property(
+   const struct tgsi_full_property *prop )
+{
+   struct dump_ctx ctx;
+
+   ctx.printf = dump_ctx_printf;
+   iter_property( &ctx.iter, (struct tgsi_full_property *)prop );
+}
+
+static boolean
 iter_immediate(
    struct tgsi_iterate_context *iter,
    struct tgsi_full_immediate *imm )
@@ -292,6 +444,12 @@ iter_immediate(
       case TGSI_IMM_FLOAT32:
          FLT( imm->u[i].Float );
          break;
+      case TGSI_IMM_UINT32:
+         UID(imm->u[i].Uint);
+         break;
+      case TGSI_IMM_INT32:
+         SID(imm->u[i].Int);
+         break;
       default:
          assert( 0 );
       }
@@ -368,10 +526,9 @@ iter_instruction(
             dst->Indirect.SwizzleX );
       }
       else {
-         _dump_register(
+         _dump_register_dst(
             ctx,
             dst->Register.File,
-            dst->Register.Index,
             dst->Register.Index );
       }
       _dump_writemask( ctx, dst->Register.WriteMask );
@@ -387,26 +544,11 @@ iter_instruction(
       CHR( ' ' );
 
       if (src->Register.Negate)
-         TXT( "-(" );
+         CHR( '-' );
       if (src->Register.Absolute)
          CHR( '|' );
 
-      if (src->Register.Indirect) {
-         _dump_register_ind(
-            ctx,
-            src->Register.File,
-            src->Register.Index,
-            src->Indirect.File,
-            src->Indirect.Index,
-            src->Indirect.SwizzleX );
-      }
-      else {
-         _dump_register(
-            ctx,
-            src->Register.File,
-            src->Register.Index,
-            src->Register.Index );
-      }
+      _dump_register_src(ctx, src);
 
       if (src->Register.SwizzleX != TGSI_SWIZZLE_X ||
           src->Register.SwizzleY != TGSI_SWIZZLE_Y ||
@@ -421,8 +563,6 @@ iter_instruction(
 
       if (src->Register.Absolute)
          CHR( '|' );
-      if (src->Register.Negate)
-         CHR( ')' );
 
       first_reg = FALSE;
    }
@@ -492,6 +632,7 @@ tgsi_dump(
    ctx.iter.iterate_instruction = iter_instruction;
    ctx.iter.iterate_declaration = iter_declaration;
    ctx.iter.iterate_immediate = iter_immediate;
+   ctx.iter.iterate_property = iter_property;
    ctx.iter.epilog = NULL;
 
    ctx.instno = 0;
@@ -546,6 +687,7 @@ tgsi_dump_str(
    ctx.base.iter.iterate_instruction = iter_instruction;
    ctx.base.iter.iterate_declaration = iter_declaration;
    ctx.base.iter.iterate_immediate = iter_immediate;
+   ctx.base.iter.iterate_property = iter_property;
    ctx.base.iter.epilog = NULL;
 
    ctx.base.instno = 0;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h
index ad1e647ec90..4cd27317b36 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h
@@ -49,6 +49,7 @@ tgsi_dump(
 struct tgsi_full_immediate;
 struct tgsi_full_instruction;
 struct tgsi_full_declaration;
+struct tgsi_full_property;
 
 void
 tgsi_dump_immediate(
@@ -63,6 +64,10 @@ void
 tgsi_dump_declaration(
    const struct tgsi_full_declaration *decl );
 
+void
+tgsi_dump_property(
+   const struct tgsi_full_property *prop );
+
 #if defined __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
deleted file mode 100644
index 47fd1dd590e..00000000000
--- a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c
+++ /dev/null
@@ -1,462 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "util/u_debug.h"
-#include "util/u_string.h"
-#include "tgsi_dump_c.h"
-#include "tgsi_build.h"
-#include "tgsi_info.h"
-#include "tgsi_parse.h"
-
-static void
-dump_enum(
-   const unsigned    e,
-   const char        **enums,
-   const unsigned    enums_count )
-{
-   if (e >= enums_count) {
-      debug_printf( "%u", e );
-   }
-   else {
-      debug_printf( "%s", enums[e] );
-   }
-}
-
-#define EOL()           debug_printf( "\n" )
-#define TXT(S)          debug_printf( "%s", S )
-#define CHR(C)          debug_printf( "%c", C )
-#define UIX(I)          debug_printf( "0x%x", I )
-#define UID(I)          debug_printf( "%u", I )
-#define SID(I)          debug_printf( "%d", I )
-#define FLT(F)          debug_printf( "%10.4f", F )
-#define ENM(E,ENUMS)    dump_enum( E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
-
-static const char *TGSI_PROCESSOR_TYPES[] =
-{
-   "PROCESSOR_FRAGMENT",
-   "PROCESSOR_VERTEX",
-   "PROCESSOR_GEOMETRY"
-};
-
-static const char *TGSI_TOKEN_TYPES[] =
-{
-   "TOKEN_TYPE_DECLARATION",
-   "TOKEN_TYPE_IMMEDIATE",
-   "TOKEN_TYPE_INSTRUCTION"
-};
-
-static const char *TGSI_FILES[TGSI_FILE_COUNT] =
-{
-   "FILE_NULL",
-   "FILE_CONSTANT",
-   "FILE_INPUT",
-   "FILE_OUTPUT",
-   "FILE_TEMPORARY",
-   "FILE_SAMPLER",
-   "FILE_ADDRESS",
-   "FILE_IMMEDIATE",
-   "FILE_LOOP",
-   "FILE_PREDICATE"
-};
-
-static const char *TGSI_INTERPOLATES[] =
-{
-   "INTERPOLATE_CONSTANT",
-   "INTERPOLATE_LINEAR",
-   "INTERPOLATE_PERSPECTIVE"
-};
-
-static const char *TGSI_SEMANTICS[] =
-{
-   "SEMANTIC_POSITION",
-   "SEMANTIC_COLOR",
-   "SEMANTIC_BCOLOR",
-   "SEMANTIC_FOG",
-   "SEMANTIC_PSIZE",
-   "SEMANTIC_GENERIC",
-   "SEMANTIC_NORMAL"
-};
-
-static const char *TGSI_IMMS[] =
-{
-   "IMM_FLOAT32"
-};
-
-static const char *TGSI_SATS[] =
-{
-   "SAT_NONE",
-   "SAT_ZERO_ONE",
-   "SAT_MINUS_PLUS_ONE"
-};
-
-static const char *TGSI_SWIZZLES[] =
-{
-   "SWIZZLE_X",
-   "SWIZZLE_Y",
-   "SWIZZLE_Z",
-   "SWIZZLE_W"
-};
-
-static const char *TGSI_TEXTURES[] =
-{
-   "TEXTURE_UNKNOWN",
-   "TEXTURE_1D",
-   "TEXTURE_2D",
-   "TEXTURE_3D",
-   "TEXTURE_CUBE",
-   "TEXTURE_RECT",
-   "TEXTURE_SHADOW1D",
-   "TEXTURE_SHADOW2D",
-   "TEXTURE_SHADOWRECT"
-};
-
-static const char *TGSI_WRITEMASKS[] =
-{
-   "0",
-   "WRITEMASK_X",
-   "WRITEMASK_Y",
-   "WRITEMASK_XY",
-   "WRITEMASK_Z",
-   "WRITEMASK_XZ",
-   "WRITEMASK_YZ",
-   "WRITEMASK_XYZ",
-   "WRITEMASK_W",
-   "WRITEMASK_XW",
-   "WRITEMASK_YW",
-   "WRITEMASK_XYW",
-   "WRITEMASK_ZW",
-   "WRITEMASK_XZW",
-   "WRITEMASK_YZW",
-   "WRITEMASK_XYZW"
-};
-
-static void
-dump_declaration_verbose(
-   struct tgsi_full_declaration  *decl,
-   unsigned                      ignored,
-   unsigned                      deflt,
-   struct tgsi_full_declaration  *fd )
-{
-   TXT( "\nFile       : " );
-   ENM( decl->Declaration.File, TGSI_FILES );
-   if( deflt || fd->Declaration.UsageMask != decl->Declaration.UsageMask ) {
-      TXT( "\nUsageMask  : " );
-      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_X ) {
-         CHR( 'X' );
-      }
-      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Y ) {
-         CHR( 'Y' );
-      }
-      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Z ) {
-         CHR( 'Z' );
-      }
-      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_W ) {
-         CHR( 'W' );
-      }
-   }
-   if( deflt || fd->Declaration.Interpolate != decl->Declaration.Interpolate ) {
-      TXT( "\nInterpolate: " );
-      ENM( decl->Declaration.Interpolate, TGSI_INTERPOLATES );
-   }
-   if( deflt || fd->Declaration.Semantic != decl->Declaration.Semantic ) {
-      TXT( "\nSemantic   : " );
-      UID( decl->Declaration.Semantic );
-   }
-   if (deflt || fd->Declaration.Centroid != decl->Declaration.Centroid) {
-      TXT("\nCentroid   : ");
-      UID(decl->Declaration.Centroid);
-   }
-   if (deflt || fd->Declaration.Invariant != decl->Declaration.Invariant) {
-      TXT("\nInvariant  : ");
-      UID(decl->Declaration.Invariant);
-   }
-   if( ignored ) {
-      TXT( "\nPadding    : " );
-      UIX( decl->Declaration.Padding );
-   }
-
-   EOL();
-   TXT( "\nFirst: " );
-   UID( decl->Range.First );
-   TXT( "\nLast : " );
-   UID( decl->Range.Last );
-
-   if( decl->Declaration.Semantic ) {
-      EOL();
-      TXT( "\nName : " );
-      ENM( decl->Semantic.Name, TGSI_SEMANTICS );
-      TXT( "\nIndex: " );
-      UID( decl->Semantic.Index );
-      if( ignored ) {
-         TXT( "\nPadding      : " );
-         UIX( decl->Semantic.Padding );
-      }
-   }
-}
-
-static void
-dump_immediate_verbose(
-   struct tgsi_full_immediate *imm,
-   unsigned                   ignored )
-{
-   unsigned i;
-
-   TXT( "\nDataType   : " );
-   ENM( imm->Immediate.DataType, TGSI_IMMS );
-   if( ignored ) {
-      TXT( "\nPadding    : " );
-      UIX( imm->Immediate.Padding );
-   }
-
-   assert( imm->Immediate.NrTokens <= 4 + 1 );
-   for( i = 0; i < imm->Immediate.NrTokens - 1; i++ ) {
-      EOL();
-      switch( imm->Immediate.DataType ) {
-      case TGSI_IMM_FLOAT32:
-         TXT( "\nFloat: " );
-         FLT( imm->u[i].Float );
-         break;
-
-      default:
-         assert( 0 );
-      }
-   }
-}
-
-static void
-dump_instruction_verbose(
-   struct tgsi_full_instruction  *inst,
-   unsigned                      ignored,
-   unsigned                      deflt,
-   struct tgsi_full_instruction  *fi )
-{
-   unsigned i;
-
-   TXT( "\nOpcode     : OPCODE_" );
-   TXT( tgsi_get_opcode_info( inst->Instruction.Opcode )->mnemonic );
-   if( deflt || fi->Instruction.Saturate != inst->Instruction.Saturate ) {
-      TXT( "\nSaturate   : " );
-      ENM( inst->Instruction.Saturate, TGSI_SATS );
-   }
-   if( deflt || fi->Instruction.NumDstRegs != inst->Instruction.NumDstRegs ) {
-      TXT( "\nNumDstRegs : " );
-      UID( inst->Instruction.NumDstRegs );
-   }
-   if( deflt || fi->Instruction.NumSrcRegs != inst->Instruction.NumSrcRegs ) {
-      TXT( "\nNumSrcRegs : " );
-      UID( inst->Instruction.NumSrcRegs );
-   }
-   if (deflt || fi->Instruction.Predicate != inst->Instruction.Predicate) {
-      TXT("\nPredicate  : ");
-      UID(inst->Instruction.Predicate);
-   }
-   if (deflt || fi->Instruction.Label != inst->Instruction.Label) {
-      TXT("\nLabel      : ");
-      UID(inst->Instruction.Label);
-   }
-   if (deflt || fi->Instruction.Texture != inst->Instruction.Texture) {
-      TXT("\nTexture    : ");
-      UID(inst->Instruction.Texture);
-   }
-   if( ignored ) {
-      TXT( "\nPadding    : " );
-      UIX( inst->Instruction.Padding );
-   }
-
-   if (deflt || inst->Instruction.Label) {
-      EOL();
-      if (deflt || fi->Label.Label != inst->Label.Label) {
-         TXT( "\nLabel   : " );
-         UID(inst->Label.Label);
-      }
-      if( ignored ) {
-         TXT( "\nPadding : " );
-         UIX(inst->Label.Padding);
-      }
-   }
-
-   if (deflt || inst->Instruction.Texture) {
-      EOL();
-      if (deflt || fi->Texture.Texture != inst->Texture.Texture) {
-         TXT( "\nTexture : " );
-         ENM(inst->Texture.Texture, TGSI_TEXTURES);
-      }
-      if( ignored ) {
-         TXT( "\nPadding : " );
-         UIX(inst->Texture.Padding);
-      }
-   }
-
-   for( i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
-      struct tgsi_full_dst_register *dst = &inst->Dst[i];
-      struct tgsi_full_dst_register *fd = &fi->Dst[i];
-
-      EOL();
-      TXT( "\nFile     : " );
-      ENM( dst->Register.File, TGSI_FILES );
-      if( deflt || fd->Register.WriteMask != dst->Register.WriteMask ) {
-         TXT( "\nWriteMask: " );
-         ENM( dst->Register.WriteMask, TGSI_WRITEMASKS );
-      }
-      if( ignored ) {
-         if( deflt || fd->Register.Indirect != dst->Register.Indirect ) {
-            TXT( "\nIndirect : " );
-            UID( dst->Register.Indirect );
-         }
-         if( deflt || fd->Register.Dimension != dst->Register.Dimension ) {
-            TXT( "\nDimension: " );
-            UID( dst->Register.Dimension );
-         }
-      }
-      if( deflt || fd->Register.Index != dst->Register.Index ) {
-         TXT( "\nIndex    : " );
-         SID( dst->Register.Index );
-      }
-      if( ignored ) {
-         TXT( "\nPadding  : " );
-         UIX( dst->Register.Padding );
-      }
-   }
-
-   for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
-      struct tgsi_full_src_register *src = &inst->Src[i];
-      struct tgsi_full_src_register *fs = &fi->Src[i];
-
-      EOL();
-      TXT( "\nFile     : ");
-      ENM( src->Register.File, TGSI_FILES );
-      if( deflt || fs->Register.SwizzleX != src->Register.SwizzleX ) {
-         TXT( "\nSwizzleX : " );
-         ENM( src->Register.SwizzleX, TGSI_SWIZZLES );
-      }
-      if( deflt || fs->Register.SwizzleY != src->Register.SwizzleY ) {
-         TXT( "\nSwizzleY : " );
-         ENM( src->Register.SwizzleY, TGSI_SWIZZLES );
-      }
-      if( deflt || fs->Register.SwizzleZ != src->Register.SwizzleZ ) {
-         TXT( "\nSwizzleZ : " );
-         ENM( src->Register.SwizzleZ, TGSI_SWIZZLES );
-      }
-      if( deflt || fs->Register.SwizzleW != src->Register.SwizzleW ) {
-         TXT( "\nSwizzleW : " );
-         ENM( src->Register.SwizzleW, TGSI_SWIZZLES );
-      }
-      if (deflt || fs->Register.Absolute != src->Register.Absolute) {
-         TXT("\nAbsolute : ");
-         UID(src->Register.Absolute);
-      }
-      if( deflt || fs->Register.Negate != src->Register.Negate ) {
-         TXT( "\nNegate   : " );
-         UID( src->Register.Negate );
-      }
-      if( ignored ) {
-         if( deflt || fs->Register.Indirect != src->Register.Indirect ) {
-            TXT( "\nIndirect : " );
-            UID( src->Register.Indirect );
-         }
-         if( deflt || fs->Register.Dimension != src->Register.Dimension ) {
-            TXT( "\nDimension: " );
-            UID( src->Register.Dimension );
-         }
-      }
-      if( deflt || fs->Register.Index != src->Register.Index ) {
-         TXT( "\nIndex    : " );
-         SID( src->Register.Index );
-      }
-   }
-}
-
-void
-tgsi_dump_c(
-   const struct tgsi_token *tokens,
-   uint flags )
-{
-   struct tgsi_parse_context parse;
-   struct tgsi_full_instruction fi;
-   struct tgsi_full_declaration fd;
-   uint ignored = flags & TGSI_DUMP_C_IGNORED;
-   uint deflt = flags & TGSI_DUMP_C_DEFAULT;
-
-   tgsi_parse_init( &parse, tokens );
-
-   TXT( "tgsi-dump begin -----------------" );
-
-   TXT( "\nHeaderSize: " );
-   UID( parse.FullHeader.Header.HeaderSize );
-   TXT( "\nBodySize  : " );
-   UID( parse.FullHeader.Header.BodySize );
-   TXT( "\nProcessor : " );
-   ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES );
-   EOL();
-
-   fi = tgsi_default_full_instruction();
-   fd = tgsi_default_full_declaration();
-
-   while( !tgsi_parse_end_of_tokens( &parse ) ) {
-      tgsi_parse_token( &parse );
-
-      TXT( "\nType       : " );
-      ENM( parse.FullToken.Token.Type, TGSI_TOKEN_TYPES );
-      if( ignored ) {
-         TXT( "\nSize       : " );
-         UID( parse.FullToken.Token.NrTokens );
-      }
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         dump_declaration_verbose(
-            &parse.FullToken.FullDeclaration,
-            ignored,
-            deflt,
-            &fd );
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         dump_immediate_verbose(
-            &parse.FullToken.FullImmediate,
-            ignored );
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         dump_instruction_verbose(
-            &parse.FullToken.FullInstruction,
-            ignored,
-            deflt,
-            &fi );
-         break;
-
-      default:
-         assert( 0 );
-      }
-
-      EOL();
-   }
-
-   TXT( "\ntgsi-dump end -------------------\n" );
-
-   tgsi_parse_free( &parse );
-}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 22984c32320..f853ea2820e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -60,6 +61,7 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
+
 #define FAST_MATH 1
 
 #define TILE_TOP_LEFT     0
@@ -67,11 +69,387 @@
 #define TILE_BOTTOM_LEFT  2
 #define TILE_BOTTOM_RIGHT 3
 
+static void
+micro_abs(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Absolute value of each of the four float channels. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = fabsf(src->f[i]);
+}
+
+static void
+micro_arl(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Floor each float channel and store the result in the integer view. */
+   for (i = 0; i < 4; i++)
+      dst->i[i] = (int)floorf(src->f[i]);
+}
+
+static void
+micro_arr(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* floor(x + 0.5) of each float channel, stored in the integer view. */
+   for (i = 0; i < 4; i++)
+      dst->i[i] = (int)floorf(src->f[i] + 0.5f);
+}
+
+static void
+micro_ceil(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Round each of the four float channels up with ceilf(). */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = ceilf(src->f[i]);
+}
+
+static void
+micro_clamp(union tgsi_exec_channel *dst,
+            const union tgsi_exec_channel *src0,
+            const union tgsi_exec_channel *src1,
+            const union tgsi_exec_channel *src2)
+{
+   unsigned i;
+   /* Per channel: src1 if src0 is below it, else src2 if src0 is above it, else src0. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] < src1->f[i] ? src1->f[i] : src0->f[i] > src2->f[i] ? src2->f[i] : src0->f[i];
+}
+
+static void
+micro_cmp(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1,
+          const union tgsi_exec_channel *src2)
+{
+   unsigned i;
+   /* Per channel: pick src1 where src0 is negative, src2 otherwise. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] < 0.0f ? src1->f[i] : src2->f[i];
+}
+
+static void
+micro_cnd(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1,
+          const union tgsi_exec_channel *src2)
+{
+   unsigned i;
+   /* Per channel: pick src0 where src2 exceeds 0.5, src1 otherwise. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src2->f[i] > 0.5f ? src0->f[i] : src1->f[i];
+}
+
+static void
+micro_cos(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Cosine of each of the four float channels. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = cosf(src->f[i]);
+}
+
+static void
+micro_ddx(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   /* Bottom-right minus bottom-left, replicated into all four slots. */
+   const float delta = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
+
+   dst->f[0] = dst->f[1] = dst->f[2] = dst->f[3] = delta;
+}
+
+static void
+micro_ddy(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   /* Bottom-left minus top-left, replicated into all four slots. */
+   const float delta = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
+
+   dst->f[0] = dst->f[1] = dst->f[2] = dst->f[3] = delta;
+}
+
+static void
+micro_exp2(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{  /* dst = 2 raised to src, per channel; FAST_MATH selects the implementation */
+#if FAST_MATH
+   dst->f[0] = util_fast_exp2(src->f[0]);
+   dst->f[1] = util_fast_exp2(src->f[1]);
+   dst->f[2] = util_fast_exp2(src->f[2]);
+   dst->f[3] = util_fast_exp2(src->f[3]);
+#else
+#if DEBUG
+   /* Inf is okay for this instruction, so clamp it to silence assertions. */
+   uint i;
+   union tgsi_exec_channel clamped;
+
+   for (i = 0; i < 4; i++) {  /* limit each exponent to [-126.99999, 127.99999] */
+      if (src->f[i] > 127.99999f) {
+         clamped.f[i] = 127.99999f;
+      } else if (src->f[i] < -126.99999f) {
+         clamped.f[i] = -126.99999f;
+      } else {
+         clamped.f[i] = src->f[i];
+      }
+   }
+   src = &clamped;  /* the powf() calls below now read the clamped copy */
+#endif /* DEBUG */
+
+   dst->f[0] = powf(2.0f, src->f[0]);
+   dst->f[1] = powf(2.0f, src->f[1]);
+   dst->f[2] = powf(2.0f, src->f[2]);
+   dst->f[3] = powf(2.0f, src->f[3]);
+#endif /* FAST_MATH */
+}
+
+static void
+micro_flr(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Round each of the four float channels down with floorf(). */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = floorf(src->f[i]);
+}
+
+static void
+micro_frc(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Fractional part of each channel, computed as x - floor(x). */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src->f[i] - floorf(src->f[i]);
+}
+
+static void
+micro_iabs(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Integer |x| per channel; negated as unsigned so INT_MIN wraps, not UB. */
+   for (i = 0; i < 4; i++)
+      dst->u[i] = src->i[i] < 0 ? 0u - src->u[i] : src->u[i];
+}
+
+static void
+micro_ineg(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Integer negation per channel; done as unsigned so INT_MIN wraps, not UB. */
+   for (i = 0; i < 4; i++)
+      dst->u[i] = 0u - src->u[i];
+}
+
+static void
+micro_lg2(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+
+   /* Base-2 logarithm of each of the four float channels. */
+   for (i = 0; i < 4; i++) {
+#if FAST_MATH
+      dst->f[i] = util_fast_log2(src->f[i]);
+#else
+      /* Natural log scaled by 1.442695f, i.e. approximately 1 / ln(2). */
+      dst->f[i] = logf(src->f[i]) * 1.442695f;
+#endif
+   }
+}
+
+static void
+micro_lrp(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1,
+          const union tgsi_exec_channel *src2)
+{
+   unsigned i;
+   /* Per channel: src0 * (src1 - src2) + src2, i.e. src2 at weight 0, src1 at 1. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] * (src1->f[i] - src2->f[i]) + src2->f[i];
+}
+
+static void
+micro_mad(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1,
+          const union tgsi_exec_channel *src2)
+{
+   unsigned i;
+   /* Multiply-add per channel: src0 * src1 + src2. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] * src1->f[i] + src2->f[i];
+}
+
+static void
+micro_mov(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Copy all four slots through the unsigned view, with no float conversion. */
+   for (i = 0; i < 4; i++)
+      dst->u[i] = src->u[i];
+}
+
+static void
+micro_rcp(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+
+   /* Reciprocal of each channel.  A zero input is not special-cased; the
+    * disabled assert below can be switched on to catch one. */
+   for (i = 0; i < 4; i++) {
+#if 0 /* for debugging */
+      assert(src->f[i] != 0.0f);
+#endif
+      dst->f[i] = 1.0f / src->f[i];
+   }
+}
+
+static void
+micro_rnd(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Round each channel as floor(x + 0.5), keeping the result as float. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = floorf(src->f[i] + 0.5f);
+}
+
+static void
+micro_rsq(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+
+   /* Reciprocal square root of |src| per channel.  Zero is not special-cased;
+    * the disabled assert below can be switched on to catch it. */
+   for (i = 0; i < 4; i++) {
+#if 0 /* for debugging */
+      assert(src->f[i] != 0.0f);
+#endif
+      dst->f[i] = 1.0f / sqrtf(fabsf(src->f[i]));
+   }
+}
+
+static void
+micro_seq(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
+{
+   unsigned i;
+   /* Per channel: 1.0 where src0 == src1, 0.0 otherwise. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] == src1->f[i] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sge(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
+{
+   unsigned i;
+   /* Per channel: 1.0 where src0 >= src1, 0.0 otherwise. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] >= src1->f[i] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sgn(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Per channel: -1.0 if negative, 1.0 if positive, 0.0 in every other case. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src->f[i] < 0.0f ? -1.0f : src->f[i] > 0.0f ? 1.0f : 0.0f;
+}
+
+static void
+micro_sgt(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
+{
+   unsigned i;
+   /* Per channel: 1.0 where src0 > src1, 0.0 otherwise. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] > src1->f[i] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sin(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Sine of each of the four float channels. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = sinf(src->f[i]);
+}
+
+static void
+micro_sle(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
+{
+   unsigned i;
+   /* Per channel: 1.0 where src0 <= src1, 0.0 otherwise. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] <= src1->f[i] ? 1.0f : 0.0f;
+}
+
+static void
+micro_slt(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
+{
+   unsigned i;
+   /* Per channel: 1.0 where src0 < src1, 0.0 otherwise. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] < src1->f[i] ? 1.0f : 0.0f;
+}
+
+static void
+micro_sne(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
+{
+   unsigned i;
+   /* Per channel: 1.0 where src0 != src1, 0.0 otherwise. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = src0->f[i] != src1->f[i] ? 1.0f : 0.0f;
+}
+
+static void
+micro_trunc(union tgsi_exec_channel *dst,
+            const union tgsi_exec_channel *src)
+{
+   unsigned i;
+   /* Drop the fraction of each channel by a round trip through int. */
+   for (i = 0; i < 4; i++)
+      dst->f[i] = (float)(int)src->f[i];
+}
+
+
 #define CHAN_X  0
 #define CHAN_Y  1
 #define CHAN_Z  2
 #define CHAN_W  3
 
+enum tgsi_exec_datatype {   /* NOTE(review): presumably tags how channel data is interpreted (float/int/uint); users not visible here - confirm */
+   TGSI_EXEC_DATA_FLOAT,
+   TGSI_EXEC_DATA_INT,
+   TGSI_EXEC_DATA_UINT
+};
+
 /*
  * Shorthand locations of various utility registers (_I = Index, _C = Channel)
  */
@@ -123,23 +501,31 @@
 
 /** The execution mask depends on the conditional mask and the loop mask */
 #define UPDATE_EXEC_MASK(MACH) \
-      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->Switch.mask & MACH->FuncMask
 
 
 static const union tgsi_exec_channel ZeroVec =
    { { 0.0, 0.0, 0.0, 0.0 } };
 
+static const union tgsi_exec_channel OneVec = {   /* initializes the union's first member to 1.0f in all four slots */
+   {1.0f, 1.0f, 1.0f, 1.0f}
+};
 
-#ifdef DEBUG
-static void
+
+/**
+ * Assert that none of the float values in 'chan' are infinite or NaN.
+ * NaN and Inf may occur normally during program execution and should
+ * not lead to crashes, etc.  But when debugging, it's helpful to catch
+ * them.
+ */
+static INLINE void
 check_inf_or_nan(const union tgsi_exec_channel *chan)
 {
-   assert(!util_is_inf_or_nan(chan->f[0]));
-   assert(!util_is_inf_or_nan(chan->f[1]));
-   assert(!util_is_inf_or_nan(chan->f[2]));
-   assert(!util_is_inf_or_nan(chan->f[3]));
+   assert(!util_is_inf_or_nan((chan)->f[0]));
+   assert(!util_is_inf_or_nan((chan)->f[1]));
+   assert(!util_is_inf_or_nan((chan)->f[2]));
+   assert(!util_is_inf_or_nan((chan)->f[3]));
 }
-#endif
 
 
 #ifdef DEBUG
@@ -292,6 +678,14 @@ tgsi_exec_machine_bind_shader(
                                    * sizeof(struct tgsi_full_declaration));
             maxDeclarations += 10;
          }
+         if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_OUTPUT) {
+            unsigned reg;
+            for (reg = parse.FullToken.FullDeclaration.Range.First;
+                 reg <= parse.FullToken.FullDeclaration.Range.Last;
+                 ++reg) {
+               ++mach->NumOutputs;
+            }
+         }
          memcpy(declarations + numDeclarations,
                 &parse.FullToken.FullDeclaration,
                 sizeof(declarations[0]));
@@ -336,6 +730,9 @@ tgsi_exec_machine_bind_shader(
          numInstructions++;
          break;
 
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         break;
+
       default:
          assert( 0 );
       }
@@ -369,6 +766,7 @@ tgsi_exec_machine_create( void )
    memset(mach, 0, sizeof(*mach));
 
    mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR];
+   mach->MaxGeometryShaderOutputs = TGSI_MAX_TOTAL_VERTICES;
    mach->Predicates = &mach->Temps[TGSI_EXEC_TEMP_P0];
 
    /* Setup constants. */
@@ -410,23 +808,10 @@ tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach)
    align_free(mach);
 }
 
-
 static void
-micro_abs(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = fabsf( src->f[0] );
-   dst->f[1] = fabsf( src->f[1] );
-   dst->f[2] = fabsf( src->f[2] );
-   dst->f[3] = fabsf( src->f[3] );
-}
-
-static void
-micro_add(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
+micro_add(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
 {
    dst->f[0] = src0->f[0] + src1->f[0];
    dst->f[1] = src0->f[1] + src1->f[1];
@@ -434,76 +819,6 @@ micro_add(
    dst->f[3] = src0->f[3] + src1->f[3];
 }
 
-#if 0
-static void
-micro_iadd(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] + src1->i[0];
-   dst->i[1] = src0->i[1] + src1->i[1];
-   dst->i[2] = src0->i[2] + src1->i[2];
-   dst->i[3] = src0->i[3] + src1->i[3];
-}
-#endif
-
-static void
-micro_and(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] & src1->u[0];
-   dst->u[1] = src0->u[1] & src1->u[1];
-   dst->u[2] = src0->u[2] & src1->u[2];
-   dst->u[3] = src0->u[3] & src1->u[3];
-}
-
-static void
-micro_ceil(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = ceilf( src->f[0] );
-   dst->f[1] = ceilf( src->f[1] );
-   dst->f[2] = ceilf( src->f[2] );
-   dst->f[3] = ceilf( src->f[3] );
-}
-
-static void
-micro_cos(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = cosf( src->f[0] );
-   dst->f[1] = cosf( src->f[1] );
-   dst->f[2] = cosf( src->f[2] );
-   dst->f[3] = cosf( src->f[3] );
-}
-
-static void
-micro_ddx(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
-}
-
-static void
-micro_ddy(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] =
-   dst->f[1] =
-   dst->f[2] =
-   dst->f[3] = src->f[TILE_BOTTOM_LEFT] - src->f[TILE_TOP_LEFT];
-}
-
 static void
 micro_div(
    union tgsi_exec_channel *dst,
@@ -524,99 +839,6 @@ micro_div(
    }
 }
 
-#if 0
-static void
-micro_udiv(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] / src1->u[0];
-   dst->u[1] = src0->u[1] / src1->u[1];
-   dst->u[2] = src0->u[2] / src1->u[2];
-   dst->u[3] = src0->u[3] / src1->u[3];
-}
-#endif
-
-static void
-micro_eq(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-#if 0
-static void
-micro_ieq(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
-}
-#endif
-
-static void
-micro_exp2(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src)
-{
-#if FAST_MATH
-   dst->f[0] = util_fast_exp2( src->f[0] );
-   dst->f[1] = util_fast_exp2( src->f[1] );
-   dst->f[2] = util_fast_exp2( src->f[2] );
-   dst->f[3] = util_fast_exp2( src->f[3] );
-#else
-
-#if DEBUG
-   /* Inf is okay for this instruction, so clamp it to silence assertions. */
-   uint i;
-   union tgsi_exec_channel clamped;
-
-   for (i = 0; i < 4; i++) {
-      if (src->f[i] > 127.99999f) {
-         clamped.f[i] = 127.99999f;
-      } else if (src->f[i] < -126.99999f) {
-         clamped.f[i] = -126.99999f;
-      } else {
-         clamped.f[i] = src->f[i];
-      }
-   }
-   src = &clamped;
-#endif
-
-   dst->f[0] = powf( 2.0f, src->f[0] );
-   dst->f[1] = powf( 2.0f, src->f[1] );
-   dst->f[2] = powf( 2.0f, src->f[2] );
-   dst->f[3] = powf( 2.0f, src->f[3] );
-#endif
-}
-
-#if 0
-static void
-micro_f2ut(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->u[0] = (uint) src->f[0];
-   dst->u[1] = (uint) src->f[1];
-   dst->u[2] = (uint) src->f[2];
-   dst->u[3] = (uint) src->f[3];
-}
-#endif
-
 static void
 micro_float_clamp(union tgsi_exec_channel *dst,
                   const union tgsi_exec_channel *src)
@@ -644,71 +866,6 @@ micro_float_clamp(union tgsi_exec_channel *dst,
 }
 
 static void
-micro_flr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = floorf( src->f[0] );
-   dst->f[1] = floorf( src->f[1] );
-   dst->f[2] = floorf( src->f[2] );
-   dst->f[3] = floorf( src->f[3] );
-}
-
-static void
-micro_frc(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = src->f[0] - floorf( src->f[0] );
-   dst->f[1] = src->f[1] - floorf( src->f[1] );
-   dst->f[2] = src->f[2] - floorf( src->f[2] );
-   dst->f[3] = src->f[3] - floorf( src->f[3] );
-}
-
-static void
-micro_i2f(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = (float) src->i[0];
-   dst->f[1] = (float) src->i[1];
-   dst->f[2] = (float) src->i[2];
-   dst->f[3] = (float) src->i[3];
-}
-
-static void
-micro_lg2(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-#if FAST_MATH
-   dst->f[0] = util_fast_log2( src->f[0] );
-   dst->f[1] = util_fast_log2( src->f[1] );
-   dst->f[2] = util_fast_log2( src->f[2] );
-   dst->f[3] = util_fast_log2( src->f[3] );
-#else
-   dst->f[0] = logf( src->f[0] ) * 1.442695f;
-   dst->f[1] = logf( src->f[1] ) * 1.442695f;
-   dst->f[2] = logf( src->f[2] ) * 1.442695f;
-   dst->f[3] = logf( src->f[3] ) * 1.442695f;
-#endif
-}
-
-static void
-micro_le(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0];
-   dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1];
-   dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2];
-   dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3];
-}
-
-static void
 micro_lt(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src0,
@@ -722,43 +879,10 @@ micro_lt(
    dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
 }
 
-#if 0
-static void
-micro_ilt(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
-}
-#endif
-
-#if 0
-static void
-micro_ult(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1,
-   const union tgsi_exec_channel *src2,
-   const union tgsi_exec_channel *src3 )
-{
-   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
-}
-#endif
-
 static void
-micro_max(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
+micro_max(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
 {
    dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
    dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
@@ -766,39 +890,10 @@ micro_max(
    dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
 }
 
-#if 0
 static void
-micro_imax(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
-}
-#endif
-
-#if 0
-static void
-micro_umax(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
-}
-#endif
-
-static void
-micro_min(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
+micro_min(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
 {
    dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
    dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
@@ -806,53 +901,10 @@ micro_min(
    dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
 }
 
-#if 0
-static void
-micro_imin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
-   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
-   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
-   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
-}
-#endif
-
-#if 0
 static void
-micro_umin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
-   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
-   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
-   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
-}
-#endif
-
-#if 0
-static void
-micro_umod(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] % src1->u[0];
-   dst->u[1] = src0->u[1] % src1->u[1];
-   dst->u[2] = src0->u[2] % src1->u[2];
-   dst->u[3] = src0->u[3] % src1->u[3];
-}
-#endif
-
-static void
-micro_mul(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
+micro_mul(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
 {
    dst->f[0] = src0->f[0] * src1->f[0];
    dst->f[1] = src0->f[1] * src1->f[1];
@@ -862,20 +914,6 @@ micro_mul(
 
 #if 0
 static void
-micro_imul(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] * src1->i[0];
-   dst->i[1] = src0->i[1] * src1->i[1];
-   dst->i[2] = src0->i[2] * src1->i[2];
-   dst->i[3] = src0->i[3] * src1->i[3];
-}
-#endif
-
-#if 0
-static void
 micro_imul64(
    union tgsi_exec_channel *dst0,
    union tgsi_exec_channel *dst1,
@@ -939,42 +977,6 @@ micro_neg(
    dst->f[3] = -src->f[3];
 }
 
-#if 0
-static void
-micro_ineg(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->i[0] = -src->i[0];
-   dst->i[1] = -src->i[1];
-   dst->i[2] = -src->i[2];
-   dst->i[3] = -src->i[3];
-}
-#endif
-
-static void
-micro_not(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->u[0] = ~src->u[0];
-   dst->u[1] = ~src->u[1];
-   dst->u[2] = ~src->u[2];
-   dst->u[3] = ~src->u[3];
-}
-
-static void
-micro_or(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] | src1->u[0];
-   dst->u[1] = src0->u[1] | src1->u[1];
-   dst->u[2] = src0->u[2] | src1->u[2];
-   dst->u[3] = src0->u[3] | src1->u[3];
-}
-
 static void
 micro_pow(
    union tgsi_exec_channel *dst,
@@ -995,238 +997,113 @@ micro_pow(
 }
 
 static void
-micro_rnd(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = floorf( src->f[0] + 0.5f );
-   dst->f[1] = floorf( src->f[1] + 0.5f );
-   dst->f[2] = floorf( src->f[2] + 0.5f );
-   dst->f[3] = floorf( src->f[3] + 0.5f );
-}
-
-static void
-micro_sgn(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f;
-   dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f;
-   dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f;
-   dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 1.0f : 0.0f;
-}
-
-static void
-micro_shl(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->i[0] = src0->i[0] << src1->i[0];
-   dst->i[1] = src0->i[1] << src1->i[1];
-   dst->i[2] = src0->i[2] << src1->i[2];
-   dst->i[3] = src0->i[3] << src1->i[3];
-}
-
-static void
-micro_ishr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
+micro_sub(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
 {
-   dst->i[0] = src0->i[0] >> src1->i[0];
-   dst->i[1] = src0->i[1] >> src1->i[1];
-   dst->i[2] = src0->i[2] >> src1->i[2];
-   dst->i[3] = src0->i[3] >> src1->i[3];
+   dst->f[0] = src0->f[0] - src1->f[0];
+   dst->f[1] = src0->f[1] - src1->f[1];
+   dst->f[2] = src0->f[2] - src1->f[2];
+   dst->f[3] = src0->f[3] - src1->f[3];
 }
 
 static void
-micro_trunc(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0 )
+fetch_src_file_channel(const struct tgsi_exec_machine *mach,
+                       const uint file,
+                       const uint swizzle,
+                       const union tgsi_exec_channel *index,
+                       const union tgsi_exec_channel *index2D,
+                       union tgsi_exec_channel *chan)
 {
-   dst->f[0] = (float) (int) src0->f[0];
-   dst->f[1] = (float) (int) src0->f[1];
-   dst->f[2] = (float) (int) src0->f[2];
-   dst->f[3] = (float) (int) src0->f[3];
-}
+   uint i;
 
-#if 0
-static void
-micro_ushr(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] >> src1->u[0];
-   dst->u[1] = src0->u[1] >> src1->u[1];
-   dst->u[2] = src0->u[2] >> src1->u[2];
-   dst->u[3] = src0->u[3] >> src1->u[3];
-}
-#endif
+   switch (file) {
+   case TGSI_FILE_CONSTANT:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index2D->i[i] >= 0 && index2D->i[i] < PIPE_MAX_CONSTANT_BUFFERS);
+         assert(mach->Consts[index2D->i[i]]);
 
-static void
-micro_sin(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = sinf( src->f[0] );
-   dst->f[1] = sinf( src->f[1] );
-   dst->f[2] = sinf( src->f[2] );
-   dst->f[3] = sinf( src->f[3] );
-}
+         if (index->i[i] < 0) {
+            chan->u[i] = 0;
+         } else {
+            const uint *p = (const uint *)mach->Consts[index2D->i[i]];
 
-static void
-micro_sqrt( union tgsi_exec_channel *dst,
-            const union tgsi_exec_channel *src )
-{
-   dst->f[0] = sqrtf( src->f[0] );
-   dst->f[1] = sqrtf( src->f[1] );
-   dst->f[2] = sqrtf( src->f[2] );
-   dst->f[3] = sqrtf( src->f[3] );
-}
+            chan->u[i] = p[index->i[i] * 4 + swizzle];
+         }
+      }
+      break;
 
-static void
-micro_sub(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->f[0] = src0->f[0] - src1->f[0];
-   dst->f[1] = src0->f[1] - src1->f[1];
-   dst->f[2] = src0->f[2] - src1->f[2];
-   dst->f[3] = src0->f[3] - src1->f[3];
-}
+   case TGSI_FILE_INPUT:
+   case TGSI_FILE_SYSTEM_VALUE:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         /* XXX: 2D indexing */
+         chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i];
+      }
+      break;
 
-#if 0
-static void
-micro_u2f(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src )
-{
-   dst->f[0] = (float) src->u[0];
-   dst->f[1] = (float) src->u[1];
-   dst->f[2] = (float) src->u[2];
-   dst->f[3] = (float) src->u[3];
-}
-#endif
+   case TGSI_FILE_TEMPORARY:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index->i[i] < TGSI_EXEC_NUM_TEMPS);
+         assert(index2D->i[i] == 0);
 
-static void
-micro_xor(
-   union tgsi_exec_channel *dst,
-   const union tgsi_exec_channel *src0,
-   const union tgsi_exec_channel *src1 )
-{
-   dst->u[0] = src0->u[0] ^ src1->u[0];
-   dst->u[1] = src0->u[1] ^ src1->u[1];
-   dst->u[2] = src0->u[2] ^ src1->u[2];
-   dst->u[3] = src0->u[3] ^ src1->u[3];
-}
+         chan->u[i] = mach->Temps[index->i[i]].xyzw[swizzle].u[i];
+      }
+      break;
 
-static void
-fetch_src_file_channel(
-   const struct tgsi_exec_machine *mach,
-   const uint file,
-   const uint swizzle,
-   const union tgsi_exec_channel *index,
-   union tgsi_exec_channel *chan )
-{
-   switch( swizzle ) {
-   case TGSI_SWIZZLE_X:
-   case TGSI_SWIZZLE_Y:
-   case TGSI_SWIZZLE_Z:
-   case TGSI_SWIZZLE_W:
-      switch( file ) {
-      case TGSI_FILE_CONSTANT:
-         assert(mach->Consts);
-         if (index->i[0] < 0)
-            chan->f[0] = 0.0f;
-         else
-            chan->f[0] = mach->Consts[index->i[0]][swizzle];
-         if (index->i[1] < 0)
-            chan->f[1] = 0.0f;
-         else
-            chan->f[1] = mach->Consts[index->i[1]][swizzle];
-         if (index->i[2] < 0)
-            chan->f[2] = 0.0f;
-         else
-            chan->f[2] = mach->Consts[index->i[2]][swizzle];
-         if (index->i[3] < 0)
-            chan->f[3] = 0.0f;
-         else
-            chan->f[3] = mach->Consts[index->i[3]][swizzle];
-         break;
+   case TGSI_FILE_IMMEDIATE:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit);
+         assert(index2D->i[i] == 0);
 
-      case TGSI_FILE_INPUT:
-         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
-         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
-         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
-         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
-         break;
+         chan->f[i] = mach->Imms[index->i[i]][swizzle];
+      }
+      break;
 
-      case TGSI_FILE_TEMPORARY:
-         assert(index->i[0] < TGSI_EXEC_NUM_TEMPS);
-         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
-         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
-         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
-         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
-         break;
+   case TGSI_FILE_ADDRESS:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index->i[i] >= 0);
+         assert(index2D->i[i] == 0);
 
-      case TGSI_FILE_IMMEDIATE:
-         assert( index->i[0] < (int) mach->ImmLimit );
-         chan->f[0] = mach->Imms[index->i[0]][swizzle];
-         assert( index->i[1] < (int) mach->ImmLimit );
-         chan->f[1] = mach->Imms[index->i[1]][swizzle];
-         assert( index->i[2] < (int) mach->ImmLimit );
-         chan->f[2] = mach->Imms[index->i[2]][swizzle];
-         assert( index->i[3] < (int) mach->ImmLimit );
-         chan->f[3] = mach->Imms[index->i[3]][swizzle];
-         break;
+         chan->u[i] = mach->Addrs[index->i[i]].xyzw[swizzle].u[i];
+      }
+      break;
 
-      case TGSI_FILE_ADDRESS:
-         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
-         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
-         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
-         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
-         break;
+   case TGSI_FILE_PREDICATE:
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index->i[i] >= 0 && index->i[i] < TGSI_EXEC_NUM_PREDS);
+         assert(index2D->i[i] == 0);
 
-      case TGSI_FILE_PREDICATE:
-         assert(index->i[0] < TGSI_EXEC_NUM_PREDS);
-         assert(index->i[1] < TGSI_EXEC_NUM_PREDS);
-         assert(index->i[2] < TGSI_EXEC_NUM_PREDS);
-         assert(index->i[3] < TGSI_EXEC_NUM_PREDS);
-         chan->u[0] = mach->Predicates[0].xyzw[swizzle].u[0];
-         chan->u[1] = mach->Predicates[0].xyzw[swizzle].u[1];
-         chan->u[2] = mach->Predicates[0].xyzw[swizzle].u[2];
-         chan->u[3] = mach->Predicates[0].xyzw[swizzle].u[3];
-         break;
+         chan->u[i] = mach->Predicates[0].xyzw[swizzle].u[i];
+      }
+      break;
 
-      case TGSI_FILE_OUTPUT:
-         /* vertex/fragment output vars can be read too */
-         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
-         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
-         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
-         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
-         break;
+   case TGSI_FILE_OUTPUT:
+      /* vertex/fragment output vars can be read too */
+      for (i = 0; i < QUAD_SIZE; i++) {
+         assert(index->i[i] >= 0);
+         assert(index2D->i[i] == 0);
 
-      default:
-         assert( 0 );
+         chan->u[i] = mach->Outputs[index->i[i]].xyzw[swizzle].u[i];
       }
       break;
 
    default:
-      assert( 0 );
+      assert(0);
+      for (i = 0; i < QUAD_SIZE; i++) {
+         chan->u[i] = 0;
+      }
    }
 }
 
 static void
-fetch_source(
-   const struct tgsi_exec_machine *mach,
-   union tgsi_exec_channel *chan,
-   const struct tgsi_full_src_register *reg,
-   const uint chan_index )
+fetch_source(const struct tgsi_exec_machine *mach,
+             union tgsi_exec_channel *chan,
+             const struct tgsi_full_src_register *reg,
+             const uint chan_index,
+             enum tgsi_exec_datatype src_datatype)
 {
    union tgsi_exec_channel index;
+   union tgsi_exec_channel index2D;
    uint swizzle;
 
    /* We start with a direct index into a register file.
@@ -1265,18 +1142,18 @@ fetch_source(
 
       /* get current value of address register[swizzle] */
       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
-      fetch_src_file_channel(
-         mach,
-         reg->Indirect.File,
-         swizzle,
-         &index2,
-         &indir_index );
+      fetch_src_file_channel(mach,
+                             reg->Indirect.File,
+                             swizzle,
+                             &index2,
+                             &ZeroVec,
+                             &indir_index);
 
       /* add value of address register to the offset */
-      index.i[0] += (int) indir_index.f[0];
-      index.i[1] += (int) indir_index.f[1];
-      index.i[2] += (int) indir_index.f[2];
-      index.i[3] += (int) indir_index.f[3];
+      index.i[0] += indir_index.i[0];
+      index.i[1] += indir_index.i[1];
+      index.i[2] += indir_index.i[2];
+      index.i[3] += indir_index.i[3];
 
       /* for disabled execution channels, zero-out the index to
        * avoid using a potential garbage value.
@@ -1291,43 +1168,22 @@ fetch_source(
     * subscript to a register file. Effectively it means that
     * the register file is actually a 2D array of registers.
     *
-    *    file[1][3] == file[1*sizeof(file[1])+3],
+    *    file[3][1],
     *    where:
     *       [3] = Dimension.Index
     */
    if (reg->Register.Dimension) {
-      /* The size of the first-order array depends on the register file type.
-       * We need to multiply the index to the first array to get an effective,
-       * "flat" index that points to the beginning of the second-order array.
-       */
-      switch (reg->Register.File) {
-      case TGSI_FILE_INPUT:
-         index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
-         index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
-         index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
-         index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS;
-         break;
-      case TGSI_FILE_CONSTANT:
-         index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER;
-         index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER;
-         index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER;
-         index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER;
-         break;
-      default:
-         assert( 0 );
-      }
-
-      index.i[0] += reg->Dimension.Index;
-      index.i[1] += reg->Dimension.Index;
-      index.i[2] += reg->Dimension.Index;
-      index.i[3] += reg->Dimension.Index;
+      index2D.i[0] =
+      index2D.i[1] =
+      index2D.i[2] =
+      index2D.i[3] = reg->Dimension.Index;
 
       /* Again, the second subscript index can be addressed indirectly
        * identically to the first one.
        * Nothing stops us from indirectly addressing the indirect register,
        * but there is no need for that, so we won't exercise it.
        *
-       *    file[1][ind[4].y+3],
+       *    file[ind[4].y+3][1],
        *    where:
        *       ind = DimIndirect.File
        *       [4] = DimIndirect.Index
@@ -1345,24 +1201,25 @@ fetch_source(
          index2.i[3] = reg->DimIndirect.Index;
 
          swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
-         fetch_src_file_channel(
-            mach,
-            reg->DimIndirect.File,
-            swizzle,
-            &index2,
-            &indir_index );
-
-         index.i[0] += (int) indir_index.f[0];
-         index.i[1] += (int) indir_index.f[1];
-         index.i[2] += (int) indir_index.f[2];
-         index.i[3] += (int) indir_index.f[3];
+         fetch_src_file_channel(mach,
+                                reg->DimIndirect.File,
+                                swizzle,
+                                &index2,
+                                &ZeroVec,
+                                &indir_index);
+
+         index2D.i[0] += indir_index.i[0];
+         index2D.i[1] += indir_index.i[1];
+         index2D.i[2] += indir_index.i[2];
+         index2D.i[3] += indir_index.i[3];
 
          /* for disabled execution channels, zero-out the index to
           * avoid using a potential garbage value.
           */
          for (i = 0; i < QUAD_SIZE; i++) {
-            if ((execmask & (1 << i)) == 0)
-               index.i[i] = 0;
+            if ((execmask & (1 << i)) == 0) {
+               index2D.i[i] = 0;
+            }
          }
       }
 
@@ -1370,42 +1227,45 @@ fetch_source(
        * files, we would have to check whether Dimension is followed
        * by a dimension register and continue the saga.
        */
+   } else {
+      index2D.i[0] =
+      index2D.i[1] =
+      index2D.i[2] =
+      index2D.i[3] = 0;
    }
 
    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
-   fetch_src_file_channel(
-      mach,
-      reg->Register.File,
-      swizzle,
-      &index,
-      chan );
-
-   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      micro_abs( chan, chan );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      micro_abs( chan, chan );
-      micro_neg( chan, chan );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      micro_neg( chan, chan );
-      break;
+   fetch_src_file_channel(mach,
+                          reg->Register.File,
+                          swizzle,
+                          &index,
+                          &index2D,
+                          chan);
+
+   if (reg->Register.Absolute) {
+      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
+         micro_abs(chan, chan);
+      } else {
+         micro_iabs(chan, chan);
+      }
+   }
 
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
+   if (reg->Register.Negate) {
+      if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
+         micro_neg(chan, chan);
+      } else {
+         micro_ineg(chan, chan);
+      }
    }
 }
 
 static void
-store_dest(
-   struct tgsi_exec_machine *mach,
-   const union tgsi_exec_channel *chan,
-   const struct tgsi_full_dst_register *reg,
-   const struct tgsi_full_instruction *inst,
-   uint chan_index )
+store_dest(struct tgsi_exec_machine *mach,
+           const union tgsi_exec_channel *chan,
+           const struct tgsi_full_dst_register *reg,
+           const struct tgsi_full_instruction *inst,
+           uint chan_index,
+           enum tgsi_exec_datatype dst_datatype)
 {
    uint i;
    union tgsi_exec_channel null;
@@ -1414,9 +1274,10 @@ store_dest(
    int offset = 0;  /* indirection offset */
    int index;
 
-#ifdef DEBUG
-   check_inf_or_nan(chan);
-#endif
+   /* for debugging */
+   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
+      check_inf_or_nan(chan);
+   }
 
    /* There is an extra source register that indirectly subscripts
     * a register file. The direct index now becomes an offset
@@ -1443,15 +1304,15 @@ store_dest(
       swizzle = tgsi_util_get_src_register_swizzle( &reg->Indirect, CHAN_X );
 
       /* fetch values from the address/indirection register */
-      fetch_src_file_channel(
-         mach,
-         reg->Indirect.File,
-         swizzle,
-         &index,
-         &indir_index );
+      fetch_src_file_channel(mach,
+                             reg->Indirect.File,
+                             swizzle,
+                             &index,
+                             &ZeroVec,
+                             &indir_index);
 
       /* save indirection offset */
-      offset = (int) indir_index.f[0];
+      offset = indir_index.i[0];
    }
 
    switch (reg->Register.File) {
@@ -1463,6 +1324,15 @@ store_dest(
       index = mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
          + reg->Register.Index;
       dst = &mach->Outputs[offset + index].xyzw[chan_index];
+#if 0
+      if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) {
+         fprintf(stderr, "STORING OUT[%d] mask(%d), = (", offset + index, execmask);
+         for (i = 0; i < QUAD_SIZE; i++)
+            if (execmask & (1 << i))
+               fprintf(stderr, "%f, ", chan->f[i]);
+         fprintf(stderr, ")\n");
+      }
+#endif
       break;
 
    case TGSI_FILE_TEMPORARY:
@@ -1572,10 +1442,10 @@ store_dest(
 }
 
 #define FETCH(VAL,INDEX,CHAN)\
-    fetch_source (mach, VAL, &inst->Src[INDEX], CHAN)
+    fetch_source(mach, VAL, &inst->Src[INDEX], CHAN, TGSI_EXEC_DATA_FLOAT)
 
 #define STORE(VAL,INDEX,CHAN)\
-    store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN )
+   store_dest(mach, VAL, &inst->Dst[INDEX], inst, CHAN, TGSI_EXEC_DATA_FLOAT)
 
 
 /**
@@ -1633,16 +1503,46 @@ exec_kilp(struct tgsi_exec_machine *mach,
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
 }
 
+static void
+emit_vertex(struct tgsi_exec_machine *mach)
+{
+   /* Record one emitted vertex: when any execution channel is enabled,
+    * advance the output offset kept in the TEMP_OUTPUT temp by NumOutputs
+    * and increment the Primitives[] entry selected by the index kept in
+    * the TEMP_PRIMITIVE temp (presumably that primitive's vertex count).
+    *
+    * FIXME: check for exec mask correctly
+   unsigned i;
+   for (i = 0; i < QUAD_SIZE; ++i) {
+         if ((mach->ExecMask & (1 << i)))
+   */
+   if (mach->ExecMask) {
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += mach->NumOutputs;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+   }
+}
+/**
+ * Begin a new primitive.
+ *
+ * When any execution channel is enabled, increments the primitive index
+ * held in the TEMP_PRIMITIVE temp and zeroes the Primitives[] entry at the
+ * new index.  A debug assertion checks that the new index multiplied by
+ * NumOutputs stays below MaxGeometryShaderOutputs.
+ *
+ * \param mach  the interpreter state to update
+ */
+static void
+emit_primitive(struct tgsi_exec_machine *mach)
+{
+   unsigned *prim_count = &mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];
+   /* FIXME: check for exec mask correctly
+   unsigned i;
+   for (i = 0; i < QUAD_SIZE; ++i) {
+         if ((mach->ExecMask & (1 << i)))
+   */
+   if (mach->ExecMask) {
+      ++(*prim_count);
+      debug_assert((*prim_count * mach->NumOutputs) < mach->MaxGeometryShaderOutputs);
+      mach->Primitives[*prim_count] = 0;
+   }
+}
 
 /*
- * Fetch a four texture samples using STR texture coordinates.
+ * Fetch four texture samples using STR texture coordinates.
  */
 static void
 fetch_texel( struct tgsi_sampler *sampler,
              const union tgsi_exec_channel *s,
              const union tgsi_exec_channel *t,
              const union tgsi_exec_channel *p,
-             float lodbias,  /* XXX should be float[4] */
+             const union tgsi_exec_channel *c0,
+             enum tgsi_sampler_control control,
              union tgsi_exec_channel *r,
              union tgsi_exec_channel *g,
              union tgsi_exec_channel *b,
@@ -1651,7 +1551,7 @@ fetch_texel( struct tgsi_sampler *sampler,
    uint j;
    float rgba[NUM_CHANNELS][QUAD_SIZE];
 
-   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
+   sampler->get_samples(sampler, s->f, t->f, p->f, c0->f, control, rgba);
 
    for (j = 0; j < 4; j++) {
       r->f[j] = rgba[0][j];
@@ -1662,102 +1562,95 @@ fetch_texel( struct tgsi_sampler *sampler,
 }
 
 
+#define TEX_MODIFIER_NONE           0
+#define TEX_MODIFIER_PROJECTED      1
+#define TEX_MODIFIER_LOD_BIAS       2
+#define TEX_MODIFIER_EXPLICIT_LOD   3
+
+
 static void
 exec_tex(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst,
-         boolean biasLod,
-         boolean projected)
+         uint modifier)
 {
    const uint unit = inst->Src[1].Register.Index;
    union tgsi_exec_channel r[4];
+   const union tgsi_exec_channel *lod = &ZeroVec;
+   enum tgsi_sampler_control control;
    uint chan_index;
-   float lodBias;
 
-   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
+   if (modifier != TEX_MODIFIER_NONE) {
+      FETCH(&r[3], 0, CHAN_W);
+      if (modifier != TEX_MODIFIER_PROJECTED) {
+         lod = &r[3];
+      }
+   }
+
+   if (modifier == TEX_MODIFIER_EXPLICIT_LOD) {
+      control = tgsi_sampler_lod_explicit;
+   } else {
+      control = tgsi_sampler_lod_bias;
+   }
 
    switch (inst->Texture.Texture) {
    case TGSI_TEXTURE_1D:
    case TGSI_TEXTURE_SHADOW1D:
-
       FETCH(&r[0], 0, CHAN_X);
 
-      if (projected) {
-         FETCH(&r[1], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[1] );
+      if (modifier == TEX_MODIFIER_PROJECTED) {
+         micro_div(&r[0], &r[0], &r[3]);
       }
 
-      if (biasLod) {
-         FETCH(&r[1], 0, CHAN_W);
-         lodBias = r[2].f[0];
-      }
-      else
-         lodBias = 0.0;
-
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &ZeroVec, &ZeroVec, lodBias,  /* S, T, P, BIAS */
-                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+                  &r[0], &ZeroVec, &ZeroVec, lod,  /* S, T, P, LOD */
+                  control,
+                  &r[0], &r[1], &r[2], &r[3]);     /* R, G, B, A */
       break;
 
    case TGSI_TEXTURE_2D:
    case TGSI_TEXTURE_RECT:
    case TGSI_TEXTURE_SHADOW2D:
    case TGSI_TEXTURE_SHADOWRECT:
-
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      if (projected) {
-         FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
+      if (modifier == TEX_MODIFIER_PROJECTED) {
+         micro_div(&r[0], &r[0], &r[3]);
+         micro_div(&r[1], &r[1], &r[3]);
+         micro_div(&r[2], &r[2], &r[3]);
       }
 
-      if (biasLod) {
-         FETCH(&r[3], 0, CHAN_W);
-         lodBias = r[3].f[0];
-      }
-      else
-         lodBias = 0.0;
-
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], lod,     /* S, T, P, LOD */
+                  control,
                   &r[0], &r[1], &r[2], &r[3]);  /* outputs */
       break;
 
    case TGSI_TEXTURE_3D:
    case TGSI_TEXTURE_CUBE:
-
       FETCH(&r[0], 0, CHAN_X);
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      if (projected) {
-         FETCH(&r[3], 0, CHAN_W);
-         micro_div( &r[0], &r[0], &r[3] );
-         micro_div( &r[1], &r[1], &r[3] );
-         micro_div( &r[2], &r[2], &r[3] );
-      }
-
-      if (biasLod) {
-         FETCH(&r[3], 0, CHAN_W);
-         lodBias = r[3].f[0];
+      if (modifier == TEX_MODIFIER_PROJECTED) {
+         micro_div(&r[0], &r[0], &r[3]);
+         micro_div(&r[1], &r[1], &r[3]);
+         micro_div(&r[2], &r[2], &r[3]);
       }
-      else
-         lodBias = 0.0;
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], lodBias,
+                  &r[0], &r[1], &r[2], lod,
+                  control,
                   &r[0], &r[1], &r[2], &r[3]);
       break;
 
    default:
-      assert (0);
+      assert(0);
    }
 
-   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-      STORE( &r[chan_index], 0, chan_index );
+   FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
+      STORE(&r[chan_index], 0, chan_index);
    }
 }
 
@@ -1780,8 +1673,9 @@ exec_txd(struct tgsi_exec_machine *mach,
       FETCH(&r[0], 0, CHAN_X);
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &ZeroVec, &ZeroVec, 0.0f,  /* S, T, P, BIAS */
-                  &r[0], &r[1], &r[2], &r[3]);      /* R, G, B, A */
+                  &r[0], &ZeroVec, &ZeroVec, &ZeroVec,   /* S, T, P, BIAS */
+                  tgsi_sampler_lod_bias,
+                  &r[0], &r[1], &r[2], &r[3]);           /* R, G, B, A */
       break;
 
    case TGSI_TEXTURE_2D:
@@ -1794,8 +1688,9 @@ exec_txd(struct tgsi_exec_machine *mach,
       FETCH(&r[2], 0, CHAN_Z);
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], 0.0f,    /* inputs */
-                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+                  &r[0], &r[1], &r[2], &ZeroVec,   /* inputs */
+                  tgsi_sampler_lod_bias,
+                  &r[0], &r[1], &r[2], &r[3]);     /* outputs */
       break;
 
    case TGSI_TEXTURE_3D:
@@ -1806,7 +1701,8 @@ exec_txd(struct tgsi_exec_machine *mach,
       FETCH(&r[2], 0, CHAN_Z);
 
       fetch_texel(mach->Samplers[unit],
-                  &r[0], &r[1], &r[2], 0.0f,
+                  &r[0], &r[1], &r[2], &ZeroVec,
+                  tgsi_sampler_lod_bias,
                   &r[0], &r[1], &r[2], &r[3]);
       break;
 
@@ -1892,20 +1788,15 @@ exec_declaration(struct tgsi_exec_machine *mach,
                  const struct tgsi_full_declaration *decl)
 {
    if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
-      if (decl->Declaration.File == TGSI_FILE_INPUT) {
+      if (decl->Declaration.File == TGSI_FILE_INPUT ||
+          decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
          uint first, last, mask;
 
          first = decl->Range.First;
          last = decl->Range.Last;
          mask = decl->Declaration.UsageMask;
 
-         if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
-            assert(decl->Semantic.Index == 0);
-            assert(first == last);
-            assert(mask = TGSI_WRITEMASK_XYZW);
-
-            mach->Inputs[first] = mach->QuadPos;
-         } else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
+         if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
             uint i;
 
             assert(decl->Semantic.Index == 0);
@@ -1948,6 +1839,681 @@ exec_declaration(struct tgsi_exec_machine *mach,
    }
 }
 
+typedef void (* micro_unary_op)(union tgsi_exec_channel *dst,
+                                const union tgsi_exec_channel *src);
+/**
+ * Execute a one-source instruction that produces a single scalar result.
+ *
+ * Channel X of Src[0] is fetched once, \p op (a micro_unary_op callback,
+ * declared just above) is applied to it, and the same result is stored to
+ * every channel of Dst[0] that is enabled in the write mask.
+ *
+ * \param mach          interpreter state
+ * \param inst          instruction being executed
+ * \param op            per-channel operation to apply
+ * \param dst_datatype  data type passed through to store_dest()
+ * \param src_datatype  data type passed through to fetch_source()
+ */
+static void
+exec_scalar_unary(struct tgsi_exec_machine *mach,
+                  const struct tgsi_full_instruction *inst,
+                  micro_unary_op op,
+                  enum tgsi_exec_datatype dst_datatype,
+                  enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   union tgsi_exec_channel src;
+   union tgsi_exec_channel dst;
+
+   fetch_source(mach, &src, &inst->Src[0], CHAN_X, src_datatype);
+   op(&dst, &src);
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {   /* broadcast the scalar */
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst, &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+/**
+ * Execute a component-wise one-source instruction.
+ *
+ * For every channel enabled in the Dst[0] write mask, the matching channel
+ * of Src[0] is fetched and \p op is applied, with the result kept in a
+ * local vector.  Only after all enabled channels have been computed are
+ * the results stored; presumably this keeps a destination that is also
+ * the source from being overwritten before it has been fully read.
+ *
+ * \param mach          interpreter state
+ * \param inst          instruction being executed
+ * \param op            per-channel operation to apply
+ * \param dst_datatype  data type passed through to store_dest()
+ * \param src_datatype  data type passed through to fetch_source()
+ */
+static void
+exec_vector_unary(struct tgsi_exec_machine *mach,
+                  const struct tgsi_full_instruction *inst,
+                  micro_unary_op op,
+                  enum tgsi_exec_datatype dst_datatype,
+                  enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   struct tgsi_exec_vector dst;
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {   /* pass 1: compute */
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         union tgsi_exec_channel src;
+
+         fetch_source(mach, &src, &inst->Src[0], chan, src_datatype);
+         op(&dst.xyzw[chan], &src);
+      }
+   }
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {   /* pass 2: write back */
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+/** Signature of a per-channel operation taking two source channels. */
+typedef void (* micro_binary_op)(union tgsi_exec_channel *dst,
+                                 const union tgsi_exec_channel *src0,
+                                 const union tgsi_exec_channel *src1);
+/**
+ * Execute a component-wise two-source instruction.
+ *
+ * For every channel enabled in the Dst[0] write mask, the matching
+ * channels of Src[0] and Src[1] are fetched and combined with \p op into
+ * a local vector; the results are stored in a second pass once all
+ * enabled channels have been computed.
+ *
+ * \param mach          interpreter state
+ * \param inst          instruction being executed
+ * \param op            per-channel operation to apply
+ * \param dst_datatype  data type passed through to store_dest()
+ * \param src_datatype  data type used for both fetch_source() calls
+ */
+static void
+exec_vector_binary(struct tgsi_exec_machine *mach,
+                   const struct tgsi_full_instruction *inst,
+                   micro_binary_op op,
+                   enum tgsi_exec_datatype dst_datatype,
+                   enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   struct tgsi_exec_vector dst;
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {   /* pass 1: compute */
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         union tgsi_exec_channel src[2];
+
+         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
+         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
+         op(&dst.xyzw[chan], &src[0], &src[1]);
+      }
+   }
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {   /* pass 2: write back */
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+/** Signature of a per-channel operation taking three source channels. */
+typedef void (* micro_trinary_op)(union tgsi_exec_channel *dst,
+                                  const union tgsi_exec_channel *src0,
+                                  const union tgsi_exec_channel *src1,
+                                  const union tgsi_exec_channel *src2);
+/**
+ * Execute a component-wise three-source instruction.
+ *
+ * For every channel enabled in the Dst[0] write mask, the matching
+ * channels of Src[0], Src[1] and Src[2] are fetched and combined with
+ * \p op into a local vector; the results are stored in a second pass once
+ * all enabled channels have been computed.
+ *
+ * \param mach          interpreter state
+ * \param inst          instruction being executed
+ * \param op            per-channel operation to apply
+ * \param dst_datatype  data type passed through to store_dest()
+ * \param src_datatype  data type used for all three fetch_source() calls
+ */
+static void
+exec_vector_trinary(struct tgsi_exec_machine *mach,
+                    const struct tgsi_full_instruction *inst,
+                    micro_trinary_op op,
+                    enum tgsi_exec_datatype dst_datatype,
+                    enum tgsi_exec_datatype src_datatype)
+{
+   unsigned int chan;
+   struct tgsi_exec_vector dst;
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {   /* pass 1: compute */
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         union tgsi_exec_channel src[3];
+
+         fetch_source(mach, &src[0], &inst->Src[0], chan, src_datatype);
+         fetch_source(mach, &src[1], &inst->Src[1], chan, src_datatype);
+         fetch_source(mach, &src[2], &inst->Src[2], chan, src_datatype);
+         op(&dst.xyzw[chan], &src[0], &src[1], &src[2]);
+      }
+   }
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {   /* pass 2: write back */
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst.xyzw[chan], &inst->Dst[0], inst, chan, dst_datatype);
+      }
+   }
+}
+/**
+ * Three-component dot product of Src[0] and Src[1], treated as floats.
+ *
+ * The X products are formed with micro_mul and the Y and Z products are
+ * accumulated with micro_mad (assuming micro_mad computes
+ * src0 * src1 + src2).  The single scalar result is stored to every
+ * channel of Dst[0] that is enabled in the write mask.
+ */
+static void
+exec_dp3(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];   /* arg[2] is the running sum */
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
+      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
+      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
+      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
+   }
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+/**
+ * Four-component dot product of Src[0] and Src[1], treated as floats.
+ *
+ * The X products are formed with micro_mul and the Y, Z and W products
+ * are accumulated with micro_mad (assuming micro_mad computes
+ * src0 * src1 + src2).  The single scalar result is stored to every
+ * channel of Dst[0] that is enabled in the write mask.
+ */
+static void
+exec_dp4(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];   /* arg[2] is the running sum */
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
+      fetch_source(mach, &arg[0], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
+      fetch_source(mach, &arg[1], &inst->Src[1], chan, TGSI_EXEC_DATA_FLOAT);
+      micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
+   }
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+/**
+ * Two-component dot product of Src[0] and Src[1] plus a scalar addend.
+ *
+ * Combines the X and Y products of Src[0] and Src[1] (via micro_mul and
+ * micro_mad, assuming micro_mad computes src0 * src1 + src2) and then
+ * adds channel X of Src[2].  All operands are fetched as floats.  The
+ * scalar result is stored to every channel of Dst[0] that is enabled in
+ * the write mask.
+ */
+static void
+exec_dp2a(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);   /* sum now lives in arg[0] */
+
+   fetch_source(mach, &arg[1], &inst->Src[2], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_add(&arg[0], &arg[0], &arg[1]);
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+/**
+ * Homogeneous dot product.
+ *
+ * Combines the X, Y and Z products of Src[0] and Src[1] (via micro_mul
+ * and micro_mad, assuming micro_mad computes src0 * src1 + src2) and then
+ * adds channel W of Src[1]; channel W of Src[0] is never read.  All
+ * operands are fetched as floats.  The scalar result is stored to every
+ * channel of Dst[0] that is enabled in the write mask.
+ */
+static void
+exec_dph(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Z, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[0], &arg[0], &arg[1], &arg[2]);   /* sum now lives in arg[0] */
+
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_W, TGSI_EXEC_DATA_FLOAT);
+   micro_add(&arg[0], &arg[0], &arg[1]);
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[0], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+/**
+ * Two-component dot product of Src[0] and Src[1], treated as floats.
+ *
+ * The X products are formed with micro_mul and the Y products are
+ * accumulated with micro_mad (assuming micro_mad computes
+ * src0 * src1 + src2).  The single scalar result is stored to every
+ * channel of Dst[0] that is enabled in the write mask.
+ */
+static void
+exec_dp2(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[3];   /* arg[2] is the running sum */
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&arg[2], &arg[0], &arg[1]);
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[1], CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   micro_mad(&arg[2], &arg[0], &arg[1], &arg[2]);
+
+   for (chan = 0; chan < NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &arg[2], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+/**
+ * Scale the four channels of Src[0] by a common factor.
+ *
+ * All four channels of Src[0] are fetched as floats regardless of the
+ * write mask, and their squares are summed.  The sum is passed through
+ * micro_rsq (presumably a reciprocal square root, which would make this a
+ * four-component normalize) to obtain the scale.  Each channel enabled in
+ * the Dst[0] write mask is then multiplied by the scale and stored.
+ */
+static void
+exec_nrm4(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[4];
+   union tgsi_exec_channel scale;
+
+   fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   micro_mul(&scale, &arg[0], &arg[0]);
+
+   for (chan = CHAN_Y; chan <= CHAN_W; chan++) {
+      union tgsi_exec_channel product;
+
+      fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
+      micro_mul(&product, &arg[chan], &arg[chan]);
+      micro_add(&scale, &scale, &product);
+   }
+
+   micro_rsq(&scale, &scale);
+
+   for (chan = CHAN_X; chan <= CHAN_W; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         micro_mul(&arg[chan], &arg[chan], &scale);
+         store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+/**
+ * Scale the X, Y and Z channels of Src[0] by a common factor.
+ *
+ * If any of X, Y or Z is enabled in the Dst[0] write mask, all three
+ * channels of Src[0] are fetched as floats and their squares are summed.
+ * The sum is passed through micro_rsq (presumably a reciprocal square
+ * root, which would make this a three-component normalize), and each
+ * enabled channel is multiplied by the result and stored.  If W is
+ * enabled it receives OneVec (presumably 1.0 in every lane); the W
+ * channel of the source is never read.
+ */
+static void
+exec_nrm3(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XYZ) {   /* skip all fetches if only W is written */
+      unsigned int chan;
+      union tgsi_exec_channel arg[3];
+      union tgsi_exec_channel scale;
+
+      fetch_source(mach, &arg[0], &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_FLOAT);
+      micro_mul(&scale, &arg[0], &arg[0]);
+
+      for (chan = CHAN_Y; chan <= CHAN_Z; chan++) {
+         union tgsi_exec_channel product;
+
+         fetch_source(mach, &arg[chan], &inst->Src[0], chan, TGSI_EXEC_DATA_FLOAT);
+         micro_mul(&product, &arg[chan], &arg[chan]);
+         micro_add(&scale, &scale, &product);
+      }
+
+      micro_rsq(&scale, &scale);
+
+      for (chan = CHAN_X; chan <= CHAN_Z; chan++) {
+         if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+            micro_mul(&arg[chan], &arg[chan], &scale);
+            store_dest(mach, &arg[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+         }
+      }
+   }
+
+   if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
+      store_dest(mach, &OneVec, &inst->Dst[0], inst, CHAN_W, TGSI_EXEC_DATA_FLOAT);
+   }
+}
+
+/*
+ * Execute a break.  When the innermost breakable construct is a switch,
+ * the current switch mask is cleared; when it is a loop, every channel
+ * that is currently executing is removed from the loop mask.  The exec
+ * mask is recomputed afterwards in both cases.
+ */
+static void
+exec_break(struct tgsi_exec_machine *mach)
+{
+   if (mach->BreakType != TGSI_EXEC_BREAK_INSIDE_LOOP) {
+      assert(mach->BreakType == TGSI_EXEC_BREAK_INSIDE_SWITCH);
+
+      mach->Switch.mask = 0x0;
+   } else {
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+   }
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+/*
+ * Enter a switch statement: save the enclosing switch state and break
+ * type on their stacks, load the selector from Src[0].x as unsigned
+ * integers, and start with no channel enabled for any case.
+ */
+static void
+exec_switch(struct tgsi_exec_machine *mach,
+            const struct tgsi_full_instruction *inst)
+{
+   assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
+   assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
+
+   /* Push the enclosing switch state, then build the new one in place. */
+   mach->SwitchStack[mach->SwitchStackTop] = mach->Switch;
+   mach->SwitchStackTop++;
+
+   fetch_source(mach, &mach->Switch.selector, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
+   mach->Switch.mask = 0x0;
+   mach->Switch.defaultMask = 0x0;
+
+   /* From here on a break applies to this switch, not an outer loop. */
+   mach->BreakStack[mach->BreakStackTop] = mach->BreakType;
+   mach->BreakStackTop++;
+   mach->BreakType = TGSI_EXEC_BREAK_INSIDE_SWITCH;
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+/*
+ * Execute a case label.  Src[0].x is compared lane by lane against the
+ * selector of the current switch.  Every matching lane is recorded in
+ * Switch.defaultMask (so that a later default label skips it), and is
+ * enabled in Switch.mask if it was also enabled in the switch state
+ * saved on top of the switch stack.
+ *
+ * Must only be executed inside a switch, i.e. with a non-empty switch
+ * stack; this is asserted because the saved state is read from
+ * SwitchStack[SwitchStackTop - 1].
+ */
+static void
+exec_case(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint prevMask;
+   union tgsi_exec_channel src;
+   uint mask = 0;
+   uint lane;
+
+   assert(mach->SwitchStackTop > 0);
+   prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
+
+   fetch_source(mach, &src, &inst->Src[0], CHAN_X, TGSI_EXEC_DATA_UINT);
+
+   /* Bit N of mask corresponds to lane N of the selector. */
+   for (lane = 0; lane < 4; lane++) {
+      if (mach->Switch.selector.u[lane] == src.u[lane]) {
+         mask |= 1u << lane;
+      }
+   }
+
+   mach->Switch.defaultMask |= mask;
+
+   mach->Switch.mask |= mask & prevMask;
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+/*
+ * Execute a default label: enable every lane that has not matched any
+ * case so far (not set in Switch.defaultMask) and that was enabled in
+ * the switch state saved on top of the switch stack.
+ *
+ * Must only be executed inside a switch, i.e. with a non-empty switch
+ * stack; this is asserted because the saved state is read from
+ * SwitchStack[SwitchStackTop - 1].
+ */
+static void
+exec_default(struct tgsi_exec_machine *mach)
+{
+   uint prevMask;
+
+   assert(mach->SwitchStackTop > 0);
+   prevMask = mach->SwitchStack[mach->SwitchStackTop - 1].mask;
+
+   mach->Switch.mask |= ~mach->Switch.defaultMask & prevMask;
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+/*
+ * Leave a switch statement: restore the switch state and break type
+ * that were saved when the switch was entered, then recompute the exec
+ * mask.
+ *
+ * Both stacks must be non-empty; popping an empty stack would index the
+ * arrays out of bounds, so this is asserted.
+ */
+static void
+exec_endswitch(struct tgsi_exec_machine *mach)
+{
+   assert(mach->SwitchStackTop > 0);
+   assert(mach->BreakStackTop > 0);
+
+   mach->Switch = mach->SwitchStack[--mach->SwitchStackTop];
+   mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
+
+   UPDATE_EXEC_MASK(mach);
+}
+
+/* Convert each of the four signed integer lanes of src to float. */
+static void
+micro_i2f(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = (float)src->i[lane];
+   }
+}
+
+/* Bitwise complement of each of the four unsigned lanes of src. */
+static void
+micro_not(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = ~src->u[lane];
+   }
+}
+
+/*
+ * Per-lane left shift: dst = src0 << src1 on the unsigned lanes.
+ *
+ * Only the low five bits of the shift count are used.  Shifting by the
+ * width of the type or more is undefined behaviour in C, so the count
+ * is masked to 0..31 to give every input a defined result.
+ */
+static void
+micro_shl(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      const unsigned int count = src1->u[lane] & 0x1f;
+
+      dst->u[lane] = src0->u[lane] << count;
+   }
+}
+
+/* Per-lane bitwise AND of the unsigned lanes of src0 and src1. */
+static void
+micro_and(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] & src1->u[lane];
+   }
+}
+
+/* Per-lane bitwise OR of the unsigned lanes of src0 and src1. */
+static void
+micro_or(union tgsi_exec_channel *dst,
+         const union tgsi_exec_channel *src0,
+         const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] | src1->u[lane];
+   }
+}
+
+/* Per-lane bitwise exclusive OR of the unsigned lanes of src0 and src1. */
+static void
+micro_xor(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src0,
+          const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] ^ src1->u[lane];
+   }
+}
+
+/*
+ * Convert each of the four float lanes of src to a signed integer using
+ * a plain C cast (truncation toward zero).
+ */
+static void
+micro_f2i(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->i[lane] = (int)src->f[lane];
+   }
+}
+
+/*
+ * Per-lane signed integer division: dst = src0 / src1.
+ *
+ * Two inputs that are undefined behaviour for the C '/' operator are
+ * given defined results instead of being allowed to trap:
+ *  - a zero divisor yields 0;
+ *  - a divisor of -1 is computed as a wrapping negation on the unsigned
+ *    view of the lane, so the most negative value divided by -1 wraps
+ *    instead of overflowing.
+ */
+static void
+micro_idiv(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src1->i[lane] == 0) {
+         dst->i[lane] = 0;
+      } else if (src1->i[lane] == -1) {
+         /* x / -1 == -x; unsigned arithmetic wraps instead of overflowing */
+         dst->u[lane] = 0u - src0->u[lane];
+      } else {
+         dst->i[lane] = src0->i[lane] / src1->i[lane];
+      }
+   }
+}
+
+/* Per-lane signed maximum of src0 and src1; src1 wins on equality. */
+static void
+micro_imax(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->i[lane] > src1->i[lane]) {
+         dst->i[lane] = src0->i[lane];
+      } else {
+         dst->i[lane] = src1->i[lane];
+      }
+   }
+}
+
+/* Per-lane signed minimum of src0 and src1; src1 wins on equality. */
+static void
+micro_imin(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->i[lane] < src1->i[lane]) {
+         dst->i[lane] = src0->i[lane];
+      } else {
+         dst->i[lane] = src1->i[lane];
+      }
+   }
+}
+
+/*
+ * Per-lane signed "greater than or equal" test: each lane of dst is set
+ * to -1 when src0 >= src1 and to 0 otherwise.
+ */
+static void
+micro_isge(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->i[lane] = (src0->i[lane] >= src1->i[lane]) ? -1 : 0;
+   }
+}
+
+/*
+ * Per-lane right shift of the signed lanes: dst = src0 >> src1.
+ *
+ * Only the low five bits of the shift count are used.  A negative count
+ * or a count of the type width or more is undefined behaviour in C, so
+ * the count is taken from the unsigned view of the lane and masked to
+ * 0..31.  The shift itself is still performed on the signed value.
+ */
+static void
+micro_ishr(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      const unsigned int count = src1->u[lane] & 0x1f;
+
+      dst->i[lane] = src0->i[lane] >> count;
+   }
+}
+
+/*
+ * Per-lane signed "less than" test: each lane of dst is set to -1 when
+ * src0 < src1 and to 0 otherwise.
+ */
+static void
+micro_islt(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->i[lane] = (src0->i[lane] < src1->i[lane]) ? -1 : 0;
+   }
+}
+
+/*
+ * Convert each of the four float lanes of src to an unsigned integer
+ * using a plain C cast.
+ */
+static void
+micro_f2u(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = (uint)src->f[lane];
+   }
+}
+
+/* Convert each of the four unsigned integer lanes of src to float. */
+static void
+micro_u2f(union tgsi_exec_channel *dst,
+          const union tgsi_exec_channel *src)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->f[lane] = (float)src->u[lane];
+   }
+}
+
+/* Per-lane unsigned addition: dst = src0 + src1. */
+static void
+micro_uadd(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] + src1->u[lane];
+   }
+}
+
+/*
+ * Per-lane unsigned division: dst = src0 / src1.
+ *
+ * Division by zero is undefined behaviour in C (and typically traps),
+ * so a lane with a zero divisor yields all bits set (0xffffffff)
+ * instead of being evaluated.
+ */
+static void
+micro_udiv(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src1->u[lane] ? src0->u[lane] / src1->u[lane] : ~0u;
+   }
+}
+
+/* Per-lane unsigned multiply-add: dst = src0 * src1 + src2. */
+static void
+micro_umad(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1,
+           const union tgsi_exec_channel *src2)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] * src1->u[lane] + src2->u[lane];
+   }
+}
+
+/* Per-lane unsigned maximum of src0 and src1; src1 wins on equality. */
+static void
+micro_umax(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->u[lane] > src1->u[lane]) {
+         dst->u[lane] = src0->u[lane];
+      } else {
+         dst->u[lane] = src1->u[lane];
+      }
+   }
+}
+
+/* Per-lane unsigned minimum of src0 and src1; src1 wins on equality. */
+static void
+micro_umin(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      if (src0->u[lane] < src1->u[lane]) {
+         dst->u[lane] = src0->u[lane];
+      } else {
+         dst->u[lane] = src1->u[lane];
+      }
+   }
+}
+
+/*
+ * Per-lane unsigned remainder: dst = src0 % src1.
+ *
+ * A zero divisor is undefined behaviour for the C '%' operator (and
+ * typically traps), so such a lane yields all bits set (0xffffffff)
+ * instead of being evaluated.
+ */
+static void
+micro_umod(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src1->u[lane] ? src0->u[lane] % src1->u[lane] : ~0u;
+   }
+}
+
+/* Per-lane unsigned multiplication: dst = src0 * src1. */
+static void
+micro_umul(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = src0->u[lane] * src1->u[lane];
+   }
+}
+
+/*
+ * Per-lane unsigned equality test: each lane of dst is set to all ones
+ * when src0 == src1 and to 0 otherwise.
+ */
+static void
+micro_useq(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = (src0->u[lane] == src1->u[lane]) ? ~0u : 0u;
+   }
+}
+
+/*
+ * Per-lane unsigned "greater than or equal" test: each lane of dst is
+ * set to all ones when src0 >= src1 and to 0 otherwise.
+ */
+static void
+micro_usge(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = (src0->u[lane] >= src1->u[lane]) ? ~0u : 0u;
+   }
+}
+
+/*
+ * Per-lane logical right shift: dst = src0 >> src1 on the unsigned
+ * lanes.
+ *
+ * Only the low five bits of the shift count are used.  Shifting by the
+ * width of the type or more is undefined behaviour in C, so the count
+ * is masked to 0..31 to give every input a defined result.
+ */
+static void
+micro_ushr(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      const unsigned int count = src1->u[lane] & 0x1f;
+
+      dst->u[lane] = src0->u[lane] >> count;
+   }
+}
+
+/*
+ * Per-lane unsigned "less than" test: each lane of dst is set to all
+ * ones when src0 < src1 and to 0 otherwise.
+ */
+static void
+micro_uslt(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = (src0->u[lane] < src1->u[lane]) ? ~0u : 0u;
+   }
+}
+
+/*
+ * Per-lane unsigned inequality test: each lane of dst is set to all
+ * ones when src0 != src1 and to 0 otherwise.
+ */
+static void
+micro_usne(union tgsi_exec_channel *dst,
+           const union tgsi_exec_channel *src0,
+           const union tgsi_exec_channel *src1)
+{
+   unsigned int lane;
+
+   for (lane = 0; lane < 4; lane++) {
+      dst->u[lane] = (src0->u[lane] != src1->u[lane]) ? ~0u : 0u;
+   }
+}
+
 static void
 exec_instruction(
    struct tgsi_exec_machine *mach,
@@ -1962,23 +2528,11 @@ exec_instruction(
 
    switch (inst->Instruction.Opcode) {
    case TGSI_OPCODE_ARL:
-   case TGSI_OPCODE_FLR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_flr(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_arl, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_MOV:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&d[chan_index], 0, chan_index);
-      }
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_mov, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_LIT:
@@ -2015,23 +2569,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_RCP:
-   /* TGSI_OPCODE_RECIP */
-      FETCH( &r[0], 0, CHAN_X );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_rcp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_RSQ:
-   /* TGSI_OPCODE_RECIPSQRT */
-      FETCH( &r[0], 0, CHAN_X );
-      micro_abs( &r[0], &r[0] );
-      micro_sqrt( &r[0], &r[0] );
-      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_rsq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_EXP:
@@ -2076,76 +2618,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MUL:
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         micro_mul(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_mul, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_ADD:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_add(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_add, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DP3:
-   /* TGSI_OPCODE_DOT3 */
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Z );
-      FETCH( &r[2], 1, CHAN_Z );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dp3(mach, inst);
       break;
 
-    case TGSI_OPCODE_DP4:
-    /* TGSI_OPCODE_DOT4 */
-       FETCH(&r[0], 0, CHAN_X);
-       FETCH(&r[1], 1, CHAN_X);
-
-       micro_mul( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_Y);
-       FETCH(&r[2], 1, CHAN_Y);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_Z);
-       FETCH(&r[2], 1, CHAN_Z);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-       FETCH(&r[1], 0, CHAN_W);
-       FETCH(&r[2], 1, CHAN_W);
-
-       micro_mul( &r[1], &r[1], &r[2] );
-       micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+   case TGSI_OPCODE_DP4:
+      exec_dp4(mach, inst);
       break;
 
    case TGSI_OPCODE_DST:
@@ -2176,174 +2661,63 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MIN:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-
-         /* XXX use micro_min()?? */
-         micro_lt(&d[chan_index], &r[0], &r[1], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_min, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_MAX:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-
-         /* XXX use micro_max()?? */
-         micro_lt(&d[chan_index], &r[0], &r[1], &r[1], &r[0] );
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_max, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SLT:
-   /* TGSI_OPCODE_SETLT */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_lt(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_slt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SGE:
-   /* TGSI_OPCODE_SETGE */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le(&d[chan_index], &r[1], &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sge, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_MAD:
-   /* TGSI_OPCODE_MADD */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_mul( &r[0], &r[0], &r[1] );
-         FETCH( &r[1], 2, chan_index );
-         micro_add(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_trinary(mach, inst, micro_mad, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SUB:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         micro_sub(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sub, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_LRP:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         FETCH(&r[2], 2, chan_index);
-         micro_sub( &r[1], &r[1], &r[2] );
-         micro_mul( &r[0], &r[0], &r[1] );
-         micro_add(&d[chan_index], &r[0], &r[2]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_trinary(mach, inst, micro_lrp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_CND:
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         FETCH(&r[2], 2, chan_index);
-         micro_lt(&d[chan_index], &mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C], &r[2], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_trinary(mach, inst, micro_cnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DP2A:
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[2], 2, CHAN_X );
-      micro_add( &r[0], &r[0], &r[2] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dp2a(mach, inst);
       break;
 
    case TGSI_OPCODE_FRC:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_frc(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_frc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_CLAMP:
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         micro_max(&r[0], &r[0], &r[1]);
-         FETCH(&r[1], 2, chan_index);
-         micro_min(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_trinary(mach, inst, micro_clamp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
+   case TGSI_OPCODE_FLR:
+      exec_vector_unary(mach, inst, micro_flr, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_ROUND:
-   case TGSI_OPCODE_ARR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_rnd(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_rnd, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_EX2:
-      FETCH(&r[0], 0, CHAN_X);
-
-      micro_exp2( &r[0], &r[0] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_exp2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_LG2:
-      FETCH( &r[0], 0, CHAN_X );
-      micro_lg2( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_lg2, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_POW:
@@ -2396,15 +2770,9 @@ exec_instruction(
       }
       break;
 
-    case TGSI_OPCODE_ABS:
-       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-          FETCH(&r[0], 0, chan_index);
-          micro_abs(&d[chan_index], &r[0]);
-       }
-       FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
-       break;
+   case TGSI_OPCODE_ABS:
+      exec_vector_unary(mach, inst, micro_abs, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
+      break;
 
    case TGSI_OPCODE_RCC:
       FETCH(&r[0], 0, CHAN_X);
@@ -2416,60 +2784,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DPH:
-      FETCH(&r[0], 0, CHAN_X);
-      FETCH(&r[1], 1, CHAN_X);
-
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 0, CHAN_Y);
-      FETCH(&r[2], 1, CHAN_Y);
-
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 0, CHAN_Z);
-      FETCH(&r[2], 1, CHAN_Z);
-
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FETCH(&r[1], 1, CHAN_W);
-
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dph(mach, inst);
       break;
 
    case TGSI_OPCODE_COS:
-      FETCH(&r[0], 0, CHAN_X);
-
-      micro_cos( &r[0], &r[0] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_cos, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DDX:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ddx(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_ddx, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_DDY:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ddy(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_ddy, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_KILP:
@@ -2546,14 +2873,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SEQ:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_seq, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SFL:
@@ -2563,44 +2883,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SGT:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sgt, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SIN:
-      FETCH( &r[0], 0, CHAN_X );
-      micro_sin( &r[0], &r[0] );
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_scalar_unary(mach, inst, micro_sin, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SLE:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_le(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sle, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SNE:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_eq(&d[chan_index], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_sne, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_STR:
@@ -2613,14 +2908,14 @@ exec_instruction(
       /* simple texture lookup */
       /* src[0] = texcoord */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE, FALSE);
+      exec_tex(mach, inst, TEX_MODIFIER_NONE);
       break;
 
    case TGSI_OPCODE_TXB:
       /* Texture lookup with lod bias */
       /* src[0] = texcoord (src[0].w = LOD bias) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE, FALSE);
+      exec_tex(mach, inst, TEX_MODIFIER_LOD_BIAS);
       break;
 
    case TGSI_OPCODE_TXD:
@@ -2636,14 +2931,14 @@ exec_instruction(
       /* Texture lookup with explit LOD */
       /* src[0] = texcoord (src[0].w = LOD) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE, FALSE);
+      exec_tex(mach, inst, TEX_MODIFIER_EXPLICIT_LOD);
       break;
 
    case TGSI_OPCODE_TXP:
       /* Texture lookup with projection */
       /* src[0] = texcoord (src[0].w = projection) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE, TRUE);
+      exec_tex(mach, inst, TEX_MODIFIER_PROJECTED);
       break;
 
    case TGSI_OPCODE_UP2H:
@@ -2705,6 +3000,10 @@ exec_instruction(
       assert (0);
       break;
 
+   case TGSI_OPCODE_ARR:
+      exec_vector_unary(mach, inst, micro_arr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
    case TGSI_OPCODE_BRA:
       assert (0);
       break;
@@ -2724,6 +3023,8 @@ exec_instruction(
          mach->CallStack[mach->CallStackTop].CondStackTop = mach->CondStackTop;
          mach->CallStack[mach->CallStackTop].LoopStackTop = mach->LoopStackTop;
          mach->CallStack[mach->CallStackTop].ContStackTop = mach->ContStackTop;
+         mach->CallStack[mach->CallStackTop].SwitchStackTop = mach->SwitchStackTop;
+         mach->CallStack[mach->CallStackTop].BreakStackTop = mach->BreakStackTop;
          /* note that PC was already incremented above */
          mach->CallStack[mach->CallStackTop].ReturnAddr = *pc;
 
@@ -2731,12 +3032,17 @@ exec_instruction(
 
          /* Second, push the Cond, Loop, Cont, Func stacks */
          assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
-         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
          assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
          assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+         assert(mach->SwitchStackTop < TGSI_EXEC_MAX_SWITCH_NESTING);
+         assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
          assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+         mach->SwitchStack[mach->SwitchStackTop++] = mach->Switch;
+         mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
 
          /* Finally, jump to the subroutine */
@@ -2769,6 +3075,12 @@ exec_instruction(
          mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
          mach->ContMask = mach->ContStack[mach->ContStackTop];
 
+         mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
+         mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
+
+         mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
+         mach->BreakType = mach->BreakStack[mach->BreakStackTop];
+
          assert(mach->FuncStackTop > 0);
          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
 
@@ -2779,26 +3091,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-   /* TGSI_OPCODE_SGN */
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_sgn(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_sgn, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_CMP:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH(&r[0], 0, chan_index);
-         FETCH(&r[1], 1, chan_index);
-         FETCH(&r[2], 2, chan_index);
-         micro_lt(&d[chan_index], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_trinary(mach, inst, micro_cmp, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SCS:
@@ -2822,70 +3119,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_NRM:
-      /* 3-component vector normalize */
-      if(IS_CHANNEL_ENABLED(*inst, CHAN_X) ||
-         IS_CHANNEL_ENABLED(*inst, CHAN_Y) ||
-         IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
-         /* r3 = sqrt(dp3(src0, src0)) */
-         FETCH(&r[0], 0, CHAN_X);
-         micro_mul(&r[3], &r[0], &r[0]);
-         FETCH(&r[1], 0, CHAN_Y);
-         micro_mul(&r[4], &r[1], &r[1]);
-         micro_add(&r[3], &r[3], &r[4]);
-         FETCH(&r[2], 0, CHAN_Z);
-         micro_mul(&r[4], &r[2], &r[2]);
-         micro_add(&r[3], &r[3], &r[4]);
-         micro_sqrt(&r[3], &r[3]);
-
-         if (IS_CHANNEL_ENABLED(*inst, CHAN_X)) {
-            micro_div(&r[0], &r[0], &r[3]);
-            STORE(&r[0], 0, CHAN_X);
-         }
-         if (IS_CHANNEL_ENABLED(*inst, CHAN_Y)) {
-            micro_div(&r[1], &r[1], &r[3]);
-            STORE(&r[1], 0, CHAN_Y);
-         }
-         if (IS_CHANNEL_ENABLED(*inst, CHAN_Z)) {
-            micro_div(&r[2], &r[2], &r[3]);
-            STORE(&r[2], 0, CHAN_Z);
-         }
-      }
-      if (IS_CHANNEL_ENABLED(*inst, CHAN_W)) {
-         STORE(&mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W);
-      }
+      exec_nrm3(mach, inst);
       break;
 
    case TGSI_OPCODE_NRM4:
-      /* 4-component vector normalize */
-      {
-         union tgsi_exec_channel tmp, dot;
-
-         /* tmp = dp4(src0, src0): */
-         FETCH( &r[0], 0, CHAN_X );
-         micro_mul( &tmp, &r[0], &r[0] );
-
-         FETCH( &r[1], 0, CHAN_Y );
-         micro_mul( &dot, &r[1], &r[1] );
-         micro_add( &tmp, &tmp, &dot );
-
-         FETCH( &r[2], 0, CHAN_Z );
-         micro_mul( &dot, &r[2], &r[2] );
-         micro_add( &tmp, &tmp, &dot );
-
-         FETCH( &r[3], 0, CHAN_W );
-         micro_mul( &dot, &r[3], &r[3] );
-         micro_add( &tmp, &tmp, &dot );
-
-         /* tmp = 1 / sqrt(tmp) */
-         micro_sqrt( &tmp, &tmp );
-         micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp );
-
-         FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-            /* chan = chan * tmp */
-            micro_mul( &r[chan_index], &tmp, &r[chan_index] );
-            STORE( &r[chan_index], 0, chan_index );
-         }
-      }
+      exec_nrm4(mach, inst);
       break;
 
    case TGSI_OPCODE_DIV:
@@ -2893,18 +3131,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_DP2:
-      FETCH( &r[0], 0, CHAN_X );
-      FETCH( &r[1], 1, CHAN_X );
-      micro_mul( &r[0], &r[0], &r[1] );
-
-      FETCH( &r[1], 0, CHAN_Y );
-      FETCH( &r[2], 1, CHAN_Y );
-      micro_mul( &r[1], &r[1], &r[2] );
-      micro_add( &r[0], &r[0], &r[1] );
-
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( &r[0], 0, chan_index );
-      }
+      exec_dp2(mach, inst);
       break;
 
    case TGSI_OPCODE_IF:
@@ -2970,87 +3197,31 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_CEIL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_ceil(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_ceil, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_I2F:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_i2f(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_i2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_INT);
       break;
 
    case TGSI_OPCODE_NOT:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_not(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_not, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_TRUNC:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         micro_trunc(&d[chan_index], &r[0]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_unary(mach, inst, micro_trunc, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT);
       break;
 
    case TGSI_OPCODE_SHL:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_shl(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
-      break;
-
-   case TGSI_OPCODE_SHR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_ishr(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_shl, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_AND:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_and(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_and, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_OR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_or(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_or, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_MOD:
@@ -3058,14 +3229,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_XOR:
-      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( &r[0], 0, chan_index );
-         FETCH( &r[1], 1, chan_index );
-         micro_xor(&d[chan_index], &r[0], &r[1]);
-      }
-      FOR_EACH_ENABLED_CHANNEL(*inst, chan_index) {
-         STORE(&d[chan_index], 0, chan_index);
-      }
+      exec_vector_binary(mach, inst, micro_xor, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
       break;
 
    case TGSI_OPCODE_SAD:
@@ -3081,13 +3245,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_EMIT:
-      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
-      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      emit_vertex(mach);
       break;
 
    case TGSI_OPCODE_ENDPRIM:
-      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
-      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      emit_primitive(mach);
       break;
 
    case TGSI_OPCODE_BGNFOR:
@@ -3116,11 +3278,15 @@ exec_instruction(
    case TGSI_OPCODE_BGNLOOP:
       /* push LoopMask and ContMasks */
       assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
       assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
-      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       assert(mach->LoopLabelStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      assert(mach->BreakStackTop < TGSI_EXEC_MAX_BREAK_STACK);
+
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       mach->LoopLabelStack[mach->LoopLabelStackTop++] = *pc - 1;
+      mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
+      mach->BreakType = TGSI_EXEC_BREAK_INSIDE_LOOP;
       break;
 
    case TGSI_OPCODE_ENDFOR:
@@ -3167,6 +3333,8 @@ exec_instruction(
          --mach->LoopLabelStackTop;
          assert(mach->LoopCounterStackTop > 0);
          --mach->LoopCounterStackTop;
+
+         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
       }
       UPDATE_EXEC_MASK(mach);
       break;
@@ -3190,15 +3358,14 @@ exec_instruction(
          mach->ContMask = mach->ContStack[--mach->ContStackTop];
          assert(mach->LoopLabelStackTop > 0);
          --mach->LoopLabelStackTop;
+
+         mach->BreakType = mach->BreakStack[--mach->BreakStackTop];
       }
       UPDATE_EXEC_MASK(mach);
       break;
 
    case TGSI_OPCODE_BRK:
-      /* turn off loop channels for each enabled exec channel */
-      mach->LoopMask &= ~mach->ExecMask;
-      /* Todo: if mach->LoopMask == 0, jump to end of loop */
-      UPDATE_EXEC_MASK(mach);
+      exec_break(mach);
       break;
 
    case TGSI_OPCODE_CONT:
@@ -3229,6 +3396,12 @@ exec_instruction(
       mach->ContStackTop = mach->CallStack[mach->CallStackTop].ContStackTop;
       mach->ContMask = mach->ContStack[mach->ContStackTop];
 
+      mach->SwitchStackTop = mach->CallStack[mach->CallStackTop].SwitchStackTop;
+      mach->Switch = mach->SwitchStack[mach->SwitchStackTop];
+
+      mach->BreakStackTop = mach->CallStack[mach->CallStackTop].BreakStackTop;
+      mach->BreakType = mach->BreakStack[mach->BreakStackTop];
+
       assert(mach->FuncStackTop > 0);
       mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
 
@@ -3240,11 +3413,135 @@ exec_instruction(
    case TGSI_OPCODE_NOP:
       break;
 
+   case TGSI_OPCODE_BREAKC:
+      FETCH(&r[0], 0, CHAN_X);
+      /* update LoopMask */
+      if (r[0].u[0] && (mach->ExecMask & 0x1)) {
+         mach->LoopMask &= ~0x1;
+      }
+      if (r[0].u[1] && (mach->ExecMask & 0x2)) {
+         mach->LoopMask &= ~0x2;
+      }
+      if (r[0].u[2] && (mach->ExecMask & 0x4)) {
+         mach->LoopMask &= ~0x4;
+      }
+      if (r[0].u[3] && (mach->ExecMask & 0x8)) {
+         mach->LoopMask &= ~0x8;
+      }
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_F2I:
+      exec_vector_unary(mach, inst, micro_f2i, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
+   case TGSI_OPCODE_IDIV:
+      exec_vector_binary(mach, inst, micro_idiv, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_IMAX:
+      exec_vector_binary(mach, inst, micro_imax, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_IMIN:
+      exec_vector_binary(mach, inst, micro_imin, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_INEG:
+      exec_vector_unary(mach, inst, micro_ineg, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_ISGE:
+      exec_vector_binary(mach, inst, micro_isge, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_ISHR:
+      exec_vector_binary(mach, inst, micro_ishr, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_ISLT:
+      exec_vector_binary(mach, inst, micro_islt, TGSI_EXEC_DATA_INT, TGSI_EXEC_DATA_INT);
+      break;
+
+   case TGSI_OPCODE_F2U:
+      exec_vector_unary(mach, inst, micro_f2u, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_FLOAT);
+      break;
+
+   case TGSI_OPCODE_U2F:
+      exec_vector_unary(mach, inst, micro_u2f, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UADD:
+      exec_vector_binary(mach, inst, micro_uadd, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UDIV:
+      exec_vector_binary(mach, inst, micro_udiv, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMAD:
+      exec_vector_trinary(mach, inst, micro_umad, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMAX:
+      exec_vector_binary(mach, inst, micro_umax, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMIN:
+      exec_vector_binary(mach, inst, micro_umin, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMOD:
+      exec_vector_binary(mach, inst, micro_umod, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_UMUL:
+      exec_vector_binary(mach, inst, micro_umul, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USEQ:
+      exec_vector_binary(mach, inst, micro_useq, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USGE:
+      exec_vector_binary(mach, inst, micro_usge, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USHR:
+      exec_vector_binary(mach, inst, micro_ushr, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USLT:
+      exec_vector_binary(mach, inst, micro_uslt, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_USNE:
+      exec_vector_binary(mach, inst, micro_usne, TGSI_EXEC_DATA_UINT, TGSI_EXEC_DATA_UINT);
+      break;
+
+   case TGSI_OPCODE_SWITCH:
+      exec_switch(mach, inst);
+      break;
+
+   case TGSI_OPCODE_CASE:
+      exec_case(mach, inst);
+      break;
+
+   case TGSI_OPCODE_DEFAULT:
+      exec_default(mach);
+      break;
+
+   case TGSI_OPCODE_ENDSWITCH:
+      exec_endswitch(mach);
+      break;
+
    default:
       assert( 0 );
    }
 }
 
+
 #define DEBUG_EXECUTION 0
 
 
@@ -3264,9 +3561,13 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
    mach->FuncMask = 0xf;
    mach->ExecMask = 0xf;
 
+   mach->Switch.mask = 0xf;
+
    assert(mach->CondStackTop == 0);
    assert(mach->LoopStackTop == 0);
    assert(mach->ContStackTop == 0);
+   assert(mach->SwitchStackTop == 0);
+   assert(mach->BreakStackTop == 0);
    assert(mach->CallStackTop == 0);
 
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
@@ -3323,11 +3624,11 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
                   if (j > 0) {
                      debug_printf("           ");
                   }
-                  debug_printf("(%6f, %6f, %6f, %6f)\n",
-                               temps[i].xyzw[0].f[j],
-                               temps[i].xyzw[1].f[j],
-                               temps[i].xyzw[2].f[j],
-                               temps[i].xyzw[3].f[j]);
+                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
+                               temps[i].xyzw[0].f[j], temps[i].xyzw[0].u[j],
+                               temps[i].xyzw[1].f[j], temps[i].xyzw[1].u[j],
+                               temps[i].xyzw[2].f[j], temps[i].xyzw[2].u[j],
+                               temps[i].xyzw[3].f[j], temps[i].xyzw[3].u[j]);
                }
             }
          }
@@ -3341,11 +3642,11 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
                   if (j > 0) {
                      debug_printf("           ");
                   }
-                  debug_printf("{%6f, %6f, %6f, %6f}\n",
-                               outputs[i].xyzw[0].f[j],
-                               outputs[i].xyzw[1].f[j],
-                               outputs[i].xyzw[2].f[j],
-                               outputs[i].xyzw[3].f[j]);
+                  debug_printf("(%6f %u, %6f %u, %6f %u, %6f %u)\n",
+                               outputs[i].xyzw[0].f[j], outputs[i].xyzw[0].u[j],
+                               outputs[i].xyzw[1].f[j], outputs[i].xyzw[1].u[j],
+                               outputs[i].xyzw[2].f[j], outputs[i].xyzw[2].u[j],
+                               outputs[i].xyzw[3].f[j], outputs[i].xyzw[3].u[j]);
                }
             }
          }
@@ -3367,6 +3668,8 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
    assert(mach->CondStackTop == 0);
    assert(mach->LoopStackTop == 0);
    assert(mach->ContStackTop == 0);
+   assert(mach->SwitchStackTop == 0);
+   assert(mach->BreakStackTop == 0);
    assert(mach->CallStackTop == 0);
 
    return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index fd94c1bc440..a22873e4c2b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -35,11 +36,13 @@
 extern "C" {
 #endif
 
+
 #define MAX_LABELS (4 * 1024)  /**< basically, max instructions */
 
 #define NUM_CHANNELS 4  /* R,G,B,A */
 #define QUAD_SIZE    4  /* 4 pixel/quad */
 
+
 /**
   * Registers may be treated as float, signed int or unsigned int.
   */
@@ -69,6 +72,11 @@ struct tgsi_interp_coef
    float dady[NUM_CHANNELS];
 };
 
+enum tgsi_sampler_control {      /* type of the 'control' argument in the tgsi_sampler callback */
+   tgsi_sampler_lod_bias,        /* presumably: c0 carries an LOD bias -- confirm against sampler impl */
+   tgsi_sampler_lod_explicit     /* presumably: c0 carries an explicit LOD -- confirm against sampler impl */
+};
+
 /**
  * Information for sampling textures, which must be implemented
  * by code outside the TGSI executor.
@@ -80,7 +88,8 @@ struct tgsi_sampler
                        const float s[QUAD_SIZE],
                        const float t[QUAD_SIZE],
                        const float p[QUAD_SIZE],
-                       float lodbias,
+                       const float c0[QUAD_SIZE],
+                       enum tgsi_sampler_control control,
                        float rgba[NUM_CHANNELS][QUAD_SIZE]);
 };
 
@@ -179,6 +188,7 @@ struct tgsi_exec_labels
 
 #define TGSI_EXEC_MAX_COND_NESTING  32
 #define TGSI_EXEC_MAX_LOOP_NESTING  32
+#define TGSI_EXEC_MAX_SWITCH_NESTING 32
 #define TGSI_EXEC_MAX_CALL_NESTING  32
 
 /* The maximum number of input attributes per vertex. For 2D
@@ -191,6 +201,14 @@ struct tgsi_exec_labels
  */
 #define TGSI_EXEC_MAX_CONST_BUFFER  4096
 
+/* The maximum number of vertices per primitive */
+#define TGSI_MAX_PRIM_VERTICES 6
+
+/* The maximum number of primitives to be generated */
+#define TGSI_MAX_PRIMITIVES 64
+
+/* The maximum total number of vertices */
+#define TGSI_MAX_TOTAL_VERTICES (TGSI_MAX_PRIM_VERTICES * TGSI_MAX_PRIMITIVES * PIPE_MAX_ATTRIBS)
 
 /** function call/activation record */
 struct tgsi_call_record
@@ -198,10 +216,29 @@ struct tgsi_call_record
    uint CondStackTop;
    uint LoopStackTop;
    uint ContStackTop;
+   int SwitchStackTop;
+   int BreakStackTop;
    uint ReturnAddr;
 };
 
 
+/* Switch-case block state; tgsi_exec_machine keeps the current one in Switch and saved ones in SwitchStack. */
+struct tgsi_switch_record {
+   uint mask;                          /**< execution mask (tgsi_exec_machine_run sets it to 0xf) */
+   union tgsi_exec_channel selector;   /**< a value case statements are compared to */
+   uint defaultMask;                   /**< non-execute mask for default case */
+};
+
+/* Type of tgsi_exec_machine::BreakType; BGNLOOP saves the old value on BreakStack and selects INSIDE_LOOP. */
+enum tgsi_break_type {
+   TGSI_EXEC_BREAK_INSIDE_LOOP,
+   TGSI_EXEC_BREAK_INSIDE_SWITCH
+};
+
+
+#define TGSI_EXEC_MAX_BREAK_STACK (TGSI_EXEC_MAX_LOOP_NESTING + TGSI_EXEC_MAX_SWITCH_NESTING)
+
+
 /**
  * Run-time virtual machine state for executing TGSI shader.
  */
@@ -214,8 +251,8 @@ struct tgsi_exec_machine
 
    float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
 
-   struct tgsi_exec_vector       Inputs[PIPE_MAX_ATTRIBS];
-   struct tgsi_exec_vector       Outputs[PIPE_MAX_ATTRIBS];
+   struct tgsi_exec_vector       Inputs[TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS];
+   struct tgsi_exec_vector       Outputs[TGSI_MAX_TOTAL_VERTICES];
 
    struct tgsi_exec_vector       *Addrs;
    struct tgsi_exec_vector       *Predicates;
@@ -223,12 +260,14 @@ struct tgsi_exec_machine
    struct tgsi_sampler           **Samplers;
 
    unsigned                      ImmLimit;
-   const float                   (*Consts)[4];
+   const void *Consts[PIPE_MAX_CONSTANT_BUFFERS];
    const struct tgsi_token       *Tokens;   /**< Declarations, instructions */
    unsigned                      Processor; /**< TGSI_PROCESSOR_x */
 
    /* GEOMETRY processor only. */
    unsigned                      *Primitives;
+   unsigned                       NumOutputs;
+   unsigned                       MaxGeometryShaderOutputs;
 
    /* FRAGMENT processor only. */
    const struct tgsi_interp_coef *InterpCoefs;
@@ -242,6 +281,12 @@ struct tgsi_exec_machine
    uint FuncMask;  /**< For function calls */
    uint ExecMask;  /**< = CondMask & LoopMask */
 
+   /* Current switch-case state. */
+   struct tgsi_switch_record Switch;
+
+   /* Current break type. */
+   enum tgsi_break_type BreakType;
+
    /** Condition mask stack (for nested conditionals) */
    uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
    int CondStackTop;
@@ -262,6 +307,13 @@ struct tgsi_exec_machine
    uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
    int ContStackTop;
 
+   /** Switch case stack */
+   struct tgsi_switch_record SwitchStack[TGSI_EXEC_MAX_SWITCH_NESTING];
+   int SwitchStackTop;
+
+   enum tgsi_break_type BreakStack[TGSI_EXEC_MAX_BREAK_STACK];
+   int BreakStackTop;
+
    /** Function execution mask stack (for executing subroutine code) */
    uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
    int FuncStackTop;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index be375cabb8b..de0e09cdbae 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -119,7 +119,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 1, 1, 0, 0, 0, 0, "NOT", TGSI_OPCODE_NOT },
    { 1, 1, 0, 0, 0, 0, "TRUNC", TGSI_OPCODE_TRUNC },
    { 1, 2, 0, 0, 0, 0, "SHL", TGSI_OPCODE_SHL },
-   { 1, 2, 0, 0, 0, 0, "SHR", TGSI_OPCODE_SHR },
+   { 0, 0, 0, 0, 0, 0, "", 88 },      /* removed */
    { 1, 2, 0, 0, 0, 0, "AND", TGSI_OPCODE_AND },
    { 1, 2, 0, 0, 0, 0, "OR", TGSI_OPCODE_OR },
    { 1, 2, 0, 0, 0, 0, "MOD", TGSI_OPCODE_MOD },
@@ -149,7 +149,33 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 1, 0, 0, 0, 0, "BREAKC", TGSI_OPCODE_BREAKC },
    { 0, 1, 0, 0, 0, 0, "KIL", TGSI_OPCODE_KIL },
    { 0, 0, 0, 0, 0, 0, "END", TGSI_OPCODE_END },
-   { 0, 0, 0, 0, 0, 0, "", 118 }      /* removed */
+   { 0, 0, 0, 0, 0, 0, "", 118 },     /* removed */
+   { 1, 1, 0, 0, 0, 0, "F2I", TGSI_OPCODE_F2I },
+   { 1, 2, 0, 0, 0, 0, "IDIV", TGSI_OPCODE_IDIV },
+   { 1, 2, 0, 0, 0, 0, "IMAX", TGSI_OPCODE_IMAX },
+   { 1, 2, 0, 0, 0, 0, "IMIN", TGSI_OPCODE_IMIN },
+   { 1, 1, 0, 0, 0, 0, "INEG", TGSI_OPCODE_INEG },
+   { 1, 2, 0, 0, 0, 0, "ISGE", TGSI_OPCODE_ISGE },
+   { 1, 2, 0, 0, 0, 0, "ISHR", TGSI_OPCODE_ISHR },
+   { 1, 2, 0, 0, 0, 0, "ISLT", TGSI_OPCODE_ISLT },
+   { 1, 1, 0, 0, 0, 0, "F2U", TGSI_OPCODE_F2U },
+   { 1, 1, 0, 0, 0, 0, "U2F", TGSI_OPCODE_U2F },
+   { 1, 2, 0, 0, 0, 0, "UADD", TGSI_OPCODE_UADD },
+   { 1, 2, 0, 0, 0, 0, "UDIV", TGSI_OPCODE_UDIV },
+   { 1, 3, 0, 0, 0, 0, "UMAD", TGSI_OPCODE_UMAD },
+   { 1, 2, 0, 0, 0, 0, "UMAX", TGSI_OPCODE_UMAX },
+   { 1, 2, 0, 0, 0, 0, "UMIN", TGSI_OPCODE_UMIN },
+   { 1, 2, 0, 0, 0, 0, "UMOD", TGSI_OPCODE_UMOD },
+   { 1, 2, 0, 0, 0, 0, "UMUL", TGSI_OPCODE_UMUL },
+   { 1, 2, 0, 0, 0, 0, "USEQ", TGSI_OPCODE_USEQ },
+   { 1, 2, 0, 0, 0, 0, "USGE", TGSI_OPCODE_USGE },
+   { 1, 2, 0, 0, 0, 0, "USHR", TGSI_OPCODE_USHR },
+   { 1, 2, 0, 0, 0, 0, "USLT", TGSI_OPCODE_USLT },
+   { 1, 2, 0, 0, 0, 0, "USNE", TGSI_OPCODE_USNE },
+   { 0, 1, 0, 0, 0, 0, "SWITCH", TGSI_OPCODE_SWITCH },
+   { 0, 1, 0, 0, 0, 0, "CASE", TGSI_OPCODE_CASE },
+   { 0, 0, 0, 0, 0, 0, "DEFAULT", TGSI_OPCODE_DEFAULT },
+   { 0, 0, 0, 0, 0, 0, "ENDSWITCH", TGSI_OPCODE_ENDSWITCH }
 };
 
 const struct tgsi_opcode_info *
diff --git a/src/gallium/auxiliary/tgsi/tgsi_iterate.c b/src/gallium/auxiliary/tgsi/tgsi_iterate.c
index 7b384f5e12a..0ba5fe48419 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_iterate.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_iterate.c
@@ -66,6 +66,12 @@ tgsi_iterate_shader(
                goto fail;
          break;
 
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         if (ctx->iterate_property)
+            if (!ctx->iterate_property( ctx,  &parse.FullToken.FullProperty ))
+               goto fail;
+         break;
+
       default:
          assert( 0 );
       }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_iterate.h b/src/gallium/auxiliary/tgsi/tgsi_iterate.h
index ef5a33ebce9..8d67f22c429 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_iterate.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_iterate.h
@@ -57,6 +57,11 @@ struct tgsi_iterate_context
       struct tgsi_full_immediate *imm );
 
    boolean
+   (* iterate_property)(
+      struct tgsi_iterate_context *ctx,
+      struct tgsi_full_property *prop );
+
+   boolean
    (* epilog)(
       struct tgsi_iterate_context *ctx );
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index b34263da489..e4af15c156f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -124,7 +124,6 @@ OP11(I2F)
 OP11(NOT)
 OP11(TRUNC)
 OP12(SHL)
-OP12(SHR)
 OP12(AND)
 OP12(OR)
 OP12(MOD)
@@ -146,6 +145,28 @@ OP01(IFC)
 OP01(BREAKC)
 OP01(KIL)
 OP00(END)
+OP11(F2I)
+OP12(IDIV)
+OP12(IMAX)
+OP12(IMIN)
+OP11(INEG)
+OP12(ISGE)
+OP12(ISHR)
+OP12(ISLT)
+OP11(F2U)
+OP11(U2F)
+OP12(UADD)
+OP12(UDIV)
+OP13(UMAD)
+OP12(UMAX)
+OP12(UMIN)
+OP12(UMOD)
+OP12(UMUL)
+OP12(USEQ)
+OP12(USGE)
+OP12(USHR)
+OP12(USLT)
+OP12(USNE)
 
 
 #undef OP00
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 356b4473d96..7e19e1fe36f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -60,7 +60,7 @@ tgsi_parse_end_of_tokens(
    struct tgsi_parse_context *ctx )
 {
    return ctx->Position >=
-      1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
+      ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
 }
 
 
@@ -109,6 +109,10 @@ tgsi_parse_token(
 
       next_token( ctx, &decl->Range );
 
+      if (decl->Declaration.Dimension) {
+         next_token(ctx, &decl->Dim);
+      }
+
       if( decl->Declaration.Semantic ) {
          next_token( ctx, &decl->Semantic );
       }
@@ -119,17 +123,29 @@ tgsi_parse_token(
    case TGSI_TOKEN_TYPE_IMMEDIATE:
    {
       struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
+      uint imm_count;
 
       memset(imm, 0, sizeof *imm);
       copy_token(&imm->Immediate, &token);
 
+      imm_count = imm->Immediate.NrTokens - 1;
+
       switch (imm->Immediate.DataType) {
       case TGSI_IMM_FLOAT32:
-         {
-            uint imm_count = imm->Immediate.NrTokens - 1;
-            for (i = 0; i < imm_count; i++) {
-               next_token(ctx, &imm->u[i]);
-            }
+         for (i = 0; i < imm_count; i++) {
+            next_token(ctx, &imm->u[i].Float);
+         }
+         break;
+
+      case TGSI_IMM_UINT32:
+         for (i = 0; i < imm_count; i++) {
+            next_token(ctx, &imm->u[i].Uint);
+         }
+         break;
+
+      case TGSI_IMM_INT32:
+         for (i = 0; i < imm_count; i++) {
+            next_token(ctx, &imm->u[i].Int);
          }
          break;
 
@@ -220,6 +236,22 @@ tgsi_parse_token(
       break;
    }
 
+   case TGSI_TOKEN_TYPE_PROPERTY:
+   {
+      struct tgsi_full_property *prop = &ctx->FullToken.FullProperty;
+      uint prop_count;
+
+      memset(prop, 0, sizeof *prop);
+      copy_token(&prop->Property, &token);
+
+      prop_count = prop->Property.NrTokens - 1;
+      for (i = 0; i < prop_count; i++) {
+         next_token(ctx, &prop->u[i]);
+      }
+
+      break;
+   }
+
    default:
       assert( 0 );
    }
@@ -232,8 +264,7 @@ tgsi_num_tokens(const struct tgsi_token *tokens)
    struct tgsi_parse_context ctx;
    if (tgsi_parse_init(&ctx, tokens) == TGSI_PARSE_OK) {
       unsigned len = (ctx.FullHeader.Header.HeaderSize +
-                      ctx.FullHeader.Header.BodySize +
-                      1);
+                      ctx.FullHeader.Header.BodySize);
       return len;
    }
    return 0;
@@ -253,3 +284,14 @@ tgsi_dup_tokens(const struct tgsi_token *tokens)
       memcpy(new_tokens, tokens, bytes);
    return new_tokens;
 }
+
+
+/** Allocate uninitialized storage for num_tokens tokens; NULL on size overflow or OOM. */
+struct tgsi_token *
+tgsi_alloc_tokens(unsigned num_tokens)
+{
+   /* Reject counts whose byte size would not fit in an unsigned (it used to wrap). */
+   if (num_tokens > ~0u / sizeof(struct tgsi_token))
+      return NULL;
+   return (struct tgsi_token *) MALLOC(num_tokens * sizeof(struct tgsi_token));
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index 3aa1979a63a..b45ccee2f63 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -58,6 +58,7 @@ struct tgsi_full_declaration
 {
    struct tgsi_declaration Declaration;
    struct tgsi_declaration_range Range;
+   struct tgsi_declaration_dimension Dim;
    struct tgsi_declaration_semantic Semantic;
 };
 
@@ -67,6 +68,12 @@ struct tgsi_full_immediate
    union tgsi_immediate_data u[4];
 };
 
+struct tgsi_full_property        /* parsed form of a TGSI_TOKEN_TYPE_PROPERTY token */
+{
+   struct tgsi_property   Property;   /* header token, copied by tgsi_parse_token() */
+   struct tgsi_property_data u[8];    /* payload: tgsi_parse_token() reads Property.NrTokens - 1 entries, unchecked against 8 */
+};
+
 #define TGSI_FULL_MAX_DST_REGISTERS 2
 #define TGSI_FULL_MAX_SRC_REGISTERS 4 /* TXD has 4 */
 
@@ -86,6 +93,7 @@ union tgsi_full_token
    struct tgsi_full_declaration  FullDeclaration;
    struct tgsi_full_immediate    FullImmediate;
    struct tgsi_full_instruction  FullInstruction;
+   struct tgsi_full_property     FullProperty;
 };
 
 struct tgsi_parse_context
@@ -122,6 +130,10 @@ tgsi_num_tokens(const struct tgsi_token *tokens);
 struct tgsi_token *
 tgsi_dup_tokens(const struct tgsi_token *tokens);
 
+struct tgsi_token *
+tgsi_alloc_tokens(unsigned num_tokens);
+
+
 #if defined __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
index da6ad6da04c..ad553c71a57 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c
@@ -51,7 +51,8 @@
  * Since it's pretty much impossible to form PPC vector immediates, load
  * them from memory here:
  */
-const float ppc_builtin_constants[] ALIGN16_ATTRIB = {
+PIPE_ALIGN_VAR(16) const float 
+ppc_builtin_constants[] = {
    1.0f, -128.0f, 128.0, 0.0
 };
 
@@ -293,6 +294,7 @@ emit_fetch(struct gen_context *gen,
    case TGSI_SWIZZLE_W:
       switch (reg->Register.File) {
       case TGSI_FILE_INPUT:
+      case TGSI_FILE_SYSTEM_VALUE:
          {
             int offset = (reg->Register.Index * 4 + swizzle) * 16;
             int offset_reg = emit_li_offset(gen, offset);
@@ -1173,7 +1175,8 @@ emit_declaration(
    struct ppc_function *func,
    struct tgsi_full_declaration *decl )
 {
-   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+   if( decl->Declaration.File == TGSI_FILE_INPUT ||
+       decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE ) {
 #if 0
       unsigned first, last, mask;
       unsigned i, j;
@@ -1339,6 +1342,9 @@ tgsi_emit_ppc(const struct tgsi_token *tokens,
          }
          break;
 
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         break;
+
       default:
 	 ok = 0;
          assert( 0 );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index b5d1faa897a..371f690b295 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -26,32 +26,112 @@
  **************************************************************************/
 
 #include "util/u_debug.h"
+#include "util/u_memory.h"
+#include "util/u_prim.h"
+#include "cso_cache/cso_hash.h"
 #include "tgsi_sanity.h"
 #include "tgsi_info.h"
 #include "tgsi_iterate.h"
 
-typedef uint reg_flag;
-
-#define BITS_IN_REG_FLAG (sizeof( reg_flag ) * 8)
-
-#define MAX_REGISTERS 1024
-#define MAX_REG_FLAGS ((MAX_REGISTERS + BITS_IN_REG_FLAG - 1) / BITS_IN_REG_FLAG)
+typedef struct {   /* identifies one register for the sanity checker's hashes */
+   uint file : 28;   /* register file (a TGSI_FILE_x value) */
+   /* number of indices in use: 1 or 2 (max 2 dimensions) */
+   uint dimensions : 4;
+   uint indices[2];   /* indices[1] is 0 for one-dimensional registers */
+} scan_register;
 
 struct sanity_check_ctx
 {
    struct tgsi_iterate_context iter;
+   struct cso_hash *regs_decl;      /* declared registers (scan_register *) */
+   struct cso_hash *regs_used;      /* directly referenced registers */
+   struct cso_hash *regs_ind_used;  /* indirectly addressed files, keyed by file */
 
-   reg_flag regs_decl[TGSI_FILE_COUNT][MAX_REG_FLAGS];
-   reg_flag regs_used[TGSI_FILE_COUNT][MAX_REG_FLAGS];
-   boolean regs_ind_used[TGSI_FILE_COUNT];
    uint num_imms;
    uint num_instructions;
    uint index_of_END;
 
    uint errors;
    uint warnings;
+   uint implied_array_size;   /* vertices per GS input primitive; 0 until set */
 };
 
+/** Pack the file and both indices into one hash key (bit ranges may overlap). */
+static INLINE unsigned
+scan_register_key(const scan_register *reg)
+{
+   const unsigned file_bits = reg->file;
+   const unsigned idx0_bits = reg->indices[0] << 4;
+   const unsigned idx1_bits = reg->indices[1] << 18;
+   return file_bits | idx0_bits | idx1_bits;
+}
+
+/** Describe the one-dimensional register file[index]. */
+static void
+fill_scan_register1d(scan_register *reg, uint file, uint index)
+{
+   reg->indices[1] = 0;   /* no second dimension */
+   reg->indices[0] = index;
+   reg->dimensions = 1;
+   reg->file = file;
+}
+
+/** Describe the two-dimensional register file[index1][index2]. */
+static void
+fill_scan_register2d(scan_register *reg, uint file, uint index1, uint index2)
+{
+   reg->indices[1] = index2;
+   reg->indices[0] = index1;
+   reg->dimensions = 2;
+   reg->file = file;
+}
+
+/** Fill reg from a destination operand; always uses the one-index form. */
+static void
+scan_register_dst(scan_register *reg, struct tgsi_full_dst_register *dst)
+{
+   const uint file = dst->Register.File;
+   const uint index = dst->Register.Index;
+   fill_scan_register1d(reg, file, index);
+}
+
+/**
+ * Fill reg from a source operand: the two-index form when the operand has
+ * a Dimension, the one-index form otherwise.
+ */
+static void
+scan_register_src(scan_register *reg, struct tgsi_full_src_register *src)
+{
+   const uint file = src->Register.File;
+   const uint index = src->Register.Index;
+
+   if (!src->Register.Dimension) {
+      fill_scan_register1d(reg, file, index);
+      return;
+   }
+   /* FIXME: indirect multidimensional addressing is not supported yet */
+   debug_assert(!src->Dimension.Indirect);
+   fill_scan_register2d(reg, file, index, src->Dimension.Index);
+}
+
+/** Heap-allocate a scan_register describing src (MALLOC result unchecked). */
+static scan_register *
+create_scan_register_src(struct tgsi_full_src_register *src)
+{
+   scan_register *new_reg = MALLOC(sizeof *new_reg);
+   scan_register_src(new_reg, src);
+   return new_reg;
+}
+
+/** Heap-allocate a scan_register describing dst (MALLOC result unchecked). */
+static scan_register *
+create_scan_register_dst(struct tgsi_full_dst_register *dst)
+{
+   scan_register *new_reg = MALLOC(sizeof *new_reg);
+   scan_register_dst(new_reg, dst);
+   return new_reg;
+}
+
 static void
 report_error(
    struct sanity_check_ctx *ctx,
@@ -99,12 +179,12 @@ check_file_name(
 static boolean
 is_register_declared(
    struct sanity_check_ctx *ctx,
-   uint file,
-   int index )
+   const scan_register *reg)   /* register to look up in ctx->regs_decl */
 {
-   assert( index >= 0 && index < MAX_REGISTERS );
-
-   return (ctx->regs_decl[file][index / BITS_IN_REG_FLAG] & (1 << (index % BITS_IN_REG_FLAG))) ? TRUE : FALSE;
+   void *data = cso_hash_find_data_from_template(
+      ctx->regs_decl, scan_register_key(reg),
+      (void*)reg, sizeof(scan_register));
+   return  data ? TRUE : FALSE;   /* TRUE when the lookup returned an entry */
 }
 
 static boolean
@@ -112,23 +192,37 @@ is_any_register_declared(
    struct sanity_check_ctx *ctx,
    uint file )
 {
-   uint i;
+   struct cso_hash_iter iter =
+      cso_hash_first_node(ctx->regs_decl);
 
-   for (i = 0; i < MAX_REG_FLAGS; i++)
-      if (ctx->regs_decl[file][i])
+   while (!cso_hash_iter_is_null(iter)) {
+      scan_register *reg = (scan_register *)cso_hash_iter_data(iter);
+      if (reg->file == file)
          return TRUE;
+      iter = cso_hash_iter_next(iter);
+   }
+
    return FALSE;
 }
 
 static boolean
 is_register_used(
    struct sanity_check_ctx *ctx,
-   uint file,
-   int index )
+   scan_register *reg)   /* register to look up in ctx->regs_used */
 {
-   assert( index < MAX_REGISTERS );
+   void *data = cso_hash_find_data_from_template(
+      ctx->regs_used, scan_register_key(reg),
+      reg, sizeof(scan_register));
+   return  data ? TRUE : FALSE;   /* TRUE when the lookup returned an entry */
+}
+
 
-   return (ctx->regs_used[file][index / BITS_IN_REG_FLAG] & (1 << (index % BITS_IN_REG_FLAG))) ? TRUE : FALSE;
+/** TRUE if reg's file already has an entry in ctx->regs_ind_used. */
+static boolean
+is_ind_register_used(struct sanity_check_ctx *ctx, scan_register *reg)
+{
+   const unsigned file_key = reg->file;
+   return cso_hash_contains(ctx->regs_ind_used, file_key);
 }
 
 static const char *file_names[TGSI_FILE_COUNT] =
@@ -148,31 +242,42 @@ static const char *file_names[TGSI_FILE_COUNT] =
 static boolean
 check_register_usage(
    struct sanity_check_ctx *ctx,
-   uint file,
-   int index,
+   scan_register *reg,   /* heap-allocated; consumed here (hashed or freed) */
    const char *name,
    boolean indirect_access )
 {
-   if (!check_file_name( ctx, file ))
+   if (!check_file_name( ctx, reg->file )) {
+      FREE(reg);   /* invalid file: reg is not kept */
       return FALSE;
+   }
 
    if (indirect_access) {
       /* Note that 'index' is an offset relative to the value of the
-       * address register.  No range checking done here.
-       */
-      if (!is_any_register_declared( ctx, file ))
-         report_error( ctx, "%s: Undeclared %s register", file_names[file], name );
-      ctx->regs_ind_used[file] = TRUE;
+       * address register.  No range checking done here.*/
+      reg->indices[0] = 0;   /* indirect use is tracked per file only */
+      reg->indices[1] = 0;
+      if (!is_any_register_declared( ctx, reg->file ))
+         report_error( ctx, "%s: Undeclared %s register", file_names[reg->file], name );
+      if (!is_ind_register_used(ctx, reg))
+         cso_hash_insert(ctx->regs_ind_used, reg->file, reg);
+      else
+         FREE(reg);   /* this file is already recorded */
    }
    else {
-      if (index < 0 || index >= MAX_REGISTERS) {
-         report_error( ctx, "%s[%d]: Invalid %s index", file_names[file], index, name );
-         return FALSE;
+      if (!is_register_declared( ctx, reg )) {
+         if (reg->dimensions == 2) {
+            report_error( ctx, "%s[%d][%d]: Undeclared %s register", file_names[reg->file],
+                          reg->indices[0], reg->indices[1], name );
+         }
+         else {
+            report_error( ctx, "%s[%d]: Undeclared %s register", file_names[reg->file],
+                          reg->indices[0], name );
+         }
       }
-
-      if (!is_register_declared( ctx, file, index ))
-         report_error( ctx, "%s[%d]: Undeclared %s register", file_names[file], index, name );
-      ctx->regs_used[file][index / BITS_IN_REG_FLAG] |= (1 << (index % BITS_IN_REG_FLAG));
+      if (!is_register_used( ctx, reg ))
+         cso_hash_insert(ctx->regs_used, scan_register_key(reg), reg);
+      else
+         FREE(reg);   /* this register is already recorded as used */
    }
    return TRUE;
 }
@@ -210,35 +315,34 @@ iter_instruction(
     * Mark the registers as used.
     */
    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
+      scan_register *reg = create_scan_register_dst(&inst->Dst[i]);
       check_register_usage(
          ctx,
-         inst->Dst[i].Register.File,
-         inst->Dst[i].Register.Index,
+         reg,
          "destination",
          FALSE );
+      if (!inst->Dst[i].Register.WriteMask) {
+         report_error(ctx, "Destination register has empty writemask");
+      }
    }
    for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
+      scan_register *reg = create_scan_register_src(&inst->Src[i]);
       check_register_usage(
          ctx,
-         inst->Src[i].Register.File,
-         inst->Src[i].Register.Index,
+         reg,
          "source",
          (boolean)inst->Src[i].Register.Indirect );
       if (inst->Src[i].Register.Indirect) {
-         uint file;
-         int index;
+         scan_register *ind_reg = MALLOC(sizeof(scan_register));
 
-         file = inst->Src[i].Indirect.File;
-         index = inst->Src[i].Indirect.Index;
+         fill_scan_register1d(ind_reg,
+                              inst->Src[i].Indirect.File,
+                              inst->Src[i].Indirect.Index);
          check_register_usage(
             ctx,
-            file,
-            index,
+            ind_reg,
             "indirect",
             FALSE );
-         if (!(file == TGSI_FILE_ADDRESS || file == TGSI_FILE_LOOP) || index != 0) {
-            report_warning(ctx, "Indirect register neither ADDR[0] nor LOOP[0]");
-         }
       }
    }
 
@@ -266,6 +370,19 @@ iter_instruction(
    return TRUE;
 }
 
+/** Record reg as declared (ctx->regs_decl takes it); error on redeclaration. */
+static void
+check_and_declare(struct sanity_check_ctx *ctx, scan_register *reg)
+{
+   const boolean seen_before = is_register_declared( ctx, reg);
+
+   if (seen_before)
+      report_error( ctx, "%s[%u]: The same register declared more than once",
+                    file_names[reg->file], reg->indices[0] );
+   cso_hash_insert(ctx->regs_decl, scan_register_key(reg), reg);
+}
+
+
 static boolean
 iter_declaration(
    struct tgsi_iterate_context *iter,
@@ -287,9 +404,25 @@ iter_declaration(
    if (!check_file_name( ctx, file ))
       return TRUE;
    for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-      if (is_register_declared( ctx, file, i ))
-         report_error( ctx, "%s[%u]: The same register declared more than once", file_names[file], i );
-      ctx->regs_decl[file][i / BITS_IN_REG_FLAG] |= (1 << (i % BITS_IN_REG_FLAG));
+      /* declared TGSI_FILE_INPUT's for geometry processor
+       * have an implied second dimension */
+      if (file == TGSI_FILE_INPUT &&
+          ctx->iter.processor.Processor == TGSI_PROCESSOR_GEOMETRY) {
+         uint vert;
+         for (vert = 0; vert < ctx->implied_array_size; ++vert) {
+            scan_register *reg = MALLOC(sizeof(scan_register));
+            fill_scan_register2d(reg, file, i, vert);
+            check_and_declare(ctx, reg);
+         }
+      } else {
+         scan_register *reg = MALLOC(sizeof(scan_register));
+         if (decl->Declaration.Dimension) {
+            fill_scan_register2d(reg, file, i, decl->Dim.Index2D);
+         } else {
+            fill_scan_register1d(reg, file, i);
+         }
+         check_and_declare(ctx, reg);
+      }
    }
 
    return TRUE;
@@ -301,8 +434,7 @@ iter_immediate(
    struct tgsi_full_immediate *imm )
 {
    struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
-
-   assert( ctx->num_imms < MAX_REGISTERS );
+   scan_register *reg;
 
    /* No immediates allowed after the first instruction.
     */
@@ -311,12 +443,16 @@ iter_immediate(
 
    /* Mark the register as declared.
     */
-   ctx->regs_decl[TGSI_FILE_IMMEDIATE][ctx->num_imms / BITS_IN_REG_FLAG] |= (1 << (ctx->num_imms % BITS_IN_REG_FLAG));
+   reg = MALLOC(sizeof(scan_register));
+   fill_scan_register1d(reg, TGSI_FILE_IMMEDIATE, ctx->num_imms);
+   cso_hash_insert(ctx->regs_decl, scan_register_key(reg), reg);
    ctx->num_imms++;
 
    /* Check data type validity.
     */
-   if (imm->Immediate.DataType != TGSI_IMM_FLOAT32) {
+   if (imm->Immediate.DataType != TGSI_IMM_FLOAT32 &&
+       imm->Immediate.DataType != TGSI_IMM_UINT32 &&
+       imm->Immediate.DataType != TGSI_IMM_INT32) {
       report_error( ctx, "(%u): Invalid immediate data type", imm->Immediate.DataType );
       return TRUE;
    }
@@ -324,12 +460,26 @@ iter_immediate(
    return TRUE;
 }
 
+
+/** For geometry shaders, derive the implied input array size from the
+ *  GS_INPUT_PRIM property; every other property is ignored. */
+static boolean
+iter_property(
+   struct tgsi_iterate_context *iter,
+   struct tgsi_full_property *prop )
+{
+   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
+   const boolean is_gs = iter->processor.Processor == TGSI_PROCESSOR_GEOMETRY;
+   if (is_gs && prop->Property.PropertyName == TGSI_PROPERTY_GS_INPUT_PRIM)
+      ctx->implied_array_size = u_vertices_per_prim(prop->u[0].Data);
+   return TRUE;
+}
+
 static boolean
 epilog(
    struct tgsi_iterate_context *iter )
 {
    struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
-   uint file;
 
    /* There must be an END instruction somewhere.
     */
@@ -339,13 +489,17 @@ epilog(
 
    /* Check if all declared registers were used.
     */
-   for (file = TGSI_FILE_NULL; file < TGSI_FILE_COUNT; file++) {
-      uint i;
-
-      for (i = 0; i < MAX_REGISTERS; i++) {
-         if (is_register_declared( ctx, file, i ) && !is_register_used( ctx, file, i ) && !ctx->regs_ind_used[file]) {
-            report_warning( ctx, "%s[%u]: Register never used", file_names[file], i );
+   {
+      struct cso_hash_iter iter =
+         cso_hash_first_node(ctx->regs_decl);
+
+      while (!cso_hash_iter_is_null(iter)) {
+         scan_register *reg = (scan_register *)cso_hash_iter_data(iter);
+         if (!is_register_used(ctx, reg) && !is_ind_register_used(ctx, reg)) {
+            report_warning( ctx, "%s[%u]: Register never used",
+                            file_names[reg->file], reg->indices[0] );
          }
+         iter = cso_hash_iter_next(iter);
       }
    }
 
@@ -357,6 +511,19 @@ epilog(
    return TRUE;
 }
 
+/** Free every scan_register stored in hash, then delete the hash itself. */
+static void regs_hash_destroy(struct cso_hash *hash)
+{
+   struct cso_hash_iter node;
+   for (node = cso_hash_first_node(hash); !cso_hash_iter_is_null(node); ) {
+      scan_register *entry = (scan_register *)cso_hash_iter_data(node);
+      node = cso_hash_erase(hash, node);
+      assert(entry->file < TGSI_FILE_COUNT);
+      FREE(entry);
+   }
+   cso_hash_delete(hash);
+}
+
 boolean
 tgsi_sanity_check(
    const struct tgsi_token *tokens )
@@ -367,20 +534,26 @@ tgsi_sanity_check(
    ctx.iter.iterate_instruction = iter_instruction;
    ctx.iter.iterate_declaration = iter_declaration;
    ctx.iter.iterate_immediate = iter_immediate;
+   ctx.iter.iterate_property = iter_property;
    ctx.iter.epilog = epilog;
 
-   memset( ctx.regs_decl, 0, sizeof( ctx.regs_decl ) );
-   memset( ctx.regs_used, 0, sizeof( ctx.regs_used ) );
-   memset( ctx.regs_ind_used, 0, sizeof( ctx.regs_ind_used ) );
+   ctx.regs_decl = cso_hash_create();
+   ctx.regs_used = cso_hash_create();
+   ctx.regs_ind_used = cso_hash_create();
+
    ctx.num_imms = 0;
    ctx.num_instructions = 0;
    ctx.index_of_END = ~0;
 
    ctx.errors = 0;
    ctx.warnings = 0;
+   ctx.implied_array_size = 0;
 
    if (!tgsi_iterate_shader( tokens, &ctx.iter ))
       return FALSE;
 
+   regs_hash_destroy(ctx.regs_decl);
+   regs_hash_destroy(ctx.regs_used);
+   regs_hash_destroy(ctx.regs_ind_used);
    return ctx.errors == 0;
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index a5d2db04ec1..232fc537c1d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -97,15 +97,14 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                for (i = 0; i < fullinst->Instruction.NumSrcRegs; i++) {
                   const struct tgsi_full_src_register *src =
                      &fullinst->Src[i];
-                  if (src->Register.File == TGSI_FILE_INPUT) {
+                  if (src->Register.File == TGSI_FILE_INPUT ||
+                      src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
                      const int ind = src->Register.Index;
                      if (info->input_semantic_name[ind] == TGSI_SEMANTIC_FOG) {
-                        if (src->Register.SwizzleX == TGSI_SWIZZLE_X) {
-                           info->uses_fogcoord = TRUE;
-                        }
-                        else if (src->Register.SwizzleX == TGSI_SWIZZLE_Y) {
-                           info->uses_frontfacing = TRUE;
-                        }
+                        info->uses_fogcoord = TRUE;
+                     }
+                     else if (info->input_semantic_name[ind] == TGSI_SEMANTIC_FACE) {
+                        info->uses_frontfacing = TRUE;
                      }
                   }
                }
@@ -128,25 +127,30 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                info->file_count[file]++;
                info->file_max[file] = MAX2(info->file_max[file], (int)reg);
 
-               if (file == TGSI_FILE_INPUT) {
+               if (file == TGSI_FILE_INPUT || file == TGSI_FILE_SYSTEM_VALUE) {
                   info->input_semantic_name[reg] = (ubyte)fulldecl->Semantic.Name;
                   info->input_semantic_index[reg] = (ubyte)fulldecl->Semantic.Index;
                   info->input_interpolate[reg] = (ubyte)fulldecl->Declaration.Interpolate;
+                  info->input_cylindrical_wrap[reg] = (ubyte)fulldecl->Declaration.CylindricalWrap;
                   info->num_inputs++;
                }
                else if (file == TGSI_FILE_OUTPUT) {
                   info->output_semantic_name[reg] = (ubyte)fulldecl->Semantic.Name;
                   info->output_semantic_index[reg] = (ubyte)fulldecl->Semantic.Index;
                   info->num_outputs++;
-               }
 
-               /* special case */
-               if (procType == TGSI_PROCESSOR_FRAGMENT &&
-                   file == TGSI_FILE_OUTPUT &&
-                   fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
-                  info->writes_z = TRUE;
+                  /* extra info for special outputs */
+                  if (procType == TGSI_PROCESSOR_FRAGMENT &&
+                      fulldecl->Semantic.Name == TGSI_SEMANTIC_POSITION) {
+                     info->writes_z = TRUE;
+                  }
+                  if (procType == TGSI_PROCESSOR_VERTEX &&
+                      fulldecl->Semantic.Name == TGSI_SEMANTIC_EDGEFLAG) {
+                     info->writes_edgeflag = TRUE;
+                  }
                }
-            }
+
+             }
          }
          break;
 
@@ -160,6 +164,19 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
             info->file_max[file] = MAX2(info->file_max[file], (int)reg);
          }
          break;
+      case TGSI_TOKEN_TYPE_PROPERTY:
+      {
+         const struct tgsi_full_property *fullprop
+            = &parse.FullToken.FullProperty;
+
+         info->properties[info->num_properties].name =
+            fullprop->Property.PropertyName;
+         memcpy(info->properties[info->num_properties].data,
+                fullprop->u, 8 * sizeof(unsigned));;
+
+         ++info->num_properties;
+      }
+      break;
 
       default:
          assert( 0 );
@@ -211,7 +228,8 @@ tgsi_is_passthrough_shader(const struct tgsi_token *tokens)
 
             /* Do a whole bunch of checks for a simple move */
             if (fullinst->Instruction.Opcode != TGSI_OPCODE_MOV ||
-                src->Register.File != TGSI_FILE_INPUT ||
+                (src->Register.File != TGSI_FILE_INPUT &&
+                 src->Register.File != TGSI_FILE_SYSTEM_VALUE) ||
                 dst->Register.File != TGSI_FILE_OUTPUT ||
                 src->Register.Index != dst->Register.Index ||
 
@@ -235,6 +253,8 @@ tgsi_is_passthrough_shader(const struct tgsi_token *tokens)
          /* fall-through */
       case TGSI_TOKEN_TYPE_IMMEDIATE:
          /* fall-through */
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         /* fall-through */
       default:
          ; /* no-op */
       }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 8a7ee0c7e4f..741aa7d5c42 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -33,7 +33,6 @@
 #include "pipe/p_state.h"
 #include "pipe/p_shader_tokens.h"
 
-
 /**
  * Shader summary info
  */
@@ -46,6 +45,7 @@ struct tgsi_shader_info
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
    ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_cylindrical_wrap[PIPE_MAX_SHADER_INPUTS];
    ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */
    ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
 
@@ -58,11 +58,17 @@ struct tgsi_shader_info
    uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
 
    boolean writes_z;  /**< does fragment shader write Z value? */
+   boolean writes_edgeflag; /**< vertex shader outputs edgeflag */
    boolean uses_kill;  /**< KIL or KILP instruction used? */
    boolean uses_fogcoord; /**< fragment shader uses fog coord? */
    boolean uses_frontfacing; /**< fragment shader uses front/back-face flag? */
-};
 
+   struct {
+      unsigned name;
+      unsigned data[8];
+   } properties[TGSI_PROPERTY_COUNT];
+   uint num_properties;
+};
 
 extern void
 tgsi_scan_shader(const struct tgsi_token *tokens,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 76051ea0d8e..a85cc4659e0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -2,6 +2,7 @@
  * 
  * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -1288,6 +1289,7 @@ emit_fetch(
          break;
 
       case TGSI_FILE_INPUT:
+      case TGSI_FILE_SYSTEM_VALUE:
          emit_inputf(
             func,
             xmm,
@@ -1417,13 +1419,13 @@ fetch_texel( struct tgsi_sampler **sampler,
                 sampler, *sampler,
                 store );
 
-   debug_printf("lodbias %f\n", store[12]);
-
    for (j = 0; j < 4; j++)
-      debug_printf("sample %d texcoord %f %f\n", 
+      debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
                    j, 
                    store[0+j],
-                   store[4+j]);
+                   store[4+j],
+                   store[8 + j],
+                   store[12 + j]);
 #endif
 
    {
@@ -1432,7 +1434,8 @@ fetch_texel( struct tgsi_sampler **sampler,
                               &store[0],  /* s */
                               &store[4],  /* t */
                               &store[8],  /* r */
-                              store[12],  /* lodbias */
+                              &store[12], /* lodbias */
+                              tgsi_sampler_lod_bias,
                               rgba);      /* results */
 
       memcpy( store, rgba, 16 * sizeof(float));
@@ -2143,40 +2146,50 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_XPD:
+      /* Note: we do all stores after all operands have been fetched
+       * to avoid src/dst register aliasing issues for an instruction
+       * such as:  XPD TEMP[2].xyz, TEMP[0], TEMP[2];
+       */
       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( func, *inst, 1, 1, CHAN_Z );
-         FETCH( func, *inst, 3, 0, CHAN_Z );
+         FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
+         FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
       }
       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 4, 1, CHAN_Y );
+         FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
+         FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_MOV( func, 2, 0 );
-         emit_mul( func, 2, 1 );
-         emit_MOV( func, 5, 3 );
-         emit_mul( func, 5, 4 );
-         emit_sub( func, 2, 5 );
-         STORE( func, *inst, 2, 0, CHAN_X );
+         emit_MOV( func, 7, 0 );  /* xmm[7] = xmm[0] */
+         emit_mul( func, 7, 1 );  /* xmm[7] = xmm[7] * xmm[1] */
+         emit_MOV( func, 5, 3 );  /* xmm[5] = xmm[3] */
+         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
+         emit_sub( func, 7, 5 );  /* xmm[7] = xmm[7] - xmm[5] */
+         /* store xmm[7] in dst.x below */
       }
       if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 2, 1, CHAN_X );
-         FETCH( func, *inst, 5, 0, CHAN_X );
+         FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
+         FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         emit_mul( func, 3, 2 );
-         emit_mul( func, 1, 5 );
-         emit_sub( func, 3, 1 );
-         STORE( func, *inst, 3, 0, CHAN_Y );
+         emit_mul( func, 3, 2 );  /* xmm[3] = xmm[3] * xmm[2] */
+         emit_mul( func, 1, 5 );  /* xmm[1] = xmm[1] * xmm[5] */
+         emit_sub( func, 3, 1 );  /* xmm[3] = xmm[3] - xmm[1] */
+         /* store xmm[3] in dst.y below */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_mul( func, 5, 4 );
-         emit_mul( func, 0, 2 );
-         emit_sub( func, 5, 0 );
-         STORE( func, *inst, 5, 0, CHAN_Z );
+         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
+         emit_mul( func, 0, 2 );  /* xmm[0] = xmm[0] * xmm[2] */
+         emit_sub( func, 5, 0 );  /* xmm[5] = xmm[5] - xmm[0] */
+         STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
 	 emit_tempf(
@@ -2505,7 +2518,7 @@ emit_instruction(
       break;
 
    case TGSI_OPCODE_TXL:
-      emit_tex( func, inst, TRUE, FALSE );
+      return 0;
       break;
 
    case TGSI_OPCODE_TXP:
@@ -2577,7 +2590,7 @@ emit_instruction(
       return 0;
       break;
 
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       return 0;
       break;
 
@@ -2633,7 +2646,8 @@ emit_declaration(
    struct x86_function *func,
    struct tgsi_full_declaration *decl )
 {
-   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+   if( decl->Declaration.File == TGSI_FILE_INPUT ||
+       decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE ) {
       unsigned first, last, mask;
       unsigned i, j;
 
@@ -2952,6 +2966,9 @@ tgsi_emit_sse2(
             num_immediates++;
          }
          break;
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         /* we just ignore them for now */
+         break;
 
       default:
 	 ok = 0;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index eb376fa9572..f918151daaa 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -27,6 +27,9 @@
 
 #include "util/u_debug.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
 #include "tgsi_text.h"
 #include "tgsi_build.h"
 #include "tgsi_info.h"
@@ -59,6 +62,23 @@ static boolean uprcase( char c )
    return c;
 }
 
+/*
+ * Compare str1 (assumed already uppercase) with str2, ignoring str2's case.
+ * NOTE(review): stops at the shorter string, so a prefix also yields TRUE.
+ */
+static int
+streq_nocase_uprcase(const char *str1,
+                     const char *str2)
+{
+   while (*str1 && *str2) {
+      if (*str1 != uprcase(*str2))
+         return FALSE;
+      str1++;
+      str2++;
+   }
+   return TRUE;   /* reached the end of either string without a mismatch */
+}
+
 static boolean str_match_no_case( const char **pcur, const char *str )
 {
    const char *cur = *pcur;
@@ -110,6 +130,20 @@ static boolean parse_uint( const char **pcur, uint *val )
    return FALSE;
 }
 
+/* Copy the leading is_alpha_underscore() run at *pcur into ret, NUL-terminated. */
+static boolean parse_identifier( const char **pcur, char *ret )
+{
+   const char *cur = *pcur;
+   int i = 0;
+   if (!is_alpha_underscore( cur ))
+      return FALSE;   /* no identifier here; *pcur and ret are left untouched */
+   while (is_alpha_underscore( cur ))
+      ret[i++] = *cur++;
+   ret[i] = '\0';   /* the length is not returned, so terminate the string */
+   *pcur = cur;
+   return TRUE;
+}
+
 /* Parse floating point.
  */
 static boolean parse_float( const char **pcur, float *val )
@@ -163,11 +197,26 @@ struct translate_ctx
    struct tgsi_token *tokens_cur;
    struct tgsi_token *tokens_end;
    struct tgsi_header *header;
+   unsigned processor : 4;
+   int implied_array_size : 5;
 };
 
 static void report_error( struct translate_ctx *ctx, const char *msg )
 {
-   debug_printf( "\nError: %s", msg );
+   int line = 1;
+   int column = 1;
+   const char *itr = ctx->text;
+   /* Derive the 1-based line/column of ctx->cur by rescanning from ctx->text. */
+   while (itr != ctx->cur) {
+      if (*itr == '\n') {
+         column = 0;   /* the increment below makes the next char column 1 */
+         ++line;
+      }
+      ++column;
+      ++itr;
+   }
+
+   debug_printf( "\nTGSI asm error: %s [%d : %d] \n", msg, line, column );
 }
 
 /* Parse shader header.
@@ -199,6 +248,7 @@ static boolean parse_header( struct translate_ctx *ctx )
    if (ctx->tokens_cur >= ctx->tokens_end)
       return FALSE;
    *(struct tgsi_processor *) ctx->tokens_cur++ = tgsi_build_processor( processor, ctx->header );
+   ctx->processor = processor;
 
    return TRUE;
 }
@@ -229,7 +279,8 @@ static const char *file_names[TGSI_FILE_COUNT] =
    "ADDR",
    "IMM",
    "LOOP",
-   "PRED"
+   "PRED",
+   "SV"
 };
 
 static boolean
@@ -294,92 +345,36 @@ parse_opt_writemask(
    return TRUE;
 }
 
-/* <register_file_bracket> ::= <file> `['
- */
 static boolean
-parse_register_file_bracket(
-   struct translate_ctx *ctx,
-   uint *file )
-{
-   if (!parse_file( &ctx->cur, file )) {
-      report_error( ctx, "Unknown register file" );
-      return FALSE;
-   }
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != '[') {
-      report_error( ctx, "Expected `['" );
-      return FALSE;
-   }
-   ctx->cur++;
-   return TRUE;
-}
+parse_register_dst( struct translate_ctx *ctx,
+                    uint *file,
+                    int *index );
 
-/* <register_file_bracket_index> ::= <register_file_bracket> <uint>
- */
-static boolean
-parse_register_file_bracket_index(
-   struct translate_ctx *ctx,
-   uint *file,
-   int *index )
-{
-   uint uindex;
+struct parsed_src_bracket {   /* result of parsing one bracketed source index */
+   int index;   /* literal index, or the signed offset when indirect */
 
-   if (!parse_register_file_bracket( ctx, file ))
-      return FALSE;
-   eat_opt_white( &ctx->cur );
-   if (!parse_uint( &ctx->cur, &uindex )) {
-      report_error( ctx, "Expected literal unsigned integer" );
-      return FALSE;
-   }
-   *index = (int) uindex;
-   return TRUE;
-}
+   uint ind_file;   /* file of the indirect register; TGSI_FILE_NULL if none */
+   int ind_index;   /* index of the indirect register */
+   uint ind_comp;   /* TGSI_SWIZZLE_x component of the indirect register */
+};
 
-/* Parse destination register operand.
- *    <register_dst> ::= <register_file_bracket_index> `]'
- */
-static boolean
-parse_register_dst(
-   struct translate_ctx *ctx,
-   uint *file,
-   int *index )
-{
-   if (!parse_register_file_bracket_index( ctx, file, index ))
-      return FALSE;
-   eat_opt_white( &ctx->cur );
-   if (*ctx->cur != ']') {
-      report_error( ctx, "Expected `]'" );
-      return FALSE;
-   }
-   ctx->cur++;
-   return TRUE;
-}
 
-/* Parse source register operand.
- *    <register_src> ::= <register_file_bracket_index> `]' |
- *                       <register_file_bracket> <register_dst> [`.' (`x' | `y' | `z' | `w')] `]' |
- *                       <register_file_bracket> <register_dst> [`.' (`x' | `y' | `z' | `w')] `+' <uint> `]' |
- *                       <register_file_bracket> <register_dst> [`.' (`x' | `y' | `z' | `w')] `-' <uint> `]'
- */
 static boolean
-parse_register_src(
+parse_register_src_bracket(
    struct translate_ctx *ctx,
-   uint *file,
-   int *index,
-   uint *ind_file,
-   int *ind_index,
-   uint *ind_comp)
+   struct parsed_src_bracket *brackets)
 {
    const char *cur;
    uint uindex;
 
-   *ind_comp = TGSI_SWIZZLE_X;
-   if (!parse_register_file_bracket( ctx, file ))
-      return FALSE;
+   memset(brackets, 0, sizeof(struct parsed_src_bracket));
+
    eat_opt_white( &ctx->cur );
+
    cur = ctx->cur;
-   if (parse_file( &cur, ind_file )) {
-      if (!parse_register_dst( ctx, ind_file, ind_index ))
+   if (parse_file( &cur, &brackets->ind_file )) {
+      if (!parse_register_dst( ctx, &brackets->ind_file,
+                               &brackets->ind_index ))
          return FALSE;
       eat_opt_white( &ctx->cur );
 
@@ -389,16 +384,16 @@ parse_register_src(
 
          switch (uprcase(*ctx->cur)) {
          case 'X':
-            *ind_comp = TGSI_SWIZZLE_X;
+            brackets->ind_comp = TGSI_SWIZZLE_X;
             break;
          case 'Y':
-            *ind_comp = TGSI_SWIZZLE_Y;
+            brackets->ind_comp = TGSI_SWIZZLE_Y;
             break;
          case 'Z':
-            *ind_comp = TGSI_SWIZZLE_Z;
+            brackets->ind_comp = TGSI_SWIZZLE_Z;
             break;
          case 'W':
-            *ind_comp = TGSI_SWIZZLE_W;
+            brackets->ind_comp = TGSI_SWIZZLE_W;
             break;
          default:
             report_error(ctx, "Expected indirect register swizzle component `x', `y', `z' or `w'");
@@ -419,12 +414,12 @@ parse_register_src(
             return FALSE;
          }
          if (negate)
-            *index = -(int) uindex;
+            brackets->index = -(int) uindex;
          else
-            *index = (int) uindex;
+            brackets->index = (int) uindex;
       }
       else {
-         *index = 0;
+         brackets->index = 0;
       }
    }
    else {
@@ -432,9 +427,9 @@ parse_register_src(
          report_error( ctx, "Expected literal unsigned integer" );
          return FALSE;
       }
-      *index = (int) uindex;
-      *ind_file = TGSI_FILE_NULL;
-      *ind_index = 0;
+      brackets->index = (int) uindex;
+      brackets->ind_file = TGSI_FILE_NULL;
+      brackets->ind_index = 0;
    }
    eat_opt_white( &ctx->cur );
    if (*ctx->cur != ']') {
@@ -445,20 +440,123 @@ parse_register_src(
    return TRUE;
 }
 
-/* Parse register declaration.
- *    <register_dcl> ::= <register_file_bracket_index> `]' |
- *                       <register_file_bracket_index> `..' <index> `]'
+static boolean
+parse_opt_register_src_bracket(
+   struct translate_ctx *ctx,
+   struct parsed_src_bracket *brackets,
+   int *parsed_brackets)
+{
+   const char *cur = ctx->cur;
+   /* Parse one more `[...]' group if present; *parsed_brackets is set to 1 when it was, 0 otherwise. */
+   *parsed_brackets = 0;
+
+   eat_opt_white( &cur );
+   if (cur[0] == '[') {
+      ++cur;
+      ctx->cur = cur;   /* ctx->cur is only advanced once a `[' has been seen */
+
+      if (!parse_register_src_bracket(ctx, brackets))
+         return FALSE;
+
+      *parsed_brackets = 1;
+   }
+
+   return TRUE;   /* no bracket at all is still a success */
+}
+
+/* <register_file_bracket> ::= <file> `['
  */
 static boolean
-parse_register_dcl(
+parse_register_file_bracket(
+   struct translate_ctx *ctx,
+   uint *file )
+{
+   if (!parse_file( &ctx->cur, file )) {
+      report_error( ctx, "Unknown register file" );
+      return FALSE;
+   }
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != '[') {
+      report_error( ctx, "Expected `['" );
+      return FALSE;
+   }
+   ctx->cur++;
+   return TRUE;
+}
+
+/* <register_file_bracket_index> ::= <register_file_bracket> <uint>
+ */
+static boolean
+parse_register_file_bracket_index(
    struct translate_ctx *ctx,
    uint *file,
-   int *first,
-   int *last )
+   int *index )
+{
+   uint uindex;
+
+   if (!parse_register_file_bracket( ctx, file ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   if (!parse_uint( &ctx->cur, &uindex )) {
+      report_error( ctx, "Expected literal unsigned integer" );
+      return FALSE;
+   }
+   *index = (int) uindex;
+   return TRUE;
+}
+
+/* Parse source register operand.
+ *    <register_src> ::= <register_file_bracket_index> `]' |
+ *                       <register_file_bracket> <register_dst> [`.' (`x' | `y' | `z' | `w')] `]' |
+ *                       <register_file_bracket> <register_dst> [`.' (`x' | `y' | `z' | `w')] `+' <uint> `]' |
+ *                       <register_file_bracket> <register_dst> [`.' (`x' | `y' | `z' | `w')] `-' <uint> `]'
+ */
+static boolean
+parse_register_src(
+   struct translate_ctx *ctx,
+   uint *file,
+   struct parsed_src_bracket *brackets)
+{
+   /* <file> `[' is parsed here; the bracket contents are left to parse_register_src_bracket(). */
+   brackets->ind_comp = TGSI_SWIZZLE_X;   /* note: parse_register_src_bracket() memsets *brackets again */
+   if (!parse_register_file_bracket( ctx, file ))
+      return FALSE;
+   if (!parse_register_src_bracket( ctx, brackets ))
+       return FALSE;
+
+   return TRUE;
+}
+
+struct parsed_dcl_bracket {
+   uint first;   /* first index of the range (0 for an empty `[]') */
+   uint last;    /* value after `..', else same as first (implied size - 1 for an empty `[]') */
+};
+
+static boolean
+parse_register_dcl_bracket(
+   struct translate_ctx *ctx,
+   struct parsed_dcl_bracket *bracket)
 {
-   if (!parse_register_file_bracket_index( ctx, file, first ))
+   uint uindex;
+   memset(bracket, 0, sizeof(struct parsed_dcl_bracket));
+
+   eat_opt_white( &ctx->cur );
+
+   if (!parse_uint( &ctx->cur, &uindex )) {
+      /* it can be an empty bracket [] which means its range
+       * is from 0 to some implied size */
+      if (ctx->cur[0] == ']' && ctx->implied_array_size != 0) {
+         bracket->first = 0;
+         bracket->last = ctx->implied_array_size - 1;
+         goto cleanup;
+      }
+      report_error( ctx, "Expected literal unsigned integer" );
       return FALSE;
+   }
+   bracket->first = uindex;
+
    eat_opt_white( &ctx->cur );
+
    if (ctx->cur[0] == '.' && ctx->cur[1] == '.') {
       uint uindex;
 
@@ -468,12 +566,14 @@ parse_register_dcl(
          report_error( ctx, "Expected literal integer" );
          return FALSE;
       }
-      *last = (int) uindex;
+      bracket->last = (int) uindex;
       eat_opt_white( &ctx->cur );
    }
    else {
-      *last = *first;
+      bracket->last = bracket->first;
    }
+
+cleanup:
    if (*ctx->cur != ']') {
       report_error( ctx, "Expected `]' or `..'" );
       return FALSE;
@@ -482,6 +582,72 @@ parse_register_dcl(
    return TRUE;
 }
 
+/* Parse register declaration: <file> `[' plus one or two bracket bodies.
+ *    <register_dcl> ::= <register_file_bracket> <dcl_bracket> |
+ *                       <register_file_bracket> <dcl_bracket> `[' <dcl_bracket>
+ */
+static boolean
+parse_register_dcl(
+   struct translate_ctx *ctx,
+   uint *file,
+   struct parsed_dcl_bracket *brackets,
+   int *num_brackets)
+{
+   const char *cur;
+   /* <dcl_bracket> above is whatever parse_register_dcl_bracket() accepts; brackets[] needs room for two entries. */
+   *num_brackets = 0;
+
+   if (!parse_register_file_bracket( ctx, file ))
+      return FALSE;
+   if (!parse_register_dcl_bracket( ctx, &brackets[0] ))
+      return FALSE;
+
+   *num_brackets = 1;
+
+   cur = ctx->cur;
+   eat_opt_white( &cur );
+
+   if (cur[0] == '[') {
+      ++cur;
+      ctx->cur = cur;
+      if (!parse_register_dcl_bracket( ctx, &brackets[1] ))
+         return FALSE;
+      /* For geometry shader inputs we don't really care about
+       * the first bracket: it's always the size of the
+       * input primitive. So we want to declare just
+       * the index relevant to the semantics, which is in
+       * the second bracket. */
+      if (ctx->processor == TGSI_PROCESSOR_GEOMETRY && *file == TGSI_FILE_INPUT) {
+         brackets[0] = brackets[1];
+         *num_brackets = 1;
+      } else {
+         *num_brackets = 2;
+      }
+   }
+
+   return TRUE;
+}
+
+
+/* Parse destination register operand.
+ *    <register_dst> ::= <register_file_bracket_index> `]'
+ */
+static boolean
+parse_register_dst(
+   struct translate_ctx *ctx,
+   uint *file,   /* out: register file, filled in by parse_register_file_bracket_index() */
+   int *index )   /* out: literal index found inside the brackets */
+{
+   if (!parse_register_file_bracket_index( ctx, file, index ))
+      return FALSE;
+   eat_opt_white( &ctx->cur );
+   if (*ctx->cur != ']') {
+      report_error( ctx, "Expected `]'" );
+      return FALSE;
+   }
+   ctx->cur++;   /* step past the `]' */
+   return TRUE;
+}
 
 static boolean
 parse_dst_operand(
@@ -551,37 +717,45 @@ parse_src_operand(
    struct tgsi_full_src_register *src )
 {
    uint file;
-   int index;
-   uint ind_file;
-   int ind_index;
-   uint ind_comp;
    uint swizzle[4];
    boolean parsed_swizzle;
+   struct parsed_src_bracket bracket[2];
+   int parsed_opt_brackets;
 
    if (*ctx->cur == '-') {
       ctx->cur++;
       eat_opt_white( &ctx->cur );
       src->Register.Negate = 1;
    }
-   
+
    if (*ctx->cur == '|') {
       ctx->cur++;
       eat_opt_white( &ctx->cur );
       src->Register.Absolute = 1;
    }
 
-   if (!parse_register_src(ctx, &file, &index, &ind_file, &ind_index, &ind_comp))
+   if (!parse_register_src(ctx, &file, &bracket[0]))
+      return FALSE;
+   if (!parse_opt_register_src_bracket(ctx, &bracket[1], &parsed_opt_brackets))
       return FALSE;
+
    src->Register.File = file;
-   src->Register.Index = index;
-   if (ind_file != TGSI_FILE_NULL) {
+   if (parsed_opt_brackets) {
+      src->Register.Dimension = 1;
+      src->Dimension.Indirect = 0;
+      src->Dimension.Dimension = 0;
+      src->Dimension.Index = bracket[0].index;
+      bracket[0] = bracket[1];
+   }
+   src->Register.Index = bracket[0].index;
+   if (bracket[0].ind_file != TGSI_FILE_NULL) {
       src->Register.Indirect = 1;
-      src->Indirect.File = ind_file;
-      src->Indirect.Index = ind_index;
-      src->Indirect.SwizzleX = ind_comp;
-      src->Indirect.SwizzleY = ind_comp;
-      src->Indirect.SwizzleZ = ind_comp;
-      src->Indirect.SwizzleW = ind_comp;
+      src->Indirect.File = bracket[0].ind_file;
+      src->Indirect.Index = bracket[0].ind_index;
+      src->Indirect.SwizzleX = bracket[0].ind_comp;
+      src->Indirect.SwizzleY = bracket[0].ind_comp;
+      src->Indirect.SwizzleZ = bracket[0].ind_comp;
+      src->Indirect.SwizzleW = bracket[0].ind_comp;
    }
 
    /* Parse optional swizzle.
@@ -760,7 +934,10 @@ static const char *semantic_names[TGSI_SEMANTIC_COUNT] =
    "PSIZE",
    "GENERIC",
    "NORMAL",
-   "FACE"
+   "FACE",
+   "EDGEFLAG",
+   "PRIM_ID",
+   "INSTANCEID"
 };
 
 static const char *interpolate_names[TGSI_INTERPOLATE_COUNT] =
@@ -774,8 +951,8 @@ static boolean parse_declaration( struct translate_ctx *ctx )
 {
    struct tgsi_full_declaration decl;
    uint file;
-   int first;
-   int last;
+   struct parsed_dcl_bracket brackets[2];
+   int num_brackets;
    uint writemask;
    const char *cur;
    uint advance;
@@ -787,7 +964,7 @@ static boolean parse_declaration( struct translate_ctx *ctx )
       report_error( ctx, "Syntax error" );
       return FALSE;
    }
-   if (!parse_register_dcl( ctx, &file, &first, &last ))
+   if (!parse_register_dcl( ctx, &file, brackets, &num_brackets))
       return FALSE;
    if (!parse_opt_writemask( ctx, &writemask ))
       return FALSE;
@@ -795,8 +972,17 @@ static boolean parse_declaration( struct translate_ctx *ctx )
    decl = tgsi_default_full_declaration();
    decl.Declaration.File = file;
    decl.Declaration.UsageMask = writemask;
-   decl.Range.First = first;
-   decl.Range.Last = last;
+
+   if (num_brackets == 1) {
+      decl.Range.First = brackets[0].first;
+      decl.Range.Last = brackets[0].last;
+   } else {
+      decl.Range.First = brackets[1].first;
+      decl.Range.Last = brackets[1].last;
+
+      decl.Declaration.Dimension = 1;
+      decl.Dim.Index2D = brackets[0].first;
+   }
 
    cur = ctx->cur;
    eat_opt_white( &cur );
@@ -939,6 +1125,171 @@ static boolean parse_immediate( struct translate_ctx *ctx )
    return TRUE;
 }
 
+static const char *property_names[] =   /* position = property id that parse_property() switches on (TGSI_PROPERTY_*) */
+{
+   "GS_INPUT_PRIMITIVE",
+   "GS_OUTPUT_PRIMITIVE",
+   "GS_MAX_OUTPUT_VERTICES",
+   "FS_COORD_ORIGIN",
+   "FS_COORD_PIXEL_CENTER"
+};
+
+static const char *primitive_names[] =   /* position = value stored by parse_primitive() */
+{
+   "POINTS",
+   "LINES",
+   "LINE_LOOP",
+   "LINE_STRIP",
+   "TRIANGLES",
+   "TRIANGLE_STRIP",
+   "TRIANGLE_FAN",
+   "QUADS",
+   "QUAD_STRIP",
+   "POLYGON"
+};
+
+static const char *fs_coord_origin_names[] =   /* position = value stored by parse_fs_coord_origin() */
+{
+   "UPPER_LEFT",
+   "LOWER_LEFT"
+};
+
+static const char *fs_coord_pixel_center_names[] =   /* position = value stored by parse_fs_coord_pixel_center() */
+{
+   "HALF_INTEGER",
+   "INTEGER"
+};
+
+
+static boolean
+parse_primitive( const char **pcur, uint *primitive )
+{
+   uint i;
+   /* Match one of primitive_names[] at *pcur; on success store its index in *primitive and advance *pcur. */
+   for (i = 0; i < PIPE_PRIM_MAX && i < sizeof(primitive_names) / sizeof(primitive_names[0]); i++) {
+      const char *cur = *pcur;   /* scratch cursor: *pcur is only written on a match */
+
+      if (str_match_no_case( &cur, primitive_names[i])) {
+         *primitive = i;
+         *pcur = cur;
+         return TRUE;
+      }
+   }
+   return FALSE;   /* no known primitive name at *pcur */
+}
+
+static boolean
+parse_fs_coord_origin( const char **pcur, uint *fs_coord_origin )
+{
+   uint i;
+   /* Match one of fs_coord_origin_names[] at *pcur; on success store its index and advance *pcur. */
+   for (i = 0; i < sizeof(fs_coord_origin_names) / sizeof(fs_coord_origin_names[0]); i++) {
+      const char *cur = *pcur;   /* scratch cursor: *pcur is only written on a match */
+
+      if (str_match_no_case( &cur, fs_coord_origin_names[i])) {
+         *fs_coord_origin = i;
+         *pcur = cur;
+         return TRUE;
+      }
+   }
+   return FALSE;   /* none of the known names matched */
+}
+
+static boolean
+parse_fs_coord_pixel_center( const char **pcur, uint *fs_coord_pixel_center )
+{
+   uint i;
+   /* Match one of fs_coord_pixel_center_names[] at *pcur; on success store its index and advance *pcur. */
+   for (i = 0; i < sizeof(fs_coord_pixel_center_names) / sizeof(fs_coord_pixel_center_names[0]); i++) {
+      const char *cur = *pcur;   /* scratch cursor: *pcur is only written on a match */
+
+      if (str_match_no_case( &cur, fs_coord_pixel_center_names[i])) {
+         *fs_coord_pixel_center = i;
+         *pcur = cur;
+         return TRUE;
+      }
+   }
+   return FALSE;   /* none of the known names matched */
+}
+
+/* Parse what follows the PROPERTY keyword (a property name and one value) and append a property token. */
+static boolean parse_property( struct translate_ctx *ctx )
+{
+   struct tgsi_full_property prop;
+   uint property_name;
+   uint values[8];   /* only values[0] is used below */
+   uint advance;
+   char id[64];   /* NOTE(review): no size is passed to parse_identifier() -- confirm it bounds its writes */
+
+   if (!eat_white( &ctx->cur )) {
+      report_error( ctx, "Syntax error" );
+      return FALSE;
+   }
+   if (!parse_identifier( &ctx->cur, id )) {
+      report_error( ctx, "Syntax error" );
+      return FALSE;
+   }
+   for (property_name = 0; property_name < TGSI_PROPERTY_COUNT;   /* assumes property_names[] has TGSI_PROPERTY_COUNT entries -- TODO confirm */
+        ++property_name) {
+      if (streq_nocase_uprcase(property_names[property_name], id)) {
+         break;
+      }
+   }
+   if (property_name >= TGSI_PROPERTY_COUNT) {   /* loop ran to the end: no name matched */
+      debug_printf( "\nError: Unknown property : '%s'", id );
+      return FALSE;
+   }
+
+   eat_opt_white( &ctx->cur );
+   switch(property_name) {   /* the value syntax depends on which property was named */
+   case TGSI_PROPERTY_GS_INPUT_PRIM:
+   case TGSI_PROPERTY_GS_OUTPUT_PRIM:
+      if (!parse_primitive(&ctx->cur, &values[0] )) {
+         report_error( ctx, "Unknown primitive name as property!" );
+         return FALSE;
+      }
+      if (property_name == TGSI_PROPERTY_GS_INPUT_PRIM &&
+          ctx->processor == TGSI_PROCESSOR_GEOMETRY) {
+         ctx->implied_array_size = u_vertices_per_prim(values[0]);   /* consulted later for empty `[]' declaration brackets */
+      }
+      break;
+   case TGSI_PROPERTY_FS_COORD_ORIGIN:
+      if (!parse_fs_coord_origin(&ctx->cur, &values[0] )) {
+         report_error( ctx, "Unknown coord origin as property: must be UPPER_LEFT or LOWER_LEFT!" );
+         return FALSE;
+      }
+      break;
+   case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER:
+      if (!parse_fs_coord_pixel_center(&ctx->cur, &values[0] )) {
+         report_error( ctx, "Unknown coord pixel center as property: must be HALF_INTEGER or INTEGER!" );
+         return FALSE;
+      }
+      break;
+   default:   /* every other property takes a plain unsigned integer */
+      if (!parse_uint(&ctx->cur, &values[0] )) {
+         report_error( ctx, "Expected unsigned integer as property!" );
+         return FALSE;
+      }
+   }
+
+   prop = tgsi_default_full_property();
+   prop.Property.PropertyName = property_name;
+   prop.Property.NrTokens += 1;   /* presumably accounts for the single data word stored in prop.u[0] -- confirm */
+   prop.u[0].Data = values[0];
+
+   advance = tgsi_build_full_property(
+      &prop,
+      ctx->tokens_cur,
+      ctx->header,
+      (uint) (ctx->tokens_end - ctx->tokens_cur) );
+   if (advance == 0)   /* a zero return from the builder is treated as failure */
+      return FALSE;
+   ctx->tokens_cur += advance;
+
+   return TRUE;
+}
+
+
 static boolean translate( struct translate_ctx *ctx )
 {
    eat_opt_white( &ctx->cur );
@@ -947,7 +1298,6 @@ static boolean translate( struct translate_ctx *ctx )
 
    while (*ctx->cur != '\0') {
       uint label_val = 0;
-
       if (!eat_white( &ctx->cur )) {
          report_error( ctx, "Syntax error" );
          return FALSE;
@@ -955,7 +1305,6 @@ static boolean translate( struct translate_ctx *ctx )
 
       if (*ctx->cur == '\0')
          break;
-
       if (parse_label( ctx, &label_val )) {
          if (!parse_instruction( ctx, TRUE ))
             return FALSE;
@@ -968,6 +1317,10 @@ static boolean translate( struct translate_ctx *ctx )
          if (!parse_immediate( ctx ))
             return FALSE;
       }
+      else if (str_match_no_case( &ctx->cur, "PROPERTY" )) {
+         if (!parse_property( ctx ))
+            return FALSE;
+      }
       else if (!parse_instruction( ctx, FALSE )) {
          return FALSE;
       }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c b/src/gallium/auxiliary/tgsi/tgsi_transform.c
index 8b8f489b355..ae875f29abf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c
@@ -79,6 +79,19 @@ emit_immediate(struct tgsi_transform_context *ctx,
 }
 
 
+static void
+emit_property(struct tgsi_transform_context *ctx,
+              const struct tgsi_full_property *prop)
+{
+   uint ti = ctx->ti;
+   /* Build prop into tokens_out at offset ti, then advance ctx->ti by the builder's return value. */
+   ti += tgsi_build_full_property(prop,
+                                  ctx->tokens_out + ti,
+                                  ctx->header,
+                                  ctx->max_tokens_out - ti);
+   ctx->ti = ti;   /* NOTE(review): a 0 return from the builder is not detected here -- confirm that is intended */
+}
+
 
 /**
  * Apply user-defined transformations to the input shader to produce
@@ -110,6 +123,7 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in,
    ctx->emit_instruction = emit_instruction;
    ctx->emit_declaration = emit_declaration;
    ctx->emit_immediate = emit_immediate;
+   ctx->emit_property = emit_property;
    ctx->tokens_out = tokens_out;
    ctx->max_tokens_out = max_tokens_out;
 
@@ -182,6 +196,17 @@ tgsi_transform_shader(const struct tgsi_token *tokens_in,
                ctx->emit_immediate(ctx, fullimm);
          }
          break;
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         {
+            struct tgsi_full_property *fullprop
+               = &parse.FullToken.FullProperty;
+
+            if (ctx->transform_property)
+               ctx->transform_property(ctx, fullprop);
+            else
+               ctx->emit_property(ctx, fullprop);
+         }
+         break;
 
       default:
          assert( 0 );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index a121adbaef4..818478e277a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -53,6 +53,8 @@ struct tgsi_transform_context
 
    void (*transform_immediate)(struct tgsi_transform_context *ctx,
                                struct tgsi_full_immediate *imm);
+   void (*transform_property)(struct tgsi_transform_context *ctx,
+                              struct tgsi_full_property *prop);
 
    /**
     * Called at end of input program to allow caller to append extra
@@ -73,6 +75,8 @@ struct tgsi_transform_context
                             const struct tgsi_full_declaration *decl);
    void (*emit_immediate)(struct tgsi_transform_context *ctx,
                           const struct tgsi_full_immediate *imm);
+   void (*emit_property)(struct tgsi_transform_context *ctx,
+                         const struct tgsi_full_property *prop);
 
    struct tgsi_header *header;
    uint max_tokens_out;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 8f0b9842ff1..3d0455de7ce 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -33,6 +33,7 @@
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_sanity.h"
+#include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
@@ -40,8 +41,11 @@ union tgsi_any_token {
    struct tgsi_header header;
    struct tgsi_processor processor;
    struct tgsi_token token;
+   struct tgsi_property prop;
+   struct tgsi_property_data prop_data;
    struct tgsi_declaration decl;
    struct tgsi_declaration_range decl_range;
+   struct tgsi_declaration_dimension decl_dim;
    struct tgsi_declaration_semantic decl_semantic;
    struct tgsi_immediate imm;
    union  tgsi_immediate_data imm_data;
@@ -64,6 +68,7 @@ struct ureg_tokens {
 };
 
 #define UREG_MAX_INPUT PIPE_MAX_ATTRIBS
+#define UREG_MAX_SYSTEM_VALUE PIPE_MAX_ATTRIBS
 #define UREG_MAX_OUTPUT PIPE_MAX_ATTRIBS
 #define UREG_MAX_CONSTANT_RANGE 32
 #define UREG_MAX_IMMEDIATE 32
@@ -72,6 +77,14 @@ struct ureg_tokens {
 #define UREG_MAX_LOOP 1
 #define UREG_MAX_PRED 1
 
+struct const_decl {   /* set of declared constant index ranges */
+   struct {
+      unsigned first;   /* lowest constant index in the range */
+      unsigned last;    /* highest constant index in the range (inclusive) */
+   } constant_range[UREG_MAX_CONSTANT_RANGE];
+   unsigned nr_constant_ranges;   /* number of constant_range[] entries in use */
+};
+
 #define DOMAIN_DECL 0
 #define DOMAIN_INSN 1
 
@@ -84,20 +97,40 @@ struct ureg_program
       unsigned semantic_name;
       unsigned semantic_index;
       unsigned interp;
+      unsigned cylindrical_wrap;
    } fs_input[UREG_MAX_INPUT];
    unsigned nr_fs_inputs;
 
    unsigned vs_inputs[UREG_MAX_INPUT/32];
 
    struct {
+      unsigned index;
+      unsigned semantic_name;
+      unsigned semantic_index;
+   } gs_input[UREG_MAX_INPUT];
+   unsigned nr_gs_inputs;
+
+   struct {
+      unsigned index;
+      unsigned semantic_name;
+      unsigned semantic_index;
+   } system_value[UREG_MAX_SYSTEM_VALUE];
+   unsigned nr_system_values;
+
+   struct {
       unsigned semantic_name;
       unsigned semantic_index;
    } output[UREG_MAX_OUTPUT];
    unsigned nr_outputs;
 
    struct {
-      float v[4];
+      union {
+         float f[4];
+         unsigned u[4];
+         int i[4];
+      } value;
       unsigned nr;
+      unsigned type;
    } immediate[UREG_MAX_IMMEDIATE];
    unsigned nr_immediates;
 
@@ -107,11 +140,14 @@ struct ureg_program
    unsigned temps_active[UREG_MAX_TEMP / 32];
    unsigned nr_temps;
 
-   struct {
-      unsigned first;
-      unsigned last;
-   } constant_range[UREG_MAX_CONSTANT_RANGE];
-   unsigned nr_constant_ranges;
+   struct const_decl const_decls;
+   struct const_decl const_decls2D[PIPE_MAX_CONSTANT_BUFFERS];
+
+   unsigned property_gs_input_prim;
+   unsigned property_gs_output_prim;
+   unsigned property_gs_max_vertices;
+   unsigned char property_fs_coord_origin; /* = TGSI_FS_COORD_ORIGIN_* */
+   unsigned char property_fs_coord_pixel_center; /* = TGSI_FS_COORD_PIXEL_CENTER_* */
 
    unsigned nr_addrs;
    unsigned nr_preds;
@@ -213,57 +249,72 @@ ureg_dst_register( unsigned file,
    return dst;
 }
 
-static INLINE struct ureg_src 
-ureg_src_register( unsigned file,
-                   unsigned index )
+
+void
+ureg_property_gs_input_prim(struct ureg_program *ureg,
+                            unsigned input_prim)
 {
-   struct ureg_src src;
-
-   src.File     = file;
-   src.SwizzleX = TGSI_SWIZZLE_X;
-   src.SwizzleY = TGSI_SWIZZLE_Y;
-   src.SwizzleZ = TGSI_SWIZZLE_Z;
-   src.SwizzleW = TGSI_SWIZZLE_W;
-   src.Pad      = 0;
-   src.Indirect = 0;
-   src.IndirectIndex = 0;
-   src.IndirectSwizzle = 0;
-   src.Absolute = 0;
-   src.Index    = index;
-   src.Negate   = 0;
-
-   return src;
+   ureg->property_gs_input_prim = input_prim;
 }
 
+void
+ureg_property_gs_output_prim(struct ureg_program *ureg,
+                             unsigned output_prim)
+{
+   ureg->property_gs_output_prim = output_prim;
+}
 
+void
+ureg_property_gs_max_vertices(struct ureg_program *ureg,
+                              unsigned max_vertices)
+{
+   ureg->property_gs_max_vertices = max_vertices;
+}
 
+void
+ureg_property_fs_coord_origin(struct ureg_program *ureg,
+                            unsigned fs_coord_origin)
+{
+   ureg->property_fs_coord_origin = fs_coord_origin;
+}
 
-struct ureg_src 
-ureg_DECL_fs_input( struct ureg_program *ureg,
-                    unsigned name,
-                    unsigned index,
-                    unsigned interp_mode )
+void
+ureg_property_fs_coord_pixel_center(struct ureg_program *ureg,
+                            unsigned fs_coord_pixel_center)
+{
+   ureg->property_fs_coord_pixel_center = fs_coord_pixel_center;
+}
+
+
+
+struct ureg_src
+ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
+                       unsigned semantic_name,
+                       unsigned semantic_index,
+                       unsigned interp_mode,
+                       unsigned cylindrical_wrap)
 {
    unsigned i;
 
    for (i = 0; i < ureg->nr_fs_inputs; i++) {
-      if (ureg->fs_input[i].semantic_name == name &&
-          ureg->fs_input[i].semantic_index == index) 
+      if (ureg->fs_input[i].semantic_name == semantic_name &&
+          ureg->fs_input[i].semantic_index == semantic_index) {
          goto out;
+      }
    }
 
    if (ureg->nr_fs_inputs < UREG_MAX_INPUT) {
-      ureg->fs_input[i].semantic_name = name;
-      ureg->fs_input[i].semantic_index = index;
+      ureg->fs_input[i].semantic_name = semantic_name;
+      ureg->fs_input[i].semantic_index = semantic_index;
       ureg->fs_input[i].interp = interp_mode;
+      ureg->fs_input[i].cylindrical_wrap = cylindrical_wrap;
       ureg->nr_fs_inputs++;
-   }
-   else {
-      set_bad( ureg );
+   } else {
+      set_bad(ureg);
    }
 
 out:
-   return ureg_src_register( TGSI_FILE_INPUT, i );
+   return ureg_src_register(TGSI_FILE_INPUT, i);
 }
 
 
@@ -278,6 +329,45 @@ ureg_DECL_vs_input( struct ureg_program *ureg,
 }
 
 
+struct ureg_src
+ureg_DECL_gs_input(struct ureg_program *ureg,
+                   unsigned index,
+                   unsigned semantic_name,
+                   unsigned semantic_index)
+{
+   if (ureg->nr_gs_inputs < UREG_MAX_INPUT) {   /* room left: record the declaration */
+      ureg->gs_input[ureg->nr_gs_inputs].index = index;
+      ureg->gs_input[ureg->nr_gs_inputs].semantic_name = semantic_name;
+      ureg->gs_input[ureg->nr_gs_inputs].semantic_index = semantic_index;
+      ureg->nr_gs_inputs++;
+   } else {
+      set_bad(ureg);   /* gs_input[] is full */
+   }
+
+   /* XXX: Add support for true 2D input registers. */
+   return ureg_src_register(TGSI_FILE_INPUT, index);   /* returned on both paths, including after set_bad() */
+}
+
+
+struct ureg_src
+ureg_DECL_system_value(struct ureg_program *ureg,
+                       unsigned index,
+                       unsigned semantic_name,
+                       unsigned semantic_index)
+{
+   if (ureg->nr_system_values < UREG_MAX_SYSTEM_VALUE) {   /* room left: record the declaration */
+      ureg->system_value[ureg->nr_system_values].index = index;
+      ureg->system_value[ureg->nr_system_values].semantic_name = semantic_name;
+      ureg->system_value[ureg->nr_system_values].semantic_index = semantic_index;
+      ureg->nr_system_values++;
+   } else {
+      set_bad(ureg);   /* system_value[] is full */
+   }
+
+   return ureg_src_register(TGSI_FILE_SYSTEM_VALUE, index);   /* returned on both paths, including after set_bad() */
+}
+
+
 struct ureg_dst 
 ureg_DECL_output( struct ureg_program *ureg,
                   unsigned name,
@@ -308,62 +398,92 @@ out:
 /* Returns a new constant register.  Keep track of which have been
  * referred to so that we can emit decls later.
  *
+ * Constant operands declared with this function must be addressed
+ * with a two-dimensional index.
+ *
  * There is nothing in this code to bind this constant to any tracked
  * value or manage any constant_buffer contents -- that's the
  * resposibility of the calling code.
  */
-struct ureg_src ureg_DECL_constant(struct ureg_program *ureg, 
-                                   unsigned index )
+void
+ureg_DECL_constant2D(struct ureg_program *ureg,
+                     unsigned first,
+                     unsigned last,
+                     unsigned index2D)
+{
+   struct const_decl *decl = &ureg->const_decls2D[index2D];
+   /* NOTE(review): the element address above is formed before index2D is range-checked below. */
+   assert(index2D < PIPE_MAX_CONSTANT_BUFFERS);
+
+   if (decl->nr_constant_ranges < UREG_MAX_CONSTANT_RANGE) {
+      uint i = decl->nr_constant_ranges++;   /* claim the next free slot */
+
+      decl->constant_range[i].first = first;
+      decl->constant_range[i].last = last;
+   }   /* when the table is full the range is dropped; set_bad() is not called on this path */
+}
+
+
+/* A one-dimensional, deprecated version of ureg_DECL_constant2D().
+ *
+ * Constant operands declared with this function must be addressed
+ * with a one-dimensional index.
+ */
+struct ureg_src
+ureg_DECL_constant(struct ureg_program *ureg,
+                   unsigned index)
 {
+   struct const_decl *decl = &ureg->const_decls;
    unsigned minconst = index, maxconst = index;
    unsigned i;
 
    /* Inside existing range?
     */
-   for (i = 0; i < ureg->nr_constant_ranges; i++) {
-      if (ureg->constant_range[i].first <= index &&
-          ureg->constant_range[i].last >= index)
+   for (i = 0; i < decl->nr_constant_ranges; i++) {
+      if (decl->constant_range[i].first <= index &&
+          decl->constant_range[i].last >= index) {
          goto out;
+      }
    }
 
    /* Extend existing range?
     */
-   for (i = 0; i < ureg->nr_constant_ranges; i++) {
-      if (ureg->constant_range[i].last == index - 1) {
-         ureg->constant_range[i].last = index;
+   for (i = 0; i < decl->nr_constant_ranges; i++) {
+      if (decl->constant_range[i].last == index - 1) {
+         decl->constant_range[i].last = index;
          goto out;
       }
 
-      if (ureg->constant_range[i].first == index + 1) {
-         ureg->constant_range[i].first = index;
+      if (decl->constant_range[i].first == index + 1) {
+         decl->constant_range[i].first = index;
          goto out;
       }
 
-      minconst = MIN2(minconst, ureg->constant_range[i].first);
-      maxconst = MAX2(maxconst, ureg->constant_range[i].last);
+      minconst = MIN2(minconst, decl->constant_range[i].first);
+      maxconst = MAX2(maxconst, decl->constant_range[i].last);
    }
 
    /* Create new range?
     */
-   if (ureg->nr_constant_ranges < UREG_MAX_CONSTANT_RANGE) {
-      i = ureg->nr_constant_ranges++;
-      ureg->constant_range[i].first = index;
-      ureg->constant_range[i].last = index;
+   if (decl->nr_constant_ranges < UREG_MAX_CONSTANT_RANGE) {
+      i = decl->nr_constant_ranges++;
+      decl->constant_range[i].first = index;
+      decl->constant_range[i].last = index;
       goto out;
    }
 
    /* Collapse all ranges down to one:
     */
    i = 0;
-   ureg->constant_range[0].first = minconst;
-   ureg->constant_range[0].last = maxconst;
-   ureg->nr_constant_ranges = 1;
+   decl->constant_range[0].first = minconst;
+   decl->constant_range[0].last = maxconst;
+   decl->nr_constant_ranges = 1;
 
 out:
-   assert(i < ureg->nr_constant_ranges);
-   assert(ureg->constant_range[i].first <= index);
-   assert(ureg->constant_range[i].last >= index);
-   return ureg_src_register( TGSI_FILE_CONSTANT, index );
+   assert(i < decl->nr_constant_ranges);
+   assert(decl->constant_range[i].first <= index);
+   assert(decl->constant_range[i].last >= index);
+   return ureg_src_register(TGSI_FILE_CONSTANT, index);
 }
 
 
@@ -465,22 +585,22 @@ struct ureg_src ureg_DECL_sampler( struct ureg_program *ureg,
 }
 
 
-
-
-static int match_or_expand_immediate( const float *v,
-                                      unsigned nr,
-                                      float *v2,
-                                      unsigned *nr2,
-                                      unsigned *swizzle )
+static int
+match_or_expand_immediate( const unsigned *v,
+                           unsigned nr,
+                           unsigned *v2,
+                           unsigned *pnr2,
+                           unsigned *swizzle )
 {
+   unsigned nr2 = *pnr2;
    unsigned i, j;
-   
+
    *swizzle = 0;
 
    for (i = 0; i < nr; i++) {
       boolean found = FALSE;
 
-      for (j = 0; j < *nr2 && !found; j++) {
+      for (j = 0; j < nr2 && !found; j++) {
          if (v[i] == v2[j]) {
             *swizzle |= j << (i * 2);
             found = TRUE;
@@ -488,65 +608,142 @@ static int match_or_expand_immediate( const float *v,
       }
 
       if (!found) {
-         if (*nr2 >= 4) 
+         if (nr2 >= 4) {
             return FALSE;
+         }
 
-         v2[*nr2] = v[i];
-         *swizzle |= *nr2 << (i * 2);
-         (*nr2)++;
+         v2[nr2] = v[i];
+         *swizzle |= nr2 << (i * 2);
+         nr2++;
       }
    }
 
+   /* Actually expand immediate only when fully succeeded.
+    */
+   *pnr2 = nr2;
    return TRUE;
 }
 
 
-
-
-struct ureg_src ureg_DECL_immediate( struct ureg_program *ureg, 
-                                     const float *v,
-                                     unsigned nr )
+static struct ureg_src   /* worker shared by the typed ureg_DECL_immediate*() entry points */
+decl_immediate( struct ureg_program *ureg,
+                const unsigned *v,
+                unsigned nr,
+                unsigned type )   /* TGSI_IMM_* tag; compared against and stored in each slot */
 {
    unsigned i, j;
-   unsigned swizzle;
+   unsigned swizzle = 0;   /* two bits per component, filled by match_or_expand_immediate() */
 
    /* Could do a first pass where we examine all existing immediates
     * without expanding.
     */
 
    for (i = 0; i < ureg->nr_immediates; i++) {
-      if (match_or_expand_immediate( v, 
-                                     nr,
-                                     ureg->immediate[i].v,
-                                     &ureg->immediate[i].nr, 
-                                     &swizzle ))
+      if (ureg->immediate[i].type != type) {   /* a slot is only reused for the same data type */
+         continue;
+      }
+      if (match_or_expand_immediate(v,
+                                    nr,
+                                    ureg->immediate[i].value.u,
+                                    &ureg->immediate[i].nr,
+                                    &swizzle)) {
          goto out;
+      }
    }
 
    if (ureg->nr_immediates < UREG_MAX_IMMEDIATE) {
       i = ureg->nr_immediates++;
-      if (match_or_expand_immediate( v,
-                                     nr,
-                                     ureg->immediate[i].v,
-                                     &ureg->immediate[i].nr, 
-                                     &swizzle ))
+      ureg->immediate[i].type = type;   /* tag the freshly claimed slot */
+      if (match_or_expand_immediate(v,
+                                    nr,
+                                    ureg->immediate[i].value.u,
+                                    &ureg->immediate[i].nr,   /* presumably 0 for an unused slot - TODO confirm */
+                                    &swizzle)) {
          goto out;
+      }
    }
 
-   set_bad( ureg );
+   set_bad(ureg);   /* nothing fit; control still falls into out: and builds a register from the current i */
 
 out:
    /* Make sure that all referenced elements are from this immediate.
     * Has the effect of making size-one immediates into scalars.
     */
-   for (j = nr; j < 4; j++)
+   for (j = nr; j < 4; j++) {
       swizzle |= (swizzle & 0x3) << (j * 2);
+   }
+
+   return ureg_swizzle(ureg_src_register(TGSI_FILE_IMMEDIATE, i),   /* unpack the 2-bit selectors, X first */
+                       (swizzle >> 0) & 0x3,
+                       (swizzle >> 2) & 0x3,
+                       (swizzle >> 4) & 0x3,
+                       (swizzle >> 6) & 0x3);
+}
+
+/* Declare a float immediate: the bit patterns of v[0..nr-1] go to decl_immediate() tagged TGSI_IMM_FLOAT32. */
+struct ureg_src
+ureg_DECL_immediate( struct ureg_program *ureg,
+                     const float *v,
+                     unsigned nr )
+{
+   union {
+      float f[4];
+      unsigned u[4];
+   } fu;   /* same storage viewed as floats and as unsigned words */
+   unsigned int i;
+
+   for (i = 0; i < nr; i++) {
+      fu.f[i] = v[i];   /* fu holds four components, so nr must not exceed 4 */
+   }
 
-   return ureg_swizzle( ureg_src_register( TGSI_FILE_IMMEDIATE, i ),
-                        (swizzle >> 0) & 0x3,
-                        (swizzle >> 2) & 0x3,
-                        (swizzle >> 4) & 0x3,
-                        (swizzle >> 6) & 0x3);
+   return decl_immediate(ureg, fu.u, nr, TGSI_IMM_FLOAT32);
+}
+
+/* Declare an unsigned-integer immediate of nr components; forwards to decl_immediate() tagged TGSI_IMM_UINT32. */
+struct ureg_src
+ureg_DECL_immediate_uint( struct ureg_program *ureg,
+                          const unsigned *v,
+                          unsigned nr )
+{
+   return decl_immediate(ureg, v, nr, TGSI_IMM_UINT32);
+}
+
+/* Append nr unsigned values as new consecutive UINT32 immediates, four per slot; returns the first slot's register. */
+struct ureg_src
+ureg_DECL_immediate_block_uint( struct ureg_program *ureg,
+                                const unsigned *v,
+                                unsigned nr )
+{
+   uint index;
+   uint i;
+
+   if (ureg->nr_immediates + (nr + 3) / 4 > UREG_MAX_IMMEDIATE) {   /* (nr + 3) / 4 = slots needed, rounded up */
+      set_bad(ureg);
+      return ureg_src_register(TGSI_FILE_IMMEDIATE, 0);
+   }
+
+   index = ureg->nr_immediates;
+   ureg->nr_immediates += (nr + 3) / 4;
+
+   for (i = index; i < ureg->nr_immediates; i++) {
+      ureg->immediate[i].type = TGSI_IMM_UINT32;
+      ureg->immediate[i].nr = nr > 4 ? 4 : nr;   /* only the last slot can hold fewer than four values */
+      memcpy(ureg->immediate[i].value.u,
+             &v[(i - index) * 4],
+             ureg->immediate[i].nr * sizeof(uint));
+      nr -= 4;   /* unsigned: wraps on the last slot when nr < 4, but the loop ends before nr is read again */
+   }
+
+   return ureg_src_register(TGSI_FILE_IMMEDIATE, index);
+}
+
+/* Declare a signed-integer immediate; the values are passed on as unsigned words tagged TGSI_IMM_INT32. */
+struct ureg_src
+ureg_DECL_immediate_int( struct ureg_program *ureg,
+                         const int *v,
+                         unsigned nr )
+{
+   return decl_immediate(ureg, (const unsigned *)v, nr, TGSI_IMM_INT32);
 }
 
 
@@ -554,7 +751,7 @@ void
 ureg_emit_src( struct ureg_program *ureg,
                struct ureg_src src )
 {
-   unsigned size = 1 + (src.Indirect ? 1 : 0);
+   unsigned size = 1 + (src.Indirect ? 1 : 0) + (src.Dimension ? 1 : 0);
 
    union tgsi_any_token *out = get_tokens( ureg, DOMAIN_INSN, size );
    unsigned n = 0;
@@ -577,7 +774,7 @@ ureg_emit_src( struct ureg_program *ureg,
    if (src.Indirect) {
       out[0].src.Indirect = 1;
       out[n].value = 0;
-      out[n].src.File = TGSI_FILE_ADDRESS;
+      out[n].src.File = src.IndirectFile;
       out[n].src.SwizzleX = src.IndirectSwizzle;
       out[n].src.SwizzleY = src.IndirectSwizzle;
       out[n].src.SwizzleZ = src.IndirectSwizzle;
@@ -586,6 +783,15 @@ ureg_emit_src( struct ureg_program *ureg,
       n++;
    }
 
+   if (src.Dimension) {
+      out[0].src.Dimension = 1;
+      out[n].dim.Indirect = 0;
+      out[n].dim.Dimension = 0;
+      out[n].dim.Padding = 0;
+      out[n].dim.Index = src.DimensionIndex;
+      n++;
+   }
+
    assert(n == size);
 }
 
@@ -770,8 +976,8 @@ ureg_insn(struct ureg_program *ureg,
    unsigned i;
    boolean saturate;
    boolean predicate;
-   boolean negate;
-   unsigned swizzle[4];
+   boolean negate = FALSE;
+   unsigned swizzle[4] = { 0 };
 
    saturate = nr_dst ? dst[0].Saturate : FALSE;
    predicate = nr_dst ? dst[0].Predicate : FALSE;
@@ -817,8 +1023,8 @@ ureg_tex_insn(struct ureg_program *ureg,
    unsigned i;
    boolean saturate;
    boolean predicate;
-   boolean negate;
-   unsigned swizzle[4];
+   boolean negate = FALSE;
+   unsigned swizzle[4] = { 0 };
 
    saturate = nr_dst ? dst[0].Saturate : FALSE;
    predicate = nr_dst ? dst[0].Predicate : FALSE;
@@ -885,32 +1091,59 @@ ureg_label_insn(struct ureg_program *ureg,
 }
 
 
-
-static void emit_decl( struct ureg_program *ureg,
-                       unsigned file,
-                       unsigned index,
-                       unsigned semantic_name,
-                       unsigned semantic_index,
-                       unsigned interp )
+static void   /* emit a 3-token declaration for one register that carries a semantic name/index */
+emit_decl_semantic(struct ureg_program *ureg,
+                   unsigned file,
+                   unsigned index,
+                   unsigned semantic_name,
+                   unsigned semantic_index)
 {
-   union tgsi_any_token *out = get_tokens( ureg, DOMAIN_DECL, 3 );
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);   /* header, range, semantic */
 
    out[0].value = 0;
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
    out[0].decl.NrTokens = 3;
    out[0].decl.File = file;
    out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW; /* FIXME! */
-   out[0].decl.Interpolate = interp;
    out[0].decl.Semantic = 1;
 
    out[1].value = 0;
-   out[1].decl_range.First = 
-      out[1].decl_range.Last = index;
+   out[1].decl_range.First = index;   /* single-register range: First == Last */
+   out[1].decl_range.Last = index;
 
    out[2].value = 0;
    out[2].decl_semantic.Name = semantic_name;
    out[2].decl_semantic.Index = semantic_index;
+}
+
+/* Like emit_decl_semantic(), but also records the interpolation mode and cylindrical-wrap value in the header token. */
+static void
+emit_decl_fs(struct ureg_program *ureg,
+             unsigned file,
+             unsigned index,
+             unsigned semantic_name,
+             unsigned semantic_index,
+             unsigned interpolate,
+             unsigned cylindrical_wrap)
+{
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);   /* header, range, semantic */
 
+   out[0].value = 0;
+   out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   out[0].decl.NrTokens = 3;
+   out[0].decl.File = file;
+   out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW; /* FIXME! */
+   out[0].decl.Interpolate = interpolate;
+   out[0].decl.Semantic = 1;
+   out[0].decl.CylindricalWrap = cylindrical_wrap;
+
+   out[1].value = 0;
+   out[1].decl_range.First = index;   /* single-register range: First == Last */
+   out[1].decl_range.Last = index;
+
+   out[2].value = 0;
+   out[2].decl_semantic.Name = semantic_name;
+   out[2].decl_semantic.Index = semantic_index;
 }
 
 
@@ -934,55 +1167,150 @@ static void emit_decl_range( struct ureg_program *ureg,
    out[1].decl_range.Last = first + count - 1;
 }
 
-static void emit_immediate( struct ureg_program *ureg,
-                            const float *v )
+static void   /* emit a 3-token declaration for registers first..last qualified by a second index */
+emit_decl_range2D(struct ureg_program *ureg,
+                  unsigned file,
+                  unsigned first,
+                  unsigned last,
+                  unsigned index2D)
+{
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);
+   /* Token 0: declaration header, token 1: register range, token 2: 2D index. */
+   out[0].value = 0;
+   out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   out[0].decl.NrTokens = 3;
+   out[0].decl.File = file;
+   out[0].decl.UsageMask = 0xf;
+   out[0].decl.Interpolate = TGSI_INTERPOLATE_CONSTANT;
+   out[0].decl.Dimension = 1;   /* marks that a decl_dim token follows the range */
+
+   out[1].value = 0;
+   out[1].decl_range.First = first;
+   out[1].decl_range.Last = last;   /* inclusive bound, unlike emit_decl_range() which takes a count */
+
+   out[2].value = 0;
+   out[2].decl_dim.Index2D = index2D;
+}
+
+static void   /* emit one 5-token immediate: a typed header followed by four 32-bit words */
+emit_immediate( struct ureg_program *ureg,
+                const unsigned *v,
+                unsigned type )   /* TGSI_IMM_* value written to the header's DataType */
 {
    union tgsi_any_token *out = get_tokens( ureg, DOMAIN_DECL, 5 );
 
    out[0].value = 0;
    out[0].imm.Type = TGSI_TOKEN_TYPE_IMMEDIATE;
    out[0].imm.NrTokens = 5;
-   out[0].imm.DataType = TGSI_IMM_FLOAT32;
+   out[0].imm.DataType = type;
    out[0].imm.Padding = 0;
 
-   out[1].imm_data.Float = v[0];
-   out[2].imm_data.Float = v[1];
-   out[3].imm_data.Float = v[2];
-   out[4].imm_data.Float = v[3];
+   out[1].imm_data.Uint = v[0];   /* all four words are always copied, as raw bits whatever the type */
+   out[2].imm_data.Uint = v[1];
+   out[3].imm_data.Uint = v[2];
+   out[4].imm_data.Uint = v[3];
 }
 
+static void   /* emit a 2-token property: a header naming the property, then one data word */
+emit_property(struct ureg_program *ureg,
+              unsigned name,
+              unsigned data)
+{
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2);   /* properties share the declaration domain */
 
+   out[0].value = 0;
+   out[0].prop.Type = TGSI_TOKEN_TYPE_PROPERTY;
+   out[0].prop.NrTokens = 2;
+   out[0].prop.PropertyName = name;
+
+   out[1].prop_data.Data = data;
+}
 
 
 static void emit_decls( struct ureg_program *ureg )
 {
    unsigned i;
 
+   if (ureg->property_gs_input_prim != ~0) {
+      assert(ureg->processor == TGSI_PROCESSOR_GEOMETRY);
+
+      emit_property(ureg,
+                    TGSI_PROPERTY_GS_INPUT_PRIM,
+                    ureg->property_gs_input_prim);
+   }
+
+   if (ureg->property_gs_output_prim != ~0) {
+      assert(ureg->processor == TGSI_PROCESSOR_GEOMETRY);
+
+      emit_property(ureg,
+                    TGSI_PROPERTY_GS_OUTPUT_PRIM,
+                    ureg->property_gs_output_prim);
+   }
+
+   if (ureg->property_gs_max_vertices != ~0) {
+      assert(ureg->processor == TGSI_PROCESSOR_GEOMETRY);
+
+      emit_property(ureg,
+                    TGSI_PROPERTY_GS_MAX_VERTICES,
+                    ureg->property_gs_max_vertices);
+   }
+
+   if (ureg->property_fs_coord_origin) {
+      assert(ureg->processor == TGSI_PROCESSOR_FRAGMENT);
+
+      emit_property(ureg,
+                    TGSI_PROPERTY_FS_COORD_ORIGIN,
+                    ureg->property_fs_coord_origin);
+   }
+
+   if (ureg->property_fs_coord_pixel_center) {
+      assert(ureg->processor == TGSI_PROCESSOR_FRAGMENT);
+
+      emit_property(ureg,
+                    TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
+                    ureg->property_fs_coord_pixel_center);
+   }
+
    if (ureg->processor == TGSI_PROCESSOR_VERTEX) {
       for (i = 0; i < UREG_MAX_INPUT; i++) {
          if (ureg->vs_inputs[i/32] & (1 << (i%32))) {
             emit_decl_range( ureg, TGSI_FILE_INPUT, i, 1 );
          }
       }
-   }
-   else {
+   } else if (ureg->processor == TGSI_PROCESSOR_FRAGMENT) {
       for (i = 0; i < ureg->nr_fs_inputs; i++) {
-         emit_decl( ureg, 
-                    TGSI_FILE_INPUT, 
-                    i,
-                    ureg->fs_input[i].semantic_name,
-                    ureg->fs_input[i].semantic_index,
-                    ureg->fs_input[i].interp );
+         emit_decl_fs(ureg,
+                      TGSI_FILE_INPUT,
+                      i,
+                      ureg->fs_input[i].semantic_name,
+                      ureg->fs_input[i].semantic_index,
+                      ureg->fs_input[i].interp,
+                      ureg->fs_input[i].cylindrical_wrap);
+      }
+   } else {
+      for (i = 0; i < ureg->nr_gs_inputs; i++) {
+         emit_decl_semantic(ureg,
+                            TGSI_FILE_INPUT,
+                            ureg->gs_input[i].index,
+                            ureg->gs_input[i].semantic_name,
+                            ureg->gs_input[i].semantic_index);
       }
    }
 
+   for (i = 0; i < ureg->nr_system_values; i++) {
+      emit_decl_semantic(ureg,
+                         TGSI_FILE_SYSTEM_VALUE,
+                         ureg->system_value[i].index,
+                         ureg->system_value[i].semantic_name,
+                         ureg->system_value[i].semantic_index);
+   }
+
    for (i = 0; i < ureg->nr_outputs; i++) {
-      emit_decl( ureg, 
-                 TGSI_FILE_OUTPUT, 
-                 i,
-                 ureg->output[i].semantic_name,
-                 ureg->output[i].semantic_index,
-                 TGSI_INTERPOLATE_CONSTANT );
+      emit_decl_semantic(ureg,
+                         TGSI_FILE_OUTPUT,
+                         i,
+                         ureg->output[i].semantic_name,
+                         ureg->output[i].semantic_index);
    }
 
    for (i = 0; i < ureg->nr_samplers; i++) {
@@ -991,13 +1319,29 @@ static void emit_decls( struct ureg_program *ureg )
                        ureg->sampler[i].Index, 1 );
    }
 
-   if (ureg->nr_constant_ranges) {
-      for (i = 0; i < ureg->nr_constant_ranges; i++)
-         emit_decl_range( ureg,
-                          TGSI_FILE_CONSTANT,
-                          ureg->constant_range[i].first, 
-                          (ureg->constant_range[i].last + 1 -
-                           ureg->constant_range[i].first) );
+   if (ureg->const_decls.nr_constant_ranges) {
+      for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) {
+         emit_decl_range(ureg,
+                         TGSI_FILE_CONSTANT,
+                         ureg->const_decls.constant_range[i].first,
+                         ureg->const_decls.constant_range[i].last - ureg->const_decls.constant_range[i].first + 1);
+      }
+   }
+
+   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+      struct const_decl *decl = &ureg->const_decls2D[i];
+
+      if (decl->nr_constant_ranges) {
+         uint j;
+
+         for (j = 0; j < decl->nr_constant_ranges; j++) {
+            emit_decl_range2D(ureg,
+                              TGSI_FILE_CONSTANT,
+                              decl->constant_range[j].first,
+                              decl->constant_range[j].last,
+                              i);
+         }
+      }
    }
 
    if (ureg->nr_temps) {
@@ -1028,7 +1372,8 @@ static void emit_decls( struct ureg_program *ureg )
 
    for (i = 0; i < ureg->nr_immediates; i++) {
       emit_immediate( ureg,
-                      ureg->immediate[i].v );
+                      ureg->immediate[i].value.u,
+                      ureg->immediate[i].type );
    }
 }
 
@@ -1053,7 +1398,7 @@ fixup_header_size(struct ureg_program *ureg)
 {
    union tgsi_any_token *out = retrieve_token( ureg, DOMAIN_DECL, 0 );
 
-   out->header.BodySize = ureg->domain[DOMAIN_DECL].count - 3;
+   out->header.BodySize = ureg->domain[DOMAIN_DECL].count - 2;
 }
 
 
@@ -1151,6 +1496,9 @@ struct ureg_program *ureg_create( unsigned processor )
       return NULL;
 
    ureg->processor = processor;
+   ureg->property_gs_input_prim = ~0;
+   ureg->property_gs_output_prim = ~0;
+   ureg->property_gs_max_vertices = ~0;
    return ureg;
 }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 94cc70a2082..0130a77aadb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -30,6 +30,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_shader_tokens.h"
+#include "util/u_debug.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -47,13 +48,15 @@ struct ureg_src
    unsigned SwizzleY    : 2;  /* TGSI_SWIZZLE_ */
    unsigned SwizzleZ    : 2;  /* TGSI_SWIZZLE_ */
    unsigned SwizzleW    : 2;  /* TGSI_SWIZZLE_ */
-   unsigned Pad         : 1;  /* BOOL */
    unsigned Indirect    : 1;  /* BOOL */
+   unsigned Dimension   : 1;  /* BOOL */
    unsigned Absolute    : 1;  /* BOOL */
-   int      Index       : 16; /* SINT */
    unsigned Negate      : 1;  /* BOOL */
+   int      Index       : 16; /* SINT */
+   unsigned IndirectFile    : 4;  /* TGSI_FILE_ */
    int      IndirectIndex   : 16; /* SINT */
-   int      IndirectSwizzle : 2;  /* TGSI_SWIZZLE_ */
+   unsigned IndirectSwizzle : 2;  /* TGSI_SWIZZLE_ */
+   int      DimensionIndex  : 16; /* SINT */
 };
 
 /* Very similar to a tgsi_dst_register, removing unsupported fields
@@ -118,21 +121,70 @@ ureg_create_shader_and_destroy( struct ureg_program *p,
 }
 
 
+/***********************************************************************
+ * Build shader properties:
+ */
+
+void
+ureg_property_gs_input_prim(struct ureg_program *ureg,
+                            unsigned input_prim);
+
+void
+ureg_property_gs_output_prim(struct ureg_program *ureg,
+                             unsigned output_prim);
+
+void
+ureg_property_gs_max_vertices(struct ureg_program *ureg,
+                              unsigned max_vertices);
+
+void
+ureg_property_fs_coord_origin(struct ureg_program *ureg,
+                            unsigned fs_coord_origin);
+
+void
+ureg_property_fs_coord_pixel_center(struct ureg_program *ureg,
+                            unsigned fs_coord_pixel_center);
 
 /***********************************************************************
  * Build shader declarations:
  */
 
 struct ureg_src
-ureg_DECL_fs_input( struct ureg_program *,
-                    unsigned semantic_name,
-                    unsigned semantic_index,
-                    unsigned interp_mode );
+ureg_DECL_fs_input_cyl(struct ureg_program *,
+                       unsigned semantic_name,
+                       unsigned semantic_index,
+                       unsigned interp_mode,
+                       unsigned cylindrical_wrap);
+
+static INLINE struct ureg_src   /* wrapper keeping the old 4-argument call: cylindrical_wrap is passed as 0 */
+ureg_DECL_fs_input(struct ureg_program *ureg,
+                   unsigned semantic_name,
+                   unsigned semantic_index,
+                   unsigned interp_mode)
+{
+   return ureg_DECL_fs_input_cyl(ureg,
+                                 semantic_name,
+                                 semantic_index,
+                                 interp_mode,
+                                 0);
+}
 
 struct ureg_src
 ureg_DECL_vs_input( struct ureg_program *,
                     unsigned index );
 
+struct ureg_src
+ureg_DECL_gs_input(struct ureg_program *,
+                   unsigned index,
+                   unsigned semantic_name,
+                   unsigned semantic_index);
+
+struct ureg_src
+ureg_DECL_system_value(struct ureg_program *,
+                       unsigned index,
+                       unsigned semantic_name,
+                       unsigned semantic_index);
+
 struct ureg_dst
 ureg_DECL_output( struct ureg_program *,
                   unsigned semantic_name,
@@ -144,6 +196,27 @@ ureg_DECL_immediate( struct ureg_program *,
                      unsigned nr );
 
 struct ureg_src
+ureg_DECL_immediate_uint( struct ureg_program *,
+                          const unsigned *v,
+                          unsigned nr );
+
+struct ureg_src
+ureg_DECL_immediate_block_uint( struct ureg_program *,
+                                const unsigned *v,
+                                unsigned nr );
+
+struct ureg_src
+ureg_DECL_immediate_int( struct ureg_program *,
+                         const int *v,
+                         unsigned nr );
+
+void
+ureg_DECL_constant2D(struct ureg_program *ureg,
+                     unsigned first,
+                     unsigned last,
+                     unsigned index2D);
+
+struct ureg_src
 ureg_DECL_constant( struct ureg_program *,
                     unsigned index );
 
@@ -217,6 +290,90 @@ ureg_imm1f( struct ureg_program *ureg,
    return ureg_DECL_immediate( ureg, v, 1 );
 }
 
+static INLINE struct ureg_src   /* four-component unsigned immediate built from scalar arguments */
+ureg_imm4u( struct ureg_program *ureg,
+            unsigned a, unsigned b,
+            unsigned c, unsigned d)
+{
+   unsigned v[4];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   v[3] = d;
+   return ureg_DECL_immediate_uint( ureg, v, 4 );
+}
+
+static INLINE struct ureg_src   /* three-component unsigned immediate built from scalar arguments */
+ureg_imm3u( struct ureg_program *ureg,
+            unsigned a, unsigned b,
+            unsigned c)
+{
+   unsigned v[3];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   return ureg_DECL_immediate_uint( ureg, v, 3 );
+}
+
+static INLINE struct ureg_src   /* two-component unsigned immediate built from scalar arguments */
+ureg_imm2u( struct ureg_program *ureg,
+            unsigned a, unsigned b)
+{
+   unsigned v[2];
+   v[0] = a;
+   v[1] = b;
+   return ureg_DECL_immediate_uint( ureg, v, 2 );
+}
+
+static INLINE struct ureg_src   /* single-component unsigned immediate */
+ureg_imm1u( struct ureg_program *ureg,
+            unsigned a)
+{
+   return ureg_DECL_immediate_uint( ureg, &a, 1 );   /* the by-value parameter serves as a 1-element array */
+}
+
+static INLINE struct ureg_src   /* four-component signed immediate built from scalar arguments */
+ureg_imm4i( struct ureg_program *ureg,
+            int a, int b,
+            int c, int d)
+{
+   int v[4];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   v[3] = d;
+   return ureg_DECL_immediate_int( ureg, v, 4 );
+}
+
+static INLINE struct ureg_src   /* three-component signed immediate built from scalar arguments */
+ureg_imm3i( struct ureg_program *ureg,
+            int a, int b,
+            int c)
+{
+   int v[3];
+   v[0] = a;
+   v[1] = b;
+   v[2] = c;
+   return ureg_DECL_immediate_int( ureg, v, 3 );
+}
+
+static INLINE struct ureg_src   /* two-component signed immediate built from scalar arguments */
+ureg_imm2i( struct ureg_program *ureg,
+            int a, int b)
+{
+   int v[2];
+   v[0] = a;
+   v[1] = b;
+   return ureg_DECL_immediate_int( ureg, v, 2 );
+}
+
+static INLINE struct ureg_src   /* single-component signed immediate */
+ureg_imm1i( struct ureg_program *ureg,
+            int a)
+{
+   return ureg_DECL_immediate_int( ureg, &a, 1 );   /* the by-value parameter serves as a 1-element array */
+}
+
 /***********************************************************************
  * Functions for patching up labels
  */
@@ -655,18 +812,30 @@ static INLINE struct ureg_src
 ureg_src_indirect( struct ureg_src reg, struct ureg_src addr )
 {
    assert(reg.File != TGSI_FILE_NULL);
-   assert(addr.File == TGSI_FILE_ADDRESS);
+   assert(addr.File == TGSI_FILE_ADDRESS || addr.File == TGSI_FILE_TEMPORARY);
    reg.Indirect = 1;
+   reg.IndirectFile = addr.File;
    reg.IndirectIndex = addr.Index;
    reg.IndirectSwizzle = addr.SwizzleX;
    return reg;
 }
 
+static INLINE struct ureg_src   /* returns a copy of reg with the Dimension flag set and DimensionIndex = index */
+ureg_src_dimension( struct ureg_src reg, int index )
+{
+   assert(reg.File != TGSI_FILE_NULL);
+   reg.Dimension = 1;   /* makes ureg_emit_src() append a dimension token for this operand */
+   reg.DimensionIndex = index;   /* stored in a 16-bit signed bit-field of struct ureg_src */
+   return reg;
+}
+
 static INLINE struct ureg_dst
 ureg_dst( struct ureg_src src )
 {
    struct ureg_dst dst;
 
+   assert(!src.Indirect || src.IndirectFile == TGSI_FILE_ADDRESS);
+
    dst.File      = src.File;
    dst.WriteMask = TGSI_WRITEMASK_XYZW;
    dst.Indirect  = src.Indirect;
@@ -685,6 +854,30 @@ ureg_dst( struct ureg_src src )
 }
 
 static INLINE struct ureg_src
+ureg_src_register(unsigned file,   /* builds a plain source operand for register `index` of `file` */
+                  unsigned index)
+{
+   struct ureg_src src;
+
+   src.File = file;
+   src.SwizzleX = TGSI_SWIZZLE_X;   /* identity swizzle: each component selects itself */
+   src.SwizzleY = TGSI_SWIZZLE_Y;
+   src.SwizzleZ = TGSI_SWIZZLE_Z;
+   src.SwizzleW = TGSI_SWIZZLE_W;
+   src.Indirect = 0;   /* no indirect addressing; the Indirect* fields are cleared as well */
+   src.IndirectFile = TGSI_FILE_NULL;
+   src.IndirectIndex = 0;
+   src.IndirectSwizzle = 0;
+   src.Absolute = 0;
+   src.Index = index;   /* Index is a 16-bit signed bit-field of struct ureg_src */
+   src.Negate = 0;
+   src.Dimension = 0;   /* no second dimension */
+   src.DimensionIndex = 0;
+
+   return src;
+}
+
+static INLINE struct ureg_src
 ureg_src( struct ureg_dst dst )
 {
    struct ureg_src src;
@@ -694,13 +887,15 @@ ureg_src( struct ureg_dst dst )
    src.SwizzleY  = TGSI_SWIZZLE_Y;
    src.SwizzleZ  = TGSI_SWIZZLE_Z;
    src.SwizzleW  = TGSI_SWIZZLE_W;
-   src.Pad       = 0;
    src.Indirect  = dst.Indirect;
+   src.IndirectFile = TGSI_FILE_ADDRESS;
    src.IndirectIndex = dst.IndirectIndex;
    src.IndirectSwizzle = dst.IndirectSwizzle;
    src.Absolute  = 0;
    src.Index     = dst.Index;
    src.Negate    = 0;
+   src.Dimension = 0;
+   src.DimensionIndex = 0;
 
    return src;
 }
@@ -739,13 +934,15 @@ ureg_src_undef( void )
    src.SwizzleY  = 0;
    src.SwizzleZ  = 0;
    src.SwizzleW  = 0;
-   src.Pad       = 0;
    src.Indirect  = 0;
+   src.IndirectFile = TGSI_FILE_NULL;
    src.IndirectIndex = 0;
    src.IndirectSwizzle = 0;
    src.Absolute  = 0;
    src.Index     = 0;
    src.Negate    = 0;
+   src.Dimension = 0;
+   src.DimensionIndex = 0;
    
    return src;
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index f4ca9e21ed9..0a7e4105a80 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -28,7 +28,6 @@
 #include "util/u_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
-#include "tgsi_build.h"
 #include "tgsi_util.h"
 
 union pointer_hack
diff --git a/src/gallium/auxiliary/translate/Makefile b/src/gallium/auxiliary/translate/Makefile
deleted file mode 100644
index 3c82f8ae037..00000000000
--- a/src/gallium/auxiliary/translate/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = translate
-
-C_SOURCES = \
-	translate_generic.c \
-	translate_sse.c \
-	translate.c \
-        translate_cache.c
-
-include ../../Makefile.template
diff --git a/src/gallium/auxiliary/translate/SConscript b/src/gallium/auxiliary/translate/SConscript
deleted file mode 100644
index 9553a675372..00000000000
--- a/src/gallium/auxiliary/translate/SConscript
+++ /dev/null
@@ -1,12 +0,0 @@
-Import('*')
-
-translate = env.ConvenienceLibrary(
-	target = 'translate',
-	source = [
-		'translate_generic.c',
-		'translate_sse.c',
-		'translate.c',
-		'translate_cache.c',
-	])
-
-auxiliaries.insert(0, translate)
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index 34526eb0617..54ed2c1a4be 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -44,12 +44,19 @@
 #include "pipe/p_format.h"
 #include "pipe/p_state.h"
 
+enum translate_element_type {   /* selects where generic_run() gets an element's data from */
+   TRANSLATE_ELEMENT_NORMAL,   /* fetched from the element's input buffer */
+   TRANSLATE_ELEMENT_INSTANCE_ID   /* nothing is fetched; generic_run() stores (float)instance_id in component 0 */
+};
+
 struct translate_element 
 {
+   enum translate_element_type type;   /* buffer fetch or instance id, see enum translate_element_type */
    enum pipe_format input_format;
    enum pipe_format output_format;
    unsigned input_buffer:8;
    unsigned input_offset:24;
+   unsigned instance_divisor;   /* 0: source indexed by vertex element; else by instance_id / instance_divisor */
    unsigned output_offset;
 };
 
@@ -74,11 +81,13 @@ struct translate {
    void (PIPE_CDECL *run_elts)( struct translate *,
                                 const unsigned *elts,
                                 unsigned count,
+                                unsigned instance_id,
                                 void *output_buffer);
 
    void (PIPE_CDECL *run)( struct translate *,
                            unsigned start,
                            unsigned count,
+                           unsigned instance_id,
                            void *output_buffer);
 };
 
@@ -103,8 +112,13 @@ static INLINE int translate_keysize( const struct translate_key *key )
 static INLINE int translate_key_compare( const struct translate_key *a,
                                          const struct translate_key *b )
 {
-   int keysize = translate_keysize(a);
-   return memcmp(a, b, keysize);
+   int keysize_a = translate_keysize(a);
+   int keysize_b = translate_keysize(b);
+   /* Differently sized keys are ordered by size alone; memcmp below only runs when both sizes agree. */
+   if (keysize_a != keysize_b) {
+      return keysize_a - keysize_b;
+   }
+   return memcmp(a, b, keysize_a);
 }
 
 
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 266e7ee81e6..c9ec2b32bfe 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -46,9 +46,12 @@ struct translate_generic {
    struct translate translate;
 
    struct {
+      enum translate_element_type type;
+
       fetch_func fetch;
       unsigned buffer;
       unsigned input_offset;
+      unsigned instance_divisor;
 
       emit_func emit;
       unsigned output_offset;
@@ -389,10 +392,10 @@ static fetch_func get_fetch_func( enum pipe_format format )
    case PIPE_FORMAT_R8G8B8A8_SSCALED:
       return &fetch_R8G8B8A8_SSCALED;
 
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       return &fetch_A8R8G8B8_UNORM;
 
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       return &fetch_B8G8R8A8_UNORM;
 
    case PIPE_FORMAT_R32_FIXED:
@@ -548,10 +551,10 @@ static emit_func get_emit_func( enum pipe_format format )
    case PIPE_FORMAT_R8G8B8A8_SSCALED:
       return &emit_R8G8B8A8_SSCALED;
 
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       return &emit_A8R8G8B8_UNORM;
 
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       return &emit_B8G8R8A8_UNORM;
 
    default:
@@ -568,6 +571,7 @@ static emit_func get_emit_func( enum pipe_format format )
 static void PIPE_CDECL generic_run_elts( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
+                                         unsigned instance_id,
                                          void *output_buffer )
 {
    struct translate_generic *tg = translate_generic(translate);
@@ -583,13 +587,20 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
 
       for (attr = 0; attr < nr_attrs; attr++) {
 	 float data[4];
-
-	 const char *src = (tg->attrib[attr].input_ptr + 
-			    tg->attrib[attr].input_stride * elt);
+         const char *src;
 
 	 char *dst = (vert + 
 		      tg->attrib[attr].output_offset);
 
+         if (tg->attrib[attr].instance_divisor) {
+            src = tg->attrib[attr].input_ptr +
+                  tg->attrib[attr].input_stride *
+                  (instance_id / tg->attrib[attr].instance_divisor);
+         } else {
+            src = tg->attrib[attr].input_ptr +
+                  tg->attrib[attr].input_stride * elt;
+         }
+
 	 tg->attrib[attr].fetch( src, data );
 
          if (0) debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
@@ -607,6 +618,7 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
 static void PIPE_CDECL generic_run( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
+                                    unsigned instance_id,
                                     void *output_buffer )
 {
    struct translate_generic *tg = translate_generic(translate);
@@ -623,13 +635,25 @@ static void PIPE_CDECL generic_run( struct translate *translate,
       for (attr = 0; attr < nr_attrs; attr++) {
 	 float data[4];
 
-	 const char *src = (tg->attrib[attr].input_ptr + 
-			    tg->attrib[attr].input_stride * elt);
-
 	 char *dst = (vert + 
 		      tg->attrib[attr].output_offset);
 
-	 tg->attrib[attr].fetch( src, data );
+         if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
+            const char *src;
+
+            if (tg->attrib[attr].instance_divisor) {
+               src = tg->attrib[attr].input_ptr +
+                     tg->attrib[attr].input_stride *
+                     (instance_id / tg->attrib[attr].instance_divisor);
+            } else {
+               src = tg->attrib[attr].input_ptr +
+                     tg->attrib[attr].input_stride * elt;
+            }
+
+            tg->attrib[attr].fetch( src, data );
+         } else {
+            data[0] = (float)instance_id;
+         }
 
          if (0) debug_printf("vert %d attr %d: %f %f %f %f\n",
                              i, attr, data[0], data[1], data[2], data[3]);
@@ -683,10 +707,12 @@ struct translate *translate_generic_create( const struct translate_key *key )
    tg->translate.run = generic_run;
 
    for (i = 0; i < key->nr_elements; i++) {
+      tg->attrib[i].type = key->element[i].type;
 
       tg->attrib[i].fetch = get_fetch_func(key->element[i].input_format);
       tg->attrib[i].buffer = key->element[i].input_buffer;
       tg->attrib[i].input_offset = key->element[i].input_offset;
+      tg->attrib[i].instance_divisor = key->element[i].instance_divisor;
 
       tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
       tg->attrib[i].output_offset = key->element[i].output_offset;
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index b62db8d8f33..03e093c11ea 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -49,19 +49,29 @@
 typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                      unsigned start,
                                      unsigned count,
-                                     void *output_buffer );
+                                     unsigned instance_id,
+                                     void *output_buffer);
 
 typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                           const unsigned *elts,
                                           unsigned count,
-                                          void *output_buffer );
+                                          unsigned instance_id,
+                                          void *output_buffer);
 
 struct translate_buffer {
    const void *base_ptr;
    unsigned stride;
-   void *ptr;                   /* updated per vertex */
 };
 
+struct translate_buffer_varient {   /* one (buffer, instance divisor) pairing used by elements */
+   unsigned buffer_index;        /* index into translate_sse::buffer[] */
+   unsigned instance_divisor;    /* 0: indexed per vertex; else by instance_id / divisor */
+   void *ptr;                    /* updated either per vertex or per instance */
+};
+
+/* Stored in element_to_buffer_varient[] for elements that read the instance ID. */
+#define ELEMENT_BUFFER_INSTANCE_ID  1001
+
 
 struct translate_sse {
    struct translate translate;
@@ -81,6 +91,16 @@ struct translate_sse {
    struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
    unsigned nr_buffers;
 
+   /* Multiple buffer varients can map to a single buffer. */
+   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
+   unsigned nr_buffer_varients;
+
+   /* Multiple elements can map to a single buffer varient. */
+   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];
+
+   boolean use_instancing;
+   unsigned instance_id;
+
    run_func      gen_run;
    run_elts_func gen_run_elts;
 
@@ -316,7 +336,7 @@ static boolean translate_attr( struct translate_sse *p,
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
       emit_load_R32G32B32A32(p, dataXMM, srcECX);
       break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
       emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
       break;
@@ -340,7 +360,7 @@ static boolean translate_attr( struct translate_sse *p,
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
       emit_store_R32G32B32A32(p, dstEAX, dataXMM);
       break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
       emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
       break;
@@ -359,32 +379,61 @@ static boolean init_inputs( struct translate_sse *p,
                             boolean linear )
 {
    unsigned i;
-   if (linear) {
-      for (i = 0; i < p->nr_buffers; i++) {
+   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
+                                              get_offset(p, &p->instance_id));
+
+   for (i = 0; i < p->nr_buffer_varients; i++) {
+      struct translate_buffer_varient *varient = &p->buffer_varient[i];
+      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
+
+      if (linear || varient->instance_divisor) {
          struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
-                                                     get_offset(p, &p->buffer[i].stride));
+                                                     get_offset(p, &buffer->stride));
          struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
-                                                     get_offset(p, &p->buffer[i].ptr));
+                                                     get_offset(p, &varient->ptr));
          struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
-                                                     get_offset(p, &p->buffer[i].base_ptr));
+                                                     get_offset(p, &buffer->base_ptr));
          struct x86_reg elt = p->idx_EBX;
-         struct x86_reg tmp = p->tmp_EAX;
-
+         struct x86_reg tmp_EAX = p->tmp_EAX;
 
          /* Calculate pointer to first attrib:
+          *   base_ptr + stride * index, where index depends on instance divisor
           */
-         x86_mov(p->func, tmp, buf_stride);
-         x86_imul(p->func, tmp, elt);
-         x86_add(p->func, tmp, buf_base_ptr);
+         if (varient->instance_divisor) {
+            /* Our index is instance ID divided by instance divisor.
+             */
+            x86_mov(p->func, tmp_EAX, instance_id);
+
+            if (varient->instance_divisor != 1) {
+               struct x86_reg tmp_EDX = p->machine_EDX;
+               struct x86_reg tmp_ECX = p->outbuf_ECX;
+
+               /* TODO: Add x86_shr() to rtasm and use it whenever
+                *       instance divisor is power of two.
+                */
+
+               x86_push(p->func, tmp_EDX);
+               x86_push(p->func, tmp_ECX);
+               x86_xor(p->func, tmp_EDX, tmp_EDX);
+               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
+               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
+               x86_pop(p->func, tmp_ECX);
+               x86_pop(p->func, tmp_EDX);
+            }
+         } else {
+            x86_mov(p->func, tmp_EAX, elt);
+         }
+         x86_imul(p->func, tmp_EAX, buf_stride);
+         x86_add(p->func, tmp_EAX, buf_base_ptr);
 
 
          /* In the linear case, keep the buffer pointer instead of the
           * index number.
           */
-         if (p->nr_buffers == 1) 
-            x86_mov( p->func, elt, tmp );
+         if (linear && p->nr_buffer_varients == 1)
+            x86_mov(p->func, elt, tmp_EAX);
          else
-            x86_mov( p->func, buf_ptr, tmp );
+            x86_mov(p->func, buf_ptr, tmp_EAX);
       }
    }
 
@@ -394,31 +443,36 @@ static boolean init_inputs( struct translate_sse *p,
 
 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                       boolean linear,
-                                      unsigned buf_idx,
+                                      unsigned var_idx,
                                       struct x86_reg elt )
 {
-   if (linear && p->nr_buffers == 1) {
+   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
+      return x86_make_disp(p->machine_EDX,
+                           get_offset(p, &p->instance_id));
+   }
+   if (linear && p->nr_buffer_varients == 1) {
       return p->idx_EBX;
    }
-   else if (linear) {
+   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
       struct x86_reg ptr = p->tmp_EAX;
       struct x86_reg buf_ptr = 
          x86_make_disp(p->machine_EDX, 
-                       get_offset(p, &p->buffer[buf_idx].ptr));
+                       get_offset(p, &p->buffer_varient[var_idx].ptr));
       
       x86_mov(p->func, ptr, buf_ptr);
       return ptr;
    }
    else {
       struct x86_reg ptr = p->tmp_EAX;
+      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
 
       struct x86_reg buf_stride = 
          x86_make_disp(p->machine_EDX, 
-                       get_offset(p, &p->buffer[buf_idx].stride));
+                       get_offset(p, &p->buffer[varient->buffer_index].stride));
 
       struct x86_reg buf_base_ptr = 
          x86_make_disp(p->machine_EDX, 
-                       get_offset(p, &p->buffer[buf_idx].base_ptr));
+                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
 
 
 
@@ -436,28 +490,33 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p,
 static boolean incr_inputs( struct translate_sse *p, 
                             boolean linear )
 {
-   if (linear && p->nr_buffers == 1) {
+   if (linear && p->nr_buffer_varients == 1) {
       struct x86_reg stride = x86_make_disp(p->machine_EDX,
                                             get_offset(p, &p->buffer[0].stride));
 
-      x86_add(p->func, p->idx_EBX, stride);
-      sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+      if (p->buffer_varient[0].instance_divisor == 0) {
+         x86_add(p->func, p->idx_EBX, stride);
+         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+      }
    }
    else if (linear) {
       unsigned i;
 
       /* Is this worthwhile??
        */
-      for (i = 0; i < p->nr_buffers; i++) {
+      for (i = 0; i < p->nr_buffer_varients; i++) {
+         struct translate_buffer_varient *varient = &p->buffer_varient[i];
          struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
-                                                get_offset(p, &p->buffer[i].ptr));
+                                                get_offset(p, &varient->ptr));
          struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
-                                                   get_offset(p, &p->buffer[i].stride));
+                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));
 
-         x86_mov(p->func, p->tmp_EAX, buf_ptr);
-         x86_add(p->func, p->tmp_EAX, buf_stride);
-         if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
-         x86_mov(p->func, buf_ptr, p->tmp_EAX);
+         if (varient->instance_divisor == 0) {
+            x86_mov(p->func, p->tmp_EAX, buf_ptr);
+            x86_add(p->func, p->tmp_EAX, buf_stride);
+            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+            x86_mov(p->func, buf_ptr, p->tmp_EAX);
+         }
       }
    } 
    else {
@@ -514,7 +573,18 @@ static boolean build_vertex_emit( struct translate_sse *p,
    x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
    x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
    x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
-   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4));
+   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
+
+   /* Load instance ID.
+    */
+   if (p->use_instancing) {
+      x86_mov(p->func,
+              p->tmp_EAX,
+              x86_fn_arg(p->func, 4));
+      x86_mov(p->func,
+              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
+              p->tmp_EAX);
+   }
 
    /* Get vertex count, compare to zero
     */
@@ -531,17 +601,18 @@ static boolean build_vertex_emit( struct translate_sse *p,
    label = x86_get_label(p->func);
    {
       struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
-      int last_vb = -1;
+      int last_varient = -1;
       struct x86_reg vb;
 
       for (j = 0; j < p->translate.key.nr_elements; j++) {
          const struct translate_element *a = &p->translate.key.element[j];
+         unsigned varient = p->element_to_buffer_varient[j];
 
          /* Figure out source pointer address:
           */
-         if (a->input_buffer != last_vb) {
-            last_vb = a->input_buffer;
-            vb = get_buffer_ptr(p, linear, a->input_buffer, elt);
+         if (varient != last_varient) {
+            last_varient = varient;
+            vb = get_buffer_ptr(p, linear, varient, elt);
          }
          
          if (!translate_attr( p, a, 
@@ -624,6 +695,7 @@ static void translate_sse_release( struct translate *translate )
 static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
 			      const unsigned *elts,
 			      unsigned count,
+                              unsigned instance_id,
 			      void *output_buffer )
 {
    struct translate_sse *p = (struct translate_sse *)translate;
@@ -631,12 +703,14 @@ static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
    p->gen_run_elts( translate,
 		    elts,
 		    count,
-		    output_buffer );
+                    instance_id,
+                    output_buffer);
 }
 
 static void PIPE_CDECL translate_sse_run( struct translate *translate,
 			 unsigned start,
 			 unsigned count,
+                         unsigned instance_id,
 			 void *output_buffer )
 {
    struct translate_sse *p = (struct translate_sse *)translate;
@@ -644,7 +718,8 @@ static void PIPE_CDECL translate_sse_run( struct translate *translate,
    p->gen_run( translate,
 	       start,
 	       count,
-	       output_buffer );
+               instance_id,
+               output_buffer);
 }
 
 
@@ -666,8 +741,37 @@ struct translate *translate_sse2_create( const struct translate_key *key )
    p->translate.run_elts = translate_sse_run_elts;
    p->translate.run = translate_sse_run;
 
-   for (i = 0; i < key->nr_elements; i++) 
-      p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 );
+   for (i = 0; i < key->nr_elements; i++) {
+      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
+         unsigned j;
+
+         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
+
+         if (key->element[i].instance_divisor) {
+            p->use_instancing = TRUE;
+         }
+
+         /*
+          * Map vertex element to vertex buffer varient.
+          */
+         for (j = 0; j < p->nr_buffer_varients; j++) {
+            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
+                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
+               break;
+            }
+         }
+         if (j == p->nr_buffer_varients) {
+            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
+            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
+            p->nr_buffer_varients++;
+         }
+         p->element_to_buffer_varient[i] = j;
+      } else {
+         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
+
+         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
+      }
+   }
 
    if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
 
diff --git a/src/gallium/auxiliary/util/.gitignore b/src/gallium/auxiliary/util/.gitignore
index 29c586c9b51..448d2f304fb 100644
--- a/src/gallium/auxiliary/util/.gitignore
+++ b/src/gallium/auxiliary/util/.gitignore
@@ -1,2 +1,3 @@
 u_format_access.c
 u_format_table.c
+u_format_pack.h
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
deleted file mode 100644
index 1d8bb55bbd6..00000000000
--- a/src/gallium/auxiliary/util/Makefile
+++ /dev/null
@@ -1,45 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = util
-
-C_SOURCES = \
-	u_debug.c \
-	u_debug_dump.c \
-	u_debug_symbol.c \
-	u_debug_stack.c \
-	u_blit.c \
-	u_cache.c \
-	u_cpu_detect.c \
-	u_draw_quad.c \
-	u_format.c \
-	u_format_access.c \
-	u_format_table.c \
-	u_gen_mipmap.c \
-	u_handle_table.c \
-	u_hash_table.c \
-	u_hash.c \
-	u_keymap.c \
-	u_linear.c \
-	u_network.c \
-	u_math.c \
-	u_mm.c \
-	u_rect.c \
-	u_simple_shaders.c \
-	u_snprintf.c \
-	u_stream_stdc.c \
-	u_stream_wd.c \
-	u_surface.c \
-	u_tile.c \
-	u_time.c \
-	u_timed_winsys.c \
-	u_upload_mgr.c \
-	u_simple_screen.c
-
-include ../../Makefile.template
-
-u_format_table.c: u_format_table.py u_format_parse.py u_format.csv
-	python u_format_table.py u_format.csv > $@
-
-u_format_access.c: u_format_access.py u_format_parse.py u_format.csv
-	python u_format_access.py u_format.csv > $@
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
deleted file mode 100644
index 8d99106d0b8..00000000000
--- a/src/gallium/auxiliary/util/SConscript
+++ /dev/null
@@ -1,58 +0,0 @@
-Import('*')
-
-env.Clone()
-
-env.Append(CPPPATH = ['.'])
-
-env.CodeGenerate(
-	target = 'u_format_table.c',
-	script = 'u_format_table.py',
-	source = ['u_format.csv'],
-	command = 'python $SCRIPT $SOURCE > $TARGET'
-)
-
-env.CodeGenerate(
-	target = 'u_format_access.c',
-	script = 'u_format_access.py',
-	source = ['u_format.csv'],
-	command = 'python $SCRIPT $SOURCE > $TARGET'
-)
-
-util = env.ConvenienceLibrary(
-	target = 'util',
-	source = [
-		'u_bitmask.c',
-		'u_blit.c',
-		'u_cache.c',
-		'u_cpu_detect.c',
-		'u_debug.c',
-		'u_debug_dump.c',
-		'u_debug_memory.c',
-		'u_debug_stack.c',
-		'u_debug_symbol.c',
-		'u_draw_quad.c',
-		'u_format.c',
-		'u_format_access.c',
-		'u_format_table.c',
-		'u_gen_mipmap.c',
-		'u_handle_table.c',
-		'u_hash.c',
-		'u_hash_table.c',
-		'u_keymap.c',
-		'u_network.c',
-		'u_math.c',
-		'u_mm.c',
-		'u_rect.c',
-		'u_simple_shaders.c',
-		'u_snprintf.c',
-		'u_stream_stdc.c',
-		'u_stream_wd.c',
-		'u_surface.c',
-		'u_tile.c',
-		'u_time.c',
-		'u_timed_winsys.c',
-		'u_upload_mgr.c',
-		'u_simple_screen.c',
-	])
-
-auxiliaries.insert(0, util)
diff --git a/src/gallium/auxiliary/util/u_atomic.h b/src/gallium/auxiliary/util/u_atomic.h
new file mode 100644
index 00000000000..3c42477ad4f
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_atomic.h
@@ -0,0 +1,305 @@
+/**
+ * Many similar implementations exist. See for example libwsbm
+ * or the linux kernel include/atomic.h
+ *
+ * No copyright claimed on this file.
+ *
+ */
+
+#ifndef U_ATOMIC_H
+#define U_ATOMIC_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+
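+/* Favor OS-provided implementations.
+ *
+ * Where no OS-provided implementation is available, fall back to
+ * compiler intrinsics or locally coded assembly.  The first matching
+ * test below wins (so the MSVC x86 assembly branch is shadowed by the
+ * plain MSVC test); any other platform is rejected with #error. */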
+#if (defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || \
+     defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT))
+#define PIPE_ATOMIC_OS_UNLOCKED
+#elif defined(PIPE_OS_SOLARIS)
+#define PIPE_ATOMIC_OS_SOLARIS
+#elif defined(PIPE_CC_MSVC)
+#define PIPE_ATOMIC_MSVC_INTRINSIC
+#elif (defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86))
+#define PIPE_ATOMIC_ASM_MSVC_X86                
+#elif (defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86))
+#define PIPE_ATOMIC_ASM_GCC_X86
+#elif defined(PIPE_CC_GCC)
+#define PIPE_ATOMIC_GCC_INTRINSIC
+#else
+#error "Unsupported platform"
+#endif
+
+
+
+#if defined(PIPE_ATOMIC_ASM_GCC_X86)
+
+#define PIPE_ATOMIC "GCC x86 assembly"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define p_atomic_set(_v, _i) (*(_v) = (_i))
+#define p_atomic_read(_v) (*(_v))
+
+static INLINE boolean
+p_atomic_dec_zero(int32_t *v)
+{
+   unsigned char c;
+   /* Locked decrement of *v; sete records in c whether the result was zero. */
+   __asm__ __volatile__("lock; decl %0; sete %1":"+m"(*v), "=qm"(c)
+			::"memory");
+
+   return c != 0;
+}
+
+static INLINE void
+p_atomic_inc(int32_t *v)
+{
+   __asm__ __volatile__("lock; incl %0":"+m"(*v));
+}
+
+static INLINE void
+p_atomic_dec(int32_t *v)
+{
+   __asm__ __volatile__("lock; decl %0":"+m"(*v));
+}
+
+static INLINE int32_t
+p_atomic_cmpxchg(int32_t *v, int32_t old, int32_t _new)
+{
+   return __sync_val_compare_and_swap(v, old, _new);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+
+
+/* Implementation using GCC-provided synchronization intrinsics
+ */
+#if defined(PIPE_ATOMIC_GCC_INTRINSIC)
+
+#define PIPE_ATOMIC "GCC Sync Intrinsics"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define p_atomic_set(_v, _i) (*(_v) = (_i))
+#define p_atomic_read(_v) (*(_v))
+
+static INLINE boolean
+p_atomic_dec_zero(int32_t *v)
+{
+   return (__sync_sub_and_fetch(v, 1) == 0);
+}
+
+static INLINE void
+p_atomic_inc(int32_t *v)
+{
+   (void) __sync_add_and_fetch(v, 1);
+}
+
+static INLINE void
+p_atomic_dec(int32_t *v)
+{
+   (void) __sync_sub_and_fetch(v, 1);
+}
+
+static INLINE int32_t
+p_atomic_cmpxchg(int32_t *v, int32_t old, int32_t _new)
+{
+   return __sync_val_compare_and_swap(v, old, _new);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+
+
+/* Unlocked version for single threaded environments, such as some
+ * windows kernel modules.
+ */
+#if defined(PIPE_ATOMIC_OS_UNLOCKED) 
+
+#define PIPE_ATOMIC "Unlocked"
+
+#define p_atomic_set(_v, _i) (*(_v) = (_i))
+#define p_atomic_read(_v) (*(_v))
+#define p_atomic_dec_zero(_v) ((boolean) (--(*(_v)) == 0))   /* TRUE once the count reaches zero */
+#define p_atomic_inc(_v) ((void) (*(_v))++)
+#define p_atomic_dec(_v) ((void) (*(_v))--)
+#define p_atomic_cmpxchg(_v, _old, _new) (*(_v) == (_old) ? (*(_v) = (_new), (_old)) : *(_v))   /* yields the prior value */
+
+#endif
+
+
+/* Locally coded assembly for MSVC on x86:
+ */
+#if defined(PIPE_ATOMIC_ASM_MSVC_X86)
+
+#define PIPE_ATOMIC "MSVC x86 assembly"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define p_atomic_set(_v, _i) (*(_v) = (_i))
+#define p_atomic_read(_v) (*(_v))
+
+static INLINE boolean
+p_atomic_dec_zero(int32_t *v)
+{
+   unsigned char c;
+
+   __asm {
+      mov       eax, [v]
+      lock dec  dword ptr [eax]
+      sete      byte ptr [c]
+   }
+
+   return c != 0;
+}
+
+static INLINE void
+p_atomic_inc(int32_t *v)
+{
+   __asm {
+      mov       eax, [v]
+      lock inc  dword ptr [eax]
+   }
+}
+
+static INLINE void
+p_atomic_dec(int32_t *v)
+{
+   __asm {
+      mov       eax, [v]
+      lock dec  dword ptr [eax]
+   }
+}
+
+static INLINE int32_t
+p_atomic_cmpxchg(int32_t *v, int32_t old, int32_t _new)
+{
+   int32_t orig;
+   /* cmpxchg compares EAX with *v and leaves the value it found there in EAX. */
+   __asm {
+      mov ecx, [v]
+      mov eax, [old]
+      mov edx, [_new]
+      lock cmpxchg [ecx], edx
+      mov [orig], eax
+   }
+
+   return orig;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+
+#if defined(PIPE_ATOMIC_MSVC_INTRINSIC)
+
+#define PIPE_ATOMIC "MSVC Intrinsics"
+
+#include <intrin.h>
+
+#pragma intrinsic(_InterlockedIncrement)
+#pragma intrinsic(_InterlockedDecrement)
+#pragma intrinsic(_InterlockedCompareExchange)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define p_atomic_set(_v, _i) (*(_v) = (_i))
+#define p_atomic_read(_v) (*(_v))
+
+static INLINE boolean
+p_atomic_dec_zero(int32_t *v)
+{
+   return _InterlockedDecrement((long *)v) == 0;
+}
+
+static INLINE void
+p_atomic_inc(int32_t *v)
+{
+   _InterlockedIncrement((long *)v);
+}
+
+static INLINE void
+p_atomic_dec(int32_t *v)
+{
+   _InterlockedDecrement((long *)v);
+}
+
+static INLINE int32_t
+p_atomic_cmpxchg(int32_t *v, int32_t old, int32_t _new)
+{
+   return _InterlockedCompareExchange((long *)v, _new, old);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#if defined(PIPE_ATOMIC_OS_SOLARIS)
+
+#define PIPE_ATOMIC "Solaris OS atomic functions"
+
+#include <atomic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define p_atomic_set(_v, _i) (*(_v) = (_i))
+#define p_atomic_read(_v) (*(_v))
+
+static INLINE boolean
+p_atomic_dec_zero(int32_t *v)
+{
+   uint32_t n = atomic_dec_32_nv((uint32_t *) v);   /* n = value after the decrement */
+   /* Report TRUE only when the count reached zero, like the other variants. */
+   return n == 0;
+}
+
+#define p_atomic_inc(_v) atomic_inc_32((uint32_t *) (_v))
+#define p_atomic_dec(_v) atomic_dec_32((uint32_t *) (_v))
+/* Arguments are parenthesized so each cast applies to the whole expression. */
+#define p_atomic_cmpxchg(_v, _old, _new) \
+	atomic_cas_32( (uint32_t *) (_v), (uint32_t) (_old), (uint32_t) (_new))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+
+#ifndef PIPE_ATOMIC
+#error "No pipe_atomic implementation selected"
+#endif
+
+
+
+#endif /* U_ATOMIC_H */
diff --git a/src/gallium/auxiliary/util/u_bitmask.c b/src/gallium/auxiliary/util/u_bitmask.c
index 77587c07ec0..23c93a3ebcb 100644
--- a/src/gallium/auxiliary/util/u_bitmask.c
+++ b/src/gallium/auxiliary/util/u_bitmask.c
@@ -97,12 +97,12 @@ util_bitmask_resize(struct util_bitmask *bm,
    if(!minimum_size)
       return FALSE;
       
-   if(bm->size > minimum_size)
+   if(bm->size >= minimum_size)
       return TRUE;
 
    assert(bm->size % UTIL_BITMASK_BITS_PER_WORD == 0);
    new_size = bm->size;
-   while(!(new_size > minimum_size)) {
+   while(new_size < minimum_size) {
       new_size *= 2;
       /* Check integer overflow */
       if(new_size < bm->size)
@@ -136,7 +136,7 @@ util_bitmask_filled_set(struct util_bitmask *bm,
                         unsigned index)
 {
    assert(bm->filled <= bm->size);
-   assert(index <= bm->size);
+   assert(index < bm->size);
    
    if(index == bm->filled) {
       ++bm->filled;
@@ -149,7 +149,7 @@ util_bitmask_filled_unset(struct util_bitmask *bm,
                           unsigned index)
 {
    assert(bm->filled <= bm->size);
-   assert(index <= bm->size);
+   assert(index < bm->size);
    
    if(index < bm->filled)
       bm->filled = index;
@@ -182,7 +182,7 @@ util_bitmask_add(struct util_bitmask *bm)
       mask = 1;
    }
 found:
-   
+
    /* grow the bitmask if necessary */
    if(!util_bitmask_resize(bm, bm->filled))
       return UTIL_BITMASK_INVALID_INDEX;
@@ -198,9 +198,9 @@ unsigned
 util_bitmask_set(struct util_bitmask *bm, 
                  unsigned index)
 {
-   unsigned word = index / UTIL_BITMASK_BITS_PER_WORD;
-   unsigned bit  = index % UTIL_BITMASK_BITS_PER_WORD;
-   util_bitmask_word mask = 1 << bit;
+   unsigned word;
+   unsigned bit;
+   util_bitmask_word mask;
    
    assert(bm);
    
@@ -208,6 +208,10 @@ util_bitmask_set(struct util_bitmask *bm,
    if(!util_bitmask_resize(bm, index))
       return UTIL_BITMASK_INVALID_INDEX;
 
+   word = index / UTIL_BITMASK_BITS_PER_WORD;
+   bit  = index % UTIL_BITMASK_BITS_PER_WORD;
+   mask = 1 << bit;
+
    bm->words[word] |= mask;
 
    util_bitmask_filled_set(bm, index);
@@ -220,15 +224,19 @@ void
 util_bitmask_clear(struct util_bitmask *bm, 
                    unsigned index)
 {
-   unsigned word = index / UTIL_BITMASK_BITS_PER_WORD;
-   unsigned bit  = index % UTIL_BITMASK_BITS_PER_WORD;
-   util_bitmask_word mask = 1 << bit;
+   unsigned word;
+   unsigned bit;
+   util_bitmask_word mask;
    
    assert(bm);
    
    if(index >= bm->size)
       return;
 
+   word = index / UTIL_BITMASK_BITS_PER_WORD;
+   bit  = index % UTIL_BITMASK_BITS_PER_WORD;
+   mask = 1 << bit;
+
    bm->words[word] &= ~mask;
    
    util_bitmask_filled_unset(bm, index);
@@ -250,7 +258,7 @@ util_bitmask_get(struct util_bitmask *bm,
       return TRUE;
    }
 
-   if(index > bm->size)
+   if(index >= bm->size)
       return FALSE;
 
    if(bm->words[word] & mask) {
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index abe1de3302b..0b263a9db5c 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -36,12 +36,13 @@
 #include "pipe/p_context.h"
 #include "util/u_debug.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
+#include "util/u_inlines.h"
 #include "pipe/p_shader_tokens.h"
 #include "pipe/p_state.h"
 
 #include "util/u_blit.h"
 #include "util/u_draw_quad.h"
+#include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_simple_shaders.h"
@@ -61,6 +62,7 @@ struct blit_state
    struct pipe_rasterizer_state rasterizer;
    struct pipe_sampler_state sampler;
    struct pipe_viewport_state viewport;
+   struct pipe_clip_state clip;
 
    void *vs;
    void *fs[TGSI_WRITEMASK_XYZW + 1];
@@ -91,7 +93,7 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
 
    /* disabled blending/masking */
    memset(&ctx->blend, 0, sizeof(ctx->blend));
-   ctx->blend.colormask = PIPE_MASK_RGBA;
+   ctx->blend.rt[0].colormask = PIPE_MASK_RGBA;
 
    /* no-op depth/stencil/alpha */
    memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil));
@@ -100,7 +102,6 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
    ctx->rasterizer.front_winding = PIPE_WINDING_CW;
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
-   ctx->rasterizer.bypass_vs_clip_and_viewport = 1;
    ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* samplers */
@@ -113,7 +114,6 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->sampler.mag_img_filter = 0; /* set later */
    ctx->sampler.normalized_coords = 1;
 
-
    /* vertex shader - still required to provide the linkage between
     * fragment shader input semantics and vertex_element/buffers.
     */
@@ -126,7 +126,8 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    }
 
    /* fragment shader */
-   ctx->fs[TGSI_WRITEMASK_XYZW] = util_make_fragment_tex_shader(pipe);
+   ctx->fs[TGSI_WRITEMASK_XYZW] =
+      util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D);
    ctx->vbuf = NULL;
 
    /* init vertex data that doesn't change */
@@ -224,8 +225,8 @@ setup_vertex_data_tex(struct blit_state *ctx,
 
    offset = get_next_slot( ctx );
 
-   pipe_buffer_write(ctx->pipe->screen, ctx->vbuf,
-                     offset, sizeof(ctx->vertices), ctx->vertices);
+   pipe_buffer_write_nooverlap(ctx->pipe->screen, ctx->vbuf,
+                               offset, sizeof(ctx->vertices), ctx->vertices);
 
    return offset;
 }
@@ -260,7 +261,10 @@ regions_overlap(int srcX0, int srcY0,
  * Copy pixel block from src surface to dst surface.
  * Overlapping regions are acceptable.
  * Flipping and stretching are supported.
- * XXX what about clipping???
+ * \param filter  one of PIPE_TEX_MIPFILTER_NEAREST/LINEAR
+ * \param writemask  controls which channels in the dest surface are sourced
+ *                   from the src surface.  Disabled channels are sourced
+ *                   from (0,0,0,1).
  * XXX need some control over blitting Z and/or stencil.
  */
 void
@@ -401,14 +405,17 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    cso_save_rasterizer(ctx->cso);
    cso_save_samplers(ctx->cso);
    cso_save_sampler_textures(ctx->cso);
+   cso_save_viewport(ctx->cso);
    cso_save_framebuffer(ctx->cso);
    cso_save_fragment_shader(ctx->cso);
    cso_save_vertex_shader(ctx->cso);
+   cso_save_clip(ctx->cso);
 
    /* set misc state we care about */
    cso_set_blend(ctx->cso, &ctx->blend);
    cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
+   cso_set_clip(ctx->cso, &ctx->clip);
 
    /* sampler */
    ctx->sampler.min_img_filter = filter;
@@ -416,11 +423,24 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    cso_single_sampler(ctx->cso, 0, &ctx->sampler);
    cso_single_sampler_done(ctx->cso);
 
+   /* viewport */
+   ctx->viewport.scale[0] = 0.5f * dst->width;
+   ctx->viewport.scale[1] = 0.5f * dst->height;
+   ctx->viewport.scale[2] = 0.5f;
+   ctx->viewport.scale[3] = 1.0f;
+   ctx->viewport.translate[0] = 0.5f * dst->width;
+   ctx->viewport.translate[1] = 0.5f * dst->height;
+   ctx->viewport.translate[2] = 0.5f;
+   ctx->viewport.translate[3] = 0.0f;
+   cso_set_viewport(ctx->cso, &ctx->viewport);
+
    /* texture */
    cso_set_sampler_textures(ctx->cso, 1, &tex);
 
    if (ctx->fs[writemask] == NULL)
-      ctx->fs[writemask] = util_make_fragment_tex_shader_writemask(pipe, writemask);
+      ctx->fs[writemask] =
+         util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D,
+                                                 writemask);
 
    /* shaders */
    cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]);
@@ -436,8 +456,10 @@ util_blit_pixels_writemask(struct blit_state *ctx,
 
    /* draw quad */
    offset = setup_vertex_data_tex(ctx,
-                                  (float) dstX0, (float) dstY0, 
-                                  (float) dstX1, (float) dstY1,
+                                  (float) dstX0 / dst->width * 2.0f - 1.0f,
+                                  (float) dstY0 / dst->height * 2.0f - 1.0f,
+                                  (float) dstX1 / dst->width * 2.0f - 1.0f,
+                                  (float) dstY1 / dst->height * 2.0f - 1.0f,
                                   s0, t0,
                                   s1, t1,
                                   z);
@@ -453,9 +475,11 @@ util_blit_pixels_writemask(struct blit_state *ctx,
    cso_restore_rasterizer(ctx->cso);
    cso_restore_samplers(ctx->cso);
    cso_restore_sampler_textures(ctx->cso);
+   cso_restore_viewport(ctx->cso);
    cso_restore_framebuffer(ctx->cso);
    cso_restore_fragment_shader(ctx->cso);
    cso_restore_vertex_shader(ctx->cso);
+   cso_restore_clip(ctx->cso);
 
    pipe_texture_reference(&tex, NULL);
 }
@@ -539,11 +563,13 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_save_framebuffer(ctx->cso);
    cso_save_fragment_shader(ctx->cso);
    cso_save_vertex_shader(ctx->cso);
+   cso_save_clip(ctx->cso);
 
    /* set misc state we care about */
    cso_set_blend(ctx->cso, &ctx->blend);
    cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
+   cso_set_clip(ctx->cso, &ctx->clip);
 
    /* sampler */
    ctx->sampler.min_img_filter = filter;
@@ -551,6 +577,17 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_single_sampler(ctx->cso, 0, &ctx->sampler);
    cso_single_sampler_done(ctx->cso);
 
+   /* viewport */
+   ctx->viewport.scale[0] = 0.5f * dst->width;
+   ctx->viewport.scale[1] = 0.5f * dst->height;
+   ctx->viewport.scale[2] = 0.5f;
+   ctx->viewport.scale[3] = 1.0f;
+   ctx->viewport.translate[0] = 0.5f * dst->width;
+   ctx->viewport.translate[1] = 0.5f * dst->height;
+   ctx->viewport.translate[2] = 0.5f;
+   ctx->viewport.translate[3] = 0.0f;
+   cso_set_viewport(ctx->cso, &ctx->viewport);
+
    /* texture */
    cso_set_sampler_textures(ctx->cso, 1, &tex);
 
@@ -568,8 +605,10 @@ util_blit_pixels_tex(struct blit_state *ctx,
 
    /* draw quad */
    offset = setup_vertex_data_tex(ctx,
-                                  (float) dstX0, (float) dstY0,
-                                  (float) dstX1, (float) dstY1,
+                                  (float) dstX0 / dst->width * 2.0f - 1.0f,
+                                  (float) dstY0 / dst->height * 2.0f - 1.0f,
+                                  (float) dstX1 / dst->width * 2.0f - 1.0f,
+                                  (float) dstY1 / dst->height * 2.0f - 1.0f,
                                   s0, t0, s1, t1,
                                   z);
 
@@ -588,4 +627,5 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_restore_framebuffer(ctx->cso);
    cso_restore_fragment_shader(ctx->cso);
    cso_restore_vertex_shader(ctx->cso);
+   cso_restore_clip(ctx->cso);
 }
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
new file mode 100644
index 00000000000..0ba09d33bfc
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -0,0 +1,823 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Marek Olšák <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Blitter utility to facilitate acceleration of the clear, surface_copy,
+ * and surface_fill functions.
+ *
+ * @author Marek Olšák
+ */
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_state.h"
+
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_blitter.h"
+#include "util/u_draw_quad.h"
+#include "util/u_pack_color.h"
+#include "util/u_rect.h"
+#include "util/u_simple_shaders.h"
+#include "util/u_texture.h"
+
+#define INVALID_PTR ((void*)~0)
+
+struct blitter_context_priv
+{
+   struct blitter_context blitter; /**< public part; priv is cast from it */
+
+   struct pipe_context *pipe; /**< pipe context */
+   struct pipe_buffer *vbuf;  /**< vertex buffer the quad is written to */
+
+   float vertices[4][2][4];   /**< {pos, color} or {pos, texcoord} */
+
+   /* Template the per-miplevel sampler states are created from. */
+   struct pipe_sampler_state template_sampler_state;
+
+   /* Constant state objects. */
+   /* Vertex shaders. */
+   void *vs_col; /**< Vertex shader which passes {pos, color} to the output */
+   void *vs_tex; /**< Vertex shader which passes {pos, texcoord} to the output.*/
+
+   /* Fragment shaders (created on-demand, NULL until first use). */
+   /* FS writing one color to N color buffers; index is N-1 (or 0 if N=0). */
+   void *fs_col[PIPE_MAX_COLOR_BUFS];
+
+   /* FS which outputs a color from a texture,
+      where the index is PIPE_TEXTURE_* to be sampled. */
+   void *fs_texfetch_col[PIPE_MAX_TEXTURE_TYPES];
+
+   /* FS which outputs a depth from a texture,
+      where the index is PIPE_TEXTURE_* to be sampled. */
+   void *fs_texfetch_depth[PIPE_MAX_TEXTURE_TYPES];
+
+   /* Blend state. */
+   void *blend_write_color;   /**< blend state with writemask of RGBA */
+   void *blend_keep_color;    /**< blend state with writemask of 0 */
+
+   /* Depth stencil alpha state (all three are created at context creation). */
+   void *dsa_write_depth_stencil;
+   void *dsa_write_depth_keep_stencil;
+   void *dsa_keep_depth_stencil;
+
+   /* Sampler states clamped to one miplevel each; index is the miplevel. */
+   void *sampler_state[PIPE_MAX_TEXTURE_LEVELS];
+
+   /* Rasterizer state. */
+   void *rs_state;
+
+   /* Viewport state, rewritten for every rectangle that is drawn. */
+   struct pipe_viewport_state viewport;
+
+   /* Clip state; never modified after the zeroing allocation. */
+   struct pipe_clip_state clip;
+};
+
+struct blitter_context *util_blitter_create(struct pipe_context *pipe)
+{
+   struct blitter_context_priv *ctx;
+   struct pipe_blend_state blend = { 0 };
+   struct pipe_depth_stencil_alpha_state dsa = { { 0 } };
+   struct pipe_rasterizer_state rs_state = { 0 };
+   struct pipe_sampler_state *sampler_state;
+   unsigned i;
+   /* Builds the blitter's fixed state; returns NULL if allocation fails. */
+   ctx = CALLOC_STRUCT(blitter_context_priv);
+   if (!ctx)
+      return NULL;
+
+   ctx->pipe = pipe;
+
+   /* mark every saved-state slot as "not saved" (INVALID_PTR or ~0) */
+   ctx->blitter.saved_blend_state = INVALID_PTR;
+   ctx->blitter.saved_dsa_state = INVALID_PTR;
+   ctx->blitter.saved_rs_state = INVALID_PTR;
+   ctx->blitter.saved_fs = INVALID_PTR;
+   ctx->blitter.saved_vs = INVALID_PTR;
+   ctx->blitter.saved_fb_state.nr_cbufs = ~0;
+   ctx->blitter.saved_num_textures = ~0;
+   ctx->blitter.saved_num_sampler_states = ~0;
+
+   /* blend state objects: the zeroed template first, then RGBA colormask */
+   ctx->blend_keep_color = pipe->create_blend_state(pipe, &blend);
+
+   blend.rt[0].colormask = PIPE_MASK_RGBA;
+   ctx->blend_write_color = pipe->create_blend_state(pipe, &blend);
+
+   /* depth stencil alpha state objects; dsa is extended step by step */
+   ctx->dsa_keep_depth_stencil =
+      pipe->create_depth_stencil_alpha_state(pipe, &dsa);
+
+   dsa.depth.enabled = 1;
+   dsa.depth.writemask = 1;
+   dsa.depth.func = PIPE_FUNC_ALWAYS;
+   ctx->dsa_write_depth_keep_stencil =
+      pipe->create_depth_stencil_alpha_state(pipe, &dsa);
+
+   dsa.stencil[0].enabled = 1;
+   dsa.stencil[0].func = PIPE_FUNC_ALWAYS;
+   dsa.stencil[0].fail_op = PIPE_STENCIL_OP_REPLACE;
+   dsa.stencil[0].zpass_op = PIPE_STENCIL_OP_REPLACE;
+   dsa.stencil[0].zfail_op = PIPE_STENCIL_OP_REPLACE;
+   dsa.stencil[0].valuemask = 0xff;
+   dsa.stencil[0].writemask = 0xff;
+   ctx->dsa_write_depth_stencil =
+      pipe->create_depth_stencil_alpha_state(pipe, &dsa);
+   /* All three DSA state objects are created above; none of them is
+    * created on-demand. */
+
+   /* sampler state: only the template is filled in here */
+   sampler_state = &ctx->template_sampler_state;
+   sampler_state->wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler_state->wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler_state->wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   /* The sampler state objects which sample from a specified mipmap level
+    * are created on-demand. */
+
+   /* rasterizer state */
+   memset(&rs_state, 0, sizeof(rs_state));
+   rs_state.front_winding = PIPE_WINDING_CW;
+   rs_state.cull_mode = PIPE_WINDING_NONE;
+   rs_state.gl_rasterization_rules = 1;
+   rs_state.flatshade = 1;
+   ctx->rs_state = pipe->create_rasterizer_state(pipe, &rs_state);
+
+   /* fragment shaders are created on-demand */
+
+   /* vertex shaders: two-attribute passthrough, {pos, color} and {pos, tex} */
+   {
+      const uint semantic_names[] = { TGSI_SEMANTIC_POSITION,
+                                      TGSI_SEMANTIC_COLOR };
+      const uint semantic_indices[] = { 0, 0 };
+      ctx->vs_col =
+         util_make_vertex_passthrough_shader(pipe, 2, semantic_names,
+                                             semantic_indices);
+   }
+   {
+      const uint semantic_names[] = { TGSI_SEMANTIC_POSITION,
+                                      TGSI_SEMANTIC_GENERIC };
+      const uint semantic_indices[] = { 0, 0 };
+      ctx->vs_tex =
+         util_make_vertex_passthrough_shader(pipe, 2, semantic_names,
+                                             semantic_indices);
+   }
+
+   /* set invariant vertex coordinates: w of every position stays 1 */
+   for (i = 0; i < 4; i++)
+      ctx->vertices[i][0][3] = 1; /*v.w*/
+
+   /* create the vertex buffer, sized to hold the whole vertices array */
+   ctx->vbuf = pipe_buffer_create(ctx->pipe->screen,
+                                  32, /* presumably alignment -- TODO confirm */
+                                  PIPE_BUFFER_USAGE_VERTEX,
+                                  sizeof(ctx->vertices));
+
+   return &ctx->blitter; /* NOTE(review): vbuf is not checked for NULL */
+}
+
+void util_blitter_destroy(struct blitter_context *blitter)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   int i;
+   /* Deletes every state object and shader the blitter owns, then frees it. */
+   pipe->delete_blend_state(pipe, ctx->blend_write_color);
+   pipe->delete_blend_state(pipe, ctx->blend_keep_color);
+   pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
+   pipe->delete_depth_stencil_alpha_state(pipe,
+                                          ctx->dsa_write_depth_keep_stencil);
+   pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil);
+
+   pipe->delete_rasterizer_state(pipe, ctx->rs_state);
+   pipe->delete_vs_state(pipe, ctx->vs_col);
+   pipe->delete_vs_state(pipe, ctx->vs_tex);
+
+   for (i = 0; i < PIPE_MAX_TEXTURE_TYPES; i++) {
+      if (ctx->fs_texfetch_col[i])
+         pipe->delete_fs_state(pipe, ctx->fs_texfetch_col[i]);
+      if (ctx->fs_texfetch_depth[i])
+         pipe->delete_fs_state(pipe, ctx->fs_texfetch_depth[i]);
+   }
+   /* fs_col[] is filled on demand at arbitrary indices, so scan every slot
+    * instead of stopping at the first empty one (which leaked the rest). */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+      if (ctx->fs_col[i])
+         pipe->delete_fs_state(pipe, ctx->fs_col[i]);
+
+   for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++)
+      if (ctx->sampler_state[i])
+         pipe->delete_sampler_state(pipe, ctx->sampler_state[i]);
+
+   pipe_buffer_reference(&ctx->vbuf, NULL);
+   FREE(ctx);
+}
+
+static void blitter_check_saved_CSOs(struct blitter_context_priv *ctx)
+{
+   /* Each of these must have been saved by the caller (debug builds only). */
+   assert(ctx->blitter.saved_blend_state != INVALID_PTR);
+   assert(ctx->blitter.saved_dsa_state != INVALID_PTR);
+   assert(ctx->blitter.saved_rs_state != INVALID_PTR);
+   assert(ctx->blitter.saved_fs != INVALID_PTR);
+   assert(ctx->blitter.saved_vs != INVALID_PTR);
+}
+
+static void blitter_restore_CSOs(struct blitter_context_priv *ctx)
+{
+   struct pipe_context *pipe = ctx->pipe;
+   struct blitter_context *saved = &ctx->blitter;
+
+   /* State that every blitter operation requires to be saved: rebind it,
+    * then mark the slots as "not saved" again. */
+   pipe->bind_blend_state(pipe, saved->saved_blend_state);
+   pipe->bind_depth_stencil_alpha_state(pipe, saved->saved_dsa_state);
+   pipe->bind_rasterizer_state(pipe, saved->saved_rs_state);
+   pipe->bind_fs_state(pipe, saved->saved_fs);
+   pipe->bind_vs_state(pipe, saved->saved_vs);
+
+   saved->saved_blend_state = INVALID_PTR;
+   saved->saved_dsa_state = INVALID_PTR;
+   saved->saved_rs_state = INVALID_PTR;
+   saved->saved_fs = INVALID_PTR;
+   saved->saved_vs = INVALID_PTR;
+
+   /* Stencil ref, viewport and clip are restored unconditionally. */
+   pipe->set_stencil_ref(pipe, &saved->saved_stencil_ref);
+   pipe->set_viewport_state(pipe, &saved->saved_viewport);
+   pipe->set_clip_state(pipe, &saved->saved_clip);
+
+   /* Framebuffer, samplers and textures are restored only if they were
+    * saved; a count of ~0 means "not saved". */
+   if (saved->saved_fb_state.nr_cbufs != ~0) {
+      pipe->set_framebuffer_state(pipe, &saved->saved_fb_state);
+      saved->saved_fb_state.nr_cbufs = ~0;
+   }
+
+   if (saved->saved_num_sampler_states != ~0) {
+      pipe->bind_fragment_sampler_states(pipe, saved->saved_num_sampler_states,
+                                         saved->saved_sampler_states);
+      saved->saved_num_sampler_states = ~0;
+   }
+
+   if (saved->saved_num_textures != ~0) {
+      pipe->set_fragment_sampler_textures(pipe, saved->saved_num_textures,
+                                          saved->saved_textures);
+      saved->saved_num_textures = ~0;
+   }
+}
+
+static void blitter_set_rectangle(struct blitter_context_priv *ctx,
+                                  unsigned x1, unsigned y1,
+                                  unsigned x2, unsigned y2,
+                                  unsigned width, unsigned height,
+                                  float depth)
+{
+   /* Rectangle edges mapped from pixels to the [-1, 1] range. */
+   const float ndc_x1 = (float)x1 / width * 2.0f - 1.0f;
+   const float ndc_y1 = (float)y1 / height * 2.0f - 1.0f;
+   const float ndc_x2 = (float)x2 / width * 2.0f - 1.0f;
+   const float ndc_y2 = (float)y2 / height * 2.0f - 1.0f;
+   struct pipe_viewport_state *vp = &ctx->viewport;
+   unsigned v;
+
+   /* v0 = (x1, y1), v1 = (x2, y1), v2 = (x2, y2), v3 = (x1, y2); z = depth */
+   ctx->vertices[0][0][0] = ndc_x1;
+   ctx->vertices[0][0][1] = ndc_y1;
+   ctx->vertices[1][0][0] = ndc_x2;
+   ctx->vertices[1][0][1] = ndc_y1;
+   ctx->vertices[2][0][0] = ndc_x2;
+   ctx->vertices[2][0][1] = ndc_y2;
+   ctx->vertices[3][0][0] = ndc_x1;
+   ctx->vertices[3][0][1] = ndc_y2;
+   for (v = 0; v < 4; v++)
+      ctx->vertices[v][0][2] = depth;
+
+   /* Viewport spanning width x height; depth passes through unchanged. */
+   vp->scale[0] = 0.5f * width;
+   vp->scale[1] = 0.5f * height;
+   vp->scale[2] = 1.0f;
+   vp->scale[3] = 1.0f;
+   vp->translate[0] = 0.5f * width;
+   vp->translate[1] = 0.5f * height;
+   vp->translate[2] = 0.0f;
+   vp->translate[3] = 0.0f;
+   ctx->pipe->set_viewport_state(ctx->pipe, vp);
+   ctx->pipe->set_clip_state(ctx->pipe, &ctx->clip);
+}
+
+static void blitter_set_clear_color(struct blitter_context_priv *ctx,
+                                    const float *rgba)
+{
+   int vert, chan;
+
+   /* Attribute slot 1 of all four vertices receives the same RGBA value. */
+   for (vert = 0; vert < 4; vert++) {
+      for (chan = 0; chan < 4; chan++) {
+         ctx->vertices[vert][1][chan] = rgba[chan];
+      }
+   }
+}
+
+static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx,
+                                     struct pipe_surface *surf,
+                                     unsigned x1, unsigned y1,
+                                     unsigned x2, unsigned y2)
+{
+   /* Rectangle edges normalized by the surface size. */
+   const float s_lo = x1 / (float)surf->width;
+   const float t_lo = y1 / (float)surf->height;
+   const float s_hi = x2 / (float)surf->width;
+   const float t_hi = y2 / (float)surf->height;
+   unsigned v;
+
+   /* (s, t) per vertex: (lo,lo), (hi,lo), (hi,hi), (lo,hi). */
+   ctx->vertices[0][1][0] = s_lo;
+   ctx->vertices[0][1][1] = t_lo;
+   ctx->vertices[1][1][0] = s_hi;
+   ctx->vertices[1][1][1] = t_lo;
+   ctx->vertices[2][1][0] = s_hi;
+   ctx->vertices[2][1][1] = t_hi;
+   ctx->vertices[3][1][0] = s_lo;
+   ctx->vertices[3][1][1] = t_hi;
+
+   /* r = 0 and q = 1 for all four vertices. */
+   for (v = 0; v < 4; v++) {
+      ctx->vertices[v][1][2] = 0;
+      ctx->vertices[v][1][3] = 1;
+   }
+}
+
+static void blitter_set_texcoords_3d(struct blitter_context_priv *ctx,
+                                     struct pipe_surface *surf,
+                                     unsigned x1, unsigned y1,
+                                     unsigned x2, unsigned y2)
+{
+   /* Slice index divided by the minified depth of the surface's level. */
+   const float level_depth = u_minify(surf->texture->depth0, surf->level);
+   const float slice_coord = surf->zslice / level_depth;
+   unsigned v;
+   /* Start from the 2D coordinates, then overwrite r with the slice. */
+   blitter_set_texcoords_2d(ctx, surf, x1, y1, x2, y2);
+   for (v = 0; v < 4; v++)
+      ctx->vertices[v][1][2] = slice_coord;
+}
+
+static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx,
+                                       struct pipe_surface *surf,
+                                       unsigned x1, unsigned y1,
+                                       unsigned x2, unsigned y2)
+{
+   /* Rectangle edges normalized by the surface size. */
+   const float s_lo = x1 / (float)surf->width;
+   const float t_lo = y1 / (float)surf->height;
+   const float s_hi = x2 / (float)surf->width;
+   const float t_hi = y2 / (float)surf->height;
+   float face_st[4][2];
+   unsigned v;
+
+   /* One (s, t) pair per corner: (lo,lo), (hi,lo), (hi,hi), (lo,hi). */
+   face_st[0][0] = s_lo;
+   face_st[0][1] = t_lo;
+   face_st[1][0] = s_hi;
+   face_st[1][1] = t_lo;
+   face_st[2][0] = s_hi;
+   face_st[2][1] = t_hi;
+   face_st[3][0] = s_lo;
+   face_st[3][1] = t_hi;
+
+   /* Map onto cube face surf->face; the strides (2 and 8) are in floats. */
+   util_map_texcoords2d_onto_cubemap(surf->face, &face_st[0][0], 2,
+                                     &ctx->vertices[0][1][0], 8);
+   for (v = 0; v < 4; v++)
+      ctx->vertices[v][1][3] = 1; /* q */
+}
+
+static void blitter_draw_quad(struct blitter_context_priv *ctx)
+{
+   const unsigned num_verts = 4;
+   const unsigned attribs_per_vert = 2;
+   struct pipe_context *pipe = ctx->pipe;
+
+   /* Upload the CPU-side vertex array, then draw it as a triangle fan. */
+   pipe_buffer_write(pipe->screen, ctx->vbuf, 0,
+                     sizeof(ctx->vertices), ctx->vertices);
+   util_draw_vertex_buffer(pipe, ctx->vbuf, 0, PIPE_PRIM_TRIANGLE_FAN,
+                           num_verts, attribs_per_vert);
+}
+
+static INLINE
+void **blitter_get_sampler_state(struct blitter_context_priv *ctx,
+                                 int miplevel)
+{
+   void **slot;
+
+   assert(miplevel < PIPE_MAX_TEXTURE_LEVELS);
+   slot = &ctx->sampler_state[miplevel];
+
+   /* First request for this level: derive the state from the template with
+    * lod_bias, min_lod and max_lod all set to the level. */
+   if (*slot == NULL) {
+      struct pipe_sampler_state *templ = &ctx->template_sampler_state;
+
+      templ->lod_bias = miplevel;
+      templ->min_lod = miplevel;
+      templ->max_lod = miplevel;
+      *slot = ctx->pipe->create_sampler_state(ctx->pipe, templ);
+   }
+
+   /* A void** can be handed straight to bind_fragment_sampler_states. */
+   return slot;
+}
+
+static INLINE
+void *blitter_get_fs_col(struct blitter_context_priv *ctx, unsigned num_cbufs)
+{
+   /* Slot is num_cbufs - 1; note that 0 and 1 buffers both map to slot 0. */
+   const unsigned slot = (num_cbufs > 0) ? num_cbufs - 1 : 0;
+   void **shader;
+
+   assert(num_cbufs <= PIPE_MAX_COLOR_BUFS);
+   shader = &ctx->fs_col[slot];
+   /* Built on first use and cached in the slot afterwards. */
+   if (*shader == NULL)
+      *shader = util_make_fragment_clonecolor_shader(ctx->pipe, num_cbufs);
+   return *shader;
+}
+
+static INLINE
+void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
+                                  unsigned tex_target)
+{
+   unsigned tgsi_target;
+   void **shader;
+
+   assert(tex_target < PIPE_MAX_TEXTURE_TYPES);
+   shader = &ctx->fs_texfetch_col[tex_target];
+   /* Already built for this texture target. */
+   if (*shader)
+      return *shader;
+
+   /* Pick the TGSI_TEXTURE_* target matching the PIPE_TEXTURE_* one. */
+   switch (tex_target) {
+   case PIPE_TEXTURE_1D:
+      tgsi_target = TGSI_TEXTURE_1D;
+      break;
+   case PIPE_TEXTURE_2D:
+      tgsi_target = TGSI_TEXTURE_2D;
+      break;
+   case PIPE_TEXTURE_3D:
+      tgsi_target = TGSI_TEXTURE_3D;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      tgsi_target = TGSI_TEXTURE_CUBE;
+      break;
+   default:
+      return NULL; /* no shader for other targets; the slot stays empty */
+   }
+   *shader = util_make_fragment_tex_shader(ctx->pipe, tgsi_target);
+   return *shader;
+}
+
+static INLINE
+void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx,
+                                    unsigned tex_target)
+{
+   unsigned tgsi_target;
+   void **shader;
+
+   assert(tex_target < PIPE_MAX_TEXTURE_TYPES);
+   shader = &ctx->fs_texfetch_depth[tex_target];
+   /* Already built for this texture target. */
+   if (*shader)
+      return *shader;
+
+   /* Pick the TGSI_TEXTURE_* target matching the PIPE_TEXTURE_* one. */
+   switch (tex_target) {
+   case PIPE_TEXTURE_1D:
+      tgsi_target = TGSI_TEXTURE_1D;
+      break;
+   case PIPE_TEXTURE_2D:
+      tgsi_target = TGSI_TEXTURE_2D;
+      break;
+   case PIPE_TEXTURE_3D:
+      tgsi_target = TGSI_TEXTURE_3D;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      tgsi_target = TGSI_TEXTURE_CUBE;
+      break;
+   default:
+      return NULL; /* no shader for other targets; the slot stays empty */
+   }
+   *shader = util_make_fragment_tex_shader_writedepth(ctx->pipe, tgsi_target);
+   return *shader;
+}
+
+void util_blitter_clear(struct blitter_context *blitter,
+                        unsigned width, unsigned height,
+                        unsigned num_cbufs,
+                        unsigned clear_buffers,
+                        const float *rgba,
+                        double depth, unsigned stencil)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_stencil_ref sr = { { 0 } };
+   void *blend_cso, *dsa_cso;
+
+   assert(num_cbufs <= PIPE_MAX_COLOR_BUFS);
+   blitter_check_saved_CSOs(ctx);
+
+   /* Colour writes are enabled only when a colour clear was requested. */
+   blend_cso = (clear_buffers & PIPE_CLEAR_COLOR) ? ctx->blend_write_color
+                                                  : ctx->blend_keep_color;
+   pipe->bind_blend_state(pipe, blend_cso);
+
+   /* Likewise for depth/stencil; the stencil value goes in via the ref. */
+   dsa_cso = (clear_buffers & PIPE_CLEAR_DEPTHSTENCIL)
+                ? ctx->dsa_write_depth_stencil : ctx->dsa_keep_depth_stencil;
+   pipe->bind_depth_stencil_alpha_state(pipe, dsa_cso);
+   if (clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) {
+      sr.ref_value[0] = stencil & 0xff;
+      pipe->set_stencil_ref(pipe, &sr);
+   }
+
+   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
+   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, num_cbufs));
+   pipe->bind_vs_state(pipe, ctx->vs_col);
+
+   blitter_set_clear_color(ctx, rgba);
+   blitter_set_rectangle(ctx, 0, 0, width, height, width, height, depth);
+   blitter_draw_quad(ctx);
+   blitter_restore_CSOs(ctx);
+}
+
+static boolean
+is_overlap(unsigned sx1, unsigned sx2, unsigned sy1, unsigned sy2,
+           unsigned dx1, unsigned dx2, unsigned dy1, unsigned dy2)
+{
+    /* Two rectangles intersect only if their extents overlap on both axes;
+     * rectangles that merely share an edge do not count. */
+    const boolean x_overlap = sx1 < dx2 && sx2 > dx1;
+    const boolean y_overlap = sy1 < dy2 && sy2 > dy1;
+    return (x_overlap && y_overlap) ? TRUE : FALSE;
+}
+
+static void util_blitter_do_copy(struct blitter_context *blitter,
+                                 struct pipe_surface *dst,
+                                 unsigned dstx, unsigned dsty,
+                                 struct pipe_surface *src,
+                                 unsigned srcx, unsigned srcy,
+                                 unsigned width, unsigned height,
+                                 boolean is_depth)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_framebuffer_state fb_state = { 0 }; /* unset slots stay NULL */
+
+   /* Copies a width x height rectangle from src to dst by drawing a quad
+    * textured with src. The saved state is NOT restored here; that is left
+    * to the caller. is_depth selects the depth-writing path. */
+   assert(blitter->saved_fb_state.nr_cbufs != ~0);
+   assert(blitter->saved_num_textures != ~0);
+   assert(blitter->saved_num_sampler_states != ~0);
+   assert(src->texture->target < PIPE_MAX_TEXTURE_TYPES);
+
+   /* bind CSOs */
+   fb_state.width = dst->width;
+   fb_state.height = dst->height;
+
+   if (is_depth) {
+      pipe->bind_blend_state(pipe, ctx->blend_keep_color);
+      pipe->bind_depth_stencil_alpha_state(pipe,
+                                           ctx->dsa_write_depth_keep_stencil);
+      pipe->bind_fs_state(pipe,
+         blitter_get_fs_texfetch_depth(ctx, src->texture->target));
+      fb_state.nr_cbufs = 0;
+      fb_state.zsbuf = dst;
+   } else {
+      pipe->bind_blend_state(pipe, ctx->blend_write_color);
+      pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
+      pipe->bind_fs_state(pipe,
+         blitter_get_fs_texfetch_col(ctx, src->texture->target));
+      fb_state.nr_cbufs = 1;
+      fb_state.cbufs[0] = dst;
+   }
+
+   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
+   pipe->bind_vs_state(pipe, ctx->vs_tex);
+   pipe->bind_fragment_sampler_states(pipe, 1,
+      blitter_get_sampler_state(ctx, src->level));
+   pipe->set_fragment_sampler_textures(pipe, 1, &src->texture);
+   pipe->set_framebuffer_state(pipe, &fb_state);
+
+   /* set texture coordinates */
+   switch (src->texture->target) {
+      case PIPE_TEXTURE_1D:
+      case PIPE_TEXTURE_2D:
+         blitter_set_texcoords_2d(ctx, src, srcx, srcy,
+                                  srcx+width, srcy+height);
+         break;
+      case PIPE_TEXTURE_3D:
+         blitter_set_texcoords_3d(ctx, src, srcx, srcy,
+                                  srcx+width, srcy+height);
+         break;
+      case PIPE_TEXTURE_CUBE:
+         blitter_set_texcoords_cube(ctx, src, srcx, srcy,
+                                    srcx+width, srcy+height);
+         break;
+      default:
+         assert(0);
+   }
+
+   blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height,
+                         dst->width, dst->height, 0);
+   blitter_draw_quad(ctx);
+}
+
+static void util_blitter_overlap_copy(struct blitter_context *blitter,
+                                      struct pipe_surface *dst,
+                                      unsigned dstx, unsigned dsty,
+                                      struct pipe_surface *src,
+                                      unsigned srcx, unsigned srcy,
+                                      unsigned width, unsigned height)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_texture texTemp;
+   struct pipe_texture *texture;
+   struct pipe_surface *tex_surf;
+
+   /* Copy between overlapping rectangles of one texture by bouncing the
+    * pixels through a temporary 2D texture of the destination's format.
+    * Always copies as colour (is_depth = FALSE). */
+   blitter_check_saved_CSOs(ctx);
+
+   memset(&texTemp, 0, sizeof(texTemp));
+   texTemp.target = PIPE_TEXTURE_2D;
+   texTemp.format = dst->texture->format; /* XXX verify supported by driver! */
+   texTemp.last_level = 0;
+   texTemp.width0 = width;
+   texTemp.height0 = height;
+   texTemp.depth0 = 1;
+
+   /* If either allocation fails the copy is skipped, but the saved state is
+    * still restored so every exit path leaves the blitter the same way. */
+   texture = screen->texture_create(screen, &texTemp);
+   if (texture) {
+      tex_surf = screen->get_tex_surface(screen, texture, 0, 0, 0,
+                                         PIPE_BUFFER_USAGE_GPU_READ |
+                                         PIPE_BUFFER_USAGE_GPU_WRITE);
+      if (tex_surf) {
+         /* src -> temp, then temp -> dst */
+         util_blitter_do_copy(blitter, tex_surf, 0, 0, src, srcx, srcy,
+                              width, height, FALSE);
+         util_blitter_do_copy(blitter, dst, dstx, dsty, tex_surf, 0, 0,
+                              width, height, FALSE);
+         pipe_surface_reference(&tex_surf, NULL);
+      }
+      pipe_texture_reference(&texture, NULL);
+   }
+
+   blitter_restore_CSOs(ctx);
+}
+
+void util_blitter_copy(struct blitter_context *blitter,
+                       struct pipe_surface *dst,
+                       unsigned dstx, unsigned dsty,
+                       struct pipe_surface *src,
+                       unsigned srcx, unsigned srcy,
+                       unsigned width, unsigned height,
+                       boolean ignore_stencil)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   boolean has_depth, has_stencil;
+   unsigned dst_usage;
+
+   /* Nothing can be done without both textures. */
+   assert(dst->texture && src->texture);
+   if (!dst->texture || !src->texture)
+      return;
+
+   /* Overlapping rectangles of one texture go through a temporary copy.
+    * NOTE(review): runs before the format checks and copies as colour. */
+   if (dst->texture == src->texture &&
+       is_overlap(srcx, srcx + width, srcy, srcy + height,
+                  dstx, dstx + width, dsty, dsty + height)) {
+      util_blitter_overlap_copy(blitter, dst, dstx, dsty, src, srcx, srcy,
+                                width, height);
+      return;
+   }
+
+   has_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
+   has_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0;
+   if (has_depth || has_stencil)
+      dst_usage = PIPE_TEXTURE_USAGE_DEPTH_STENCIL;
+   else
+      dst_usage = PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+   /* Fall back to util_surface_copy for stencil or unsupported formats. */
+   if ((!ignore_stencil && has_stencil) ||
+       !screen->is_format_supported(screen, dst->format, dst->texture->target,
+                                    dst_usage, 0) ||
+       !screen->is_format_supported(screen, src->format, src->texture->target,
+                                    PIPE_TEXTURE_USAGE_SAMPLER, 0)) {
+      util_surface_copy(pipe, FALSE, dst, dstx, dsty, src, srcx, srcy,
+                        width, height);
+      return;
+   }
+
+   blitter_check_saved_CSOs(ctx);
+   util_blitter_do_copy(blitter, dst, dstx, dsty,
+                        src, srcx, srcy,
+                        width, height, has_depth);
+   blitter_restore_CSOs(ctx);
+}
+
+void util_blitter_fill(struct blitter_context *blitter,
+                       struct pipe_surface *dst,
+                       unsigned dstx, unsigned dsty,
+                       unsigned width, unsigned height,
+                       unsigned value)
+{
+   struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
+   struct pipe_context *pipe = ctx->pipe;
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_framebuffer_state fb_state = { 0 }; /* unset slots stay NULL */
+   float rgba[4];
+   ubyte ub_rgba[4] = {0};
+   union util_color color;
+   int i;
+   /* Fills a rectangle of dst with the packed color "value" using a quad. */
+   assert(dst->texture);
+   if (!dst->texture)
+      return;
+
+   /* check if we can render to the surface */
+   if (util_format_is_depth_or_stencil(dst->format) || /* unlikely, but you never know */
+       !screen->is_format_supported(screen, dst->format, dst->texture->target,
+                                    PIPE_TEXTURE_USAGE_RENDER_TARGET, 0)) {
+      util_surface_fill(pipe, dst, dstx, dsty, width, height, value);
+      return;
+   }
+
+   /* unpack the color */
+   color.ui = value;
+   util_unpack_color_ub(dst->format, &color,
+                        ub_rgba, ub_rgba+1, ub_rgba+2, ub_rgba+3);
+   for (i = 0; i < 4; i++)
+      rgba[i] = ubyte_to_float(ub_rgba[i]);
+
+   /* check the saved state */
+   blitter_check_saved_CSOs(ctx);
+   assert(blitter->saved_fb_state.nr_cbufs != ~0);
+
+   /* bind CSOs */
+   pipe->bind_blend_state(pipe, ctx->blend_write_color);
+   pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
+   pipe->bind_rasterizer_state(pipe, ctx->rs_state);
+   pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 1));
+   pipe->bind_vs_state(pipe, ctx->vs_col);
+
+   /* set a framebuffer state: dst as the only colorbuffer, no zsbuf */
+   fb_state.width = dst->width;
+   fb_state.height = dst->height;
+   fb_state.nr_cbufs = 1;
+   fb_state.cbufs[0] = dst;
+   pipe->set_framebuffer_state(pipe, &fb_state);
+   /* the quad must start at (dstx, dsty), like the fallback path above */
+   blitter_set_clear_color(ctx, rgba);
+   blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height,
+                         dst->width, dst->height, 0);
+   blitter_draw_quad(ctx);
+   blitter_restore_CSOs(ctx);
+}
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
new file mode 100644
index 00000000000..92008fce992
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -0,0 +1,254 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Marek Olšák <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_BLITTER_H
+#define U_BLITTER_H
+
+#include "util/u_memory.h"
+
+#include "pipe/p_state.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct pipe_context;
+
+struct blitter_context
+{
+   /* Private members, really: written via the util_blitter_save_* helpers. */
+   void *saved_blend_state;   /**< blend state */
+   void *saved_dsa_state;     /**< depth stencil alpha state */
+   void *saved_rs_state;      /**< rasterizer state */
+   void *saved_fs, *saved_vs; /**< fragment shader, vertex shader */
+
+   struct pipe_framebuffer_state saved_fb_state;  /**< framebuffer state */
+   struct pipe_stencil_ref saved_stencil_ref;     /**< stencil ref */
+   struct pipe_viewport_state saved_viewport;     /**< viewport state */
+   struct pipe_clip_state saved_clip;             /**< clip state */
+
+   int saved_num_sampler_states;    /**< valid entries in saved_sampler_states */
+   void *saved_sampler_states[32];  /**< fragment sampler state objects */
+
+   int saved_num_textures;          /**< valid entries in saved_textures */
+   struct pipe_texture *saved_textures[32]; /* is 32 enough? */
+};
+
+/**
+ * Create a blitter context.
+ */
+struct blitter_context *util_blitter_create(struct pipe_context *pipe);
+
+/**
+ * Destroy a blitter context.
+ */
+void util_blitter_destroy(struct blitter_context *blitter);
+
+/*
+ * These CSOs must be saved before any of the following functions is called:
+ * - blend state
+ * - depth stencil alpha state
+ * - rasterizer state
+ * - vertex shader
+ * - fragment shader
+ */
+
+/**
+ * Clear a specified set of currently bound buffers to specified values.
+ */
+void util_blitter_clear(struct blitter_context *blitter,
+                        unsigned width, unsigned height,
+                        unsigned num_cbufs,
+                        unsigned clear_buffers,
+                        const float *rgba,
+                        double depth, unsigned stencil);
+
+/**
+ * Copy a block of pixels from one surface to another.
+ *
+ * You can copy from any color format to any other color format provided
+ * the former can be sampled and the latter can be rendered to. Otherwise,
+ * a software fallback path is taken and both surfaces must be of the same
+ * format.
+ *
+ * The same holds for depth-stencil formats with the exception that stencil
+ * cannot be copied unless you set ignore_stencil to FALSE. In that case,
+ * a software fallback path is taken and both surfaces must be of the same
+ * format.
+ *
+ * Use pipe_screen->is_format_supported to know your options.
+ *
+ * These states must be saved in the blitter in addition to the state objects
+ * already required to be saved:
+ * - framebuffer state
+ * - fragment sampler states
+ * - fragment sampler textures
+ */
+void util_blitter_copy(struct blitter_context *blitter,
+                       struct pipe_surface *dst,
+                       unsigned dstx, unsigned dsty,
+                       struct pipe_surface *src,
+                       unsigned srcx, unsigned srcy,
+                       unsigned width, unsigned height,
+                       boolean ignore_stencil);
+
+/**
+ * Fill a region of a surface with a constant value.
+ *
+ * If the surface cannot be rendered to or it's a depth-stencil format,
+ * a software fallback path is taken.
+ *
+ * These states must be saved in the blitter in addition to the state objects
+ * already required to be saved:
+ * - framebuffer state
+ */
+void util_blitter_fill(struct blitter_context *blitter,
+                       struct pipe_surface *dst,
+                       unsigned dstx, unsigned dsty,
+                       unsigned width, unsigned height,
+                       unsigned value);
+
+/**
+ * Copy all pixels from one surface to another.
+ *
+ * Wraps util_blitter_copy (same rules) over the whole of src, from (0,0) to
+ * (0,0); the assert below requires both surfaces to have the same size.
+ */
+static INLINE
+void util_blitter_copy_surface(struct blitter_context *blitter,
+                               struct pipe_surface *dst,
+                               struct pipe_surface *src,
+                               boolean ignore_stencil)
+{
+   assert(dst->width == src->width && dst->height == src->height);
+
+   util_blitter_copy(blitter, dst, 0, 0, src, 0, 0, src->width, src->height,
+                     ignore_stencil);
+}
+
+
+/* The functions below should be used to save currently bound constant state
+ * objects inside a driver. The objects are automatically restored at the end
+ * of the util_blitter_{clear, fill, copy, copy_surface} functions and then
+ * forgotten.
+ *
+ * CSOs not listed here are not affected by util_blitter. */
+
+static INLINE
+void util_blitter_save_blend(struct blitter_context *blitter,
+                             void *state)
+{
+   blitter->saved_blend_state = state; /* stores the pointer only */
+}
+
+static INLINE
+void util_blitter_save_depth_stencil_alpha(struct blitter_context *blitter,
+                                           void *state)
+{
+   blitter->saved_dsa_state = state; /* stores the pointer only */
+}
+
+static INLINE
+void util_blitter_save_stencil_ref(struct blitter_context *blitter,
+                                   const struct pipe_stencil_ref *state)
+{
+   blitter->saved_stencil_ref = *state; /* by-value struct copy */
+}
+
+static INLINE
+void util_blitter_save_rasterizer(struct blitter_context *blitter,
+                                  void *state)
+{
+   blitter->saved_rs_state = state; /* stores the pointer only */
+}
+
+static INLINE
+void util_blitter_save_fragment_shader(struct blitter_context *blitter,
+                                       void *fs)
+{
+   blitter->saved_fs = fs; /* stores the pointer only */
+}
+
+static INLINE
+void util_blitter_save_vertex_shader(struct blitter_context *blitter,
+                                     void *vs)
+{
+   blitter->saved_vs = vs; /* stores the pointer only */
+}
+
+static INLINE
+void util_blitter_save_framebuffer(struct blitter_context *blitter,
+                                   const struct pipe_framebuffer_state *state)
+{
+   blitter->saved_fb_state = *state; /* by-value copy; *state is only read */
+}
+
+static INLINE
+void util_blitter_save_viewport(struct blitter_context *blitter,
+                                const struct pipe_viewport_state *state)
+{
+   blitter->saved_viewport = *state; /* by-value copy; *state is only read */
+}
+
+static INLINE
+void util_blitter_save_clip(struct blitter_context *blitter,
+                            const struct pipe_clip_state *state)
+{
+   blitter->saved_clip = *state; /* by-value copy; *state is only read */
+}
+
+static INLINE
+void util_blitter_save_fragment_sampler_states(
+                  struct blitter_context *blitter,
+                  int num_sampler_states,
+                  void **sampler_states)
+{
+   assert(num_sampler_states <= Elements(blitter->saved_sampler_states));
+   /* NOTE(review): no bound other than the assert above guards the memcpy. */
+   blitter->saved_num_sampler_states = num_sampler_states;
+   memcpy(blitter->saved_sampler_states, sampler_states,
+          num_sampler_states * sizeof(void *));
+}
+
+static INLINE
+void util_blitter_save_fragment_sampler_textures(
+                  struct blitter_context *blitter,
+                  int num_textures,
+                  struct pipe_texture **textures)
+{
+   assert(num_textures <= Elements(blitter->saved_textures));
+   /* NOTE(review): no bound other than the assert above guards the memcpy. */
+   blitter->saved_num_textures = num_textures;
+   memcpy(blitter->saved_textures, textures,
+          num_textures * sizeof(struct pipe_texture *));
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_clear.h b/src/gallium/auxiliary/util/u_clear.h
index 1e65a035aed..2c32db61756 100644
--- a/src/gallium/auxiliary/util/u_clear.h
+++ b/src/gallium/auxiliary/util/u_clear.h
@@ -46,13 +46,13 @@ util_clear(struct pipe_context *pipe,
 {
    if (buffers & PIPE_CLEAR_COLOR) {
       struct pipe_surface *ps = framebuffer->cbufs[0];
-      unsigned color;
+      union util_color uc;
 
-      util_pack_color(rgba, ps->format, &color);
+      util_pack_color(rgba, ps->format, &uc);
       if (pipe->surface_fill) {
-         pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color);
+         pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, uc.ui);
       } else {
-         util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height, color);
+         util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height, uc.ui);
       }
    }
 
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 40633574b08..94be682c4b1 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -29,123 +29,30 @@
 
 #include "pipe/p_config.h" 
 
-#include <stdarg.h>
-
-
-#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
-
-#include <windows.h>
-#include <winddi.h>
-
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
-
-#include <stdio.h> 
-#include <stdlib.h> 
-#include <windows.h> 
-#include <types.h> 
-
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
-
-#ifndef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN      // Exclude rarely-used stuff from Windows headers
-#endif
-#include <windows.h>
-#include <stdio.h>
-
-#else
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#endif
-
-#include "pipe/p_compiler.h" 
+#include "pipe/p_compiler.h"
+#include "os/os_stream.h"
 #include "util/u_debug.h" 
 #include "pipe/p_format.h" 
 #include "pipe/p_state.h" 
-#include "pipe/p_inlines.h" 
+#include "util/u_inlines.h" 
+#include "util/u_format.h"
 #include "util/u_memory.h" 
 #include "util/u_string.h" 
-#include "util/u_stream.h" 
 #include "util/u_math.h" 
 #include "util/u_tile.h" 
-
-
-#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
-static INLINE void 
-_EngDebugPrint(const char *format, ...)
-{
-   va_list ap;
-   va_start(ap, format);
-   EngDebugPrint("", (PCHAR)format, ap);
-   va_end(ap);
-}
-#endif
+#include "util/u_prim.h" 
 
 
 void _debug_vprintf(const char *format, va_list ap)
 {
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-   /* EngDebugPrint does not handle float point arguments, so we need to use
-    * our own vsnprintf implementation. It is also very slow, so buffer until
-    * we find a newline. */
-   static char buf[512] = {'\0'};
-   size_t len = strlen(buf);
-   int ret = util_vsnprintf(buf + len, sizeof(buf) - len, format, ap);
-   if(ret > (int)(sizeof(buf) - len - 1) || util_strchr(buf + len, '\n')) {
-      _EngDebugPrint("%s", buf);
-      buf[0] = '\0';
-   }
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
-   /* OutputDebugStringA can be very slow, so buffer until we find a newline. */
+   /* Buffer output until a newline is seen or the buffer would overflow. */
    static char buf[4096] = {'\0'};
    size_t len = strlen(buf);
    int ret = util_vsnprintf(buf + len, sizeof(buf) - len, format, ap);
    if(ret > (int)(sizeof(buf) - len - 1) || util_strchr(buf + len, '\n')) {
-      OutputDebugStringA(buf);
+      os_log_message(buf);
       buf[0] = '\0';
    }
-   
-   if(GetConsoleWindow() && !IsDebuggerPresent()) {
-      fflush(stdout);
-      vfprintf(stderr, format, ap);
-      fflush(stderr);
-   }
-   
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE)
-   wchar_t *wide_format;
-   long wide_str_len;   
-   char buf[512];   
-   int ret;   
-#if (_WIN32_WCE < 600)
-   ret = vsprintf(buf, format, ap);   
-   if(ret < 0){   
-       sprintf(buf, "Cant handle debug print!");   
-       ret = 25;
-   }
-#else
-   ret = vsprintf_s(buf, 512, format, ap);   
-   if(ret < 0){   
-       sprintf_s(buf, 512, "Cant handle debug print!");   
-       ret = 25;
-   }
-#endif
-   buf[ret] = '\0';   
-   /* Format is ascii - needs to be converted to wchar_t for printing */   
-   wide_str_len = MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, NULL, 0);   
-   wide_format = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t));   
-   if (wide_format) {   
-      MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1,   
-            wide_format, wide_str_len);   
-      NKDbgPrintfW(wide_format, wide_format);   
-      free(wide_format);   
-   } 
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-   /* TODO */
-#else /* !PIPE_SUBSYSTEM_WINDOWS */
-   fflush(stdout);
-   vfprintf(stderr, format, ap);
-#endif
 }
 
 
@@ -167,108 +74,12 @@ void debug_print_blob( const char *name,
 #endif
 
 
-#ifndef debug_break
-void debug_break(void) 
-{
-#if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
-   DebugBreak();
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-   EngDebugBreak();
-#else
-   abort();
-#endif
-}
-#endif
-
-
-#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
-static const char *
-find(const char *start, const char *end, char c) 
-{
-   const char *p;
-   for(p = start; !end || p != end; ++p) {
-      if(*p == c)
-	 return p;
-      if(*p < 32)
-	 break;
-   }
-   return NULL;
-}
-
-static int 
-compare(const char *start, const char *end, const char *s)
-{
-   const char *p, *q;
-   for(p = start, q = s; p != end && *q != '\0'; ++p, ++q) {
-      if(*p != *q)
-	 return 0;
-   }
-   return p == end && *q == '\0';
-}
-
-static void 
-copy(char *dst, const char *start, const char *end, size_t n) 
-{
-   const char *p;
-   char *q;
-   for(p = start, q = dst, n = n - 1; p != end && n; ++p, ++q, --n)
-      *q = *p;
-   *q = '\0';
-}
-#endif
-
-
-static INLINE const char *
-_debug_get_option(const char *name)
-{
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-   /* EngMapFile creates the file if it does not exists, so it must either be
-    * disabled on release versions (or put in a less conspicuous place). */
-#ifdef DEBUG
-   const char *result = NULL;
-   ULONG_PTR iFile = 0;
-   const void *pMap = NULL;
-   const char *sol, *eol, *sep;
-   static char output[1024];
-   
-   pMap = EngMapFile(L"\\??\\c:\\gallium.cfg", 0, &iFile);
-   if(pMap) {
-      sol = (const char *)pMap;
-      while(1) {
-	 /* TODO: handle LF line endings */
-	 eol = find(sol, NULL, '\r');
-	 if(!eol || eol == sol)
-	    break;
-	 sep = find(sol, eol, '=');
-	 if(!sep)
-	    break;
-	 if(compare(sol, sep, name)) {
-	    copy(output, sep + 1, eol, sizeof(output));
-	    result = output;
-	    break;
-	 }
-	 sol = eol + 2;
-      }
-      EngUnmapFile(iFile);
-   }
-   return result;
-#else
-   return NULL;
-#endif
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) 
-   /* TODO: implement */
-   return NULL;
-#else
-   return getenv(name);
-#endif
-}
-
 const char *
 debug_get_option(const char *name, const char *dfault)
 {
    const char *result;
 
-   result = _debug_get_option(name);
+   result = os_get_option(name);
    if(!result)
       result = dfault;
       
@@ -280,7 +91,7 @@ debug_get_option(const char *name, const char *dfault)
 boolean
 debug_get_bool_option(const char *name, boolean dfault)
 {
-   const char *str = _debug_get_option(name);
+   const char *str = os_get_option(name);
    boolean result;
    
    if(str == NULL)
@@ -310,7 +121,7 @@ debug_get_num_option(const char *name, long dfault)
    long result;
    const char *str;
    
-   str = _debug_get_option(name);
+   str = os_get_option(name);
    if(!str)
       result = dfault;
    else {
@@ -346,7 +157,7 @@ debug_get_flags_option(const char *name,
    unsigned long result;
    const char *str;
    
-   str = _debug_get_option(name);
+   str = os_get_option(name);
    if(!str)
       result = dfault;
    else if (!util_strcmp(str, "help")) {
@@ -387,7 +198,7 @@ void _debug_assert_fail(const char *expr,
 #else
    if (debug_get_bool_option("GALLIUM_ABORT_ON_ASSERT", TRUE))
 #endif
-      debug_break();
+      os_abort();
    else
       _debug_printf("continuing...\n");
 }
@@ -452,7 +263,8 @@ debug_dump_flags(const struct debug_named_value *names,
 	    util_strncat(output, "|", sizeof(output));
 	 else
 	    first = 0;
-	 util_strncat(output, names->name, sizeof(output));
+	 util_strncat(output, names->name, sizeof(output) - 1);
+	 output[sizeof(output) - 1] = '\0';
 	 value &= ~names->value;
       }
       ++names;
@@ -465,7 +277,8 @@ debug_dump_flags(const struct debug_named_value *names,
 	 first = 0;
       
       util_snprintf(rest, sizeof(rest), "0x%08lx", value);
-      util_strncat(output, rest, sizeof(output));
+      util_strncat(output, rest, sizeof(output) - 1);
+      output[sizeof(output) - 1] = '\0';
    }
    
    if(first)
@@ -475,132 +288,49 @@ debug_dump_flags(const struct debug_named_value *names,
 }
 
 
-static const struct debug_named_value pipe_format_names[] = {
-#ifdef DEBUG
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_NONE),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_A8R8G8B8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_X8R8G8B8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_B8G8R8A8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_B8G8R8X8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_A1R5G5B5_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_A4R4G4B4_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R5G6B5_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_A2B10G10R10_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_L8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_A8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_I8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_A8L8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_L16_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_YCBCR),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_YCBCR_REV),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_Z16_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_Z32_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_Z32_FLOAT),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_S8Z24_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_Z24S8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_X8Z24_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_Z24X8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_S8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R64_FLOAT),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R64G64_FLOAT),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R64G64B64_FLOAT),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R64G64B64A64_FLOAT),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_FLOAT),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_FLOAT),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_FLOAT),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_FLOAT),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16A16_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16A16_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16A16_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16A16_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_UNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_USCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_B6G5R5_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_A8B8G8R8_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_X8B8G8R8_SNORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_SSCALED),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_L8_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_A8L8_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_A8R8G8B8_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_X8R8G8B8_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_B8G8R8A8_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_B8G8R8X8_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_X8UB8UG8SR8S_NORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_B6UG5SR5S_NORM),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT1_RGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT1_RGBA),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT3_RGBA),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT5_RGBA),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT1_SRGB),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT1_SRGBA),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT3_SRGBA),
-   DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT5_SRGBA),
-#endif
-   DEBUG_NAMED_VALUE_END
-};
-
 #ifdef DEBUG
 void debug_print_format(const char *msg, unsigned fmt )
 {
-   debug_printf("%s: %s\n", msg, debug_dump_enum(pipe_format_names, fmt)); 
+   debug_printf("%s: %s\n", msg, util_format_name(fmt));
 }
 #endif
 
-const char *pf_name( enum pipe_format format )
+
+
+static const struct debug_named_value pipe_prim_names[] = {
+#ifdef DEBUG
+   DEBUG_NAMED_VALUE(PIPE_PRIM_POINTS),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_LINES),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_LINE_LOOP),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_LINE_STRIP),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_TRIANGLES),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_TRIANGLE_STRIP),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_TRIANGLE_FAN),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_QUADS),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_QUAD_STRIP),
+   DEBUG_NAMED_VALUE(PIPE_PRIM_POLYGON),
+#endif
+   DEBUG_NAMED_VALUE_END
+};
+
+
+const char *u_prim_name( unsigned prim )
 {
-   return debug_dump_enum(pipe_format_names, format);
+   return debug_dump_enum(pipe_prim_names, prim);
 }
 
 
+
+
 #ifdef DEBUG
+/**
+ * Dump an image to a .raw or .ppm file (depends on OS).
+ * \param format  PIPE_FORMAT_x
+ * \param cpp  bytes per pixel
+ * \param width  width in pixels
+ * \param height height in pixels
+ * \param stride  row stride in bytes
+ */
 void debug_dump_image(const char *prefix,
                       unsigned format, unsigned cpp,
                       unsigned width, unsigned height,
@@ -642,6 +372,52 @@ void debug_dump_image(const char *prefix,
    }
       
    EngUnmapFile(iFile);
+#elif defined(PIPE_OS_UNIX)
+   /* write a ppm file: text header followed by raw RGB bytes */
+   char filename[256];
+   FILE *f;
+
+   util_snprintf(filename, sizeof(filename), "%s.ppm", prefix);
+
+   f = fopen(filename, "wb");
+   if (f) {
+      int i, x, y;
+      int r, g, b;
+      const uint8_t *ptr = (uint8_t *) data;
+
+      /* XXX this is a hack: byte offsets of R, G, B inside one pixel */
+      switch (format) {
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         r = 2;
+         g = 1;
+         b = 0;
+         break;
+      default:
+         r = 0;
+         g = 1;
+         b = 1; /* NOTE(review): same offset as green -- was 2 intended? */
+      }
+
+      fprintf(f, "P6\n");
+      fprintf(f, "# ppm-file created by osdemo.c\n");
+      fprintf(f, "%i %i\n", width, height);
+      fprintf(f, "255\n");
+      /* Keep using the stream opened above instead of closing and reopening
+       * it in append mode: the second fopen() was unchecked, so a failure
+       * there handed a NULL stream to the fputc() calls below. */
+      for (y = 0; y < height; y++) {
+         for (x = 0; x < width; x++) {
+            i = y * stride + x * cpp;
+            fputc(ptr[i + r], f); /* write red */
+            fputc(ptr[i + g], f); /* write green */
+            fputc(ptr[i + b], f); /* write blue */
+         }
+      }
+      fclose(f);
+   }
+   else {
+      fprintf(stderr, "Can't open %s for writing\n", filename);
+   }
 #endif
 }
 
@@ -670,9 +446,9 @@ void debug_dump_surface(const char *prefix,
    
    debug_dump_image(prefix, 
                     texture->format,
-                    pf_get_blocksize(texture->format), 
-                    pf_get_nblocksx(texture->format, transfer->width),
-                    pf_get_nblocksy(texture->format, transfer->height),
+                    util_format_get_blocksize(texture->format), 
+                    util_format_get_nblocksx(texture->format, transfer->width),
+                    util_format_get_nblocksy(texture->format, transfer->height),
                     transfer->stride,
                     data);
    
@@ -682,6 +458,27 @@ error:
 }
 
 
+void debug_dump_texture(const char *prefix,
+                        struct pipe_texture *texture)
+{
+   struct pipe_surface *surface;
+   struct pipe_screen *screen;
+   /* Dump one surface of 'texture' via debug_dump_surface(); NULL is a no-op. */
+   if (!texture)
+      return;
+
+   screen = texture->screen;
+
+   /* XXX for now, just dump image for face=0, level=0 */
+   surface = screen->get_tex_surface(screen, texture, 0, 0, 0,
+                                     PIPE_TEXTURE_USAGE_SAMPLER);
+   if (surface) {
+      debug_dump_surface(prefix, surface);
+      screen->tex_surface_destroy(surface);
+   }
+}
+
+
 #pragma pack(push,2)
 struct bmp_file_header {
    uint16_t bfType;
@@ -767,7 +564,7 @@ debug_dump_float_rgba_bmp(const char *filename,
                           float *rgba, unsigned stride)
 {
 #ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT
-   struct util_stream *stream;
+   struct os_stream *stream;
    struct bmp_file_header bmfh;
    struct bmp_info_header bmih;
    unsigned x, y;
@@ -793,12 +590,12 @@ debug_dump_float_rgba_bmp(const char *filename,
    bmih.biClrUsed = 0;
    bmih.biClrImportant = 0;
 
-   stream = util_stream_create(filename, bmfh.bfSize);
+   stream = os_file_stream_create(filename);
    if(!stream)
       goto error1;
 
-   util_stream_write(stream, &bmfh, 14);
-   util_stream_write(stream, &bmih, 40);
+   os_stream_write(stream, &bmfh, 14);
+   os_stream_write(stream, &bmih, 40);
 
    y = height;
    while(y--) {
@@ -810,11 +607,11 @@ debug_dump_float_rgba_bmp(const char *filename,
          pixel.rgbGreen = float_to_ubyte(ptr[x*4 + 1]);
          pixel.rgbBlue  = float_to_ubyte(ptr[x*4 + 2]);
          pixel.rgbAlpha = 255;
-         util_stream_write(stream, &pixel, 4);
+         os_stream_write(stream, &pixel, 4);
       }
    }
 
-   util_stream_close(stream);
+   os_stream_close(stream);
 error1:
    ;
 #endif
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index abd834c741a..0f4768f3444 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -39,9 +39,7 @@
 #define U_DEBUG_H_
 
 
-#include <stdarg.h>
-
-#include "pipe/p_compiler.h"
+#include "os/os_misc.h"
 
 
 #ifdef	__cplusplus
@@ -49,22 +47,6 @@ extern "C" {
 #endif
 
 
-#if defined(DBG) || defined(DEBUG)
-#ifndef DEBUG
-#define DEBUG 1
-#endif
-#else
-#ifndef NDEBUG
-#define NDEBUG 1
-#endif
-#endif
-
-   
-/* MSVC bebore VC7 does not have the __FUNCTION__ macro */
-#if defined(_MSC_VER) && _MSC_VER < 1300
-#define __FUNCTION__ "???"
-#endif
-
 #if defined(__GNUC__)
 #define _util_printf_format(fmt, list) __attribute__ ((format (printf, fmt, list)))
 #else
@@ -155,13 +137,7 @@ void debug_print_format(const char *msg, unsigned fmt );
  * Hard-coded breakpoint.
  */
 #ifdef DEBUG
-#if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && defined(PIPE_CC_GCC)
-#define debug_break() __asm("int3")
-#elif defined(PIPE_CC_MSVC)
-#define debug_break()  __debugbreak()
-#else
-void debug_break(void);
-#endif
+#define debug_break() os_break()
 #else /* !DEBUG */
 #define debug_break() ((void)0)
 #endif /* !DEBUG */
@@ -188,7 +164,7 @@ void _debug_assert_fail(const char *expr,
 #ifdef DEBUG
 #define debug_assert(expr) ((expr) ? (void)0 : _debug_assert_fail(#expr, __FILE__, __LINE__, __FUNCTION__))
 #else
-#define debug_assert(expr) ((void)0)
+#define debug_assert(expr) do { } while (0 && (expr))
 #endif
 
 
@@ -328,22 +304,6 @@ debug_get_flags_option(const char *name,
                        unsigned long dfault);
 
 
-void *
-debug_malloc(const char *file, unsigned line, const char *function,
-             size_t size);
-
-void
-debug_free(const char *file, unsigned line, const char *function,
-           void *ptr);
-
-void *
-debug_calloc(const char *file, unsigned line, const char *function,
-             size_t count, size_t size );
-
-void *
-debug_realloc(const char *file, unsigned line, const char *function,
-              void *old_ptr, size_t old_size, size_t new_size );
-
 unsigned long
 debug_memory_begin(void);
 
@@ -354,6 +314,8 @@ debug_memory_end(unsigned long beginning);
 #ifdef DEBUG
 struct pipe_surface;
 struct pipe_transfer;
+struct pipe_texture;
+
 void debug_dump_image(const char *prefix,
                       unsigned format, unsigned cpp,
                       unsigned width, unsigned height,
@@ -361,6 +323,8 @@ void debug_dump_image(const char *prefix,
                       const void *data);
 void debug_dump_surface(const char *prefix,
                         struct pipe_surface *surface);   
+void debug_dump_texture(const char *prefix,
+                        struct pipe_texture *texture);
 void debug_dump_surface_bmp(const char *filename,
                             struct pipe_surface *surface);
 void debug_dump_transfer_bmp(const char *filename,
@@ -373,7 +337,9 @@ void debug_dump_float_rgba_bmp(const char *filename,
 #define debug_dump_surface(prefix, surface) ((void)0)
+/* debug_dump_texture is declared only under DEBUG; stub it like its siblings. */
+#define debug_dump_texture(prefix, texture) ((void)0)
 #define debug_dump_surface_bmp(filename, surface) ((void)0)
 #define debug_dump_transfer_bmp(filename, transfer) ((void)0)
-#define debug_dump_rgba_float_bmp(filename, width, height, rgba, stride) ((void)0)
+#define debug_dump_float_rgba_bmp(filename, width, height, rgba, stride) ((void)0)
 #endif
 
 
diff --git a/src/gallium/auxiliary/util/u_debug_memory.c b/src/gallium/auxiliary/util/u_debug_memory.c
index 7623cb93981..f1baa62f894 100644
--- a/src/gallium/auxiliary/util/u_debug_memory.c
+++ b/src/gallium/auxiliary/util/u_debug_memory.c
@@ -34,15 +34,10 @@
 
 #include "pipe/p_config.h" 
 
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-#include <windows.h>
-#include <winddi.h>
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-#include <wdm.h>
-#else
-#include <stdio.h>
-#include <stdlib.h>
-#endif
+#define DEBUG_MEMORY_IMPLEMENTATION
+
+#include "os/os_memory.h"
+#include "os/os_memory_debug.h"
 
 #include "util/u_debug.h" 
 #include "util/u_debug_stack.h" 
@@ -53,18 +48,6 @@
 #define DEBUG_MEMORY_STACK 0 /* XXX: disabled until we have symbol lookup */
 
 
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && !defined(WINCE)
-#define real_malloc(_size) EngAllocMem(0, _size, 'D3AG')
-#define real_free(_ptr) EngFreeMem(_ptr)
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-#define real_malloc(_size) ExAllocatePool(0, _size)
-#define real_free(_ptr) ExFreePool(_ptr)
-#else
-#define real_malloc(_size) malloc(_size)
-#define real_free(_ptr) free(_ptr)
-#endif
-
-
 struct debug_memory_header 
 {
    struct list_head head;
@@ -127,7 +110,7 @@ debug_malloc(const char *file, unsigned line, const char *function,
    struct debug_memory_header *hdr;
    struct debug_memory_footer *ftr;
    
-   hdr = real_malloc(sizeof(*hdr) + size + sizeof(*ftr));
+   hdr = os_malloc(sizeof(*hdr) + size + sizeof(*ftr));
    if(!hdr) {
       debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n",
                    file, line, function,
@@ -185,7 +168,7 @@ debug_free(const char *file, unsigned line, const char *function,
    hdr->magic = 0;
    ftr->magic = 0;
    
-   real_free(hdr);
+   os_free(hdr);
 }
 
 void *
@@ -232,7 +215,7 @@ debug_realloc(const char *file, unsigned line, const char *function,
    }
 
    /* alloc new */
-   new_hdr = real_malloc(sizeof(*new_hdr) + new_size + sizeof(*new_ftr));
+   new_hdr = os_malloc(sizeof(*new_hdr) + new_size + sizeof(*new_ftr));
    if(!new_hdr) {
       debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n",
                    file, line, function,
@@ -258,7 +241,7 @@ debug_realloc(const char *file, unsigned line, const char *function,
    /* free old */
    old_hdr->magic = 0;
    old_ftr->magic = 0;
-   real_free(old_hdr);
+   os_free(old_hdr);
 
    return new_ptr;
 }
@@ -297,9 +280,9 @@ debug_memory_end(unsigned long start_no)
 
       if((start_no <= hdr->no && hdr->no < last_no) ||
 	 (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) {
-	 debug_printf("%s:%u:%s: %u bytes at %p not freed\n",
+	 debug_printf("%s:%u:%s: %lu bytes at %p not freed\n",
 		      hdr->file, hdr->line, hdr->function,
-		      hdr->size, ptr);
+		      (unsigned long) hdr->size, ptr);
 #if DEBUG_MEMORY_STACK
 	 debug_backtrace_dump(hdr->backtrace, DEBUG_MEMORY_STACK);
 #endif
@@ -315,8 +298,8 @@ debug_memory_end(unsigned long start_no)
    }
 
    if(total_size) {
-      debug_printf("Total of %u KB of system memory apparently leaked\n",
-		   (total_size + 1023)/1024);
+      debug_printf("Total of %lu KB of system memory apparently leaked\n",
+		   (unsigned long) (total_size + 1023)/1024);
    }
    else {
       debug_printf("No memory leaks detected.\n");
diff --git a/src/gallium/auxiliary/util/u_dl.c b/src/gallium/auxiliary/util/u_dl.c
new file mode 100644
index 00000000000..37ed789f955
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_dl.c
@@ -0,0 +1,80 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * Copyright 1999-2008  Brian Paul
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_config.h"
+#include "pipe/p_compiler.h"
+
+#if defined(PIPE_OS_UNIX)
+#include <dlfcn.h>
+#endif
+#if defined(PIPE_OS_WINDOWS)
+#include <windows.h>
+#endif
+
+#include "u_dl.h"
+
+
+struct util_dl_library *
+util_dl_open(const char *filename)
+{
+#if defined(PIPE_OS_UNIX)
+   return (struct util_dl_library *)dlopen(filename, RTLD_LAZY | RTLD_GLOBAL);
+#elif defined(PIPE_OS_WINDOWS)
+   return (struct util_dl_library *)LoadLibraryA(filename);
+#else
+   return NULL;
+#endif
+}
+
+
+util_dl_proc
+util_dl_get_proc_address(struct util_dl_library *library,
+                         const char *procname)
+{
+#if defined(PIPE_OS_UNIX)
+   return (util_dl_proc)dlsym((void *)library, procname);
+#elif defined(PIPE_OS_WINDOWS)
+   return (util_dl_proc)GetProcAddress((HMODULE)library, procname);
+#else
+   return (util_dl_proc)NULL;
+#endif
+}
+
+
+void
+util_dl_close(struct util_dl_library *library)
+{
+#if defined(PIPE_OS_UNIX)
+   dlclose((void *)library);
+#elif defined(PIPE_OS_WINDOWS)
+   FreeLibrary((HMODULE)library);
+#else
+   (void)library;
+#endif
+}
diff --git a/src/gallium/auxiliary/util/u_dl.h b/src/gallium/auxiliary/util/u_dl.h
new file mode 100644
index 00000000000..85296c58af6
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_dl.h
@@ -0,0 +1,73 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef U_DL_H_
+#define U_DL_H_
+
+
+#include "pipe/p_config.h"
+
+
+#if defined(PIPE_OS_WINDOWS)
+#  define UTIL_DL_EXT ".dll"
+#elif defined(PIPE_OS_APPLE)
+#  define UTIL_DL_EXT ".dylib"
+#else
+#  define UTIL_DL_EXT ".so"
+#endif
+
+
+struct util_dl_library;
+
+
+typedef void (*util_dl_proc)(void);
+
+
+/**
+ * Open a library dynamically (dlopen/LoadLibraryA); NULL on other platforms.
+ */
+struct util_dl_library *
+util_dl_open(const char *filename);
+
+
+/**
+ * Lookup a function in a library (dlsym/GetProcAddress); NULL on other platforms.
+ */
+util_dl_proc
+util_dl_get_proc_address(struct util_dl_library *library,
+                         const char *procname);
+
+
+/**
+ * Close a library (dlclose/FreeLibrary); does nothing on other platforms.
+ */
+void
+util_dl_close(struct util_dl_library *library);
+
+
+#endif /* U_DL_H_ */
diff --git a/src/gallium/auxiliary/util/u_draw_quad.c b/src/gallium/auxiliary/util/u_draw_quad.c
index 4110485fb19..14506e84519 100644
--- a/src/gallium/auxiliary/util/u_draw_quad.c
+++ b/src/gallium/auxiliary/util/u_draw_quad.c
@@ -28,7 +28,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
+#include "util/u_inlines.h"
 #include "util/u_draw_quad.h"
 
 
@@ -61,6 +61,7 @@ util_draw_vertex_buffer(struct pipe_context *pipe,
    /* tell pipe about the vertex attributes */
    for (i = 0; i < num_attribs; i++) {
       velements[i].src_offset = i * 4 * sizeof(float);
+      velements[i].instance_divisor = 0;
       velements[i].vertex_buffer_index = 0;
       velements[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       velements[i].nr_components = 4;
diff --git a/src/gallium/auxiliary/util/u_dump.h b/src/gallium/auxiliary/util/u_dump.h
new file mode 100644
index 00000000000..379f18ef38b
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_dump.h
@@ -0,0 +1,173 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Dump data in human/machine readable format.
+ * 
+ * @author Jose Fonseca <[email protected]>
+ */
+
+#ifndef U_DEBUG_DUMP_H_
+#define U_DEBUG_DUMP_H_
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+#define UTIL_DUMP_INVALID_NAME "<invalid>"
+
+
+struct os_stream;
+
+
+/* Duplicated here for convenience */
+extern struct os_stream *
+os_log_stream;
+
+
+/*
+ * p_defines.h
+ *
+ * XXX: These functions don't really dump anything -- just translate into
+ * strings so a verb better than "dump" should be used instead, in order to
+ * free up the namespace to the true dumper functions.
+ */
+
+const char *
+util_dump_blend_factor(unsigned value, boolean shortened);
+
+const char *
+util_dump_blend_func(unsigned value, boolean shortened);
+
+const char *
+util_dump_func(unsigned value, boolean shortened);
+
+const char *
+util_dump_tex_target(unsigned value, boolean shortened);
+
+const char *
+util_dump_tex_wrap(unsigned value, boolean shortened);
+
+const char *
+util_dump_tex_mipfilter(unsigned value, boolean shortened);
+
+const char *
+util_dump_tex_filter(unsigned value, boolean shortened);
+
+
+/*
+ * p_state.h, through an os_stream
+ */
+
+void
+util_dump_template(struct os_stream *stream,
+                   const struct pipe_texture *templat);
+
+void
+util_dump_rasterizer_state(struct os_stream *stream,
+                           const struct pipe_rasterizer_state *state);
+
+void
+util_dump_poly_stipple(struct os_stream *stream,
+                       const struct pipe_poly_stipple *state);
+
+void
+util_dump_viewport_state(struct os_stream *stream,
+                         const struct pipe_viewport_state *state);
+
+void
+util_dump_scissor_state(struct os_stream *stream,
+                        const struct pipe_scissor_state *state);
+
+void
+util_dump_clip_state(struct os_stream *stream,
+                     const struct pipe_clip_state *state);
+
+void
+util_dump_shader_state(struct os_stream *stream,
+                       const struct pipe_shader_state *state);
+
+void
+util_dump_depth_stencil_alpha_state(struct os_stream *stream,
+                                    const struct pipe_depth_stencil_alpha_state *state);
+
+void
+util_dump_rt_blend_state(struct os_stream *stream,
+                         const struct pipe_rt_blend_state *state);
+
+void
+util_dump_blend_state(struct os_stream *stream,
+                      const struct pipe_blend_state *state);
+
+void
+util_dump_blend_color(struct os_stream *stream,
+                      const struct pipe_blend_color *state);
+
+void
+util_dump_stencil_ref(struct os_stream *stream,
+                      const struct pipe_stencil_ref *state);
+
+void
+util_dump_framebuffer_state(struct os_stream *stream,
+                            const struct pipe_framebuffer_state *state);
+
+void
+util_dump_sampler_state(struct os_stream *stream,
+                        const struct pipe_sampler_state *state);
+
+void
+util_dump_surface(struct os_stream *stream,
+                  const struct pipe_surface *state);
+
+void
+util_dump_transfer(struct os_stream *stream,
+                   const struct pipe_transfer *state);
+
+void
+util_dump_vertex_buffer(struct os_stream *stream,
+                        const struct pipe_vertex_buffer *state);
+
+void
+util_dump_vertex_element(struct os_stream *stream,
+                         const struct pipe_vertex_element *state);
+
+
+/* FIXME: Move the other debug_dump_xxx functions out of u_debug.h into here. */
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* U_DEBUG_DUMP_H_ */
diff --git a/src/gallium/auxiliary/util/u_debug_dump.c b/src/gallium/auxiliary/util/u_dump_defines.c
index 09866880aea..96a22563473 100644
--- a/src/gallium/auxiliary/util/u_debug_dump.c
+++ b/src/gallium/auxiliary/util/u_dump_defines.c
@@ -28,15 +28,12 @@
 
 #include "util/u_memory.h"
 #include "util/u_debug.h" 
-#include "util/u_debug_dump.h" 
-
-
-#define DEBUG_DUMP_INVALID_NAME "<invalid>"
+#include "util/u_dump.h"
 
 
 #if 0
 static const char *
-debug_dump_strip_prefix(const char *name,
+util_dump_strip_prefix(const char *name,
                         const char *prefix) 
 {
    const char *stripped;
@@ -55,30 +52,30 @@ debug_dump_strip_prefix(const char *name,
 #endif
 
 static const char *
-debug_dump_enum_continuous(unsigned value, 
+util_dump_enum_continuous(unsigned value,
                            unsigned num_names,
                            const char **names)
 {
    if (value >= num_names)
-      return DEBUG_DUMP_INVALID_NAME;
+      return UTIL_DUMP_INVALID_NAME;
    return names[value];
 }
 
 
-#define DEFINE_DEBUG_DUMP_CONTINUOUS(_name) \
+#define DEFINE_UTIL_DUMP_CONTINUOUS(_name) \
    const char * \
-   debug_dump_##_name(unsigned value, boolean shortened) \
+   util_dump_##_name(unsigned value, boolean shortened) \
    { \
       if(shortened) \
-         return debug_dump_enum_continuous(value, Elements(debug_dump_##_name##_short_names), debug_dump_##_name##_short_names); \
+         return util_dump_enum_continuous(value, Elements(util_dump_##_name##_short_names), util_dump_##_name##_short_names); \
       else \
-         return debug_dump_enum_continuous(value, Elements(debug_dump_##_name##_names), debug_dump_##_name##_names); \
+         return util_dump_enum_continuous(value, Elements(util_dump_##_name##_names), util_dump_##_name##_names); \
    }
 
 
 static const char *
-debug_dump_blend_factor_names[] = {
-   DEBUG_DUMP_INVALID_NAME, /* 0x0 */
+util_dump_blend_factor_names[] = {
+   UTIL_DUMP_INVALID_NAME, /* 0x0 */
    "PIPE_BLENDFACTOR_ONE",
    "PIPE_BLENDFACTOR_SRC_COLOR",
    "PIPE_BLENDFACTOR_SRC_ALPHA",
@@ -89,18 +86,18 @@ debug_dump_blend_factor_names[] = {
    "PIPE_BLENDFACTOR_CONST_ALPHA",
    "PIPE_BLENDFACTOR_SRC1_COLOR",
    "PIPE_BLENDFACTOR_SRC1_ALPHA",
-   DEBUG_DUMP_INVALID_NAME, /* 0x0b */
-   DEBUG_DUMP_INVALID_NAME, /* 0x0c */
-   DEBUG_DUMP_INVALID_NAME, /* 0x0d */
-   DEBUG_DUMP_INVALID_NAME, /* 0x0e */
-   DEBUG_DUMP_INVALID_NAME, /* 0x0f */
-   DEBUG_DUMP_INVALID_NAME, /* 0x10 */
+   UTIL_DUMP_INVALID_NAME, /* 0x0b */
+   UTIL_DUMP_INVALID_NAME, /* 0x0c */
+   UTIL_DUMP_INVALID_NAME, /* 0x0d */
+   UTIL_DUMP_INVALID_NAME, /* 0x0e */
+   UTIL_DUMP_INVALID_NAME, /* 0x0f */
+   UTIL_DUMP_INVALID_NAME, /* 0x10 */
    "PIPE_BLENDFACTOR_ZERO",
    "PIPE_BLENDFACTOR_INV_SRC_COLOR",
    "PIPE_BLENDFACTOR_INV_SRC_ALPHA",
    "PIPE_BLENDFACTOR_INV_DST_ALPHA",
    "PIPE_BLENDFACTOR_INV_DST_COLOR",
-   DEBUG_DUMP_INVALID_NAME, /* 0x16 */
+   UTIL_DUMP_INVALID_NAME, /* 0x16 */
    "PIPE_BLENDFACTOR_INV_CONST_COLOR",
    "PIPE_BLENDFACTOR_INV_CONST_ALPHA",
    "PIPE_BLENDFACTOR_INV_SRC1_COLOR",
@@ -108,8 +105,8 @@ debug_dump_blend_factor_names[] = {
 };
 
 static const char *
-debug_dump_blend_factor_short_names[] = {
-   DEBUG_DUMP_INVALID_NAME, /* 0x0 */
+util_dump_blend_factor_short_names[] = {
+   UTIL_DUMP_INVALID_NAME, /* 0x0 */
    "one",
    "src_color",
    "src_alpha",
@@ -120,29 +117,29 @@ debug_dump_blend_factor_short_names[] = {
    "const_alpha",
    "src1_color",
    "src1_alpha",
-   DEBUG_DUMP_INVALID_NAME, /* 0x0b */
-   DEBUG_DUMP_INVALID_NAME, /* 0x0c */
-   DEBUG_DUMP_INVALID_NAME, /* 0x0d */
-   DEBUG_DUMP_INVALID_NAME, /* 0x0e */
-   DEBUG_DUMP_INVALID_NAME, /* 0x0f */
-   DEBUG_DUMP_INVALID_NAME, /* 0x10 */
+   UTIL_DUMP_INVALID_NAME, /* 0x0b */
+   UTIL_DUMP_INVALID_NAME, /* 0x0c */
+   UTIL_DUMP_INVALID_NAME, /* 0x0d */
+   UTIL_DUMP_INVALID_NAME, /* 0x0e */
+   UTIL_DUMP_INVALID_NAME, /* 0x0f */
+   UTIL_DUMP_INVALID_NAME, /* 0x10 */
    "zero",
    "inv_src_color",
    "inv_src_alpha",
    "inv_dst_alpha",
    "inv_dst_color",
-   DEBUG_DUMP_INVALID_NAME, /* 0x16 */
+   UTIL_DUMP_INVALID_NAME, /* 0x16 */
    "inv_const_color",
    "inv_const_alpha",
    "inv_src1_color",
    "inv_src1_alpha"
 };
 
-DEFINE_DEBUG_DUMP_CONTINUOUS(blend_factor)
+DEFINE_UTIL_DUMP_CONTINUOUS(blend_factor)
 
 
 static const char *
-debug_dump_blend_func_names[] = {
+util_dump_blend_func_names[] = {
    "PIPE_BLEND_ADD",
    "PIPE_BLEND_SUBTRACT",
    "PIPE_BLEND_REVERSE_SUBTRACT",
@@ -151,7 +148,7 @@ debug_dump_blend_func_names[] = {
 };
 
 static const char *
-debug_dump_blend_func_short_names[] = {
+util_dump_blend_func_short_names[] = {
    "add",
    "sub",
    "rev_sub",
@@ -159,11 +156,11 @@ debug_dump_blend_func_short_names[] = {
    "max"
 };
 
-DEFINE_DEBUG_DUMP_CONTINUOUS(blend_func)
+DEFINE_UTIL_DUMP_CONTINUOUS(blend_func)
 
 
 static const char *
-debug_dump_func_names[] = {
+util_dump_func_names[] = {
    "PIPE_FUNC_NEVER",
    "PIPE_FUNC_LESS",
    "PIPE_FUNC_EQUAL",
@@ -175,7 +172,7 @@ debug_dump_func_names[] = {
 };
 
 static const char *
-debug_dump_func_short_names[] = {
+util_dump_func_short_names[] = {
    "never",
    "less",
    "equal",
@@ -186,11 +183,11 @@ debug_dump_func_short_names[] = {
    "always"
 };
 
-DEFINE_DEBUG_DUMP_CONTINUOUS(func)
+DEFINE_UTIL_DUMP_CONTINUOUS(func)
 
 
 static const char *
-debug_dump_tex_target_names[] = {
+util_dump_tex_target_names[] = {
    "PIPE_TEXTURE_1D",
    "PIPE_TEXTURE_2D",
    "PIPE_TEXTURE_3D",
@@ -198,18 +195,18 @@ debug_dump_tex_target_names[] = {
 };
 
 static const char *
-debug_dump_tex_target_short_names[] = {
+util_dump_tex_target_short_names[] = {
    "1d",
    "2d",
    "3d",
    "cube"
 };
 
-DEFINE_DEBUG_DUMP_CONTINUOUS(tex_target)
+DEFINE_UTIL_DUMP_CONTINUOUS(tex_target)
 
 
 static const char *
-debug_dump_tex_wrap_names[] = {
+util_dump_tex_wrap_names[] = {
    "PIPE_TEX_WRAP_REPEAT",
    "PIPE_TEX_WRAP_CLAMP",
    "PIPE_TEX_WRAP_CLAMP_TO_EDGE",
@@ -221,7 +218,7 @@ debug_dump_tex_wrap_names[] = {
 };
 
 static const char *
-debug_dump_tex_wrap_short_names[] = {
+util_dump_tex_wrap_short_names[] = {
    "repeat",
    "clamp",
    "clamp_to_edge",
@@ -232,38 +229,36 @@ debug_dump_tex_wrap_short_names[] = {
    "mirror_clamp_to_border"
 };
 
-DEFINE_DEBUG_DUMP_CONTINUOUS(tex_wrap)
+DEFINE_UTIL_DUMP_CONTINUOUS(tex_wrap)
 
 
 static const char *
-debug_dump_tex_mipfilter_names[] = {
+util_dump_tex_mipfilter_names[] = {
    "PIPE_TEX_MIPFILTER_NEAREST",
    "PIPE_TEX_MIPFILTER_LINEAR",
    "PIPE_TEX_MIPFILTER_NONE"
 };
 
 static const char *
-debug_dump_tex_mipfilter_short_names[] = {
+util_dump_tex_mipfilter_short_names[] = {
    "nearest",
    "linear",
    "none"
 };
 
-DEFINE_DEBUG_DUMP_CONTINUOUS(tex_mipfilter)
+DEFINE_UTIL_DUMP_CONTINUOUS(tex_mipfilter)
 
 
 static const char *
-debug_dump_tex_filter_names[] = {
+util_dump_tex_filter_names[] = {
    "PIPE_TEX_FILTER_NEAREST",
-   "PIPE_TEX_FILTER_LINEAR",
-   "PIPE_TEX_FILTER_ANISO"
+   "PIPE_TEX_FILTER_LINEAR"
 };
 
 static const char *
-debug_dump_tex_filter_short_names[] = {
+util_dump_tex_filter_short_names[] = {
    "nearest",
-   "linear",
-   "aniso"
+   "linear"
 };
 
-DEFINE_DEBUG_DUMP_CONTINUOUS(tex_filter)
+DEFINE_UTIL_DUMP_CONTINUOUS(tex_filter)
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
new file mode 100644
index 00000000000..ae7afd7311e
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -0,0 +1,708 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "os/os_stream.h"
+#include "util/u_memory.h"
+#include "util/u_string.h"
+#include "util/u_format.h"
+#include "tgsi/tgsi_dump.h"
+
+#include "u_dump.h"
+
+
+/*
+ * Dump primitives
+ */
+
+static INLINE void   /* printf-style formatted write to an os_stream */
+util_stream_writef(struct os_stream *stream, const char *format, ...)
+{
+   static char buf[1024];   /* shared scratch buffer, so this is not reentrant */
+   int len;                 /* signed, so a negative error return is detectable */
+   va_list ap;
+   va_start(ap, format);
+   len = util_vsnprintf(buf, sizeof(buf), format, ap);
+   va_end(ap);   /* len may be < 0 on error or >= sizeof(buf) on truncation: clamp it */
+   os_stream_write(stream, buf, len < 0 ? 0 : len < (int)sizeof(buf) ? (size_t)len : sizeof(buf) - 1);
+}
+
+static void
+util_dump_bool(struct os_stream *stream, int value)
+{
+   util_stream_writef(stream, "%c", value ? '1' : '0');
+}
+
+static void
+util_dump_int(struct os_stream *stream, long long int value)
+{
+   util_stream_writef(stream, "%lli", value);
+}
+
+static void
+util_dump_uint(struct os_stream *stream, long long unsigned value)
+{
+   util_stream_writef(stream, "%llu", value);
+}
+
+static void
+util_dump_float(struct os_stream *stream, double value)
+{
+   util_stream_writef(stream, "%g", value);
+}
+
+static void
+util_dump_string(struct os_stream *stream, const char *str)
+{
+   os_stream_write_str(stream, "\"");
+   os_stream_write_str(stream, str);
+   os_stream_write_str(stream, "\"");
+}
+
+static void
+util_dump_enum(struct os_stream *stream, const char *value)
+{
+   os_stream_write_str(stream, value);
+}
+
+static void
+util_dump_array_begin(struct os_stream *stream)
+{
+   os_stream_write_str(stream, "{");
+}
+
+static void
+util_dump_array_end(struct os_stream *stream)
+{
+   os_stream_write_str(stream, "}");
+}
+
+static void
+util_dump_elem_begin(struct os_stream *stream)
+{
+}
+
+static void
+util_dump_elem_end(struct os_stream *stream)
+{
+   os_stream_write_str(stream, ", ");
+}
+
+static void
+util_dump_struct_begin(struct os_stream *stream, const char *name)
+{
+   os_stream_write_str(stream, "{");
+}
+
+static void
+util_dump_struct_end(struct os_stream *stream)
+{
+   os_stream_write_str(stream, "}");
+}
+
+static void
+util_dump_member_begin(struct os_stream *stream, const char *name)
+{
+   util_stream_writef(stream, "%s = ", name);
+}
+
+static void
+util_dump_member_end(struct os_stream *stream)
+{
+   os_stream_write_str(stream, ", ");
+}
+
+static void
+util_dump_null(struct os_stream *stream)
+{
+   os_stream_write_str(stream, "NULL");
+}
+
+static void   /* write a pointer as zero-padded hex, or "NULL" for a null pointer */
+util_dump_ptr(struct os_stream *stream, const void *value)
+{
+   if(value)   /* unsigned long long: unsigned long would truncate where long is 32-bit */
+      util_stream_writef(stream, "0x%08llx", (unsigned long long)(uintptr_t)value);
+   else
+      util_dump_null(stream);
+}
+
+
+/*
+ * Code saving macros.
+ */
+
+#define util_dump_arg(_stream, _type, _arg) \
+   do { \
+      util_dump_arg_begin(_stream, #_arg); \
+      util_dump_##_type(_stream, _arg); \
+      util_dump_arg_end(_stream); \
+   } while(0)
+
+#define util_dump_ret(_stream, _type, _arg) \
+   do { \
+      util_dump_ret_begin(_stream); \
+      util_dump_##_type(_stream, _arg); \
+      util_dump_ret_end(_stream); \
+   } while(0)
+
+#define util_dump_array(_stream, _type, _obj, _size) \
+   do { \
+      size_t idx; \
+      util_dump_array_begin(_stream); \
+      for(idx = 0; idx < (_size); ++idx) { \
+         util_dump_elem_begin(_stream); \
+         util_dump_##_type(_stream, (_obj)[idx]); \
+         util_dump_elem_end(_stream); \
+      } \
+      util_dump_array_end(_stream); \
+   } while(0)
+
+#define util_dump_struct_array(_stream, _type, _obj, _size) \
+   do { \
+      size_t idx; \
+      util_dump_array_begin(_stream); \
+      for(idx = 0; idx < (_size); ++idx) { \
+         util_dump_elem_begin(_stream); \
+         util_dump_##_type(_stream, &(_obj)[idx]); \
+         util_dump_elem_end(_stream); \
+      } \
+      util_dump_array_end(_stream); \
+   } while(0)
+
+#define util_dump_member(_stream, _type, _obj, _member) \
+   do { \
+      util_dump_member_begin(_stream, #_member); \
+      util_dump_##_type(_stream, (_obj)->_member); \
+      util_dump_member_end(_stream); \
+   } while(0)
+
+#define util_dump_arg_array(_stream, _type, _arg, _size) \
+   do { \
+      util_dump_arg_begin(_stream, #_arg); \
+      util_dump_array(_stream, _type, _arg, _size); \
+      util_dump_arg_end(_stream); \
+   } while(0)
+
+#define util_dump_member_array(_stream, _type, _obj, _member) \
+   do { \
+      util_dump_member_begin(_stream, #_member); \
+      util_dump_array(_stream, _type, (_obj)->_member, sizeof((_obj)->_member)/sizeof((_obj)->_member[0])); \
+      util_dump_member_end(_stream); \
+   } while(0)
+
+
+
+/*
+ * Wrappers for enum -> string dumpers.
+ */
+
+
+static void
+util_dump_format(struct os_stream *stream, enum pipe_format format)
+{
+   util_dump_enum(stream, util_format_name(format));
+}
+
+
+static void
+util_dump_enum_blend_factor(struct os_stream *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_blend_factor(value, TRUE));
+}
+
+static void
+util_dump_enum_blend_func(struct os_stream *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_blend_func(value, TRUE));
+}
+
+static void
+util_dump_enum_func(struct os_stream *stream, unsigned value)
+{
+   util_dump_enum(stream, util_dump_func(value, TRUE));
+}
+
+
+/*
+ * Public functions
+ */
+
+
+void
+util_dump_template(struct os_stream *stream, const struct pipe_texture *templat)
+{
+   if(!templat) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_texture");
+
+   util_dump_member(stream, int, templat, target);
+   util_dump_member(stream, format, templat, format);
+
+   util_dump_member_begin(stream, "width");
+   util_dump_uint(stream, templat->width0);
+   util_dump_member_end(stream);
+
+   util_dump_member_begin(stream, "height");
+   util_dump_uint(stream, templat->height0);
+   util_dump_member_end(stream);
+
+   util_dump_member_begin(stream, "depth");
+   util_dump_uint(stream, templat->depth0);
+   util_dump_member_end(stream);
+
+   util_dump_member(stream, uint, templat, last_level);
+   util_dump_member(stream, uint, templat, tex_usage);
+
+   util_dump_struct_end(stream);
+}
+
+
+void
+util_dump_rasterizer_state(struct os_stream *stream, const struct pipe_rasterizer_state *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_rasterizer_state");
+
+   util_dump_member(stream, bool, state, flatshade);
+   util_dump_member(stream, bool, state, light_twoside);
+   util_dump_member(stream, uint, state, front_winding);
+   util_dump_member(stream, uint, state, cull_mode);
+   util_dump_member(stream, uint, state, fill_cw);
+   util_dump_member(stream, uint, state, fill_ccw);
+   util_dump_member(stream, bool, state, offset_cw);
+   util_dump_member(stream, bool, state, offset_ccw);
+   util_dump_member(stream, bool, state, scissor);
+   util_dump_member(stream, bool, state, poly_smooth);
+   util_dump_member(stream, bool, state, poly_stipple_enable);
+   util_dump_member(stream, bool, state, point_smooth);
+   util_dump_member(stream, uint, state, sprite_coord_enable);
+   util_dump_member(stream, bool, state, sprite_coord_mode);
+   util_dump_member(stream, bool, state, point_quad_rasterization);
+   util_dump_member(stream, bool, state, point_size_per_vertex);
+   util_dump_member(stream, bool, state, multisample);
+   util_dump_member(stream, bool, state, line_smooth);
+   util_dump_member(stream, bool, state, line_stipple_enable);
+   util_dump_member(stream, uint, state, line_stipple_factor);
+   util_dump_member(stream, uint, state, line_stipple_pattern);
+   util_dump_member(stream, bool, state, line_last_pixel);
+   util_dump_member(stream, bool, state, flatshade_first);
+   util_dump_member(stream, bool, state, gl_rasterization_rules);
+
+   util_dump_member(stream, float, state, line_width);
+   util_dump_member(stream, float, state, point_size);
+   util_dump_member(stream, float, state, offset_units);
+   util_dump_member(stream, float, state, offset_scale);
+
+   util_dump_struct_end(stream);
+}
+
+
+void
+util_dump_poly_stipple(struct os_stream *stream, const struct pipe_poly_stipple *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_poly_stipple");
+
+   /* util_dump_member_array() writes the "stipple = " prefix and the trailing
+    * separator itself; wrapping it in member_begin/end printed both twice. */
+   util_dump_member_array(stream, uint, state, stipple);
+
+   util_dump_struct_end(stream);
+}
+
+
+void
+util_dump_viewport_state(struct os_stream *stream, const struct pipe_viewport_state *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_viewport_state");
+
+   util_dump_member_array(stream, float, state, scale);
+   util_dump_member_array(stream, float, state, translate);
+
+   util_dump_struct_end(stream);
+}
+
+
+void
+util_dump_scissor_state(struct os_stream *stream, const struct pipe_scissor_state *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_scissor_state");
+
+   util_dump_member(stream, uint, state, minx);
+   util_dump_member(stream, uint, state, miny);
+   util_dump_member(stream, uint, state, maxx);
+   util_dump_member(stream, uint, state, maxy);
+
+   util_dump_struct_end(stream);
+}
+
+
+/**
+ * Dump a pipe_clip_state to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise the "ucp"
+ * member is written as an array of 4-float arrays, followed by the "nr"
+ * member.
+ */
+void
+util_dump_clip_state(struct os_stream *stream, const struct pipe_clip_state *state)
+{
+   unsigned i;
+
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_clip_state");
+
+   util_dump_member_begin(stream, "ucp");
+   util_dump_array_begin(stream);
+   /* All PIPE_MAX_CLIP_PLANES slots are dumped, independently of state->nr. */
+   for(i = 0; i < PIPE_MAX_CLIP_PLANES; ++i) {
+      util_dump_elem_begin(stream);
+      util_dump_array(stream, float, state->ucp[i], 4);
+      util_dump_elem_end(stream);
+   }
+   util_dump_array_end(stream);
+   util_dump_member_end(stream);
+
+   util_dump_member(stream, uint, state, nr);
+
+   util_dump_struct_end(stream);
+}
+
+
+/**
+ * Dump a pipe_shader_state to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise the token
+ * stream is rendered to text with tgsi_dump_str() and written as the string
+ * member "tokens" of a "pipe_shader_state" struct.
+ */
+void
+util_dump_shader_state(struct os_stream *stream, const struct pipe_shader_state *state)
+{
+   /* Fixed-size stack buffer that receives the textual form of the tokens.
+    * NOTE(review): what happens when the text does not fit depends on
+    * tgsi_dump_str(), which is not visible here -- confirm it truncates and
+    * terminates the string.
+    */
+   char str[8192];
+
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   tgsi_dump_str(state->tokens, 0, str, sizeof(str));
+
+   util_dump_struct_begin(stream, "pipe_shader_state");
+
+   util_dump_member_begin(stream, "tokens");
+   util_dump_string(stream, str);
+   util_dump_member_end(stream);
+
+   util_dump_struct_end(stream);
+}
+
+
+/**
+ * Dump a pipe_depth_stencil_alpha_state to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise three members
+ * are written: "depth" (a nested struct), "stencil" (an array of nested
+ * structs, one per element of state->stencil) and "alpha" (a nested struct).
+ * In each nested struct only the "enabled" flag is always written; the
+ * remaining fields are written only when that flag is set.
+ */
+void
+util_dump_depth_stencil_alpha_state(struct os_stream *stream, const struct pipe_depth_stencil_alpha_state *state)
+{
+   unsigned i;
+
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_depth_stencil_alpha_state");
+
+   /* Depth test state. */
+   util_dump_member_begin(stream, "depth");
+   util_dump_struct_begin(stream, "pipe_depth_state");
+   util_dump_member(stream, bool, &state->depth, enabled);
+   if (state->depth.enabled) {
+      util_dump_member(stream, bool, &state->depth, writemask);
+      util_dump_member(stream, enum_func, &state->depth, func);
+   }
+   util_dump_struct_end(stream);
+   util_dump_member_end(stream);
+
+   /* Stencil state, one array element per entry of state->stencil. */
+   util_dump_member_begin(stream, "stencil");
+   util_dump_array_begin(stream);
+   for(i = 0; i < Elements(state->stencil); ++i) {
+      util_dump_elem_begin(stream);
+      util_dump_struct_begin(stream, "pipe_stencil_state");
+      util_dump_member(stream, bool, &state->stencil[i], enabled);
+      if (state->stencil[i].enabled) {
+         util_dump_member(stream, enum_func, &state->stencil[i], func);
+         util_dump_member(stream, uint, &state->stencil[i], fail_op);
+         util_dump_member(stream, uint, &state->stencil[i], zpass_op);
+         util_dump_member(stream, uint, &state->stencil[i], zfail_op);
+         util_dump_member(stream, uint, &state->stencil[i], valuemask);
+         util_dump_member(stream, uint, &state->stencil[i], writemask);
+      }
+      util_dump_struct_end(stream);
+      util_dump_elem_end(stream);
+   }
+   util_dump_array_end(stream);
+   util_dump_member_end(stream);
+
+   /* Alpha test state. */
+   util_dump_member_begin(stream, "alpha");
+   util_dump_struct_begin(stream, "pipe_alpha_state");
+   util_dump_member(stream, bool, &state->alpha, enabled);
+   if (state->alpha.enabled) {
+      util_dump_member(stream, enum_func, &state->alpha, func);
+      util_dump_member(stream, float, &state->alpha, ref_value);
+   }
+   util_dump_struct_end(stream);
+   util_dump_member_end(stream);
+
+   util_dump_struct_end(stream);
+}
+
+/**
+ * Dump a pipe_rt_blend_state to the stream.
+ *
+ * A NULL state is reported through util_dump_null(), matching the other dump
+ * functions in this file.  Otherwise blend_enable is written first; the RGB
+ * and alpha blend functions and factors follow only when blending is
+ * enabled, and colormask is always written last.
+ */
+void
+util_dump_rt_blend_state(struct os_stream *stream, const struct pipe_rt_blend_state *state)
+{
+   /* Guard before state->blend_enable is read below. */
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_rt_blend_state");
+
+   util_dump_member(stream, uint, state, blend_enable);
+   if (state->blend_enable) {
+      util_dump_member(stream, enum_blend_func, state, rgb_func);
+      util_dump_member(stream, enum_blend_factor, state, rgb_src_factor);
+      util_dump_member(stream, enum_blend_factor, state, rgb_dst_factor);
+
+      util_dump_member(stream, enum_blend_func, state, alpha_func);
+      util_dump_member(stream, enum_blend_factor, state, alpha_src_factor);
+      util_dump_member(stream, enum_blend_factor, state, alpha_dst_factor);
+   }
+
+   util_dump_member(stream, uint, state, colormask);
+
+   util_dump_struct_end(stream);
+}
+
+/**
+ * Dump a pipe_blend_state to the stream.
+ *
+ * A NULL state is reported through util_dump_null().  Otherwise dither and
+ * logicop_enable are always written.  When logicop_enable is set only
+ * logicop_func follows; when it is clear, independent_blend_enable and the
+ * "rt" array are written instead.
+ */
+void
+util_dump_blend_state(struct os_stream *stream, const struct pipe_blend_state *state)
+{
+   /* Number of state->rt entries to dump; a single entry unless independent
+    * blending is enabled (see below).
+    */
+   unsigned valid_entries = 1;
+
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_blend_state");
+
+   util_dump_member(stream, bool, state, dither);
+
+   util_dump_member(stream, bool, state, logicop_enable);
+   if (state->logicop_enable) {
+      /* NOTE(review): logicop_func is dumped through the enum_func dumper,
+       * the same one used for depth/stencil/alpha compare functions above --
+       * confirm this is the intended dumper for logic-op values.
+       */
+      util_dump_member(stream, enum_func, state, logicop_func);
+   }
+   else {
+      util_dump_member(stream, bool, state, independent_blend_enable);
+
+      util_dump_member_begin(stream, "rt");
+      /* With independent blending all PIPE_MAX_COLOR_BUFS entries are dumped. */
+      if (state->independent_blend_enable)
+         valid_entries = PIPE_MAX_COLOR_BUFS;
+      util_dump_struct_array(stream, rt_blend_state, state->rt, valid_entries);
+      util_dump_member_end(stream);
+   }
+
+   util_dump_struct_end(stream);
+}
+
+
+/**
+ * Dump a pipe_blend_color to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise the color
+ * array is written as floats inside a "pipe_blend_color" struct.
+ */
+void
+util_dump_blend_color(struct os_stream *stream, const struct pipe_blend_color *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_blend_color");
+
+   util_dump_member_array(stream, float, state, color);
+
+   util_dump_struct_end(stream);
+}
+
+/**
+ * Dump a pipe_stencil_ref to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise the ref_value
+ * array is written as unsigned values inside a "pipe_stencil_ref" struct.
+ */
+void
+util_dump_stencil_ref(struct os_stream *stream, const struct pipe_stencil_ref *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_stencil_ref");
+
+   util_dump_member_array(stream, uint, state, ref_value);
+
+   util_dump_struct_end(stream);
+}
+
+/**
+ * Dump a pipe_framebuffer_state to the stream.
+ *
+ * A NULL state is reported through util_dump_null(), matching the other dump
+ * functions in this file.  Otherwise width, height and nr_cbufs are written
+ * as unsigned values, followed by the cbufs array and zsbuf as pointers.
+ */
+void
+util_dump_framebuffer_state(struct os_stream *stream, const struct pipe_framebuffer_state *state)
+{
+   /* Guard before any member of state is read below. */
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_framebuffer_state");
+
+   util_dump_member(stream, uint, state, width);
+   util_dump_member(stream, uint, state, height);
+   util_dump_member(stream, uint, state, nr_cbufs);
+   util_dump_member_array(stream, ptr, state, cbufs);
+   util_dump_member(stream, ptr, state, zsbuf);
+
+   util_dump_struct_end(stream);
+}
+
+
+/**
+ * Dump a pipe_sampler_state to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise the wrap
+ * modes, filters, compare settings, normalized_coords, max_anisotropy, LOD
+ * values and border_color are written inside a "pipe_sampler_state" struct.
+ * The wrap modes, filters and compare_mode are written as plain unsigned
+ * values; only compare_func goes through the enum_func dumper.
+ */
+void
+util_dump_sampler_state(struct os_stream *stream, const struct pipe_sampler_state *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_sampler_state");
+
+   util_dump_member(stream, uint, state, wrap_s);
+   util_dump_member(stream, uint, state, wrap_t);
+   util_dump_member(stream, uint, state, wrap_r);
+   util_dump_member(stream, uint, state, min_img_filter);
+   util_dump_member(stream, uint, state, min_mip_filter);
+   util_dump_member(stream, uint, state, mag_img_filter);
+   util_dump_member(stream, uint, state, compare_mode);
+   util_dump_member(stream, enum_func, state, compare_func);
+   util_dump_member(stream, bool, state, normalized_coords);
+   util_dump_member(stream, uint, state, max_anisotropy);
+   util_dump_member(stream, float, state, lod_bias);
+   util_dump_member(stream, float, state, min_lod);
+   util_dump_member(stream, float, state, max_lod);
+   util_dump_member_array(stream, float, state, border_color);
+
+   util_dump_struct_end(stream);
+}
+
+
+/**
+ * Dump a pipe_surface to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise the format,
+ * dimensions, layout/offset/usage, the texture pointer and the
+ * face/level/zslice values are written inside a "pipe_surface" struct.
+ * Only the texture pointer value is dumped, not the texture it points to.
+ */
+void
+util_dump_surface(struct os_stream *stream, const struct pipe_surface *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_surface");
+
+   util_dump_member(stream, format, state, format);
+   util_dump_member(stream, uint, state, width);
+   util_dump_member(stream, uint, state, height);
+
+   util_dump_member(stream, uint, state, layout);
+   util_dump_member(stream, uint, state, offset);
+   util_dump_member(stream, uint, state, usage);
+
+   util_dump_member(stream, ptr, state, texture);
+   util_dump_member(stream, uint, state, face);
+   util_dump_member(stream, uint, state, level);
+   util_dump_member(stream, uint, state, zslice);
+
+   util_dump_struct_end(stream);
+}
+
+
+/**
+ * Dump a pipe_transfer to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise the
+ * dimensions, stride, usage, the texture pointer and the face/level/zslice
+ * values are written inside a "pipe_transfer" struct.  Only the texture
+ * pointer value is dumped, not the texture it points to.
+ */
+void
+util_dump_transfer(struct os_stream *stream, const struct pipe_transfer *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_transfer");
+
+   util_dump_member(stream, uint, state, width);
+   util_dump_member(stream, uint, state, height);
+
+   util_dump_member(stream, uint, state, stride);
+   util_dump_member(stream, uint, state, usage);
+
+   util_dump_member(stream, ptr, state, texture);
+   util_dump_member(stream, uint, state, face);
+   util_dump_member(stream, uint, state, level);
+   util_dump_member(stream, uint, state, zslice);
+
+   util_dump_struct_end(stream);
+}
+
+
+/**
+ * Dump a pipe_vertex_buffer to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise stride,
+ * max_index and buffer_offset are written as unsigned values and buffer as a
+ * pointer, inside a "pipe_vertex_buffer" struct.
+ */
+void
+util_dump_vertex_buffer(struct os_stream *stream, const struct pipe_vertex_buffer *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_vertex_buffer");
+
+   util_dump_member(stream, uint, state, stride);
+   util_dump_member(stream, uint, state, max_index);
+   util_dump_member(stream, uint, state, buffer_offset);
+   util_dump_member(stream, ptr, state, buffer);
+
+   util_dump_struct_end(stream);
+}
+
+
+/**
+ * Dump a pipe_vertex_element to the stream.
+ *
+ * A NULL state is reported through util_dump_null(); otherwise src_offset,
+ * vertex_buffer_index and nr_components are written as unsigned values and
+ * src_format through the format dumper, inside a "pipe_vertex_element"
+ * struct.
+ */
+void
+util_dump_vertex_element(struct os_stream *stream, const struct pipe_vertex_element *state)
+{
+   if(!state) {
+      util_dump_null(stream);
+      return;
+   }
+
+   util_dump_struct_begin(stream, "pipe_vertex_element");
+
+   util_dump_member(stream, uint, state, src_offset);
+
+   util_dump_member(stream, uint, state, vertex_buffer_index);
+   util_dump_member(stream, uint, state, nr_components);
+
+   util_dump_member(stream, format, state, src_format);
+
+   util_dump_struct_end(stream);
+}
diff --git a/src/gallium/auxiliary/util/u_format.csv b/src/gallium/auxiliary/util/u_format.csv
index b9cc2aa716e..96a0fa65507 100644
--- a/src/gallium/auxiliary/util/u_format.csv
+++ b/src/gallium/auxiliary/util/u_format.csv
@@ -1,109 +1,186 @@
-PIPE_FORMAT_A8R8G8B8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , zyxw, rgb
-PIPE_FORMAT_X8R8G8B8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , zyx1, rgb
-PIPE_FORMAT_B8G8R8A8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , yzwx, rgb
-PIPE_FORMAT_B8G8R8X8_UNORM        , arith , 1, 1, un8 , un8 , un8 , un8 , yzw1, rgb
-PIPE_FORMAT_A1R5G5B5_UNORM        , arith , 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb
-PIPE_FORMAT_A4R4G4B4_UNORM        , arith , 1, 1, un4 , un4 , un4 , un4 , zyxw, rgb
-PIPE_FORMAT_R5G6B5_UNORM          , arith , 1, 1, un5 , un6 , un5 ,     , zyx1, rgb
-PIPE_FORMAT_A2B10G10R10_UNORM     , arith , 1, 1, un10, un10, un10, un2 , xyzw, rgb
-PIPE_FORMAT_L8_UNORM              , arith , 1, 1, un8 ,     ,     ,     , xxx1, rgb
-PIPE_FORMAT_A8_UNORM              , arith , 1, 1, un8 ,     ,     ,     , 000x, rgb
-PIPE_FORMAT_I8_UNORM              , arith , 1, 1, un8 ,     ,     ,     , xxxx, rgb
-PIPE_FORMAT_A8L8_UNORM            , arith , 1, 1, un8 , un8 ,     ,     , xxxy, rgb
-PIPE_FORMAT_L16_UNORM             , arith , 1, 1, un16,     ,     ,     , xxx1, rgb
-PIPE_FORMAT_Z16_UNORM             , array , 1, 1, un16,     ,     ,     , x___, zs 
-PIPE_FORMAT_Z32_UNORM             , array , 1, 1, un32,     ,     ,     , x___, zs 
-PIPE_FORMAT_Z32_FLOAT             , array , 1, 1, f32 ,     ,     ,     , x___, zs 
-PIPE_FORMAT_S8Z24_UNORM           , arith , 1, 1, un24, un8 ,     ,     , xy__, zs 
-PIPE_FORMAT_Z24S8_UNORM           , arith , 1, 1, un8 , un24,     ,     , yx__, zs 
-PIPE_FORMAT_X8Z24_UNORM           , arith , 1, 1, un24, un8 ,     ,     , x___, zs 
-PIPE_FORMAT_Z24X8_UNORM           , arith , 1, 1, un8 , un24,     ,     , y___, zs 
-PIPE_FORMAT_S8_UNORM              , array , 1, 1, un8 ,     ,     ,     , _x__, zs 
-PIPE_FORMAT_R64_FLOAT             , array , 1, 1, f64 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R64G64_FLOAT          , array , 1, 1, f64 , f64 ,     ,     , xy01, rgb
-PIPE_FORMAT_R64G64B64_FLOAT       , array , 1, 1, f64 , f64 , f64 ,     , xyz1, rgb
-PIPE_FORMAT_R64G64B64A64_FLOAT    , array , 1, 1, f64 , f64 , f64 , f64 , xyzw, rgb
-PIPE_FORMAT_R32_FLOAT             , array , 1, 1, f32 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R32G32_FLOAT          , array , 1, 1, f32 , f32 ,     ,     , xy01, rgb
-PIPE_FORMAT_R32G32B32_FLOAT       , array , 1, 1, f32 , f32 , f32 ,     , xyz1, rgb
-PIPE_FORMAT_R32G32B32A32_FLOAT    , array , 1, 1, f32 , f32 , f32 , f32 , xyzw, rgb
-PIPE_FORMAT_R32_UNORM             , array , 1, 1, un32,     ,     ,     , x001, rgb
-PIPE_FORMAT_R32G32_UNORM          , array , 1, 1, un32, un32,     ,     , xy01, rgb
-PIPE_FORMAT_R32G32B32_UNORM       , array , 1, 1, un32, un32, un32,     , xyz1, rgb
-PIPE_FORMAT_R32G32B32A32_UNORM    , array , 1, 1, un32, un32, un32, un32, xyzw, rgb
-PIPE_FORMAT_R32_USCALED           , array , 1, 1, u32 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R32G32_USCALED        , array , 1, 1, u32 , u32 ,     ,     , xy01, rgb
-PIPE_FORMAT_R32G32B32_USCALED     , array , 1, 1, u32 , u32 , u32 ,     , xyz1, rgb
-PIPE_FORMAT_R32G32B32A32_USCALED  , array , 1, 1, u32 , u32 , u32 , u32 , xyzw, rgb
-PIPE_FORMAT_R32_SNORM             , array , 1, 1, sn32,     ,     ,     , x001, rgb
-PIPE_FORMAT_R32G32_SNORM          , array , 1, 1, sn32, sn32,     ,     , xy01, rgb
-PIPE_FORMAT_R32G32B32_SNORM       , array , 1, 1, sn32, sn32, sn32,     , xyz1, rgb
-PIPE_FORMAT_R32G32B32A32_SNORM    , array , 1, 1, sn32, sn32, sn32, sn32, xyzw, rgb
-PIPE_FORMAT_R32_SSCALED           , array , 1, 1, s32 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R32G32_SSCALED        , array , 1, 1, s32 , s32 ,     ,     , xy01, rgb
-PIPE_FORMAT_R32G32B32_SSCALED     , array , 1, 1, s32 , s32 , s32 ,     , xyz1, rgb
-PIPE_FORMAT_R32G32B32A32_SSCALED  , array , 1, 1, s32 , s32 , s32 , s32 , xyzw, rgb
-PIPE_FORMAT_R16_UNORM             , array , 1, 1, un16,     ,     ,     , x001, rgb
-PIPE_FORMAT_R16G16_UNORM          , array , 1, 1, un16, un16,     ,     , xy01, rgb
-PIPE_FORMAT_R16G16B16_UNORM       , array , 1, 1, un16, un16, un16,     , xyz1, rgb
-PIPE_FORMAT_R16G16B16A16_UNORM    , array , 1, 1, un16, un16, un16, un16, xyzw, rgb
-PIPE_FORMAT_R16_USCALED           , array , 1, 1, u16 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R16G16_USCALED        , array , 1, 1, u16 , u16 ,     ,     , xy01, rgb
-PIPE_FORMAT_R16G16B16_USCALED     , array , 1, 1, u16 , u16 , u16 ,     , xyz1, rgb
-PIPE_FORMAT_R16G16B16A16_USCALED  , array , 1, 1, u16 , u16 , u16 , u16 , xyzw, rgb
-PIPE_FORMAT_R16_SNORM             , array , 1, 1, sn16,     ,     ,     , x001, rgb
-PIPE_FORMAT_R16G16_SNORM          , array , 1, 1, sn16, sn16,     ,     , xy01, rgb
-PIPE_FORMAT_R16G16B16_SNORM       , array , 1, 1, sn16, sn16, sn16,     , xyz1, rgb
-PIPE_FORMAT_R16G16B16A16_SNORM    , array , 1, 1, sn16, sn16, sn16, sn16, xyzw, rgb
-PIPE_FORMAT_R16_SSCALED           , array , 1, 1, s16 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R16G16_SSCALED        , array , 1, 1, s16 , s16 ,     ,     , xy01, rgb
-PIPE_FORMAT_R16G16B16_SSCALED     , array , 1, 1, s16 , s16 , s16 ,     , xyz1, rgb
-PIPE_FORMAT_R16G16B16A16_SSCALED  , array , 1, 1, s16 , s16 , s16 , s16 , xyzw, rgb
-PIPE_FORMAT_R8_UNORM              , array , 1, 1, un8 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R8G8_UNORM            , array , 1, 1, un8 , un8 ,     ,     , xy01, rgb
-PIPE_FORMAT_R8G8B8_UNORM          , array , 1, 1, un8 , un8 , un8 ,     , xyz1, rgb
-PIPE_FORMAT_R8G8B8A8_UNORM        , array , 1, 1, un8 , un8 , un8 , un8 , xyzw, rgb
-PIPE_FORMAT_R8G8B8X8_UNORM        , array , 1, 1, un8 , un8 , un8 , un8 , xyz1, rgb
-PIPE_FORMAT_R8_USCALED            , array , 1, 1, u8  ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R8G8_USCALED          , array , 1, 1, u8  , u8  ,     ,     , xy01, rgb
-PIPE_FORMAT_R8G8B8_USCALED        , array , 1, 1, u8  , u8  , u8  ,     , xyz1, rgb
-PIPE_FORMAT_R8G8B8A8_USCALED      , array , 1, 1, u8  , u8  , u8  , u8  , xyzw, rgb
-PIPE_FORMAT_R8G8B8X8_USCALED      , array , 1, 1, u8  , u8  , u8  , u8  , xyz1, rgb
-PIPE_FORMAT_R8_SNORM              , array , 1, 1, sn8 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R8G8_SNORM            , array , 1, 1, sn8 , sn8 ,     ,     , xy01, rgb
-PIPE_FORMAT_R8G8B8_SNORM          , array , 1, 1, sn8 , sn8 , sn8 ,     , xyz1, rgb
-PIPE_FORMAT_R8G8B8A8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb
-PIPE_FORMAT_R8G8B8X8_SNORM        , array , 1, 1, sn8 , sn8 , sn8 , sn8 , xyz1, rgb
-PIPE_FORMAT_B6G5R5_SNORM          , arith , 1, 1, sn5 , sn5 , sn6 ,     , zyx1, rgb
-PIPE_FORMAT_A8B8G8R8_SNORM        , arith , 1, 1, sn8 , sn8 , sn8 , sn8 , zyxw, rgb
-PIPE_FORMAT_X8B8G8R8_SNORM        , arith , 1, 1, sn8 , sn8 , sn8 , sn8 , zyx1, rgb
-PIPE_FORMAT_R8_SSCALED            , array , 1, 1, s8  ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R8G8_SSCALED          , array , 1, 1, s8  , s8  ,     ,     , xy01, rgb
-PIPE_FORMAT_R8G8B8_SSCALED        , array , 1, 1, s8  , s8  , s8  ,     , xyz1, rgb
-PIPE_FORMAT_R8G8B8A8_SSCALED      , array , 1, 1, s8  , s8  , s8  , s8  , xyzw, rgb
-PIPE_FORMAT_R8G8B8X8_SSCALED      , array , 1, 1, s8  , s8  , s8  , s8  , xyz1, rgb
-PIPE_FORMAT_R32_FIXED             , array , 1, 1, h32 ,     ,     ,     , x001, rgb
-PIPE_FORMAT_R32G32_FIXED          , array , 1, 1, h32 , h32 ,     ,     , xy01, rgb
-PIPE_FORMAT_R32G32B32_FIXED       , array , 1, 1, h32 , h32 , h32 ,     , xyz1, rgb
-PIPE_FORMAT_R32G32B32A32_FIXED    , array , 1, 1, h32 , h32 , h32 , h32 , xyzw, rgb
-PIPE_FORMAT_L8_SRGB               , arith , 1, 1, u8  ,     ,     ,     , xxx1, srgb 
-PIPE_FORMAT_A8L8_SRGB             , arith , 1, 1, u8  , u8  ,     ,     , xxxy, srgb 
-PIPE_FORMAT_R8G8B8_SRGB           , arith , 1, 1, u8  , u8  , u8  ,     , xyz1, srgb 
-PIPE_FORMAT_R8G8B8A8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , xyzw, srgb 
-PIPE_FORMAT_R8G8B8X8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , xyz1, srgb 
-PIPE_FORMAT_A8R8G8B8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , wxyz, srgb 
-PIPE_FORMAT_X8R8G8B8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , 1xyz, srgb 
-PIPE_FORMAT_B8G8R8A8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , zyxw, srgb 
-PIPE_FORMAT_B8G8R8X8_SRGB         , arith , 1, 1, u8  , u8  , u8  , u8  , zyx1, srgb 
-PIPE_FORMAT_X8UB8UG8SR8S_NORM     , arith , 1, 1, sn8 , sn8 , un8 , x8  , 1zyx, rgb
-PIPE_FORMAT_B6UG5SR5S_NORM        , arith , 1, 1, sn5 , sn5 , un6 ,     , xyz1, rgb
-PIPE_FORMAT_YCBCR                 , yuv   , 2, 1, x32 ,     ,     ,     , xyz1, yuv
-PIPE_FORMAT_YCBCR_REV             , yuv   , 2, 1, x32 ,     ,     ,     , xyz1, yuv
-PIPE_FORMAT_DXT1_RGBA             , dxt   , 4, 4, x64 ,     ,     ,     , xyzw, rgb
-PIPE_FORMAT_DXT1_RGB              , dxt   , 4, 4, x64 ,     ,     ,     , xyz1, rgb
-PIPE_FORMAT_DXT3_RGBA             , dxt   , 4, 4, x128,     ,     ,     , xyzw, rgb
-PIPE_FORMAT_DXT5_RGBA             , dxt   , 4, 4, x128,     ,     ,     , xyzw, rgb
-PIPE_FORMAT_DXT1_SRGBA            , dxt   , 4, 4, x64 ,     ,     ,     , xyzw, srgb
-PIPE_FORMAT_DXT1_SRGB             , dxt   , 4, 4, x64 ,     ,     ,     , xyz1, srgb
-PIPE_FORMAT_DXT3_SRGBA            , dxt   , 4, 4, x128,     ,     ,     , xyzw, srgb
-PIPE_FORMAT_DXT5_SRGBA            , dxt   , 4, 4, x128,     ,     ,     , xyzw, srgb
+###########################################################################
+# 
+# Copyright 2009-2010 VMware, Inc.
+# All Rights Reserved.
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sub license, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+# 
+# The above copyright notice and this permission notice (including the
+# next paragraph) shall be included in all copies or substantial portions
+# of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# 
+###########################################################################
+
+# This CSV file has the input data for u_format.h's struct
+# util_format_description.
+#
+# Each format entry contains:
+# - name, per enum pipe_format
+# - layout, per enum util_format_layout, abbreviated and in lower case
+# - pixel block's width
+# - pixel block's height
+# - channel encoding (only meaningful for plain layout), containing for each
+#   channel the following information:
+#   - type, one of
+#     - 'x': void
+#     - 'u': unsigned
+#     - 's': signed
+#     - 'h': fixed
+#     - 'f': float
+#   - optionally followed by 'n' if it is normalized
+#   - number of bits
+# - channel swizzle 
+# - color space: rgb, srgb, yuv, zs
+#
+# See also:
+# - http://msdn.microsoft.com/en-us/library/ee416489.aspx (D3D9)
+# - http://msdn.microsoft.com/en-us/library/ee415668.aspx (D3D9 -> D3D10)
+# - http://msdn.microsoft.com/en-us/library/ee418116.aspx (D3D10)
+#
+# Note that GL doesn't really specify the layout of internal formats. See
+# OpenGL 2.1 specification, Table 3.16, on the "Correspondence of sized
+# internal formats to base internal formats, and desired component
+# resolutions for each sized internal format."
+
+# Typical rendertarget formats
+PIPE_FORMAT_B8G8R8A8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , zyxw, rgb
+PIPE_FORMAT_B8G8R8X8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , zyx1, rgb
+PIPE_FORMAT_A8R8G8B8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , yzwx, rgb
+PIPE_FORMAT_X8R8G8B8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , yzw1, rgb
+PIPE_FORMAT_A8B8G8R8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , wzyx, rgb
+PIPE_FORMAT_X8B8G8R8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , wzy1, rgb
+PIPE_FORMAT_B5G5R5A1_UNORM        , plain, 1, 1, un5 , un5 , un5 , un1 , zyxw, rgb
+PIPE_FORMAT_B4G4R4A4_UNORM        , plain, 1, 1, un4 , un4 , un4 , un4 , zyxw, rgb
+PIPE_FORMAT_B5G6R5_UNORM          , plain, 1, 1, un5 , un6 , un5 ,     , zyx1, rgb
+PIPE_FORMAT_R10G10B10A2_UNORM     , plain, 1, 1, un10, un10, un10, un2 , xyzw, rgb
+
+# Luminance/Intensity/Alpha formats
+PIPE_FORMAT_L8_UNORM              , plain, 1, 1, un8 ,     ,     ,     , xxx1, rgb
+PIPE_FORMAT_A8_UNORM              , plain, 1, 1, un8 ,     ,     ,     , 000x, rgb
+PIPE_FORMAT_I8_UNORM              , plain, 1, 1, un8 ,     ,     ,     , xxxx, rgb
+PIPE_FORMAT_L8A8_UNORM            , plain, 1, 1, un8 , un8 ,     ,     , xxxy, rgb
+PIPE_FORMAT_L16_UNORM             , plain, 1, 1, un16,     ,     ,     , xxx1, rgb
+
+# SRGB formats
+PIPE_FORMAT_L8_SRGB               , plain, 1, 1, un8 ,     ,     ,     , xxx1, srgb 
+PIPE_FORMAT_L8A8_SRGB             , plain, 1, 1, un8 , un8 ,     ,     , xxxy, srgb 
+PIPE_FORMAT_R8G8B8_SRGB           , plain, 1, 1, un8 , un8 , un8 ,     , xyz1, srgb 
+PIPE_FORMAT_A8B8G8R8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , wzyx, srgb
+PIPE_FORMAT_X8B8G8R8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , wzy1, srgb
+PIPE_FORMAT_B8G8R8A8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , zyxw, srgb
+PIPE_FORMAT_B8G8R8X8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , zyx1, srgb
+PIPE_FORMAT_A8R8G8B8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , yzwx, srgb
+PIPE_FORMAT_X8R8G8B8_SRGB         , plain, 1, 1, un8 , un8 , un8 , un8 , yzw1, srgb
+
+# Mixed-sign formats (typically used for bump map textures)
+PIPE_FORMAT_R8SG8SB8UX8U_NORM     , plain, 1, 1, sn8 , sn8 , un8 , x8  , xyz1, rgb
+PIPE_FORMAT_R5SG5SB6U_NORM        , plain, 1, 1, sn5 , sn5 , un6 ,     , xyz1, rgb
+
+# Depth-stencil formats
+PIPE_FORMAT_S8_UNORM              , plain, 1, 1, un8 ,     ,     ,     , _x__, zs 
+PIPE_FORMAT_Z16_UNORM             , plain, 1, 1, un16,     ,     ,     , x___, zs 
+PIPE_FORMAT_Z32_UNORM             , plain, 1, 1, un32,     ,     ,     , x___, zs 
+PIPE_FORMAT_Z32_FLOAT             , plain, 1, 1, f32 ,     ,     ,     , x___, zs 
+PIPE_FORMAT_Z24S8_UNORM           , plain, 1, 1, un24, un8 ,     ,     , xy__, zs 
+PIPE_FORMAT_S8Z24_UNORM           , plain, 1, 1, un8 , un24,     ,     , yx__, zs 
+PIPE_FORMAT_Z24X8_UNORM           , plain, 1, 1, un24, un8 ,     ,     , x___, zs 
+PIPE_FORMAT_X8Z24_UNORM           , plain, 1, 1, un8 , un24,     ,     , y___, zs 
+
+# YUV formats
+# http://www.fourcc.org/yuv.php#UYVY
+PIPE_FORMAT_UYVY                 , subsampled, 2, 1, x32 ,     ,     ,     , xyz1, yuv
+# http://www.fourcc.org/yuv.php#YUYV (a.k.a http://www.fourcc.org/yuv.php#YUY2)
+# XXX: u_tile.c's ycbcr_get_tile_rgba actually interprets it as VYUY but the 
+# intent should be to match D3DFMT_YUY2
+PIPE_FORMAT_YUYV                 , subsampled, 2, 1, x32 ,     ,     ,     , xyz1, yuv
+
+# Compressed formats
+PIPE_FORMAT_DXT1_RGB              , compressed, 4, 4, x64 ,     ,     ,     , xyz1, rgb
+PIPE_FORMAT_DXT1_RGBA             , compressed, 4, 4, x64 ,     ,     ,     , xyzw, rgb
+PIPE_FORMAT_DXT3_RGBA             , compressed, 4, 4, x128,     ,     ,     , xyzw, rgb
+PIPE_FORMAT_DXT5_RGBA             , compressed, 4, 4, x128,     ,     ,     , xyzw, rgb
+PIPE_FORMAT_DXT1_SRGB             , compressed, 4, 4, x64 ,     ,     ,     , xyz1, srgb
+PIPE_FORMAT_DXT1_SRGBA            , compressed, 4, 4, x64 ,     ,     ,     , xyzw, srgb
+PIPE_FORMAT_DXT3_SRGBA            , compressed, 4, 4, x128,     ,     ,     , xyzw, srgb
+PIPE_FORMAT_DXT5_SRGBA            , compressed, 4, 4, x128,     ,     ,     , xyzw, srgb
+
+# Straightforward D3D10-like formats (also used for 
+# vertex buffer element description)
+# 
+# See also:
+# - src/gallium/auxiliary/translate/translate_generic.c
+# - src/mesa/state_tracker/st_draw.c
+PIPE_FORMAT_R64_FLOAT             , plain, 1, 1, f64 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R64G64_FLOAT          , plain, 1, 1, f64 , f64 ,     ,     , xy01, rgb
+PIPE_FORMAT_R64G64B64_FLOAT       , plain, 1, 1, f64 , f64 , f64 ,     , xyz1, rgb
+PIPE_FORMAT_R64G64B64A64_FLOAT    , plain, 1, 1, f64 , f64 , f64 , f64 , xyzw, rgb
+PIPE_FORMAT_R32_FLOAT             , plain, 1, 1, f32 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R32G32_FLOAT          , plain, 1, 1, f32 , f32 ,     ,     , xy01, rgb
+PIPE_FORMAT_R32G32B32_FLOAT       , plain, 1, 1, f32 , f32 , f32 ,     , xyz1, rgb
+PIPE_FORMAT_R32G32B32A32_FLOAT    , plain, 1, 1, f32 , f32 , f32 , f32 , xyzw, rgb
+PIPE_FORMAT_R32_UNORM             , plain, 1, 1, un32,     ,     ,     , x001, rgb
+PIPE_FORMAT_R32G32_UNORM          , plain, 1, 1, un32, un32,     ,     , xy01, rgb
+PIPE_FORMAT_R32G32B32_UNORM       , plain, 1, 1, un32, un32, un32,     , xyz1, rgb
+PIPE_FORMAT_R32G32B32A32_UNORM    , plain, 1, 1, un32, un32, un32, un32, xyzw, rgb
+PIPE_FORMAT_R32_USCALED           , plain, 1, 1, u32 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R32G32_USCALED        , plain, 1, 1, u32 , u32 ,     ,     , xy01, rgb
+PIPE_FORMAT_R32G32B32_USCALED     , plain, 1, 1, u32 , u32 , u32 ,     , xyz1, rgb
+PIPE_FORMAT_R32G32B32A32_USCALED  , plain, 1, 1, u32 , u32 , u32 , u32 , xyzw, rgb
+PIPE_FORMAT_R32_SNORM             , plain, 1, 1, sn32,     ,     ,     , x001, rgb
+PIPE_FORMAT_R32G32_SNORM          , plain, 1, 1, sn32, sn32,     ,     , xy01, rgb
+PIPE_FORMAT_R32G32B32_SNORM       , plain, 1, 1, sn32, sn32, sn32,     , xyz1, rgb
+PIPE_FORMAT_R32G32B32A32_SNORM    , plain, 1, 1, sn32, sn32, sn32, sn32, xyzw, rgb
+PIPE_FORMAT_R32_SSCALED           , plain, 1, 1, s32 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R32G32_SSCALED        , plain, 1, 1, s32 , s32 ,     ,     , xy01, rgb
+PIPE_FORMAT_R32G32B32_SSCALED     , plain, 1, 1, s32 , s32 , s32 ,     , xyz1, rgb
+PIPE_FORMAT_R32G32B32A32_SSCALED  , plain, 1, 1, s32 , s32 , s32 , s32 , xyzw, rgb
+PIPE_FORMAT_R32_FIXED             , plain, 1, 1, h32 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R32G32_FIXED          , plain, 1, 1, h32 , h32 ,     ,     , xy01, rgb
+PIPE_FORMAT_R32G32B32_FIXED       , plain, 1, 1, h32 , h32 , h32 ,     , xyz1, rgb
+PIPE_FORMAT_R32G32B32A32_FIXED    , plain, 1, 1, h32 , h32 , h32 , h32 , xyzw, rgb
+PIPE_FORMAT_R16_UNORM             , plain, 1, 1, un16,     ,     ,     , x001, rgb
+PIPE_FORMAT_R16G16_UNORM          , plain, 1, 1, un16, un16,     ,     , xy01, rgb
+PIPE_FORMAT_R16G16B16_UNORM       , plain, 1, 1, un16, un16, un16,     , xyz1, rgb
+PIPE_FORMAT_R16G16B16A16_UNORM    , plain, 1, 1, un16, un16, un16, un16, xyzw, rgb
+PIPE_FORMAT_R16_USCALED           , plain, 1, 1, u16 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R16G16_USCALED        , plain, 1, 1, u16 , u16 ,     ,     , xy01, rgb
+PIPE_FORMAT_R16G16B16_USCALED     , plain, 1, 1, u16 , u16 , u16 ,     , xyz1, rgb
+PIPE_FORMAT_R16G16B16A16_USCALED  , plain, 1, 1, u16 , u16 , u16 , u16 , xyzw, rgb
+PIPE_FORMAT_R16_SNORM             , plain, 1, 1, sn16,     ,     ,     , x001, rgb
+PIPE_FORMAT_R16G16_SNORM          , plain, 1, 1, sn16, sn16,     ,     , xy01, rgb
+PIPE_FORMAT_R16G16B16_SNORM       , plain, 1, 1, sn16, sn16, sn16,     , xyz1, rgb
+PIPE_FORMAT_R16G16B16A16_SNORM    , plain, 1, 1, sn16, sn16, sn16, sn16, xyzw, rgb
+PIPE_FORMAT_R16_SSCALED           , plain, 1, 1, s16 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R16G16_SSCALED        , plain, 1, 1, s16 , s16 ,     ,     , xy01, rgb
+PIPE_FORMAT_R16G16B16_SSCALED     , plain, 1, 1, s16 , s16 , s16 ,     , xyz1, rgb
+PIPE_FORMAT_R16G16B16A16_SSCALED  , plain, 1, 1, s16 , s16 , s16 , s16 , xyzw, rgb
+PIPE_FORMAT_R8_UNORM              , plain, 1, 1, un8 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R8G8_UNORM            , plain, 1, 1, un8 , un8 ,     ,     , xy01, rgb
+PIPE_FORMAT_R8G8B8_UNORM          , plain, 1, 1, un8 , un8 , un8 ,     , xyz1, rgb
+PIPE_FORMAT_R8G8B8A8_UNORM        , plain, 1, 1, un8 , un8 , un8 , un8 , xyzw, rgb
+PIPE_FORMAT_R8_USCALED            , plain, 1, 1, u8  ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R8G8_USCALED          , plain, 1, 1, u8  , u8  ,     ,     , xy01, rgb
+PIPE_FORMAT_R8G8B8_USCALED        , plain, 1, 1, u8  , u8  , u8  ,     , xyz1, rgb
+PIPE_FORMAT_R8G8B8A8_USCALED      , plain, 1, 1, u8  , u8  , u8  , u8  , xyzw, rgb
+PIPE_FORMAT_R8_SNORM              , plain, 1, 1, sn8 ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R8G8_SNORM            , plain, 1, 1, sn8 , sn8 ,     ,     , xy01, rgb
+PIPE_FORMAT_R8G8B8_SNORM          , plain, 1, 1, sn8 , sn8 , sn8 ,     , xyz1, rgb
+PIPE_FORMAT_R8G8B8A8_SNORM        , plain, 1, 1, sn8 , sn8 , sn8 , sn8 , xyzw, rgb
+PIPE_FORMAT_R8_SSCALED            , plain, 1, 1, s8  ,     ,     ,     , x001, rgb
+PIPE_FORMAT_R8G8_SSCALED          , plain, 1, 1, s8  , s8  ,     ,     , xy01, rgb
+PIPE_FORMAT_R8G8B8_SSCALED        , plain, 1, 1, s8  , s8  , s8  ,     , xyz1, rgb
+PIPE_FORMAT_R8G8B8A8_SSCALED      , plain, 1, 1, s8  , s8  , s8  , s8  , xyzw, rgb
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index 6740683a618..e8fa0022b5b 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 Vmware, Inc.
+ * Copyright 2009-2010 Vmware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -31,14 +31,40 @@
 
 
 #include "pipe/p_format.h"
+#include "util/u_debug.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
+
+/**
+ * Describe how to pack/unpack pixels into/from the prescribed format.
+ *
+ * XXX: This could be renamed to something like util_format_pack, or broken
+ * down into flags inside util_format_block that say exactly what we want.
+ */
 enum util_format_layout {
-   UTIL_FORMAT_LAYOUT_SCALAR = 0,
-   UTIL_FORMAT_LAYOUT_ARITH = 1,
-   UTIL_FORMAT_LAYOUT_ARRAY = 2,
-   UTIL_FORMAT_LAYOUT_YUV = 3,
-   UTIL_FORMAT_LAYOUT_DXT = 4
+   /**
+    * Formats with util_format_block::width == util_format_block::height == 1
+    * that can be described as an ordinary data structure.
+    */
+   UTIL_FORMAT_LAYOUT_PLAIN = 0,
+
+   /**
+    * Formats with sub-sampled channels.
+    *
+    * This is for formats like YV12 where there is less than one sample per
+    * pixel.
+    *
+    * XXX: This note was left unfinished in the original source.
+    */
+   UTIL_FORMAT_LAYOUT_SUBSAMPLED = 3,
+
+   /**
+    * An unspecified compression algorithm.
+    */
+   UTIL_FORMAT_LAYOUT_COMPRESSED = 4
 };
 
 
@@ -79,7 +105,7 @@ enum util_format_colorspace {
    UTIL_FORMAT_COLORSPACE_RGB = 0,
    UTIL_FORMAT_COLORSPACE_SRGB = 1,
    UTIL_FORMAT_COLORSPACE_YUV = 2,
-   UTIL_FORMAT_COLORSPACE_ZS = 3,
+   UTIL_FORMAT_COLORSPACE_ZS = 3
 };
 
 
@@ -95,10 +121,50 @@ struct util_format_description
 {
    enum pipe_format format;
    const char *name;
+
+   /**
+    * Pixel block dimensions.
+    */
    struct util_format_block block;
+
    enum util_format_layout layout;
+
+   /**
+    * The number of channels.
+    */
+   unsigned nr_channels:3;
+
+   /**
+    * Whether all channels have the same number of (whole) bytes.
+    */
+   unsigned is_array:1;
+
+   /**
+    * Whether channels have mixed types (ignoring UTIL_FORMAT_TYPE_VOID).
+    */
+   unsigned is_mixed:1;
+
+   /**
+    * Input channel description.
+    *
+    * Only valid for UTIL_FORMAT_LAYOUT_PLAIN formats.
+    */
    struct util_format_channel_description channel[4];
+
+   /**
+    * Output channel swizzle.
+    *
+    * The order is either:
+    * - RGBA
+    * - YUV(A)
+    * - ZS
+    * depending on the colorspace.
+    */
    unsigned char swizzle[4];
+
+   /**
+    * Colorspace transformation.
+    */
    enum util_format_colorspace colorspace;
 };
 
@@ -111,6 +177,234 @@ const struct util_format_description *
 util_format_description(enum pipe_format format);
 
 
+/*
+ * Format query functions.
+ */
+
+static INLINE const char *
+util_format_name(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(format);
+   if (!format) {
+      return "???";
+   }
+
+   return desc->name;
+}
+
+static INLINE boolean 
+util_format_is_compressed(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(format);
+   if (!format) {
+      return FALSE;
+   }
+
+   return desc->layout == UTIL_FORMAT_LAYOUT_COMPRESSED ? TRUE : FALSE;
+}
+
+static INLINE boolean 
+util_format_is_depth_or_stencil(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(format);
+   if (!format) {
+      return FALSE;
+   }
+
+   return desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? TRUE : FALSE;
+}
+
+static INLINE boolean 
+util_format_is_depth_and_stencil(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(format);
+   if (!format) {
+      return FALSE;
+   }
+
+   if (desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) {
+      return FALSE;
+   }
+
+   return (desc->swizzle[0] != UTIL_FORMAT_SWIZZLE_NONE &&
+           desc->swizzle[1] != UTIL_FORMAT_SWIZZLE_NONE) ? TRUE : FALSE;
+}
+
+
+/**
+ * Return total bits needed for the pixel format per block.
+ */
+static INLINE uint
+util_format_get_blocksizebits(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(format);
+   if (!format) {
+      return 0;
+   }
+
+   return desc->block.bits;
+}
+
+/**
+ * Return bytes per block (not pixel) for the given format.
+ */
+static INLINE uint
+util_format_get_blocksize(enum pipe_format format)
+{
+   uint bits = util_format_get_blocksizebits(format);
+
+   assert(bits % 8 == 0);
+
+   return bits / 8;
+}
+
+static INLINE uint
+util_format_get_blockwidth(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(format);
+   if (!format) {
+      return 1;
+   }
+
+   return desc->block.width;
+}
+
+static INLINE uint
+util_format_get_blockheight(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(format);
+   if (!format) {
+      return 1;
+   }
+
+   return desc->block.height;
+}
+
+static INLINE unsigned
+util_format_get_nblocksx(enum pipe_format format,
+                         unsigned x)
+{
+   unsigned blockwidth = util_format_get_blockwidth(format);
+   return (x + blockwidth - 1) / blockwidth;
+}
+
+static INLINE unsigned
+util_format_get_nblocksy(enum pipe_format format,
+                         unsigned y)
+{
+   unsigned blockheight = util_format_get_blockheight(format);
+   return (y + blockheight - 1) / blockheight;
+}
+
+static INLINE unsigned
+util_format_get_nblocks(enum pipe_format format,
+                        unsigned width,
+                        unsigned height)
+{
+   return util_format_get_nblocksx(format, width) * util_format_get_nblocksy(format, height);
+}
+
+static INLINE size_t
+util_format_get_stride(enum pipe_format format,
+                       unsigned width)
+{
+   return util_format_get_nblocksx(format, width) * util_format_get_blocksize(format);
+}
+
+static INLINE size_t
+util_format_get_2d_size(enum pipe_format format,
+                        size_t stride,
+                        unsigned height)
+{
+   return util_format_get_nblocksy(format, height) * stride;
+}
+
+static INLINE uint
+util_format_get_component_bits(enum pipe_format format,
+                               enum util_format_colorspace colorspace,
+                               uint component)
+{
+   const struct util_format_description *desc = util_format_description(format);
+   enum util_format_colorspace desc_colorspace;
+
+   assert(format);
+   if (!format) {
+      return 0;
+   }
+
+   assert(component < 4);
+
+   /* Treat RGB and SRGB as equivalent. */
+   if (colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+      colorspace = UTIL_FORMAT_COLORSPACE_RGB;
+   }
+   if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
+      desc_colorspace = UTIL_FORMAT_COLORSPACE_RGB;
+   } else {
+      desc_colorspace = desc->colorspace;
+   }
+
+   if (desc_colorspace != colorspace) {
+      return 0;
+   }
+
+   switch (desc->swizzle[component]) {
+   case UTIL_FORMAT_SWIZZLE_X:
+      return desc->channel[0].size;
+   case UTIL_FORMAT_SWIZZLE_Y:
+      return desc->channel[1].size;
+   case UTIL_FORMAT_SWIZZLE_Z:
+      return desc->channel[2].size;
+   case UTIL_FORMAT_SWIZZLE_W:
+      return desc->channel[3].size;
+   default:
+      return 0;
+   }
+}
+
+static INLINE boolean
+util_format_has_alpha(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+
+   assert(format);
+   if (!format) {
+      return FALSE;
+   }
+
+   switch (desc->colorspace) {
+   case UTIL_FORMAT_COLORSPACE_RGB:
+   case UTIL_FORMAT_COLORSPACE_SRGB:
+      return desc->swizzle[3] != UTIL_FORMAT_SWIZZLE_1;
+   case UTIL_FORMAT_COLORSPACE_YUV:
+      return FALSE;
+   case UTIL_FORMAT_COLORSPACE_ZS:
+      return FALSE;
+   default:
+      assert(0);
+      return FALSE;
+   }
+}
+
+
+/*
+ * Format access functions.
+ */
+
 void
 util_format_read_4f(enum pipe_format format,
                     float *dst, unsigned dst_stride, 
@@ -135,4 +429,8 @@ util_format_write_4ub(enum pipe_format format,
                       void *dst, unsigned dst_stride, 
                       unsigned x, unsigned y, unsigned w, unsigned h);
 
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
 #endif /* ! U_FORMAT_H */
diff --git a/src/gallium/auxiliary/util/u_format_access.py b/src/gallium/auxiliary/util/u_format_access.py
index eeb1a9657fd..00424779d28 100644
--- a/src/gallium/auxiliary/util/u_format_access.py
+++ b/src/gallium/auxiliary/util/u_format_access.py
@@ -37,20 +37,10 @@
 '''
 
 
+import math
 import sys
 
-from u_format_parse import *
-
-
-def short_name(format):
-    '''Make up a short norm for a format, suitable to be used as suffix in
-    function names.'''
-
-    name = format.name
-    if name.startswith('PIPE_FORMAT_'):
-        name = name[len('PIPE_FORMAT_'):]
-    name = name.lower()
-    return name
+from u_format_pack import *
 
 
 def is_format_supported(format):
@@ -63,16 +53,16 @@ def is_format_supported(format):
     if format.colorspace not in ('rgb', 'zs'):
         return False
 
-    if format.layout not in (ARITH, ARRAY):
+    if format.layout != PLAIN:
         return False
 
     for i in range(4):
-        type = format.in_types[i]
-        if type.kind not in (VOID, UNSIGNED, FLOAT):
+        channel = format.channels[i]
+        if channel.type not in (VOID, UNSIGNED, FLOAT):
             return False
 
     # We can only read a color from a depth/stencil format if the depth channel is present
-    if format.colorspace == 'zs' and format.out_swizzle[0] == SWIZZLE_NONE:
+    if format.colorspace == 'zs' and format.swizzles[0] == SWIZZLE_NONE:
         return False
 
     return True
@@ -81,187 +71,48 @@ def is_format_supported(format):
 def native_type(format):
     '''Get the native appropriate for a format.'''
 
-    if format.layout == ARITH:
-        # For arithmetic pixel formats return the integer type that matches the whole pixel
-        return 'uint%u_t' % format.block_size()
-    elif format.layout == ARRAY:
-        # For array pixel formats return the integer type that matches the color channel
-        type = format.in_types[0]
-        if type.kind == UNSIGNED:
-            return 'uint%u_t' % type.size
-        elif type.kind == SIGNED:
-            return 'int%u_t' % type.size
-        elif type.kind == FLOAT:
-            if type.size == 32:
-                return 'float'
-            elif type.size == 64:
-                return 'double'
+    if format.layout == PLAIN:
+        if not format.is_array():
+            # For arithmetic pixel formats return the integer type that matches the whole pixel
+            return 'uint%u_t' % format.block_size()
+        else:
+            # For array pixel formats return the integer type that matches the color channel
+            channel = format.channels[0]
+            if channel.type == UNSIGNED:
+                return 'uint%u_t' % channel.size
+            elif channel.type == SIGNED:
+                return 'int%u_t' % channel.size
+            elif channel.type == FLOAT:
+                if channel.size == 32:
+                    return 'float'
+                elif channel.size == 64:
+                    return 'double'
+                else:
+                    assert False
             else:
                 assert False
-        else:
-            assert False
-    else:
-        assert False
-
-
-def intermediate_native_type(bits, sign):
-    '''Find a native type adequate to hold intermediate results of the request bit size.'''
-
-    bytes = 4 # don't use anything smaller than 32bits
-    while bytes * 8 < bits:
-        bytes *= 2
-    bits = bytes*8
-
-    if sign:
-        return 'int%u_t' % bits
-    else:
-        return 'uint%u_t' % bits
-
-
-def get_one_shift(type):
-    '''Get the number of the bit that matches unity for this type.'''
-    if type.kind == 'FLOAT':
-        assert False
-    if not type.norm:
-        return 0
-    if type.kind == UNSIGNED:
-        return type.size
-    if type.kind == SIGNED:
-        return type.size - 1
-    if type.kind == FIXED:
-        return type.size / 2
-    assert False
-
-
-def get_one(type):
-    '''Get the value of unity for this type.'''
-    if type.kind == 'FLOAT' or not type.norm:
-        return 1
-    else:
-        return (1 << get_one_shift(type)) - 1
-
-
-def generate_clamp():
-    '''Code generate the clamping functions for each type.
-
-    We don't use a macro so that arguments with side effects, 
-    like *src_pixel++ are correctly handled.
-    '''
-
-    for suffix, native_type in [
-        ('', 'double'),
-        ('f', 'float'),
-        ('ui', 'unsigned int'),
-        ('si', 'int'),
-    ]:
-        print 'static INLINE %s' % native_type
-        print 'clamp%s(%s value, %s lbound, %s ubound)' % (suffix, native_type, native_type, native_type)
-        print '{'
-        print '   if(value < lbound)'
-        print '      return lbound;'
-        print '   if(value > ubound)'
-        print '      return ubound;'
-        print '   return value;'
-        print '}'
-        print
-
-
-def clamp_expr(src_type, dst_type, dst_native_type, value):
-    '''Generate the expression to clamp the value in the source type to the
-    destination type range.'''
-
-    if src_type == dst_type:
-        return value
-
-    # Pick the approriate clamp function
-    if src_type.kind == FLOAT:
-        if src_type.size == 32:
-            func = 'clampf'
-        elif src_type.size == 64:
-            func = 'clamp'
-        else:
-            assert False
-    elif src_type.kind == UNSIGNED:
-        func = 'clampui'
-    elif src_type.kind == SIGNED:
-        func = 'clampsi'
     else:
         assert False
 
-    # Clamp floats to [-1, 1] or [0, 1] range
-    if src_type.kind == FLOAT and dst_type.norm:
-        max = 1
-        if src_type.sign and dst_type.sign:
-            min = -1
-        else:
-            min = 0
-        return '%s(%s, %s, %s)' % (func, value, min, max)
-                
-    # FIXME: Also clamp scaled values
-
-    return value
-
-
-def conversion_expr(src_type, dst_type, dst_native_type, value):
-    '''Generate the expression to convert a value between two types.'''
-
-    if src_type == dst_type:
-        return value
 
-    if src_type.kind == FLOAT and dst_type.kind == FLOAT:
-        return '(%s)%s' % (dst_native_type, value)
-    
-    if not src_type.norm and not dst_type.norm:
-        return '(%s)%s' % (dst_native_type, value)
-
-    value = clamp_expr(src_type, dst_type, dst_native_type, value)
-
-    if dst_type.kind == FLOAT:
-        if src_type.norm:
-            one = get_one(src_type)
-            if src_type.size <= 23:
-                scale = '(1.0f/0x%x)' % one
-            else:
-                # bigger than single precision mantissa, use double
-                scale = '(1.0/0x%x)' % one
-            value = '(%s * %s)' % (value, scale)
-        return '(%s)%s' % (dst_native_type, value)
-
-    if src_type.kind == FLOAT:
-        if dst_type.norm:
-            dst_one = get_one(dst_type)
-            if dst_type.size <= 23:
-                scale = '0x%x' % dst_one
-            else:
-                # bigger than single precision mantissa, use double
-                scale = '(double)0x%x' % dst_one
-            value = '(%s * %s)' % (value, scale)
-        return '(%s)%s' % (dst_native_type, value)
-
-    if src_type.kind == dst_type.kind:
-        src_one = get_one(src_type)
-        dst_one = get_one(dst_type)
-
-        if src_one > dst_one and src_type.norm and dst_type.norm:
-            # We can just bitshift
-            src_shift = get_one_shift(src_type)
-            dst_shift = get_one_shift(dst_type)
-            value = '(%s >> %s)' % (value, src_shift - dst_shift)
-        else:
-            # We need to rescale using an intermediate type big enough to hold the multiplication of both
-            tmp_native_type = intermediate_native_type(src_type.size + dst_type.size, src_type.sign and dst_type.sign)
-            value = '(%s)%s' % (tmp_native_type, value)
-            value = '%s * 0x%x / 0x%x' % (value, dst_one, src_one)
-        value = '(%s)%s' % (dst_native_type, value)
-        return value
-
-    assert False
+def generate_srgb_tables():
+    print 'static ubyte srgb_to_linear[256] = {'
+    for i in range(256):
+        print '   %s,' % (int(math.pow((i / 255.0 + 0.055) / 1.055, 2.4) * 255))
+    print '};'
+    print
+    print 'static ubyte linear_to_srgb[256] = {'
+    print '   0,'
+    for i in range(1, 256):
+        print '   %s,' % (int((1.055 * math.pow(i / 255.0, 0.41666) - 0.055) * 255))
+    print '};'
+    print
 
 
-def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
+def generate_format_read(format, dst_channel, dst_native_type, dst_suffix):
     '''Generate the function to read pixels from a particular format'''
 
-    name = short_name(format)
+    name = format.short_name()
 
     src_native_type = native_type(format)
 
@@ -279,11 +130,11 @@ def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
     names = ['']*4
     if format.colorspace == 'rgb':
         for i in range(4):
-            swizzle = format.out_swizzle[i]
+            swizzle = format.swizzles[i]
             if swizzle < 4:
                 names[swizzle] += 'rgba'[i]
     elif format.colorspace == 'zs':
-        swizzle = format.out_swizzle[0]
+        swizzle = format.swizzles[0]
         if swizzle < 4:
             names[swizzle] = 'z'
         else:
@@ -291,64 +142,66 @@ def generate_format_read(format, dst_type, dst_native_type, dst_suffix):
     else:
         assert False
 
-    if format.layout == ARITH:
-        print '         %s pixel = *src_pixel++;' % src_native_type
-        shift = 0;
-        for i in range(4):
-            src_type = format.in_types[i]
-            width = src_type.size
-            if names[i]:
-                value = 'pixel'
-                mask = (1 << width) - 1
-                if shift:
-                    value = '(%s >> %u)' % (value, shift)
-                if shift + width < format.block_size():
-                    value = '(%s & 0x%x)' % (value, mask)
-                value = conversion_expr(src_type, dst_type, dst_native_type, value)
-                print '         %s %s = %s;' % (dst_native_type, names[i], value)
-            shift += width
-    elif format.layout == ARRAY:
-        for i in range(4):
-            src_type = format.in_types[i]
-            if names[i]:
-                value = '(*src_pixel++)'
-                value = conversion_expr(src_type, dst_type, dst_native_type, value)
-                print '         %s %s = %s;' % (dst_native_type, names[i], value)
+    if format.layout == PLAIN:
+        if not format.is_array():
+            print '         %s pixel = *src_pixel++;' % src_native_type
+            shift = 0;
+            for i in range(4):
+                src_channel = format.channels[i]
+                width = src_channel.size
+                if names[i]:
+                    value = 'pixel'
+                    mask = (1 << width) - 1
+                    if shift:
+                        value = '(%s >> %u)' % (value, shift)
+                    if shift + width < format.block_size():
+                        value = '(%s & 0x%x)' % (value, mask)
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
+                    print '         %s %s = %s;' % (dst_native_type, names[i], value)
+                shift += width
+        else:
+            for i in range(4):
+                src_channel = format.channels[i]
+                if names[i]:
+                    value = 'src_pixel[%u]' % i
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
+                    print '         %s %s = %s;' % (dst_native_type, names[i], value)
+            print '         src_pixel += %u;' % (format.nr_channels())
     else:
         assert False
 
     for i in range(4):
         if format.colorspace == 'rgb':
-            swizzle = format.out_swizzle[i]
+            swizzle = format.swizzles[i]
             if swizzle < 4:
                 value = names[swizzle]
             elif swizzle == SWIZZLE_0:
                 value = '0'
             elif swizzle == SWIZZLE_1:
-                value = '1'
+                value = get_one(dst_channel)
             else:
                 assert False
         elif format.colorspace == 'zs':
             if i < 3:
                 value = 'z'
             else:
-                value = '1'
+                value = get_one(dst_channel)
         else:
             assert False
         print '         *dst_pixel++ = %s; /* %s */' % (value, 'rgba'[i])
 
     print '      }'
     print '      src_row += src_stride;'
-    print '      dst_row += dst_stride/sizeof(%s);' % dst_native_type
+    print '      dst_row += dst_stride/sizeof(*dst_row);'
     print '   }'
     print '}'
     print
     
 
-def generate_format_write(format, src_type, src_native_type, src_suffix):
+def generate_format_write(format, src_channel, src_native_type, src_suffix):
     '''Generate the function to write pixels to a particular format'''
 
-    name = short_name(format)
+    name = format.short_name()
 
     dst_native_type = native_type(format)
 
@@ -363,58 +216,48 @@ def generate_format_write(format, src_type, src_native_type, src_suffix):
     print '      const %s *src_pixel = src_row;' %src_native_type
     print '      for (x = 0; x < w; ++x) {'
 
-    inv_swizzle = [None]*4
-    if format.colorspace == 'rgb':
-        for i in range(4):
-            swizzle = format.out_swizzle[i]
-            if swizzle < 4:
-                inv_swizzle[swizzle] = i
-    elif format.colorspace == 'zs':
-        swizzle = format.out_swizzle[0]
-        if swizzle < 4:
-            inv_swizzle[swizzle] = 0
-    else:
-        assert False
-
-    if format.layout == ARITH:
-        print '         %s pixel = 0;' % dst_native_type
-        shift = 0;
-        for i in range(4):
-            dst_type = format.in_types[i]
-            width = dst_type.size
-            if inv_swizzle[i] is not None:
-                value = 'src_pixel[%u]' % inv_swizzle[i]
-                value = conversion_expr(src_type, dst_type, dst_native_type, value)
-                if shift:
-                    value = '(%s << %u)' % (value, shift)
-                print '         pixel |= %s;' % value
-            shift += width
-        print '         *dst_pixel++ = pixel;'
-    elif format.layout == ARRAY:
-        for i in range(4):
-            dst_type = format.in_types[i]
-            if inv_swizzle[i] is not None:
-                value = 'src_pixel[%u]' % inv_swizzle[i]
-                value = conversion_expr(src_type, dst_type, dst_native_type, value)
-                print '         *dst_pixel++ = %s;' % value
+    inv_swizzle = format.inv_swizzles()
+
+    if format.layout == PLAIN:
+        if not format.is_array():
+            print '         %s pixel = 0;' % dst_native_type
+            shift = 0;
+            for i in range(4):
+                dst_channel = format.channels[i]
+                width = dst_channel.size
+                if inv_swizzle[i] is not None:
+                    value = 'src_pixel[%u]' % inv_swizzle[i]
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
+                    if shift:
+                        value = '(%s << %u)' % (value, shift)
+                    print '         pixel |= %s;' % value
+                shift += width
+            print '         *dst_pixel++ = pixel;'
+        else:
+            for i in range(4):
+                dst_channel = format.channels[i]
+                if inv_swizzle[i] is not None:
+                    value = 'src_pixel[%u]' % inv_swizzle[i]
+                    value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
+                    print '         *dst_pixel++ = %s;' % value
     else:
         assert False
     print '         src_pixel += 4;'
 
     print '      }'
     print '      dst_row += dst_stride;'
-    print '      src_row += src_stride/sizeof(%s);' % src_native_type
+    print '      src_row += src_stride/sizeof(*src_row);'
     print '   }'
     print '}'
     print
     
 
-def generate_read(formats, dst_type, dst_native_type, dst_suffix):
+def generate_read(formats, dst_channel, dst_native_type, dst_suffix):
     '''Generate the dispatch function to read pixels from any format'''
 
     for format in formats:
         if is_format_supported(format):
-            generate_format_read(format, dst_type, dst_native_type, dst_suffix)
+            generate_format_read(format, dst_channel, dst_native_type, dst_suffix)
 
     print 'void'
     print 'util_format_read_%s(enum pipe_format format, %s *dst, unsigned dst_stride, const void *src, unsigned src_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (dst_suffix, dst_native_type)
@@ -424,7 +267,7 @@ def generate_read(formats, dst_type, dst_native_type, dst_suffix):
     for format in formats:
         if is_format_supported(format):
             print '   case %s:' % format.name
-            print '      func = &util_format_%s_read_%s;' % (short_name(format), dst_suffix)
+            print '      func = &util_format_%s_read_%s;' % (format.short_name(), dst_suffix)
             print '      break;'
     print '   default:'
     print '      debug_printf("unsupported format\\n");'
@@ -435,12 +278,12 @@ def generate_read(formats, dst_type, dst_native_type, dst_suffix):
     print
 
 
-def generate_write(formats, src_type, src_native_type, src_suffix):
+def generate_write(formats, src_channel, src_native_type, src_suffix):
     '''Generate the dispatch function to write pixels to any format'''
 
     for format in formats:
         if is_format_supported(format):
-            generate_format_write(format, src_type, src_native_type, src_suffix)
+            generate_format_write(format, src_channel, src_native_type, src_suffix)
 
     print 'void'
     print 'util_format_write_%s(enum pipe_format format, const %s *src, unsigned src_stride, void *dst, unsigned dst_stride, unsigned x, unsigned y, unsigned w, unsigned h)' % (src_suffix, src_native_type)
@@ -451,7 +294,7 @@ def generate_write(formats, src_type, src_native_type, src_suffix):
     for format in formats:
         if is_format_supported(format):
             print '   case %s:' % format.name
-            print '      func = &util_format_%s_write_%s;' % (short_name(format), src_suffix)
+            print '      func = &util_format_%s_write_%s;' % (format.short_name(), src_suffix)
             print '      break;'
     print '   default:'
     print '      debug_printf("unsupported format\\n");'
@@ -473,20 +316,20 @@ def main():
     print __doc__.strip()
     print
     print '#include "pipe/p_compiler.h"'
-    print '#include "u_format.h"'
     print '#include "u_math.h"'
+    print '#include "u_format_pack.h"'
     print
 
-    generate_clamp()
+    generate_srgb_tables()
 
-    type = Type(FLOAT, False, 32)
+    type = Channel(FLOAT, False, 32)
     native_type = 'float'
     suffix = '4f'
 
     generate_read(formats, type, native_type, suffix)
     generate_write(formats, type, native_type, suffix)
 
-    type = Type(UNSIGNED, True, 8)
+    type = Channel(UNSIGNED, True, 8)
     native_type = 'uint8_t'
     suffix = '4ub'
 
diff --git a/src/gallium/auxiliary/util/u_format_pack.py b/src/gallium/auxiliary/util/u_format_pack.py
new file mode 100644
index 00000000000..3f33f7cc021
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_pack.py
@@ -0,0 +1,484 @@
+#!/usr/bin/env python
+
+'''
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Pixel format packing and unpacking functions.
+ *
+ * @author Jose Fonseca <[email protected]>
+ */
+'''
+
+
+import sys
+
+from u_format_parse import *
+
+
+def generate_format_type(format):
+    '''Generate a structure that describes the format.'''
+
+    print 'union util_format_%s {' % format.short_name()
+    if format.is_bitmask():
+        print '   uint%u_t value;' % (format.block_size(),)
+    print '   struct {'
+    for channel in format.channels:
+        if format.is_bitmask() and not format.is_array():
+            if channel.type == VOID:
+                if channel.size:
+                    print '      unsigned %s:%u;' % (channel.name, channel.size)
+            elif channel.type == UNSIGNED:
+                print '      unsigned %s:%u;' % (channel.name, channel.size)
+            elif channel.type == SIGNED:
+                print '      int %s:%u;' % (channel.name, channel.size)
+            else:
+                assert 0
+        else:
+            assert channel.size % 8 == 0 and is_pot(channel.size)
+            if channel.type == VOID:
+                if channel.size:
+                    print '      uint%u_t %s;' % (channel.size, channel.name)
+            elif channel.type == UNSIGNED:
+                print '      uint%u_t %s;' % (channel.size, channel.name)
+            elif channel.type in (SIGNED, FIXED):
+                print '      int%u_t %s;' % (channel.size, channel.name)
+            elif channel.type == FLOAT:
+                if channel.size == 64:
+                    print '      double %s;' % (channel.name)
+                elif channel.size == 32:
+                    print '      float %s;' % (channel.name)
+                elif channel.size == 16:
+                    print '      uint16_t %s;' % (channel.name)
+                else:
+                    assert 0
+            else:
+                assert 0
+    print '   } chan;'
+    print '};'
+    print
+
+
+def bswap_format(format):
+    '''Generate the code that byte-swaps the packed pixel value on big-endian architectures.'''
+
+    if format.is_bitmask() and not format.is_array():
+        print '#ifdef PIPE_ARCH_BIG_ENDIAN'
+        print '   pixel.value = util_bswap%u(pixel.value);' % format.block_size()
+        print '#endif'
+
+
+def is_format_supported(format):
+    '''Determines whether we actually have the plumbing necessary to generate the
+    code to read/write to/from this format.'''
+
+    # FIXME: Ideally we would support any format combination here.
+
+    if format.layout != PLAIN:
+        return False
+
+    for i in range(4):
+        channel = format.channels[i]
+        if channel.type not in (VOID, UNSIGNED, SIGNED, FLOAT):
+            return False
+
+    # We can only read a color from a depth/stencil format if the depth channel is present
+    if format.colorspace == 'zs' and format.swizzles[0] == SWIZZLE_NONE:
+        return False
+
+    return True
+
+
+def native_type(format):
+    '''Get the native type appropriate for a format.'''
+
+    if format.layout == PLAIN:
+        if not format.is_array():
+            # For arithmetic pixel formats return the integer type that matches the whole pixel
+            return 'uint%u_t' % format.block_size()
+        else:
+            # For array pixel formats return the integer type that matches the color channel
+            type = format.channels[0]
+            if type.type == UNSIGNED:
+                return 'uint%u_t' % type.size
+            elif type.type == SIGNED:
+                return 'int%u_t' % type.size
+            elif type.type == FLOAT:
+                if type.size == 32:
+                    return 'float'
+                elif type.size == 64:
+                    return 'double'
+                else:
+                    assert False
+            else:
+                assert False
+    else:
+        assert False
+
+
+def intermediate_native_type(bits, sign):
+    '''Find a native type adequate to hold intermediate results of the requested bit size.'''
+
+    bytes = 4 # don't use anything smaller than 32bits
+    while bytes * 8 < bits:
+        bytes *= 2
+    bits = bytes*8
+
+    if sign:
+        return 'int%u_t' % bits
+    else:
+        return 'uint%u_t' % bits
+
+
+def get_one_shift(type):
+    '''Get the number of the bit that matches unity for this type (0 if not normalized).'''
+    # Compare with the FLOAT constant: type codes are integers, so the string 'FLOAT' never matched
+    assert type.type != FLOAT
+    if not type.norm:
+        return 0
+    if type.type == UNSIGNED:
+        return type.size
+    if type.type == SIGNED:
+        return type.size - 1
+    if type.type == FIXED:
+        return type.size / 2
+    assert False
+
+
+def get_one(type):
+    '''Get the value of unity for this type: 1 for float or non-normalized types, else all one-bits.'''
+    if type.type == FLOAT or not type.norm:  # FLOAT constant, not the string 'FLOAT'
+        return 1
+    else:
+        return (1 << get_one_shift(type)) - 1
+
+
+def generate_clamp():
+    '''Generate the code for the clamping functions of each type.
+
+    We don't use a macro so that arguments with side effects, 
+    like *src_pixel++ are correctly handled.
+    '''
+
+    for suffix, native_type in [
+        ('', 'double'),
+        ('f', 'float'),
+        ('ui', 'unsigned int'),
+        ('si', 'int'),
+    ]:
+        print 'static INLINE %s' % native_type
+        print 'clamp%s(%s value, %s lbound, %s ubound)' % (suffix, native_type, native_type, native_type)
+        print '{'
+        print '   if(value < lbound)'
+        print '      return lbound;'
+        print '   if(value > ubound)'
+        print '      return ubound;'
+        print '   return value;'
+        print '}'
+        print
+
+
+def clamp_expr(src_channel, dst_channel, dst_native_type, value):
+    '''Generate the expression to clamp the value in the source type to the
+    destination type range.'''
+
+    if src_channel == dst_channel:
+        return value
+
+    # Pick the appropriate clamp function
+    if src_channel.type == FLOAT:
+        if src_channel.size == 32:
+            func = 'clampf'
+        elif src_channel.size == 64:
+            func = 'clamp'
+        else:
+            assert False
+    elif src_channel.type == UNSIGNED:
+        func = 'clampui'
+    elif src_channel.type == SIGNED:
+        func = 'clampsi'
+    else:
+        assert False
+
+    src_min = src_channel.min()
+    src_max = src_channel.max()
+    dst_min = dst_channel.min()
+    dst_max = dst_channel.max()
+
+    if src_min < dst_min and src_max > dst_max:
+        return 'CLAMP(%s, %s, %s)' % (value, dst_min, dst_max)
+
+    if src_max > dst_max:
+        return 'MIN2(%s, %s)' % (value, dst_max)
+        
+    if src_min < dst_min:
+        return 'MAX2(%s, %s)' % (value, dst_min)
+
+    return value
+
+
+def conversion_expr(src_channel, dst_channel, dst_native_type, value, clamp=True):
+    '''Generate the expression to convert a value between two types.'''
+
+    if src_channel == dst_channel:
+        return value
+
+    if src_channel.type == FLOAT and dst_channel.type == FLOAT:
+        return '(%s)%s' % (dst_native_type, value)
+    
+    if not src_channel.norm and not dst_channel.norm:
+        return '(%s)%s' % (dst_native_type, value)
+
+    if clamp:
+        value = clamp_expr(src_channel, dst_channel, dst_native_type, value)
+
+    if dst_channel.type == FLOAT:
+        if src_channel.norm:
+            one = get_one(src_channel)
+            if src_channel.size <= 23:
+                scale = '(1.0f/0x%x)' % one
+            else:
+                # bigger than single precision mantissa, use double
+                scale = '(1.0/0x%x)' % one
+            value = '(%s * %s)' % (value, scale)
+        return '(%s)%s' % (dst_native_type, value)
+
+    if src_channel.type == FLOAT:
+        if dst_channel.norm:
+            dst_one = get_one(dst_channel)
+            if dst_channel.size <= 23:
+                scale = '0x%x' % dst_one
+            else:
+                # bigger than single precision mantissa, use double
+                scale = '(double)0x%x' % dst_one
+            value = '(%s * %s)' % (value, scale)
+        return '(%s)%s' % (dst_native_type, value)
+
+    if not src_channel.norm and not dst_channel.norm:
+        # neither is normalized -- just cast
+        return '(%s)%s' % (dst_native_type, value)
+
+    if src_channel.type in (SIGNED, UNSIGNED) and dst_channel.type in (SIGNED, UNSIGNED):
+        src_one = get_one(src_channel)
+        dst_one = get_one(dst_channel)
+
+        if src_one > dst_one and src_channel.norm:
+            # We can just bitshift
+            src_shift = get_one_shift(src_channel)
+            dst_shift = get_one_shift(dst_channel)
+            value = '(%s >> %s)' % (value, src_shift - dst_shift)
+        else:
+            # We need to rescale using an intermediate type big enough to hold the multiplication of both
+            tmp_native_type = intermediate_native_type(src_channel.size + dst_channel.size, src_channel.sign and dst_channel.sign)
+            value = '(%s)%s' % (tmp_native_type, value)
+            value = '(%s * 0x%x / 0x%x)' % (value, dst_one, src_one)
+        value = '(%s)%s' % (dst_native_type, value)
+        return value
+
+    assert False
+
+
+def generate_format_unpack(format, dst_channel, dst_native_type, dst_suffix):
+    '''Generate the function to unpack pixels from a particular format'''
+
+    name = format.short_name()
+
+    src_native_type = native_type(format)
+
+    print 'static INLINE void'
+    print 'util_format_%s_unpack_%s(%s *dst, const void *src)' % (name, dst_suffix, dst_native_type)
+    print '{'
+    print '   union util_format_%s pixel;' % format.short_name()
+    print '   memcpy(&pixel, src, sizeof pixel);'
+    bswap_format(format)
+
+    assert format.layout == PLAIN
+
+    for i in range(4):
+        swizzle = format.swizzles[i]
+        if swizzle < 4:
+            src_channel = format.channels[swizzle]
+            value = 'pixel.chan.%s' % src_channel.name 
+            value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
+        elif swizzle == SWIZZLE_0:
+            value = '0'
+        elif swizzle == SWIZZLE_1:
+            value = get_one(dst_channel)
+        elif swizzle == SWIZZLE_NONE:
+            value = '0'
+        else:
+            assert False
+        if format.colorspace == ZS:
+            if i == 3:
+                value = get_one(dst_channel)
+            elif i >= 1:
+                value = 'dst[0]'
+        print '   dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i])
+
+    print '}'
+    print
+    
+
+def generate_format_pack(format, src_channel, src_native_type, src_suffix):
+    '''Generate the function to pack pixels to a particular format'''
+
+    name = format.short_name()
+
+    dst_native_type = native_type(format)
+
+    print 'static INLINE void'
+    print 'util_format_%s_pack_%s(void *dst, %s r, %s g, %s b, %s a)' % (name, src_suffix, src_native_type, src_native_type, src_native_type, src_native_type)
+    print '{'
+    print '   union util_format_%s pixel;' % format.short_name()
+
+    assert format.layout == PLAIN
+
+    inv_swizzle = format.inv_swizzles()
+
+    for i in range(4):
+        dst_channel = format.channels[i]
+        width = dst_channel.size
+        if inv_swizzle[i] is None:
+            continue
+        value = 'rgba'[inv_swizzle[i]]
+        value = conversion_expr(src_channel, dst_channel, dst_native_type, value)
+        if format.colorspace == ZS:
+            if i == 3:
+                value = get_one(dst_channel)
+            elif i >= 1:
+                value = '0'
+        print '   pixel.chan.%s = %s;' % (dst_channel.name, value)
+
+    bswap_format(format)
+    print '   memcpy(dst, &pixel, sizeof pixel);'
+    print '}'
+    print
+    
+
+def generate_unpack(formats, dst_channel, dst_native_type, dst_suffix):
+    '''Generate the dispatch function to unpack pixels from any format'''
+
+    for format in formats:
+        if is_format_supported(format):
+            generate_format_unpack(format, dst_channel, dst_native_type, dst_suffix)
+
+    print 'static INLINE void'
+    print 'util_format_unpack_%s(enum pipe_format format, %s *dst, const void *src)' % (dst_suffix, dst_native_type)
+    print '{'
+    print '   void (*func)(%s *dst, const void *src);' % dst_native_type
+    print '   switch(format) {'
+    for format in formats:
+        if is_format_supported(format):
+            print '   case %s:' % format.name
+            print '      func = &util_format_%s_unpack_%s;' % (format.short_name(), dst_suffix)
+            print '      break;'
+    print '   default:'
+    print '      debug_printf("unsupported format\\n");'
+    print '      return;'
+    print '   }'
+    print '   func(dst, src);'
+    print '}'
+    print
+
+
+def generate_pack(formats, src_channel, src_native_type, src_suffix):
+    '''Generate the dispatch function to pack pixels to any format'''
+
+    for format in formats:
+        if is_format_supported(format):
+            generate_format_pack(format, src_channel, src_native_type, src_suffix)
+
+    print 'static INLINE void'
+    print 'util_format_pack_%s(enum pipe_format format, void *dst, %s r, %s g, %s b, %s a)' % (src_suffix, src_native_type, src_native_type, src_native_type, src_native_type)
+    print '{'
+    print '   void (*func)(void *dst, %s r, %s g, %s b, %s a);' % (src_native_type, src_native_type, src_native_type, src_native_type)
+    print '   switch(format) {'
+    for format in formats:
+        if is_format_supported(format):
+            print '   case %s:' % format.name
+            print '      func = &util_format_%s_pack_%s;' % (format.short_name(), src_suffix)
+            print '      break;'
+    print '   default:'
+    print '      debug_printf("%s: unsupported format\\n", __FUNCTION__);'
+    print '      return;'
+    print '   }'
+    print '   func(dst, r, g, b, a);'
+    print '}'
+    print
+
+
+def main():
+    '''Print the u_format_pack.h header generated from the CSV files named in sys.argv[1:].'''
+    formats = []
+    for arg in sys.argv[1:]:
+        formats.extend(parse(arg))
+    print '/* This file is autogenerated by u_format_pack.py from u_format.csv. Do not edit directly. */'
+    print
+    # This will print the copyright message on the top of this file
+    print __doc__.strip()
+    print
+    print '#ifndef U_FORMAT_PACK_H'
+    print '#define U_FORMAT_PACK_H'
+    print
+    print '#include "pipe/p_compiler.h"'
+    print '#include "u_math.h"'
+    print '#include "u_format.h"'
+    print
+    # Open the extern "C" block that the footer emitted at the end of this function closes
+    print '#ifdef __cplusplus'
+    print 'extern "C" {'
+    print '#endif'
+    print
+    generate_clamp()
+    for format in formats:
+        if format.layout == PLAIN:
+            generate_format_type(format)
+
+    channel = Channel(FLOAT, False, 32)
+    native_type = 'float'
+    suffix = '4f'
+    generate_unpack(formats, channel, native_type, suffix)
+    generate_pack(formats, channel, native_type, suffix)
+
+    channel = Channel(UNSIGNED, True, 8)
+    native_type = 'uint8_t'
+    suffix = '4ub'
+    generate_unpack(formats, channel, native_type, suffix)
+    generate_pack(formats, channel, native_type, suffix)
+
+    print
+    print '#ifdef __cplusplus'
+    print '}'
+    print '#endif'
+    print
+    print '#endif /* ! U_FORMAT_PACK_H */'
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/gallium/auxiliary/util/u_format_parse.py b/src/gallium/auxiliary/util/u_format_parse.py
index 493aff71127..250926418ec 100755
--- a/src/gallium/auxiliary/util/u_format_parse.py
+++ b/src/gallium/auxiliary/util/u_format_parse.py
@@ -30,64 +30,169 @@
 '''
 
 
-import sys
-
-
 VOID, UNSIGNED, SIGNED, FIXED, FLOAT = range(5)
 
 SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_0, SWIZZLE_1, SWIZZLE_NONE, = range(7)
 
-ARITH = 'arith'
-ARRAY = 'array'
+PLAIN = 'plain'
+
+RGB = 'rgb'
+SRGB = 'srgb'
+YUV = 'yuv'
+ZS = 'zs'
+
 
+def is_pot(x):
+   return (x & (x - 1)) == 0;
 
-class Type:
-    '''Describe the type of a color channel.'''
+
+VERY_LARGE = 99999999999999999999999
+
+
+class Channel:
+    '''Describe a color channel of a pixel format.'''
     
-    def __init__(self, kind, norm, size):
-        self.kind = kind
+    def __init__(self, type, norm, size, name = ''):
+        self.type = type
         self.norm = norm
         self.size = size
-        self.sign = kind in (SIGNED, FIXED, FLOAT)
+        self.sign = type in (SIGNED, FIXED, FLOAT)
+        self.name = name
 
     def __str__(self):
-        s = str(self.kind)
+        s = str(self.type)
         if self.norm:
             s += 'n'
         s += str(self.size)
         return s
 
     def __eq__(self, other):
-        return self.kind == other.kind and self.norm == other.norm and self.size == other.size
+        return self.type == other.type and self.norm == other.norm and self.size == other.size
+
+    def max(self):
+        '''Maximum representable number (VERY_LARGE for floats, 1 for normalized channels).'''
+        if self.type == FLOAT:
+            return VERY_LARGE
+        if self.norm:
+            return 1
+        if self.type == UNSIGNED:
+            return (1 << self.size) - 1
+        if self.type == SIGNED:
+            return (1 << (self.size - 1)) - 1  # largest signed value, counterpart of min()
+        assert False
+    
+    def min(self):
+        '''Minimum representable number.'''
+        if self.type == FLOAT:
+            return -VERY_LARGE
+        if self.type == UNSIGNED:
+            return 0
+        if self.norm:
+            return -1
+        if self.type == SIGNED:
+            return -(1 << (self.size - 1))
+        assert False
 
 
 class Format:
     '''Describe a pixel format.'''
 
-    def __init__(self, name, layout, block_width, block_height, in_types, out_swizzle, colorspace):
+    def __init__(self, name, layout, block_width, block_height, channels, swizzles, colorspace):
         self.name = name
         self.layout = layout
         self.block_width = block_width
         self.block_height = block_height
-        self.in_types = in_types
-        self.out_swizzle = out_swizzle
+        self.channels = channels
+        self.swizzles = swizzles
         self.name = name
         self.colorspace = colorspace
 
     def __str__(self):
         return self.name
 
+    def short_name(self):
+        '''Make up a short name for a format, suitable to be used as suffix in
+        function names.'''
+
+        name = self.name
+        if name.startswith('PIPE_FORMAT_'):
+            name = name[len('PIPE_FORMAT_'):]
+        name = name.lower()
+        return name
+
     def block_size(self):
         size = 0
-        for type in self.in_types:
-            size += type.size
+        for channel in self.channels:
+            size += channel.size
         return size
 
+    def nr_channels(self):
+        nr_channels = 0
+        for channel in self.channels:
+            if channel.size:
+                nr_channels += 1
+        return nr_channels
+
+    def is_array(self):
+        ref_channel = self.channels[0]
+        for channel in self.channels[1:]:
+            if channel.size and (channel.size != ref_channel.size or channel.size % 8):
+                return False
+        return True
+
+    def is_mixed(self):
+        ref_channel = self.channels[0]
+        for channel in self.channels[1:]:
+            if channel.type != VOID:
+                if channel.type != ref_channel.type:
+                    return True
+                if channel.norm != ref_channel.norm:
+                    return True
+        return False
+
+    def is_pot(self):
+        return is_pot(self.block_size())
+
+    def is_int(self):
+        for channel in self.channels:
+            if channel.type not in (VOID, UNSIGNED, SIGNED):
+                return False
+        return True
+
+    def is_float(self):
+        for channel in self.channels:
+            if channel.type not in (VOID, FLOAT):
+                return False
+        return True
+
+    def is_bitmask(self):
+        if self.block_size() > 32:
+            return False
+        if not self.is_pot():
+            return False
+        for channel in self.channels:
+            if not is_pot(channel.size):
+                return True
+            if channel.type not in (VOID, UNSIGNED, SIGNED):
+                return False
+            if channel.size >= 32:
+                return False
+        return True
+
+    def inv_swizzles(self):
+        '''Return an array[4] of inverse swizzle terms'''
+        inv_swizzle = [None]*4
+        for i in range(4):
+            swizzle = self.swizzles[i]
+            if swizzle < 4:
+                inv_swizzle[swizzle] = i
+        return inv_swizzle
+
     def stride(self):
         return self.block_size()/8
 
 
-_kind_parse_map = {
+_type_parse_map = {
     '':  VOID,
     'x': VOID,
     'u': UNSIGNED,
@@ -108,20 +213,55 @@ _swizzle_parse_map = {
 
 def parse(filename):
     '''Parse the format descrition in CSV format in terms of the 
-    Type and Format classes above.'''
+    Channel and Format classes above.'''
 
     stream = open(filename)
     formats = []
     for line in stream:
-        line = line.rstrip()
+        try:
+            comment = line.index('#')
+        except ValueError:
+            pass
+        else:
+            line = line[:comment]
+        line = line.strip()
+        if not line:
+            continue
+
         fields = [field.strip() for field in line.split(',')]
+        
         name = fields[0]
         layout = fields[1]
         block_width, block_height = map(int, fields[2:4])
-        in_types = []
-        for field in fields[4:8]:
+
+        swizzles = [_swizzle_parse_map[swizzle] for swizzle in fields[8]]
+        colorspace = fields[9]
+        
+        if layout == PLAIN:
+            names = ['']*4
+            if colorspace in (RGB, SRGB):
+                for i in range(4):
+                    swizzle = swizzles[i]
+                    if swizzle < 4:
+                        names[swizzle] += 'rgba'[i]
+            elif colorspace == ZS:
+                for i in range(4):
+                    swizzle = swizzles[i]
+                    if swizzle < 4:
+                        names[swizzle] += 'zs'[i]
+            else:
+                assert False
+            for i in range(4):
+                if names[i] == '':
+                    names[i] = 'x'
+        else:
+            names = ['x', 'y', 'z', 'w']
+
+        channels = []
+        for i in range(0, 4):
+            field = fields[4 + i]
             if field:
-                kind = _kind_parse_map[field[0]]
+                type = _type_parse_map[field[0]]
                 if field[1] == 'n':
                     norm = True
                     size = int(field[2:])
@@ -129,13 +269,13 @@ def parse(filename):
                     norm = False
                     size = int(field[1:])
             else:
-                kind = VOID
+                type = VOID
                 norm = False
                 size = 0
-            in_type = Type(kind, norm, size)
-            in_types.append(in_type)
-        out_swizzle = [_swizzle_parse_map[swizzle] for swizzle in fields[8]]
-        colorspace = fields[9]
-        formats.append(Format(name, layout, block_width, block_height, in_types, out_swizzle, colorspace))
+            channel = Channel(type, norm, size, names[i])
+            channels.append(channel)
+
+        format = Format(name, layout, block_width, block_height, channels, swizzles, colorspace)
+        formats.append(format)
     return formats
 
diff --git a/src/gallium/auxiliary/util/u_format_table.py b/src/gallium/auxiliary/util/u_format_table.py
index 2cd0f956786..4e29d15f3bb 100755
--- a/src/gallium/auxiliary/util/u_format_table.py
+++ b/src/gallium/auxiliary/util/u_format_table.py
@@ -51,7 +51,7 @@ colorspace_channels_map = {
 }
 
 
-kind_map = {
+type_map = {
     VOID:     "UTIL_FORMAT_TYPE_VOID",
     UNSIGNED: "UTIL_FORMAT_TYPE_UNSIGNED",
     SIGNED:   "UTIL_FORMAT_TYPE_SIGNED",
@@ -87,26 +87,44 @@ def write_format_table(formats):
     print '#include "u_format.h"'
     print
     print 'const struct util_format_description'
-    print 'util_format_description_table[] = '
-    print "{"
+    print 'util_format_none_description = {'
+    print "   PIPE_FORMAT_NONE,"
+    print "   \"PIPE_FORMAT_NONE\","
+    print "   {0, 0, 0},"
+    print "   0,"
+    print "   0,"
+    print "   0,"
+    print "   0,"
+    print "   {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}},"
+    print "   {0, 0, 0, 0},"
+    print "   0"
+    print "};"
+    print
     for format in formats:
+        print 'const struct util_format_description'
+        print 'util_format_%s_description = {' % (format.short_name(),)
+        print "   %s," % (format.name,)
+        print "   \"%s\"," % (format.name,)
+        print "   {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size())
+        print "   %s," % (layout_map(format.layout),)
+        print "   %u,\t/* nr_channels */" % (format.nr_channels(),)
+        print "   %s,\t/* is_array */" % (bool_map(format.is_array()),)
+        print "   %s,\t/* is_mixed */" % (bool_map(format.is_mixed()),)
         print "   {"
-        print "      %s," % (format.name,)
-        print "      \"%s\"," % (format.name,)
-        print "      {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size())
-        print "      %s," % (layout_map(format.layout),)
-        print "      {"
         for i in range(4):
-            type = format.in_types[i]
+            channel = format.channels[i]
             if i < 3:
                 sep = ","
             else:
                 sep = ""
-            print "         {%s, %s, %u}%s\t/* %s */" % (kind_map[type.kind], bool_map(type.norm), type.size, sep, "xyzw"[i])
-        print "      },"
-        print "      {"
+            if channel.size:
+                print "      {%s, %s, %u}%s\t/* %s = %s */" % (type_map[channel.type], bool_map(channel.norm), channel.size, sep, "xyzw"[i], channel.name)
+            else:
+                print "      {0, 0, 0}%s" % (sep,)
+        print "   },"
+        print "   {"
         for i in range(4):
-            swizzle = format.out_swizzle[i]
+            swizzle = format.swizzles[i]
             if i < 3:
                 sep = ","
             else:
@@ -115,20 +133,30 @@ def write_format_table(formats):
                 comment = colorspace_channels_map[format.colorspace][i]
             except (KeyError, IndexError):
                 comment = 'ignored'
-            print "         %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment)
-        print "      },"
-        print "      %s," % (colorspace_map(format.colorspace),)
+            print "      %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment)
         print "   },"
-    print "   {"
-    print "      PIPE_FORMAT_NONE,"
-    print "      \"PIPE_FORMAT_NONE\","
-    print "      {0, 0, 0},"
-    print "      0,"
-    print "      {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}},"
-    print "      {0, 0, 0, 0},"
-    print "      0"
-    print "   },"
-    print "};"
+        print "   %s," % (colorspace_map(format.colorspace),)
+        print "};"
+        print
+    print "const struct util_format_description *"
+    print "util_format_description(enum pipe_format format)"
+    print "{"
+    print "   if (format >= PIPE_FORMAT_COUNT) {"
+    print "      return NULL;"
+    print "   }"
+    print
+    print "   switch (format) {"
+    print "   case PIPE_FORMAT_NONE:"
+    print "      return &util_format_none_description;"
+    for format in formats:
+        print "   case %s:" % format.name
+        print "      return &util_format_%s_description;" % (format.short_name(),)
+    print "   default:"
+    print "      assert(0);"
+    print "      return NULL;"
+    print "   }"
+    print "}"
+    print
 
 
 def main():
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index 83263d9fe64..fc027e48e4e 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -37,15 +37,17 @@
 #include "pipe/p_context.h"
 #include "util/u_debug.h"
 #include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
+#include "util/u_inlines.h"
 #include "pipe/p_shader_tokens.h"
 #include "pipe/p_state.h"
 
+#include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_draw_quad.h"
 #include "util/u_gen_mipmap.h"
 #include "util/u_simple_shaders.h"
 #include "util/u_math.h"
+#include "util/u_texture.h"
 
 #include "cso_cache/cso_context.h"
 
@@ -59,9 +61,10 @@ struct gen_mipmap_state
    struct pipe_depth_stencil_alpha_state depthstencil;
    struct pipe_rasterizer_state rasterizer;
    struct pipe_sampler_state sampler;
+   struct pipe_clip_state clip;
 
    void *vs;
-   void *fs;
+   void *fs2d, *fsCube;
 
    struct pipe_buffer *vbuf;  /**< quad vertices */
    unsigned vbuf_slot;
@@ -920,29 +923,29 @@ format_to_type_comps(enum pipe_format pformat,
 {
    /* XXX I think this could be implemented in terms of the pf_*() functions */
    switch (pformat) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
    case PIPE_FORMAT_B8G8R8A8_UNORM:
    case PIPE_FORMAT_B8G8R8X8_UNORM:
-   case PIPE_FORMAT_R8G8B8A8_SRGB:
-   case PIPE_FORMAT_R8G8B8X8_SRGB:
-   case PIPE_FORMAT_A8R8G8B8_SRGB:
-   case PIPE_FORMAT_X8R8G8B8_SRGB:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+   case PIPE_FORMAT_A8B8G8R8_SRGB:
+   case PIPE_FORMAT_X8B8G8R8_SRGB:
    case PIPE_FORMAT_B8G8R8A8_SRGB:
    case PIPE_FORMAT_B8G8R8X8_SRGB:
+   case PIPE_FORMAT_A8R8G8B8_SRGB:
+   case PIPE_FORMAT_X8R8G8B8_SRGB:
    case PIPE_FORMAT_R8G8B8_SRGB:
       *datatype = DTYPE_UBYTE;
       *comps = 4;
       return;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
       *datatype = DTYPE_USHORT_1_5_5_5_REV;
       *comps = 4;
       return;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
       *datatype = DTYPE_USHORT_4_4_4_4;
       *comps = 4;
       return;
-   case PIPE_FORMAT_R5G6B5_UNORM:
+   case PIPE_FORMAT_B5G6R5_UNORM:
       *datatype = DTYPE_USHORT_5_6_5;
       *comps = 3;
       return;
@@ -953,8 +956,8 @@ format_to_type_comps(enum pipe_format pformat,
       *datatype = DTYPE_UBYTE;
       *comps = 1;
       return;
-   case PIPE_FORMAT_A8L8_UNORM:
-   case PIPE_FORMAT_A8L8_SRGB:
+   case PIPE_FORMAT_L8A8_UNORM:
+   case PIPE_FORMAT_L8A8_SRGB:
       *datatype = DTYPE_UBYTE;
       *comps = 2;
       return;
@@ -996,7 +999,7 @@ reduce_2d(enum pipe_format pformat,
 {
    enum dtype datatype;
    uint comps;
-   const int bpt = pf_get_blocksize(pformat);
+   const int bpt = util_format_get_blocksize(pformat);
    const ubyte *srcA, *srcB;
    ubyte *dst;
    int row;
@@ -1035,7 +1038,7 @@ reduce_3d(enum pipe_format pformat,
           int dstWidth, int dstHeight, int dstDepth,
           int dstRowStride, ubyte *dstPtr)
 {
-   const int bpt = pf_get_blocksize(pformat);
+   const int bpt = util_format_get_blocksize(pformat);
    const int border = 0;
    int img, row;
    int bytesPerSrcImage, bytesPerDstImage;
@@ -1067,7 +1070,7 @@ reduce_3d(enum pipe_format pformat,
     */
 
    /*
-   _mesa_printf("mip3d %d x %d x %d  ->  %d x %d x %d\n",
+   printf("mip3d %d x %d x %d  ->  %d x %d x %d\n",
           srcWidth, srcHeight, srcDepth, dstWidth, dstHeight, dstDepth);
    */
 
@@ -1159,8 +1162,8 @@ make_2d_mipmap(struct gen_mipmap_state *ctx,
    const uint zslice = 0;
    uint dstLevel;
    
-   assert(pf_get_blockwidth(pt->format) == 1);
-   assert(pf_get_blockheight(pt->format) == 1);
+   assert(util_format_get_blockwidth(pt->format) == 1);
+   assert(util_format_get_blockheight(pt->format) == 1);
 
    for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
       const uint srcLevel = dstLevel - 1;
@@ -1204,8 +1207,8 @@ make_3d_mipmap(struct gen_mipmap_state *ctx,
    struct pipe_screen *screen = pipe->screen;
    uint dstLevel, zslice = 0;
 
-   assert(pf_get_blockwidth(pt->format) == 1);
-   assert(pf_get_blockheight(pt->format) == 1);
+   assert(util_format_get_blockwidth(pt->format) == 1);
+   assert(util_format_get_blockheight(pt->format) == 1);
 
    for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
       const uint srcLevel = dstLevel - 1;
@@ -1285,7 +1288,7 @@ util_create_gen_mipmap(struct pipe_context *pipe,
 
    /* disabled blending/masking */
    memset(&ctx->blend, 0, sizeof(ctx->blend));
-   ctx->blend.colormask = PIPE_MASK_RGBA;
+   ctx->blend.rt[0].colormask = PIPE_MASK_RGBA;
 
    /* no-op depth/stencil/alpha */
    memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil));
@@ -1294,7 +1297,6 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
    ctx->rasterizer.front_winding = PIPE_WINDING_CW;
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
-   ctx->rasterizer.bypass_vs_clip_and_viewport = 1;
    ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* sampler state */
@@ -1317,7 +1319,8 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    }
 
    /* fragment shader */
-   ctx->fs = util_make_fragment_tex_shader(pipe);
+   ctx->fs2d = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D);
+   ctx->fsCube = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE);
 
    /* vertex data that doesn't change */
    for (i = 0; i < 4; i++) {
@@ -1358,84 +1361,34 @@ get_next_slot(struct gen_mipmap_state *ctx)
 static unsigned
 set_vertex_data(struct gen_mipmap_state *ctx,
                 enum pipe_texture_target tex_target,
-                uint face, float width, float height)
+                uint face)
 {
    unsigned offset;
 
    /* vert[0].position */
-   ctx->vertices[0][0][0] = 0.0f; /*x*/
-   ctx->vertices[0][0][1] = 0.0f; /*y*/
+   ctx->vertices[0][0][0] = -1.0f; /*x*/
+   ctx->vertices[0][0][1] = -1.0f; /*y*/
 
    /* vert[1].position */
-   ctx->vertices[1][0][0] = width;
-   ctx->vertices[1][0][1] = 0.0f;
+   ctx->vertices[1][0][0] = 1.0f;
+   ctx->vertices[1][0][1] = -1.0f;
 
    /* vert[2].position */
-   ctx->vertices[2][0][0] = width;
-   ctx->vertices[2][0][1] = height;
+   ctx->vertices[2][0][0] = 1.0f;
+   ctx->vertices[2][0][1] = 1.0f;
 
    /* vert[3].position */
-   ctx->vertices[3][0][0] = 0.0f;
-   ctx->vertices[3][0][1] = height;
+   ctx->vertices[3][0][0] = -1.0f;
+   ctx->vertices[3][0][1] = 1.0f;
 
    /* Setup vertex texcoords.  This is a little tricky for cube maps. */
    if (tex_target == PIPE_TEXTURE_CUBE) {
       static const float st[4][2] = {
          {0.0f, 0.0f}, {1.0f, 0.0f}, {1.0f, 1.0f}, {0.0f, 1.0f}
       };
-      float rx, ry, rz;
-      uint i;
-
-      /* loop over quad verts */
-      for (i = 0; i < 4; i++) {
-         /* Compute sc = +/-scale and tc = +/-scale.
-          * Not +/-1 to avoid cube face selection ambiguity near the edges,
-          * though that can still sometimes happen with this scale factor...
-          */
-         const float scale = 0.9999f;
-         const float sc = (2.0f * st[i][0] - 1.0f) * scale;
-         const float tc = (2.0f * st[i][1] - 1.0f) * scale;
-
-         switch (face) {
-         case PIPE_TEX_FACE_POS_X:
-            rx = 1.0f;
-            ry = -tc;
-            rz = -sc;
-            break;
-         case PIPE_TEX_FACE_NEG_X:
-            rx = -1.0f;
-            ry = -tc;
-            rz = sc;
-            break;
-         case PIPE_TEX_FACE_POS_Y:
-            rx = sc;
-            ry = 1.0f;
-            rz = tc;
-            break;
-         case PIPE_TEX_FACE_NEG_Y:
-            rx = sc;
-            ry = -1.0f;
-            rz = -tc;
-            break;
-         case PIPE_TEX_FACE_POS_Z:
-            rx = sc;
-            ry = -tc;
-            rz = 1.0f;
-            break;
-         case PIPE_TEX_FACE_NEG_Z:
-            rx = -sc;
-            ry = -tc;
-            rz = -1.0f;
-            break;
-         default:
-            rx = ry = rz = 0.0f;
-            assert(0);
-         }
 
-         ctx->vertices[i][1][0] = rx; /*s*/
-         ctx->vertices[i][1][1] = ry; /*t*/
-         ctx->vertices[i][1][2] = rz; /*r*/
-      }
+      util_map_texcoords2d_onto_cubemap(face, &st[0][0], 2,
+                                        &ctx->vertices[0][1][0], 8);
    }
    else {
       /* 1D/2D */
@@ -1458,8 +1411,8 @@ set_vertex_data(struct gen_mipmap_state *ctx,
 
    offset = get_next_slot( ctx );
 
-   pipe_buffer_write(ctx->pipe->screen, ctx->vbuf,
-                     offset, sizeof(ctx->vertices), ctx->vertices);
+   pipe_buffer_write_nooverlap(ctx->pipe->screen, ctx->vbuf,
+                               offset, sizeof(ctx->vertices), ctx->vertices);
 
    return offset;
 }
@@ -1475,7 +1428,8 @@ util_destroy_gen_mipmap(struct gen_mipmap_state *ctx)
    struct pipe_context *pipe = ctx->pipe;
 
    pipe->delete_vs_state(pipe, ctx->vs);
-   pipe->delete_fs_state(pipe, ctx->fs);
+   pipe->delete_fs_state(pipe, ctx->fs2d);
+   pipe->delete_fs_state(pipe, ctx->fsCube);
 
    pipe_buffer_reference(&ctx->vbuf, NULL);
 
@@ -1513,6 +1467,7 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_screen *screen = pipe->screen;
    struct pipe_framebuffer_state fb;
+   void *fs = (pt->target == PIPE_TEXTURE_CUBE) ? ctx->fsCube : ctx->fs2d;
    uint dstLevel;
    uint zslice = 0;
    uint offset;
@@ -1544,13 +1499,16 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
    cso_save_framebuffer(ctx->cso);
    cso_save_fragment_shader(ctx->cso);
    cso_save_vertex_shader(ctx->cso);
+   cso_save_viewport(ctx->cso);
+   cso_save_clip(ctx->cso);
 
    /* bind our state */
    cso_set_blend(ctx->cso, &ctx->blend);
    cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
+   cso_set_clip(ctx->cso, &ctx->clip);
 
-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs);
+   cso_set_fragment_shader_handle(ctx->cso, fs);
    cso_set_vertex_shader_handle(ctx->cso, ctx->vs);
 
    /* init framebuffer state */
@@ -1567,6 +1525,7 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
     */
    for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
       const uint srcLevel = dstLevel - 1;
+      struct pipe_viewport_state vp;
 
       struct pipe_surface *surf = 
          screen->get_tex_surface(screen, pt, face, dstLevel, zslice,
@@ -1580,6 +1539,17 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
       fb.height = u_minify(pt->height0, dstLevel);
       cso_set_framebuffer(ctx->cso, &fb);
 
+      /* viewport */
+      vp.scale[0] = 0.5f * fb.width;
+      vp.scale[1] = 0.5f * fb.height;
+      vp.scale[2] = 1.0f;
+      vp.scale[3] = 1.0f;
+      vp.translate[0] = 0.5f * fb.width;
+      vp.translate[1] = 0.5f * fb.height;
+      vp.translate[2] = 0.0f;
+      vp.translate[3] = 0.0f;
+      cso_set_viewport(ctx->cso, &vp);
+
       /*
        * Setup sampler state
        * Note: we should only have to set the min/max LOD clamps to ensure
@@ -1594,12 +1564,10 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
 
       cso_set_sampler_textures(ctx->cso, 1, &pt);
 
-      /* quad coords in window coords (bypassing vs, clip and viewport) */
+      /* quad coords in clip coords */
       offset = set_vertex_data(ctx,
                                pt->target,
-                               face,
-                               (float) u_minify(pt->width0, dstLevel),
-                               (float) u_minify(pt->height0, dstLevel));
+                               face);
 
       util_draw_vertex_buffer(ctx->pipe, 
                               ctx->vbuf,
@@ -1623,4 +1591,6 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
    cso_restore_framebuffer(ctx->cso);
    cso_restore_fragment_shader(ctx->cso);
    cso_restore_vertex_shader(ctx->cso);
+   cso_restore_viewport(ctx->cso);
+   cso_restore_clip(ctx->cso);
 }
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
new file mode 100644
index 00000000000..0cb3432c6e4
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -0,0 +1,307 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef U_INLINES_H
+#define U_INLINES_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_screen.h"
+#include "util/u_debug.h"
+#include "util/u_atomic.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * Reference counting helper functions.
+ */
+
+
+static INLINE void
+pipe_reference_init(struct pipe_reference *reference, unsigned count)
+{
+   p_atomic_set(&reference->count, count);
+}
+
+static INLINE boolean
+pipe_is_referenced(struct pipe_reference *reference)
+{
+   return p_atomic_read(&reference->count) != 0;
+}
+
+/**
+ * Update reference counting.
+ * The old thing pointed to, if any, will be unreferenced.
+ * Both 'ptr' and 'reference' may be NULL.
+ * \return TRUE if the object's refcount hits zero and should be destroyed.
+ */
+static INLINE boolean
+pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
+{
+   boolean destroy = FALSE;
+
+   if(ptr != reference) {
+      /* bump the reference.count first */
+      if (reference) {
+         assert(pipe_is_referenced(reference));
+         p_atomic_inc(&reference->count);
+      }
+
+      if (ptr) {
+         assert(pipe_is_referenced(ptr));
+         if (p_atomic_dec_zero(&ptr->count)) {
+            destroy = TRUE;
+         }
+      }
+   }
+
+   return destroy;
+}
+
+static INLINE void
+pipe_buffer_reference(struct pipe_buffer **ptr, struct pipe_buffer *buf)
+{
+   struct pipe_buffer *old_buf;
+
+   assert(ptr);
+   old_buf = *ptr;
+
+   if (pipe_reference(&(*ptr)->reference, &buf->reference))
+      old_buf->screen->buffer_destroy(old_buf);
+   *ptr = buf;
+}
+
+static INLINE void
+pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
+{
+   struct pipe_surface *old_surf = *ptr;
+
+   if (pipe_reference(&(*ptr)->reference, &surf->reference))
+      old_surf->texture->screen->tex_surface_destroy(old_surf);
+   *ptr = surf;
+}
+
+static INLINE void
+pipe_texture_reference(struct pipe_texture **ptr, struct pipe_texture *tex)
+{
+   struct pipe_texture *old_tex = *ptr;
+
+   if (pipe_reference(&(*ptr)->reference, &tex->reference))
+      old_tex->screen->texture_destroy(old_tex);
+   *ptr = tex;
+}
+
+
+/*
+ * Convenience wrappers for screen buffer functions.
+ */
+
+static INLINE struct pipe_buffer *
+pipe_buffer_create( struct pipe_screen *screen,
+                    unsigned alignment, unsigned usage, unsigned size )
+{
+   return screen->buffer_create(screen, alignment, usage, size);
+}
+
+static INLINE struct pipe_buffer *
+pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size )
+{
+   return screen->user_buffer_create(screen, ptr, size);
+}
+
+static INLINE void *
+pipe_buffer_map(struct pipe_screen *screen,
+                struct pipe_buffer *buf,
+                unsigned usage)
+{
+   if(screen->buffer_map_range) {
+      unsigned offset = 0;
+      unsigned length = buf->size;
+      return screen->buffer_map_range(screen, buf, offset, length, usage);
+   }
+   else
+      return screen->buffer_map(screen, buf, usage);
+}
+
+static INLINE void
+pipe_buffer_unmap(struct pipe_screen *screen,
+                  struct pipe_buffer *buf)
+{
+   screen->buffer_unmap(screen, buf);
+}
+
+static INLINE void *
+pipe_buffer_map_range(struct pipe_screen *screen,
+                struct pipe_buffer *buf,
+                unsigned offset,
+                unsigned length,
+                unsigned usage)
+{
+   assert(offset < buf->size);
+   assert(offset + length <= buf->size);
+   assert(length);
+   if(screen->buffer_map_range)
+      return screen->buffer_map_range(screen, buf, offset, length, usage);
+   else
+      return screen->buffer_map(screen, buf, usage);
+}
+
+static INLINE void
+pipe_buffer_flush_mapped_range(struct pipe_screen *screen,
+                               struct pipe_buffer *buf,
+                               unsigned offset,
+                               unsigned length)
+{
+   assert(offset < buf->size);
+   assert(offset + length <= buf->size);
+   assert(length);
+   if(screen->buffer_flush_mapped_range)
+      screen->buffer_flush_mapped_range(screen, buf, offset, length);
+}
+
+static INLINE void
+pipe_buffer_write(struct pipe_screen *screen,
+                  struct pipe_buffer *buf,
+                  unsigned offset, unsigned size,
+                  const void *data)
+{
+   void *map;
+   
+   assert(offset < buf->size);
+   assert(offset + size <= buf->size);
+   assert(size);
+
+   map = pipe_buffer_map_range(screen, buf, offset, size, 
+                               PIPE_BUFFER_USAGE_CPU_WRITE | 
+                               PIPE_BUFFER_USAGE_FLUSH_EXPLICIT |
+                               PIPE_BUFFER_USAGE_DISCARD);
+   assert(map);
+   if(map) {
+      memcpy((uint8_t *)map + offset, data, size);
+      pipe_buffer_flush_mapped_range(screen, buf, offset, size);
+      pipe_buffer_unmap(screen, buf);
+   }
+}
+
+/**
+ * Special case for writing non-overlapping ranges.
+ *
+ * We can avoid GPU/CPU synchronization when writing a range that has never
+ * been written before.
+ */
+static INLINE void
+pipe_buffer_write_nooverlap(struct pipe_screen *screen,
+                            struct pipe_buffer *buf,
+                            unsigned offset, unsigned size,
+                            const void *data)
+{
+   void *map;
+
+   assert(offset < buf->size);
+   assert(offset + size <= buf->size);
+   assert(size);
+
+   map = pipe_buffer_map_range(screen, buf, offset, size,
+                               PIPE_BUFFER_USAGE_CPU_WRITE |
+                               PIPE_BUFFER_USAGE_FLUSH_EXPLICIT |
+                               PIPE_BUFFER_USAGE_DISCARD |
+                               PIPE_BUFFER_USAGE_UNSYNCHRONIZED);
+   assert(map);
+   if(map) {
+      memcpy((uint8_t *)map + offset, data, size);
+      pipe_buffer_flush_mapped_range(screen, buf, offset, size);
+      pipe_buffer_unmap(screen, buf);
+   }
+}
+
+static INLINE void
+pipe_buffer_read(struct pipe_screen *screen,
+                 struct pipe_buffer *buf,
+                 unsigned offset, unsigned size,
+                 void *data)
+{
+   void *map;
+   
+   assert(offset < buf->size);
+   assert(offset + size <= buf->size);
+   assert(size);
+
+   map = pipe_buffer_map_range(screen, buf, offset, size, PIPE_BUFFER_USAGE_CPU_READ);
+   assert(map);
+   if(map) {
+      memcpy(data, (const uint8_t *)map + offset, size);
+      pipe_buffer_unmap(screen, buf);
+   }
+}
+
+static INLINE void *
+pipe_transfer_map( struct pipe_transfer *transf )
+{
+   struct pipe_screen *screen = transf->texture->screen;
+   return screen->transfer_map(screen, transf);
+}
+
+static INLINE void
+pipe_transfer_unmap( struct pipe_transfer *transf )
+{
+   struct pipe_screen *screen = transf->texture->screen;
+   screen->transfer_unmap(screen, transf);
+}
+
+static INLINE void
+pipe_transfer_destroy( struct pipe_transfer *transf )
+{
+   struct pipe_screen *screen = transf->texture->screen;
+   screen->tex_transfer_destroy(transf);
+}
+
+static INLINE unsigned
+pipe_transfer_buffer_flags( struct pipe_transfer *transf )
+{
+   switch (transf->usage & PIPE_TRANSFER_READ_WRITE) {
+   case PIPE_TRANSFER_READ_WRITE:
+      return PIPE_BUFFER_USAGE_CPU_READ | PIPE_BUFFER_USAGE_CPU_WRITE;
+   case PIPE_TRANSFER_READ:
+      return PIPE_BUFFER_USAGE_CPU_READ;
+   case PIPE_TRANSFER_WRITE:
+      return PIPE_BUFFER_USAGE_CPU_WRITE;
+   default:
+      debug_assert(0);
+      return 0;
+   }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_INLINES_H */
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
index c4b9eb3d9b7..e161ccd88eb 100644
--- a/src/gallium/auxiliary/util/u_keymap.c
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -36,7 +36,6 @@
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
-#include "pipe/p_defines.h"
 
 #include "cso_cache/cso_hash.h"
 
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index b76592d1ec6..d1ec13def30 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -532,6 +532,17 @@ util_bswap32(uint32_t n)
 
 
 /**
+ * Reverse byte order of a 16 bit word.
+ */
+static INLINE uint16_t
+util_bswap16(uint16_t n)
+{
+   return (n >> 8) |
+          (n << 8);
+}
+
+
+/**
  * Clamp X to [MIN, MAX].
  * This is a macro to allow float, int, uint, etc. types.
  */
@@ -583,6 +594,18 @@ do {                                     \
 #endif
 
 
+static INLINE uint32_t util_unsigned_fixed(float value, unsigned frac_bits)
+{
+   return value < 0 ? 0 : (uint32_t)(value * (1<<frac_bits));
+}
+
+static INLINE int32_t util_signed_fixed(float value, unsigned frac_bits)
+{
+   return (int32_t)(value * (1<<frac_bits));
+}
+
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index c3f8c918338..a2fc5973565 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-/**
+/*
  * Memory functions
  */
 
@@ -37,6 +37,7 @@
 
 #include "util/u_pointer.h"
 #include "util/u_debug.h"
+#include "os/os_memory.h"
 
 
 #ifdef __cplusplus
@@ -44,114 +45,13 @@ extern "C" {
 #endif
 
 
-/* Define ENOMEM for WINCE */ 
-#if (_WIN32_WCE < 600)
-#ifndef ENOMEM
-#define ENOMEM 12
-#endif
-#endif
-
-
-#if defined(PIPE_OS_WINDOWS) && defined(DEBUG) 
-
-/* memory debugging */
-
-#include "util/u_debug.h"
-
-#define MALLOC( _size ) \
-   debug_malloc( __FILE__, __LINE__, __FUNCTION__, _size )
-#define CALLOC( _count, _size ) \
-   debug_calloc(__FILE__, __LINE__, __FUNCTION__, _count, _size )
-#define FREE( _ptr ) \
-   debug_free( __FILE__, __LINE__, __FUNCTION__,  _ptr )
-#define REALLOC( _ptr, _old_size, _size ) \
-   debug_realloc( __FILE__, __LINE__, __FUNCTION__,  _ptr, _old_size, _size )
-
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-
-void * __stdcall
-EngAllocMem(
-    unsigned long Flags,
-    unsigned long MemSize,
-    unsigned long Tag );
-
-void __stdcall
-EngFreeMem(
-    void *Mem );
-
-#define MALLOC( _size ) EngAllocMem( 0, _size, 'D3AG' )
-#define _FREE( _ptr ) EngFreeMem( _ptr )
-
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-
-void *
-ExAllocatePool(
-    unsigned long PoolType, 
-    size_t NumberOfBytes);
-
-void 
-ExFreePool(void *P);
-
-#define MALLOC(_size) ExAllocatePool(0, _size)
-#define _FREE(_ptr) ExFreePool(_ptr)
-
-#else
-
-#define MALLOC( SIZE )  malloc( SIZE )
-#define CALLOC( COUNT, SIZE )   calloc( COUNT, SIZE )
-#define FREE( PTR )  free( PTR )
-
-static INLINE void *
-_REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
-{
-   (void) old_size;
-   return realloc(old_ptr, new_size);
-}
-#define REALLOC( a, b, c ) _REALLOC( a, b, c )
-#endif
-
-
-#ifndef CALLOC
-static INLINE void *
-CALLOC( unsigned count, unsigned size )
-{
-   void *ptr = MALLOC( count * size );
-   if( ptr ) {
-      memset( ptr, 0, count * size );
-   }
-   return ptr;
-}
-#endif /* !CALLOC */
+#define MALLOC(_size)  os_malloc(_size)
 
-#ifndef FREE
-static INLINE void
-FREE( void *ptr )
-{
-   if( ptr ) {
-      _FREE( ptr );
-   }
-}
-#endif /* !FREE */
+#define CALLOC(_count, _size) os_calloc(_count, _size)
 
-#ifndef REALLOC
-static INLINE void *
-REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
-{
-   void *new_ptr = NULL;
-
-   if (new_size != 0) {
-      unsigned copy_size = old_size < new_size ? old_size : new_size;
-      new_ptr = MALLOC( new_size );
-      if (new_ptr && old_ptr && copy_size) {
-         memcpy( new_ptr, old_ptr, copy_size );
-      }
-   }
-
-   FREE( old_ptr );
-   return new_ptr;
-}
-#endif /* !REALLOC */
+#define FREE(_ptr ) os_free(_ptr)
 
+#define REALLOC(_ptr, _old_size, _size) os_realloc(_ptr, _old_size, _size)
 
 #define MALLOC_STRUCT(T)   (struct T *) MALLOC(sizeof(struct T))
 
@@ -160,50 +60,8 @@ REALLOC( void *old_ptr, unsigned old_size, unsigned new_size )
 #define CALLOC_VARIANT_LENGTH_STRUCT(T,more_size)   ((struct T *) CALLOC(1, sizeof(struct T) + more_size))
 
 
-/**
- * Return memory on given byte alignment
- */
-static INLINE void *
-align_malloc(size_t bytes, uint alignment)
-{
-#if defined(HAVE_POSIX_MEMALIGN)
-   void *mem;
-   alignment = (alignment + (uint)sizeof(void*) - 1) & ~((uint)sizeof(void*) - 1);
-   if(posix_memalign(& mem, alignment, bytes) != 0)
-      return NULL;
-   return mem;
-#else
-   char *ptr, *buf;
-
-   assert( alignment > 0 );
-
-   ptr = (char *) MALLOC(bytes + alignment + sizeof(void *));
-   if (!ptr)
-      return NULL;
-
-   buf = (char *) align_pointer( ptr + sizeof(void *), alignment );
-   *(char **)(buf - sizeof(void *)) = ptr;
-
-   return buf;
-#endif /* defined(HAVE_POSIX_MEMALIGN) */
-}
-
-/**
- * Free memory returned by align_malloc().
- */
-static INLINE void
-align_free(void *ptr)
-{
-#if defined(HAVE_POSIX_MEMALIGN)
-   FREE(ptr);
-#else
-   if (ptr) {
-      void **cubbyHole = (void **) ((char *) ptr - sizeof(void *));
-      void *realAddr = *cubbyHole;
-      FREE(realAddr);
-   }
-#endif /* defined(HAVE_POSIX_MEMALIGN) */
-}
+#define align_malloc(_size, _alignment) os_malloc_aligned(_size, _alignment)
+#define align_free(_ptr) os_free_aligned(_ptr)
 
 
 /**
diff --git a/src/gallium/auxiliary/util/u_network.c b/src/gallium/auxiliary/util/u_network.c
index 6269c72e121..87ee0e47685 100644
--- a/src/gallium/auxiliary/util/u_network.c
+++ b/src/gallium/auxiliary/util/u_network.c
@@ -6,7 +6,7 @@
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  include <winsock2.h>
 #  include <windows.h>
-#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE)
 #  include <sys/socket.h>
 #  include <netinet/in.h>
 #  include <unistd.h>
@@ -54,7 +54,7 @@ u_socket_close(int s)
    if (s < 0)
       return;
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE)
    shutdown(s, SHUT_RDWR);
    close(s);
 #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
@@ -117,7 +117,7 @@ u_socket_connect(const char *hostname, uint16_t port)
    if (!host)
       return -1;
 
-   memcpy((char *)&sa.sin_addr,host->h_addr,host->h_length);
+   memcpy((char *)&sa.sin_addr,host->h_addr_list[0],host->h_length);
    sa.sin_family= host->h_addrtype;
    sa.sin_port = htons(port);
 
@@ -169,7 +169,7 @@ u_socket_listen_on_port(uint16_t portnum)
 void
 u_socket_block(int s, boolean block)
 {
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE)
    int old = fcntl(s, F_GETFL, 0);
    if (old == -1)
       return;
diff --git a/src/gallium/auxiliary/util/u_network.h b/src/gallium/auxiliary/util/u_network.h
index 0aa898b9676..187dcab86e7 100644
--- a/src/gallium/auxiliary/util/u_network.h
+++ b/src/gallium/auxiliary/util/u_network.h
@@ -6,7 +6,7 @@
 
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  define PIPE_HAVE_SOCKETS
-#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_BSD)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE)
 #  define PIPE_HAVE_SOCKETS
 #endif
 
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index 9dacc6d83db..50f1b1670b6 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -37,106 +37,110 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
+#include "util/u_format.h"
 #include "util/u_math.h"
 
 
+
+union util_color {
+   ubyte ub;
+   ushort us;
+   uint ui;
+   float f[4];
+};
+
 /**
  * Pack ubyte R,G,B,A into dest pixel.
  */
 static INLINE void
 util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a,
-                   enum pipe_format format, void *dest)
+                   enum pipe_format format, union util_color *uc)
 {
    switch (format) {
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
+   case PIPE_FORMAT_A8B8G8R8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (r << 24) | (g << 16) | (b << 8) | a;
+         uc->ui = (r << 24) | (g << 16) | (b << 8) | a;
       }
       return;
-   case PIPE_FORMAT_R8G8B8X8_UNORM:
+   case PIPE_FORMAT_X8B8G8R8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (r << 24) | (g << 16) | (b << 8) | 0xff;
+         uc->ui = (r << 24) | (g << 16) | (b << 8) | 0xff;
       }
       return;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (a << 24) | (r << 16) | (g << 8) | b;
+         uc->ui = (a << 24) | (r << 16) | (g << 8) | b;
       }
       return;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (0xff << 24) | (r << 16) | (g << 8) | b;
+         uc->ui = (0xff << 24) | (r << 16) | (g << 8) | b;
       }
       return;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (b << 24) | (g << 16) | (r << 8) | a;
+         uc->ui = (b << 24) | (g << 16) | (r << 8) | a;
       }
       return;
-   case PIPE_FORMAT_B8G8R8X8_UNORM:
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (b << 24) | (g << 16) | (r << 8) | 0xff;
+         uc->ui = (b << 24) | (g << 16) | (r << 8) | 0xff;
       }
       return;
-   case PIPE_FORMAT_R5G6B5_UNORM:
+   case PIPE_FORMAT_B5G6R5_UNORM:
       {
-         ushort *d = (ushort *) dest;
-         *d = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
+         uc->us = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
       }
       return;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
       {
-         ushort *d = (ushort *) dest;
-         *d = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
+         uc->us = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
       }
       return;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
       {
-         ushort *d = (ushort *) dest;
-         *d = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4);
+         uc->us = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4);
       }
       return;
    case PIPE_FORMAT_A8_UNORM:
       {
-         ubyte *d = (ubyte *) dest;
-         *d = a;
+         uc->ub = a;
       }
       return;
    case PIPE_FORMAT_L8_UNORM:
    case PIPE_FORMAT_I8_UNORM:
       {
-         ubyte *d = (ubyte *) dest;
-         *d = r;
+         uc->ub = r; /* L8/I8 take the red channel (as the old code did), not alpha */
       }
       return;
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
       {
-         float *d = (float *) dest;
-         d[0] = (float)r / 255.0f;
-         d[1] = (float)g / 255.0f;
-         d[2] = (float)b / 255.0f;
-         d[3] = (float)a / 255.0f;
+         uc->f[0] = (float)r / 255.0f;
+         uc->f[1] = (float)g / 255.0f;
+         uc->f[2] = (float)b / 255.0f;
+         uc->f[3] = (float)a / 255.0f;
       }
       return;
    case PIPE_FORMAT_R32G32B32_FLOAT:
       {
-         float *d = (float *) dest;
-         d[0] = (float)r / 255.0f;
-         d[1] = (float)g / 255.0f;
-         d[2] = (float)b / 255.0f;
+         uc->f[0] = (float)r / 255.0f;
+         uc->f[1] = (float)g / 255.0f;
+         uc->f[2] = (float)b / 255.0f;
       }
       return;
 
-   /* XXX lots more cases to add */
+   /* Handle other cases with a generic function.
+    */
    default:
-      debug_print_format("gallium: unhandled format in util_pack_color_ub()", format);
-      assert(0);
+      {
+         ubyte src[4];
+
+         src[0] = r;
+         src[1] = g;
+         src[2] = b;
+         src[3] = a;
+         util_format_write_4ub(format, src, 0, uc, 0, 0, 0, 1, 1);
+      }
    }
 }
  
@@ -145,85 +149,85 @@ util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a,
  * Unpack RGBA from a packed pixel, returning values as ubytes in [0,255].
  */
 static INLINE void
-util_unpack_color_ub(enum pipe_format format, const void *src,
+util_unpack_color_ub(enum pipe_format format, union util_color *uc,
                      ubyte *r, ubyte *g, ubyte *b, ubyte *a)
 {
    switch (format) {
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
+   case PIPE_FORMAT_A8B8G8R8_UNORM:
       {
-         uint p = ((const uint *) src)[0];
+         uint p = uc->ui;
          *r = (ubyte) ((p >> 24) & 0xff);
          *g = (ubyte) ((p >> 16) & 0xff);
          *b = (ubyte) ((p >>  8) & 0xff);
          *a = (ubyte) ((p >>  0) & 0xff);
       }
       return;
-   case PIPE_FORMAT_R8G8B8X8_UNORM:
+   case PIPE_FORMAT_X8B8G8R8_UNORM:
       {
-         uint p = ((const uint *) src)[0];
+         uint p = uc->ui;
          *r = (ubyte) ((p >> 24) & 0xff);
          *g = (ubyte) ((p >> 16) & 0xff);
          *b = (ubyte) ((p >>  8) & 0xff);
          *a = (ubyte) 0xff;
       }
       return;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       {
-         uint p = ((const uint *) src)[0];
+         uint p = uc->ui;
          *r = (ubyte) ((p >> 16) & 0xff);
          *g = (ubyte) ((p >>  8) & 0xff);
          *b = (ubyte) ((p >>  0) & 0xff);
          *a = (ubyte) ((p >> 24) & 0xff);
       }
       return;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
       {
-         uint p = ((const uint *) src)[0];
+         uint p = uc->ui;
          *r = (ubyte) ((p >> 16) & 0xff);
          *g = (ubyte) ((p >>  8) & 0xff);
          *b = (ubyte) ((p >>  0) & 0xff);
          *a = (ubyte) 0xff;
       }
       return;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       {
-         uint p = ((const uint *) src)[0];
+         uint p = uc->ui;
          *r = (ubyte) ((p >>  8) & 0xff);
          *g = (ubyte) ((p >> 16) & 0xff);
          *b = (ubyte) ((p >> 24) & 0xff);
          *a = (ubyte) ((p >>  0) & 0xff);
       }
       return;
-   case PIPE_FORMAT_B8G8R8X8_UNORM:
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
       {
-         uint p = ((const uint *) src)[0];
+         uint p = uc->ui;
          *r = (ubyte) ((p >>  8) & 0xff);
          *g = (ubyte) ((p >> 16) & 0xff);
          *b = (ubyte) ((p >> 24) & 0xff);
          *a = (ubyte) 0xff;
       }
       return;
-   case PIPE_FORMAT_R5G6B5_UNORM:
+   case PIPE_FORMAT_B5G6R5_UNORM:
       {
-         ushort p = ((const ushort *) src)[0];
+         ushort p = uc->us;
          *r = (ubyte) (((p >> 8) & 0xf8) | ((p >> 13) & 0x7));
          *g = (ubyte) (((p >> 3) & 0xfc) | ((p >>  9) & 0x3));
          *b = (ubyte) (((p << 3) & 0xf8) | ((p >>  2) & 0x7));
          *a = (ubyte) 0xff;
       }
       return;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
       {
-         ushort p = ((const ushort *) src)[0];
+         ushort p = uc->us;
          *r = (ubyte) (((p >>  7) & 0xf8) | ((p >> 12) & 0x7));
          *g = (ubyte) (((p >>  2) & 0xf8) | ((p >>  7) & 0x7));
          *b = (ubyte) (((p <<  3) & 0xf8) | ((p >>  2) & 0x7));
          *a = (ubyte) (0xff * (p >> 15));
       }
       return;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
       {
-         ushort p = ((const ushort *) src)[0];
+         ushort p = uc->us;
          *r = (ubyte) (((p >> 4) & 0xf0) | ((p >>  8) & 0xf));
          *g = (ubyte) (((p >> 0) & 0xf0) | ((p >>  4) & 0xf));
          *b = (ubyte) (((p << 4) & 0xf0) | ((p >>  0) & 0xf));
@@ -232,27 +236,27 @@ util_unpack_color_ub(enum pipe_format format, const void *src,
       return;
    case PIPE_FORMAT_A8_UNORM:
       {
-         ubyte p = ((const ubyte *) src)[0];
+         ubyte p = uc->ub;
          *r = *g = *b = (ubyte) 0xff;
          *a = p;
       }
       return;
    case PIPE_FORMAT_L8_UNORM:
       {
-         ubyte p = ((const ubyte *) src)[0];
+         ubyte p = uc->ub;
          *r = *g = *b = p;
          *a = (ubyte) 0xff;
       }
       return;
    case PIPE_FORMAT_I8_UNORM:
       {
-         ubyte p = ((const ubyte *) src)[0];
+         ubyte p = uc->ub;
          *r = *g = *b = *a = p;
       }
       return;
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
       {
-         const float *p = (const float *) src;
+         const float *p = &uc->f[0];
          *r = float_to_ubyte(p[0]);
          *g = float_to_ubyte(p[1]);
          *b = float_to_ubyte(p[2]);
@@ -261,7 +265,7 @@ util_unpack_color_ub(enum pipe_format format, const void *src,
       return;
    case PIPE_FORMAT_R32G32B32_FLOAT:
       {
-         const float *p = (const float *) src;
+         const float *p = &uc->f[0];
          *r = float_to_ubyte(p[0]);
          *g = float_to_ubyte(p[1]);
          *b = float_to_ubyte(p[2]);
@@ -271,7 +275,7 @@ util_unpack_color_ub(enum pipe_format format, const void *src,
 
    case PIPE_FORMAT_R32G32_FLOAT:
       {
-         const float *p = (const float *) src;
+         const float *p = &uc->f[0];
          *r = float_to_ubyte(p[0]);
          *g = float_to_ubyte(p[1]);
          *b = *a = (ubyte) 0xff;
@@ -280,34 +284,40 @@ util_unpack_color_ub(enum pipe_format format, const void *src,
 
    case PIPE_FORMAT_R32_FLOAT:
       {
-         const float *p = (const float *) src;
+         const float *p = &uc->f[0];
          *r = float_to_ubyte(p[0]);
          *g = *b = *a = (ubyte) 0xff;
       }
       return;
 
-   /* XXX lots more cases to add */
+   /* Handle other cases with a generic function.
+    */
    default:
-      debug_print_format("gallium: unhandled format in util_unpack_color_ub()",
-                         format);
-      assert(0);
+      {
+         ubyte dst[4];
+
+         util_format_read_4ub(format, dst, 0, uc, 0, 0, 0, 1, 1);
+         *r = dst[0];
+         *g = dst[1];
+         *b = dst[2];
+         *a = dst[3];
+      }
    }
 }
- 
 
 
 /**
  * Note rgba outside [0,1] will be clamped for int pixel formats.
  */
 static INLINE void
-util_pack_color(const float rgba[4], enum pipe_format format, void *dest)
+util_pack_color(const float rgba[4], enum pipe_format format, union util_color *uc)
 {
    ubyte r = 0;
    ubyte g = 0;
    ubyte b = 0;
    ubyte a = 0;
 
-   if (pf_size_x(format) <= 8) {
+   if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) <= 8) {
       /* format uses 8-bit components or less */
       r = float_to_ubyte(rgba[0]);
       g = float_to_ubyte(rgba[1]);
@@ -316,94 +326,82 @@ util_pack_color(const float rgba[4], enum pipe_format format, void *dest)
    }
 
    switch (format) {
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
+   case PIPE_FORMAT_A8B8G8R8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (r << 24) | (g << 16) | (b << 8) | a;
+         uc->ui = (r << 24) | (g << 16) | (b << 8) | a;
       }
       return;
-   case PIPE_FORMAT_R8G8B8X8_UNORM:
+   case PIPE_FORMAT_X8B8G8R8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (r << 24) | (g << 16) | (b << 8) | 0xff;
+         uc->ui = (r << 24) | (g << 16) | (b << 8) | 0xff;
       }
       return;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (a << 24) | (r << 16) | (g << 8) | b;
+         uc->ui = (a << 24) | (r << 16) | (g << 8) | b;
       }
       return;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (0xff << 24) | (r << 16) | (g << 8) | b;
+         uc->ui = (0xff << 24) | (r << 16) | (g << 8) | b;
       }
       return;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (b << 24) | (g << 16) | (r << 8) | a;
+         uc->ui = (b << 24) | (g << 16) | (r << 8) | a;
       }
       return;
-   case PIPE_FORMAT_B8G8R8X8_UNORM:
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
       {
-         uint *d = (uint *) dest;
-         *d = (b << 24) | (g << 16) | (r << 8) | 0xff;
+         uc->ui = (b << 24) | (g << 16) | (r << 8) | 0xff;
       }
       return;
-   case PIPE_FORMAT_R5G6B5_UNORM:
+   case PIPE_FORMAT_B5G6R5_UNORM:
       {
-         ushort *d = (ushort *) dest;
-         *d = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
+         uc->us = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
       }
       return;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
       {
-         ushort *d = (ushort *) dest;
-         *d = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
+         uc->us = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
       }
       return;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
       {
-         ushort *d = (ushort *) dest;
-         *d = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4);
+         uc->us = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4);
       }
       return;
    case PIPE_FORMAT_A8_UNORM:
       {
-         ubyte *d = (ubyte *) dest;
-         *d = a;
+         uc->ub = a;
       }
       return;
    case PIPE_FORMAT_L8_UNORM:
    case PIPE_FORMAT_I8_UNORM:
       {
-         ubyte *d = (ubyte *) dest;
-         *d = r;
+         uc->ub = r;
       }
       return;
    case PIPE_FORMAT_R32G32B32A32_FLOAT:
       {
-         float *d = (float *) dest;
-         d[0] = rgba[0];
-         d[1] = rgba[1];
-         d[2] = rgba[2];
-         d[3] = rgba[3];
+         uc->f[0] = rgba[0];
+         uc->f[1] = rgba[1];
+         uc->f[2] = rgba[2];
+         uc->f[3] = rgba[3];
       }
       return;
    case PIPE_FORMAT_R32G32B32_FLOAT:
       {
-         float *d = (float *) dest;
-         d[0] = rgba[0];
-         d[1] = rgba[1];
-         d[2] = rgba[2];
+         uc->f[0] = rgba[0];
+         uc->f[1] = rgba[1];
+         uc->f[2] = rgba[2];
       }
       return;
-   /* XXX lots more cases to add */
+
+   /* Handle other cases with a generic function.
+    */
    default:
-      debug_print_format("gallium: unhandled format in util_pack_color()", format);
-      assert(0);
+      util_format_write_4f(format, rgba, 0, uc, 0, 0, 0, 1, 1);
    }
 }
  
@@ -427,13 +425,15 @@ util_pack_z(enum pipe_format format, double z)
       if (z == 1.0)
          return 0xffffffff;
       return (uint) (z * 0xffffffff);
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z32_FLOAT:
+      return fui((float) z);   /* raw float bits; an integer cast would yield only 0 or 1 */
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
       if (z == 1.0)
          return 0xffffff;
       return (uint) (z * 0xffffff);
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       if (z == 1.0)
          return 0xffffff00;
       return ((uint) (z * 0xffffff)) << 8;
@@ -458,10 +458,10 @@ util_pack_z_stencil(enum pipe_format format, double z, uint s)
    unsigned packed = util_pack_z(format, z);
 
    switch (format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:
       packed |= s << 24;
       break;
-   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
       packed |= s;
       break;
    case PIPE_FORMAT_S8_UNORM:
diff --git a/src/gallium/auxiliary/util/u_prim.h b/src/gallium/auxiliary/util/u_prim.h
index a9b533eea70..64390e13851 100644
--- a/src/gallium/auxiliary/util/u_prim.h
+++ b/src/gallium/auxiliary/util/u_prim.h
@@ -30,12 +30,13 @@
 #define U_BLIT_H
 
 
+#include "pipe/p_defines.h"
+#include "util/u_debug.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#include "pipe/p_defines.h"
-
 static INLINE boolean u_validate_pipe_prim( unsigned pipe_prim, unsigned nr )
 {
    boolean ok = TRUE;
@@ -135,4 +136,39 @@ static INLINE unsigned u_reduced_prim( unsigned pipe_prim )
    }
 }
 
+static INLINE unsigned
+u_vertices_per_prim(int primitive)
+{
+   switch(primitive) {
+   case PIPE_PRIM_POINTS:
+      return 1;
+   case PIPE_PRIM_LINES:
+   case PIPE_PRIM_LINE_LOOP:
+   case PIPE_PRIM_LINE_STRIP:
+      return 2;
+   case PIPE_PRIM_TRIANGLES:
+   case PIPE_PRIM_TRIANGLE_STRIP:
+   case PIPE_PRIM_TRIANGLE_FAN:
+      return 3;
+   case PIPE_PRIM_LINES_ADJACENCY:
+   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      return 4;
+   case PIPE_PRIM_TRIANGLES_ADJACENCY:
+   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+      return 6;
+
+   /* following primitives should never be used
+    * with geometry shaders and their size is
+    * undefined */
+   case PIPE_PRIM_POLYGON:
+   case PIPE_PRIM_QUADS:
+   case PIPE_PRIM_QUAD_STRIP:
+   default:
+      debug_printf("Unrecognized geometry shader primitive");
+      return 3;
+   }
+}
+
+const char *u_prim_name( unsigned pipe_prim );
+
 #endif
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index 72725b59d2c..8479161c744 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -34,13 +34,14 @@
 #include "pipe/p_format.h"
 #include "pipe/p_context.h"
 #include "pipe/p_screen.h"
+#include "util/u_format.h"
 #include "util/u_rect.h"
 
 
 /**
  * Copy 2D rect from one place to another.
  * Position and sizes are in pixels.
- * src_pitch may be negative to do vertical flip of pixels from source.
+ * src_stride may be negative to do vertical flip of pixels from source.
  */
 void
 util_copy_rect(ubyte * dst,
@@ -53,21 +54,17 @@ util_copy_rect(ubyte * dst,
                const ubyte * src,
                int src_stride,
                unsigned src_x, 
-               int src_y)
+               unsigned src_y)
 {
    unsigned i;
    int src_stride_pos = src_stride < 0 ? -src_stride : src_stride;
-   int blocksize = pf_get_blocksize(format);
-   int blockwidth = pf_get_blockwidth(format);
-   int blockheight = pf_get_blockheight(format);
+   int blocksize = util_format_get_blocksize(format);
+   int blockwidth = util_format_get_blockwidth(format);
+   int blockheight = util_format_get_blockheight(format);
 
    assert(blocksize > 0);
    assert(blockwidth > 0);
    assert(blockheight > 0);
-   assert(src_x >= 0);
-   assert(src_y >= 0);
-   assert(dst_x >= 0);
-   assert(dst_y >= 0);
 
    dst_x /= blockwidth;
    dst_y /= blockheight;
@@ -105,15 +102,13 @@ util_fill_rect(ubyte * dst,
 {
    unsigned i, j;
    unsigned width_size;
-   int blocksize = pf_get_blocksize(format);
-   int blockwidth = pf_get_blockwidth(format);
-   int blockheight = pf_get_blockheight(format);
+   int blocksize = util_format_get_blocksize(format);
+   int blockwidth = util_format_get_blockwidth(format);
+   int blockheight = util_format_get_blockheight(format);
 
    assert(blocksize > 0);
    assert(blockwidth > 0);
    assert(blockheight > 0);
-   assert(dst_x >= 0);
-   assert(dst_y >= 0);
 
    dst_x /= blockwidth;
    dst_y /= blockheight;
@@ -203,9 +198,9 @@ util_surface_copy(struct pipe_context *pipe,
                                         PIPE_TRANSFER_WRITE,
                                         dst_x, dst_y, w, h);
 
-   assert(pf_get_blocksize(dst_format) == pf_get_blocksize(src_format));
-   assert(pf_get_blockwidth(dst_format) == pf_get_blockwidth(src_format));
-   assert(pf_get_blockheight(dst_format) == pf_get_blockheight(src_format));
+   assert(util_format_get_blocksize(dst_format) == util_format_get_blocksize(src_format));
+   assert(util_format_get_blockwidth(dst_format) == util_format_get_blockwidth(src_format));
+   assert(util_format_get_blockheight(dst_format) == util_format_get_blockheight(src_format));
 
    src_map = pipe->screen->transfer_map(screen, src_trans);
    dst_map = pipe->screen->transfer_map(screen, dst_trans);
@@ -270,7 +265,7 @@ util_surface_fill(struct pipe_context *pipe,
    if (dst_map) {
       assert(dst_trans->stride > 0);
 
-      switch (pf_get_blocksize(dst_trans->texture->format)) {
+      switch (util_format_get_blocksize(dst_trans->texture->format)) {
       case 1:
       case 2:
       case 4:
diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h
index 5e444ffae21..b44d821904b 100644
--- a/src/gallium/auxiliary/util/u_rect.h
+++ b/src/gallium/auxiliary/util/u_rect.h
@@ -45,7 +45,7 @@ extern void
 util_copy_rect(ubyte * dst, enum pipe_format format,
                unsigned dst_stride, unsigned dst_x, unsigned dst_y,
                unsigned width, unsigned height, const ubyte * src,
-               int src_stride, unsigned src_x, int src_y);
+               int src_stride, unsigned src_x, unsigned src_y);
 
 extern void
 util_fill_rect(ubyte * dst, enum pipe_format format,
diff --git a/src/gallium/auxiliary/util/u_ringbuffer.c b/src/gallium/auxiliary/util/u_ringbuffer.c
new file mode 100644
index 00000000000..648b105b137
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_ringbuffer.c
@@ -0,0 +1,160 @@
+
+#include "os/os_thread.h"
+#include "pipe/p_defines.h"
+#include "util/u_ringbuffer.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+/* Generic ringbuffer: 
+ */
+struct util_ringbuffer 
+{
+   struct util_packet *buf;
+   unsigned mask;
+
+   /* Can this be done with atomic variables??
+    */
+   unsigned head;
+   unsigned tail;
+   pipe_condvar change;
+   pipe_mutex mutex;
+};
+
+
+struct util_ringbuffer *util_ringbuffer_create( unsigned dwords )
+{
+   struct util_ringbuffer *ring = CALLOC_STRUCT(util_ringbuffer);
+   if (ring == NULL)
+      return NULL;
+
+   assert(util_is_power_of_two(dwords));
+   
+   ring->buf = MALLOC( dwords * sizeof(unsigned) );
+   if (ring->buf == NULL)
+      goto fail;
+
+   ring->mask = dwords - 1;
+
+   pipe_condvar_init(ring->change);
+   pipe_mutex_init(ring->mutex);
+   return ring;
+
+fail:
+   FREE(ring->buf);
+   FREE(ring);
+   return NULL;
+}
+
+void util_ringbuffer_destroy( struct util_ringbuffer *ring )
+{
+   pipe_condvar_destroy(ring->change);
+   pipe_mutex_destroy(ring->mutex);
+   FREE(ring->buf);
+   FREE(ring);
+}
+
+/**
+ * Return number of free entries in the ring
+ */
+static INLINE unsigned util_ringbuffer_space( const struct util_ringbuffer *ring )
+{
+   return (ring->tail - (ring->head + 1)) & ring->mask;
+}
+
+/**
+ * Is the ring buffer empty?
+ */
+static INLINE boolean util_ringbuffer_empty( const struct util_ringbuffer *ring )
+{
+   return util_ringbuffer_space(ring) == ring->mask;
+}
+
+void util_ringbuffer_enqueue( struct util_ringbuffer *ring,
+                              const struct util_packet *packet )
+{
+   unsigned i;
+
+   /* XXX: over-reliance on mutexes, etc:
+    */
+   pipe_mutex_lock(ring->mutex);
+
+   /* make sure we don't request an impossible amount of space
+    */
+   assert(packet->dwords <= ring->mask);
+
+   /* Wait for free space:
+    */
+   while (util_ringbuffer_space(ring) < packet->dwords)
+      pipe_condvar_wait(ring->change, ring->mutex);
+
+   /* Copy data to ring:
+    */
+   for (i = 0; i < packet->dwords; i++) {
+
+      /* Copy all dwords of the packet.  Note we're abusing the
+       * typesystem a little - we're being passed a pointer to
+       * something, but probably not an array of packet structs:
+       */
+      ring->buf[ring->head] = packet[i];
+      ring->head++;
+      ring->head &= ring->mask;
+   }
+
+   /* Signal change:
+    */
+   pipe_condvar_signal(ring->change);
+   pipe_mutex_unlock(ring->mutex);
+}
+
+enum pipe_error util_ringbuffer_dequeue( struct util_ringbuffer *ring,
+                                         struct util_packet *packet,
+                                         unsigned max_dwords,
+                                         boolean wait )
+{
+   const struct util_packet *ring_packet;
+   unsigned i;
+   int ret = PIPE_OK;
+
+   /* XXX: over-reliance on mutexes, etc:
+    */
+   pipe_mutex_lock(ring->mutex);
+
+   /* Get next ring entry:
+    */
+   if (wait) {
+      while (util_ringbuffer_empty(ring))
+         pipe_condvar_wait(ring->change, ring->mutex);
+   }
+   else {
+      if (util_ringbuffer_empty(ring)) {
+         ret = PIPE_ERROR_OUT_OF_MEMORY;
+         goto out;
+      }
+   }
+
+   ring_packet = &ring->buf[ring->tail];
+
+   /* A packet longer than the queued entries (mask - space) or than the
+    * caller's buffer is a bug.  Raise an assert on debug builds. */
+   if (ring_packet->dwords > ring->mask - util_ringbuffer_space(ring) ||
+       ring_packet->dwords > max_dwords) {
+      assert(0);
+      ret = PIPE_ERROR_BAD_INPUT;
+      goto out;
+   }
+
+   /* Copy data from ring:
+    */
+   for (i = 0; i < ring_packet->dwords; i++) {
+      packet[i] = ring->buf[ring->tail];
+      ring->tail++;
+      ring->tail &= ring->mask;
+   }
+
+out:
+   /* Signal change:
+    */
+   pipe_condvar_signal(ring->change);
+   pipe_mutex_unlock(ring->mutex);
+   return ret;
+}
diff --git a/src/gallium/auxiliary/util/u_ringbuffer.h b/src/gallium/auxiliary/util/u_ringbuffer.h
new file mode 100644
index 00000000000..85f0ad6c1f6
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_ringbuffer.h
@@ -0,0 +1,29 @@
+
+#ifndef UTIL_RINGBUFFER_H
+#define UTIL_RINGBUFFER_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"       /* only for pipe_error! */
+
+/* Generic header
+ */
+struct util_packet {
+   unsigned dwords:8;
+   unsigned data24:24;
+};
+
+struct util_ringbuffer;
+
+struct util_ringbuffer *util_ringbuffer_create( unsigned dwords );
+
+void util_ringbuffer_destroy( struct util_ringbuffer *ring );
+
+void util_ringbuffer_enqueue( struct util_ringbuffer *ring,
+                              const struct util_packet *packet );
+
+enum pipe_error util_ringbuffer_dequeue( struct util_ringbuffer *ring,
+                                         struct util_packet *packet,
+                                         unsigned max_dwords,
+                                         boolean wait );
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_simple_screen.c b/src/gallium/auxiliary/util/u_simple_screen.c
index 52382990155..53f3c16dbcc 100644
--- a/src/gallium/auxiliary/util/u_simple_screen.c
+++ b/src/gallium/auxiliary/util/u_simple_screen.c
@@ -29,7 +29,7 @@
 
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h"
-#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_simple_screen.h"
 
 
 static struct pipe_buffer *
diff --git a/src/gallium/auxiliary/util/u_simple_screen.h b/src/gallium/auxiliary/util/u_simple_screen.h
index 6612a8a7c09..bb3f5ba102f 100644
--- a/src/gallium/auxiliary/util/u_simple_screen.h
+++ b/src/gallium/auxiliary/util/u_simple_screen.h
@@ -28,8 +28,145 @@
 #ifndef U_SIMPLE_SCREEN_H
 #define U_SIMPLE_SCREEN_H
 
+#include "pipe/p_format.h"
+
 struct pipe_screen;
-struct pipe_winsys;
+struct pipe_fence_handle;
+struct pipe_surface;
+struct pipe_buffer;
+
+/**
+ * Gallium3D drivers are (meant to be!) independent of both GL and the
+ * window system.  The window system provides a buffer manager and a
+ * set of additional hooks for things like command buffer submission,
+ * etc.
+ *
+ * There clearly has to be some agreement between the window system
+ * driver and the hardware driver about the format of command buffers,
+ * etc.
+ */
+struct pipe_winsys
+{
+   void (*destroy)( struct pipe_winsys *ws );
+
+   /** Returns name of this winsys interface */
+   const char *(*get_name)( struct pipe_winsys *ws );
+
+   /**
+    * Do any special operations to ensure buffer size is correct
+    */
+   void (*update_buffer)( struct pipe_winsys *ws,
+                          void *context_private );
+   /**
+    * Do any special operations to ensure frontbuffer contents are
+    * displayed, eg copy fake frontbuffer.
+    */
+   void (*flush_frontbuffer)( struct pipe_winsys *ws,
+                              struct pipe_surface *surf,
+                              void *context_private );
+
+
+   /**
+    * Buffer management. Buffer attributes are mostly fixed over its lifetime.
+    *
+    * Remember that gallium gets to choose the interface it needs, and the
+    * window systems must then implement that interface (rather than the
+    * other way around...).
+    *
+    * usage is a bitmask of PIPE_BUFFER_USAGE_PIXEL/VERTEX/INDEX/CONSTANT. This
+    * usage argument is only an optimization hint, not a guarantee, therefore
+    * proper behavior must be observed in all circumstances.
+    *
+    * alignment indicates the client's alignment requirements, eg for
+    * SSE instructions.
+    */
+   struct pipe_buffer *(*buffer_create)( struct pipe_winsys *ws,
+                                         unsigned alignment,
+                                         unsigned usage,
+                                         unsigned size );
+
+   /**
+    * Create a buffer that wraps user-space data.
+    *
+    * Effectively this schedules a delayed call to buffer_create
+    * followed by an upload of the data at *some point in the future*,
+    * or perhaps never.  Basically the allocate/upload is delayed
+    * until the buffer is actually passed to hardware.
+    *
+    * The intention is to provide a quick way to turn regular data
+    * into a buffer, and secondly to avoid a copy operation if that
+    * data subsequently turns out to be only accessed by the CPU.
+    *
+    * Common example is OpenGL vertex buffers that are subsequently
+    * processed either by software TNL in the driver or by passing to
+    * hardware.
+    *
+    * XXX: What happens if the delayed call to buffer_create() fails?
+    *
+    * Note that ptr may be accessed at any time up to the time when the
+    * buffer is destroyed, so the data must not be freed before then.
+    */
+   struct pipe_buffer *(*user_buffer_create)(struct pipe_winsys *ws,
+                                                    void *ptr,
+                                                    unsigned bytes);
+
+   /**
+    * Allocate storage for a display target surface.
+    *
+    * Often surfaces which are meant to be blitted to the front screen (i.e.,
+    * display targets) must be allocated with special characteristics, memory
+    * pools, or obtained directly from the windowing system.
+    *
+    * This callback is invoked by the pipe_screen when creating a texture marked
+    * with the PIPE_TEXTURE_USAGE_DISPLAY_TARGET flag to get the underlying
+    * buffer storage.
+    */
+   struct pipe_buffer *(*surface_buffer_create)(struct pipe_winsys *ws,
+                                                unsigned width, unsigned height,
+                                                enum pipe_format format,
+                                                unsigned usage,
+                                                unsigned tex_usage,
+                                                unsigned *stride);
+
+
+   /**
+    * Map the entire data store of a buffer object into the client's address.
+    * flags is bitmask of PIPE_BUFFER_USAGE_CPU_READ/WRITE flags.
+    */
+   void *(*buffer_map)( struct pipe_winsys *ws,
+                        struct pipe_buffer *buf,
+                        unsigned usage );
+
+   void (*buffer_unmap)( struct pipe_winsys *ws,
+                         struct pipe_buffer *buf );
+
+   void (*buffer_destroy)( struct pipe_buffer *buf );
+
+
+   /** Set ptr = fence, with reference counting */
+   void (*fence_reference)( struct pipe_winsys *ws,
+                            struct pipe_fence_handle **ptr,
+                            struct pipe_fence_handle *fence );
+
+   /**
+    * Checks whether the fence has been signalled.
+    * \param flag  driver-specific meaning
+    * \return zero on success.
+    */
+   int (*fence_signalled)( struct pipe_winsys *ws,
+                           struct pipe_fence_handle *fence,
+                           unsigned flag );
+
+   /**
+    * Wait for the fence to finish.
+    * \param flag  driver-specific meaning
+    * \return zero on success.
+    */
+   int (*fence_finish)( struct pipe_winsys *ws,
+                        struct pipe_fence_handle *fence,
+                        unsigned flag );
+
+};
 
 /**
  * The following function initializes a simple passthrough screen.
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 1c8b157d91f..019dda767d0 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -2,6 +2,7 @@
  *
  * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009 Marek Olšák <[email protected]>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -30,25 +31,29 @@
  * Simple vertex/fragment shader generators.
  *  
  * @author Brian Paul
+ * @author Marek Olšák
  */
 
 
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_simple_shaders.h"
+#include "util/u_debug.h"
 #include "tgsi/tgsi_ureg.h"
 
 
 
 /**
  * Make simple vertex pass-through shader.
+ * \param num_attribs  number of attributes to pass through
+ * \param semantic_names  array of semantic names for each attribute
+ * \param semantic_indexes  array of semantic indexes for each attribute
  */
 void *
 util_make_vertex_passthrough_shader(struct pipe_context *pipe,
                                     uint num_attribs,
                                     const uint *semantic_names,
                                     const uint *semantic_indexes)
-                                    
 {
    struct ureg_program *ureg;
    uint i;
@@ -76,8 +81,6 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
 }
 
 
-
-
 /**
  * Make simple fragment texture shader:
  *  IMM {0,0,0,1}                         // (if writemask != 0xf)
@@ -87,6 +90,7 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
  */
 void *
 util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
+                                        unsigned tex_target,
                                         unsigned writemask )
 {
    struct ureg_program *ureg;
@@ -116,20 +120,71 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
 
    ureg_TEX( ureg, 
              ureg_writemask(out, writemask),
-             TGSI_TEXTURE_2D, tex, sampler );
+             tex_target, tex, sampler );
    ureg_END( ureg );
 
    return ureg_create_shader_and_destroy( ureg, pipe );
 }
 
+
+/**
+ * Make a simple fragment shader that sets the output color to a color
+ * taken from a texture.
+ * \param tex_target  one of TGSI_TEXTURE_x (passed directly to the TEX instruction)
+ */
 void *
-util_make_fragment_tex_shader(struct pipe_context *pipe )
+util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target )
 {
    return util_make_fragment_tex_shader_writemask( pipe,
+                                                   tex_target,
                                                    TGSI_WRITEMASK_XYZW );
 }
 
 
+/**
+ * Make a simple fragment texture shader which reads an X component from
+ * a texture and writes it as depth.
+ */
+void *
+util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
+                                         unsigned tex_target)
+{
+   struct ureg_program *ureg;
+   struct ureg_src sampler;
+   struct ureg_src tex;
+   struct ureg_dst out, depth;
+   struct ureg_src imm;
+
+   ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
+   if (ureg == NULL)
+      return NULL;
+
+   sampler = ureg_DECL_sampler( ureg, 0 );
+
+   tex = ureg_DECL_fs_input( ureg,
+                             TGSI_SEMANTIC_GENERIC, 0,
+                             TGSI_INTERPOLATE_PERSPECTIVE );
+
+   out = ureg_DECL_output( ureg,
+                           TGSI_SEMANTIC_COLOR,
+                           0 );
+
+   depth = ureg_DECL_output( ureg,
+                             TGSI_SEMANTIC_POSITION,
+                             0 );
+
+   imm = ureg_imm4f( ureg, 0, 0, 0, 1 );
+
+   ureg_MOV( ureg, out, imm );
+
+   ureg_TEX( ureg,
+             ureg_writemask(depth, TGSI_WRITEMASK_Z),
+             tex_target, tex, sampler );
+   ureg_END( ureg );
+
+   return ureg_create_shader_and_destroy( ureg, pipe );
+}
+
 
 /**
  * Make simple fragment color pass-through shader.
@@ -137,9 +192,22 @@ util_make_fragment_tex_shader(struct pipe_context *pipe )
 void *
 util_make_fragment_passthrough_shader(struct pipe_context *pipe)
 {
+   return util_make_fragment_clonecolor_shader(pipe, 1);
+}
+
+
+/**
+ * Make a fragment shader that copies the input color to N output colors.
+ */
+void *
+util_make_fragment_clonecolor_shader(struct pipe_context *pipe, int num_cbufs)
+{
    struct ureg_program *ureg;
    struct ureg_src src;
-   struct ureg_dst dst;
+   struct ureg_dst dst[PIPE_MAX_COLOR_BUFS];
+   int i;
+
+   assert(num_cbufs <= PIPE_MAX_COLOR_BUFS);
 
    ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
    if (ureg == NULL)
@@ -148,12 +216,13 @@ util_make_fragment_passthrough_shader(struct pipe_context *pipe)
    src = ureg_DECL_fs_input( ureg, TGSI_SEMANTIC_COLOR, 0, 
                              TGSI_INTERPOLATE_PERSPECTIVE );
 
-   dst = ureg_DECL_output( ureg, TGSI_SEMANTIC_COLOR, 0 );
+   for (i = 0; i < num_cbufs; i++)
+      dst[i] = ureg_DECL_output( ureg, TGSI_SEMANTIC_COLOR, i );
+
+   for (i = 0; i < num_cbufs; i++)
+      ureg_MOV( ureg, dst[i], src );
 
-   ureg_MOV( ureg, dst, src );
    ureg_END( ureg );
 
    return ureg_create_shader_and_destroy( ureg, pipe );
 }
-
-
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.h b/src/gallium/auxiliary/util/u_simple_shaders.h
index d2e80d6eb4d..6e760942e25 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.h
+++ b/src/gallium/auxiliary/util/u_simple_shaders.h
@@ -51,16 +51,25 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe,
 
 extern void *
 util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, 
-                                        unsigned writemask );
+                                        unsigned tex_target,
+                                        unsigned writemask);
 
 extern void *
-util_make_fragment_tex_shader(struct pipe_context *pipe);
+util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target);
+
+
+extern void *
+util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
+                                         unsigned tex_target);
 
 
 extern void *
 util_make_fragment_passthrough_shader(struct pipe_context *pipe);
 
 
+extern void *
+util_make_fragment_clonecolor_shader(struct pipe_context *pipe, int num_cbufs);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/util/u_stream_wd.c b/src/gallium/auxiliary/util/u_stream_wd.c
deleted file mode 100644
index 864489e7755..00000000000
--- a/src/gallium/auxiliary/util/u_stream_wd.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * Stream implementation for the Windows Display driver.
- */
-
-#include "pipe/p_config.h"
-
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-
-#include <windows.h>
-#include <winddi.h>
-
-#include "util/u_memory.h"
-#include "util/u_string.h"
-
-#include "u_stream.h"
-
-
-#define MAP_FILE_SIZE (4*1024*1024)
-
-
-struct util_stream 
-{
-   char filename[MAX_PATH + 1];
-   WCHAR wFileName[MAX_PATH + 1];
-   boolean growable;
-   size_t map_size;
-   ULONG_PTR iFile;
-   char *pMap;
-   size_t written;
-   unsigned suffix;
-};
-
-
-static INLINE boolean
-util_stream_map(struct util_stream *stream)
-{
-   ULONG BytesInUnicodeString;
-   static char filename[MAX_PATH + 1];
-   unsigned filename_len;
-
-   if(stream->growable)
-      filename_len = util_snprintf(filename,
-                                   sizeof(filename),
-                                   "%s.%04x",
-                                   stream->filename,
-                                   stream->suffix++);
-   else
-      filename_len = util_snprintf(filename,
-                                   sizeof(filename),
-                                   "%s",
-                                   stream->filename);
-
-   EngMultiByteToUnicodeN(
-         stream->wFileName,
-         sizeof(stream->wFileName),
-         &BytesInUnicodeString,
-         filename,
-         filename_len);
-   
-   stream->pMap = EngMapFile(stream->wFileName, stream->map_size, &stream->iFile);
-   if(!stream->pMap)
-      return FALSE;
-   
-   memset(stream->pMap, 0, stream->map_size);
-   stream->written = 0;
-   
-   return TRUE;
-}
-
-
-static INLINE void
-util_stream_unmap(struct util_stream *stream)
-{
-   EngUnmapFile(stream->iFile);
-   if(stream->written < stream->map_size) {
-      /* Truncate file size */
-      stream->pMap = EngMapFile(stream->wFileName, stream->written, &stream->iFile);
-      if(stream->pMap)
-         EngUnmapFile(stream->iFile);
-   }
-   
-   stream->pMap = NULL;
-}
-
-
-static INLINE void
-util_stream_full_qualified_filename(char *dst, size_t size, const char *src)
-{
-   boolean need_drive, need_root;
-   
-   if((('A' <= src[0] && src[0] <= 'Z') || ('a' <= src[0] && src[0] <= 'z')) && src[1] == ':') {
-      need_drive = FALSE;
-      need_root = src[2] == '\\' ? FALSE : TRUE;
-   }
-   else {
-      need_drive = TRUE;
-      need_root = src[0] == '\\' ? FALSE : TRUE;
-   }
-   
-   util_snprintf(dst, size, 
-                 "\\??\\%s%s%s",
-                 need_drive ? "C:" : "",
-                 need_root ? "\\" : "",
-                 src);
-}
-
-
-struct util_stream *
-util_stream_create(const char *filename, size_t max_size)
-{
-   struct util_stream *stream;
-   
-   stream = CALLOC_STRUCT(util_stream);
-   if(!stream)
-      goto error1;
-   
-   util_stream_full_qualified_filename(stream->filename,
-                                       sizeof(stream->filename),
-                                       filename);
-   
-   if(max_size) {
-      stream->growable = FALSE;
-      stream->map_size = max_size;
-   }
-   else {
-      stream->growable = TRUE;
-      stream->map_size = MAP_FILE_SIZE;
-   }
-   
-   if(!util_stream_map(stream))
-      goto error2;
-   
-   return stream;
-   
-error2:
-   FREE(stream);
-error1:
-   return NULL;
-}
-
-
-static INLINE void
-util_stream_copy(struct util_stream *stream, const char *data, size_t size)
-{
-   assert(stream->written + size <= stream->map_size);
-   memcpy(stream->pMap + stream->written, data, size);
-   stream->written += size;
-}
-
-
-boolean
-util_stream_write(struct util_stream *stream, const void *data, size_t size)
-{
-   if(!stream)
-      return FALSE;
-   
-   if(!stream->pMap)
-      return FALSE;
-   
-   while(stream->written + size > stream->map_size) {
-      size_t step = stream->map_size - stream->written;
-      util_stream_copy(stream, data, step);
-      data = (const char *)data + step;
-      size -= step;
-      
-      util_stream_unmap(stream);
-      if(!stream->growable || !util_stream_map(stream))
-         return FALSE;
-   }
-
-   util_stream_copy(stream, data, size);
-   
-   return TRUE;
-}
-
-
-void
-util_stream_flush(struct util_stream *stream) 
-{
-   (void)stream;
-}
-
-
-void
-util_stream_close(struct util_stream *stream) 
-{
-   if(!stream)
-      return;
-   
-   util_stream_unmap(stream);
-
-   FREE(stream);
-}
-
-
-#endif
diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c
index f828908f0be..33306bbc2a6 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -35,7 +35,9 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
+#include "util/u_inlines.h"
 
+#include "util/u_memory.h"
 #include "util/u_surface.h"
 
 
@@ -52,9 +54,9 @@ util_create_rgba_surface(struct pipe_screen *screen,
                          struct pipe_surface **surfaceOut)
 {
    static const enum pipe_format rgbaFormats[] = {
-      PIPE_FORMAT_A8R8G8B8_UNORM,
       PIPE_FORMAT_B8G8R8A8_UNORM,
-      PIPE_FORMAT_R8G8B8A8_UNORM,
+      PIPE_FORMAT_A8R8G8B8_UNORM,
+      PIPE_FORMAT_A8B8G8R8_UNORM,
       PIPE_FORMAT_NONE
    };
    const uint target = PIPE_TEXTURE_2D;
@@ -110,3 +112,73 @@ util_destroy_rgba_surface(struct pipe_texture *texture,
    pipe_texture_reference(&texture, NULL);
 }
 
+
+
+/**
+ * Test whether two pipe_framebuffer_state objects describe the same
+ * framebuffer.  Surfaces are compared with !=, and every cbufs[] slot
+ * is checked, not only the first nr_cbufs.
+ * \return TRUE if same, FALSE if different
+ */
+boolean
+util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
+                             const struct pipe_framebuffer_state *src)
+{
+   unsigned slot;
+
+   /* scalar fields and the depth/stencil surface first */
+   if (dst->width != src->width)
+      return FALSE;
+   if (dst->height != src->height)
+      return FALSE;
+   if (dst->nr_cbufs != src->nr_cbufs)
+      return FALSE;
+   if (dst->zsbuf != src->zsbuf)
+      return FALSE;
+
+   /* then each color buffer slot */
+   for (slot = 0; slot < Elements(src->cbufs); slot++) {
+      if (dst->cbufs[slot] != src->cbufs[slot])
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Copy framebuffer state from src to dst, updating refcounts.
+ * Every cbufs[] slot is passed through pipe_surface_reference().
+ */
+void
+util_copy_framebuffer_state(struct pipe_framebuffer_state *dst,
+                            const struct pipe_framebuffer_state *src)
+{
+   unsigned slot = 0;
+
+   dst->width = src->width;
+   dst->height = src->height;
+   while (slot < Elements(src->cbufs)) {
+      pipe_surface_reference(&dst->cbufs[slot], src->cbufs[slot]);
+      slot++;
+   }
+   dst->nr_cbufs = src->nr_cbufs;
+
+   pipe_surface_reference(&dst->zsbuf, src->zsbuf);
+}
+
+
+/** Point the first nr_cbufs cbufs[] slots and zsbuf at NULL, then zero the
+ *  size and count.  NOTE(review): slots past nr_cbufs are not touched. */
+void
+util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb)
+{
+   unsigned slot;
+
+   for (slot = 0; slot < fb->nr_cbufs; slot++)
+      pipe_surface_reference(&fb->cbufs[slot], NULL);
+   pipe_surface_reference(&fb->zsbuf, NULL);
+   fb->height = 0;
+   fb->width = 0;
+   fb->nr_cbufs = 0;
+}
diff --git a/src/gallium/auxiliary/util/u_surface.h b/src/gallium/auxiliary/util/u_surface.h
index ce84ed7ad06..3c60df2c3e5 100644
--- a/src/gallium/auxiliary/util/u_surface.h
+++ b/src/gallium/auxiliary/util/u_surface.h
@@ -30,11 +30,7 @@
 
 
 #include "pipe/p_compiler.h"
-
-
-struct pipe_screen;
-struct pipe_texture;
-struct pipe_surface;
+#include "pipe/p_state.h"
 
 
 /**
@@ -66,4 +62,17 @@ util_destroy_rgba_surface(struct pipe_texture *texture,
                           struct pipe_surface *surface);
 
 
+extern boolean
+util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
+                             const struct pipe_framebuffer_state *src);
+
+extern void
+util_copy_framebuffer_state(struct pipe_framebuffer_state *dst,
+                            const struct pipe_framebuffer_state *src);
+
+
+extern void
+util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb);
+
+
 #endif /* U_SURFACE_H */
diff --git a/src/gallium/auxiliary/util/u_texture.c b/src/gallium/auxiliary/util/u_texture.c
new file mode 100644
index 00000000000..d97e57a7903
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_texture.c
@@ -0,0 +1,103 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * Copyright 2008 VMware, Inc.  All rights reserved.
+ * Copyright 2009 Marek Olšák <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Texture mapping utility functions.
+ *
+ * @author Brian Paul
+ *         Marek Olšák
+ */
+
+#include "pipe/p_defines.h"
+
+#include "util/u_debug.h"
+#include "util/u_texture.h"
+
+/**
+ * Turn the 2D texcoords of a quad's four vertices into (s,t,r) cube map
+ * coordinates lying on the requested face.  Strides are in floats.
+ */
+void util_map_texcoords2d_onto_cubemap(unsigned face,
+                                       const float *in_st, unsigned in_stride,
+                                       float *out_str, unsigned out_stride)
+{
+   /* Not exactly 1, to avoid cube face selection ambiguity near the
+    * edges, though that can still sometimes happen with this factor. */
+   const float scale = 0.9999f;
+   const float *st = in_st;
+   float *str = out_str;
+   unsigned vert;
+
+   for (vert = 0; vert < 4; vert++, st += in_stride, str += out_stride) {
+      /* sc, tc: the incoming s, t remapped by 2*x - 1 and scaled */
+      const float sc = (2 * st[0] - 1) * scale;
+      const float tc = (2 * st[1] - 1) * scale;
+      float rx, ry, rz;
+
+      if (face == PIPE_TEX_FACE_POS_X) {
+         rx = 1;
+         ry = -tc;
+         rz = -sc;
+      }
+      else if (face == PIPE_TEX_FACE_NEG_X) {
+         rx = -1;
+         ry = -tc;
+         rz = sc;
+      }
+      else if (face == PIPE_TEX_FACE_POS_Y) {
+         rx = sc;
+         ry = 1;
+         rz = tc;
+      }
+      else if (face == PIPE_TEX_FACE_NEG_Y) {
+         rx = sc;
+         ry = -1;
+         rz = -tc;
+      }
+      else if (face == PIPE_TEX_FACE_POS_Z) {
+         rx = sc;
+         ry = -tc;
+         rz = 1;
+      }
+      else if (face == PIPE_TEX_FACE_NEG_Z) {
+         rx = -sc;
+         ry = -tc;
+         rz = -1;
+      }
+      else {
+         rx = ry = rz = 0;
+         assert(0);
+      }
+
+      str[0] = rx; /*s*/
+      str[1] = ry; /*t*/
+      str[2] = rz; /*r*/
+   }
+}
diff --git a/src/gallium/auxiliary/util/u_texture.h b/src/gallium/auxiliary/util/u_texture.h
new file mode 100644
index 00000000000..93b2f1e4c97
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_texture.h
@@ -0,0 +1,54 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Marek Olšák <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_TEXTURE_H
+#define U_TEXTURE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Convert 2D texture coordinates of 4 vertices into cubemap coordinates
+ * in the given face.
+ * Coordinates must be in the range [0,1].
+ *
+ * \param face          Cubemap face.
+ * \param in_st         4 pairs of 2D texture coordinates to convert.
+ * \param in_stride     Stride of in_st in floats.
+ * \param out_str       STR cubemap texture coordinates to compute.
+ * \param out_stride    Stride of out_str in floats.
+ */
+void util_map_texcoords2d_onto_cubemap(unsigned face,
+                                       const float *in_st, unsigned in_stride,
+                                       float *out_str, unsigned out_stride);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index 4f34f8a1a6b..79481b710bf 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -32,8 +32,9 @@
 
 
 #include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
+#include "util/u_inlines.h"
 
+#include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_rect.h"
@@ -52,7 +53,7 @@ pipe_get_tile_raw(struct pipe_transfer *pt,
    const void *src;
 
    if (dst_stride == 0)
-      dst_stride = pf_get_stride(pt->texture->format, w);
+      dst_stride = util_format_get_stride(pt->texture->format, w);
 
    if (pipe_clip_tile(x, y, &w, &h, pt))
       return;
@@ -81,7 +82,7 @@ pipe_put_tile_raw(struct pipe_transfer *pt,
    enum pipe_format format = pt->texture->format;
 
    if (src_stride == 0)
-      src_stride = pf_get_stride(format, w);
+      src_stride = util_format_get_stride(format, w);
 
    if (pipe_clip_tile(x, y, &w, &h, pt))
       return;
@@ -107,7 +108,7 @@ pipe_put_tile_raw(struct pipe_transfer *pt,
 
 
 
-/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
+/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/
 
 static void
 a8r8g8b8_get_tile_rgba(const unsigned *src,
@@ -154,7 +155,7 @@ a8r8g8b8_put_tile_rgba(unsigned *dst,
 }
 
 
-/*** PIPE_FORMAT_X8R8G8B8_UNORM ***/
+/*** PIPE_FORMAT_B8G8R8X8_UNORM ***/
 
 static void
 x8r8g8b8_get_tile_rgba(const unsigned *src,
@@ -200,7 +201,7 @@ x8r8g8b8_put_tile_rgba(unsigned *dst,
 }
 
 
-/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/
+/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
 
 static void
 b8g8r8a8_get_tile_rgba(const unsigned *src,
@@ -247,7 +248,54 @@ b8g8r8a8_put_tile_rgba(unsigned *dst,
 }
 
 
-/*** PIPE_FORMAT_A1R5G5B5_UNORM ***/
+/*** PIPE_FORMAT_A8B8G8R8_UNORM ***/
+
+/** Unpack a w x h tile of packed unsigned pixels (R in bits 31:24 down to
+ *  A in bits 7:0) into float RGBA; dst_stride is in floats per row. */
+static void
+r8g8b8a8_get_tile_rgba(const unsigned *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned row, col, chan;
+
+   for (row = 0; row < h; row++, p += dst_stride) {
+      for (col = 0; col < w; col++) {
+         const unsigned texel = src[row * w + col];
+         float *rgba = p + 4 * col;
+         /* channel n sits 8*n bits below the top byte */
+         for (chan = 0; chan < 4; chan++)
+            rgba[chan] = ubyte_to_float((texel >> (24 - 8 * chan)) & 0xff);
+      }
+   }
+}
+
+
+/** Pack a w x h tile of float RGBA into unsigned pixels with R in bits
+ *  31:24 down to A in bits 7:0; src_stride is in floats per row. */
+static void
+r8g8b8a8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned row, col;
+
+   for (row = 0; row < h; row++, p += src_stride) {
+      const float *rgba = p;
+      for (col = 0; col < w; col++, rgba += 4) {
+         const unsigned red = float_to_ubyte(rgba[0]);
+         const unsigned green = float_to_ubyte(rgba[1]);
+         const unsigned blue = float_to_ubyte(rgba[2]);
+         const unsigned alpha = float_to_ubyte(rgba[3]);
+         *dst++ = alpha | (blue << 8) | (green << 16) | (red << 24);
+      }
+   }
+}
+
+
+/*** PIPE_FORMAT_B5G5R5A1_UNORM ***/
 
 static void
 a1r5g5b5_get_tile_rgba(const ushort *src,
@@ -298,7 +346,7 @@ a1r5g5b5_put_tile_rgba(ushort *dst,
 }
 
 
-/*** PIPE_FORMAT_A4R4G4B4_UNORM ***/
+/*** PIPE_FORMAT_B4G4R4A4_UNORM ***/
 
 static void
 a4r4g4b4_get_tile_rgba(const ushort *src,
@@ -342,14 +390,14 @@ a4r4g4b4_put_tile_rgba(ushort *dst,
          g >>= 4;
          b >>= 4;
          a >>= 4;
-         *dst++ = (a << 12) | (r << 16) | (g << 4) | b;
+         *dst++ = (a << 12) | (r << 8) | (g << 4) | b;
       }
       p += src_stride;
    }
 }
 
 
-/*** PIPE_FORMAT_R5G6B5_UNORM ***/
+/*** PIPE_FORMAT_B5G6R5_UNORM ***/
 
 static void
 r5g6b5_get_tile_rgba(const ushort *src,
@@ -643,7 +691,7 @@ r16g16b16a16_put_tile_rgba(short *dst,
 }
 
 
-/*** PIPE_FORMAT_R8G8B8A8_SRGB ***/
+/*** PIPE_FORMAT_B8G8R8A8_SRGB ***/
 
 /**
  * Convert an 8-bit sRGB value from non-linear space to a
@@ -736,7 +784,7 @@ a8r8g8b8_srgb_put_tile_rgba(unsigned *dst,
 }
 
 
-/*** PIPE_FORMAT_A8L8_SRGB ***/
+/*** PIPE_FORMAT_L8A8_SRGB ***/
 
 static void
 a8l8_srgb_get_tile_rgba(const ushort *src,
@@ -865,7 +913,7 @@ i8_put_tile_rgba(ubyte *dst,
 }
 
 
-/*** PIPE_FORMAT_A8L8_UNORM ***/
+/*** PIPE_FORMAT_L8A8_UNORM ***/
 
 static void
 a8l8_get_tile_rgba(const ushort *src,
@@ -939,7 +987,7 @@ z32_get_tile_rgba(const unsigned *src,
 }
 
 
-/*** PIPE_FORMAT_S8Z24_UNORM ***/
+/*** PIPE_FORMAT_Z24S8_UNORM ***/
 
 /**
  * Return Z component as four float in [0,1].  Stencil part ignored.
@@ -966,7 +1014,7 @@ s8z24_get_tile_rgba(const unsigned *src,
 }
 
 
-/*** PIPE_FORMAT_Z24S8_UNORM ***/
+/*** PIPE_FORMAT_S8Z24_UNORM ***/
 
 /**
  * Return Z component as four float in [0,1].  Stencil part ignored.
@@ -1019,7 +1067,7 @@ z32f_get_tile_rgba(const float *src,
 }
 
 
-/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/
+/*** PIPE_FORMAT_UYVY / PIPE_FORMAT_YUYV ***/
 
 /**
  * Convert YCbCr (or YCrCb) to RGBA.
@@ -1107,27 +1155,6 @@ ycbcr_get_tile_rgba(const ushort *src,
 }
 
 
-static void
-fake_get_tile_rgba(const ushort *src,
-                   unsigned w, unsigned h,
-                   float *p,
-                   unsigned dst_stride)
-{
-   unsigned i, j;
-
-   for (i = 0; i < h; i++) {
-      float *pRow = p;
-      for (j = 0; j < w; j++, pRow += 4) {
-         pRow[0] =
-         pRow[1] =
-         pRow[2] =
-         pRow[3] = (i ^ j) & 1 ? 1.0f : 0.0f;
-      }
-      p += dst_stride;
-   }
-}
-
-
 void
 pipe_tile_raw_to_rgba(enum pipe_format format,
                       void *src,
@@ -1135,22 +1162,25 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
                       float *dst, unsigned dst_stride)
 {
    switch (format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       a8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
       x8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       b8g8r8a8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
+   case PIPE_FORMAT_A8B8G8R8_UNORM:
+      r8g8b8a8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
       a1r5g5b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
       a4r4g4b4_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_R5G6B5_UNORM:
+   case PIPE_FORMAT_B5G6R5_UNORM:
       r5g6b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
       break;
    case PIPE_FORMAT_R8G8B8_UNORM:
@@ -1165,7 +1195,7 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_I8_UNORM:
       i8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_A8L8_UNORM:
+   case PIPE_FORMAT_L8A8_UNORM:
       a8l8_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
       break;
    case PIPE_FORMAT_R16_SNORM:
@@ -1174,10 +1204,10 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_R16G16B16A16_SNORM:
       r16g16b16a16_get_tile_rgba((short *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_A8R8G8B8_SRGB:
+   case PIPE_FORMAT_B8G8R8A8_SRGB:
       a8r8g8b8_srgb_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_A8L8_SRGB:
+   case PIPE_FORMAT_L8A8_SRGB:
       a8l8_srgb_get_tile_rgba((ushort *) src, w, h, dst, dst_stride);
       break;
    case PIPE_FORMAT_L8_SRGB:
@@ -1189,26 +1219,28 @@ pipe_tile_raw_to_rgba(enum pipe_format format,
    case PIPE_FORMAT_Z32_UNORM:
       z32_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
-      s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
-      break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_Z24X8_UNORM:
+      s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride);
       break;
    case PIPE_FORMAT_Z32_FLOAT:
       z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride);
       break;
-   case PIPE_FORMAT_YCBCR:
+   case PIPE_FORMAT_UYVY:
       ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE);
       break;
-   case PIPE_FORMAT_YCBCR_REV:
+   case PIPE_FORMAT_YUYV:
       ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, TRUE);
       break;
    default:
-      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(format));
-      fake_get_tile_rgba(src, w, h, dst, dst_stride);
+      util_format_read_4f(format,
+                          dst, dst_stride * sizeof(float),
+                          src, util_format_get_stride(format, w),
+                          0, 0, w, h);
    }
 }
 
@@ -1225,12 +1257,12 @@ pipe_get_tile_rgba(struct pipe_transfer *pt,
    if (pipe_clip_tile(x, y, &w, &h, pt))
       return;
 
-   packed = MALLOC(pf_get_nblocks(format, w, h) * pf_get_blocksize(format));
+   packed = MALLOC(util_format_get_nblocks(format, w, h) * util_format_get_blocksize(format));
 
    if (!packed)
       return;
 
-   if(format == PIPE_FORMAT_YCBCR || format == PIPE_FORMAT_YCBCR_REV)
+   if(format == PIPE_FORMAT_UYVY || format == PIPE_FORMAT_YUYV)
       assert((x & 1) == 0);
 
    pipe_get_tile_raw(pt, x, y, w, h, packed, 0);
@@ -1253,34 +1285,34 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
    if (pipe_clip_tile(x, y, &w, &h, pt))
       return;
 
-   packed = MALLOC(pf_get_nblocks(format, w, h) * pf_get_blocksize(format));
+   packed = MALLOC(util_format_get_nblocks(format, w, h) * util_format_get_blocksize(format));
 
    if (!packed)
       return;
 
    switch (format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
       a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
       break;
-   case PIPE_FORMAT_X8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
       x8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
       break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
       b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
       break;
-   case PIPE_FORMAT_A1R5G5B5_UNORM:
+   case PIPE_FORMAT_A8B8G8R8_UNORM:
+      r8g8b8a8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
       a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
       break;
-   case PIPE_FORMAT_R5G6B5_UNORM:
+   case PIPE_FORMAT_B5G6R5_UNORM:
       r5g6b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
       break;
    case PIPE_FORMAT_R8G8B8_UNORM:
       r8g8b8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
       break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      assert(0);
-      break;
-   case PIPE_FORMAT_A4R4G4B4_UNORM:
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
       a4r4g4b4_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
       break;
    case PIPE_FORMAT_L8_UNORM:
@@ -1292,7 +1324,7 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
    case PIPE_FORMAT_I8_UNORM:
       i8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);
       break;
-   case PIPE_FORMAT_A8L8_UNORM:
+   case PIPE_FORMAT_L8A8_UNORM:
       a8l8_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
       break;
    case PIPE_FORMAT_R16_SNORM:
@@ -1301,10 +1333,10 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
    case PIPE_FORMAT_R16G16B16A16_SNORM:
       r16g16b16a16_put_tile_rgba((short *) packed, w, h, p, src_stride);
       break;
-   case PIPE_FORMAT_A8R8G8B8_SRGB:
+   case PIPE_FORMAT_B8G8R8A8_SRGB:
       a8r8g8b8_srgb_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
       break;
-   case PIPE_FORMAT_A8L8_SRGB:
+   case PIPE_FORMAT_L8A8_SRGB:
       a8l8_srgb_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
       break;
    case PIPE_FORMAT_L8_SRGB:
@@ -1316,16 +1348,19 @@ pipe_put_tile_rgba(struct pipe_transfer *pt,
    case PIPE_FORMAT_Z32_UNORM:
       /*z32_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
       break;
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
-      /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
-      break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_Z24X8_UNORM:
+      /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
       break;
    default:
-      debug_printf("%s: unsupported format %s\n", __FUNCTION__, pf_name(format));
+      util_format_write_4f(format,
+                           p, src_stride * sizeof(float),
+                           packed, util_format_get_stride(format, w),
+                           0, 0, w, h);
    }
 
    pipe_put_tile_raw(pt, x, y, w, h, packed, 0);
@@ -1370,8 +1405,8 @@ pipe_get_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_S8Z24_UNORM:
-   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
       {
          const uint *ptrc
             = (const uint *)(map + y * pt->stride + x*4);
@@ -1385,8 +1420,8 @@ pipe_get_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       {
          const uint *ptrc
             = (const uint *)(map + y * pt->stride + x*4);
@@ -1454,7 +1489,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
          assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
@@ -1468,7 +1503,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
          for (i = 0; i < h; i++) {
@@ -1481,7 +1516,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_Z24S8_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
          assert((pt->usage & PIPE_TRANSFER_READ_WRITE) == PIPE_TRANSFER_READ_WRITE);
@@ -1495,7 +1530,7 @@ pipe_put_tile_z(struct pipe_transfer *pt,
          }
       }
       break;
-   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       {
          uint *pDest = (uint *) (map + y * pt->stride + x*4);
          for (i = 0; i < h; i++) {
diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c
deleted file mode 100644
index b958a986353..00000000000
--- a/src/gallium/auxiliary/util/u_time.c
+++ /dev/null
@@ -1,225 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/**
- * @file
- * OS independent time-manipulation functions.
- * 
- * @author Jose Fonseca <[email protected]>
- */
-
-
-#include "pipe/p_config.h"
-
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
-#include <sys/time.h>
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-#include <windows.h>
-#include <winddi.h>
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-#include <windows.h>
-extern VOID KeQuerySystemTime(PLARGE_INTEGER);
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
-#include <windows.h>
-#else
-#error Unsupported OS
-#endif
-
-#include "util/u_time.h"
-
-
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
-
-static int64_t frequency = 0;
-
-static INLINE void 
-util_time_get_frequency(void)
-{
-   if(!frequency) {
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-      LONGLONG temp;
-      EngQueryPerformanceFrequency(&temp);
-      frequency = temp;
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
-      LARGE_INTEGER temp;
-      QueryPerformanceFrequency(&temp);
-      frequency = temp.QuadPart;
-#endif
-   }
-}
-#endif
-
-
-void 
-util_time_get(struct util_time *t)
-{
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
-   gettimeofday(&t->tv, NULL);
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-   LONGLONG temp;
-   EngQueryPerformanceCounter(&temp);
-   t->counter = temp;
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-   /* Updated every 10 miliseconds, measured in units of 100 nanoseconds.
-    * http://msdn.microsoft.com/en-us/library/ms801642.aspx */
-   LARGE_INTEGER temp;
-   KeQuerySystemTime(&temp);
-   t->counter = temp.QuadPart;
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
-   LARGE_INTEGER temp;
-   QueryPerformanceCounter(&temp);
-   t->counter = temp.QuadPart;
-#endif
-}
-
-
-void 
-util_time_add(const struct util_time *t1,
-              int64_t usecs,
-              struct util_time *t2)
-{
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
-   t2->tv.tv_sec = t1->tv.tv_sec + usecs / 1000000;
-   t2->tv.tv_usec = t1->tv.tv_usec + usecs % 1000000;
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
-   util_time_get_frequency();
-   t2->counter = t1->counter + (usecs * frequency + INT64_C(999999))/INT64_C(1000000);
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-   /* 1 tick = 100 nano seconds. */
-   t2->counter = t1->counter + usecs * 10;
-#else
-   LARGE_INTEGER temp;
-   LONGLONG freq;
-   freq = temp.QuadPart;
-   t2->counter = t1->counter + (usecs * freq)/1000000L;
-#endif
-}
-
-
-int64_t
-util_time_diff(const struct util_time *t1, 
-               const struct util_time *t2)
-{
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
-   return (t2->tv.tv_usec - t1->tv.tv_usec) + 
-          (t2->tv.tv_sec - t1->tv.tv_sec)*1000000;
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
-   util_time_get_frequency();
-   return (t2->counter - t1->counter)*INT64_C(1000000)/frequency;
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-   return (t2->counter - t1->counter)/10;
-#endif
-}
-
-
-
-uint64_t
-util_time_micros( void )
-{
-   struct util_time t1;
-   
-   util_time_get(&t1);
-   
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
-   return t1.tv.tv_usec + t1.tv.tv_sec*1000000LL;
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
-   util_time_get_frequency();
-   return t1.counter*INT64_C(1000000)/frequency;
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
-   return t1.counter/10;
-#endif
-}
-
-
-
-/**
- * Compare two time values.
- * 
- * Not publicly available because it does not take in account wrap-arounds. 
- * Use util_time_timeout instead.
- */
-static INLINE int
-util_time_compare(const struct util_time *t1, 
-                  const struct util_time *t2)
-{
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
-   if (t1->tv.tv_sec < t2->tv.tv_sec)
-      return -1;
-   else if(t1->tv.tv_sec > t2->tv.tv_sec)
-      return 1;
-   else if (t1->tv.tv_usec < t2->tv.tv_usec)
-      return -1;
-   else if(t1->tv.tv_usec > t2->tv.tv_usec)
-      return 1;
-   else 
-      return 0;
-#elif defined(PIPE_OS_WINDOWS)
-   if (t1->counter < t2->counter)
-      return -1;
-   else if(t1->counter > t2->counter)
-      return 1;
-   else 
-      return 0;
-#endif
-}
-
-
-boolean 
-util_time_timeout(const struct util_time *start, 
-                  const struct util_time *end,
-                  const struct util_time *curr) 
-{
-   if(util_time_compare(start, end) <= 0)
-      return !(util_time_compare(start, curr) <= 0 && util_time_compare(curr, end) < 0);
-   else
-      return !(util_time_compare(start, curr) <= 0 || util_time_compare(curr, end) < 0);
-}
-
-
-#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY)
-void util_time_sleep(unsigned usecs)
-{
-   LONGLONG start, curr, end;
-   
-   EngQueryPerformanceCounter(&start);
-   
-   if(!frequency)
-      EngQueryPerformanceFrequency(&frequency);
-   
-   end = start + (usecs * frequency + 999999LL)/1000000LL;
-   
-   do {
-      EngQueryPerformanceCounter(&curr);
-   } while(start <= curr && curr < end || 
-	   end < start && (curr < end || start <= curr));
-}
-#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
-void util_time_sleep(unsigned usecs)
-{
-   Sleep((usecs + 999)/ 1000);
-}
-#endif
diff --git a/src/gallium/auxiliary/util/u_time.h b/src/gallium/auxiliary/util/u_time.h
index a6189a247bb..15899c2c884 100644
--- a/src/gallium/auxiliary/util/u_time.h
+++ b/src/gallium/auxiliary/util/u_time.h
@@ -38,15 +38,7 @@
 
 #include "pipe/p_config.h"
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE)
-#include <time.h> /* timeval */
-#include <unistd.h> /* usleep */
-#endif
-
-#if defined(PIPE_OS_HAIKU)
-#include <sys/time.h> /* timeval */
-#include <unistd.h>
-#endif
+#include "os/os_time.h"
 
 #include "pipe/p_compiler.h"
 
@@ -63,43 +55,92 @@ extern "C" {
  */
 struct util_time 
 {
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
-   struct timeval tv;
-#else
    int64_t counter;
-#endif
 };
    
 
-void 
-util_time_get(struct util_time *t);
+PIPE_DEPRECATED
+static INLINE void
+util_time_get(struct util_time *t)
+{
+   t->counter = os_time_get();
+}
+
 
-void 
+/**
+ * Return t2 = t1 + usecs
+ */
+PIPE_DEPRECATED
+static INLINE void
 util_time_add(const struct util_time *t1,
               int64_t usecs,
-              struct util_time *t2);
+              struct util_time *t2)
+{
+   t2->counter = t1->counter + usecs;
+}
 
-uint64_t
-util_time_micros( void );
 
-int64_t
+/**
+ * Return difference between times, in microseconds
+ */
+PIPE_DEPRECATED
+static INLINE int64_t
 util_time_diff(const struct util_time *t1, 
-               const struct util_time *t2);
+               const struct util_time *t2)
+{
+   return t2->counter - t1->counter;
+}
+
+
+/**
+ * Compare two time values.
+ *
+ * Not publicly available because it does not take into account wrap-arounds.
+ * Use util_time_timeout instead.
+ */
+static INLINE int
+_util_time_compare(const struct util_time *t1,
+                   const struct util_time *t2)
+{
+   if (t1->counter < t2->counter)
+      return -1;
+   else if(t1->counter > t2->counter)
+      return 1;
+   else
+      return 0;
+}
+
 
 /**
  * Returns non-zero when the timeout expires.
  */
-boolean 
+PIPE_DEPRECATED
+static INLINE boolean
 util_time_timeout(const struct util_time *start, 
                   const struct util_time *end,
-                  const struct util_time *curr);
+                  const struct util_time *curr)
+{
+   return os_time_timeout(start->counter, end->counter, curr->counter);
+}
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU)
-#define util_time_sleep usleep
-#else
-void
-util_time_sleep(unsigned usecs);
-#endif
+
+/**
+ * Return current time in microseconds
+ */
+PIPE_DEPRECATED
+static INLINE int64_t
+util_time_micros(void)
+{
+   return os_time_get();
+}
+
+
+PIPE_DEPRECATED
+static INLINE void
+util_time_sleep(int64_t usecs)
+{
+   os_time_sleep(usecs);
+}
 
 
 #ifdef	__cplusplus
diff --git a/src/gallium/auxiliary/util/u_timed_winsys.c b/src/gallium/auxiliary/util/u_timed_winsys.c
index 178acdca4df..d88298bc14c 100644
--- a/src/gallium/auxiliary/util/u_timed_winsys.c
+++ b/src/gallium/auxiliary/util/u_timed_winsys.c
@@ -30,10 +30,10 @@
  */
 
 #include "pipe/p_state.h"
-#include "pipe/internal/p_winsys_screen.h"
+#include "util/u_simple_screen.h"
 #include "u_timed_winsys.h"
 #include "util/u_memory.h"
-#include "util/u_time.h"
+#include "os/os_time.h"
 
 
 struct timed_winsys {
@@ -54,12 +54,6 @@ static struct timed_winsys *timed_winsys( struct pipe_winsys *winsys )
 }
 
 
-static uint64_t time_start( void )
-{
-   return util_time_micros();
-}
-
-
 static void time_display( struct pipe_winsys *winsys )
 {
    struct timed_winsys *tws = timed_winsys(winsys);
@@ -90,7 +84,7 @@ static void time_finish( struct pipe_winsys *winsys,
                          const char *name ) 
 {
    struct timed_winsys *tws = timed_winsys(winsys);
-   uint64_t endval = util_time_micros();
+   int64_t endval = os_time_get();
    double elapsed = (endval - startval)/1000.0;
 
    if (endval - startval > 1000LL) 
@@ -120,7 +114,7 @@ timed_buffer_create(struct pipe_winsys *winsys,
                     unsigned size )
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    struct pipe_buffer *buf =
       backend->buffer_create( backend, alignment, usage, size );
@@ -139,7 +133,7 @@ timed_user_buffer_create(struct pipe_winsys *winsys,
                              unsigned bytes) 
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    struct pipe_buffer *buf = backend->user_buffer_create( backend, data, bytes );
 
@@ -155,7 +149,7 @@ timed_buffer_map(struct pipe_winsys *winsys,
                      unsigned flags)
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    void *map = backend->buffer_map( backend, buf, flags );
 
@@ -170,7 +164,7 @@ timed_buffer_unmap(struct pipe_winsys *winsys,
                        struct pipe_buffer *buf)
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    backend->buffer_unmap( backend, buf );
 
@@ -183,7 +177,7 @@ timed_buffer_destroy(struct pipe_buffer *buf)
 {
    struct pipe_winsys *winsys = buf->screen->winsys;
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    backend->buffer_destroy( buf );
 
@@ -197,7 +191,7 @@ timed_flush_frontbuffer( struct pipe_winsys *winsys,
                          void *context_private)
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    backend->flush_frontbuffer( backend, surf, context_private );
 
@@ -216,7 +210,7 @@ timed_surface_buffer_create(struct pipe_winsys *winsys,
                               unsigned *stride)
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    struct pipe_buffer *ret = backend->surface_buffer_create( backend, width, height, 
                                                              format, usage, tex_usage, stride );
@@ -231,7 +225,7 @@ static const char *
 timed_get_name( struct pipe_winsys *winsys )
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    const char *ret = backend->get_name( backend );
 
@@ -246,7 +240,7 @@ timed_fence_reference(struct pipe_winsys *winsys,
                     struct pipe_fence_handle *fence)
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    backend->fence_reference( backend, ptr, fence );
 
@@ -260,7 +254,7 @@ timed_fence_signalled( struct pipe_winsys *winsys,
                        unsigned flag )
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    int ret = backend->fence_signalled( backend, fence, flag );
 
@@ -275,7 +269,7 @@ timed_fence_finish( struct pipe_winsys *winsys,
                      unsigned flag )
 {
    struct pipe_winsys *backend = timed_winsys(winsys)->backend;
-   uint64_t start = time_start();
+   int64_t start = os_time_get();
 
    int ret = backend->fence_finish( backend, fence, flag );
 
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c
index 975ee89c455..012b2ae2336 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.c
+++ b/src/gallium/auxiliary/util/u_upload_mgr.c
@@ -30,7 +30,7 @@
  */
 
 #include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
+#include "util/u_inlines.h"
 #include "pipe/p_screen.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
@@ -85,7 +85,9 @@ my_buffer_write(struct pipe_screen *screen,
 
    map = pipe_buffer_map_range(screen, buf, offset, size, 
                                PIPE_BUFFER_USAGE_CPU_WRITE |
-                               PIPE_BUFFER_USAGE_FLUSH_EXPLICIT);
+                               PIPE_BUFFER_USAGE_FLUSH_EXPLICIT |
+                               PIPE_BUFFER_USAGE_DISCARD |
+                               PIPE_BUFFER_USAGE_UNSYNCHRONIZED);
    if (map == NULL) 
       return PIPE_ERROR_OUT_OF_MEMORY;
 
diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h
index 745b5834af6..e158bed9d04 100644
--- a/src/gallium/auxiliary/util/u_upload_mgr.h
+++ b/src/gallium/auxiliary/util/u_upload_mgr.h
@@ -32,6 +32,8 @@
 #ifndef U_UPLOAD_MGR_H
 #define U_UPLOAD_MGR_H
 
+#include "pipe/p_defines.h"
+
 struct pipe_screen;
 struct pipe_buffer;
 struct u_upload_mgr;
diff --git a/src/gallium/auxiliary/vl/Makefile b/src/gallium/auxiliary/vl/Makefile
deleted file mode 100644
index b4b6fb5bdac..00000000000
--- a/src/gallium/auxiliary/vl/Makefile
+++ /dev/null
@@ -1,12 +0,0 @@
-TOP = ../../../..
-include $(TOP)/configs/current
-
-LIBNAME = vl
-
-C_SOURCES = \
-	vl_bitstream_parser.c \
-	vl_mpeg12_mc_renderer.c \
-	vl_compositor.c \
-        vl_csc.c
-
-include ../../Makefile.template
diff --git a/src/gallium/auxiliary/vl/SConscript b/src/gallium/auxiliary/vl/SConscript
deleted file mode 100644
index 4b1ef90b9bb..00000000000
--- a/src/gallium/auxiliary/vl/SConscript
+++ /dev/null
@@ -1,12 +0,0 @@
-Import('*')
-
-vl = env.ConvenienceLibrary(
-	target = 'vl',
-	source = [
-		'vl_bitstream_parser.c',
-		'vl_mpeg12_mc_renderer.c',
-		'vl_compositor.c',
-                'vl_csc.c',
-	])
-
-auxiliaries.insert(0, vl)
diff --git a/src/gallium/auxiliary/vl/vl_compositor.c b/src/gallium/auxiliary/vl/vl_compositor.c
index e6d787b4d74..bafe232877e 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.c
+++ b/src/gallium/auxiliary/vl/vl_compositor.c
@@ -28,9 +28,9 @@
 #include "vl_compositor.h"
 #include <assert.h>
 #include <pipe/p_context.h>
-#include <pipe/p_inlines.h>
-#include <tgsi/tgsi_ureg.h>
+#include <util/u_inlines.h>
 #include <util/u_memory.h>
+#include <tgsi/tgsi_ureg.h>
 #include "vl_csc.h"
 
 struct vertex_shader_consts
@@ -154,7 +154,6 @@ init_pipe_state(struct vl_compositor *c)
    sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
    sampler.compare_func = PIPE_FUNC_ALWAYS;
    sampler.normalized_coords = 1;
-   /*sampler.prefilter = ;*/
    /*sampler.lod_bias = ;*/
    /*sampler.min_lod = ;*/
    /*sampler.max_lod = ;*/
@@ -213,10 +212,12 @@ init_buffers(struct vl_compositor *c)
    );
 
    c->vertex_elems[0].src_offset = 0;
+   c->vertex_elems[0].instance_divisor = 0;
    c->vertex_elems[0].vertex_buffer_index = 0;
    c->vertex_elems[0].nr_components = 2;
    c->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
    c->vertex_elems[1].src_offset = sizeof(struct vertex2f);
+   c->vertex_elems[1].instance_divisor = 0;
    c->vertex_elems[1].vertex_buffer_index = 0;
    c->vertex_elems[1].nr_components = 2;
    c->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
@@ -225,7 +226,7 @@ init_buffers(struct vl_compositor *c)
     * Create our fragment shader's constant buffer
     * Const buffer contains the color conversion matrix and bias vectors
     */
-   c->fs_const_buf.buffer = pipe_buffer_create
+   c->fs_const_buf = pipe_buffer_create
    (
       c->pipe->screen,
       1,
@@ -246,7 +247,7 @@ cleanup_buffers(struct vl_compositor *c)
    assert(c);
 
    pipe_buffer_reference(&c->vertex_buf.buffer, NULL);
-   pipe_buffer_reference(&c->fs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&c->fs_const_buf, NULL);
 }
 
 bool vl_compositor_init(struct vl_compositor *compositor, struct pipe_context *pipe)
@@ -511,7 +512,7 @@ void vl_compositor_render(struct vl_compositor          *compositor,
    compositor->pipe->bind_fs_state(compositor->pipe, compositor->fragment_shader);
    compositor->pipe->set_vertex_buffers(compositor->pipe, 1, &compositor->vertex_buf);
    compositor->pipe->set_vertex_elements(compositor->pipe, 2, compositor->vertex_elems);
-   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_FRAGMENT, 0, &compositor->fs_const_buf);
+   compositor->pipe->set_constant_buffer(compositor->pipe, PIPE_SHADER_FRAGMENT, 0, compositor->fs_const_buf);
 
    draw_layers(compositor, src_surface, src_area, dst_area);
 
@@ -527,11 +528,11 @@ void vl_compositor_set_csc_matrix(struct vl_compositor *compositor, const float
 
    memcpy
    (
-      pipe_buffer_map(compositor->pipe->screen, compositor->fs_const_buf.buffer,
+      pipe_buffer_map(compositor->pipe->screen, compositor->fs_const_buf,
                       PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD),
       mat,
       sizeof(struct fragment_shader_consts)
    );
 
-   pipe_buffer_unmap(compositor->pipe->screen, compositor->fs_const_buf.buffer);
+   pipe_buffer_unmap(compositor->pipe->screen, compositor->fs_const_buf);
 }
diff --git a/src/gallium/auxiliary/vl/vl_compositor.h b/src/gallium/auxiliary/vl/vl_compositor.h
index 86f8343659e..a75223c773f 100644
--- a/src/gallium/auxiliary/vl/vl_compositor.h
+++ b/src/gallium/auxiliary/vl/vl_compositor.h
@@ -50,7 +50,7 @@ struct vl_compositor
    struct pipe_viewport_state viewport;
    struct pipe_vertex_buffer vertex_buf;
    struct pipe_vertex_element vertex_elems[2];
-   struct pipe_constant_buffer fs_const_buf;
+   struct pipe_buffer *fs_const_buf;
 
    struct pipe_texture *bg;
    struct pipe_video_rect bg_src_rect;
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
index 4c448f44f51..eca3452a5b4 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.c
@@ -28,7 +28,8 @@
 #include "vl_mpeg12_mc_renderer.h"
 #include <assert.h>
 #include <pipe/p_context.h>
-#include <pipe/p_inlines.h>
+#include <util/u_inlines.h>
+#include <util/u_format.h>
 #include <util/u_math.h>
 #include <util/u_memory.h>
 #include <tgsi/tgsi_ureg.h>
@@ -194,11 +195,13 @@ create_frame_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
    return true;
 }
 
+#if 0
 static void
 create_field_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
 {
    assert(false);
 }
+#endif
 
 static bool
 create_frame_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
@@ -248,11 +251,13 @@ create_frame_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
    return true;
 }
 
+#if 0
 static void
 create_field_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
 {
    assert(false);
 }
+#endif
 
 static bool
 create_frame_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
@@ -295,11 +300,13 @@ create_frame_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
    return true;
 }
 
+#if 0
 static void
 create_field_bi_pred_vert_shader(struct vl_mpeg12_mc_renderer *r)
 {
    assert(false);
 }
+#endif
 
 static bool
 create_frame_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
@@ -355,11 +362,13 @@ create_frame_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
    return true;
 }
 
+#if 0
 static void
 create_field_bi_pred_frag_shader(struct vl_mpeg12_mc_renderer *r)
 {
    assert(false);
 }
+#endif
 
 static void
 xfer_buffers_map(struct vl_mpeg12_mc_renderer *r)
@@ -413,11 +422,6 @@ init_pipe_state(struct vl_mpeg12_mc_renderer *r)
    r->viewport.translate[2] = 0;
    r->viewport.translate[3] = 0;
 
-   r->scissor.maxx = r->pot_buffers ?
-      util_next_power_of_two(r->picture_width) : r->picture_width;
-   r->scissor.maxy = r->pot_buffers ?
-      util_next_power_of_two(r->picture_height) : r->picture_height;
-
    r->fb_state.width = r->pot_buffers ?
       util_next_power_of_two(r->picture_width) : r->picture_width;
    r->fb_state.height = r->pot_buffers ?
@@ -451,7 +455,6 @@ init_pipe_state(struct vl_mpeg12_mc_renderer *r)
       sampler.compare_mode = PIPE_TEX_COMPARE_NONE;
       sampler.compare_func = PIPE_FUNC_ALWAYS;
       sampler.normalized_coords = 1;
-      /*sampler.prefilter = ; */
       /*sampler.shadow_ambient = ; */
       /*sampler.lod_bias = ; */
       sampler.min_lod = 0;
@@ -580,53 +583,61 @@ init_buffers(struct vl_mpeg12_mc_renderer *r)
 
    /* Position element */
    r->vertex_elems[0].src_offset = 0;
+   r->vertex_elems[0].instance_divisor = 0;
    r->vertex_elems[0].vertex_buffer_index = 0;
    r->vertex_elems[0].nr_components = 2;
    r->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Luma, texcoord element */
    r->vertex_elems[1].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[1].instance_divisor = 0;
    r->vertex_elems[1].vertex_buffer_index = 0;
    r->vertex_elems[1].nr_components = 2;
    r->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Chroma Cr texcoord element */
    r->vertex_elems[2].src_offset = sizeof(struct vertex2f) * 2;
+   r->vertex_elems[2].instance_divisor = 0;
    r->vertex_elems[2].vertex_buffer_index = 0;
    r->vertex_elems[2].nr_components = 2;
    r->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Chroma Cb texcoord element */
    r->vertex_elems[3].src_offset = sizeof(struct vertex2f) * 3;
+   r->vertex_elems[3].instance_divisor = 0;
    r->vertex_elems[3].vertex_buffer_index = 0;
    r->vertex_elems[3].nr_components = 2;
    r->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* First ref surface top field texcoord element */
    r->vertex_elems[4].src_offset = 0;
+   r->vertex_elems[4].instance_divisor = 0;
    r->vertex_elems[4].vertex_buffer_index = 1;
    r->vertex_elems[4].nr_components = 2;
    r->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* First ref surface bottom field texcoord element */
    r->vertex_elems[5].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[5].instance_divisor = 0;
    r->vertex_elems[5].vertex_buffer_index = 1;
    r->vertex_elems[5].nr_components = 2;
    r->vertex_elems[5].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Second ref surface top field texcoord element */
    r->vertex_elems[6].src_offset = 0;
+   r->vertex_elems[6].instance_divisor = 0;
    r->vertex_elems[6].vertex_buffer_index = 2;
    r->vertex_elems[6].nr_components = 2;
    r->vertex_elems[6].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
    /* Second ref surface bottom field texcoord element */
    r->vertex_elems[7].src_offset = sizeof(struct vertex2f);
+   r->vertex_elems[7].instance_divisor = 0;
    r->vertex_elems[7].vertex_buffer_index = 2;
    r->vertex_elems[7].nr_components = 2;
    r->vertex_elems[7].src_format = PIPE_FORMAT_R32G32_FLOAT;
 
-   r->vs_const_buf.buffer = pipe_buffer_create
+   r->vs_const_buf = pipe_buffer_create
    (
       r->pipe->screen,
       DEFAULT_BUF_ALIGNMENT,
@@ -644,7 +655,7 @@ cleanup_buffers(struct vl_mpeg12_mc_renderer *r)
 
    assert(r);
 
-   pipe_buffer_reference(&r->vs_const_buf.buffer, NULL);
+   pipe_buffer_reference(&r->vs_const_buf, NULL);
 
    for (i = 0; i < 3; ++i)
       pipe_buffer_reference(&r->vertex_bufs.all[i].buffer, NULL);
@@ -794,6 +805,9 @@ gen_macroblock_verts(struct vl_mpeg12_mc_renderer *r,
    assert(ycbcr_vb);
    assert(pos < r->macroblocks_per_batch);
 
+   mo_vec[1].x = 0;
+   mo_vec[1].y = 0;
+
    switch (mb->mb_type) {
       case PIPE_MPEG12_MACROBLOCK_TYPE_BI:
       {
@@ -991,21 +1005,20 @@ flush(struct vl_mpeg12_mc_renderer *r)
 
    r->pipe->set_framebuffer_state(r->pipe, &r->fb_state);
    r->pipe->set_viewport_state(r->pipe, &r->viewport);
-   r->pipe->set_scissor_state(r->pipe, &r->scissor);
 
    vs_consts = pipe_buffer_map
    (
-      r->pipe->screen, r->vs_const_buf.buffer,
+      r->pipe->screen, r->vs_const_buf,
       PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_DISCARD
    );
 
    vs_consts->denorm.x = r->surface->width0;
    vs_consts->denorm.y = r->surface->height0;
 
-   pipe_buffer_unmap(r->pipe->screen, r->vs_const_buf.buffer);
+   pipe_buffer_unmap(r->pipe->screen, r->vs_const_buf);
 
    r->pipe->set_constant_buffer(r->pipe, PIPE_SHADER_VERTEX, 0,
-                                &r->vs_const_buf);
+                                r->vs_const_buf);
 
    if (num_macroblocks[MACROBLOCK_TYPE_INTRA] > 0) {
       r->pipe->set_vertex_buffers(r->pipe, 1, r->vertex_bufs.all);
@@ -1164,7 +1177,7 @@ grab_blocks(struct vl_mpeg12_mc_renderer *r, unsigned mbx, unsigned mby,
    assert(r);
    assert(blocks);
 
-   tex_pitch = r->tex_transfer[0]->stride / pf_get_blocksize(r->tex_transfer[0]->texture->format);
+   tex_pitch = r->tex_transfer[0]->stride / util_format_get_blocksize(r->tex_transfer[0]->texture->format);
    texels = r->texels[0] + mbpy * tex_pitch + mbpx;
 
    for (y = 0; y < 2; ++y) {
@@ -1203,7 +1216,7 @@ grab_blocks(struct vl_mpeg12_mc_renderer *r, unsigned mbx, unsigned mby,
    mbpy /= 2;
 
    for (tb = 0; tb < 2; ++tb) {
-      tex_pitch = r->tex_transfer[tb + 1]->stride / pf_get_blocksize(r->tex_transfer[tb + 1]->texture->format);
+      tex_pitch = r->tex_transfer[tb + 1]->stride / util_format_get_blocksize(r->tex_transfer[tb + 1]->texture->format);
       texels = r->texels[tb + 1] + mbpy * tex_pitch + mbpx;
 
       if ((cbp >> (1 - tb)) & 1) {
@@ -1336,7 +1349,7 @@ vl_mpeg12_mc_renderer_render_macroblocks(struct vl_mpeg12_mc_renderer
             xfer_buffers_unmap(renderer);
             flush(renderer);
          }
-         
+
          new_surface = true;
       }
 
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
index 9602a0fad9d..f24edfcf194 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_mc_renderer.h
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2009 Younes Manton.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #ifndef vl_mpeg12_mc_renderer_h
@@ -63,20 +63,19 @@ struct vl_mpeg12_mc_renderer
    unsigned macroblocks_per_batch;
 
    struct pipe_viewport_state viewport;
-   struct pipe_scissor_state scissor;
-   struct pipe_constant_buffer vs_const_buf;
+   struct pipe_buffer *vs_const_buf;
    struct pipe_framebuffer_state fb_state;
    struct pipe_vertex_element vertex_elems[8];
-	
+
    union
    {
       void *all[5];
       struct { void *y, *cb, *cr, *ref[2]; } individual;
    } samplers;
-	
+
    void *i_vs, *p_vs[2], *b_vs[2];
    void *i_fs, *p_fs[2], *b_fs[2];
-	
+
    union
    {
       struct pipe_texture *all[5];
@@ -88,7 +87,7 @@ struct vl_mpeg12_mc_renderer
       struct pipe_vertex_buffer all[3];
       struct { struct pipe_vertex_buffer ycbcr, ref[2]; } individual;
    } vertex_bufs;
-	
+
    struct pipe_texture *surface, *past, *future;
    struct pipe_fence_handle **fence;
    unsigned num_macroblocks;
author	Younes Manton <[email protected]>	2010-03-07 12:47:45 -0500
committer	Younes Manton <[email protected]>	2010-03-12 01:37:49 -0500
commit	a8238bb08a95e7ea4430450c304a6bee210df1a6 (patch)
tree	00f4e852473dc1d6a86aece436f3e5bf89d029c7 /src/gallium/auxiliary
parent	80468464897682b8e10aeab310f20fdd7ddc6cb4 (diff)
parent	45df4bad9fc0379f05197bee10c03fd351f24094 (diff)