266 files changed, 4900 insertions, 3619 deletions
diff --git a/configure.ac b/configure.ac
index 384de4dbde6..6fe060ccb49 100644
--- a/configure.ac
+++ b/configure.ac
@@ -704,8 +704,10 @@ test "x$enable_asm" = xno && AC_MSG_RESULT([no])
 if test "x$enable_asm" = xyes -a "x$cross_compiling" = xyes; then
     case "$host_cpu" in
     i?86 | x86_64 | amd64)
-        enable_asm=no
-        AC_MSG_RESULT([no, cross compiling])
+        if test "x$host_cpu" != "x$target_cpu"; then
+            enable_asm=no
+            AC_MSG_RESULT([no, cross compiling])
+        fi
         ;;
     esac
 fi
@@ -929,12 +931,6 @@ AC_ARG_ENABLE([xlib-glx],
     [enable_xlib_glx="$enableval"],
     [enable_xlib_glx=no])
 
-AC_ARG_ENABLE([r600-llvm-compiler],
-    [AS_HELP_STRING([--enable-r600-llvm-compiler],
-        [Enable experimental LLVM backend for graphics shaders @<:@default=disabled@:>@])],
-    [enable_r600_llvm="$enableval"],
-    [enable_r600_llvm=no])
-
 AC_ARG_ENABLE([gallium-tests],
     [AS_HELP_STRING([--enable-gallium-tests],
         [Enable optional Gallium tests) @<:@default=disabled@:>@])],
@@ -2238,14 +2234,8 @@ if test -n "$with_gallium_drivers"; then
             PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
             gallium_require_drm "Gallium R600"
             gallium_require_drm_loader
-            if test "x$enable_r600_llvm" = xyes -o "x$enable_opencl" = xyes; then
-                radeon_llvm_check "r600g"
-                LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
-            fi
-            if test "x$enable_r600_llvm" = xyes; then
-                USE_R600_LLVM_COMPILER=yes;
-            fi
             if test "x$enable_opencl" = xyes; then
+                radeon_llvm_check "r600g"
                 LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
             fi
             ;;
@@ -2416,7 +2406,6 @@ AM_CONDITIONAL(NEED_RADEON_DRM_WINSYS, test "x$HAVE_GALLIUM_R300" = xyes -o \
                                             "x$HAVE_GALLIUM_RADEONSI" = xyes)
 AM_CONDITIONAL(NEED_WINSYS_XLIB, test "x$NEED_WINSYS_XLIB" = xyes)
 AM_CONDITIONAL(NEED_RADEON_LLVM, test x$NEED_RADEON_LLVM = xyes)
-AM_CONDITIONAL(USE_R600_LLVM_COMPILER, test x$USE_R600_LLVM_COMPILER = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_COMPUTE, test x$enable_opencl = xyes)
 AM_CONDITIONAL(HAVE_MESA_LLVM, test x$MESA_LLVM = x1)
 AM_CONDITIONAL(USE_VC4_SIMULATOR, test x$USE_VC4_SIMULATOR = xyes)
diff --git a/docs/GL3.txt b/docs/GL3.txt
index ee7facafc95..88c14c4c67d 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -1,13 +1,28 @@
+# Status of OpenGL extensions in Mesa
 
-Status of OpenGL 3.x features in Mesa
+Here's how to read this file:
 
+all DONE: <driver>, ...
+    All the extensions are done for the given list of drivers.
 
-Note: when an item is marked as "DONE" it means all the core Mesa
-infrastructure is complete but it may be the case that few (if any) drivers
-implement the features.
+DONE
+    The extension is done for Mesa and no implementation is necessary on the
+    driver-side.
 
+DONE ()
+    The extension is done for Mesa and all the drivers in the "all DONE" list.
 
-OpenGL Core and Compatibility context support
+DONE (<driver>, ...)
+    The extension is done for Mesa, all the drivers in the "all DONE" list, and
+    all the drivers in the brackets.
+
+in progress
+    The extension is started but not finished yet.
+
+not started
+    The extension isn't started yet.
+
+# OpenGL Core and Compatibility context support
 
 OpenGL 3.1 and later versions are only supported with the Core profile.
 There are no plans to support GL_ARB_compatibility. The last supported OpenGL
@@ -15,30 +30,30 @@ version with all deprecated features is 3.0. Some of the later GL features
 are exposed in the 3.0 context as extensions.
 
 
-Feature                                               Status
------------------------------------------------------ ------------------------
+Feature                                                 Status
+------------------------------------------------------- ------------------------
 
 GL 3.0, GLSL 1.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
 
   glBindFragDataLocation, glGetFragDataLocation         DONE
-  Conditional rendering (GL_NV_conditional_render)      DONE ()
-  Map buffer subranges (GL_ARB_map_buffer_range)        DONE ()
-  Clamping controls (GL_ARB_color_buffer_float)         DONE ()
-  Float textures, renderbuffers (GL_ARB_texture_float)  DONE ()
+  GL_NV_conditional_render (Conditional rendering)      DONE ()
+  GL_ARB_map_buffer_range (Map buffer subranges)        DONE ()
+  GL_ARB_color_buffer_float (Clamping controls)         DONE ()
+  GL_ARB_texture_float (Float textures, renderbuffers)  DONE ()
   GL_EXT_packed_float                                   DONE ()
   GL_EXT_texture_shared_exponent                        DONE ()
-  Float depth buffers (GL_ARB_depth_buffer_float)       DONE ()
-  Framebuffer objects (GL_ARB_framebuffer_object)       DONE ()
+  GL_ARB_depth_buffer_float (Float depth buffers)       DONE ()
+  GL_ARB_framebuffer_object (Framebuffer objects)       DONE ()
   GL_ARB_half_float_pixel                               DONE (all drivers)
   GL_ARB_half_float_vertex                              DONE ()
   GL_EXT_texture_integer                                DONE ()
   GL_EXT_texture_array                                  DONE ()
-  Per-buffer blend and masks (GL_EXT_draw_buffers2)     DONE ()
+  GL_EXT_draw_buffers2 (Per-buffer blend and masks)     DONE ()
   GL_EXT_texture_compression_rgtc                       DONE ()
   GL_ARB_texture_rg                                     DONE ()
-  Transform feedback (GL_EXT_transform_feedback)        DONE ()
-  Vertex array objects (GL_ARB_vertex_array_object)     DONE ()
-  sRGB framebuffer format (GL_EXT_framebuffer_sRGB)     DONE ()
+  GL_EXT_transform_feedback (Transform feedback)        DONE ()
+  GL_ARB_vertex_array_object (Vertex array objects)     DONE ()
+  GL_EXT_framebuffer_sRGB (sRGB framebuffer format)     DONE ()
   glClearBuffer commands                                DONE
   glGetStringi command                                  DONE
   glTexParameterI, glGetTexParameterI commands          DONE
@@ -53,28 +68,28 @@ GL 3.0, GLSL 1.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
 GL 3.1, GLSL 1.40 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
 
   Forward compatible context support/deprecations       DONE ()
-  Instanced drawing (GL_ARB_draw_instanced)             DONE ()
-  Buffer copying (GL_ARB_copy_buffer)                   DONE ()
-  Primitive restart (GL_NV_primitive_restart)           DONE ()
+  GL_ARB_draw_instanced (Instanced drawing)             DONE ()
+  GL_ARB_copy_buffer (Buffer copying)                   DONE ()
+  GL_NV_primitive_restart (Primitive restart)           DONE ()
   16 vertex texture image units                         DONE ()
-  Texture buffer objs (GL_ARB_texture_buffer_object)    DONE for OpenGL 3.1 contexts ()
-  Rectangular textures (GL_ARB_texture_rectangle)       DONE ()
-  Uniform buffer objs (GL_ARB_uniform_buffer_object)    DONE ()
-  Signed normalized textures (GL_EXT_texture_snorm)     DONE ()
+  GL_ARB_texture_buffer_object (Texture buffer objs)    DONE (for OpenGL 3.1 contexts)
+  GL_ARB_texture_rectangle (Rectangular textures)       DONE ()
+  GL_ARB_uniform_buffer_object (Uniform buffer objs)    DONE ()
+  GL_EXT_texture_snorm (Signed normalized textures)     DONE ()
 
 
 GL 3.2, GLSL 1.50 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
 
   Core/compatibility profiles                           DONE
   Geometry shaders                                      DONE ()
-  BGRA vertex order (GL_ARB_vertex_array_bgra)          DONE ()
-  Base vertex offset(GL_ARB_draw_elements_base_vertex)  DONE ()
-  Frag shader coord (GL_ARB_fragment_coord_conventions) DONE ()
-  Provoking vertex (GL_ARB_provoking_vertex)            DONE ()
-  Seamless cubemaps (GL_ARB_seamless_cube_map)          DONE ()
-  Multisample textures (GL_ARB_texture_multisample)     DONE ()
-  Frag depth clamp (GL_ARB_depth_clamp)                 DONE ()
-  Fence objects (GL_ARB_sync)                           DONE ()
+  GL_ARB_vertex_array_bgra (BGRA vertex order)          DONE ()
+  GL_ARB_draw_elements_base_vertex (Base vertex offset) DONE ()
+  GL_ARB_fragment_coord_conventions (Frag shader coord) DONE ()
+  GL_ARB_provoking_vertex (Provoking vertex)            DONE ()
+  GL_ARB_seamless_cube_map (Seamless cubemaps)          DONE ()
+  GL_ARB_texture_multisample (Multisample textures)     DONE ()
+  GL_ARB_depth_clamp (Frag depth clamp)                 DONE ()
+  GL_ARB_sync (Fence objects)                           DONE ()
   GLX_ARB_create_context_profile                        DONE
 
 
@@ -94,170 +109,170 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
 
 GL 4.0, GLSL 4.00 --- all DONE: nvc0, r600, radeonsi
 
-  GL_ARB_draw_buffers_blend                            DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965, llvmpipe, softpipe)
-  GL_ARB_gpu_shader5                                   DONE (i965)
-  - 'precise' qualifier                                DONE
-  - Dynamically uniform sampler array indices          DONE (softpipe)
-  - Dynamically uniform UBO array indices              DONE ()
-  - Implicit signed -> unsigned conversions            DONE
-  - Fused multiply-add                                 DONE ()
-  - Packing/bitfield/conversion functions              DONE (softpipe)
-  - Enhanced textureGather                             DONE (softpipe)
-  - Geometry shader instancing                         DONE (llvmpipe, softpipe)
-  - Geometry shader multiple streams                   DONE ()
-  - Enhanced per-sample shading                        DONE ()
-  - Interpolation functions                            DONE ()
-  - New overload resolution rules                      DONE
-  GL_ARB_gpu_shader_fp64                               DONE (llvmpipe, softpipe)
-  GL_ARB_sample_shading                                DONE (i965, nv50)
-  GL_ARB_shader_subroutine                             DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_tessellation_shader                           DONE (i965)
-  GL_ARB_texture_buffer_object_rgb32                   DONE (i965, llvmpipe, softpipe)
-  GL_ARB_texture_cube_map_array                        DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_texture_gather                                DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_texture_query_lod                             DONE (i965, nv50, softpipe)
-  GL_ARB_transform_feedback2                           DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_transform_feedback3                           DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_draw_buffers_blend                             DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_draw_indirect                                  DONE (i965, llvmpipe, softpipe)
+  GL_ARB_gpu_shader5                                    DONE (i965)
+  - 'precise' qualifier                                 DONE
+  - Dynamically uniform sampler array indices           DONE (softpipe)
+  - Dynamically uniform UBO array indices               DONE ()
+  - Implicit signed -> unsigned conversions             DONE
+  - Fused multiply-add                                  DONE ()
+  - Packing/bitfield/conversion functions               DONE (softpipe)
+  - Enhanced textureGather                              DONE (softpipe)
+  - Geometry shader instancing                          DONE (llvmpipe, softpipe)
+  - Geometry shader multiple streams                    DONE ()
+  - Enhanced per-sample shading                         DONE ()
+  - Interpolation functions                             DONE ()
+  - New overload resolution rules                       DONE
+  GL_ARB_gpu_shader_fp64                                DONE (llvmpipe, softpipe)
+  GL_ARB_sample_shading                                 DONE (i965, nv50)
+  GL_ARB_shader_subroutine                              DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_tessellation_shader                            DONE (i965)
+  GL_ARB_texture_buffer_object_rgb32                    DONE (i965, llvmpipe, softpipe)
+  GL_ARB_texture_cube_map_array                         DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_texture_gather                                 DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_texture_query_lod                              DONE (i965, nv50, softpipe)
+  GL_ARB_transform_feedback2                            DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_transform_feedback3                            DONE (i965, nv50, llvmpipe, softpipe)
 
 
 GL 4.1, GLSL 4.10 --- all DONE: nvc0, r600, radeonsi
 
-  GL_ARB_ES2_compatibility                             DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_get_program_binary                            DONE (0 binary formats)
-  GL_ARB_separate_shader_objects                       DONE (all drivers)
-  GL_ARB_shader_precision                              DONE (all drivers that support GLSL 4.10)
-  GL_ARB_vertex_attrib_64bit                           DONE (llvmpipe, softpipe)
-  GL_ARB_viewport_array                                DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_ES2_compatibility                              DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_get_program_binary                             DONE (0 binary formats)
+  GL_ARB_separate_shader_objects                        DONE (all drivers)
+  GL_ARB_shader_precision                               DONE (all drivers that support GLSL 4.10)
+  GL_ARB_vertex_attrib_64bit                            DONE (llvmpipe, softpipe)
+  GL_ARB_viewport_array                                 DONE (i965, nv50, llvmpipe, softpipe)
 
 
 GL 4.2, GLSL 4.20:
 
-  GL_ARB_texture_compression_bptc                      DONE (i965, nvc0, r600, radeonsi)
-  GL_ARB_compressed_texture_pixel_storage              DONE (all drivers)
-  GL_ARB_shader_atomic_counters                        DONE (i965, nvc0)
-  GL_ARB_texture_storage                               DONE (all drivers)
-  GL_ARB_transform_feedback_instanced                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_base_instance                                 DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_shader_image_load_store                       DONE (i965)
-  GL_ARB_conservative_depth                            DONE (all drivers that support GLSL 1.30)
-  GL_ARB_shading_language_420pack                      DONE (all drivers that support GLSL 1.30)
-  GL_ARB_shading_language_packing                      DONE (all drivers)
-  GL_ARB_internalformat_query                          DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_map_buffer_alignment                          DONE (all drivers)
+  GL_ARB_texture_compression_bptc                       DONE (i965, nvc0, r600, radeonsi)
+  GL_ARB_compressed_texture_pixel_storage               DONE (all drivers)
+  GL_ARB_shader_atomic_counters                         DONE (i965, nvc0)
+  GL_ARB_texture_storage                                DONE (all drivers)
+  GL_ARB_transform_feedback_instanced                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_base_instance                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_shader_image_load_store                        DONE (i965, radeonsi)
+  GL_ARB_conservative_depth                             DONE (all drivers that support GLSL 1.30)
+  GL_ARB_shading_language_420pack                       DONE (all drivers that support GLSL 1.30)
+  GL_ARB_shading_language_packing                       DONE (all drivers)
+  GL_ARB_internalformat_query                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_map_buffer_alignment                           DONE (all drivers)
 
 
 GL 4.3, GLSL 4.30:
 
-  GL_ARB_arrays_of_arrays                              DONE (all drivers that support GLSL 1.30)
-  GL_ARB_ES3_compatibility                             DONE (all drivers that support GLSL 3.30)
-  GL_ARB_clear_buffer_object                           DONE (all drivers)
-  GL_ARB_compute_shader                                DONE (i965)
-  GL_ARB_copy_image                                    DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_KHR_debug                                         DONE (all drivers)
-  GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_fragment_layer_viewport                       DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
-  GL_ARB_framebuffer_no_attachments                    DONE (i965)
-  GL_ARB_internalformat_query2                         DONE (i965)
-  GL_ARB_invalidate_subdata                            DONE (all drivers)
-  GL_ARB_multi_draw_indirect                           DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_program_interface_query                       DONE (all drivers)
-  GL_ARB_robust_buffer_access_behavior                 not started
-  GL_ARB_shader_image_size                             DONE (i965)
-  GL_ARB_shader_storage_buffer_object                  DONE (i965, nvc0)
-  GL_ARB_stencil_texturing                             DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_buffer_range                          DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
-  GL_ARB_texture_query_levels                          DONE (all drivers that support GLSL 1.30)
-  GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_vertex_attrib_binding                         DONE (all drivers)
+  GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
+  GL_ARB_ES3_compatibility                              DONE (all drivers that support GLSL 3.30)
+  GL_ARB_clear_buffer_object                            DONE (all drivers)
+  GL_ARB_compute_shader                                 DONE (i965)
+  GL_ARB_copy_image                                     DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_KHR_debug                                          DONE (all drivers)
+  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
+  GL_ARB_fragment_layer_viewport                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
+  GL_ARB_framebuffer_no_attachments                     DONE (i965)
+  GL_ARB_internalformat_query2                          DONE (all drivers)
+  GL_ARB_invalidate_subdata                             DONE (all drivers)
+  GL_ARB_multi_draw_indirect                            DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_program_interface_query                        DONE (all drivers)
+  GL_ARB_robust_buffer_access_behavior                  not started
+  GL_ARB_shader_image_size                              DONE (i965, radeonsi)
+  GL_ARB_shader_storage_buffer_object                   DONE (i965, nvc0)
+  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_buffer_range                           DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
+  GL_ARB_texture_query_levels                           DONE (all drivers that support GLSL 1.30)
+  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
+  GL_ARB_texture_view                                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_vertex_attrib_binding                          DONE (all drivers)
 
 
 GL 4.4, GLSL 4.40:
 
-  GL_MAX_VERTEX_ATTRIB_STRIDE                          DONE (all drivers)
-  GL_ARB_buffer_storage                                DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_clear_texture                                 DONE (i965, nv50, nvc0)
-  GL_ARB_enhanced_layouts                              in progress (Timothy)
-  - compile-time constant expressions                  DONE
-  - explicit byte offsets for blocks                   DONE
-  - forced alignment within blocks                     DONE
-  - specified vec4-slot component numbers              in progress
-  - specified transform/feedback layout                in progress
-  - input/output block locations                       DONE
-  GL_ARB_multi_bind                                    DONE (all drivers)
-  GL_ARB_query_buffer_object                           DONE (nvc0)
-  GL_ARB_texture_mirror_clamp_to_edge                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_stencil8                              DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_vertex_type_10f_11f_11f_rev                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_MAX_VERTEX_ATTRIB_STRIDE                           DONE (all drivers)
+  GL_ARB_buffer_storage                                 DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_clear_texture                                  DONE (i965, nv50, nvc0)
+  GL_ARB_enhanced_layouts                               in progress (Timothy)
+  - compile-time constant expressions                   DONE
+  - explicit byte offsets for blocks                    DONE
+  - forced alignment within blocks                      DONE
+  - specified vec4-slot component numbers               in progress
+  - specified transform/feedback layout                 in progress
+  - input/output block locations                        DONE
+  GL_ARB_multi_bind                                     DONE (all drivers)
+  GL_ARB_query_buffer_object                            DONE (nvc0)
+  GL_ARB_texture_mirror_clamp_to_edge                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_stencil8                               DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
 
 GL 4.5, GLSL 4.50:
 
-  GL_ARB_ES3_1_compatibility                           not started
-  GL_ARB_clip_control                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_conditional_render_inverted                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_cull_distance                                 in progress (Tobias)
-  GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_direct_state_access                           DONE (all drivers)
-  GL_ARB_get_texture_sub_image                         DONE (all drivers)
-  GL_ARB_shader_texture_image_samples                  DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_texture_barrier                               DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_KHR_context_flush_control                         DONE (all - but needs GLX/EGL extension to be useful)
-  GL_KHR_robust_buffer_access_behavior                 not started
-  GL_KHR_robustness                                    90% done (the ARB variant)
-  GL_EXT_shader_integer_mix                            DONE (all drivers that support GLSL)
+  GL_ARB_ES3_1_compatibility                            not started
+  GL_ARB_clip_control                                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_conditional_render_inverted                    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_cull_distance                                  in progress (Tobias)
+  GL_ARB_derivative_control                             DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_direct_state_access                            DONE (all drivers)
+  GL_ARB_get_texture_sub_image                          DONE (all drivers)
+  GL_ARB_shader_texture_image_samples                   DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_texture_barrier                                DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_KHR_context_flush_control                          DONE (all - but needs GLX/EGL extension to be useful)
+  GL_KHR_robust_buffer_access_behavior                  not started
+  GL_KHR_robustness                                     not started (90% done with the ARB variant)
+  GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)
 
 These are the extensions cherry-picked to make GLES 3.1
 GLES3.1, GLSL ES 3.1
-  GL_ARB_arrays_of_arrays                              DONE (all drivers that support GLSL 1.30)
-  GL_ARB_compute_shader                                DONE (i965)
-  GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_framebuffer_no_attachments                    DONE (i965)
-  GL_ARB_program_interface_query                       DONE (all drivers)
-  GL_ARB_shader_atomic_counters                        DONE (i965, nvc0)
-  GL_ARB_shader_image_load_store                       DONE (i965)
-  GL_ARB_shader_image_size                             DONE (i965)
-  GL_ARB_shader_storage_buffer_object                  DONE (i965, nvc0)
-  GL_ARB_shading_language_packing                      DONE (all drivers)
-  GL_ARB_separate_shader_objects                       DONE (all drivers)
-  GL_ARB_stencil_texturing                             DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  Multisample textures (GL_ARB_texture_multisample)    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_vertex_attrib_binding                         DONE (all drivers)
-  GS5 Enhanced textureGather                           DONE (i965, nvc0, r600, radeonsi)
-  GS5 Packing/bitfield/conversion functions            DONE (i965, nvc0, r600, radeonsi)
-  GL_EXT_shader_integer_mix                            DONE (all drivers that support GLSL)
+  GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
+  GL_ARB_compute_shader                                 DONE (i965)
+  GL_ARB_draw_indirect                                  DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
+  GL_ARB_framebuffer_no_attachments                     DONE (i965)
+  GL_ARB_program_interface_query                        DONE (all drivers)
+  GL_ARB_shader_atomic_counters                         DONE (i965, nvc0)
+  GL_ARB_shader_image_load_store                        DONE (i965)
+  GL_ARB_shader_image_size                              DONE (i965)
+  GL_ARB_shader_storage_buffer_object                   DONE (i965, nvc0)
+  GL_ARB_shading_language_packing                       DONE (all drivers)
+  GL_ARB_separate_shader_objects                        DONE (all drivers)
+  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_multisample (Multisample textures)     DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
+  GL_ARB_vertex_attrib_binding                          DONE (all drivers)
+  GS5 Enhanced textureGather                            DONE (i965, nvc0, r600, radeonsi)
+  GS5 Packing/bitfield/conversion functions             DONE (i965, nvc0, r600, radeonsi)
+  GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)
 
   Additional functionality not covered above:
-      glMemoryBarrierByRegion                          DONE
-      glGetTexLevelParameter[fi]v - needs updates      DONE
+      glMemoryBarrierByRegion                           DONE
+      glGetTexLevelParameter[fi]v - needs updates       DONE
       glGetBooleani_v - restrict to GLES enums
-      gl_HelperInvocation support                      DONE (i965, nvc0, r600)
+      gl_HelperInvocation support                       DONE (i965, nvc0, r600)
 
 GLES3.2, GLSL ES 3.2
-  GL_EXT_color_buffer_float                            DONE (all drivers)
-  GL_KHR_blend_equation_advanced                       not started
-  GL_KHR_debug                                         DONE (all drivers)
-  GL_KHR_robustness                                    90% done (the ARB variant)
-  GL_KHR_texture_compression_astc_ldr                  DONE (i965/gen9+)
-  GL_OES_copy_image                                    not started (based on GL_ARB_copy_image, which is done for some drivers)
-  GL_OES_draw_buffers_indexed                          not started
-  GL_OES_draw_elements_base_vertex                     DONE (all drivers)
-  GL_OES_geometry_shader                               started (Marta)
-  GL_OES_gpu_shader5                                   DONE (all drivers that support GL_ARB_gpu_shader5)
-  GL_OES_primitive_bounding box                        not started
-  GL_OES_sample_shading                                not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
-  GL_OES_sample_variables                              not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
-  GL_OES_shader_image_atomic                           DONE (all drivers that support GL_ARB_shader_image_load_store)
-  GL_OES_shader_io_blocks                              not started (based on parts of GLSL 1.50, which is done)
-  GL_OES_shader_multisample_interpolation              not started (based on parts of GL_ARB_gpu_shader5, which is done)
-  GL_OES_tessellation_shader                           not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
-  GL_OES_texture_border_clamp                          DONE (all drivers)
-  GL_OES_texture_buffer                                not started (based on GL_ARB_texture_buffer_object, GL_ARB_texture_buffer_range, and GL_ARB_texture_buffer_object_rgb32 that are all done)
-  GL_OES_texture_cube_map_array                        not started (based on GL_ARB_texture_cube_map_array, which is done for all drivers)
-  GL_OES_texture_stencil8                              DONE (all drivers that support GL_ARB_texture_stencil8)
-  GL_OES_texture_storage_multisample_2d_array          DONE (all drivers that support GL_ARB_texture_multisample)
+  GL_EXT_color_buffer_float                             DONE (all drivers)
+  GL_KHR_blend_equation_advanced                        not started
+  GL_KHR_debug                                          DONE (all drivers)
+  GL_KHR_robustness                                     not started (90% done with the ARB variant)
+  GL_KHR_texture_compression_astc_ldr                   DONE (i965/gen9+)
+  GL_OES_copy_image                                     not started (based on GL_ARB_copy_image, which is done for some drivers)
+  GL_OES_draw_buffers_indexed                           not started
+  GL_OES_draw_elements_base_vertex                      DONE (all drivers)
+  GL_OES_geometry_shader                                started (Marta)
+  GL_OES_gpu_shader5                                    DONE (all drivers that support GL_ARB_gpu_shader5)
+  GL_OES_primitive_bounding box                         not started
+  GL_OES_sample_shading                                 not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
+  GL_OES_sample_variables                               not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
+  GL_OES_shader_image_atomic                            DONE (all drivers that support GL_ARB_shader_image_load_store)
+  GL_OES_shader_io_blocks                               not started (based on parts of GLSL 1.50, which is done)
+  GL_OES_shader_multisample_interpolation               not started (based on parts of GL_ARB_gpu_shader5, which is done)
+  GL_OES_tessellation_shader                            not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
+  GL_OES_texture_border_clamp                           DONE (all drivers)
+  GL_OES_texture_buffer                                 not started (based on GL_ARB_texture_buffer_object, GL_ARB_texture_buffer_range, and GL_ARB_texture_buffer_object_rgb32 that are all done)
+  GL_OES_texture_cube_map_array                         not started (based on GL_ARB_texture_cube_map_array, which is done for all drivers)
+  GL_OES_texture_stencil8                               DONE (all drivers that support GL_ARB_texture_stencil8)
+  GL_OES_texture_storage_multisample_2d_array           DONE (all drivers that support GL_ARB_texture_multisample)
 
 More info about these features and the work involved can be found at
 http://dri.freedesktop.org/wiki/MissingFunctionality
diff --git a/docs/envvars.html b/docs/envvars.html
index 06aa0ac9369..253aaf26dcd 100644
--- a/docs/envvars.html
+++ b/docs/envvars.html
@@ -163,6 +163,9 @@ See the <a href="xlibdriver.html">Xlib software driver page</a> for details.
    <li>blorp - emit messages about the blorp operations (blits &amp; clears)</li>
    <li>nodualobj - suppress generation of dual-object geometry shader code</li>
    <li>optimizer - dump shader assembly to files at each optimization pass and iteration that make progress</li>
+   <li>vec4 - force vec4 mode in vertex shader</li>
+   <li>spill_fs - force spilling of all registers in the scalar backend (useful to debug spilling code)</li>
+   <li>spill_vec4 - force spilling of all registers in the vec4 backend (useful to debug spilling code)</li>
 </ul>
 </ul>
 
diff --git a/docs/install.html b/docs/install.html
index ae911d5347b..8b349c40b9d 100644
--- a/docs/install.html
+++ b/docs/install.html
@@ -73,8 +73,7 @@ The following are required for DRI-based hardware acceleration with Mesa:
 <ul>
 <li><a href="http://xorg.freedesktop.org/releases/individual/proto/">
 dri2proto</a> version 2.6 or later
-<li><a href="http://dri.freedesktop.org/libdrm/">libDRM</a>
-version 2.4.33 or later
+<li><a href="http://dri.freedesktop.org/libdrm/">libDRM</a> latest version
 <li>Xorg server version 1.5 or later
 <li>Linux 2.6.28 or later
 </ul>
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index c31296ef9b1..acd8e11e3fc 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -44,8 +44,10 @@ Note: some of the new features are only available with certain drivers.
 </p>
 
 <ul>
-<li>GL_ARB_internalformat_query2 on i965</li>
+<li>GL_ARB_internalformat_query2 on all drivers</li>
 <li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
+<li>GL_ARB_shader_image_load_store on radeonsi</li>
+<li>GL_ARB_shader_image_size on radeonsi</li>
 <li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>
 <li>GL_OES_shader_image_atomic on all drivers that support GL_ARB_shader_image_load_store</li>
 </ul>
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index b0b8281869d..43377f18ad8 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -129,6 +129,7 @@ LIBGLSL_FILES = \
 	glsl/opt_tree_grafting.cpp \
 	glsl/opt_vectorize.cpp \
 	glsl/program.h \
+	glsl/propagate_invariance.cpp \
 	glsl/s_expression.cpp \
 	glsl/s_expression.h
 
diff --git a/src/compiler/glsl/Makefile.sources b/src/compiler/glsl/Makefile.sources
index 3f537d5b37a..970fab03bbf 100644
--- a/src/compiler/glsl/Makefile.sources
+++ b/src/compiler/glsl/Makefile.sources
@@ -217,6 +217,7 @@ LIBGLSL_FILES = \
 	opt_tree_grafting.cpp \
 	opt_vectorize.cpp \
 	program.h \
+	propagate_invariance.cpp \
 	s_expression.cpp \
 	s_expression.h
 
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 5262bd87655..35def8e3ae0 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -2125,7 +2125,9 @@ process_array_size(exec_node *node,
    }
 
    ir_constant *const size = ir->constant_expression_value();
-   if (size == NULL || array_size->has_sequence_subexpression()) {
+   if (size == NULL ||
+       (state->is_version(120, 300) &&
+        array_size->has_sequence_subexpression())) {
       _mesa_glsl_error(& loc, state, "array size must be a "
                        "constant valued expression");
       return 0;
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index 1ac8489b45a..5d010fd47e5 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -1887,6 +1887,7 @@ do_common_optimization(exec_list *ir, bool linked,
       OPT(do_dead_functions, ir);
       OPT(do_structure_splitting, ir);
    }
+   propagate_invariance(ir);
    OPT(do_if_simplification, ir);
    OPT(opt_flatten_nested_if_blocks, ir);
    OPT(opt_conditional_discard, ir);
diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index f4519679ff3..b74d68a605b 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -720,6 +720,13 @@ public:
       unsigned is_unmatched_generic_inout:1;
 
       /**
+       * Is this varying used only by transform feedback?
+       *
+       * This is used by the linker to decide if its safe to pack the varying.
+       */
+      unsigned is_xfb_only:1;
+
+      /**
        * If non-zero, then this variable may be packed along with other variables
        * into a single varying slot, so this offset should be applied when
        * accessing components.  For example, an offset of 1 means that the x
diff --git a/src/compiler/glsl/ir_optimization.h b/src/compiler/glsl/ir_optimization.h
index b56413a1500..f9599a39ff5 100644
--- a/src/compiler/glsl/ir_optimization.h
+++ b/src/compiler/glsl/ir_optimization.h
@@ -124,7 +124,8 @@ void lower_shared_reference(struct gl_shader *shader, unsigned *shared_size);
 void lower_ubo_reference(struct gl_shader *shader);
 void lower_packed_varyings(void *mem_ctx,
                            unsigned locations_used, ir_variable_mode mode,
-                           unsigned gs_input_vertices, gl_shader *shader);
+                           unsigned gs_input_vertices, gl_shader *shader,
+                           bool disable_varying_packing, bool xfb_enabled);
 bool lower_vector_insert(exec_list *instructions, bool lower_nonconstant_index);
 bool lower_vector_derefs(gl_shader *shader);
 void lower_named_interface_blocks(void *mem_ctx, gl_shader *shader);
@@ -138,6 +139,7 @@ bool lower_tess_level(gl_shader *shader);
 bool lower_vertex_id(gl_shader *shader);
 
 bool lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state);
+void propagate_invariance(exec_list *instructions);
 
 ir_rvalue *
 compare_index_block(exec_list *instructions, ir_variable *index,
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 34eb848a9c1..44fc8f617f8 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -826,7 +826,7 @@ namespace {
 class varying_matches
 {
 public:
-   varying_matches(bool disable_varying_packing,
+   varying_matches(bool disable_varying_packing, bool xfb_enabled,
                    gl_shader_stage producer_stage,
                    gl_shader_stage consumer_stage);
    ~varying_matches();
@@ -836,14 +836,30 @@ public:
    void store_locations() const;
 
 private:
+   bool is_varying_packing_safe(const glsl_type *type,
+                                const ir_variable *var);
+
    /**
     * If true, this driver disables varying packing, so all varyings need to
     * be aligned on slot boundaries, and take up a number of slots equal to
     * their number of matrix columns times their array size.
+    *
+    * Packing may also be disabled because our current packing method is not
+    * safe in SSO or versions of OpenGL where interpolation qualifiers are not
+    * guaranteed to match across stages.
     */
    const bool disable_varying_packing;
 
    /**
+    * If true, this driver has transform feedback enabled. The transform
+    * feedback code requires at least some packing be done even when varying
+    * packing is disabled, fortunately where transform feedback requires
+    * packing it's safe to override the disabled setting. See
+    * is_varying_packing_safe().
+    */
+   const bool xfb_enabled;
+
+   /**
     * Enum representing the order in which varyings are packed within a
     * packing class.
     *
@@ -862,6 +878,7 @@ private:
    static unsigned compute_packing_class(const ir_variable *var);
    static packing_order_enum compute_packing_order(const ir_variable *var);
    static int match_comparator(const void *x_generic, const void *y_generic);
+   static int xfb_comparator(const void *x_generic, const void *y_generic);
 
    /**
     * Structure recording the relationship between a single producer output
@@ -917,9 +934,11 @@ private:
 } /* anonymous namespace */
 
 varying_matches::varying_matches(bool disable_varying_packing,
+                                 bool xfb_enabled,
                                  gl_shader_stage producer_stage,
                                  gl_shader_stage consumer_stage)
    : disable_varying_packing(disable_varying_packing),
+     xfb_enabled(xfb_enabled),
      producer_stage(producer_stage),
      consumer_stage(consumer_stage)
 {
@@ -942,6 +961,24 @@ varying_matches::~varying_matches()
 
 
 /**
+ * Packing is always safe on individual arrays, structure and matices. It is
+ * also safe if the varying is only used for transform feedback.
+ */
+bool
+varying_matches::is_varying_packing_safe(const glsl_type *type,
+                                         const ir_variable *var)
+{
+   if (consumer_stage == MESA_SHADER_TESS_EVAL ||
+       consumer_stage == MESA_SHADER_TESS_CTRL ||
+       producer_stage == MESA_SHADER_TESS_CTRL)
+      return false;
+
+   return xfb_enabled && (type->is_array() || type->is_record() ||
+                          type->is_matrix() || var->data.is_xfb_only);
+}
+
+
+/**
  * Record the given producer/consumer variable pair in the list of variables
  * that should later be assigned locations.
  *
@@ -1020,7 +1057,7 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
       = this->compute_packing_class(var);
    this->matches[this->num_matches].packing_order
       = this->compute_packing_order(var);
-   if (this->disable_varying_packing) {
+   if (this->disable_varying_packing && !is_varying_packing_safe(type, var)) {
       unsigned slots = type->count_attribute_slots(false);
       this->matches[this->num_matches].num_components = slots * 4;
    } else {
@@ -1046,37 +1083,28 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
                                   uint64_t reserved_slots,
                                   bool separate_shader)
 {
-   /* We disable varying sorting for separate shader programs for the
-    * following reasons:
-    *
-    * 1/ All programs must sort the code in the same order to guarantee the
-    *    interface matching. However varying_matches::record() will change the
-    *    interpolation qualifier of some stages.
-    *
-    * 2/ GLSL version 4.50 removes the matching constrain on the interpolation
-    *    qualifier.
-    *
-    * From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.40 spec:
-    *
-    *    "The type and presence of interpolation qualifiers of variables with
-    *    the same name declared in all linked shaders for the same cross-stage
-    *    interface must match, otherwise the link command will fail.
-    *
-    *    When comparing an output from one stage to an input of a subsequent
-    *    stage, the input and output don't match if their interpolation
-    *    qualifiers (or lack thereof) are not the same."
-    *
-    *    "It is a link-time error if, within the same stage, the interpolation
-    *    qualifiers of variables of the same name do not match."
+   /* If packing has been disabled then we cannot safely sort the varyings by
+    * class as it may mean we are using a version of OpenGL where
+    * interpolation qualifiers are not guaranteed to be matching across
+    * shaders, sorting in this case could result in mismatching shader
+    * interfaces.
+    * When packing is disabled the sort orders varyings used by transform
+    * feedback first, but also depends on *undefined behaviour* of qsort to
+    * reverse the order of the varyings. See: xfb_comparator().
     */
-   if (!separate_shader) {
+   if (!this->disable_varying_packing) {
       /* Sort varying matches into an order that makes them easy to pack. */
       qsort(this->matches, this->num_matches, sizeof(*this->matches),
             &varying_matches::match_comparator);
+   } else {
+      /* Only sort varyings that are only used by transform feedback. */
+      qsort(this->matches, this->num_matches, sizeof(*this->matches),
+            &varying_matches::xfb_comparator);
    }
 
    unsigned generic_location = 0;
    unsigned generic_patch_location = MAX_VARYING*4;
+   bool previous_var_xfb_only = false;
 
    for (unsigned i = 0; i < this->num_matches; i++) {
       unsigned *location = &generic_location;
@@ -1100,16 +1128,30 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
       /* Advance to the next slot if this varying has a different packing
        * class than the previous one, and we're not already on a slot
        * boundary.
+       *
+       * Also advance to the next slot if packing is disabled. This makes sure
+       * we don't assign varyings the same locations which is possible
+       * because we still pack individual arrays, records and matrices even
+       * when packing is disabled. Note we don't advance to the next slot if
+       * we can pack varyings together that are only used for transform
+       * feedback.
        */
-      if (i > 0 &&
-          this->matches[i - 1].packing_class
-          != this->matches[i].packing_class) {
+      if ((this->disable_varying_packing &&
+           !(previous_var_xfb_only && var->data.is_xfb_only)) ||
+          (i > 0 && this->matches[i - 1].packing_class
+          != this->matches[i].packing_class )) {
          *location = ALIGN(*location, 4);
       }
 
+      previous_var_xfb_only = var->data.is_xfb_only;
+
       unsigned num_elements =  type->count_attribute_slots(is_vertex_input);
-      unsigned slot_end = this->disable_varying_packing ? 4 :
-         type->without_array()->vector_elements;
+      unsigned slot_end;
+      if (this->disable_varying_packing &&
+          !is_varying_packing_safe(type, var))
+         slot_end = 4;
+      else
+         slot_end = type->without_array()->vector_elements;
       slot_end += *location - 1;
 
       /* FIXME: We could be smarter in the below code and loop back over
@@ -1133,7 +1175,8 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
          /* Increase the slot to make sure there is enough room for next
           * array element.
           */
-         if (this->disable_varying_packing)
+         if (this->disable_varying_packing &&
+             !is_varying_packing_safe(type, var))
             slot_end += 4;
          else
             slot_end += type->without_array()->vector_elements;
@@ -1259,6 +1302,32 @@ varying_matches::match_comparator(const void *x_generic, const void *y_generic)
 
 
 /**
+ * Comparison function passed to qsort() to sort varyings used only by
+ * transform feedback when packing of other varyings is disabled.
+ */
+int
+varying_matches::xfb_comparator(const void *x_generic, const void *y_generic)
+{
+   const match *x = (const match *) x_generic;
+
+   if (x->producer_var != NULL && x->producer_var->data.is_xfb_only)
+         return match_comparator(x_generic, y_generic);
+
+   /* FIXME: When the comparator returns 0 it means the elements being
+    * compared are equivalent. However the qsort documentation says:
+    *
+    *    "The order of equivalent elements is undefined."
+    *
+    * In practice the sort ends up reversing the order of the varyings which
+    * means locations are also assigned in this reversed order and happens to
+    * be what we want. This is also whats happening in
+    * varying_matches::match_comparator().
+    */
+   return 0;
+}
+
+
+/**
  * Is the given variable a varying variable to be counted against the
  * limit in ctx->Const.MaxVarying?
  * This includes variables such as texcoords, colors and generic
@@ -1573,26 +1642,60 @@ assign_varying_locations(struct gl_context *ctx,
                          unsigned num_tfeedback_decls,
                          tfeedback_decl *tfeedback_decls)
 {
-   if (ctx->Const.DisableVaryingPacking) {
-      /* Transform feedback code assumes varyings are packed, so if the driver
-       * has disabled varying packing, make sure it does not support transform
-       * feedback.
-       */
-      assert(!ctx->Extensions.EXT_transform_feedback);
-   }
-
    /* Tessellation shaders treat inputs and outputs as shared memory and can
     * access inputs and outputs of other invocations.
     * Therefore, they can't be lowered to temps easily (and definitely not
     * efficiently).
     */
-   bool disable_varying_packing =
-      ctx->Const.DisableVaryingPacking ||
+   bool unpackable_tess =
       (consumer && consumer->Stage == MESA_SHADER_TESS_EVAL) ||
       (consumer && consumer->Stage == MESA_SHADER_TESS_CTRL) ||
       (producer && producer->Stage == MESA_SHADER_TESS_CTRL);
 
-   varying_matches matches(disable_varying_packing,
+   /* Transform feedback code assumes varying arrays are packed, so if the
+    * driver has disabled varying packing, make sure to at least enable
+    * packing required by transform feedback.
+    */
+   bool xfb_enabled =
+      ctx->Extensions.EXT_transform_feedback && !unpackable_tess;
+
+   /* Disable varying packing for GL 4.4+ as there is no guarantee
+    * that interpolation qualifiers will match between shaders in these
+    * versions. We also disable packing on outerward facing interfaces for
+    * SSO because in ES we need to retain the unpacked varying information
+    * for draw time validation. For desktop GL we could allow packing for
+    * versions < 4.4 but its just safer not to do packing.
+    *
+    * Packing is still enabled on individual arrays, structs, and matrices as
+    * these are required by the transform feedback code and it is still safe
+    * to do so. We also enable packing when a varying is only used for
+    * transform feedback and its not a SSO.
+    *
+    * Varying packing currently only packs together varyings with matching
+    * interpolation qualifiers as the backends assume all packed components
+    * are to be processed in the same way. Therefore we cannot do packing in
+    * these versions of GL without the risk of mismatching interfaces.
+    *
+    * From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.30 spec:
+    *
+    *    "The type and presence of interpolation qualifiers of variables with
+    *    the same name declared in all linked shaders for the same cross-stage
+    *    interface must match, otherwise the link command will fail.
+    *
+    *    When comparing an output from one stage to an input of a subsequent
+    *    stage, the input and output don't match if their interpolation
+    *    qualifiers (or lack thereof) are not the same."
+    *
+    * This text was also in at least revison 7 of the 4.40 spec but is no
+    * longer in revision 9 and not in the 4.50 spec.
+    */
+   bool disable_varying_packing =
+      ctx->Const.DisableVaryingPacking || unpackable_tess;
+   if ((ctx->API == API_OPENGL_CORE && ctx->Version >= 44) ||
+       (prog->SeparateShader && (producer == NULL || consumer == NULL)))
+      disable_varying_packing = true;
+
+   varying_matches matches(disable_varying_packing, xfb_enabled,
                            producer ? producer->Stage : (gl_shader_stage)-1,
                            consumer ? consumer->Stage : (gl_shader_stage)-1);
    hash_table *tfeedback_candidates
@@ -1711,8 +1814,10 @@ assign_varying_locations(struct gl_context *ctx,
          return false;
       }
 
-      if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout)
+      if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout) {
+         matched_candidate->toplevel_var->data.is_xfb_only = 1;
          matches.record(matched_candidate->toplevel_var, NULL);
+      }
    }
 
    const uint64_t reserved_slots =
@@ -1784,15 +1889,16 @@ assign_varying_locations(struct gl_context *ctx,
                                               ir_var_shader_in);
    }
 
-   if (!disable_varying_packing) {
-      if (producer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
-                               0, producer);
-      }
-      if (consumer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
-                               consumer_vertices, consumer);
-      }
+   if (producer) {
+      lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
+                            0, producer, disable_varying_packing,
+                            xfb_enabled);
+   }
+
+   if (consumer) {
+      lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
+                            consumer_vertices, consumer,
+                            disable_varying_packing, xfb_enabled);
    }
 
    return true;
diff --git a/src/compiler/glsl/lower_packed_varyings.cpp b/src/compiler/glsl/lower_packed_varyings.cpp
index 8d1eb1725d5..825cc9ee8cd 100644
--- a/src/compiler/glsl/lower_packed_varyings.cpp
+++ b/src/compiler/glsl/lower_packed_varyings.cpp
@@ -168,7 +168,9 @@ public:
                                  ir_variable_mode mode,
                                  unsigned gs_input_vertices,
                                  exec_list *out_instructions,
-                                 exec_list *out_variables);
+                                 exec_list *out_variables,
+                                 bool disable_varying_packing,
+                                 bool xfb_enabled);
 
    void run(struct gl_shader *shader);
 
@@ -231,6 +233,9 @@ private:
     * Exec list into which the visitor should insert any new variables.
     */
    exec_list *out_variables;
+
+   bool disable_varying_packing;
+   bool xfb_enabled;
 };
 
 } /* anonymous namespace */
@@ -238,7 +243,8 @@ private:
 lower_packed_varyings_visitor::lower_packed_varyings_visitor(
       void *mem_ctx, unsigned locations_used, ir_variable_mode mode,
       unsigned gs_input_vertices, exec_list *out_instructions,
-      exec_list *out_variables)
+      exec_list *out_variables, bool disable_varying_packing,
+      bool xfb_enabled)
    : mem_ctx(mem_ctx),
      locations_used(locations_used),
      packed_varyings((ir_variable **)
@@ -247,7 +253,9 @@ lower_packed_varyings_visitor::lower_packed_varyings_visitor(
      mode(mode),
      gs_input_vertices(gs_input_vertices),
      out_instructions(out_instructions),
-     out_variables(out_variables)
+     out_variables(out_variables),
+     disable_varying_packing(disable_varying_packing),
+     xfb_enabled(xfb_enabled)
 {
 }
 
@@ -656,7 +664,18 @@ lower_packed_varyings_visitor::needs_lowering(ir_variable *var)
    if (var->data.explicit_location)
       return false;
 
-   const glsl_type *type = var->type->without_array();
+   /* Override disable_varying_packing if the var is only used by transform
+    * feedback. Also override it if transform feedback is enabled and the
+    * variable is an array, struct or matrix as the elements of these types
+    * will always has the same interpolation and therefore asre safe to pack.
+    */
+   const glsl_type *type = var->type;
+   if (disable_varying_packing && !var->data.is_xfb_only &&
+       !((type->is_array() || type->is_record() || type->is_matrix()) &&
+         xfb_enabled))
+      return false;
+
+   type = type->without_array();
    if (type->vector_elements == 4 && !type->is_double())
       return false;
    return true;
@@ -709,7 +728,8 @@ lower_packed_varyings_gs_splicer::visit_leave(ir_emit_vertex *ev)
 void
 lower_packed_varyings(void *mem_ctx, unsigned locations_used,
                       ir_variable_mode mode, unsigned gs_input_vertices,
-                      gl_shader *shader)
+                      gl_shader *shader, bool disable_varying_packing,
+                      bool xfb_enabled)
 {
    exec_list *instructions = shader->ir;
    ir_function *main_func = shader->symbols->get_function("main");
@@ -720,7 +740,9 @@ lower_packed_varyings(void *mem_ctx, unsigned locations_used,
    lower_packed_varyings_visitor visitor(mem_ctx, locations_used, mode,
                                          gs_input_vertices,
                                          &new_instructions,
-                                         &new_variables);
+                                         &new_variables,
+                                         disable_varying_packing,
+                                         xfb_enabled);
    visitor.run(shader);
    if (mode == ir_var_shader_out) {
       if (shader->Stage == MESA_SHADER_GEOMETRY) {
diff --git a/src/compiler/glsl/opt_algebraic.cpp b/src/compiler/glsl/opt_algebraic.cpp
index 1e58062cb0d..f5858c83865 100644
--- a/src/compiler/glsl/opt_algebraic.cpp
+++ b/src/compiler/glsl/opt_algebraic.cpp
@@ -58,6 +58,8 @@ public:
    {
    }
 
+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+
    ir_rvalue *handle_expression(ir_expression *ir);
    void handle_rvalue(ir_rvalue **rvalue);
    bool reassociate_constant(ir_expression *ir1,
@@ -80,6 +82,23 @@ public:
 
 } /* unnamed namespace */
 
+ir_visitor_status
+ir_algebraic_visitor::visit_enter(ir_assignment *ir)
+{
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (var->data.invariant || var->data.precise) {
+      /* If we're assigning to an invariant or precise variable, just bail.
+       * Most of the algebraic optimizations aren't precision-safe.
+       *
+       * FINISHME: Find out which optimizations are precision-safe and enable
+       * then only for invariant or precise trees.
+       */
+      return visit_continue_with_parent;
+   } else {
+      return visit_continue;
+   }
+}
+
 static inline bool
 is_vec_zero(ir_constant *ir)
 {
diff --git a/src/compiler/glsl/opt_rebalance_tree.cpp b/src/compiler/glsl/opt_rebalance_tree.cpp
index 095f2d7d2f0..8045d51033d 100644
--- a/src/compiler/glsl/opt_rebalance_tree.cpp
+++ b/src/compiler/glsl/opt_rebalance_tree.cpp
@@ -131,6 +131,8 @@ public:
       progress = false;
    }
 
+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+
    void handle_rvalue(ir_rvalue **rvalue);
 
    bool progress;
@@ -146,6 +148,20 @@ struct is_reduction_data {
 
 } /* anonymous namespace */
 
+ir_visitor_status
+ir_rebalance_visitor::visit_enter(ir_assignment *ir)
+{
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (var->data.invariant || var->data.precise) {
+      /* If we're assigning to an invariant variable, just bail.  Tree
+       * rebalancing (reassociation) isn't precision-safe.
+       */
+      return visit_continue_with_parent;
+   } else {
+      return visit_continue;
+   }
+}
+
 static bool
 is_reduction_operation(ir_expression_operation operation)
 {
diff --git a/src/compiler/glsl/propagate_invariance.cpp b/src/compiler/glsl/propagate_invariance.cpp
new file mode 100644
index 00000000000..c137ff3324c
--- /dev/null
+++ b/src/compiler/glsl/propagate_invariance.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file propagate_invariance.cpp
+ * Propagate the "invariant" and "precise" qualifiers to variables used to
+ * compute invariant or precise values.
+ *
+ * The GLSL spec (depending on what version you read) says, among the
+ * conditions for geting bit-for-bit the same values on an invariant output:
+ *
+ *    "All operations in the consuming expressions and any intermediate
+ *    expressions must be the same, with the same order of operands and same
+ *    associativity, to give the same order of evaluation."
+ *
+ * This effectively means that if a variable is used to compute an invariant
+ * value then that variable becomes invariant.  The same should apply to the
+ * "precise" qualifier.
+ */
+
+#include "ir.h"
+#include "ir_visitor.h"
+#include "ir_rvalue_visitor.h"
+#include "ir_optimization.h"
+#include "compiler/glsl_types.h"
+
+namespace {
+
+class ir_invariance_propagation_visitor : public ir_hierarchical_visitor {
+public:
+   ir_invariance_propagation_visitor()
+   {
+      this->progress = false;
+      this->dst_var = NULL;
+   }
+
+   virtual ~ir_invariance_propagation_visitor()
+   {
+      /* empty */
+   }
+
+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+   virtual ir_visitor_status visit_leave(ir_assignment *ir);
+   virtual ir_visitor_status visit(ir_dereference_variable *ir);
+
+   ir_variable *dst_var;
+   bool progress;
+};
+
+} /* unnamed namespace */
+
+ir_visitor_status
+ir_invariance_propagation_visitor::visit_enter(ir_assignment *ir)
+{
+   assert(this->dst_var == NULL);
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (var->data.invariant || var->data.precise) {
+      this->dst_var = var;
+      return visit_continue;
+   } else {
+      return visit_continue_with_parent;
+   }
+}
+
+ir_visitor_status
+ir_invariance_propagation_visitor::visit_leave(ir_assignment *ir)
+{
+   this->dst_var = NULL;
+
+   return visit_continue;
+}
+
+ir_visitor_status
+ir_invariance_propagation_visitor::visit(ir_dereference_variable *ir)
+{
+   if (this->dst_var == NULL)
+      return visit_continue;
+
+   if (this->dst_var->data.invariant) {
+      if (!ir->var->data.invariant)
+         this->progress = true;
+
+      ir->var->data.invariant = true;
+   }
+
+   if (this->dst_var->data.precise) {
+      if (!ir->var->data.precise)
+         this->progress = true;
+
+      ir->var->data.precise = true;
+   }
+
+   return visit_continue;
+}
+
+void
+propagate_invariance(exec_list *instructions)
+{
+   ir_invariance_propagation_visitor visitor;
+
+   do {
+      visitor.progress = false;
+      visit_list_elements(&visitor, instructions);
+   } while (visitor.progress);
+}
diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp
index da5d730b49e..7b8b4668848 100644
--- a/src/compiler/nir/glsl_to_nir.cpp
+++ b/src/compiler/nir/glsl_to_nir.cpp
@@ -731,7 +731,7 @@ nir_visitor::visit(ir_call *ir)
          ir_dereference *param =
             (ir_dereference *) ir->actual_parameters.get_head();
          instr->variables[0] = evaluate_deref(&instr->instr, param);
-         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
@@ -765,7 +765,7 @@ nir_visitor::visit(ir_call *ir)
             const nir_intrinsic_info *info =
                     &nir_intrinsic_infos[instr->intrinsic];
             nir_ssa_dest_init(&instr->instr, &instr->dest,
-                              info->dest_components, NULL);
+                              info->dest_components, 32, NULL);
          }
 
          if (op == nir_intrinsic_image_size ||
@@ -826,7 +826,7 @@ nir_visitor::visit(ir_call *ir)
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       case nir_intrinsic_shader_clock:
-         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       case nir_intrinsic_store_ssbo: {
@@ -867,7 +867,7 @@ nir_visitor::visit(ir_call *ir)
 
          /* Setup destination register */
          nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           type->vector_elements, NULL);
+                           type->vector_elements, 32, NULL);
 
          /* Insert the created nir instruction now since in the case of boolean
           * result we will need to emit another instruction after it
@@ -890,7 +890,7 @@ nir_visitor::visit(ir_call *ir)
                load_ssbo_compare->src[1].swizzle[i] = 0;
             nir_ssa_dest_init(&load_ssbo_compare->instr,
                               &load_ssbo_compare->dest.dest,
-                              type->vector_elements, NULL);
+                              type->vector_elements, 32, NULL);
             load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1;
             nir_builder_instr_insert(&b, &load_ssbo_compare->instr);
             dest = &load_ssbo_compare->dest.dest;
@@ -936,7 +936,7 @@ nir_visitor::visit(ir_call *ir)
          /* Atomic result */
          assert(ir->return_deref);
          nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           ir->return_deref->type->vector_elements, NULL);
+                           ir->return_deref->type->vector_elements, 32, NULL);
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
@@ -951,8 +951,9 @@ nir_visitor::visit(ir_call *ir)
          instr->num_components = type->vector_elements;
 
          /* Setup destination register */
+         unsigned bit_size = glsl_get_bit_size(type->base_type);
          nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           type->vector_elements, NULL);
+                           type->vector_elements, bit_size, NULL);
 
          nir_builder_instr_insert(&b, &instr->instr);
          break;
@@ -1013,8 +1014,10 @@ nir_visitor::visit(ir_call *ir)
 
          /* Atomic result */
          assert(ir->return_deref);
+         unsigned bit_size = glsl_get_bit_size(ir->return_deref->type->base_type);
          nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           ir->return_deref->type->vector_elements, NULL);
+                           ir->return_deref->type->vector_elements,
+                           bit_size, NULL);
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
@@ -1061,6 +1064,9 @@ nir_visitor::visit(ir_assignment *ir)
 {
    unsigned num_components = ir->lhs->type->vector_elements;
 
+   b.exact = ir->lhs->variable_referenced()->data.invariant ||
+             ir->lhs->variable_referenced()->data.precise;
+
    if ((ir->rhs->as_dereference() || ir->rhs->as_constant()) &&
        (ir->write_mask == (1 << num_components) - 1 || ir->write_mask == 0)) {
       /* We're doing a plain-as-can-be copy, so emit a copy_var */
@@ -1163,7 +1169,7 @@ nir_visitor::add_instr(nir_instr *instr, unsigned num_components)
    nir_dest *dest = get_instr_dest(instr);
 
    if (dest)
-      nir_ssa_dest_init(instr, dest, num_components, NULL);
+      nir_ssa_dest_init(instr, dest, num_components, 32, NULL);
 
    nir_builder_instr_insert(&b, instr);
 
@@ -1203,6 +1209,7 @@ nir_visitor::visit(ir_expression *ir)
       nir_intrinsic_instr *load =
          nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_ubo);
       load->num_components = ir->type->vector_elements;
+      load->dest.ssa.bit_size = glsl_get_bit_size(ir->type->base_type);
       load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0]));
       load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
       add_instr(&load->instr, ir->type->vector_elements);
diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index 7e41ed37b0d..b67916dc86b 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -70,6 +70,7 @@ reg_create(void *mem_ctx, struct exec_list *list)
    list_inithead(&reg->if_uses);
 
    reg->num_components = 0;
+   reg->bit_size = 32;
    reg->num_array_elems = 0;
    reg->is_packed = false;
    reg->name = NULL;
@@ -473,7 +474,7 @@ nir_load_const_instr_create(nir_shader *shader, unsigned num_components)
    nir_load_const_instr *instr = ralloc(shader, nir_load_const_instr);
    instr_init(&instr->instr, nir_instr_type_load_const);
 
-   nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL);
+   nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);
 
    return instr;
 }
@@ -562,7 +563,7 @@ nir_ssa_undef_instr_create(nir_shader *shader, unsigned num_components)
    nir_ssa_undef_instr *instr = ralloc(shader, nir_ssa_undef_instr);
    instr_init(&instr->instr, nir_instr_type_ssa_undef);
 
-   nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL);
+   nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);
 
    return instr;
 }
@@ -699,10 +700,10 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref)
       case GLSL_TYPE_FLOAT:
       case GLSL_TYPE_INT:
       case GLSL_TYPE_UINT:
-         load->value.u[i] = constant->value.u[matrix_offset + i];
+         load->value.u32[i] = constant->value.u[matrix_offset + i];
          break;
       case GLSL_TYPE_BOOL:
-         load->value.u[i] = constant->value.b[matrix_offset + i] ?
+         load->value.u32[i] = constant->value.b[matrix_offset + i] ?
                              NIR_TRUE : NIR_FALSE;
          break;
       default:
@@ -731,18 +732,11 @@ reduce_cursor(nir_cursor cursor)
 {
    switch (cursor.option) {
    case nir_cursor_before_block:
+      assert(nir_cf_node_prev(&cursor.block->cf_node) == NULL ||
+             nir_cf_node_prev(&cursor.block->cf_node)->type != nir_cf_node_block);
       if (exec_list_is_empty(&cursor.block->instr_list)) {
          /* Empty block.  After is as good as before. */
          cursor.option = nir_cursor_after_block;
-      } else {
-         /* Try to switch to after the previous block if there is one.
-          * (This isn't likely, but it can happen.)
-          */
-         nir_cf_node *prev_node = nir_cf_node_prev(&cursor.block->cf_node);
-         if (prev_node && prev_node->type == nir_cf_node_block) {
-            cursor.block = nir_cf_node_as_block(prev_node);
-            cursor.option = nir_cursor_after_block;
-         }
       }
       return cursor;
 
@@ -1379,15 +1373,18 @@ nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest)
       src_add_all_uses(dest->reg.indirect, instr, NULL);
 }
 
+/* note: does *not* take ownership of 'name' */
 void
 nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
-                 unsigned num_components, const char *name)
+                 unsigned num_components,
+                 unsigned bit_size, const char *name)
 {
-   def->name = name;
+   def->name = ralloc_strdup(instr, name);
    def->parent_instr = instr;
    list_inithead(&def->uses);
    list_inithead(&def->if_uses);
    def->num_components = num_components;
+   def->bit_size = bit_size;
 
    if (instr->block) {
       nir_function_impl *impl =
@@ -1399,12 +1396,14 @@ nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
    }
 }
 
+/* note: does *not* take ownership of 'name' */
 void
 nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
-                 unsigned num_components, const char *name)
+                 unsigned num_components, unsigned bit_size,
+                 const char *name)
 {
    dest->is_ssa = true;
-   nir_ssa_def_init(instr, &dest->ssa, num_components, name);
+   nir_ssa_def_init(instr, &dest->ssa, num_components, bit_size, name);
 }
 
 void
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index ab1afdbc475..2fd75ec8be5 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -101,6 +101,7 @@ union nir_constant_data {
    int i[16];
    float f[16];
    bool b[16];
+   double d[16];
 };
 
 typedef struct nir_constant {
@@ -381,6 +382,9 @@ typedef struct nir_register {
    unsigned num_components; /** < number of vector components */
    unsigned num_array_elems; /** < size of array (0 for no array) */
 
+   /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */
+   uint8_t bit_size;
+
    /** generic register index. */
    unsigned index;
 
@@ -488,6 +492,9 @@ typedef struct nir_ssa_def {
    struct list_head if_uses;
 
    uint8_t num_components;
+
+   /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */
+   uint8_t bit_size;
 } nir_ssa_def;
 
 struct nir_src;
@@ -594,6 +601,18 @@ nir_dest_for_reg(nir_register *reg)
    return dest;
 }
 
+static inline unsigned
+nir_src_bit_size(nir_src src)
+{
+   return src.is_ssa ? src.ssa->bit_size : src.reg.reg->bit_size;
+}
+
+static inline unsigned
+nir_dest_bit_size(nir_dest dest)
+{
+   return dest.is_ssa ? dest.ssa.bit_size : dest.reg.reg->bit_size;
+}
+
 void nir_src_copy(nir_src *dest, const nir_src *src, void *instr_or_if);
 void nir_dest_copy(nir_dest *dest, const nir_dest *src, nir_instr *instr);
 
@@ -649,9 +668,36 @@ typedef enum {
    nir_type_float,
    nir_type_int,
    nir_type_uint,
-   nir_type_bool
+   nir_type_bool,
+   nir_type_bool32 =    32 | nir_type_bool,
+   nir_type_int8 =      8  | nir_type_int,
+   nir_type_int16 =     16 | nir_type_int,
+   nir_type_int32 =     32 | nir_type_int,
+   nir_type_int64 =     64 | nir_type_int,
+   nir_type_uint8 =     8  | nir_type_uint,
+   nir_type_uint16 =    16 | nir_type_uint,
+   nir_type_uint32 =    32 | nir_type_uint,
+   nir_type_uint64 =    64 | nir_type_uint,
+   nir_type_float16 =   16 | nir_type_float,
+   nir_type_float32 =   32 | nir_type_float,
+   nir_type_float64 =   64 | nir_type_float,
 } nir_alu_type;
 
+#define NIR_ALU_TYPE_SIZE_MASK 0xfffffff8
+#define NIR_ALU_TYPE_BASE_TYPE_MASK 0x00000007
+
+static inline unsigned
+nir_alu_type_get_type_size(nir_alu_type type)
+{
+   return type & NIR_ALU_TYPE_SIZE_MASK;
+}
+
+static inline unsigned
+nir_alu_type_get_base_type(nir_alu_type type)
+{
+   return type & NIR_ALU_TYPE_BASE_TYPE_MASK;
+}
+
 typedef enum {
    NIR_OP_IS_COMMUTATIVE = (1 << 0),
    NIR_OP_IS_ASSOCIATIVE = (1 << 1),
@@ -708,6 +754,17 @@ extern const nir_op_info nir_op_infos[nir_num_opcodes];
 typedef struct nir_alu_instr {
    nir_instr instr;
    nir_op op;
+
+   /** Indicates that this ALU instruction generates an exact value
+    *
+    * This is kind of a mixture of GLSL "precise" and "invariant" and not
+    * really equivalent to either.  This indicates that the value generated by
+    * this operation is high-precision and any code transformations that touch
+    * it must ensure that the resulting value is bit-for-bit identical to the
+    * original.
+    */
+   bool exact;
+
    nir_alu_dest dest;
    nir_alu_src src[];
 } nir_alu_instr;
@@ -1218,9 +1275,12 @@ nir_tex_instr_src_index(nir_tex_instr *instr, nir_tex_src_type type)
 
 typedef struct {
    union {
-      float f[4];
-      int32_t i[4];
-      uint32_t u[4];
+      float f32[4];
+      double f64[4];
+      int32_t i32[4];
+      uint32_t u32[4];
+      int64_t i64[4];
+      uint64_t u64[4];
    };
 } nir_const_value;
 
@@ -2061,9 +2121,11 @@ void nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest,
                             nir_dest new_dest);
 
 void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
-                       unsigned num_components, const char *name);
+                       unsigned num_components, unsigned bit_size,
+                       const char *name);
 void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
-                      unsigned num_components, const char *name);
+                      unsigned num_components, unsigned bit_size,
+                      const char *name);
 void nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src);
 void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src,
                                     nir_instr *after_me);
@@ -2094,9 +2156,10 @@ void nir_index_blocks(nir_function_impl *impl);
 void nir_print_shader(nir_shader *shader, FILE *fp);
 void nir_print_instr(const nir_instr *instr, FILE *fp);
 
-nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s);
+nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s);
 nir_function_impl *nir_function_impl_clone(const nir_function_impl *fi);
 nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var);
+nir_variable *nir_variable_clone(const nir_variable *c, nir_shader *shader);
 
 #ifdef DEBUG
 void nir_validate_shader(nir_shader *shader);
diff --git a/src/compiler/nir/nir_algebraic.py b/src/compiler/nir/nir_algebraic.py
index 2357b57117a..d05564f779c 100644
--- a/src/compiler/nir/nir_algebraic.py
+++ b/src/compiler/nir/nir_algebraic.py
@@ -63,12 +63,13 @@ class Value(object):
 static const ${val.c_type} ${val.name} = {
    { ${val.type_enum} },
 % if isinstance(val, Constant):
-   { ${hex(val)} /* ${val.value} */ },
+   ${val.type()}, { ${hex(val)} /* ${val.value} */ },
 % elif isinstance(val, Variable):
    ${val.index}, /* ${val.var_name} */
    ${'true' if val.is_constant else 'false'},
-   nir_type_${ val.required_type or 'invalid' },
+   ${val.type() or 'nir_type_invalid' },
 % elif isinstance(val, Expression):
+   ${'true' if val.inexact else 'false'},
    nir_op_${val.opcode},
    { ${', '.join(src.c_ptr for src in val.sources)} },
 % endif
@@ -107,10 +108,18 @@ class Constant(Value):
       if isinstance(self.value, (int, long)):
          return hex(self.value)
       elif isinstance(self.value, float):
-         return hex(struct.unpack('I', struct.pack('f', self.value))[0])
+         return hex(struct.unpack('Q', struct.pack('d', self.value))[0])
       else:
          assert False
 
+   def type(self):
+      if isinstance(self.value, (bool)):
+         return "nir_type_bool32"
+      elif isinstance(self.value, (int, long)):
+         return "nir_type_int"
+      elif isinstance(self.value, float):
+         return "nir_type_float"
+
 _var_name_re = re.compile(r"(?P<const>#)?(?P<name>\w+)(?:@(?P<type>\w+))?")
 
 class Variable(Value):
@@ -129,12 +138,26 @@ class Variable(Value):
 
       self.index = varset[self.var_name]
 
+   def type(self):
+      if self.required_type == 'bool':
+         return "nir_type_bool32"
+      elif self.required_type in ('int', 'unsigned'):
+         return "nir_type_int"
+      elif self.required_type == 'float':
+         return "nir_type_float"
+
+_opcode_re = re.compile(r"(?P<inexact>~)?(?P<opcode>\w+)")
+
 class Expression(Value):
    def __init__(self, expr, name_base, varset):
       Value.__init__(self, name_base, "expression")
       assert isinstance(expr, tuple)
 
-      self.opcode = expr[0]
+      m = _opcode_re.match(expr[0])
+      assert m and m.group('opcode') is not None
+
+      self.opcode = m.group('opcode')
+      self.inexact = m.group('inexact') is not None
       self.sources = [ Value.create(src, "{0}_{1}".format(name_base, i), varset)
                        for (i, src) in enumerate(expr[1:]) ]
 
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index b4dde54f7e7..94f183c1552 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -31,6 +31,9 @@ struct exec_list;
 typedef struct nir_builder {
    nir_cursor cursor;
 
+   /* Whether new ALU instructions will be marked "exact" */
+   bool exact;
+
    nir_shader *shader;
    nir_function_impl *impl;
 } nir_builder;
@@ -39,6 +42,7 @@ static inline void
 nir_builder_init(nir_builder *build, nir_function_impl *impl)
 {
    memset(build, 0, sizeof(*build));
+   build->exact = false;
    build->impl = impl;
    build->shader = impl->function->shader;
 }
@@ -50,6 +54,7 @@ nir_builder_init_simple_shader(nir_builder *build, void *mem_ctx,
 {
    build->shader = nir_shader_create(mem_ctx, stage, options);
    nir_function *func = nir_function_create(build->shader, "main");
+   build->exact = false;
    build->impl = nir_function_impl_create(func);
    build->cursor = nir_after_cf_list(&build->impl->body);
 }
@@ -104,7 +109,7 @@ nir_imm_float(nir_builder *build, float x)
    nir_const_value v;
 
    memset(&v, 0, sizeof(v));
-   v.f[0] = x;
+   v.f32[0] = x;
 
    return nir_build_imm(build, 1, v);
 }
@@ -115,10 +120,10 @@ nir_imm_vec4(nir_builder *build, float x, float y, float z, float w)
    nir_const_value v;
 
    memset(&v, 0, sizeof(v));
-   v.f[0] = x;
-   v.f[1] = y;
-   v.f[2] = z;
-   v.f[3] = w;
+   v.f32[0] = x;
+   v.f32[1] = y;
+   v.f32[2] = z;
+   v.f32[3] = w;
 
    return nir_build_imm(build, 4, v);
 }
@@ -129,7 +134,7 @@ nir_imm_int(nir_builder *build, int x)
    nir_const_value v;
 
    memset(&v, 0, sizeof(v));
-   v.i[0] = x;
+   v.i32[0] = x;
 
    return nir_build_imm(build, 1, v);
 }
@@ -140,10 +145,10 @@ nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w)
    nir_const_value v;
 
    memset(&v, 0, sizeof(v));
-   v.i[0] = x;
-   v.i[1] = y;
-   v.i[2] = z;
-   v.i[3] = w;
+   v.i32[0] = x;
+   v.i32[1] = y;
+   v.i32[2] = z;
+   v.i32[3] = w;
 
    return nir_build_imm(build, 4, v);
 }
@@ -157,6 +162,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
    if (!instr)
       return NULL;
 
+   instr->exact = build->exact;
+
    instr->src[0].src = nir_src_for_ssa(src0);
    if (src1)
       instr->src[1].src = nir_src_for_ssa(src1);
@@ -178,6 +185,25 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
    }
    assert(num_components != 0);
 
+   /* Figure out the bitwidth based on the source bitwidth if the instruction
+    * is variable-width.
+    */
+   unsigned bit_size = nir_alu_type_get_type_size(op_info->output_type);
+   if (bit_size == 0) {
+      for (unsigned i = 0; i < op_info->num_inputs; i++) {
+         unsigned src_bit_size = instr->src[i].src.ssa->bit_size;
+         if (nir_alu_type_get_type_size(op_info->input_types[i]) == 0) {
+            if (bit_size)
+               assert(src_bit_size == bit_size);
+            else
+               bit_size = src_bit_size;
+         } else {
+            assert(src_bit_size ==
+               nir_alu_type_get_type_size(op_info->input_types[i]));
+         }
+      }
+   }
+
    /* Make sure we don't swizzle from outside of our source vector (like if a
     * scalar value was passed into a multiply with a vector).
     */
@@ -187,7 +213,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
       }
    }
 
-   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+                     bit_size, NULL);
    instr->dest.write_mask = (1 << num_components) - 1;
 
    nir_builder_instr_insert(build, &instr->instr);
@@ -252,7 +279,9 @@ static inline nir_ssa_def *
 nir_fmov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
 {
    nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_fmov);
-   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
+                     nir_src_bit_size(src.src), NULL);
+   mov->exact = build->exact;
    mov->dest.write_mask = (1 << num_components) - 1;
    mov->src[0] = src;
    nir_builder_instr_insert(build, &mov->instr);
@@ -264,7 +293,9 @@ static inline nir_ssa_def *
 nir_imov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
 {
    nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_imov);
-   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
+                     nir_src_bit_size(src.src), NULL);
+   mov->exact = build->exact;
    mov->dest.write_mask = (1 << num_components) - 1;
    mov->src[0] = src;
    nir_builder_instr_insert(build, &mov->instr);
@@ -360,7 +391,8 @@ nir_load_var(nir_builder *build, nir_variable *var)
       nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_var);
    load->num_components = num_components;
    load->variables[0] = nir_deref_var_create(load, var);
-   nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
+   nir_ssa_dest_init(&load->instr, &load->dest, num_components,
+                     glsl_get_bit_size(glsl_get_base_type(var->type)), NULL);
    nir_builder_instr_insert(build, &load->instr);
    return &load->dest.ssa;
 }
@@ -426,7 +458,7 @@ nir_load_system_value(nir_builder *build, nir_intrinsic_op op, int index)
    load->num_components = nir_intrinsic_infos[op].dest_components;
    load->const_index[0] = index;
    nir_ssa_dest_init(&load->instr, &load->dest,
-                     nir_intrinsic_infos[op].dest_components, NULL);
+                     nir_intrinsic_infos[op].dest_components, 32, NULL);
    nir_builder_instr_insert(build, &load->instr);
    return &load->dest.ssa;
 }
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index 3268deb3ee4..7d2e3835258 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -127,11 +127,10 @@ nir_constant_clone(const nir_constant *c, nir_variable *nvar)
 /* NOTE: for cloning nir_variable's, bypass nir_variable_create to avoid
  * having to deal with locals and globals separately:
  */
-static nir_variable *
-clone_variable(clone_state *state, const nir_variable *var)
+nir_variable *
+nir_variable_clone(const nir_variable *var, nir_shader *shader)
 {
-   nir_variable *nvar = rzalloc(state->ns, nir_variable);
-   add_remap(state, nvar, var);
+   nir_variable *nvar = rzalloc(shader, nir_variable);
 
    nvar->type = var->type;
    nvar->name = ralloc_strdup(nvar, var->name);
@@ -149,6 +148,15 @@ clone_variable(clone_state *state, const nir_variable *var)
    return nvar;
 }
 
+static nir_variable *
+clone_variable(clone_state *state, const nir_variable *var)
+{
+   nir_variable *nvar = nir_variable_clone(var, state->ns);
+   add_remap(state, nvar, var);
+
+   return nvar;
+}
+
 /* clone list of nir_variable: */
 static void
 clone_var_list(clone_state *state, struct exec_list *dst,
@@ -220,7 +228,8 @@ __clone_dst(clone_state *state, nir_instr *ninstr,
 {
    ndst->is_ssa = dst->is_ssa;
    if (dst->is_ssa) {
-      nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, dst->ssa.name);
+      nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components,
+                        dst->ssa.bit_size, dst->ssa.name);
       add_remap(state, &ndst->ssa, &dst->ssa);
    } else {
       ndst->reg.reg = remap_reg(state, dst->reg.reg);
@@ -303,6 +312,7 @@ static nir_alu_instr *
 clone_alu(clone_state *state, const nir_alu_instr *alu)
 {
    nir_alu_instr *nalu = nir_alu_instr_create(state->ns, alu->op);
+   nalu->exact = alu->exact;
 
    __clone_dst(state, &nalu->instr, &nalu->dest.dest, &alu->dest.dest);
    nalu->dest.saturate = alu->dest.saturate;
diff --git a/src/compiler/nir/nir_constant_expressions.h b/src/compiler/nir/nir_constant_expressions.h
index 97997f2e514..201f278c71c 100644
--- a/src/compiler/nir/nir_constant_expressions.h
+++ b/src/compiler/nir/nir_constant_expressions.h
@@ -28,4 +28,4 @@
 #include "nir.h"
 
 nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components,
-                                      nir_const_value *src);
+                                      unsigned bit_size, nir_const_value *src);
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
index 32784f6398d..e36dc4853b5 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -1,4 +1,43 @@
 #! /usr/bin/python2
+
+def type_has_size(type_):
+    return type_[-1:].isdigit()
+
+def type_sizes(type_):
+    if type_.endswith("8"):
+        return [8]
+    elif type_.endswith("16"):
+        return [16]
+    elif type_.endswith("32"):
+        return [32]
+    elif type_.endswith("64"):
+        return [64]
+    else:
+        return [32, 64]
+
+def type_add_size(type_, size):
+    if type_has_size(type_):
+        return type_
+    return type_ + str(size)
+
+def get_const_field(type_):
+    if type_ == "int32":
+        return "i32"
+    if type_ == "uint32":
+        return "u32"
+    if type_ == "int64":
+        return "i64"
+    if type_ == "uint64":
+        return "u64"
+    if type_ == "bool32":
+        return "u32"
+    if type_ == "float32":
+        return "f32"
+    if type_ == "float64":
+        return "f64"
+    raise Exception(str(type_))
+    assert(0)
+
 template = """\
 /*
  * Copyright (C) 2014 Intel Corporation
@@ -205,110 +244,140 @@ unpack_half_1x16(uint16_t u)
 }
 
 /* Some typed vector structures to make things like src0.y work */
-% for type in ["float", "int", "uint", "bool"]:
-struct ${type}_vec {
-   ${type} x;
-   ${type} y;
-   ${type} z;
-   ${type} w;
+typedef float float32_t;
+typedef double float64_t;
+typedef bool bool32_t;
+% for type in ["float", "int", "uint"]:
+% for width in [32, 64]:
+struct ${type}${width}_vec {
+   ${type}${width}_t x;
+   ${type}${width}_t y;
+   ${type}${width}_t z;
+   ${type}${width}_t w;
 };
 % endfor
+% endfor
+
+struct bool32_vec {
+    bool x;
+    bool y;
+    bool z;
+    bool w;
+};
 
 % for name, op in sorted(opcodes.iteritems()):
 static nir_const_value
-evaluate_${name}(unsigned num_components, nir_const_value *_src)
+evaluate_${name}(unsigned num_components, unsigned bit_size,
+                 nir_const_value *_src)
 {
    nir_const_value _dst_val = { { {0, 0, 0, 0} } };
 
-   ## For each non-per-component input, create a variable srcN that
-   ## contains x, y, z, and w elements which are filled in with the
-   ## appropriately-typed values.
-   % for j in range(op.num_inputs):
-      % if op.input_sizes[j] == 0:
-         <% continue %>
-      % elif "src" + str(j) not in op.const_expr:
-         ## Avoid unused variable warnings
-         <% continue %>
-      %endif
-
-      struct ${op.input_types[j]}_vec src${j} = {
-      % for k in range(op.input_sizes[j]):
-         % if op.input_types[j] == "bool":
-            _src[${j}].u[${k}] != 0,
-         % else:
-            _src[${j}].${op.input_types[j][:1]}[${k}],
-         % endif
-      % endfor
-      };
-   % endfor
+   switch (bit_size) {
+   % for bit_size in [32, 64]:
+   case ${bit_size}: {
+      <%
+      output_type = type_add_size(op.output_type, bit_size)
+      input_types = [type_add_size(type_, bit_size) for type_ in op.input_types]
+      %>
+
+      ## For each non-per-component input, create a variable srcN that
+      ## contains x, y, z, and w elements which are filled in with the
+      ## appropriately-typed values.
+      % for j in range(op.num_inputs):
+         % if op.input_sizes[j] == 0:
+            <% continue %>
+         % elif "src" + str(j) not in op.const_expr:
+            ## Avoid unused variable warnings
+            <% continue %>
+         %endif
 
-   % if op.output_size == 0:
-      ## For per-component instructions, we need to iterate over the
-      ## components and apply the constant expression one component
-      ## at a time.
-      for (unsigned _i = 0; _i < num_components; _i++) {
-         ## For each per-component input, create a variable srcN that
-         ## contains the value of the current (_i'th) component.
-         % for j in range(op.num_inputs):
-            % if op.input_sizes[j] != 0:
-               <% continue %>
-            % elif "src" + str(j) not in op.const_expr:
-               ## Avoid unused variable warnings
-               <% continue %>
-            % elif op.input_types[j] == "bool":
-               bool src${j} = _src[${j}].u[_i] != 0;
+         struct ${input_types[j]}_vec src${j} = {
+         % for k in range(op.input_sizes[j]):
+            % if input_types[j] == "bool32":
+               _src[${j}].u32[${k}] != 0,
             % else:
-               ${op.input_types[j]} src${j} = _src[${j}].${op.input_types[j][:1]}[_i];
+               _src[${j}].${get_const_field(input_types[j])}[${k}],
             % endif
          % endfor
+         };
+      % endfor
+
+      % if op.output_size == 0:
+         ## For per-component instructions, we need to iterate over the
+         ## components and apply the constant expression one component
+         ## at a time.
+         for (unsigned _i = 0; _i < num_components; _i++) {
+            ## For each per-component input, create a variable srcN that
+            ## contains the value of the current (_i'th) component.
+            % for j in range(op.num_inputs):
+               % if op.input_sizes[j] != 0:
+                  <% continue %>
+               % elif "src" + str(j) not in op.const_expr:
+                  ## Avoid unused variable warnings
+                  <% continue %>
+               % elif input_types[j] == "bool32":
+                  bool src${j} = _src[${j}].u32[_i] != 0;
+               % else:
+                  ${input_types[j]}_t src${j} =
+                     _src[${j}].${get_const_field(input_types[j])}[_i];
+               % endif
+            % endfor
+
+            ## Create an appropriately-typed variable dst and assign the
+            ## result of the const_expr to it.  If const_expr already contains
+            ## writes to dst, just include const_expr directly.
+            % if "dst" in op.const_expr:
+               ${output_type}_t dst;
+               ${op.const_expr}
+            % else:
+               ${output_type}_t dst = ${op.const_expr};
+            % endif
+
+            ## Store the current component of the actual destination to the
+            ## value of dst.
+            % if output_type == "bool32":
+               ## Sanitize the C value to a proper NIR bool
+               _dst_val.u32[_i] = dst ? NIR_TRUE : NIR_FALSE;
+            % else:
+               _dst_val.${get_const_field(output_type)}[_i] = dst;
+            % endif
+         }
+      % else:
+         ## In the non-per-component case, create a struct dst with
+         ## appropriately-typed elements x, y, z, and w and assign the result
+         ## of the const_expr to all components of dst, or include the
+         ## const_expr directly if it writes to dst already.
+         struct ${output_type}_vec dst;
 
-         ## Create an appropriately-typed variable dst and assign the
-         ## result of the const_expr to it.  If const_expr already contains
-         ## writes to dst, just include const_expr directly.
          % if "dst" in op.const_expr:
-            ${op.output_type} dst;
             ${op.const_expr}
          % else:
-            ${op.output_type} dst = ${op.const_expr};
+            ## Splat the value to all components.  This way expressions which
+            ## write the same value to all components don't need to explicitly
+            ## write to dest.  One such example is fnoise which has a
+            ## const_expr of 0.0f.
+            dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
          % endif
 
-         ## Store the current component of the actual destination to the
-         ## value of dst.
-         % if op.output_type == "bool":
-            ## Sanitize the C value to a proper NIR bool
-            _dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE;
-         % else:
-            _dst_val.${op.output_type[:1]}[_i] = dst;
-         % endif
-      }
-   % else:
-      ## In the non-per-component case, create a struct dst with
-      ## appropriately-typed elements x, y, z, and w and assign the result
-      ## of the const_expr to all components of dst, or include the
-      ## const_expr directly if it writes to dst already.
-      struct ${op.output_type}_vec dst;
-
-      % if "dst" in op.const_expr:
-         ${op.const_expr}
-      % else:
-         ## Splat the value to all components.  This way expressions which
-         ## write the same value to all components don't need to explicitly
-         ## write to dest.  One such example is fnoise which has a
-         ## const_expr of 0.0f.
-         dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
+         ## For each component in the destination, copy the value of dst to
+         ## the actual destination.
+         % for k in range(op.output_size):
+            % if output_type == "bool32":
+               ## Sanitize the C value to a proper NIR bool
+               _dst_val.u32[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
+            % else:
+               _dst_val.${get_const_field(output_type)}[${k}] = dst.${"xyzw"[k]};
+            % endif
+         % endfor
       % endif
 
-      ## For each component in the destination, copy the value of dst to
-      ## the actual destination.
-      % for k in range(op.output_size):
-         % if op.output_type == "bool":
-            ## Sanitize the C value to a proper NIR bool
-            _dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
-         % else:
-            _dst_val.${op.output_type[:1]}[${k}] = dst.${"xyzw"[k]};
-         % endif
-      % endfor
-   % endif
+      break;
+   }
+   % endfor
+
+   default:
+      unreachable("unknown bit width");
+   }
 
    return _dst_val;
 }
@@ -316,12 +385,12 @@ evaluate_${name}(unsigned num_components, nir_const_value *_src)
 
 nir_const_value
 nir_eval_const_opcode(nir_op op, unsigned num_components,
-                      nir_const_value *src)
+                      unsigned bit_width, nir_const_value *src)
 {
    switch (op) {
 % for name in sorted(opcodes.iterkeys()):
    case nir_op_${name}: {
-      return evaluate_${name}(num_components, src);
+      return evaluate_${name}(num_components, bit_width, src);
       break;
    }
 % endfor
@@ -333,4 +402,7 @@ nir_eval_const_opcode(nir_op op, unsigned num_components,
 from nir_opcodes import opcodes
 from mako.template import Template
 
-print Template(template).render(opcodes=opcodes)
+print Template(template).render(opcodes=opcodes, type_sizes=type_sizes,
+                                type_has_size=type_has_size,
+                                type_add_size=type_add_size,
+                                get_const_field=get_const_field)
diff --git a/src/compiler/nir/nir_from_ssa.c b/src/compiler/nir/nir_from_ssa.c
index 8bc9f24e406..82317c21b62 100644
--- a/src/compiler/nir/nir_from_ssa.c
+++ b/src/compiler/nir/nir_from_ssa.c
@@ -342,7 +342,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state)
          nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                                   nir_parallel_copy_entry);
          nir_ssa_dest_init(&pcopy->instr, &entry->dest,
-                           phi->dest.ssa.num_components, src->src.ssa->name);
+                           phi->dest.ssa.num_components,
+                           phi->dest.ssa.bit_size, src->src.ssa->name);
          exec_list_push_tail(&pcopy->entries, &entry->node);
 
          assert(src->src.is_ssa);
@@ -355,7 +356,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state)
       nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                                nir_parallel_copy_entry);
       nir_ssa_dest_init(&block_pcopy->instr, &entry->dest,
-                        phi->dest.ssa.num_components, phi->dest.ssa.name);
+                        phi->dest.ssa.num_components, phi->dest.ssa.bit_size,
+                        phi->dest.ssa.name);
       exec_list_push_tail(&block_pcopy->entries, &entry->node);
 
       nir_ssa_def_rewrite_uses(&phi->dest.ssa,
diff --git a/src/compiler/nir/nir_gs_count_vertices.c b/src/compiler/nir/nir_gs_count_vertices.c
index db15d160ee7..3c1bd2a59bd 100644
--- a/src/compiler/nir/nir_gs_count_vertices.c
+++ b/src/compiler/nir/nir_gs_count_vertices.c
@@ -77,13 +77,13 @@ nir_gs_count_vertices(const nir_shader *shader)
                return -1;
 
             if (count == -1)
-               count = val->i[0];
+               count = val->i32[0];
 
             /* We've found contradictory set_vertex_count intrinsics.
              * This can happen if there are early-returns in main() and
              * different paths emit different numbers of vertices.
              */
-            if (count != val->i[0])
+            if (count != val->i32[0])
                return -1;
          }
       }
diff --git a/src/compiler/nir/nir_instr_set.c b/src/compiler/nir/nir_instr_set.c
index 159ded0e72b..e244122e466 100644
--- a/src/compiler/nir/nir_instr_set.c
+++ b/src/compiler/nir/nir_instr_set.c
@@ -52,6 +52,7 @@ hash_alu(uint32_t hash, const nir_alu_instr *instr)
 {
    hash = HASH(hash, instr->op);
    hash = HASH(hash, instr->dest.dest.ssa.num_components);
+   /* We explicitly don't hash instr->dest.dest.exact */
 
    if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
       assert(nir_op_infos[instr->op].num_inputs == 2);
@@ -81,9 +82,9 @@ hash_load_const(uint32_t hash, const nir_load_const_instr *instr)
 {
    hash = HASH(hash, instr->def.num_components);
 
-   hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f,
+   hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32,
                                           instr->def.num_components
-                                             * sizeof(instr->value.f[0]));
+                                             * sizeof(instr->value.f32[0]));
 
    return hash;
 }
@@ -267,6 +268,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
       if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components)
          return false;
 
+      /* We explicitly don't hash instr->dest.dest.exact */
+
       if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
          assert(nir_op_infos[alu1->op].num_inputs == 2);
          return (nir_alu_srcs_equal(alu1, alu2, 0, 0) &&
@@ -322,8 +325,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
       if (load1->def.num_components != load2->def.num_components)
          return false;
 
-      return memcmp(load1->value.f, load2->value.f,
-                    load1->def.num_components * sizeof(*load2->value.f)) == 0;
+      return memcmp(load1->value.f32, load2->value.f32,
+                    load1->def.num_components * sizeof(*load2->value.f32)) == 0;
    }
    case nir_instr_type_phi: {
       nir_phi_instr *phi1 = nir_instr_as_phi(instr1);
@@ -496,8 +499,17 @@ nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr)
    struct set_entry *entry = _mesa_set_search(instr_set, instr);
    if (entry) {
       nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr);
-      nir_ssa_def *new_def =
-         nir_instr_get_dest_ssa_def((nir_instr *) entry->key);
+      nir_instr *match = (nir_instr *) entry->key;
+      nir_ssa_def *new_def = nir_instr_get_dest_ssa_def(match);
+
+      /* It's safe to replace a exact instruction with an inexact one as
+       * long as we make it exact.  If we got here, the two instructions are
+       * exactly identical in every other way so, once we've set the exact
+       * bit, they are the same.
+       */
+      if (instr->type == nir_instr_type_alu && nir_instr_as_alu(instr)->exact)
+         nir_instr_as_alu(match)->exact = true;
+
       nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def));
       return true;
    }
diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c
index 312d2f99a1c..e8ba640fe0b 100644
--- a/src/compiler/nir/nir_lower_alu_to_scalar.c
+++ b/src/compiler/nir/nir_lower_alu_to_scalar.c
@@ -31,9 +31,11 @@
  */
 
 static void
-nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components)
+nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components,
+                      unsigned bit_size)
 {
-   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+                     bit_size, NULL);
    instr->dest.write_mask = (1 << num_components) - 1;
 }
 
@@ -46,7 +48,7 @@ lower_reduction(nir_alu_instr *instr, nir_op chan_op, nir_op merge_op,
    nir_ssa_def *last = NULL;
    for (unsigned i = 0; i < num_components; i++) {
       nir_alu_instr *chan = nir_alu_instr_create(builder->shader, chan_op);
-      nir_alu_ssa_dest_init(chan, 1);
+      nir_alu_ssa_dest_init(chan, 1, instr->dest.dest.ssa.bit_size);
       nir_alu_src_copy(&chan->src[0], &instr->src[0], chan);
       chan->src[0].swizzle[0] = chan->src[0].swizzle[i];
       if (nir_op_infos[chan_op].num_inputs > 1) {
@@ -80,6 +82,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
    assert(instr->dest.write_mask != 0);
 
    b->cursor = nir_before_instr(&instr->instr);
+   b->exact = instr->exact;
 
 #define LOWER_REDUCTION(name, chan, merge) \
    case name##2: \
@@ -220,7 +223,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
             lower->src[i].swizzle[j] = instr->src[i].swizzle[src_chan];
       }
 
-      nir_alu_ssa_dest_init(lower, 1);
+      nir_alu_ssa_dest_init(lower, 1, instr->dest.dest.ssa.bit_size);
       lower->dest.saturate = instr->dest.saturate;
       comps[chan] = &lower->dest.dest.ssa;
 
diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c
index eefcb55a0a6..70381a7968a 100644
--- a/src/compiler/nir/nir_lower_atomics.c
+++ b/src/compiler/nir/nir_lower_atomics.c
@@ -75,7 +75,7 @@ lower_instr(nir_intrinsic_instr *instr,
       state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index);
 
    nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
-   offset_const->value.u[0] = instr->variables[0]->var->data.offset;
+   offset_const->value.u32[0] = instr->variables[0]->var->data.offset;
 
    nir_instr_insert_before(&instr->instr, &offset_const->instr);
 
@@ -90,17 +90,17 @@ lower_instr(nir_intrinsic_instr *instr,
       unsigned child_array_elements = tail->child != NULL ?
          glsl_get_aoa_size(tail->type) : 1;
 
-      offset_const->value.u[0] += deref_array->base_offset *
+      offset_const->value.u32[0] += deref_array->base_offset *
          child_array_elements * ATOMIC_COUNTER_SIZE;
 
       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
          nir_load_const_instr *atomic_counter_size =
                nir_load_const_instr_create(mem_ctx, 1);
-         atomic_counter_size->value.u[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
+         atomic_counter_size->value.u32[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
          nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr);
 
          nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul);
-         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL);
+         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
          mul->dest.write_mask = 0x1;
          nir_src_copy(&mul->src[0].src, &deref_array->indirect, mul);
          mul->src[1].src.is_ssa = true;
@@ -108,7 +108,7 @@ lower_instr(nir_intrinsic_instr *instr,
          nir_instr_insert_before(&instr->instr, &mul->instr);
 
          nir_alu_instr *add = nir_alu_instr_create(mem_ctx, nir_op_iadd);
-         nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL);
+         nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
          add->dest.write_mask = 0x1;
          add->src[0].src.is_ssa = true;
          add->src[0].src.ssa = &mul->dest.dest.ssa;
@@ -125,7 +125,7 @@ lower_instr(nir_intrinsic_instr *instr,
 
    if (instr->dest.is_ssa) {
       nir_ssa_dest_init(&new_instr->instr, &new_instr->dest,
-                        instr->dest.ssa.num_components, NULL);
+                        instr->dest.ssa.num_components, 32, NULL);
       nir_ssa_def_rewrite_uses(&instr->dest.ssa,
                                nir_src_for_ssa(&new_instr->dest.ssa));
    } else {
diff --git a/src/compiler/nir/nir_lower_clip.c b/src/compiler/nir/nir_lower_clip.c
index bcbad536874..c711230ad5b 100644
--- a/src/compiler/nir/nir_lower_clip.c
+++ b/src/compiler/nir/nir_lower_clip.c
@@ -88,7 +88,7 @@ load_clipdist_input(nir_builder *b, nir_variable *in, nir_ssa_def **val)
    load->num_components = 4;
    nir_intrinsic_set_base(load, in->data.driver_location);
    load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-   nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+   nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
    nir_builder_instr_insert(b, &load->instr);
 
    val[0] = nir_channel(b, &load->dest.ssa, 0);
diff --git a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c
index a4affa7bdcf..62b8c84a956 100644
--- a/src/compiler/nir/nir_lower_indirect_derefs.c
+++ b/src/compiler/nir/nir_lower_indirect_derefs.c
@@ -75,8 +75,9 @@ emit_indirect_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
       if (src == NULL) {
          /* We're a load.  We need to insert a phi node */
          nir_phi_instr *phi = nir_phi_instr_create(b->shader);
+         unsigned bit_size = then_dest->bit_size;
          nir_ssa_dest_init(&phi->instr, &phi->dest,
-                           then_dest->num_components, NULL);
+                           then_dest->num_components, bit_size, NULL);
 
          nir_phi_src *src0 = ralloc(phi, nir_phi_src);
          src0->pred = nir_cf_node_as_block(nir_if_last_then_node(if_stmt));
@@ -125,8 +126,9 @@ emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
       load->num_components = orig_instr->num_components;
       load->variables[0] =
          nir_deref_as_var(nir_copy_deref(load, &deref->deref));
+      unsigned bit_size = orig_instr->dest.ssa.bit_size;
       nir_ssa_dest_init(&load->instr, &load->dest,
-                        load->num_components, NULL);
+                        load->num_components, bit_size, NULL);
       nir_builder_instr_insert(b, &load->instr);
       *dest = &load->dest.ssa;
    } else {
diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c
index 9d502ee4e9c..a30061d3bf0 100644
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@@ -289,7 +289,8 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
          if (intrin->dest.is_ssa) {
             nir_ssa_dest_init(&load->instr, &load->dest,
-                              intrin->num_components, NULL);
+                              intrin->num_components,
+                              intrin->dest.ssa.bit_size, NULL);
             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                      nir_src_for_ssa(&load->dest.ssa));
          } else {
@@ -369,7 +370,8 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
          if (intrin->dest.is_ssa) {
             nir_ssa_dest_init(&atomic->instr, &atomic->dest,
-                              intrin->dest.ssa.num_components, NULL);
+                              intrin->dest.ssa.num_components,
+                              intrin->dest.ssa.bit_size, NULL);
             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                      nir_src_for_ssa(&atomic->dest.ssa));
          } else {
diff --git a/src/compiler/nir/nir_lower_load_const_to_scalar.c b/src/compiler/nir/nir_lower_load_const_to_scalar.c
index 1eeed13cbac..b5df46413f1 100644
--- a/src/compiler/nir/nir_lower_load_const_to_scalar.c
+++ b/src/compiler/nir/nir_lower_load_const_to_scalar.c
@@ -49,7 +49,7 @@ lower_load_const_instr_scalar(nir_load_const_instr *lower)
    nir_ssa_def *loads[4];
    for (unsigned i = 0; i < lower->def.num_components; i++) {
       nir_load_const_instr *load_comp = nir_load_const_instr_create(b.shader, 1);
-      load_comp->value.u[0] = lower->value.u[i];
+      load_comp->value.u32[0] = lower->value.u32[i];
       nir_builder_instr_insert(&b, &load_comp->instr);
       loads[i] = &load_comp->def;
    }
diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c
index 45036fa7787..0438802d3b2 100644
--- a/src/compiler/nir/nir_lower_locals_to_regs.c
+++ b/src/compiler/nir/nir_lower_locals_to_regs.c
@@ -161,7 +161,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
       if (src.reg.indirect) {
          nir_load_const_instr *load_const =
             nir_load_const_instr_create(state->shader, 1);
-         load_const->value.u[0] = glsl_get_length(parent_type);
+         load_const->value.u32[0] = glsl_get_length(parent_type);
          nir_instr_insert_before(instr, &load_const->instr);
 
          nir_alu_instr *mul = nir_alu_instr_create(state->shader, nir_op_imul);
@@ -169,7 +169,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
          mul->src[1].src.is_ssa = true;
          mul->src[1].src.ssa = &load_const->def;
          mul->dest.write_mask = 1;
-         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL);
+         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
          nir_instr_insert_before(instr, &mul->instr);
 
          src.reg.indirect->is_ssa = true;
@@ -187,7 +187,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
             add->src[0].src = *src.reg.indirect;
             nir_src_copy(&add->src[1].src, &deref_array->indirect, add);
             add->dest.write_mask = 1;
-            nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL);
+            nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
             nir_instr_insert_before(instr, &add->instr);
 
             src.reg.indirect->is_ssa = true;
@@ -221,7 +221,8 @@ lower_locals_to_regs_block(nir_block *block, void *void_state)
          mov->dest.write_mask = (1 << intrin->num_components) - 1;
          if (intrin->dest.is_ssa) {
             nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                              intrin->num_components, NULL);
+                              intrin->num_components,
+                              intrin->dest.ssa.bit_size, NULL);
             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                      nir_src_for_ssa(&mov->dest.dest.ssa));
          } else {
diff --git a/src/compiler/nir/nir_lower_phis_to_scalar.c b/src/compiler/nir/nir_lower_phis_to_scalar.c
index dd2abcf72f8..026c8665f91 100644
--- a/src/compiler/nir/nir_lower_phis_to_scalar.c
+++ b/src/compiler/nir/nir_lower_phis_to_scalar.c
@@ -188,6 +188,8 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
       if (!should_lower_phi(phi, state))
          continue;
 
+      unsigned bit_size = phi->dest.ssa.bit_size;
+
       /* Create a vecN operation to combine the results.  Most of these
        * will be redundant, but copy propagation should clean them up for
        * us.  No need to add the complexity here.
@@ -202,12 +204,14 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
 
       nir_alu_instr *vec = nir_alu_instr_create(state->mem_ctx, vec_op);
       nir_ssa_dest_init(&vec->instr, &vec->dest.dest,
-                        phi->dest.ssa.num_components, NULL);
+                        phi->dest.ssa.num_components,
+                        bit_size, NULL);
       vec->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
 
       for (unsigned i = 0; i < phi->dest.ssa.num_components; i++) {
          nir_phi_instr *new_phi = nir_phi_instr_create(state->mem_ctx);
-         nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1, NULL);
+         nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1,
+                           phi->dest.ssa.bit_size, NULL);
 
          vec->src[i].src = nir_src_for_ssa(&new_phi->dest.ssa);
 
@@ -215,7 +219,7 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
             /* We need to insert a mov to grab the i'th component of src */
             nir_alu_instr *mov = nir_alu_instr_create(state->mem_ctx,
                                                       nir_op_imov);
-            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, NULL);
+            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, bit_size, NULL);
             mov->dest.write_mask = 1;
             nir_src_copy(&mov->src[0].src, &src->src, state->mem_ctx);
             mov->src[0].swizzle[0] = i;
diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
index 79f6bedc990..c1cd1398aa1 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -65,9 +65,9 @@ convert_block(nir_block *block, void *void_state)
           */
 
          nir_const_value local_size;
-         local_size.u[0] = b->shader->info.cs.local_size[0];
-         local_size.u[1] = b->shader->info.cs.local_size[1];
-         local_size.u[2] = b->shader->info.cs.local_size[2];
+         local_size.u32[0] = b->shader->info.cs.local_size[0];
+         local_size.u32[1] = b->shader->info.cs.local_size[1];
+         local_size.u32[2] = b->shader->info.cs.local_size[2];
 
          nir_ssa_def *group_id =
             nir_load_system_value(b, nir_intrinsic_load_work_group_id, 0);
diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c
index 806acd8333c..4999603e592 100644
--- a/src/compiler/nir/nir_lower_tex.c
+++ b/src/compiler/nir/nir_lower_tex.c
@@ -140,7 +140,7 @@ get_texture_size(nir_builder *b, nir_tex_instr *tex)
    txs->src[0].src = nir_src_for_ssa(nir_imm_int(b, 0));
    txs->src[0].src_type = nir_tex_src_lod;
 
-   nir_ssa_dest_init(&txs->instr, &txs->dest, 2, NULL);
+   nir_ssa_dest_init(&txs->instr, &txs->dest, 2, 32, NULL);
    nir_builder_instr_insert(b, &txs->instr);
 
    return nir_i2f(b, &txs->dest.ssa);
@@ -223,13 +223,13 @@ get_zero_or_one(nir_builder *b, nir_alu_type type, uint8_t swizzle_val)
    memset(&v, 0, sizeof(v));
 
    if (swizzle_val == 4) {
-      v.u[0] = v.u[1] = v.u[2] = v.u[3] = 0;
+      v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 0;
    } else {
       assert(swizzle_val == 5);
       if (type == nir_type_float)
-         v.f[0] = v.f[1] = v.f[2] = v.f[3] = 1.0;
+         v.f32[0] = v.f32[1] = v.f32[2] = v.f32[3] = 1.0;
       else
-         v.u[0] = v.u[1] = v.u[2] = v.u[3] = 1;
+         v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 1;
    }
 
    return nir_build_imm(b, 4, v);
diff --git a/src/compiler/nir/nir_lower_two_sided_color.c b/src/compiler/nir/nir_lower_two_sided_color.c
index fe3507cb7a3..c7fb67e4f27 100644
--- a/src/compiler/nir/nir_lower_two_sided_color.c
+++ b/src/compiler/nir/nir_lower_two_sided_color.c
@@ -74,7 +74,7 @@ load_input(nir_builder *b, nir_variable *in)
    load->num_components = 4;
    nir_intrinsic_set_base(load, in->data.driver_location);
    load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-   nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+   nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
    nir_builder_instr_insert(b, &load->instr);
 
    return &load->dest.ssa;
diff --git a/src/compiler/nir/nir_lower_var_copies.c b/src/compiler/nir/nir_lower_var_copies.c
index 7db9839c369..c994f0fe12c 100644
--- a/src/compiler/nir/nir_lower_var_copies.c
+++ b/src/compiler/nir/nir_lower_var_copies.c
@@ -116,12 +116,15 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr,
       assert(src_tail->type == dest_tail->type);
 
       unsigned num_components = glsl_get_vector_elements(src_tail->type);
+      unsigned bit_size =
+         glsl_get_bit_size(glsl_get_base_type(src_tail->type));
 
       nir_intrinsic_instr *load =
          nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_load_var);
       load->num_components = num_components;
       load->variables[0] = nir_deref_as_var(nir_copy_deref(load, &src_head->deref));
-      nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, num_components, bit_size,
+                        NULL);
 
       nir_instr_insert_before(&copy_instr->instr, &load->instr);
 
diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c
index a3f3fcfd9b4..9f9e454c198 100644
--- a/src/compiler/nir/nir_lower_vars_to_ssa.c
+++ b/src/compiler/nir/nir_lower_vars_to_ssa.c
@@ -505,6 +505,7 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
             nir_ssa_undef_instr *undef =
                nir_ssa_undef_instr_create(state->shader,
                                           intrin->num_components);
+            undef->def.bit_size = intrin->dest.ssa.bit_size;
 
             nir_instr_insert_before(&intrin->instr, &undef->instr);
             nir_instr_remove(&intrin->instr);
@@ -528,7 +529,8 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
 
          mov->dest.write_mask = (1 << intrin->num_components) - 1;
          nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                           intrin->num_components, NULL);
+                           intrin->num_components,
+                           intrin->dest.ssa.bit_size, NULL);
 
          nir_instr_insert_before(&intrin->instr, &mov->instr);
          nir_instr_remove(&intrin->instr);
@@ -719,6 +721,7 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
       node->pb_value =
          nir_phi_builder_add_value(state.phi_builder,
                                    glsl_get_vector_elements(node->type),
+                                   glsl_get_bit_size(glsl_get_base_type(node->type)),
                                    store_blocks);
 
       if (node->deref->var->constant_initializer) {
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 60ade4a80ae..d6b658dbfc8 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -90,8 +90,12 @@ class Opcode(object):
 # helper variables for strings
 tfloat = "float"
 tint = "int"
-tbool = "bool"
+tbool = "bool32"
 tuint = "uint"
+tfloat32 = "float32"
+tint32 = "int32"
+tuint32 = "uint32"
+tfloat64 = "float64"
 
 commutative = "commutative "
 associative = "associative "
@@ -155,57 +159,57 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)")
 unop("fsqrt", tfloat, "sqrtf(src0)")
 unop("fexp2", tfloat, "exp2f(src0)")
 unop("flog2", tfloat, "log2f(src0)")
-unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion.
-unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion
-unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion.
+unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
+unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
+unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
 # Float-to-boolean conversion
-unop_convert("f2b", tbool, tfloat, "src0 != 0.0f")
+unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
 # Boolean-to-float conversion
-unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f")
+unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
 # Int-to-boolean conversion
-unop_convert("i2b", tbool, tint, "src0 != 0")
-unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
-unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion.
+unop_convert("i2b", tbool, tint32, "src0 != 0")
+unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
+unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
 
 # Unary floating-point rounding operations.
 
 
-unop("ftrunc", tfloat, "truncf(src0)")
-unop("fceil", tfloat, "ceilf(src0)")
-unop("ffloor", tfloat, "floorf(src0)")
-unop("ffract", tfloat, "src0 - floorf(src0)")
-unop("fround_even", tfloat, "_mesa_roundevenf(src0)")
+unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
+unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
+unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
+unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
+unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 
 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 
 # Trigonometric operations.
 
 
-unop("fsin", tfloat, "sinf(src0)")
-unop("fcos", tfloat, "cosf(src0)")
+unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
+unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
 
 
 # Partial derivatives.
 
 
-unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0.
-unop("fddy", tfloat, "0.0f")
-unop("fddx_fine", tfloat, "0.0f")
-unop("fddy_fine", tfloat, "0.0f")
-unop("fddx_coarse", tfloat, "0.0f")
-unop("fddy_coarse", tfloat, "0.0f")
+unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
+unop("fddy", tfloat, "0.0")
+unop("fddx_fine", tfloat, "0.0")
+unop("fddy_fine", tfloat, "0.0")
+unop("fddx_coarse", tfloat, "0.0")
+unop("fddy_coarse", tfloat, "0.0")
 
 
 # Floating point pack and unpack operations.
 
 def pack_2x16(fmt):
-   unop_horiz("pack_" + fmt + "_2x16", 1, tuint, 2, tfloat, """
+   unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 """.replace("fmt", fmt))
 
 def pack_4x8(fmt):
-   unop_horiz("pack_" + fmt + "_4x8", 1, tuint, 4, tfloat, """
+   unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
@@ -213,13 +217,13 @@ dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 """.replace("fmt", fmt))
 
 def unpack_2x16(fmt):
-   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tuint, """
+   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 """.replace("fmt", fmt))
 
 def unpack_4x8(fmt):
-   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tuint, """
+   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
@@ -238,11 +242,11 @@ unpack_2x16("unorm")
 unpack_4x8("unorm")
 unpack_2x16("half")
 
-unop_horiz("pack_uvec2_to_uint", 1, tuint, 2, tuint, """
+unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 dst.x = (src0.x & 0xffff) | (src0.y >> 16);
 """)
 
-unop_horiz("pack_uvec4_to_uint", 1, tuint, 4, tuint, """
+unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 dst.x = (src0.x <<  0) |
         (src0.y <<  8) |
         (src0.z << 16) |
@@ -252,22 +256,22 @@ dst.x = (src0.x <<  0) |
 # Lowered floating point unpacking operations.
 
 
-unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tuint,
+unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
            "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
-unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tuint,
+unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
            "unpack_half_1x16((uint16_t)(src0.x >> 16))")
 
 
 # Bit operations, part of ARB_gpu_shader5.
 
 
-unop("bitfield_reverse", tuint, """
+unop("bitfield_reverse", tuint32, """
 /* we're not winning any awards for speed here, but that's ok */
 dst = 0;
 for (unsigned bit = 0; bit < 32; bit++)
    dst |= ((src0 >> bit) & 1) << (31 - bit);
 """)
-unop("bit_count", tuint, """
+unop("bit_count", tuint32, """
 dst = 0;
 for (unsigned bit = 0; bit < 32; bit++) {
    if ((src0 >> bit) & 1)
@@ -275,7 +279,7 @@ for (unsigned bit = 0; bit < 32; bit++) {
 }
 """)
 
-unop_convert("ufind_msb", tint, tuint, """
+unop_convert("ufind_msb", tint32, tuint32, """
 dst = -1;
 for (int bit = 31; bit > 0; bit--) {
    if ((src0 >> bit) & 1) {
@@ -285,7 +289,7 @@ for (int bit = 31; bit > 0; bit--) {
 }
 """)
 
-unop("ifind_msb", tint, """
+unop("ifind_msb", tint32, """
 dst = -1;
 for (int bit = 31; bit >= 0; bit--) {
    /* If src0 < 0, we're looking for the first 0 bit.
@@ -299,7 +303,7 @@ for (int bit = 31; bit >= 0; bit--) {
 }
 """)
 
-unop("find_lsb", tint, """
+unop("find_lsb", tint32, """
 dst = -1;
 for (unsigned bit = 0; bit < 32; bit++) {
    if ((src0 >> bit) & 1) {
@@ -359,10 +363,10 @@ binop("fmul", tfloat, commutative + associative, "src0 * src1")
 # low 32-bits of signed/unsigned integer multiply
 binop("imul", tint, commutative + associative, "src0 * src1")
 # high 32-bits of signed integer multiply
-binop("imul_high", tint, commutative,
+binop("imul_high", tint32, commutative,
       "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
 # high 32-bits of unsigned integer multiply
-binop("umul_high", tuint, commutative,
+binop("umul_high", tuint32, commutative,
       "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
 
 binop("fdiv", tfloat, "", "src0 / src1")
@@ -427,18 +431,18 @@ binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
 
 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 
-binop_reduce("fall_equal",  1, tfloat, tfloat, "{src0} == {src1}",
+binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
-binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}",
+binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 
 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 # and false respectively
 
-binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
-binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
-binop("seq", tfloat, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
-binop("sne", tfloat, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
+binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
+binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
+binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
+binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 
 
 binop("ishl", tint, "", "src0 << src1")
@@ -461,11 +465,11 @@ binop("ixor", tuint, commutative + associative, "src0 ^ src1")
 # These use (src != 0.0) for testing the truth of the input, and output 1.0
 # for true and 0.0 for false
 
-binop("fand", tfloat, commutative,
+binop("fand", tfloat32, commutative,
       "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
-binop("for", tfloat, commutative,
+binop("for", tfloat32, commutative,
       "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
-binop("fxor", tfloat, commutative,
+binop("fxor", tfloat32, commutative,
       "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
 
 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
@@ -487,7 +491,7 @@ binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
 binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 
 # Saturated vector add for 4 8bit ints.
-binop("usadd_4x8", tint, commutative + associative, """
+binop("usadd_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
@@ -495,7 +499,7 @@ for (int i = 0; i < 32; i += 8) {
 """)
 
 # Saturated vector subtract for 4 8bit ints.
-binop("ussub_4x8", tint, "", """
+binop("ussub_4x8", tint32, "", """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    int src0_chan = (src0 >> i) & 0xff;
@@ -506,7 +510,7 @@ for (int i = 0; i < 32; i += 8) {
 """)
 
 # vector min for 4 8bit ints.
-binop("umin_4x8", tint, commutative + associative, """
+binop("umin_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
@@ -514,7 +518,7 @@ for (int i = 0; i < 32; i += 8) {
 """)
 
 # vector max for 4 8bit ints.
-binop("umax_4x8", tint, commutative + associative, """
+binop("umax_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
@@ -522,7 +526,7 @@ for (int i = 0; i < 32; i += 8) {
 """)
 
 # unorm multiply: (a * b) / 255.
-binop("umul_unorm_4x8", tint, commutative + associative, """
+binop("umul_unorm_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    int src0_chan = (src0 >> i) & 0xff;
@@ -531,15 +535,15 @@ for (int i = 0; i < 32; i += 8) {
 }
 """)
 
-binop("fpow", tfloat, "", "powf(src0, src1)")
+binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 
-binop_horiz("pack_half_2x16_split", 1, tuint, 1, tfloat, 1, tfloat,
+binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 
 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
 # if either of its arguments are 32.
-binop_convert("bfm", tuint, tint, "", """
+binop_convert("bfm", tuint32, tint32, "", """
 int bits = src0, offset = src1;
 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
    dst = 0; /* undefined */
@@ -548,7 +552,7 @@ else
 """)
 
 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
-dst = ldexpf(src0, src1);
+dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 /* flush denormals to zero. */
 if (!isnormal(dst))
    dst = copysignf(0.0f, src0);
@@ -588,12 +592,12 @@ triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 
 
-triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2")
+triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
 opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")
 
 # SM5 bfi assembly
-triop("bfi", tuint, """
+triop("bfi", tuint32, """
 unsigned mask = src0, insert = src1, base = src2;
 if (mask == 0) {
    dst = base;
@@ -608,8 +612,8 @@ if (mask == 0) {
 """)
 
 # SM5 ubfe/ibfe assembly
-opcode("ubfe", 0, tuint,
-       [0, 0, 0], [tuint, tint, tint], "", """
+opcode("ubfe", 0, tuint32,
+       [0, 0, 0], [tuint32, tint32, tint32], "", """
 unsigned base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -622,8 +626,8 @@ if (bits == 0) {
    dst = base >> offset;
 }
 """)
-opcode("ibfe", 0, tint,
-       [0, 0, 0], [tint, tint, tint], "", """
+opcode("ibfe", 0, tint32,
+       [0, 0, 0], [tint32, tint32, tint32], "", """
 int base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -638,8 +642,8 @@ if (bits == 0) {
 """)
 
 # GLSL bitfieldExtract()
-opcode("ubitfield_extract", 0, tuint,
-       [0, 0, 0], [tuint, tint, tint], "", """
+opcode("ubitfield_extract", 0, tuint32,
+       [0, 0, 0], [tuint32, tint32, tint32], "", """
 unsigned base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -650,8 +654,8 @@ if (bits == 0) {
    dst = (base >> offset) & ((1ull << bits) - 1);
 }
 """)
-opcode("ibitfield_extract", 0, tint,
-       [0, 0, 0], [tint, tint, tint], "", """
+opcode("ibitfield_extract", 0, tint32,
+       [0, 0, 0], [tint32, tint32, tint32], "", """
 int base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -678,8 +682,8 @@ def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
           [tuint, tuint, tuint, tuint],
           "", const_expr)
 
-opcode("bitfield_insert", 0, tuint, [0, 0, 0, 0],
-       [tuint, tuint, tint, tint], "", """
+opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
+       [tuint32, tuint32, tint32, tint32], "", """
 unsigned base = src0, insert = src1;
 int offset = src2, bits = src3;
 if (bits == 0) {
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 54f7d86843a..ed21c5d5c8c 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -35,10 +35,17 @@ d = 'd'
 
 # Written in the form (<search>, <replace>) where <search> is an expression
 # and <replace> is either an expression or a value.  An expression is
-# defined as a tuple of the form (<op>, <src0>, <src1>, <src2>, <src3>)
+# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
 # where each source is either an expression or a value.  A value can be
 # either a numeric constant or a string representing a variable name.
 #
+# If the opcode in a search expression is prefixed by a '~' character, this
+# indicates that the operation is inexact.  Such operations will only get
+# applied to SSA values that do not have the exact bit set.  This should be
+# used by by any optimizations that are not bit-for-bit exact.  It should not,
+# however, be used for backend-requested lowering operations as those need to
+# happen regardless of precision.
+#
 # Variable names are specified as "[#]name[@type]" where "#" inicates that
 # the given variable will only match constants and the type indicates that
 # the given variable will only match values from ALU instructions with the
@@ -55,19 +62,19 @@ optimizations = [
    (('fabs', ('fneg', a)), ('fabs', a)),
    (('iabs', ('iabs', a)), ('iabs', a)),
    (('iabs', ('ineg', a)), ('iabs', a)),
-   (('fadd', a, 0.0), a),
+   (('~fadd', a, 0.0), a),
    (('iadd', a, 0), a),
    (('usadd_4x8', a, 0), a),
    (('usadd_4x8', a, ~0), ~0),
-   (('fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
+   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
    (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
-   (('fadd', ('fneg', a), a), 0.0),
+   (('~fadd', ('fneg', a), a), 0.0),
    (('iadd', ('ineg', a), a), 0),
    (('iadd', ('ineg', a), ('iadd', a, b)), b),
    (('iadd', a, ('iadd', ('ineg', a), b)), b),
-   (('fadd', ('fneg', a), ('fadd', a, b)), b),
-   (('fadd', a, ('fadd', ('fneg', a), b)), b),
-   (('fmul', a, 0.0), 0.0),
+   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
+   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
+   (('~fmul', a, 0.0), 0.0),
    (('imul', a, 0), 0),
    (('umul_unorm_4x8', a, 0), 0),
    (('umul_unorm_4x8', a, ~0), a),
@@ -76,32 +83,48 @@ optimizations = [
    (('fmul', a, -1.0), ('fneg', a)),
    (('imul', a, -1), ('ineg', a)),
    (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
-   (('ffma', 0.0, a, b), b),
-   (('ffma', a, 0.0, b), b),
-   (('ffma', a, b, 0.0), ('fmul', a, b)),
+   (('~ffma', 0.0, a, b), b),
+   (('~ffma', a, 0.0, b), b),
+   (('~ffma', a, b, 0.0), ('fmul', a, b)),
    (('ffma', a, 1.0, b), ('fadd', a, b)),
    (('ffma', 1.0, a, b), ('fadd', a, b)),
-   (('flrp', a, b, 0.0), a),
-   (('flrp', a, b, 1.0), b),
-   (('flrp', a, a, b), a),
-   (('flrp', 0.0, a, b), ('fmul', a, b)),
+   (('~flrp', a, b, 0.0), a),
+   (('~flrp', a, b, 1.0), b),
+   (('~flrp', a, a, b), a),
+   (('~flrp', 0.0, a, b), ('fmul', a, b)),
+   (('~flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp'),
    (('flrp', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp'),
    (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
-   (('fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp'),
-   (('fadd', a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
+   (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp'),
+   (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg',         c ))), ('fmul', b,         c )), ('flrp', a, b, c), '!options->lower_flrp'),
+   (('~fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp'),
+   (('~fadd', a, ('fmul',         c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
    (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
-   (('fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
+   (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
    # Comparison simplifications
-   (('inot', ('flt', a, b)), ('fge', a, b)),
-   (('inot', ('fge', a, b)), ('flt', a, b)),
-   (('inot', ('feq', a, b)), ('fne', a, b)),
-   (('inot', ('fne', a, b)), ('feq', a, b)),
+   (('~inot', ('flt', a, b)), ('fge', a, b)),
+   (('~inot', ('fge', a, b)), ('flt', a, b)),
+   (('~inot', ('feq', a, b)), ('fne', a, b)),
+   (('~inot', ('fne', a, b)), ('feq', a, b)),
    (('inot', ('ilt', a, b)), ('ige', a, b)),
    (('inot', ('ige', a, b)), ('ilt', a, b)),
    (('inot', ('ieq', a, b)), ('ine', a, b)),
    (('inot', ('ine', a, b)), ('ieq', a, b)),
+
+   # 0.0 >= b2f(a)
+   # b2f(a) <= 0.0
+   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
+   # inot(a)
+   (('fge', 0.0, ('b2f', a)), ('inot', a)),
+
+   # 0.0 < fabs(a)
+   # fabs(a) > 0.0
+   # fabs(a) != 0.0 because fabs(a) must be >= 0
+   # a != 0.0
+   (('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
+
    (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
-   (('bcsel', ('flt', a, b), a, b), ('fmin', a, b)),
+   (('bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
    (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
    (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)),
    (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
@@ -111,15 +134,19 @@ optimizations = [
    (('imax', a, a), a),
    (('umin', a, a), a),
    (('umax', a, a), a),
-   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
-   (('fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
+   (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
+   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
    (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
    (('fsat', ('fsat', a)), ('fsat', a)),
    (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)),
-   (('ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
-   (('ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
-   (('ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
-   (('ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
+   (('~ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
+   (('~ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
+   (('~ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
+   (('~ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
+   (('fabs', ('slt', a, b)), ('slt', a, b)),
+   (('fabs', ('sge', a, b)), ('sge', a, b)),
+   (('fabs', ('seq', a, b)), ('seq', a, b)),
+   (('fabs', ('sne', a, b)), ('sne', a, b)),
    (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
    (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
    (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
@@ -151,7 +178,6 @@ optimizations = [
    (('ior', a, 0), a),
    (('fxor', a, a), 0.0),
    (('ixor', a, a), 0),
-   (('fxor', a, 0.0), a),
    (('ixor', a, 0), a),
    (('inot', ('inot', a)), a),
    # DeMorgan's Laws
@@ -167,35 +193,35 @@ optimizations = [
    (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)),
    (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)),
    # Exponential/logarithmic identities
-   (('fexp2', ('flog2', a)), a), # 2^lg2(a) = a
-   (('flog2', ('fexp2', a)), a), # lg2(2^a) = a
+   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
+   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
    (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
-   (('fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
-   (('fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
-    ('fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d
-   (('fpow', a, 1.0), a),
-   (('fpow', a, 2.0), ('fmul', a, a)),
-   (('fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
-   (('fpow', 2.0, a), ('fexp2', a)),
-   (('fpow', ('fpow', a, 2.2), 0.454545), a),
-   (('fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
-   (('fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
-   (('frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
-   (('frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
-   (('flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
-   (('flog2', ('frcp', a)), ('fneg', ('flog2', a))),
-   (('flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
-   (('flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
-   (('fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))),
-   (('fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),
-   (('fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
+   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
+   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
+    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d
+   (('~fpow', a, 1.0), a),
+   (('~fpow', a, 2.0), ('fmul', a, a)),
+   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
+   (('~fpow', 2.0, a), ('fexp2', a)),
+   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
+   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
+   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
+   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
+   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
+   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
+   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
+   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
+   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
+   (('~fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))),
+   (('~fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),
+   (('~fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
    # Division and reciprocal
-   (('fdiv', 1.0, a), ('frcp', a)),
+   (('~fdiv', 1.0, a), ('frcp', a)),
    (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
-   (('frcp', ('frcp', a)), a),
-   (('frcp', ('fsqrt', a)), ('frsq', a)),
+   (('~frcp', ('frcp', a)), a),
+   (('~frcp', ('fsqrt', a)), ('frsq', a)),
    (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
-   (('frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
+   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
    # Boolean simplifications
    (('ieq', 'a@bool', True), a),
    (('ine', 'a@bool', True), ('inot', a)),
@@ -216,6 +242,10 @@ optimizations = [
    (('i2b', ('b2i', a)), a),
    (('f2i', ('ftrunc', a)), ('f2i', a)),
    (('f2u', ('ftrunc', a)), ('f2u', a)),
+   (('i2b', ('ineg', a)), ('i2b', a)),
+   (('i2b', ('iabs', a)), ('i2b', a)),
+   (('fabs', ('b2f', a)), ('b2f', a)),
+   (('iabs', ('b2i', a)), ('b2i', a)),
 
    # Byte extraction
    (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
@@ -228,7 +258,7 @@ optimizations = [
    (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
 
    # Subtracts
-   (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
+   (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
    (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
    (('ussub_4x8', a, 0), a),
    (('ussub_4x8', a, ~0), 0),
@@ -236,7 +266,7 @@ optimizations = [
    (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
    (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
    (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
-   (('fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
+   (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
    (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
    (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
    (('iabs', ('isub', 0, a)), ('iabs', a)),
@@ -368,10 +398,13 @@ for op in ['flt', 'fge', 'feq', 'fne',
 # they help code generation but do not necessarily produce code that is
 # more easily optimizable.
 late_optimizations = [
+   # Most of these optimizations aren't quite safe when you get infinity or
+   # Nan involved but the first one should be fine.
    (('flt', ('fadd', a, b), 0.0), ('flt', a, ('fneg', b))),
-   (('fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
-   (('feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
-   (('fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
+   (('~fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
+   (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
+   (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
+
    (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
    (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
    (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c
index 04876a42fd7..e64ca369bbc 100644
--- a/src/compiler/nir/nir_opt_constant_folding.c
+++ b/src/compiler/nir/nir_opt_constant_folding.c
@@ -46,10 +46,28 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
    if (!instr->dest.dest.is_ssa)
       return false;
 
+   /* In the case that any outputs/inputs have unsized types, then we need to
+    * guess the bit-size. In this case, the validator ensures that all
+    * bit-sizes match so we can just take the bit-size from first
+    * output/input with an unsized type. If all the outputs/inputs are sized
+    * then we don't need to guess the bit-size at all because the code we
+    * generate for constant opcodes in this case already knows the sizes of
+    * the types involved and does not need the provided bit-size for anything
+    * (although it still requires to receive a valid bit-size).
+    */
+   unsigned bit_size = 0;
+   if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type))
+      bit_size = instr->dest.dest.ssa.bit_size;
+
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       if (!instr->src[i].src.is_ssa)
          return false;
 
+      if (bit_size == 0 &&
+          !nir_alu_type_get_type_size(nir_op_infos[instr->op].input_sizes[i])) {
+         bit_size = instr->src[i].src.ssa->bit_size;
+      }
+
       nir_instr *src_instr = instr->src[i].src.ssa->parent_instr;
 
       if (src_instr->type != nir_instr_type_load_const)
@@ -58,24 +76,31 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
 
       for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(instr, i);
            j++) {
-         src[i].u[j] = load_const->value.u[instr->src[i].swizzle[j]];
+         if (load_const->def.bit_size == 64)
+            src[i].u64[j] = load_const->value.u64[instr->src[i].swizzle[j]];
+         else
+            src[i].u32[j] = load_const->value.u32[instr->src[i].swizzle[j]];
       }
 
       /* We shouldn't have any source modifiers in the optimization loop. */
       assert(!instr->src[i].abs && !instr->src[i].negate);
    }
 
+   if (bit_size == 0)
+      bit_size = 32;
+
    /* We shouldn't have any saturate modifiers in the optimization loop. */
    assert(!instr->dest.saturate);
 
    nir_const_value dest =
       nir_eval_const_opcode(instr->op, instr->dest.dest.ssa.num_components,
-                            src);
+                            bit_size, src);
 
    nir_load_const_instr *new_instr =
       nir_load_const_instr_create(mem_ctx,
                                   instr->dest.dest.ssa.num_components);
 
+   new_instr->def.bit_size = instr->dest.dest.ssa.bit_size;
    new_instr->value = dest;
 
    nir_instr_insert_before(&instr->instr, &new_instr->instr);
@@ -106,7 +131,7 @@ constant_fold_deref(nir_instr *instr, nir_deref_var *deref)
          nir_load_const_instr *indirect =
             nir_instr_as_load_const(arr->indirect.ssa->parent_instr);
 
-         arr->base_offset += indirect->value.u[0];
+         arr->base_offset += indirect->value.u32[0];
 
          /* Clear out the source */
          nir_instr_rewrite_src(instr, &arr->indirect, nir_src_for_ssa(NULL));
diff --git a/src/compiler/nir/nir_opt_dead_cf.c b/src/compiler/nir/nir_opt_dead_cf.c
index 4cc6798702b..4658b23c57b 100644
--- a/src/compiler/nir/nir_opt_dead_cf.c
+++ b/src/compiler/nir/nir_opt_dead_cf.c
@@ -228,7 +228,7 @@ dead_cf_block(nir_block *block)
      if (!const_value)
         return false;
 
-      opt_constant_if(following_if, const_value->u[0] != 0);
+      opt_constant_if(following_if, const_value->u32[0] != 0);
       return true;
    }
 
diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c
index 0fc658df861..bad9dc457ad 100644
--- a/src/compiler/nir/nir_opt_peephole_select.c
+++ b/src/compiler/nir/nir_opt_peephole_select.c
@@ -210,7 +210,8 @@ nir_opt_peephole_select_block(nir_block *block, void *void_state)
       }
 
       nir_ssa_dest_init(&sel->instr, &sel->dest.dest,
-                        phi->dest.ssa.num_components, phi->dest.ssa.name);
+                        phi->dest.ssa.num_components,
+                        phi->dest.ssa.bit_size, phi->dest.ssa.name);
       sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
 
       nir_ssa_def_rewrite_uses(&phi->dest.ssa,
diff --git a/src/compiler/nir/nir_phi_builder.c b/src/compiler/nir/nir_phi_builder.c
index 5429083e5c8..a39e3606fd5 100644
--- a/src/compiler/nir/nir_phi_builder.c
+++ b/src/compiler/nir/nir_phi_builder.c
@@ -52,6 +52,7 @@ struct nir_phi_builder_value {
 
    /* Needed so we can create phis and undefs */
    unsigned num_components;
+   unsigned bit_size;
 
    /* The list of phi nodes associated with this value.  Phi nodes are not
     * added directly.  Instead, they are created, the instr->block pointer
@@ -61,8 +62,18 @@ struct nir_phi_builder_value {
     */
    struct exec_list phis;
 
-   /* Array of SSA defs, indexed by block.  If a phi needs to be inserted
-    * in a given block, it will have the magic value NEEDS_PHI.
+   /* Array of SSA defs, indexed by block.  For each block, this array has has
+    * one of three types of values:
+    *
+    *  - NULL. Indicates that there is no known definition in this block.  If
+    *    you need to find one, look at the block's immediate dominator.
+    *
+    *  - NEEDS_PHI. Indicates that the block may need a phi node but none has
+    *    been created yet.  If a def is requested for a block, a phi will need
+    *    to be created.
+    *
+    *  - A regular SSA def.  This will be either the result of a phi node or
+    *    one of the defs provided by nir_phi_builder_value_set_blocK_def().
     */
    nir_ssa_def *defs[0];
 };
@@ -101,7 +112,7 @@ nir_phi_builder_create(nir_function_impl *impl)
 
 struct nir_phi_builder_value *
 nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
-                          const BITSET_WORD *defs)
+                          unsigned bit_size, const BITSET_WORD *defs)
 {
    struct nir_phi_builder_value *val;
    unsigned i, w_start = 0, w_end = 0;
@@ -109,6 +120,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
    val = rzalloc_size(pb, sizeof(*val) + sizeof(val->defs[0]) * pb->num_blocks);
    val->builder = pb;
    val->num_components = num_components;
+   val->bit_size = bit_size;
    exec_list_make_empty(&val->phis);
    exec_list_push_tail(&pb->values, &val->node);
 
@@ -127,8 +139,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
       set_foreach(cur->dom_frontier, dom_entry) {
          nir_block *next = (nir_block *) dom_entry->key;
 
-         /*
-          * If there's more than one return statement, then the end block
+         /* If there's more than one return statement, then the end block
           * can be a join point for some definitions. However, there are
           * no instructions in the end block, so nothing would use those
           * phi nodes. Of course, we couldn't place those phi nodes
@@ -139,6 +150,10 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
             continue;
 
          if (val->defs[next->index] == NULL) {
+            /* Instead of creating a phi node immediately, we simply set the
+             * value to the magic value NEEDS_PHI.  Later, we create phi nodes
+             * on demand in nir_phi_builder_value_get_block_def().
+             */
             val->defs[next->index] = NEEDS_PHI;
 
             if (pb->work[next->index] < pb->iter_count) {
@@ -163,7 +178,9 @@ nir_ssa_def *
 nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
                                     nir_block *block)
 {
+   /* For each block, we have one of three types of values */
    if (val->defs[block->index] == NULL) {
+      /* NULL indicates that we have no SSA def for this block. */
       if (block->imm_dom) {
          /* Grab it from our immediate dominator.  We'll stash it here for
           * easy access later.
@@ -185,17 +202,36 @@ nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
          return &undef->def;
       }
    } else if (val->defs[block->index] == NEEDS_PHI) {
-      /* If we need a phi instruction, go ahead and create one but don't
-       * add it to the program yet.  Later, we'll go through and set up phi
-       * sources and add the instructions will be added at that time.
+      /* The magic value NEEDS_PHI indicates that the block needs a phi node
+       * but none has been created.  We need to create one now so we can
+       * return it to the caller.
+       *
+       * Because a phi node may use SSA defs that it does not dominate (this
+       * happens in loops), we do not yet have enough information to fully
+       * fill out the phi node.  Instead, the phi nodes we create here will be
+       * empty (have no sources) and won't actually be placed in the block's
+       * instruction list yet.  Later, in nir_phi_builder_finish(), we walk
+       * over all of the phi instructions, fill out the sources lists, and
+       * place them at the top of their respective block's instruction list.
+       *
+       * Creating phi nodes on-demand allows us to avoid creating dead phi
+       * nodes that will just get deleted later. While this probably isn't a
+       * big win for a full into-SSA pass, other users may use the phi builder
+       * to make small SSA form repairs where most of the phi nodes will never
+       * be used.
        */
       nir_phi_instr *phi = nir_phi_instr_create(val->builder->shader);
-      nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components, NULL);
+      nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components,
+                        val->bit_size, NULL);
       phi->instr.block = block;
       exec_list_push_tail(&val->phis, &phi->instr.node);
       val->defs[block->index] = &phi->dest.ssa;
       return &phi->dest.ssa;
    } else {
+      /* In this case, we have an actual SSA def.  It's either the result of a
+       * phi node created by the case above or one passed to us through
+       * nir_phi_builder_value_set_block_def().
+       */
       return val->defs[block->index];
    }
 }
@@ -216,9 +252,14 @@ nir_phi_builder_finish(struct nir_phi_builder *pb)
    NIR_VLA(nir_block *, preds, num_blocks);
 
    foreach_list_typed(struct nir_phi_builder_value, val, node, &pb->values) {
-      /* We can't iterate over the list of phis normally because we are
-       * removing them as we go and, in some cases, adding new phis as we
-       * build the source lists of others.
+      /* We treat the linked list of phi nodes like a worklist.  The list is
+       * pre-populated by calls to nir_phi_builder_value_get_block_def() that
+       * create phi nodes.  As we fill in the sources of phi nodes, more may
+       * be created and are added to the end of the list.
+       *
+       * Because we are adding and removing phi nodes from the list as we go,
+       * we can't iterate over it normally.  Instead, we just iterate until
+       * the list is empty.
        */
       while (!exec_list_is_empty(&val->phis)) {
          struct exec_node *head = exec_list_get_head(&val->phis);
diff --git a/src/compiler/nir/nir_phi_builder.h b/src/compiler/nir/nir_phi_builder.h
index 50251bf1ba3..edc530268c2 100644
--- a/src/compiler/nir/nir_phi_builder.h
+++ b/src/compiler/nir/nir_phi_builder.h
@@ -25,7 +25,38 @@
 
 #include "nir.h"
 
+/** A helper for placing phi nodes in a NIR shader
+ *
+ * Basic usage goes something like this:
+ *
+ *     each variable, var, has:
+ *         a bitset var.defs of blocks where the variable is defined
+ *         a struct nir_phi_builder_value *pb_val
+ *
+ *     // initialize bitsets
+ *     foreach block:
+ *         foreach def of variable var:
+ *             var.defs[def.block] = true;
+ *
+ *     // initialize phi builder
+ *     pb = nir_phi_builder_create()
+ *     foreach var:
+ *         var.pb_val = nir_phi_builder_add_value(pb, var.defs)
+ *
+ *     // Visit each block.  This needs to visit dominators first;
+ *     // nir_for_each_block() will be ok.
+ *     foreach block:
+ *         foreach instruction:
+ *             foreach use of variable var:
+ *                 replace use with nir_phi_builder_get_block_def(var.pb_val)
+ *             foreach def of variable var:
+ *                 create ssa def, register with
+ *     nir_phi_builder_set_block_def(var.pb_val)
+ *
+ *     nir_phi_builder_finish(pb)
+ */
 struct nir_phi_builder;
+
 struct nir_phi_builder_value;
 
 /* Create a new phi builder.
@@ -43,7 +74,7 @@ struct nir_phi_builder *nir_phi_builder_create(nir_function_impl *impl);
  */
 struct nir_phi_builder_value *
 nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
-                          const BITSET_WORD *defs);
+                          unsigned bit_size, const BITSET_WORD *defs);
 
 /* Register a definition for the given value and block.
  *
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index 24d5281ec54..60b74d13dc7 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -207,6 +207,8 @@ print_alu_instr(nir_alu_instr *instr, print_state *state)
    print_alu_dest(&instr->dest, state);
 
    fprintf(fp, " = %s", nir_op_infos[instr->op].name);
+   if (instr->exact)
+      fprintf(fp, "!");
    if (instr->dest.saturate)
       fprintf(fp, ".sat");
    fprintf(fp, " ");
@@ -714,7 +716,7 @@ print_load_const_instr(nir_load_const_instr *instr, print_state *state)
        * and then print the float in a comment for readability.
        */
 
-      fprintf(fp, "0x%08x /* %f */", instr->value.u[i], instr->value.f[i]);
+      fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]);
    }
 
    fprintf(fp, ")");
diff --git a/src/compiler/nir/nir_repair_ssa.c b/src/compiler/nir/nir_repair_ssa.c
index 3ab4f0f6db7..96c791cbc6b 100644
--- a/src/compiler/nir/nir_repair_ssa.c
+++ b/src/compiler/nir/nir_repair_ssa.c
@@ -85,7 +85,8 @@ repair_ssa_def(nir_ssa_def *def, void *void_state)
    BITSET_SET(state->def_set, def->parent_instr->block->index);
 
    struct nir_phi_builder_value *val =
-      nir_phi_builder_add_value(pb, def->num_components, state->def_set);
+      nir_phi_builder_add_value(pb, def->num_components, def->bit_size,
+                                state->def_set);
 
    nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def);
 
diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index 56d7e8162f3..6e630631453 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -62,7 +62,8 @@ alu_instr_is_bool(nir_alu_instr *instr)
    case nir_op_inot:
       return src_is_bool(instr->src[0].src);
    default:
-      return nir_op_infos[instr->op].output_type == nir_type_bool;
+      return (nir_alu_type_get_base_type(nir_op_infos[instr->op].output_type)
+             == nir_type_bool);
    }
 }
 
@@ -125,8 +126,10 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
             nir_alu_instr *src_alu =
                nir_instr_as_alu(instr->src[src].src.ssa->parent_instr);
 
-            if (nir_op_infos[src_alu->op].output_type != var->type &&
-                !(var->type == nir_type_bool && alu_instr_is_bool(src_alu)))
+            if (nir_alu_type_get_base_type(nir_op_infos[src_alu->op].output_type) !=
+                var->type &&
+                !(nir_alu_type_get_base_type(var->type) == nir_type_bool &&
+                  alu_instr_is_bool(src_alu)))
                return false;
          }
 
@@ -158,21 +161,65 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
       nir_load_const_instr *load =
          nir_instr_as_load_const(instr->src[src].src.ssa->parent_instr);
 
-      switch (nir_op_infos[instr->op].input_types[src]) {
+      switch (const_val->type) {
       case nir_type_float:
          for (unsigned i = 0; i < num_components; ++i) {
-            if (load->value.f[new_swizzle[i]] != const_val->data.f)
+            double val;
+            switch (load->def.bit_size) {
+            case 32:
+               val = load->value.f32[new_swizzle[i]];
+               break;
+            case 64:
+               val = load->value.f64[new_swizzle[i]];
+               break;
+            default:
+               unreachable("unknown bit size");
+            }
+
+            if (val != const_val->data.d)
                return false;
          }
          return true;
+
       case nir_type_int:
+         for (unsigned i = 0; i < num_components; ++i) {
+            int64_t val;
+            switch (load->def.bit_size) {
+            case 32:
+               val = load->value.i32[new_swizzle[i]];
+               break;
+            case 64:
+               val = load->value.i64[new_swizzle[i]];
+               break;
+            default:
+               unreachable("unknown bit size");
+            }
+
+            if (val != const_val->data.i)
+               return false;
+         }
+         return true;
+
       case nir_type_uint:
-      case nir_type_bool:
+      case nir_type_bool32:
          for (unsigned i = 0; i < num_components; ++i) {
-            if (load->value.i[new_swizzle[i]] != const_val->data.i)
+            uint64_t val;
+            switch (load->def.bit_size) {
+            case 32:
+               val = load->value.u32[new_swizzle[i]];
+               break;
+            case 64:
+               val = load->value.u64[new_swizzle[i]];
+               break;
+            default:
+               unreachable("unknown bit size");
+            }
+
+            if (val != const_val->data.u)
                return false;
          }
          return true;
+
       default:
          unreachable("Invalid alu source type");
       }
@@ -191,6 +238,10 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr,
    if (instr->op != expr->opcode)
       return false;
 
+   assert(instr->dest.dest.is_ssa);
+   if (expr->inexact && instr->exact)
+      return false;
+
    assert(!instr->dest.saturate);
    assert(nir_op_infos[instr->op].num_inputs > 0);
 
@@ -244,9 +295,123 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr,
    }
 }
 
+typedef struct bitsize_tree {
+   unsigned num_srcs;
+   struct bitsize_tree *srcs[4];
+
+   unsigned common_size;
+   bool is_src_sized[4];
+   bool is_dest_sized;
+
+   unsigned dest_size;
+   unsigned src_size[4];
+} bitsize_tree;
+
+static bitsize_tree *
+build_bitsize_tree(void *mem_ctx, struct match_state *state,
+                   const nir_search_value *value)
+{
+   bitsize_tree *tree = ralloc(mem_ctx, bitsize_tree);
+
+   switch (value->type) {
+   case nir_search_value_expression: {
+      nir_search_expression *expr = nir_search_value_as_expression(value);
+      nir_op_info info = nir_op_infos[expr->opcode];
+      tree->num_srcs = info.num_inputs;
+      tree->common_size = 0;
+      for (unsigned i = 0; i < info.num_inputs; i++) {
+         tree->is_src_sized[i] = !!nir_alu_type_get_type_size(info.input_types[i]);
+         if (tree->is_src_sized[i])
+            tree->src_size[i] = nir_alu_type_get_type_size(info.input_types[i]);
+         tree->srcs[i] = build_bitsize_tree(mem_ctx, state, expr->srcs[i]);
+      }
+      tree->is_dest_sized = !!nir_alu_type_get_type_size(info.output_type);
+      if (tree->is_dest_sized)
+         tree->dest_size = nir_alu_type_get_type_size(info.output_type);
+      break;
+   }
+
+   case nir_search_value_variable: {
+      nir_search_variable *var = nir_search_value_as_variable(value);
+      tree->num_srcs = 0;
+      tree->is_dest_sized = true;
+      tree->dest_size = nir_src_bit_size(state->variables[var->variable].src);
+      break;
+   }
+
+   case nir_search_value_constant: {
+      tree->num_srcs = 0;
+      tree->is_dest_sized = false;
+      tree->common_size = 0;
+      break;
+   }
+   }
+
+   return tree;
+}
+
+static unsigned
+bitsize_tree_filter_up(bitsize_tree *tree)
+{
+   for (unsigned i = 0; i < tree->num_srcs; i++) {
+      unsigned src_size = bitsize_tree_filter_up(tree->srcs[i]);
+      if (src_size == 0)
+         continue;
+
+      if (tree->is_src_sized[i]) {
+         assert(src_size == tree->src_size[i]);
+      } else if (tree->common_size != 0) {
+         assert(src_size == tree->common_size);
+         tree->src_size[i] = src_size;
+      } else {
+         tree->common_size = src_size;
+         tree->src_size[i] = src_size;
+      }
+   }
+
+   if (tree->num_srcs && tree->common_size) {
+      if (tree->dest_size == 0)
+         tree->dest_size = tree->common_size;
+      else if (!tree->is_dest_sized)
+         assert(tree->dest_size == tree->common_size);
+
+      for (unsigned i = 0; i < tree->num_srcs; i++) {
+         if (!tree->src_size[i])
+            tree->src_size[i] = tree->common_size;
+      }
+   }
+
+   return tree->dest_size;
+}
+
+static void
+bitsize_tree_filter_down(bitsize_tree *tree, unsigned size)
+{
+   if (tree->dest_size)
+      assert(tree->dest_size == size);
+   else
+      tree->dest_size = size;
+
+   if (!tree->is_dest_sized) {
+      if (tree->common_size)
+         assert(tree->common_size == size);
+      else
+         tree->common_size = size;
+   }
+
+   for (unsigned i = 0; i < tree->num_srcs; i++) {
+      if (!tree->src_size[i]) {
+         assert(tree->common_size);
+         tree->src_size[i] = tree->common_size;
+      }
+      bitsize_tree_filter_down(tree->srcs[i], tree->src_size[i]);
+   }
+}
+
 static nir_alu_src
-construct_value(const nir_search_value *value, nir_alu_type type,
-                unsigned num_components, struct match_state *state,
+construct_value(const nir_search_value *value,
+                unsigned num_components, bitsize_tree *bitsize, bool exact,
+                struct match_state *state,
                 nir_instr *instr, void *mem_ctx)
 {
    switch (value->type) {
@@ -257,7 +422,9 @@ construct_value(const nir_search_value *value, nir_alu_type type,
          num_components = nir_op_infos[expr->opcode].output_size;
 
       nir_alu_instr *alu = nir_alu_instr_create(mem_ctx, expr->opcode);
-      nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, NULL);
+      nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components,
+                        bitsize->dest_size, NULL);
+      alu->exact = exact;
       alu->dest.write_mask = (1 << num_components) - 1;
       alu->dest.saturate = false;
 
@@ -269,8 +436,7 @@ construct_value(const nir_search_value *value, nir_alu_type type,
             num_components = nir_op_infos[alu->op].input_sizes[i];
 
          alu->src[i] = construct_value(expr->srcs[i],
-                                       nir_op_infos[alu->op].input_types[i],
-                                       num_components,
+                                       num_components, bitsize->srcs[i], exact,
                                        state, instr, mem_ctx);
       }
 
@@ -301,23 +467,57 @@ construct_value(const nir_search_value *value, nir_alu_type type,
       const nir_search_constant *c = nir_search_value_as_constant(value);
       nir_load_const_instr *load = nir_load_const_instr_create(mem_ctx, 1);
 
-      switch (type) {
+      switch (c->type) {
       case nir_type_float:
-         load->def.name = ralloc_asprintf(mem_ctx, "%f", c->data.f);
-         load->value.f[0] = c->data.f;
+         load->def.name = ralloc_asprintf(load, "%f", c->data.d);
+         switch (bitsize->dest_size) {
+         case 32:
+            load->value.f32[0] = c->data.d;
+            break;
+         case 64:
+            load->value.f64[0] = c->data.d;
+            break;
+         default:
+            unreachable("unknown bit size");
+         }
          break;
+
       case nir_type_int:
-         load->def.name = ralloc_asprintf(mem_ctx, "%d", c->data.i);
-         load->value.i[0] = c->data.i;
+         load->def.name = ralloc_asprintf(load, "%ld", c->data.i);
+         switch (bitsize->dest_size) {
+         case 32:
+            load->value.i32[0] = c->data.i;
+            break;
+         case 64:
+            load->value.i64[0] = c->data.i;
+            break;
+         default:
+            unreachable("unknown bit size");
+         }
          break;
+
       case nir_type_uint:
-      case nir_type_bool:
-         load->value.u[0] = c->data.u;
+         load->def.name = ralloc_asprintf(load, "%lu", c->data.u);
+         switch (bitsize->dest_size) {
+         case 32:
+            load->value.u32[0] = c->data.u;
+            break;
+         case 64:
+            load->value.u64[0] = c->data.u;
+            break;
+         default:
+            unreachable("unknown bit size");
+         }
+
+      case nir_type_bool32:
+         load->value.u32[0] = c->data.u;
          break;
       default:
          unreachable("Invalid alu source type");
       }
 
+      load->def.bit_size = bitsize->dest_size;
+
       nir_instr_insert_before(instr, &load->instr);
 
       nir_alu_src val;
@@ -352,6 +552,11 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
                          swizzle, &state))
       return NULL;
 
+   void *bitsize_ctx = ralloc_context(NULL);
+   bitsize_tree *tree = build_bitsize_tree(bitsize_ctx, &state, replace);
+   bitsize_tree_filter_up(tree);
+   bitsize_tree_filter_down(tree, instr->dest.dest.ssa.bit_size);
+
    /* Inserting a mov may be unnecessary.  However, it's much easier to
     * simply let copy propagation clean this up than to try to go through
     * and rewrite swizzles ourselves.
@@ -359,11 +564,12 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
    nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov);
    mov->dest.write_mask = instr->dest.write_mask;
    nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                     instr->dest.dest.ssa.num_components, NULL);
+                     instr->dest.dest.ssa.num_components,
+                     instr->dest.dest.ssa.bit_size, NULL);
 
-   mov->src[0] = construct_value(replace, nir_op_infos[instr->op].output_type,
-                                 instr->dest.dest.ssa.num_components, &state,
-                                 &instr->instr, mem_ctx);
+   mov->src[0] = construct_value(replace,
+                                 instr->dest.dest.ssa.num_components, tree,
+                                 instr->exact, &state, &instr->instr, mem_ctx);
    nir_instr_insert_before(&instr->instr, &mov->instr);
 
    nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa,
@@ -375,5 +581,7 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
     */
    nir_instr_remove(&instr->instr);
 
+   ralloc_free(bitsize_ctx);
+
    return mov;
 }
diff --git a/src/compiler/nir/nir_search.h b/src/compiler/nir/nir_search.h
index 7d47792945e..61742f129b1 100644
--- a/src/compiler/nir/nir_search.h
+++ b/src/compiler/nir/nir_search.h
@@ -71,16 +71,24 @@ typedef struct {
 typedef struct {
    nir_search_value value;
 
+   nir_alu_type type;
+
    union {
-      uint32_t u;
-      int32_t i;
-      float f;
+      uint64_t u;
+      int64_t i;
+      double d;
    } data;
 } nir_search_constant;
 
 typedef struct {
    nir_search_value value;
 
+   /* When set on a search expression, the expression will only match an SSA
+    * value that does *not* have the exact bit set.  If unset, the exact bit
+    * on the SSA value is ignored.
+    */
+   bool inexact;
+
    nir_op opcode;
    const nir_search_value *srcs[4];
 } nir_search_expression;
diff --git a/src/compiler/nir/nir_to_ssa.c b/src/compiler/nir/nir_to_ssa.c
index 44a50547738..d588d7d2df3 100644
--- a/src/compiler/nir/nir_to_ssa.c
+++ b/src/compiler/nir/nir_to_ssa.c
@@ -219,7 +219,9 @@ rewrite_def_forwards(nir_dest *dest, void *_state)
                              state->states[index].num_defs);
 
    list_del(&dest->reg.def_link);
-   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, name);
+   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components,
+                     reg->bit_size, name);
+   ralloc_free(name);
 
    /* push our SSA destination on the stack */
    state->states[index].index++;
@@ -271,7 +273,9 @@ rewrite_alu_instr_forward(nir_alu_instr *instr, rewrite_state *state)
 
       instr->dest.write_mask = (1 << num_components) - 1;
       list_del(&instr->dest.dest.reg.def_link);
-      nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, name);
+      nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+                        reg->bit_size, name);
+      ralloc_free(name);
 
       if (nir_op_infos[instr->op].output_size == 0) {
          /*
diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index 0c32d5fe07a..9f18d1c33e4 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -179,9 +179,12 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state)
    nir_alu_src *src = &instr->src[index];
 
    unsigned num_components;
-   if (src->src.is_ssa)
+   unsigned src_bit_size;
+   if (src->src.is_ssa) {
+      src_bit_size = src->src.ssa->bit_size;
       num_components = src->src.ssa->num_components;
-   else {
+   } else {
+      src_bit_size = src->src.reg.reg->bit_size;
       if (src->src.reg.reg->is_packed)
          num_components = 4; /* can't check anything */
       else
@@ -194,6 +197,24 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state)
          assert(src->swizzle[i] < num_components);
    }
 
+   nir_alu_type src_type = nir_op_infos[instr->op].input_types[index];
+
+   /* 8-bit float isn't a thing */
+   if (nir_alu_type_get_base_type(src_type) == nir_type_float)
+      assert(src_bit_size == 16 || src_bit_size == 32 || src_bit_size == 64);
+
+   if (nir_alu_type_get_type_size(src_type)) {
+      /* This source has an explicit bit size */
+      assert(nir_alu_type_get_type_size(src_type) == src_bit_size);
+   } else {
+      if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type)) {
+         unsigned dest_bit_size =
+            instr->dest.dest.is_ssa ? instr->dest.dest.ssa.bit_size
+                                    : instr->dest.dest.reg.reg->bit_size;
+         assert(dest_bit_size == src_bit_size);
+      }
+   }
+
    validate_src(&src->src, state);
 }
 
@@ -263,8 +284,10 @@ validate_dest(nir_dest *dest, validate_state *state)
 }
 
 static void
-validate_alu_dest(nir_alu_dest *dest, validate_state *state)
+validate_alu_dest(nir_alu_instr *instr, validate_state *state)
 {
+   nir_alu_dest *dest = &instr->dest;
+
    unsigned dest_size =
       dest->dest.is_ssa ? dest->dest.ssa.num_components
                         : dest->dest.reg.reg->num_components;
@@ -282,6 +305,17 @@ validate_alu_dest(nir_alu_dest *dest, validate_state *state)
    assert(nir_op_infos[alu->op].output_type == nir_type_float ||
           !dest->saturate);
 
+   unsigned bit_size = dest->dest.is_ssa ? dest->dest.ssa.bit_size
+                                         : dest->dest.reg.reg->bit_size;
+   nir_alu_type type = nir_op_infos[instr->op].output_type;
+
+   /* 8-bit float isn't a thing */
+   if (nir_alu_type_get_base_type(type) == nir_type_float)
+      assert(bit_size == 16 || bit_size == 32 || bit_size == 64);
+
+   assert(nir_alu_type_get_type_size(type) == 0 ||
+          nir_alu_type_get_type_size(type) == bit_size);
+
    validate_dest(&dest->dest, state);
 }
 
@@ -294,7 +328,7 @@ validate_alu_instr(nir_alu_instr *instr, validate_state *state)
       validate_alu_src(instr, i, state);
    }
 
-   validate_alu_dest(&instr->dest, state);
+   validate_alu_dest(instr, state);
 }
 
 static void
diff --git a/src/compiler/nir/spirv/spirv_to_nir.c b/src/compiler/nir/spirv/spirv_to_nir.c
index 5a7184acac6..42a1f953848 100644
--- a/src/compiler/nir/spirv/spirv_to_nir.c
+++ b/src/compiler/nir/spirv/spirv_to_nir.c
@@ -92,7 +92,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
             nir_load_const_instr_create(b->shader, num_components);
 
          for (unsigned i = 0; i < num_components; i++)
-            load->value.u[i] = constant->value.u[i];
+            load->value.u32[i] = constant->value.u[i];
 
          nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
          val->def = &load->def;
@@ -109,7 +109,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
                nir_load_const_instr_create(b->shader, rows);
 
             for (unsigned j = 0; j < rows; j++)
-               load->value.u[j] = constant->value.u[rows * i + j];
+               load->value.u32[j] = constant->value.u[rows * i + j];
 
             nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
             col_val->def = &load->def;
@@ -1035,6 +1035,8 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
          nir_op op = vtn_nir_alu_op_for_spirv_opcode(opcode, &swap);
 
          unsigned num_components = glsl_get_vector_elements(val->const_type);
+         unsigned bit_size =
+            glsl_get_bit_size(glsl_get_base_type(val->const_type));
 
          nir_const_value src[3];
          assert(count <= 7);
@@ -1043,14 +1045,16 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
                vtn_value(b, w[4 + i], vtn_value_type_constant)->constant;
 
             unsigned j = swap ? 1 - i : i;
+            assert(bit_size == 32);
             for (unsigned k = 0; k < num_components; k++)
-               src[j].u[k] = c->value.u[k];
+               src[j].u32[k] = c->value.u[k];
          }
 
-         nir_const_value res = nir_eval_const_opcode(op, num_components, src);
+         nir_const_value res = nir_eval_const_opcode(op, num_components,
+                                                     bit_size, src);
 
          for (unsigned k = 0; k < num_components; k++)
-            val->constant->value.u[k] = res.u[k];
+            val->constant->value.u[k] = res.u32[k];
 
          return;
       } /* default */
@@ -1414,7 +1418,7 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
    }
 
    nir_ssa_dest_init(&instr->instr, &instr->dest,
-                     nir_tex_instr_dest_size(instr), NULL);
+                     nir_tex_instr_dest_size(instr), 32, NULL);
 
    assert(glsl_get_vector_elements(ret_type->type) ==
           nir_tex_instr_dest_size(instr));
@@ -1600,7 +1604,7 @@ vtn_handle_image(struct vtn_builder *b, SpvOp opcode,
    if (opcode != SpvOpImageWrite) {
       struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
       struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
-      nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, NULL);
+      nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, 32, NULL);
 
       nir_builder_instr_insert(&b->nb, &intrin->instr);
 
@@ -1738,7 +1742,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
       fill_common_atomic_sources(b, opcode, w, &atomic->src[2]);
    }
 
-   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, NULL);
+   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL);
 
    struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
    struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
@@ -1750,7 +1754,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
 }
 
 static nir_alu_instr *
-create_vec(nir_shader *shader, unsigned num_components)
+create_vec(nir_shader *shader, unsigned num_components, unsigned bit_size)
 {
    nir_op op;
    switch (num_components) {
@@ -1762,7 +1766,8 @@ create_vec(nir_shader *shader, unsigned num_components)
    }
 
    nir_alu_instr *vec = nir_alu_instr_create(shader, op);
-   nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components,
+                     bit_size, NULL);
    vec->dest.write_mask = (1 << num_components) - 1;
 
    return vec;
@@ -1779,7 +1784,8 @@ vtn_ssa_transpose(struct vtn_builder *b, struct vtn_ssa_value *src)
 
    for (unsigned i = 0; i < glsl_get_matrix_columns(dest->type); i++) {
       nir_alu_instr *vec = create_vec(b->shader,
-                                      glsl_get_matrix_columns(src->type));
+                                      glsl_get_matrix_columns(src->type),
+                                      glsl_get_bit_size(glsl_get_base_type(src->type)));
       if (glsl_type_is_vector_or_scalar(src->type)) {
           vec->src[0].src = nir_src_for_ssa(src->def);
           vec->src[0].swizzle[0] = i;
@@ -1809,7 +1815,8 @@ nir_ssa_def *
 vtn_vector_insert(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *insert,
                   unsigned index)
 {
-   nir_alu_instr *vec = create_vec(b->shader, src->num_components);
+   nir_alu_instr *vec = create_vec(b->shader, src->num_components,
+                                   src->bit_size);
 
    for (unsigned i = 0; i < src->num_components; i++) {
       if (i == index) {
@@ -1854,7 +1861,7 @@ vtn_vector_shuffle(struct vtn_builder *b, unsigned num_components,
                    nir_ssa_def *src0, nir_ssa_def *src1,
                    const uint32_t *indices)
 {
-   nir_alu_instr *vec = create_vec(b->shader, num_components);
+   nir_alu_instr *vec = create_vec(b->shader, num_components, src0->bit_size);
 
    nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(b->shader, 1);
    nir_builder_instr_insert(&b->nb, &undef->instr);
@@ -1884,7 +1891,8 @@ static nir_ssa_def *
 vtn_vector_construct(struct vtn_builder *b, unsigned num_components,
                      unsigned num_srcs, nir_ssa_def **srcs)
 {
-   nir_alu_instr *vec = create_vec(b->shader, num_components);
+   nir_alu_instr *vec = create_vec(b->shader, num_components,
+                                   srcs[0]->bit_size);
 
    unsigned dest_idx = 0;
    for (unsigned i = 0; i < num_srcs; i++) {
diff --git a/src/compiler/nir/spirv/vtn_glsl450.c b/src/compiler/nir/spirv/vtn_glsl450.c
index 6b649fd085b..3360fda067a 100644
--- a/src/compiler/nir/spirv/vtn_glsl450.c
+++ b/src/compiler/nir/spirv/vtn_glsl450.c
@@ -627,7 +627,9 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
 
    nir_alu_instr *instr = nir_alu_instr_create(b->shader, op);
    nir_ssa_dest_init(&instr->instr, &instr->dest.dest,
-                     glsl_get_vector_elements(val->ssa->type), val->name);
+                     glsl_get_vector_elements(val->ssa->type),
+                     glsl_get_bit_size(glsl_get_base_type(val->ssa->type)),
+                     val->name);
    instr->dest.write_mask = (1 << instr->dest.dest.ssa.num_components) - 1;
    val->ssa->def = &instr->dest.dest.ssa;
 
diff --git a/src/compiler/nir/spirv/vtn_variables.c b/src/compiler/nir/spirv/vtn_variables.c
index 31bf416ff5e..3cbac1e5da8 100644
--- a/src/compiler/nir/spirv/vtn_variables.c
+++ b/src/compiler/nir/spirv/vtn_variables.c
@@ -190,7 +190,9 @@ _vtn_local_load_store(struct vtn_builder *b, bool load, nir_deref_var *deref,
 
       if (load) {
          nir_ssa_dest_init(&intrin->instr, &intrin->dest,
-                           intrin->num_components, NULL);
+                           intrin->num_components,
+                           glsl_get_bit_size(glsl_get_base_type(tail->type)),
+                           NULL);
          inout->def = &intrin->dest.ssa;
       } else {
          nir_intrinsic_set_write_mask(intrin, (1 << intrin->num_components) - 1);
@@ -322,7 +324,7 @@ get_vulkan_resource_index(struct vtn_builder *b, struct vtn_access_chain *chain,
    nir_intrinsic_set_desc_set(instr, chain->var->descriptor_set);
    nir_intrinsic_set_binding(instr, chain->var->binding);
 
-   nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
    nir_builder_instr_insert(&b->nb, &instr->instr);
 
    return &instr->dest.ssa;
@@ -411,7 +413,8 @@ _vtn_load_store_tail(struct vtn_builder *b, nir_intrinsic_op op, bool load,
 
    if (load) {
       nir_ssa_dest_init(&instr->instr, &instr->dest,
-                        instr->num_components, NULL);
+                        instr->num_components,
+                        glsl_get_bit_size(glsl_get_base_type(type)), NULL);
       (*inout)->def = &instr->dest.ssa;
    }
 
@@ -1385,7 +1388,7 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
          nir_intrinsic_instr_create(b->nb.shader,
                                     nir_intrinsic_get_buffer_size);
       instr->src[0] = nir_src_for_ssa(index);
-      nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+      nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
       nir_builder_instr_insert(&b->nb, &instr->instr);
       nir_ssa_def *buf_size = &instr->dest.ssa;
 
diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h
index d92605bf4fb..5efdd85dea5 100644
--- a/src/compiler/nir_types.h
+++ b/src/compiler/nir_types.h
@@ -80,6 +80,27 @@ enum glsl_base_type glsl_get_sampler_result_type(const struct glsl_type *type);
 unsigned glsl_get_record_location_offset(const struct glsl_type *type,
                                          unsigned length);
 
+static inline unsigned
+glsl_get_bit_size(enum glsl_base_type type)
+{
+   switch (type) {
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_FLOAT: /* TODO handle mediump */
+   case GLSL_TYPE_SUBROUTINE:
+      return 32;
+
+   case GLSL_TYPE_DOUBLE:
+      return 64;
+
+   default:
+      unreachable("unknown base type");
+   }
+
+   return 0;
+}
+
 bool glsl_type_is_void(const struct glsl_type *type);
 bool glsl_type_is_error(const struct glsl_type *type);
 bool glsl_type_is_vector(const struct glsl_type *type);
diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index c445d9b0c92..d79c0e15422 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -44,7 +44,6 @@
 #include "egllog.h"
 
 
-#define MIN2(A, B)  (((A) < (B)) ? (A) : (B))
 
 
 /**
diff --git a/src/egl/main/egldefines.h b/src/egl/main/egldefines.h
index a32cab26408..13a7563ce04 100644
--- a/src/egl/main/egldefines.h
+++ b/src/egl/main/egldefines.h
@@ -40,9 +40,16 @@ extern "C" {
 
 #define _EGL_MAX_EXTENSIONS_LEN 1000
 
+/* Hardcoded, conservative default for EGL_LARGEST_PBUFFER,
+ * this is used to implement EGL_LARGEST_PBUFFER.
+ */
+#define _EGL_MAX_PBUFFER_WIDTH 4096
+#define _EGL_MAX_PBUFFER_HEIGHT 4096
+
 #define _EGL_VENDOR_STRING "Mesa Project"
 
 #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+#define MIN2(A, B)  (((A) < (B)) ? (A) : (B))
 
 #ifdef __cplusplus
 }
diff --git a/src/egl/main/eglsurface.c b/src/egl/main/eglsurface.c
index 4fa43f3e2b1..2971bb0983a 100644
--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -307,6 +307,12 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
    if (err != EGL_SUCCESS)
       return _eglError(err, func);
 
+   /* if EGL_LARGEST_PBUFFER in use, clamp width and height */
+   if (surf->LargestPbuffer) {
+      surf->Width = MIN2(surf->Width, _EGL_MAX_PBUFFER_WIDTH);
+      surf->Height = MIN2(surf->Height, _EGL_MAX_PBUFFER_HEIGHT);
+   }
+
    return EGL_TRUE;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 6b33341ce6c..fcef31b4ff5 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -206,12 +206,6 @@ static unsigned tgsi_gs_run(struct draw_geometry_shader *shader,
 {
    struct tgsi_exec_machine *machine = shader->machine;
 
-   tgsi_set_exec_mask(machine,
-                      1,
-                      input_primitives > 1,
-                      input_primitives > 2,
-                      input_primitives > 3);
-
    /* run interpreter */
    tgsi_exec_machine_run(machine);
 
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index e85ae16c1df..cd9ee5434d3 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -264,11 +264,11 @@ aa_transform_epilog(struct tgsi_transform_context *ctx)
    if (aactx->colorOutput != -1) {
       /* insert texture sampling code for antialiasing. */
 
-      /* TEX texTemp, input_coord, sampler */
-      tgsi_transform_tex_2d_inst(ctx,
-                                 TGSI_FILE_TEMPORARY, aactx->texTemp,
-                                 TGSI_FILE_INPUT, aactx->maxInput + 1,
-                                 aactx->freeSampler);
+      /* TEX texTemp, input_coord, sampler, 2D */
+      tgsi_transform_tex_inst(ctx,
+                              TGSI_FILE_TEMPORARY, aactx->texTemp,
+                              TGSI_FILE_INPUT, aactx->maxInput + 1,
+                              TGSI_TEXTURE_2D, aactx->freeSampler);
 
       /* MOV rgb */
       tgsi_transform_op1_inst(ctx, TGSI_OPCODE_MOV,
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index abd64f5acd2..3fd8ef3cd2f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -159,12 +159,6 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
          input = (const float (*)[4])((const char *)input + input_stride);
       } 
 
-      tgsi_set_exec_mask(machine,
-                         1,
-                         max_vertices > 1,
-                         max_vertices > 2,
-                         max_vertices > 3);
-
       /* run interpreter */
       tgsi_exec_machine_run( machine );
 
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index fb998349a35..4673458171e 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -1191,6 +1191,7 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
          "FRAG\n"
          "DCL IN[0], GENERIC[0], LINEAR\n"
          "DCL SAMP[0]\n"
+         "DCL SVIEW[0], RECT, FLOAT\n"
          "DCL OUT[0], COLOR[0]\n"
          "DCL TEMP[0]\n"
 
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index b71917618c1..7ec8b662200 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -459,7 +459,7 @@ ttn_emit_immediate(struct ttn_compile *c)
    c->next_imm++;
 
    for (i = 0; i < 4; i++)
-      load_const->value.u[i] = tgsi_imm->u[i].Uint;
+      load_const->value.u32[i] = tgsi_imm->u[i].Uint;
 
    nir_builder_instr_insert(b, &load_const->instr);
 }
@@ -515,8 +515,8 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
                                            nir_intrinsic_load_var);
          load->num_components = 4;
          load->variables[0] = ttn_array_deref(c, load, var, offset, indirect);
-
-         nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+         nir_ssa_dest_init(&load->instr, &load->dest,
+                           4, 32, NULL);
          nir_builder_instr_insert(b, &load->instr);
 
          src = nir_src_for_ssa(&load->dest.ssa);
@@ -567,7 +567,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
       load = nir_intrinsic_instr_create(b->shader, op);
       load->num_components = ncomp;
 
-      nir_ssa_dest_init(&load->instr, &load->dest, ncomp, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, ncomp, 32, NULL);
       nir_builder_instr_insert(b, &load->instr);
 
       src = nir_src_for_ssa(&load->dest.ssa);
@@ -632,7 +632,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
       }
       load->src[srcn++] = nir_src_for_ssa(offset);
 
-      nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
       nir_builder_instr_insert(b, &load->instr);
 
       src = nir_src_for_ssa(&load->dest.ssa);
@@ -1425,7 +1425,7 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
 
    assert(src_number == num_srcs);
 
-   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, 32, NULL);
    nir_builder_instr_insert(b, &instr->instr);
 
    /* Resolve the writemask on the texture op. */
@@ -1464,10 +1464,10 @@ ttn_txq(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
    txs->src[0].src = nir_src_for_ssa(ttn_channel(b, src[0], X));
    txs->src[0].src_type = nir_tex_src_lod;
 
-   nir_ssa_dest_init(&txs->instr, &txs->dest, 3, NULL);
+   nir_ssa_dest_init(&txs->instr, &txs->dest, 3, 32, NULL);
    nir_builder_instr_insert(b, &txs->instr);
 
-   nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, NULL);
+   nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, 32, NULL);
    nir_builder_instr_insert(b, &qlv->instr);
 
    ttn_move_dest_masked(b, dest, &txs->dest.ssa, TGSI_WRITEMASK_XYZ);
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.h b/src/gallium/auxiliary/nir/tgsi_to_nir.h
index 0651870ea80..f480009afa4 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.h
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.h
@@ -23,8 +23,6 @@
 
 #include "compiler/nir/nir.h"
 
-struct nir_shader_compiler_options *options;
-
 struct nir_shader *
 tgsi_to_nir(const void *tgsi_tokens,
             const struct nir_shader_compiler_options *options);
diff --git a/src/gallium/auxiliary/postprocess/pp_colors.h b/src/gallium/auxiliary/postprocess/pp_colors.h
index a79858ef53c..76c4ab49b5c 100644
--- a/src/gallium/auxiliary/postprocess/pp_colors.h
+++ b/src/gallium/auxiliary/postprocess/pp_colors.h
@@ -33,6 +33,7 @@ static const char nored[] = "FRAG\n"
    "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0]\n"
    "IMM FLT32 {    0.0000,     0.0000,     0.0000,     0.0000}\n"
    "  0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -46,6 +47,7 @@ static const char nogreen[] = "FRAG\n"
    "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0]\n"
    "IMM FLT32 {    0.0000,     0.0000,     0.0000,     0.0000}\n"
    "  0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -59,6 +61,7 @@ static const char noblue[] = "FRAG\n"
    "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0]\n"
    "IMM FLT32 {    0.0000,     0.0000,     0.0000,     0.0000}\n"
    "  0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
diff --git a/src/gallium/auxiliary/postprocess/pp_mlaa.h b/src/gallium/auxiliary/postprocess/pp_mlaa.h
index 93a8a8afa90..0b2c363e1c4 100644
--- a/src/gallium/auxiliary/postprocess/pp_mlaa.h
+++ b/src/gallium/auxiliary/postprocess/pp_mlaa.h
@@ -50,6 +50,7 @@ static const char depth1fs[] = "FRAG\n"
    "DCL IN[2], GENERIC[11], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0..2]\n"
    "IMM FLT32 {    0.0030,     0.0000,     1.0000,     0.0000}\n"
    "  0: TEX TEMP[0].x, IN[1].xyyy, SAMP[0], 2D\n"
@@ -80,6 +81,7 @@ static const char color1fs[] = "FRAG\n"
    "DCL IN[2], GENERIC[11], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0..2]\n"
    "IMM FLT32 {    0.2126,     0.7152,     0.0722,     0.1000}\n"
    "IMM FLT32 {    1.0000,     0.0000,     0.0000,     0.0000}\n"
@@ -112,6 +114,7 @@ static const char neigh3fs[] = "FRAG\n"
    "DCL IN[2], GENERIC[11], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL SAMP[1]\n"
    "DCL TEMP[0..8]\n"
    "IMM FLT32 {    1.0000,     0.00001,     0.0000,     0.0000}\n"
@@ -175,8 +178,11 @@ static const char blend2fs_1[] = "FRAG\n"
    "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL SAMP[1]\n"
+   "DCL SVIEW[1], 2D, FLOAT\n"
    "DCL SAMP[2]\n"
+   "DCL SVIEW[2], 2D, FLOAT\n"
    "DCL CONST[0]\n"
    "DCL TEMP[0..6]\n"
    "IMM FLT32 {    0.0000,    -0.2500,     0.00609756,     0.5000}\n"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index e5355f573bb..7e30bb646e2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -111,7 +111,7 @@ tgsi_default_declaration( void )
    declaration.Local = 0;
    declaration.Array = 0;
    declaration.Atomic = 0;
-   declaration.Shared = 0;
+   declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL;
    declaration.Padding = 0;
 
    return declaration;
@@ -127,6 +127,8 @@ tgsi_build_declaration(
    unsigned invariant,
    unsigned local,
    unsigned array,
+   unsigned atomic,
+   unsigned mem_type,
    struct tgsi_header *header )
 {
    struct tgsi_declaration declaration;
@@ -143,6 +145,8 @@ tgsi_build_declaration(
    declaration.Invariant = invariant;
    declaration.Local = local;
    declaration.Array = array;
+   declaration.Atomic = atomic;
+   declaration.MemType = mem_type;
    header_bodysize_grow( header );
 
    return declaration;
@@ -401,6 +405,8 @@ tgsi_build_full_declaration(
       full_decl->Declaration.Invariant,
       full_decl->Declaration.Local,
       full_decl->Declaration.Array,
+      full_decl->Declaration.Atomic,
+      full_decl->Declaration.MemType,
       header );
 
    if (maxsize <= size)
@@ -775,6 +781,8 @@ tgsi_default_instruction_memory( void )
    struct tgsi_instruction_memory instruction_memory;
 
    instruction_memory.Qualifier = 0;
+   instruction_memory.Texture = 0;
+   instruction_memory.Format = 0;
    instruction_memory.Padding = 0;
 
    return instruction_memory;
@@ -790,6 +798,8 @@ tgsi_build_instruction_memory(
    struct tgsi_instruction_memory instruction_memory;
 
    instruction_memory.Qualifier = qualifier;
+   instruction_memory.Texture = 0;
+   instruction_memory.Format = 0;
    instruction_memory.Padding = 0;
    instruction->Memory = 1;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index c8b91bba534..6d39ef21083 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -365,8 +365,13 @@ iter_declaration(
    }
 
    if (decl->Declaration.File == TGSI_FILE_MEMORY) {
-      if (decl->Declaration.Shared)
-         TXT(", SHARED");
+      switch (decl->Declaration.MemType) {
+      /* Note: ,GLOBAL is optional / the default */
+      case TGSI_MEMORY_TYPE_GLOBAL:  TXT(", GLOBAL");  break;
+      case TGSI_MEMORY_TYPE_SHARED:  TXT(", SHARED");  break;
+      case TGSI_MEMORY_TYPE_PRIVATE: TXT(", PRIVATE"); break;
+      case TGSI_MEMORY_TYPE_INPUT:   TXT(", INPUT");   break;
+      }
    }
 
    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 12a68759ce5..991c3bfc5db 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -196,10 +196,6 @@ struct tgsi_sampler
 #define TGSI_EXEC_TEMP_HALF_I       (TGSI_EXEC_NUM_TEMPS + 3)
 #define TGSI_EXEC_TEMP_HALF_C       0
 
-/* execution mask, each value is either 0 or ~0 */
-#define TGSI_EXEC_MASK_I            (TGSI_EXEC_NUM_TEMPS + 3)
-#define TGSI_EXEC_MASK_C            1
-
 /* 4 register buffer for various purposes */
 #define TGSI_EXEC_TEMP_R0           (TGSI_EXEC_NUM_TEMPS + 4)
 #define TGSI_EXEC_NUM_TEMP_R        4
@@ -397,27 +393,6 @@ boolean
 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst);
 
 
-static inline void
-tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
-{
-   mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] =
-      mask;
-}
-
-
-/** Set execution mask values prior to executing the shader */
-static inline void
-tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
-                   boolean ch0, boolean ch1, boolean ch2, boolean ch3)
-{
-   int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i;
-   mask[0] = ch0 ? ~0 : 0;
-   mask[1] = ch1 ? ~0 : 0;
-   mask[2] = ch2 ? ~0 : 0;
-   mask[3] = ch3 ? ~0 : 0;
-}
-
-
 extern void
 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
                                unsigned num_bufs,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 8e24cc626bd..d32c3a14344 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -38,6 +38,7 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_prim.h"
+#include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_scan.h"
@@ -192,8 +193,17 @@ scan_instruction(struct tgsi_shader_info *info,
          }
       }
 
-      if (is_memory_file(src->Register.File))
+      if (is_memory_file(src->Register.File)) {
          is_mem_inst = true;
+
+         if (tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_store) {
+            info->writes_memory = TRUE;
+
+            if (src->Register.File == TGSI_FILE_IMAGE &&
+                !src->Register.Indirect)
+               info->images_writemask |= 1 << src->Register.Index;
+         }
+      }
    }
 
    /* check for indirect register writes */
@@ -204,8 +214,16 @@ scan_instruction(struct tgsi_shader_info *info,
          info->indirect_files_written |= (1 << dst->Register.File);
       }
 
-      if (is_memory_file(dst->Register.File))
+      if (is_memory_file(dst->Register.File)) {
+         assert(fullinst->Instruction.Opcode == TGSI_OPCODE_STORE);
+
          is_mem_inst = true;
+         info->writes_memory = TRUE;
+
+         if (dst->Register.File == TGSI_FILE_IMAGE &&
+             !dst->Register.Indirect)
+            info->images_writemask |= 1 << dst->Register.Index;
+      }
    }
 
    if (is_mem_inst)
@@ -413,6 +431,9 @@ scan_declaration(struct tgsi_shader_info *info,
          }
       } else if (file == TGSI_FILE_SAMPLER) {
          info->samplers_declared |= 1 << reg;
+      } else if (file == TGSI_FILE_IMAGE) {
+         if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER)
+            info->images_buffers |= 1 << reg;
       }
    }
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index d65dec71888..76d8925119e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -111,6 +111,7 @@ struct tgsi_shader_info
    boolean writes_clipvertex;
    boolean writes_viewport_index;
    boolean writes_layer;
+   boolean writes_memory; /**< contains stores or atomics to buffers or images */
    boolean is_msaa_sampler[PIPE_MAX_SAMPLERS];
    boolean uses_doubles; /**< uses any of the double instructions */
    unsigned clipdist_writemask;
@@ -118,6 +119,15 @@ struct tgsi_shader_info
    unsigned num_written_culldistance;
    unsigned num_written_clipdistance;
    /**
+    * Bitmask indicating which images are written to (STORE / ATOM*).
+    * Indirect image accesses are not reflected in this mask.
+    */
+   unsigned images_writemask;
+   /**
+    * Bitmask indicating which declared image is a buffer.
+    */
+   unsigned images_buffers;
+   /**
     * Bitmask indicating which register files are accessed with
     * indirect addressing.  The bits are (1 << TGSI_FILE_x), etc.
     */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 6bd1a2e14d2..ae779a8320a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -145,6 +145,7 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] =
    "NUM_CLIPDIST_ENABLED",
    "NUM_CULLDIST_ENABLED",
    "FS_EARLY_DEPTH_STENCIL",
+   "NEXT_SHADER",
 };
 
 const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] =
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 77598d2cb79..028633c9412 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -1390,8 +1390,18 @@ static boolean parse_declaration( struct translate_ctx *ctx )
             ctx->cur = cur;
          }
       } else if (file == TGSI_FILE_MEMORY) {
-         if (str_match_nocase_whole(&cur, "SHARED")) {
-            decl.Declaration.Shared = 1;
+         if (str_match_nocase_whole(&cur, "GLOBAL")) {
+            /* Note this is a no-op global is the default */
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL;
+            ctx->cur = cur;
+         } else if (str_match_nocase_whole(&cur, "SHARED")) {
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_SHARED;
+            ctx->cur = cur;
+         } else if (str_match_nocase_whole(&cur, "PRIVATE")) {
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_PRIVATE;
+            ctx->cur = cur;
+         } else if (str_match_nocase_whole(&cur, "INPUT")) {
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_INPUT;
             ctx->cur = cur;
          }
       } else {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index 27e6179c9ee..c21ff959cbf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -302,6 +302,40 @@ tgsi_transform_op2_inst(struct tgsi_transform_context *ctx,
 
 
 static inline void
+tgsi_transform_op3_inst(struct tgsi_transform_context *ctx,
+                        unsigned opcode,
+                        unsigned dst_file,
+                        unsigned dst_index,
+                        unsigned dst_writemask,
+                        unsigned src0_file,
+                        unsigned src0_index,
+                        unsigned src1_file,
+                        unsigned src1_index,
+                        unsigned src2_file,
+                        unsigned src2_index)
+{
+   struct tgsi_full_instruction inst;
+
+   inst = tgsi_default_full_instruction();
+   inst.Instruction.Opcode = opcode;
+   inst.Instruction.NumDstRegs = 1;
+   inst.Dst[0].Register.File = dst_file,
+   inst.Dst[0].Register.Index = dst_index;
+   inst.Dst[0].Register.WriteMask = dst_writemask;
+   inst.Instruction.NumSrcRegs = 3;
+   inst.Src[0].Register.File = src0_file;
+   inst.Src[0].Register.Index = src0_index;
+   inst.Src[1].Register.File = src1_file;
+   inst.Src[1].Register.Index = src1_index;
+   inst.Src[2].Register.File = src2_file;
+   inst.Src[2].Register.Index = src2_index;
+
+   ctx->emit_instruction(ctx, &inst);
+}
+
+
+
+static inline void
 tgsi_transform_op1_swz_inst(struct tgsi_transform_context *ctx,
                             unsigned opcode,
                             unsigned dst_file,
@@ -482,15 +516,18 @@ tgsi_transform_kill_inst(struct tgsi_transform_context *ctx,
 
 
 static inline void
-tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
-                           unsigned dst_file,
-                           unsigned dst_index,
-                           unsigned src_file,
-                           unsigned src_index,
-                           unsigned sampler_index)
+tgsi_transform_tex_inst(struct tgsi_transform_context *ctx,
+                        unsigned dst_file,
+                        unsigned dst_index,
+                        unsigned src_file,
+                        unsigned src_index,
+                        unsigned tex_target,
+                        unsigned sampler_index)
 {
    struct tgsi_full_instruction inst;
 
+   assert(tex_target < TGSI_TEXTURE_COUNT);
+
    inst = tgsi_default_full_instruction();
    inst.Instruction.Opcode = TGSI_OPCODE_TEX;
    inst.Instruction.NumDstRegs = 1;
@@ -498,7 +535,7 @@ tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
    inst.Dst[0].Register.Index = dst_index;
    inst.Instruction.NumSrcRegs = 2;
    inst.Instruction.Texture = TRUE;
-   inst.Texture.Texture = TGSI_TEXTURE_2D;
+   inst.Texture.Texture = tex_target;
    inst.Src[0].Register.File = src_file;
    inst.Src[0].Register.Index = src_index;
    inst.Src[1].Register.File = TGSI_FILE_SAMPLER;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index ab1d03458ef..297e257b322 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -101,6 +101,7 @@ struct ureg_program
 {
    unsigned processor;
    bool supports_any_inout_decl_range;
+   int next_shader_processor;
 
    struct {
       unsigned semantic_name;
@@ -190,7 +191,7 @@ struct ureg_program
 
    struct ureg_tokens domain[2];
 
-   bool use_shared_memory;
+   bool use_memory[TGSI_MEMORY_TYPE_COUNT];
 };
 
 static union tgsi_any_token error_tokens[32];
@@ -729,13 +730,14 @@ struct ureg_src ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr,
    return reg;
 }
 
-/* Allocate a shared memory area.
+/* Allocate a memory area.
  */
-struct ureg_src ureg_DECL_shared_memory(struct ureg_program *ureg)
+struct ureg_src ureg_DECL_memory(struct ureg_program *ureg,
+                                 unsigned memory_type)
 {
-   struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, 0);
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, memory_type);
 
-   ureg->use_shared_memory = true;
+   ureg->use_memory[memory_type] = true;
    return reg;
 }
 
@@ -1672,7 +1674,7 @@ emit_decl_buffer(struct ureg_program *ureg,
 }
 
 static void
-emit_decl_shared_memory(struct ureg_program *ureg)
+emit_decl_memory(struct ureg_program *ureg, unsigned memory_type)
 {
    union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2);
 
@@ -1681,11 +1683,11 @@ emit_decl_shared_memory(struct ureg_program *ureg)
    out[0].decl.NrTokens = 2;
    out[0].decl.File = TGSI_FILE_MEMORY;
    out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
-   out[0].decl.Shared = true;
+   out[0].decl.MemType = memory_type;
 
    out[1].value = 0;
-   out[1].decl_range.First = 0;
-   out[1].decl_range.Last = 0;
+   out[1].decl_range.First = memory_type;
+   out[1].decl_range.Last = memory_type;
 }
 
 static void
@@ -1860,8 +1862,10 @@ static void emit_decls( struct ureg_program *ureg )
       emit_decl_buffer(ureg, ureg->buffer[i].index, ureg->buffer[i].atomic);
    }
 
-   if (ureg->use_shared_memory)
-      emit_decl_shared_memory(ureg);
+   for (i = 0; i < TGSI_MEMORY_TYPE_COUNT; i++) {
+      if (ureg->use_memory[i])
+         emit_decl_memory(ureg, i);
+   }
 
    if (ureg->const_decls.nr_constant_ranges) {
       for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) {
@@ -1966,6 +1970,16 @@ const struct tgsi_token *ureg_finalize( struct ureg_program *ureg )
 {
    const struct tgsi_token *tokens;
 
+   switch (ureg->processor) {
+   case TGSI_PROCESSOR_VERTEX:
+   case TGSI_PROCESSOR_TESS_EVAL:
+      ureg_property(ureg, TGSI_PROPERTY_NEXT_SHADER,
+                    ureg->next_shader_processor == -1 ?
+                       TGSI_PROCESSOR_FRAGMENT :
+                       ureg->next_shader_processor);
+      break;
+   }
+
    emit_header( ureg );
    emit_decls( ureg );
    copy_instructions( ureg );
@@ -2079,6 +2093,7 @@ ureg_create_with_screen(unsigned processor, struct pipe_screen *screen)
       screen->get_shader_param(screen,
                                util_pipe_shader_from_tgsi_processor(processor),
                                PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE) != 0;
+   ureg->next_shader_processor = -1;
 
    for (i = 0; i < Elements(ureg->properties); i++)
       ureg->properties[i] = ~0;
@@ -2108,6 +2123,13 @@ no_ureg:
 }
 
 
+void
+ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor)
+{
+   ureg->next_shader_processor = processor;
+}
+
+
 unsigned
 ureg_get_nr_outputs( const struct ureg_program *ureg )
 {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 04a62a6e160..b4258fdbaa2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -114,6 +114,8 @@ ureg_create_shader( struct ureg_program *,
                     struct pipe_context *pipe,
 		    const struct pipe_stream_output_info *so );
 
+void
+ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor);
 
 /* Alternately, return the built token stream and hand ownership of
  * that memory to the caller:
@@ -338,7 +340,7 @@ struct ureg_src
 ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr, bool atomic);
 
 struct ureg_src
-ureg_DECL_shared_memory(struct ureg_program *ureg);
+ureg_DECL_memory(struct ureg_program *ureg, unsigned memory_type);
 
 static inline struct ureg_src
 ureg_imm4f( struct ureg_program *ureg,
diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
index 74e6f99da67..bcbe2a25b25 100644
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -344,11 +344,11 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
                            pctx->wincoordFile, wincoordInput,
                            TGSI_FILE_IMMEDIATE, pctx->numImmed);
 
-   /* TEX texTemp, texTemp, sampler; */
-   tgsi_transform_tex_2d_inst(ctx,
-                              TGSI_FILE_TEMPORARY, texTemp,
-                              TGSI_FILE_TEMPORARY, texTemp,
-                              sampIdx);
+   /* TEX texTemp, texTemp, sampler, 2D; */
+   tgsi_transform_tex_inst(ctx,
+                           TGSI_FILE_TEMPORARY, texTemp,
+                           TGSI_FILE_TEMPORARY, texTemp,
+                           TGSI_TEXTURE_2D, sampIdx);
 
    /* KILL_IF -texTemp;   # if -texTemp < 0, kill fragment */
    tgsi_transform_kill_inst(ctx,
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 7ffb2712472..76950a1a9cf 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -646,6 +646,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe,
          "FRAG\n"
          "DCL IN[0], GENERIC[0], LINEAR\n"
          "DCL SAMP[0..1]\n"
+         "DCL SVIEW[0..1], %s, FLOAT\n"
          "DCL OUT[0], POSITION\n"
          "DCL OUT[1], STENCIL\n"
          "DCL TEMP[0]\n"
@@ -663,7 +664,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe,
    assert(tgsi_tex == TGSI_TEXTURE_2D_MSAA ||
           tgsi_tex == TGSI_TEXTURE_2D_ARRAY_MSAA);
 
-   sprintf(text, shader_templ, type, type);
+   sprintf(text, shader_templ, type, type, type);
 
    if (!tgsi_text_translate(text, tokens, Elements(tokens))) {
       assert(0);
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index af2df2251da..6366f7e802d 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -3213,6 +3213,14 @@ Whether depth test, stencil test, and occlusion query should run before
 the fragment shader (regardless of fragment shader side effects). Corresponds
 to GLSL early_fragment_tests.
 
+NEXT_SHADER
+"""""""""""
+
+Which shader stage will MOST LIKELY follow after this shader when the shader
+is bound. This is only a hint to the driver and doesn't have to be precise.
+Only set for VS and TES.
+
+
 Texture Sampling and Texture Formats
 ------------------------------------
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 7a1812f2518..54315d2f592 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1017,7 +1017,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 
 	const_offset = nir_src_as_const_value(intr->src[1]);
 	if (const_offset) {
-		off += const_offset->u[0];
+		off += const_offset->u32[0];
 	} else {
 		/* For load_ubo_indirect, second src is indirect offset: */
 		src1 = get_src(ctx, &intr->src[1])[0];
@@ -1159,7 +1159,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
-			idx += const_offset->u[0];
+			idx += const_offset->u32[0];
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i;
 				dst[i] = create_uniform(ctx, n);
@@ -1186,7 +1186,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
-			idx += const_offset->u[0];
+			idx += const_offset->u32[0];
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i;
 				dst[i] = ctx->ir->inputs[n];
@@ -1213,7 +1213,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[1]);
 		compile_assert(ctx, const_offset != NULL);
-		idx += const_offset->u[0];
+		idx += const_offset->u32[0];
 
 		src = get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
@@ -1301,7 +1301,7 @@ emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
 	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
 			instr->def.num_components);
 	for (int i = 0; i < instr->def.num_components; i++)
-		dst[i] = create_immed(ctx->block, instr->value.u[i]);
+		dst[i] = create_immed(ctx->block, instr->value.u32[i]);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
index 8815ac981eb..ec76b0bdc4d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
@@ -290,7 +290,7 @@ lower_if_else_block(nir_block *block, void *void_state)
 		}
 
 		nir_ssa_dest_init(&sel->instr, &sel->dest.dest,
-				phi->dest.ssa.num_components, phi->dest.ssa.name);
+				phi->dest.ssa.num_components, 32, phi->dest.ssa.name);
 		sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
 
 		nir_ssa_def_rewrite_uses(&phi->dest.ssa,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 9f7d2572bbe..21523a27761 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -160,7 +160,7 @@ struct nv50_ir_prog_info
       uint8_t clipDistances;     /* number of clip distance outputs */
       uint8_t cullDistances;     /* number of cull distance outputs */
       int8_t genUserClip;        /* request user clip planes for ClipVertex */
-      uint8_t auxCBSlot;         /* constant buffer index of UCP/draw data */
+      uint8_t auxCBSlot;         /* driver constant buffer slot */
       uint16_t ucpBase;          /* base address for UCPs */
       uint16_t drawInfoBase;     /* base address for draw parameters */
       uint8_t pointSize;         /* output index for PointSize */
@@ -175,7 +175,6 @@ struct nv50_ir_prog_info
       uint8_t globalAccess;      /* 1 for read, 2 for wr, 3 for rw */
       bool fp64;                 /* program uses fp64 math */
       bool nv50styleSurfaces;    /* generate gX[] access for raw buffers */
-      uint8_t resInfoCBSlot;     /* cX[] used for tex handles, surface info */
       uint16_t texBindBase;      /* base address for tex handles (nve4) */
       uint16_t suInfoBase;       /* base address for surface info (nve4) */
       uint16_t sampleInfoBase;   /* base address for sample positions */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 0d7d95e3105..70f3c3f69ff 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1655,10 +1655,8 @@ CodeEmitterGK110::emitSTORE(const Instruction *i)
       break;
    }
 
-   if (i->src(0).getFile() != FILE_MEMORY_GLOBAL)
-      offset &= 0xffffff;
-
    if (code[0] & 0x2) {
+      offset &= 0xffffff;
       emitLoadStoreType(i->dType, 0x33);
       if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
          emitCachingMode(i->cache, 0x2f);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 682a19d6d78..bd6200687ed 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -1634,7 +1634,9 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
    code[1] |= (i->tex.mask & 0xc) << 12;
 
    if (i->tex.liveOnly)
-      code[1] |= 4;
+      code[1] |= 1 << 2;
+   if (i->tex.derivAll)
+      code[1] |= 1 << 3;
 
    defId(i->def(0), 2);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index d284446f5d9..611d5f9c3ed 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -856,15 +856,17 @@ public:
    };
    std::vector<TextureView> textureViews;
 
+   /*
    struct Resource {
       uint8_t target; // TGSI_TEXTURE_*
       bool raw;
       uint8_t slot; // $surface index
    };
    std::vector<Resource> resources;
+   */
 
    struct MemoryFile {
-      bool shared;
+      uint8_t mem_type; // TGSI_MEMORY_TYPE_*
    };
    std::vector<MemoryFile> memoryFiles;
 
@@ -1037,6 +1039,9 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
    case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
       info->io.cullDistances = prop->u[0].Data;
       break;
+   case TGSI_PROPERTY_NEXT_SHADER:
+      /* Do not need to know the next shader stage. */
+      break;
    default:
       INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
       break;
@@ -1222,7 +1227,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
       break;
    case TGSI_FILE_MEMORY:
       for (i = first; i <= last; ++i)
-         memoryFiles[i].shared = decl->Declaration.Shared;
+         memoryFiles[i].mem_type = decl->Declaration.MemType;
       break;
    case TGSI_FILE_NULL:
    case TGSI_FILE_TEMPORARY:
@@ -1261,9 +1266,9 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
       info->numBarriers = 1;
 
    if (insn.dstCount()) {
-      if (insn.getDst(0).getFile() == TGSI_FILE_OUTPUT) {
-         Instruction::DstRegister dst = insn.getDst(0);
+      Instruction::DstRegister dst = insn.getDst(0);
 
+      if (dst.getFile() == TGSI_FILE_OUTPUT) {
          if (dst.isIndirect(0))
             for (unsigned i = 0; i < info->numOutputs; ++i)
                info->out[i].mask = 0xf;
@@ -1280,11 +1285,11 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
          if (isEdgeFlagPassthrough(insn))
             info->io.edgeFlagIn = insn.getSrc(0).getIndex(0);
       } else
-      if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
-         if (insn.getDst(0).isIndirect(0))
-            indirectTempArrays.insert(insn.getDst(0).getArrayId());
+      if (dst.getFile() == TGSI_FILE_TEMPORARY) {
+         if (dst.isIndirect(0))
+            indirectTempArrays.insert(dst.getArrayId());
       } else
-      if (insn.getDst(0).getFile() == TGSI_FILE_BUFFER) {
+      if (dst.getFile() == TGSI_FILE_BUFFER) {
          info->io.globalAccess |= 0x2;
       }
    }
@@ -1419,8 +1424,8 @@ private:
    void handleLIT(Value *dst0[4]);
    void handleUserClipPlanes();
 
-   Symbol *getResourceBase(int r);
-   void getResourceCoords(std::vector<Value *>&, int r, int s);
+   // Symbol *getResourceBase(int r);
+   // void getResourceCoords(std::vector<Value *>&, int r, int s);
 
    void handleLOAD(Value *dst0[4]);
    void handleSTORE();
@@ -1527,8 +1532,21 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
 
    sym->reg.fileIndex = fileIdx;
 
-   if (tgsiFile == TGSI_FILE_MEMORY && code->memoryFiles[fileIdx].shared)
-      sym->setFile(FILE_MEMORY_SHARED);
+   if (tgsiFile == TGSI_FILE_MEMORY) {
+      switch (code->memoryFiles[fileIdx].mem_type) {
+      case TGSI_MEMORY_TYPE_SHARED:
+         sym->setFile(FILE_MEMORY_SHARED);
+         break;
+      case TGSI_MEMORY_TYPE_INPUT:
+         assert(prog->getType() == Program::TYPE_COMPUTE);
+         assert(idx == -1);
+         sym->setFile(FILE_SHADER_INPUT);
+         address += info->prop.cp.inputOffset;
+         break;
+      default:
+         assert(0); /* TODO: Add support for global and private memory */
+      }
+   }
 
    if (idx >= 0) {
       if (sym->reg.file == FILE_SHADER_INPUT)
@@ -1989,7 +2007,6 @@ Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask)
 void
 Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
 {
-   Value *val;
    Value *arg[4], *src[8];
    Value *lod = NULL, *shd = NULL;
    unsigned int s, c, d;
@@ -2032,17 +2049,6 @@ Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
          shd = src[n - 1];
    }
 
-   if (tgt.isCube()) {
-      for (c = 0; c < 3; ++c)
-         src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]);
-      val = getScratch();
-      mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
-      mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
-      mkOp1(OP_RCP, TYPE_F32, val, val);
-      for (c = 0; c < 3; ++c)
-         src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val);
-   }
-
    for (c = 0, d = 0; c < 4; ++c) {
       if (dst[c]) {
          texi->setDef(d++, dst[c]);
@@ -2148,6 +2154,7 @@ Converter::handleLIT(Value *dst0[4])
    }
 }
 
+/* Keep this around for now as reference when adding img support
 static inline bool
 isResourceSpecial(const int r)
 {
@@ -2178,7 +2185,8 @@ Converter::getResourceBase(const int r)
 
    switch (r) {
    case TGSI_RESOURCE_GLOBAL:
-      sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL, 15);
+      sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL,
+                       info->io.auxCBSlot);
       break;
    case TGSI_RESOURCE_LOCAL:
       assert(prog->getType() == Program::TYPE_COMPUTE);
@@ -2243,6 +2251,7 @@ partitionLoadStore(uint8_t comp[2], uint8_t size[2], uint8_t mask)
    }
    return n + 1;
 }
+*/
 
 // For raw loads, granularity is 4 byte.
 // Usage of the texture read mask on OP_SULDP is not allowed.
@@ -2253,8 +2262,9 @@ Converter::handleLOAD(Value *dst0[4])
    int c;
    std::vector<Value *> off, src, ldv, def;
 
-   if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER ||
-       tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) {
+   switch (tgsi.getSrc(0).getFile()) {
+   case TGSI_FILE_BUFFER:
+   case TGSI_FILE_MEMORY:
       for (c = 0; c < 4; ++c) {
          if (!dst0[c])
             continue;
@@ -2274,9 +2284,12 @@ Converter::handleLOAD(Value *dst0[4])
          if (tgsi.getSrc(0).isIndirect(0))
             ld->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
       }
-      return;
+      break;
+   default:
+      assert(!"Unsupported srcFile for LOAD");
    }
 
+/* Keep this around for now as reference when adding img support
    getResourceCoords(off, r, 1);
 
    if (isResourceRaw(code, r)) {
@@ -2342,6 +2355,7 @@ Converter::handleLOAD(Value *dst0[4])
    FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
       if (dst0[c] != def[c])
          mkMov(dst0[c], def[tgsi.getSrc(0).getSwizzle(c)]);
+*/
 }
 
 // For formatted stores, the write mask on OP_SUSTP can be used.
@@ -2353,8 +2367,9 @@ Converter::handleSTORE()
    int c;
    std::vector<Value *> off, src, dummy;
 
-   if (tgsi.getDst(0).getFile() == TGSI_FILE_BUFFER ||
-       tgsi.getDst(0).getFile() == TGSI_FILE_MEMORY) {
+   switch (tgsi.getDst(0).getFile()) {
+   case TGSI_FILE_BUFFER:
+   case TGSI_FILE_MEMORY:
       for (c = 0; c < 4; ++c) {
          if (!(tgsi.getDst(0).getMask() & (1 << c)))
             continue;
@@ -2375,9 +2390,12 @@ Converter::handleSTORE()
          if (tgsi.getDst(0).isIndirect(0))
             st->setIndirect(0, 1, fetchSrc(tgsi.getDst(0).getIndirect(0), 0, 0));
       }
-      return;
+      break;
+   default:
+      assert(!"Unsupported dstFile for STORE");
    }
 
+/* Keep this around for now as reference when adding img support
    getResourceCoords(off, r, 0);
    src = off;
    const int s = src.size();
@@ -2425,6 +2443,7 @@ Converter::handleSTORE()
       mkTex(OP_SUSTP, getResourceTarget(code, r), code->resources[r].slot, 0,
             dummy, src)->tex.mask = tgsi.getDst(0).getMask();
    }
+*/
 }
 
 // XXX: These only work on resources with the single-component u32/s32 formats.
@@ -2439,8 +2458,9 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
    std::vector<Value *> defv;
    LValue *dst = getScratch();
 
-   if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER ||
-       tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) {
+   switch (tgsi.getSrc(0).getFile()) {
+   case TGSI_FILE_BUFFER:
+   case TGSI_FILE_MEMORY:
       for (int c = 0; c < 4; ++c) {
          if (!dst0[c])
             continue;
@@ -2468,10 +2488,12 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
       for (int c = 0; c < 4; ++c)
          if (dst0[c])
             dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
-      return;
+      break;
+   default:
+      assert(!"Unsupported srcFile for ATOM");
    }
 
-
+/* Keep this around for now as reference when adding img support
    getResourceCoords(srcv, r, 1);
 
    if (isResourceSpecial(r)) {
@@ -2499,6 +2521,7 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
    for (int c = 0; c < 4; ++c)
       if (dst0[c])
          dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
+*/
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 0b903780614..a5deaef14e0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -67,6 +67,7 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
    tmp = bld.getScratch();
 
    for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
       // mov coordinates from lane l to all lanes
       bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
       for (c = 0; c < dim; ++c) {
@@ -92,10 +93,25 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
          add->lanes = 1; /* abused for .ndv */
       }
 
+      // normalize cube coordinates if necessary
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
+
       // texture
       bld.insert(tex = cloneForward(func, i));
       for (c = 0; c < dim; ++c)
-         tex->setSrc(c + array, crd[c]);
+         tex->setSrc(c + array, src[c]);
       bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
 
       // save results
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 12c5f699603..02c4f1a4ca8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -682,7 +682,7 @@ void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
                                        Value **ms_x, Value **ms_y) {
    // This loads the texture-indexed ms setting from the constant buffer
    Value *tmp = new_LValue(func, FILE_GPR);
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    off += prog->driver->io.suInfoBase;
    if (prog->getType() > Program::TYPE_VERTEX)
       off += 16 * 2 * 4;
@@ -724,6 +724,23 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i)
    const int dref = arg;
    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
 
+   /* Only normalize in the non-explicit derivatives case.
+    */
+   if (i->tex.target.isCube() && i->op != OP_TXD) {
+      Value *src[3], *val;
+      int c;
+      for (c = 0; c < 3; ++c)
+         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
+      val = bld.getScratch();
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+      for (c = 0; c < 3; ++c) {
+         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
+                                 i->getSrc(c), val));
+      }
+   }
+
    // handle MS, which means looking up the MS params for this texture, and
    // adjusting the input coordinates to point at the right sample.
    if (i->tex.target.isMS()) {
@@ -934,12 +951,14 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
 
    handleTEX(i);
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
+   i->tex.derivAll = true;
 
    for (c = 0; c < dim; ++c)
       crd[c] = bld.getScratch();
 
    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
    for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
       // mov coordinates from lane l to all lanes
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
@@ -949,10 +968,24 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
       // add dPdy from lane l to lanes dy
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // normalize cube coordinates if necessary
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
       // texture
       bld.insert(tex = cloneForward(func, i));
       for (c = 0; c < dim; ++c)
-         tex->setSrc(c, crd[c]);
+         tex->setSrc(c, src[c]);
       // save results
       for (c = 0; i->defExists(c); ++c) {
          Instruction *mov;
@@ -1174,7 +1207,7 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
       bld.mkLoad(TYPE_F32,
                  def,
                  bld.mkSymbol(
-                       FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
+                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                        TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
                  off);
       break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index d0936d88d60..e8f8e30918b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -600,7 +600,7 @@ NVC0LoweringPass::visit(BasicBlock *bb)
 inline Value *
 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    uint32_t off = prog->driver->io.texBindBase + slot * 4;
    return bld.
       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
@@ -615,6 +615,24 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
    const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
    const int chipset = prog->getTarget()->getChipset();
 
+   /* Only normalize in the non-explicit derivatives case. For explicit
+    * derivatives, this is handled in handleManualTXD.
+    */
+   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
+      Value *src[3], *val;
+      int c;
+      for (c = 0; c < 3; ++c)
+         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
+      val = bld.getScratch();
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+      for (c = 0; c < 3; ++c) {
+         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
+                                 i->getSrc(c), val));
+      }
+   }
+
    // Arguments to the TEX instruction are a little insane. Even though the
    // encoding is identical between SM20 and SM30, the arguments mean
    // different things between Fermi and Kepler+. A lot of arguments are
@@ -728,9 +746,13 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
       }
 
       Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
-      for (int s = dim; s >= 1; --s)
-         i->setSrc(s, i->getSrc(s - 1));
-      i->setSrc(0, arrayIndex);
+      if (arrayIndex) {
+         for (int s = dim; s >= 1; --s)
+            i->setSrc(s, i->getSrc(s - 1));
+         i->setSrc(0, arrayIndex);
+      } else {
+         i->moveSources(0, 1);
+      }
 
       if (arrayIndex) {
          int sat = (i->op == OP_TXF) ? 1 : 0;
@@ -861,6 +883,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
 
    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
    for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
       // mov coordinates from lane l to all lanes
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
@@ -870,10 +893,24 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
       // add dPdy from lane l to lanes dy
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // normalize cube coordinates
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
       // texture
       bld.insert(tex = cloneForward(func, i));
       for (c = 0; c < dim; ++c)
-         tex->setSrc(c + array, crd[c]);
+         tex->setSrc(c + array, src[c]);
       // save results
       for (c = 0; i->defExists(c); ++c) {
          Instruction *mov;
@@ -1098,6 +1135,7 @@ NVC0LoweringPass::handleSharedATOM(Instruction *atom)
          break;
       default:
          assert(0);
+         return;
       }
 
       Instruction *i =
@@ -1204,7 +1242,7 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 inline Value *
 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    off += prog->driver->io.suInfoBase;
    return bld.
       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
@@ -1213,7 +1251,7 @@ NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
 inline Value *
 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    off += prog->driver->io.suInfoBase;
 
    if (ptr)
@@ -1226,7 +1264,7 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
 inline Value *
 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    off += prog->driver->io.suInfoBase;
 
    if (ptr)
@@ -1540,7 +1578,7 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
       call->indirect = 1;
       call->absolute = 1;
       call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
-                                   prog->driver->io.resInfoCBSlot, TYPE_U32,
+                                   prog->driver->io.auxCBSlot, TYPE_U32,
                                    prog->driver->io.suInfoBase + base));
       call->setSrc(1, r[2]);
       call->setSrc(2, r[4]);
@@ -1698,7 +1736,8 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       }
       addr += prog->driver->prop.cp.gridInfoBase;
       bld.mkLoad(TYPE_U32, i->getDef(0),
-                 bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
+                 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+                              TYPE_U32, addr), NULL);
       break;
    case SV_SAMPLE_INDEX:
       // TODO: Properly pass source as an address in the PIX address space
@@ -1715,7 +1754,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       bld.mkLoad(TYPE_F32,
                  i->getDef(0),
                  bld.mkSymbol(
-                       FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
+                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                        TYPE_U32, prog->driver->io.sampleInfoBase +
                        4 * sym->reg.data.sv.index),
                  off);
@@ -1780,7 +1819,7 @@ NVC0LoweringPass::handleSQRT(Instruction *i)
 {
    if (i->dType == TYPE_F64) {
       Value *pred = bld.getSSA(1, FILE_PREDICATE);
-      Value *zero = bld.loadImm(NULL, 0.0d);
+      Value *zero = bld.loadImm(NULL, 0.0);
       Value *dst = bld.getSSA(8);
       bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
       bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index cfa85ec123c..066faa367d2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -204,6 +204,11 @@ static const char *ldstSubOpStr[] =
    "", "lock", "unlock"
 };
 
+static const char *subfmOpStr[] =
+{
+   "", "3d"
+};
+
 static const char *DataTypeStr[] =
 {
    "-",
@@ -548,6 +553,10 @@ void Instruction::print() const
          if (subOp < Elements(ldstSubOpStr))
             PRINT("%s ", ldstSubOpStr[subOp]);
          break;
+      case OP_SUBFM:
+         if (subOp < Elements(subfmOpStr))
+            PRINT("%s ", subfmOpStr[subOp]);
+         break;
       default:
          if (subOp)
             PRINT("(SUBOP:%u) ", subOp);
diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c
index cd44aa1e1d9..ca73fd17a43 100644
--- a/src/gallium/drivers/nouveau/nouveau_compiler.c
+++ b/src/gallium/drivers/nouveau/nouveau_compiler.c
@@ -114,8 +114,6 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[],
 
    info.io.auxCBSlot = 15;
    info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
-
-   info.io.resInfoCBSlot = 15;
    info.io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
    info.io.msInfoCBSlot = 15;
    info.io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
index 04488d6d0a6..d781f6fd7d4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -67,122 +67,94 @@ nv50_screen_compute_setup(struct nv50_screen *screen,
    if (ret)
       return ret;
 
-   BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   BEGIN_NV04(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
    PUSH_DATA (push, screen->compute->handle);
 
-   BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1);
+   BEGIN_NV04(push, NV50_CP(UNK02A0), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_STACK), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2);
+   BEGIN_NV04(push, NV50_CP(STACK_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->stack_bo->offset);
    PUSH_DATA (push, screen->stack_bo->offset);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1);
+   BEGIN_NV04(push, NV50_CP(STACK_SIZE_LOG), 1);
    PUSH_DATA (push, 4);
 
-   BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1);
+   BEGIN_NV04(push, NV50_CP(UNK0290), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1);
+   BEGIN_NV04(push, NV50_CP(LANES32_ENABLE), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1);
+   BEGIN_NV04(push, NV50_CP(REG_MODE), 1);
    PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);
-   BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1);
+   BEGIN_NV04(push, NV50_CP(UNK0384), 1);
    PUSH_DATA (push, 0x100);
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_GLOBAL), 1);
    PUSH_DATA (push, fifo->vram);
 
    for (i = 0; i < 15; i++) {
-      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2);
+      BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(i)), 2);
       PUSH_DATA (push, 0);
       PUSH_DATA (push, 0);
-      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1);
+      BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(i)), 1);
       PUSH_DATA (push, 0);
-      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1);
+      BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(i)), 1);
       PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
    }
 
-   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2);
+   BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(15)), 2);
    PUSH_DATA (push, 0);
    PUSH_DATA (push, 0);
-   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1);
+   BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(15)), 1);
    PUSH_DATA (push, ~0);
-   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1);
+   BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(15)), 1);
    PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
 
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1);
+   BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_LOG_ALLOC), 1);
    PUSH_DATA (push, 7);
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1);
+   BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_NO_CLAMP), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1);
+   BEGIN_NV04(push, NV50_CP(STACK_WARPS_LOG_ALLOC), 1);
    PUSH_DATA (push, 7);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1);
+   BEGIN_NV04(push, NV50_CP(STACK_WARPS_NO_CLAMP), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
    PUSH_DATA (push, 0);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_TEXTURE), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1);
+   BEGIN_NV04(push, NV50_CP(TEX_LIMITS), 1);
    PUSH_DATA (push, 0x54);
-   BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1);
+   BEGIN_NV04(push, NV50_CP(LINKED_TSC), 1);
    PUSH_DATA (push, 0);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_TIC), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3);
+   BEGIN_NV04(push, NV50_CP(TIC_ADDRESS_HIGH), 3);
    PUSH_DATAh(push, screen->txc->offset);
    PUSH_DATA (push, screen->txc->offset);
    PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_TSC), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3);
+   BEGIN_NV04(push, NV50_CP(TSC_ADDRESS_HIGH), 3);
    PUSH_DATAh(push, screen->txc->offset + 65536);
    PUSH_DATA (push, screen->txc->offset + 65536);
    PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_CODE_CB), 1);
    PUSH_DATA (push, fifo->vram);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_LOCAL), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2);
+   BEGIN_NV04(push, NV50_CP(LOCAL_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->tls_bo->offset + 65536);
    PUSH_DATA (push, screen->tls_bo->offset + 65536);
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1);
+   BEGIN_NV04(push, NV50_CP(LOCAL_SIZE_LOG), 1);
    PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));
 
    return 0;
 }
 
-static bool
-nv50_compute_validate_program(struct nv50_context *nv50)
-{
-   struct nv50_program *prog = nv50->compprog;
-
-   if (prog->mem)
-      return true;
-
-   if (!prog->translated) {
-      prog->translated = nv50_program_translate(
-         prog, nv50->screen->base.device->chipset, &nv50->base.debug);
-      if (!prog->translated)
-         return false;
-   }
-   if (unlikely(!prog->code_size))
-      return false;
-
-   if (likely(prog->code_size)) {
-      if (nv50_program_upload_code(nv50, prog)) {
-         struct nouveau_pushbuf *push = nv50->base.pushbuf;
-         BEGIN_NV04(push, NV50_COMPUTE(CODE_CB_FLUSH), 1);
-         PUSH_DATA (push, 0);
-         return true;
-      }
-   }
-   return false;
-}
-
 static void
 nv50_compute_validate_globals(struct nv50_context *nv50)
 {
@@ -198,26 +170,25 @@ nv50_compute_validate_globals(struct nv50_context *nv50)
    }
 }
 
+static struct nv50_state_validate
+validate_list_cp[] = {
+   { nv50_compprog_validate,              NV50_NEW_CP_PROGRAM     },
+   { nv50_compute_validate_globals,       NV50_NEW_CP_GLOBALS     },
+};
+
 static bool
-nv50_compute_state_validate(struct nv50_context *nv50)
+nv50_state_validate_cp(struct nv50_context *nv50, uint32_t mask)
 {
-   if (!nv50_compute_validate_program(nv50))
-      return false;
-
-   if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS)
-      nv50_compute_validate_globals(nv50);
+   bool ret;
 
    /* TODO: validate textures, samplers, surfaces */
+   ret = nv50_state_validate(nv50, mask, validate_list_cp,
+                             ARRAY_SIZE(validate_list_cp), &nv50->dirty_cp,
+                             nv50->bufctx_cp);
 
-   nv50_bufctx_fence(nv50->bufctx_cp, false);
-
-   nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_cp);
-   if (unlikely(nouveau_pushbuf_validate(nv50->base.pushbuf)))
-      return false;
    if (unlikely(nv50->state.flushed))
       nv50_bufctx_fence(nv50->bufctx_cp, true);
-
-   return true;
+   return ret;
 }
 
 static void
@@ -227,7 +198,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
    struct nouveau_pushbuf *push = screen->base.pushbuf;
    unsigned size = align(nv50->compprog->parm_size, 0x4);
 
-   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
    PUSH_DATA (push, (size / 4) << 8);
 
    if (size) {
@@ -245,7 +216,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
       nouveau_pushbuf_bufctx(push, nv50->bufctx);
       nouveau_pushbuf_validate(push);
 
-      BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4);
+      BEGIN_NV04(push, NV50_CP(USER_PARAM(0)), size / 4);
       nouveau_pushbuf_data(push, bo, offset, size);
 
       nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);
@@ -278,7 +249,7 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    struct nv50_program *cp = nv50->compprog;
    bool ret;
 
-   ret = !nv50_compute_state_validate(nv50);
+   ret = !nv50_state_validate_cp(nv50, ~0);
    if (ret) {
       NOUVEAU_ERR("Failed to launch grid !\n");
       return;
@@ -286,33 +257,33 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
 
    nv50_compute_upload_input(nv50, info->input);
 
-   BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1);
+   BEGIN_NV04(push, NV50_CP(CP_START_ID), 1);
    PUSH_DATA (push, nv50_compute_find_symbol(nv50, info->pc));
 
-   BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1);
+   BEGIN_NV04(push, NV50_CP(SHARED_SIZE), 1);
    PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40));
-   BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1);
+   BEGIN_NV04(push, NV50_CP(CP_REG_ALLOC_TEMP), 1);
    PUSH_DATA (push, cp->max_gpr);
 
    /* grid/block setup */
-   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2);
+   BEGIN_NV04(push, NV50_CP(BLOCKDIM_XY), 2);
    PUSH_DATA (push, info->block[1] << 16 | info->block[0]);
    PUSH_DATA (push, info->block[2]);
-   BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1);
+   BEGIN_NV04(push, NV50_CP(BLOCK_ALLOC), 1);
    PUSH_DATA (push, 1 << 16 | block_size);
-   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1);
+   BEGIN_NV04(push, NV50_CP(BLOCKDIM_LATCH), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1);
+   BEGIN_NV04(push, NV50_CP(GRIDDIM), 1);
    PUSH_DATA (push, info->grid[1] << 16 | info->grid[0]);
-   BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1);
+   BEGIN_NV04(push, NV50_CP(GRIDID), 1);
    PUSH_DATA (push, 1);
 
    /* kernel launching */
-   BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1);
+   BEGIN_NV04(push, NV50_CP(LAUNCH), 1);
    PUSH_DATA (push, 0);
-   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
    PUSH_DATA (push, 0);
 
    /* bind a compute shader clobbers fragment shader state */
-   nv50->dirty |= NV50_NEW_FRAGPROG;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 4874b77b1e1..61a52c4b366 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -176,8 +176,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
          if (nv50->framebuffer.cbufs[i] &&
              nv50->framebuffer.cbufs[i]->texture == res) {
-            nv50->dirty |= NV50_NEW_FRAMEBUFFER;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+            nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
             if (!--ref)
                return ref;
          }
@@ -186,8 +186,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
    if (bind & PIPE_BIND_DEPTH_STENCIL) {
       if (nv50->framebuffer.zsbuf &&
           nv50->framebuffer.zsbuf->texture == res) {
-         nv50->dirty |= NV50_NEW_FRAMEBUFFER;
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+         nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
          if (!--ref)
             return ref;
       }
@@ -202,8 +202,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
       for (i = 0; i < nv50->num_vtxbufs; ++i) {
          if (nv50->vtxbuf[i].buffer == res) {
-            nv50->dirty |= NV50_NEW_ARRAYS;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+            nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX);
             if (!--ref)
                return ref;
          }
@@ -211,8 +211,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
 
       if (nv50->idxbuf.buffer == res) {
          /* Just rebind to the bufctx as there is no separate dirty bit */
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX);
-         BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(res), RD);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
+         BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(res), RD);
          if (!--ref)
             return ref;
       }
@@ -222,8 +222,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       for (i = 0; i < nv50->num_textures[s]; ++i) {
          if (nv50->textures[s][i] &&
              nv50->textures[s][i]->texture == res) {
-            nv50->dirty |= NV50_NEW_TEXTURES;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+            nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
             if (!--ref)
                return ref;
          }
@@ -236,9 +236,9 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
             continue;
          if (!nv50->constbuf[s][i].user &&
              nv50->constbuf[s][i].u.buf == res) {
-            nv50->dirty |= NV50_NEW_CONSTBUF;
+            nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;
             nv50->constbuf_dirty[s] |= 1 << i;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i));
             if (!--ref)
                return ref;
          }
@@ -345,10 +345,10 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 
    flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
 
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->code);
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms);
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc);
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->code);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->uniforms);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->txc);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->stack_bo);
    if (screen->compute) {
       BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->code);
       BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->txc);
@@ -357,7 +357,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 
    flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
 
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->fence.bo);
    BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo);
    if (screen->compute)
       BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 2620d03b999..2317fa2ccf8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -26,43 +26,43 @@
 #include "nv50/nv50_3d.xml.h"
 #include "nv50/nv50_2d.xml.h"
 
-#define NV50_NEW_BLEND        (1 << 0)
-#define NV50_NEW_RASTERIZER   (1 << 1)
-#define NV50_NEW_ZSA          (1 << 2)
-#define NV50_NEW_VERTPROG     (1 << 3)
-#define NV50_NEW_GMTYPROG     (1 << 6)
-#define NV50_NEW_FRAGPROG     (1 << 7)
-#define NV50_NEW_BLEND_COLOUR (1 << 8)
-#define NV50_NEW_STENCIL_REF  (1 << 9)
-#define NV50_NEW_CLIP         (1 << 10)
-#define NV50_NEW_SAMPLE_MASK  (1 << 11)
-#define NV50_NEW_FRAMEBUFFER  (1 << 12)
-#define NV50_NEW_STIPPLE      (1 << 13)
-#define NV50_NEW_SCISSOR      (1 << 14)
-#define NV50_NEW_VIEWPORT     (1 << 15)
-#define NV50_NEW_ARRAYS       (1 << 16)
-#define NV50_NEW_VERTEX       (1 << 17)
-#define NV50_NEW_CONSTBUF     (1 << 18)
-#define NV50_NEW_TEXTURES     (1 << 19)
-#define NV50_NEW_SAMPLERS     (1 << 20)
-#define NV50_NEW_STRMOUT      (1 << 21)
-#define NV50_NEW_MIN_SAMPLES  (1 << 22)
-#define NV50_NEW_CONTEXT      (1 << 31)
+#define NV50_NEW_3D_BLEND        (1 << 0)
+#define NV50_NEW_3D_RASTERIZER   (1 << 1)
+#define NV50_NEW_3D_ZSA          (1 << 2)
+#define NV50_NEW_3D_VERTPROG     (1 << 3)
+#define NV50_NEW_3D_GMTYPROG     (1 << 6)
+#define NV50_NEW_3D_FRAGPROG     (1 << 7)
+#define NV50_NEW_3D_BLEND_COLOUR (1 << 8)
+#define NV50_NEW_3D_STENCIL_REF  (1 << 9)
+#define NV50_NEW_3D_CLIP         (1 << 10)
+#define NV50_NEW_3D_SAMPLE_MASK  (1 << 11)
+#define NV50_NEW_3D_FRAMEBUFFER  (1 << 12)
+#define NV50_NEW_3D_STIPPLE      (1 << 13)
+#define NV50_NEW_3D_SCISSOR      (1 << 14)
+#define NV50_NEW_3D_VIEWPORT     (1 << 15)
+#define NV50_NEW_3D_ARRAYS       (1 << 16)
+#define NV50_NEW_3D_VERTEX       (1 << 17)
+#define NV50_NEW_3D_CONSTBUF     (1 << 18)
+#define NV50_NEW_3D_TEXTURES     (1 << 19)
+#define NV50_NEW_3D_SAMPLERS     (1 << 20)
+#define NV50_NEW_3D_STRMOUT      (1 << 21)
+#define NV50_NEW_3D_MIN_SAMPLES  (1 << 22)
+#define NV50_NEW_3D_CONTEXT      (1 << 31)
 
 #define NV50_NEW_CP_PROGRAM   (1 << 0)
 #define NV50_NEW_CP_GLOBALS   (1 << 1)
 
 /* 3d bufctx (during draw_vbo, blit_3d) */
-#define NV50_BIND_FB          0
-#define NV50_BIND_VERTEX      1
-#define NV50_BIND_VERTEX_TMP  2
-#define NV50_BIND_INDEX       3
-#define NV50_BIND_TEXTURES    4
-#define NV50_BIND_CB(s, i)   (5 + 16 * (s) + (i))
-#define NV50_BIND_SO         53
-#define NV50_BIND_SCREEN     54
-#define NV50_BIND_TLS        55
-#define NV50_BIND_3D_COUNT   56
+#define NV50_BIND_3D_FB          0
+#define NV50_BIND_3D_VERTEX      1
+#define NV50_BIND_3D_VERTEX_TMP  2
+#define NV50_BIND_3D_INDEX       3
+#define NV50_BIND_3D_TEXTURES    4
+#define NV50_BIND_3D_CB(s, i)   (5 + 16 * (s) + (i))
+#define NV50_BIND_3D_SO         53
+#define NV50_BIND_3D_SCREEN     54
+#define NV50_BIND_3D_TLS        55
+#define NV50_BIND_3D_COUNT      56
 
 /* compute bufctx (during launch_grid) */
 #define NV50_BIND_CP_GLOBAL   0
@@ -115,7 +115,7 @@ struct nv50_context {
    struct nouveau_bufctx *bufctx;
    struct nouveau_bufctx *bufctx_cp;
 
-   uint32_t dirty;
+   uint32_t dirty_3d; /* dirty flags for 3d state */
    uint32_t dirty_cp; /* dirty flags for compute state */
    bool cb_dirty;
 
@@ -221,6 +221,7 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
 void nv50_vertprog_validate(struct nv50_context *);
 void nv50_gmtyprog_validate(struct nv50_context *);
 void nv50_fragprog_validate(struct nv50_context *);
+void nv50_compprog_validate(struct nv50_context *);
 void nv50_fp_linkage_validate(struct nv50_context *);
 void nv50_gp_linkage_validate(struct nv50_context *);
 void nv50_constbufs_validate(struct nv50_context *);
@@ -231,7 +232,15 @@ void nv50_stream_output_validate(struct nv50_context *);
 extern void nv50_init_state_functions(struct nv50_context *);
 
 /* nv50_state_validate.c */
-bool nv50_state_validate(struct nv50_context *, uint32_t state_mask);
+struct nv50_state_validate {
+   void (*func)(struct nv50_context *);
+   uint32_t states;
+};
+
+bool nv50_state_validate(struct nv50_context *, uint32_t,
+                         struct nv50_state_validate *, int, uint32_t *,
+                         struct nouveau_bufctx *);
+bool nv50_state_validate_3d(struct nv50_context *, uint32_t);
 
 /* nv50_surface.c */
 extern void nv50_clear(struct pipe_context *, unsigned buffers,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index a67ef28abf8..3444b3110de 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -335,7 +335,6 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
    info->io.genUserClip = prog->vp.clpd_nr;
 
-   info->io.resInfoCBSlot = 15;
    info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
    info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
    info->io.msInfoCBSlot = 15;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
index be19c0fdc85..0a73090d78d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
@@ -202,10 +202,10 @@ nv50_hw_sm_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
       func = nv50_hw_sm_get_func(c);
 
       /* configure and reset the counter(s) */
-      BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
+      BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1);
       PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
                         | cfg->ctr[i].unit | cfg->ctr[i].mode);
-      BEGIN_NV04(push, NV50_COMPUTE(MP_PM_SET(c)), 1);
+      BEGIN_NV04(push, NV50_CP(MP_PM_SET(c)), 1);
       PUSH_DATA (push, 0);
    }
    return true;
@@ -240,7 +240,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
    PUSH_SPACE(push, 8);
    for (c = 0; c < 4; c++) {
       if (screen->pm.mp_counter[c]) {
-         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
+         BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1);
          PUSH_DATA (push, 0);
       }
    }
@@ -257,7 +257,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
                 hq->bo);
 
    PUSH_SPACE(push, 2);
-   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
    PUSH_DATA (push, 0);
 
    pipe->bind_compute_state(pipe, screen->pm.prog);
@@ -295,7 +295,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
          mask |= 1 << hsq->ctr[i];
          func  = nv50_hw_sm_get_func(hsq->ctr[i]);
 
-         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(hsq->ctr[i])), 1);
+         BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(hsq->ctr[i])), 1);
          PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
                     | cfg->ctr[i].unit | cfg->ctr[i].mode);
       }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 8e4b2b42bda..3d2ebfbcc46 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -29,6 +29,8 @@
 #include "nv50/nv50_context.h"
 #include "nv50/nv50_query_hw.h"
 
+#include "nv50/nv50_compute.xml.h"
+
 void
 nv50_constbufs_validate(struct nv50_context *nv50)
 {
@@ -94,7 +96,7 @@ nv50_constbufs_validate(struct nv50_context *nv50)
                BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
                PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
 
-               BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD);
+               BCTX_REFN(nv50->bufctx_3d, 3D_CB(s, i), res, RD);
 
                nv50->cb_dirty = 1; /* Force cache flush for UBO. */
             } else {
@@ -131,14 +133,14 @@ nv50_program_update_context_state(struct nv50_context *nv50,
 
    if (prog && prog->tls_space) {
       if (nv50->state.new_tls_space)
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS);
       if (!nv50->state.tls_required || nv50->state.new_tls_space)
-         BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo);
+         BCTX_REFN_bo(nv50->bufctx_3d, 3D_TLS, flags, nv50->screen->tls_bo);
       nv50->state.new_tls_space = false;
       nv50->state.tls_required |= 1 << stage;
    } else {
       if (nv50->state.tls_required == (1 << stage))
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS);
       nv50->state.tls_required &= ~(1 << stage);
    }
 }
@@ -181,7 +183,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
       fp->fp.force_persample_interp = rast->force_persample_interp;
    }
 
-   if (fp->mem && !(nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_MIN_SAMPLES)))
+   if (fp->mem && !(nv50->dirty_3d & (NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_MIN_SAMPLES)))
       return;
 
    if (!nv50_program_validate(nv50, fp))
@@ -238,6 +240,19 @@ nv50_gmtyprog_validate(struct nv50_context *nv50)
    /* GP_ENABLE is updated in linkage validation */
 }
 
+void
+nv50_compprog_validate(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_program *cp = nv50->compprog;
+
+   if (cp && !nv50_program_validate(nv50, cp))
+      return;
+
+   BEGIN_NV04(push, NV50_CP(CODE_CB_FLUSH), 1);
+   PUSH_DATA (push, 0);
+}
+
 static void
 nv50_sprite_coords_validate(struct nv50_context *nv50)
 {
@@ -309,7 +324,7 @@ nv50_validate_derived_rs(struct nv50_context *nv50)
       PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard);
    }
 
-   if (nv50->dirty & NV50_NEW_FRAGPROG)
+   if (nv50->dirty_3d & NV50_NEW_3D_FRAGPROG)
       return;
    psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
    color = nv50->state.semantic_color & ~NV50_3D_SEMANTIC_COLOR_CLMP_EN;
@@ -378,9 +393,9 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    uint8_t map[64];
    uint8_t so_map[64];
 
-   if (!(nv50->dirty & (NV50_NEW_VERTPROG |
-                        NV50_NEW_FRAGPROG |
-                        NV50_NEW_GMTYPROG))) {
+   if (!(nv50->dirty_3d & (NV50_NEW_3D_VERTPROG |
+                           NV50_NEW_3D_FRAGPROG |
+                           NV50_NEW_3D_GMTYPROG))) {
       uint8_t bfc, ffc;
       ffc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_FFC0_ID__MASK);
       bfc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_BFC0_ID__MASK)
@@ -633,8 +648,6 @@ nv50_stream_output_validate(struct nv50_context *nv50)
    BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
    PUSH_DATA (push, ctrl);
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO);
-
    for (i = 0; i < nv50->num_so_targets; ++i) {
       struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
       struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
@@ -664,7 +677,7 @@ nv50_stream_output_validate(struct nv50_context *nv50)
          prims = MIN2(prims, limit);
       }
       targ->stride = so->stride[i];
-      BCTX_REFN(nv50->bufctx_3d, SO, buf, WR);
+      BCTX_REFN(nv50->bufctx_3d, 3D_SO, buf, WR);
    }
    if (prims != ~0) {
       BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 8504ba466cc..86e74d68b11 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -200,7 +200,7 @@ nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->blend = hwcso;
-   nv50->dirty |= NV50_NEW_BLEND;
+   nv50->dirty_3d |= NV50_NEW_3D_BLEND;
 }
 
 static void
@@ -337,7 +337,7 @@ nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->rast = hwcso;
-   nv50->dirty |= NV50_NEW_RASTERIZER;
+   nv50->dirty_3d |= NV50_NEW_3D_RASTERIZER;
 }
 
 static void
@@ -426,7 +426,7 @@ nv50_zsa_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->zsa = hwcso;
-   nv50->dirty |= NV50_NEW_ZSA;
+   nv50->dirty_3d |= NV50_NEW_3D_ZSA;
 }
 
 static void
@@ -605,7 +605,7 @@ nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s,
 
    nv50->num_samplers[s] = nr;
 
-   nv50->dirty |= NV50_NEW_SAMPLERS;
+   nv50->dirty_3d |= NV50_NEW_3D_SAMPLERS;
 }
 
 static void
@@ -698,9 +698,9 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
 
    nv50->num_textures[s] = nr;
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
 
-   nv50->dirty |= NV50_NEW_TEXTURES;
+   nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
 }
 
 static void
@@ -776,7 +776,7 @@ nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso)
     struct nv50_context *nv50 = nv50_context(pipe);
 
     nv50->vertprog = hwcso;
-    nv50->dirty |= NV50_NEW_VERTPROG;
+    nv50->dirty_3d |= NV50_NEW_3D_VERTPROG;
 }
 
 static void *
@@ -792,7 +792,7 @@ nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso)
     struct nv50_context *nv50 = nv50_context(pipe);
 
     nv50->fragprog = hwcso;
-    nv50->dirty |= NV50_NEW_FRAGPROG;
+    nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
 }
 
 static void *
@@ -808,7 +808,7 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
     struct nv50_context *nv50 = nv50_context(pipe);
 
     nv50->gmtyprog = hwcso;
-    nv50->dirty |= NV50_NEW_GMTYPROG;
+    nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG;
 }
 
 static void *
@@ -857,7 +857,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nv50->constbuf[s][i].u.buf = NULL;
    else
    if (nv50->constbuf[s][i].u.buf)
-      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i));
 
    pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res);
 
@@ -882,7 +882,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
    }
    nv50->constbuf_dirty[s] |= 1 << i;
 
-   nv50->dirty |= NV50_NEW_CONSTBUF;
+   nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;
 }
 
 /* =============================================================================
@@ -895,7 +895,7 @@ nv50_set_blend_color(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->blend_colour = *bcol;
-   nv50->dirty |= NV50_NEW_BLEND_COLOUR;
+   nv50->dirty_3d |= NV50_NEW_3D_BLEND_COLOUR;
 }
 
 static void
@@ -905,7 +905,7 @@ nv50_set_stencil_ref(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->stencil_ref = *sr;
-   nv50->dirty |= NV50_NEW_STENCIL_REF;
+   nv50->dirty_3d |= NV50_NEW_3D_STENCIL_REF;
 }
 
 static void
@@ -916,7 +916,7 @@ nv50_set_clip_state(struct pipe_context *pipe,
 
    memcpy(nv50->clip.ucp, clip->ucp, sizeof(clip->ucp));
 
-   nv50->dirty |= NV50_NEW_CLIP;
+   nv50->dirty_3d |= NV50_NEW_3D_CLIP;
 }
 
 static void
@@ -925,7 +925,7 @@ nv50_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->sample_mask = sample_mask;
-   nv50->dirty |= NV50_NEW_SAMPLE_MASK;
+   nv50->dirty_3d |= NV50_NEW_3D_SAMPLE_MASK;
 }
 
 static void
@@ -935,7 +935,7 @@ nv50_set_min_samples(struct pipe_context *pipe, unsigned min_samples)
 
    if (nv50->min_samples != min_samples) {
       nv50->min_samples = min_samples;
-      nv50->dirty |= NV50_NEW_MIN_SAMPLES;
+      nv50->dirty_3d |= NV50_NEW_3D_MIN_SAMPLES;
    }
 }
 
@@ -945,11 +945,11 @@ nv50_set_framebuffer_state(struct pipe_context *pipe,
 {
    struct nv50_context *nv50 = nv50_context(pipe);
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
 
    util_copy_framebuffer_state(&nv50->framebuffer, fb);
 
-   nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
 }
 
 static void
@@ -959,7 +959,7 @@ nv50_set_polygon_stipple(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->stipple = *stipple;
-   nv50->dirty |= NV50_NEW_STIPPLE;
+   nv50->dirty_3d |= NV50_NEW_3D_STIPPLE;
 }
 
 static void
@@ -977,7 +977,7 @@ nv50_set_scissor_states(struct pipe_context *pipe,
          continue;
       nv50->scissors[start_slot + i] = scissor[i];
       nv50->scissors_dirty |= 1 << (start_slot + i);
-      nv50->dirty |= NV50_NEW_SCISSOR;
+      nv50->dirty_3d |= NV50_NEW_3D_SCISSOR;
    }
 }
 
@@ -996,7 +996,7 @@ nv50_set_viewport_states(struct pipe_context *pipe,
          continue;
       nv50->viewports[start_slot + i] = vpt[i];
       nv50->viewports_dirty |= 1 << (start_slot + i);
-      nv50->dirty |= NV50_NEW_VIEWPORT;
+      nv50->dirty_3d |= NV50_NEW_3D_VIEWPORT;
    }
 }
 
@@ -1008,8 +1008,8 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
    unsigned i;
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
-   nv50->dirty |= NV50_NEW_ARRAYS;
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX);
+   nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
 
    util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
                                  start_slot, count);
@@ -1051,14 +1051,14 @@ nv50_set_index_buffer(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    if (nv50->idxbuf.buffer)
-      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX);
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
 
    if (ib) {
       pipe_resource_reference(&nv50->idxbuf.buffer, ib->buffer);
       nv50->idxbuf.index_size = ib->index_size;
       if (ib->buffer) {
          nv50->idxbuf.offset = ib->offset;
-         BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(ib->buffer), RD);
+         BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(ib->buffer), RD);
       } else {
          nv50->idxbuf.user_buffer = ib->user_buffer;
       }
@@ -1073,7 +1073,7 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->vertex = hwcso;
-   nv50->dirty |= NV50_NEW_VERTEX;
+   nv50->dirty_3d |= NV50_NEW_3D_VERTEX;
 }
 
 static struct pipe_stream_output_target *
@@ -1180,8 +1180,10 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
    }
    nv50->num_so_targets = num_targets;
 
-   if (nv50->so_targets_dirty)
-      nv50->dirty |= NV50_NEW_STRMOUT;
+   if (nv50->so_targets_dirty) {
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_SO);
+      nv50->dirty_3d |= NV50_NEW_3D_STRMOUT;
+   }
 }
 
 static void
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 55369781606..51204930031 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -25,7 +25,7 @@ nv50_validate_fb(struct nv50_context *nv50)
    unsigned ms_mode = NV50_3D_MULTISAMPLE_MODE_MS1;
    uint32_t array_size = 0xffff, array_mode = 0;
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
 
    BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
    PUSH_DATA (push, (076543210 << 4) | fb->nr_cbufs);
@@ -90,7 +90,7 @@ nv50_validate_fb(struct nv50_context *nv50)
       mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
       /* only register for writing, otherwise we'd always serialize here */
-      BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
+      BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR);
    }
 
    if (fb->zsbuf) {
@@ -118,7 +118,7 @@ nv50_validate_fb(struct nv50_context *nv50)
       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
       mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
-      BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
+      BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR);
    } else {
       BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
       PUSH_DATA (push, 0);
@@ -187,8 +187,8 @@ nv50_validate_scissor(struct nv50_context *nv50)
 #ifdef NV50_SCISSORS_CLIPPING
    int minx, maxx, miny, maxy, i;
 
-   if (!(nv50->dirty &
-         (NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT | NV50_NEW_FRAMEBUFFER)) &&
+   if (!(nv50->dirty_3d &
+         (NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT | NV50_NEW_3D_FRAMEBUFFER)) &&
        nv50->state.scissor == nv50->rast->pipe.scissor)
       return;
 
@@ -197,7 +197,7 @@ nv50_validate_scissor(struct nv50_context *nv50)
 
    nv50->state.scissor = nv50->rast->pipe.scissor;
 
-   if ((nv50->dirty & NV50_NEW_FRAMEBUFFER) && !nv50->state.scissor)
+   if ((nv50->dirty_3d & NV50_NEW_3D_FRAMEBUFFER) && !nv50->state.scissor)
       nv50->scissors_dirty = (1 << NV50_MAX_VIEWPORTS) - 1;
 
    for (i = 0; i < NV50_MAX_VIEWPORTS; i++) {
@@ -290,10 +290,10 @@ nv50_check_program_ucps(struct nv50_context *nv50,
 
    vp->vp.clpd_nr = n;
    if (likely(vp == nv50->vertprog)) {
-      nv50->dirty |= NV50_NEW_VERTPROG;
+      nv50->dirty_3d |= NV50_NEW_3D_VERTPROG;
       nv50_vertprog_validate(nv50);
    } else {
-      nv50->dirty |= NV50_NEW_GMTYPROG;
+      nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG;
       nv50_gmtyprog_validate(nv50);
    }
    nv50_fp_linkage_validate(nv50);
@@ -342,7 +342,7 @@ nv50_validate_clip(struct nv50_context *nv50)
    struct nv50_program *vp;
    uint8_t clip_enable;
 
-   if (nv50->dirty & NV50_NEW_CLIP) {
+   if (nv50->dirty_3d & NV50_NEW_3D_CLIP) {
       BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
       PUSH_DATA (push, (NV50_CB_AUX_UCP_OFFSET << 8) | NV50_CB_AUX);
       BEGIN_NI04(push, NV50_3D(CB_DATA(0)), PIPE_MAX_CLIP_PLANES * 4);
@@ -436,7 +436,8 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
    else
       ctx_to->state = ctx_to->screen->save_state;
 
-   ctx_to->dirty = ~0;
+   ctx_to->dirty_3d = ~0;
+   ctx_to->dirty_cp = ~0;
    ctx_to->viewports_dirty = ~0;
    ctx_to->scissors_dirty = ~0;
 
@@ -445,71 +446,71 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
    ctx_to->constbuf_dirty[2] = (1 << NV50_MAX_PIPE_CONSTBUFS) - 1;
 
    if (!ctx_to->vertex)
-      ctx_to->dirty &= ~(NV50_NEW_VERTEX | NV50_NEW_ARRAYS);
+      ctx_to->dirty_3d &= ~(NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS);
 
    if (!ctx_to->vertprog)
-      ctx_to->dirty &= ~NV50_NEW_VERTPROG;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_VERTPROG;
    if (!ctx_to->fragprog)
-      ctx_to->dirty &= ~NV50_NEW_FRAGPROG;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_FRAGPROG;
 
    if (!ctx_to->blend)
-      ctx_to->dirty &= ~NV50_NEW_BLEND;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_BLEND;
    if (!ctx_to->rast)
 #ifdef NV50_SCISSORS_CLIPPING
-      ctx_to->dirty &= ~(NV50_NEW_RASTERIZER | NV50_NEW_SCISSOR);
+      ctx_to->dirty_3d &= ~(NV50_NEW_3D_RASTERIZER | NV50_NEW_3D_SCISSOR);
 #else
-      ctx_to->dirty &= ~NV50_NEW_RASTERIZER;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_RASTERIZER;
 #endif
    if (!ctx_to->zsa)
-      ctx_to->dirty &= ~NV50_NEW_ZSA;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_ZSA;
 
    ctx_to->screen->cur_ctx = ctx_to;
 }
 
-static struct state_validate {
-    void (*func)(struct nv50_context *);
-    uint32_t states;
-} validate_list[] = {
-    { nv50_validate_fb,            NV50_NEW_FRAMEBUFFER },
-    { nv50_validate_blend,         NV50_NEW_BLEND },
-    { nv50_validate_zsa,           NV50_NEW_ZSA },
-    { nv50_validate_sample_mask,   NV50_NEW_SAMPLE_MASK },
-    { nv50_validate_rasterizer,    NV50_NEW_RASTERIZER },
-    { nv50_validate_blend_colour,  NV50_NEW_BLEND_COLOUR },
-    { nv50_validate_stencil_ref,   NV50_NEW_STENCIL_REF },
-    { nv50_validate_stipple,       NV50_NEW_STIPPLE },
+static struct nv50_state_validate
+validate_list_3d[] = {
+    { nv50_validate_fb,            NV50_NEW_3D_FRAMEBUFFER },
+    { nv50_validate_blend,         NV50_NEW_3D_BLEND },
+    { nv50_validate_zsa,           NV50_NEW_3D_ZSA },
+    { nv50_validate_sample_mask,   NV50_NEW_3D_SAMPLE_MASK },
+    { nv50_validate_rasterizer,    NV50_NEW_3D_RASTERIZER },
+    { nv50_validate_blend_colour,  NV50_NEW_3D_BLEND_COLOUR },
+    { nv50_validate_stencil_ref,   NV50_NEW_3D_STENCIL_REF },
+    { nv50_validate_stipple,       NV50_NEW_3D_STIPPLE },
 #ifdef NV50_SCISSORS_CLIPPING
-    { nv50_validate_scissor,       NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT |
-                                   NV50_NEW_RASTERIZER |
-                                   NV50_NEW_FRAMEBUFFER },
+    { nv50_validate_scissor,       NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT |
+                                   NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_FRAMEBUFFER },
 #else
-    { nv50_validate_scissor,       NV50_NEW_SCISSOR },
+    { nv50_validate_scissor,       NV50_NEW_3D_SCISSOR },
 #endif
-    { nv50_validate_viewport,      NV50_NEW_VIEWPORT },
-    { nv50_vertprog_validate,      NV50_NEW_VERTPROG },
-    { nv50_gmtyprog_validate,      NV50_NEW_GMTYPROG },
-    { nv50_fragprog_validate,      NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
-                                   NV50_NEW_MIN_SAMPLES },
-    { nv50_fp_linkage_validate,    NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
-                                   NV50_NEW_GMTYPROG | NV50_NEW_RASTERIZER },
-    { nv50_gp_linkage_validate,    NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG },
-    { nv50_validate_derived_rs,    NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
-                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_validate_derived_2,     NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER },
-    { nv50_validate_derived_3,     NV50_NEW_BLEND | NV50_NEW_FRAMEBUFFER },
-    { nv50_validate_clip,          NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
-                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
-    { nv50_validate_textures,      NV50_NEW_TEXTURES },
-    { nv50_validate_samplers,      NV50_NEW_SAMPLERS },
-    { nv50_stream_output_validate, NV50_NEW_STRMOUT |
-                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS },
-    { nv50_validate_min_samples,   NV50_NEW_MIN_SAMPLES },
+    { nv50_validate_viewport,      NV50_NEW_3D_VIEWPORT },
+    { nv50_vertprog_validate,      NV50_NEW_3D_VERTPROG },
+    { nv50_gmtyprog_validate,      NV50_NEW_3D_GMTYPROG },
+    { nv50_fragprog_validate,      NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_MIN_SAMPLES },
+    { nv50_fp_linkage_validate,    NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_VERTPROG |
+                                   NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_RASTERIZER },
+    { nv50_gp_linkage_validate,    NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_VERTPROG },
+    { nv50_validate_derived_rs,    NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
+    { nv50_validate_derived_2,     NV50_NEW_3D_ZSA | NV50_NEW_3D_FRAMEBUFFER },
+    { nv50_validate_derived_3,     NV50_NEW_3D_BLEND | NV50_NEW_3D_FRAMEBUFFER },
+    { nv50_validate_clip,          NV50_NEW_3D_CLIP | NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
+    { nv50_constbufs_validate,     NV50_NEW_3D_CONSTBUF },
+    { nv50_validate_textures,      NV50_NEW_3D_TEXTURES },
+    { nv50_validate_samplers,      NV50_NEW_3D_SAMPLERS },
+    { nv50_stream_output_validate, NV50_NEW_3D_STRMOUT |
+                                   NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
+    { nv50_vertex_arrays_validate, NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS },
+    { nv50_validate_min_samples,   NV50_NEW_3D_MIN_SAMPLES },
 };
 
 bool
-nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
+nv50_state_validate(struct nv50_context *nv50, uint32_t mask,
+                    struct nv50_state_validate *validate_list, int size,
+                    uint32_t *dirty, struct nouveau_bufctx *bufctx)
 {
    uint32_t state_mask;
    int ret;
@@ -518,16 +519,16 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
    if (nv50->screen->cur_ctx != nv50)
       nv50_switch_pipe_context(nv50);
 
-   state_mask = nv50->dirty & mask;
+   state_mask = *dirty & mask;
 
    if (state_mask) {
-      for (i = 0; i < ARRAY_SIZE(validate_list); ++i) {
-         struct state_validate *validate = &validate_list[i];
+      for (i = 0; i < size; i++) {
+         struct nv50_state_validate *validate = &validate_list[i];
 
          if (state_mask & validate->states)
             validate->func(nv50);
       }
-      nv50->dirty &= ~state_mask;
+      *dirty &= ~state_mask;
 
       if (nv50->state.rt_serialize) {
          nv50->state.rt_serialize = false;
@@ -535,14 +536,26 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
          PUSH_DATA (nv50->base.pushbuf, 0);
       }
 
-      nv50_bufctx_fence(nv50->bufctx_3d, false);
+      nv50_bufctx_fence(bufctx, false);
    }
-   nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d);
+   nouveau_pushbuf_bufctx(nv50->base.pushbuf, bufctx);
    ret = nouveau_pushbuf_validate(nv50->base.pushbuf);
 
+   return !ret;
+}
+
+bool
+nv50_state_validate_3d(struct nv50_context *nv50, uint32_t mask)
+{
+   bool ret;
+
+   ret = nv50_state_validate(nv50, mask, validate_list_3d,
+                             ARRAY_SIZE(validate_list_3d), &nv50->dirty_3d,
+                             nv50->bufctx_3d);
+
    if (unlikely(nv50->state.flushed)) {
       nv50->state.flushed = false;
       nv50_bufctx_fence(nv50->bufctx_3d, true);
    }
-   return !ret;
+   return ret;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 84646f6adb1..68b0e18ef8f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -353,7 +353,7 @@ nv50_clear_render_target(struct pipe_context *pipe,
    BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
    PUSH_DATA (push, nv50->cond_condmode);
 
-   nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR;
 }
 
 static void
@@ -436,7 +436,7 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
    BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
    PUSH_DATA (push, nv50->cond_condmode);
 
-   nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR;
 }
 
 void
@@ -525,7 +525,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
    uint32_t mode = 0;
 
    /* don't need NEW_BLEND, COLOR_MASK doesn't affect CLEAR_BUFFERS */
-   if (!nv50_state_validate(nv50, NV50_NEW_FRAMEBUFFER))
+   if (!nv50_state_validate_3d(nv50, NV50_NEW_3D_FRAMEBUFFER))
       return;
 
    /* We have to clear ALL of the layers, not up to the min number of layers
@@ -798,7 +798,7 @@ nv50_clear_buffer(struct pipe_context *pipe,
                              data, data_size);
    }
 
-   nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR;
 }
 
 /* =============================== BLIT CODE ===================================
@@ -834,7 +834,7 @@ struct nv50_blitctx
       struct pipe_sampler_view *texture[2];
       struct nv50_tsc_entry *sampler[2];
       unsigned min_samples;
-      uint32_t dirty;
+      uint32_t dirty_3d;
    } saved;
    struct nv50_rasterizer_stateobj rast;
 };
@@ -1253,15 +1253,15 @@ nv50_blitctx_pre_blit(struct nv50_blitctx *ctx)
 
    nv50->min_samples = 1;
 
-   ctx->saved.dirty = nv50->dirty;
+   ctx->saved.dirty_3d = nv50->dirty_3d;
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
 
-   nv50->dirty =
-      NV50_NEW_FRAMEBUFFER | NV50_NEW_MIN_SAMPLES |
-      NV50_NEW_VERTPROG | NV50_NEW_FRAGPROG | NV50_NEW_GMTYPROG |
-      NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS;
+   nv50->dirty_3d =
+      NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_MIN_SAMPLES |
+      NV50_NEW_3D_VERTPROG | NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_GMTYPROG |
+      NV50_NEW_3D_TEXTURES | NV50_NEW_3D_SAMPLERS;
 }
 
 static void
@@ -1302,14 +1302,14 @@ nv50_blitctx_post_blit(struct nv50_blitctx *blit)
       nv50->base.pipe.render_condition(&nv50->base.pipe, nv50->cond_query,
                                        nv50->cond_cond, nv50->cond_mode);
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
 
-   nv50->dirty = blit->saved.dirty |
-      (NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR | NV50_NEW_SAMPLE_MASK |
-       NV50_NEW_RASTERIZER | NV50_NEW_ZSA | NV50_NEW_BLEND |
-       NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS |
-       NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG | NV50_NEW_FRAGPROG);
+   nv50->dirty_3d = blit->saved.dirty_3d |
+      (NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR | NV50_NEW_3D_SAMPLE_MASK |
+       NV50_NEW_3D_RASTERIZER | NV50_NEW_3D_ZSA | NV50_NEW_3D_BLEND |
+       NV50_NEW_3D_TEXTURES | NV50_NEW_3D_SAMPLERS |
+       NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_FRAGPROG);
    nv50->scissors_dirty |= 1;
 
    nv50->base.pipe.set_min_samples(&nv50->base.pipe, blit->saved.min_samples);
@@ -1344,7 +1344,7 @@ nv50_blit_3d(struct nv50_context *nv50, const struct pipe_blit_info *info)
 
    nv50_blitctx_prepare_state(blit);
 
-   nv50_state_validate(nv50, ~0);
+   nv50_state_validate_3d(nv50, ~0);
 
    x_range = (float)info->src.box.width / (float)info->dst.box.width;
    y_range = (float)info->src.box.height / (float)info->dst.box.height;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
index 4b69c3bd504..414d326eeed 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
@@ -299,7 +299,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s)
       res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
       res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
 
-      BCTX_REFN(nv50->bufctx_3d, TEXTURES, res, RD);
+      BCTX_REFN(nv50->bufctx_3d, 3D_TEXTURES, res, RD);
 
       BEGIN_NV04(push, NV50_3D(BIND_TIC(s)), 1);
       PUSH_DATA (push, (tic->id << 9) | (i << 1) | 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 6f60445d8d2..a11cdf847b1 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -230,7 +230,7 @@ nv50_upload_user_buffers(struct nv50_context *nv50,
       addrs[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer, base, size,
                                       &bo);
       if (addrs[b])
-         BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, NOUVEAU_BO_GART |
+         BCTX_REFN_bo(nv50->bufctx_3d, 3D_VERTEX_TMP, NOUVEAU_BO_GART |
                       NOUVEAU_BO_RD, bo);
    }
    nv50->base.vbo_dirty = true;
@@ -269,7 +269,7 @@ nv50_update_user_vbufs(struct nv50_context *nv50)
          address[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer,
                                            base, size, &bo);
          if (address[b])
-            BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, bo_flags, bo);
+            BCTX_REFN_bo(nv50->bufctx_3d, 3D_VERTEX_TMP, bo_flags, bo);
       }
 
       BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2);
@@ -286,7 +286,7 @@ static inline void
 nv50_release_user_vbufs(struct nv50_context *nv50)
 {
    if (nv50->vbo_user) {
-      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX_TMP);
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX_TMP);
       nouveau_scratch_done(&nv50->base);
    }
 }
@@ -394,7 +394,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
          struct nv04_resource *buf = nv04_resource(vb->buffer);
          if (!(refd & (1 << b))) {
             refd |= 1 << b;
-            BCTX_REFN(nv50->bufctx_3d, VERTEX, buf, RD);
+            BCTX_REFN(nv50->bufctx_3d, 3D_VERTEX, buf, RD);
          }
          address = buf->address + vb->buffer_offset + ve->pipe.src_offset;
          limit = buf->address + buf->base.width0 - 1;
@@ -779,9 +779,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    nv50->vbo_push_hint = /* the 64 is heuristic */
       !(info->indexed && ((nv50->vb_elt_limit + 64) < info->count));
 
-   if (nv50->vbo_user && !(nv50->dirty & (NV50_NEW_ARRAYS | NV50_NEW_VERTEX))) {
+   if (nv50->vbo_user && !(nv50->dirty_3d & (NV50_NEW_3D_ARRAYS | NV50_NEW_3D_VERTEX))) {
       if (!!nv50->vbo_fifo != nv50->vbo_push_hint)
-         nv50->dirty |= NV50_NEW_ARRAYS;
+         nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
       else
       if (!nv50->vbo_fifo)
          nv50_update_user_vbufs(nv50);
@@ -790,7 +790,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (unlikely(nv50->num_so_targets && !nv50->gmtyprog))
       nv50->state.prim_size = nv50_pipe_prim_to_prim_size[info->mode];
 
-   nv50_state_validate(nv50, ~0);
+   nv50_state_validate_3d(nv50, ~0);
 
    push->kick_notify = nv50_draw_vbo_kick_notify;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
index 68002305d72..7056258d1bf 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
@@ -58,8 +58,8 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define SUBC_M2MF(m) 5, (m)
 #define NV50_M2MF(n) SUBC_M2MF(NV50_M2MF_##n)
 
-#define SUBC_COMPUTE(m) 6, (m)
-#define NV50_COMPUTE(n) SUBC_COMPUTE(NV50_COMPUTE_##n)
+#define SUBC_CP(m) 6, (m)
+#define NV50_CP(n) SUBC_CP(NV50_COMPUTE_##n)
 
 
 static inline uint32_t
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index ffbb16f79de..6aaa7ce1aaf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -153,7 +153,7 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0)
 
       if (nvc0->constbuf[s][i].user) {
          struct nouveau_bo *bo = nvc0->screen->uniform_bo;
-         const unsigned base = s << 16;
+         const unsigned base = NVC0_CB_USR_INFO(s);
          const unsigned size = nvc0->constbuf[s][0].size;
          assert(i == 0); /* we really only want OpenGL uniforms here */
          assert(nvc0->constbuf[s][0].u.data);
@@ -207,8 +207,8 @@ nvc0_compute_validate_driverconst(struct nvc0_context *nvc0)
 
    BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
    PUSH_DATA (push, 1024);
-   PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (5 << 10));
-   PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (5 << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
    BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
    PUSH_DATA (push, (15 << 8) | 1);
 
@@ -219,15 +219,16 @@ static void
 nvc0_compute_validate_buffers(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
    const int s = 5;
    int i;
 
    BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
    PUSH_DATA (push, 1024);
-   PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10));
-   PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
    BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS);
-   PUSH_DATA (push, 512);
+   PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0));
 
    for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
       if (nvc0->buffers[s][i].buffer) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 54afe887ebd..31e1272aeed 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -98,6 +98,31 @@
 #define NVC0_BIND_M2MF          0
 #define NVC0_BIND_FENCE         1
 
+/* 6 user uniform buffers, at 64K each */
+#define NVC0_CB_USR_INFO(s)         (s << 16)
+#define NVC0_CB_USR_SIZE            (6 << 16)
+/* 6 driver constbuts, at 1K each */
+#define NVC0_CB_AUX_INFO(s)         NVC0_CB_USR_SIZE + (s << 10)
+#define NVC0_CB_AUX_SIZE            (6 << 10)
+/* XXX: Figure out what this UNK data is. */
+#define NVC0_CB_AUX_UNK_INFO        0x000
+#define NVC0_CB_AUX_UNK_SIZE        (8 * 4)
+/* 32 textures handles, at 1 32-bits integer each */
+#define NVC0_CB_AUX_TEX_INFO(i)     0x020 + (i) * 4
+#define NVC0_CB_AUX_TEX_SIZE        (32 * 4)
+/* 8 user clip planes, at 4 32-bits floats each */
+#define NVC0_CB_AUX_UCP_INFO        0x100
+#define NVC0_CB_AUX_UCP_SIZE        (PIPE_MAX_CLIP_PLANES * 4 * 4)
+/* 8 sets of 32-bits integer pairs sample offsets */
+#define NVC0_CB_AUX_SAMPLE_INFO     0x180 /* FP */
+#define NVC0_CB_AUX_SAMPLE_SIZE     (8 * 4 * 2)
+/* draw parameters (index bais, base instance, drawid) */
+#define NVC0_CB_AUX_DRAW_INFO       0x180 /* VP */
+/* 32 user buffers, at 4 32-bits integers each */
+#define NVC0_CB_AUX_BUF_INFO(i)     0x200 + (i) * 4 * 4
+#define NVC0_CB_AUX_BUF_SIZE        (NVC0_MAX_BUFFERS * 4 * 4)
+/* 4 32-bits floats for the vertex runout, put at the end */
+#define NVC0_CB_AUX_RUNOUT_INFO     NVC0_CB_USR_SIZE + NVC0_CB_AUX_SIZE
 
 struct nvc0_blitctx;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index bc884d6c08f..b7c6faf9cde 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -535,29 +535,27 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
 
    info->io.genUserClip = prog->vp.num_ucps;
    info->io.auxCBSlot = 15;
-   info->io.ucpBase = 256;
-   info->io.drawInfoBase = 256 + 128;
+   info->io.ucpBase = NVC0_CB_AUX_UCP_INFO;
+   info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO;
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
-         info->io.resInfoCBSlot = 0;
+         info->io.auxCBSlot = 0;
          info->io.texBindBase = NVE4_CP_INPUT_TEX(0);
          info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
          info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0);
       } else {
-         info->io.resInfoCBSlot = 15;
-         info->io.suInfoBase = 512;
+         info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
       }
       info->io.msInfoCBSlot = 0;
       info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS;
    } else {
       if (chipset >= NVISA_GK104_CHIPSET) {
-         info->io.texBindBase = 0x20;
+         info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
          info->io.suInfoBase = 0; /* TODO */
       }
-      info->io.resInfoCBSlot = 15;
-      info->io.sampleInfoBase = 256 + 128;
-      info->io.suInfoBase = 512;
+      info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
+      info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
       info->io.msInfoCBSlot = 15;
       info->io.msInfoBase = 0; /* TODO */
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 3c5b1da2063..553c001cd2b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -922,14 +922,14 @@ nvc0_screen_create(struct nouveau_device *dev)
       /* auxiliary constants (6 user clip planes, base instance id) */
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 1024);
-      PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (i << 10));
-      PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (i << 10));
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
       BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1);
       PUSH_DATA (push, (15 << 4) | 1);
       if (screen->eng3d->oclass >= NVE4_3D_CLASS) {
          unsigned j;
          BEGIN_1IC0(push, NVC0_3D(CB_POS), 9);
-         PUSH_DATA (push, 0);
+         PUSH_DATA (push, NVC0_CB_AUX_UNK_INFO);
          for (j = 0; j < 8; ++j)
             PUSH_DATA(push, j);
       } else {
@@ -943,8 +943,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    /* return { 0.0, 0.0, 0.0, 0.0 } for out-of-bounds vtxbuf access */
    BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
    PUSH_DATA (push, 256);
-   PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (6 << 10));
-   PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (6 << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO);
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO);
    BEGIN_1IC0(push, NVC0_3D(CB_POS), 5);
    PUSH_DATA (push, 0);
    PUSH_DATAf(push, 0.0f);
@@ -952,8 +952,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    PUSH_DATAf(push, 0.0f);
    PUSH_DATAf(push, 0.0f);
    BEGIN_NVC0(push, NVC0_3D(VERTEX_RUNOUT_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (6 << 10));
-   PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (6 << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO);
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO);
 
    if (screen->base.drm->version >= 0x01000101) {
       ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 8487abcf999..46b692df2e3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -66,7 +66,7 @@ struct nvc0_screen {
 
    struct nouveau_bo *text;
    struct nouveau_bo *parm;       /* for COMPUTE */
-   struct nouveau_bo *uniform_bo; /* for 3D */
+   struct nouveau_bo *uniform_bo;
    struct nouveau_bo *tls;
    struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */
    struct nouveau_bo *poly_cache;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 090a0395432..a100fc4c478 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -413,7 +413,7 @@ nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
 {
    unsigned s, i;
 
-   for (s = 0; s < 5; ++s)
+   for (s = 0; s < 6; ++s)
       for (i = 0; i < nvc0_context(pipe)->num_samplers[s]; ++i)
          if (nvc0_context(pipe)->samplers[s][i] == hwcso)
             nvc0_context(pipe)->samplers[s][i] = NULL;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index c0ed5c0043d..9c64482f2e2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -72,6 +72,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
 {
     struct nouveau_pushbuf *push = nvc0->base.pushbuf;
     struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
+    struct nvc0_screen *screen = nvc0->screen;
     unsigned i, ms;
     unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1;
     bool serialize = false;
@@ -183,10 +184,10 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
     ms = 1 << ms_mode;
     BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
     PUSH_DATA (push, 1024);
-    PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (4 << 10));
-    PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (4 << 10));
+    PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
+    PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
     BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * ms);
-    PUSH_DATA (push, 256 + 128);
+    PUSH_DATA (push, NVC0_CB_AUX_SAMPLE_INFO);
     for (i = 0; i < ms; i++) {
        float xy[2];
        nvc0->base.pipe.get_sample_position(&nvc0->base.pipe, ms, i, xy);
@@ -313,14 +314,14 @@ static inline void
 nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   struct nouveau_bo *bo = nvc0->screen->uniform_bo;
+   struct nvc0_screen *screen = nvc0->screen;
 
    BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
    PUSH_DATA (push, 1024);
-   PUSH_DATAh(push, bo->offset + (6 << 16) + (s << 10));
-   PUSH_DATA (push, bo->offset + (6 << 16) + (s << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
    BEGIN_1IC0(push, NVC0_3D(CB_POS), PIPE_MAX_CLIP_PLANES * 4 + 1);
-   PUSH_DATA (push, 256);
+   PUSH_DATA (push, NVC0_CB_AUX_UCP_INFO);
    PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4);
 }
 
@@ -424,7 +425,7 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0)
 
          if (nvc0->constbuf[s][i].user) {
             struct nouveau_bo *bo = nvc0->screen->uniform_bo;
-            const unsigned base = s << 16;
+            const unsigned base = NVC0_CB_USR_INFO(s);
             const unsigned size = nvc0->constbuf[s][0].size;
             assert(i == 0); /* we really only want OpenGL uniforms here */
             assert(nvc0->constbuf[s][0].u.data);
@@ -478,15 +479,16 @@ static void
 nvc0_validate_buffers(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
    int i, s;
 
    for (s = 0; s < 5; s++) {
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 1024);
-      PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10));
-      PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10));
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
       BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS);
-      PUSH_DATA (push, 512);
+      PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0));
       for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
          if (nvc0->buffers[s][i].buffer) {
             struct nv04_resource *res =
@@ -550,8 +552,8 @@ nvc0_validate_driverconst(struct nvc0_context *nvc0)
    for (i = 0; i < 5; ++i) {
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 1024);
-      PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (i << 10));
-      PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (i << 10));
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
       BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1);
       PUSH_DATA (push, (15 << 4) | 1);
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 53332400a4f..ce6a6dce39c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -707,21 +707,20 @@ void
 nve4_set_tex_handles(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   uint64_t address;
+   struct nvc0_screen *screen = nvc0->screen;
    unsigned s;
 
    if (nvc0->screen->base.class_3d < NVE4_3D_CLASS)
       return;
-   address = nvc0->screen->uniform_bo->offset + (6 << 16);
 
-   for (s = 0; s < 5; ++s, address += (1 << 10)) {
+   for (s = 0; s < 5; ++s) {
       uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];
       if (!dirty)
          continue;
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 1024);
-      PUSH_DATAh(push, address);
-      PUSH_DATA (push, address);
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
       do {
          int i = ffs(dirty) - 1;
          dirty &= ~(1 << i);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index e0e0ad2a0f7..4d9cd5752b5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -820,6 +820,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    struct nv04_resource *buf_count = nv04_resource(info->indirect_params);
    unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
    uint32_t offset = buf->offset + info->indirect_offset;
+   struct nvc0_screen *screen = nvc0->screen;
 
    PUSH_SPACE(push, 7);
 
@@ -833,10 +834,10 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    /* Queue things up to let the macros write params to the driver constbuf */
    BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
    PUSH_DATA (push, 512);
-   PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9));
-   PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
    BEGIN_NVC0(push, NVC0_3D(CB_POS), 1);
-   PUSH_DATA (push, 256 + 128);
+   PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
 
    if (info->indexed) {
       assert(nvc0->idxbuf.buffer);
@@ -934,6 +935,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
    int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
@@ -975,11 +977,11 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_SPACE(push, 9);
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 512);
-      PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9));
-      PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9));
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
       if (!info->indirect) {
          BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
-         PUSH_DATA (push, 256 + 128);
+         PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
          PUSH_DATA (push, info->index_bias);
          PUSH_DATA (push, info->start_instance);
          PUSH_DATA (push, info->drawid);
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index 6fa892089ec..d100a9df55b 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -385,7 +385,7 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     if (!r300->ctx)
         goto fail;
 
-    r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300, NULL);
+    r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300);
     if (r300->cs == NULL)
         goto fail;
 
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
index 7a75b43a53e..63182cba2b2 100644
--- a/src/gallium/drivers/r300/r300_flush.c
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -53,7 +53,7 @@ static void r300_flush_and_cleanup(struct r300_context *r300, unsigned flags,
     }
 
     r300->flush_counter++;
-    r300->rws->cs_flush(r300->cs, flags, fence, 0);
+    r300->rws->cs_flush(r300->cs, flags, fence);
     r300->dirty_hw = 0;
 
     /* New kitchen sink, baby. */
@@ -88,11 +88,11 @@ void r300_flush(struct pipe_context *pipe,
              * and we cannot emit an empty CS. Let's write to some reg. */
             CS_LOCALS(r300);
             OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0);
-            r300->rws->cs_flush(r300->cs, flags, fence, 0);
+            r300->rws->cs_flush(r300->cs, flags, fence);
         } else {
             /* Even if hw is not dirty, we should at least reset the CS in case
              * the space checking failed for the first draw operation. */
-            r300->rws->cs_flush(r300->cs, flags, NULL, 0);
+            r300->rws->cs_flush(r300->cs, flags, NULL);
         }
     }
 
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 57456c6d867..709345a492e 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -981,8 +981,8 @@ boolean r300_resource_get_handle(struct pipe_screen* screen,
         return FALSE;
     }
 
-    return rws->buffer_get_handle(tex->buf,
-                                  tex->tex.stride_in_bytes[0], whandle);
+    return rws->buffer_get_handle(tex->buf, tex->tex.stride_in_bytes[0],
+                                  0, 0, whandle);
 }
 
 static const struct u_resource_vtbl r300_texture_vtbl =
@@ -1116,7 +1116,7 @@ struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen,
         return NULL;
     }
 
-    buffer = rws->buffer_from_handle(rws, whandle, &stride);
+    buffer = rws->buffer_from_handle(rws, whandle, &stride, NULL);
     if (!buffer)
         return NULL;
 
diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am
index 8317da727a2..f3bb03e54be 100644
--- a/src/gallium/drivers/r600/Makefile.am
+++ b/src/gallium/drivers/r600/Makefile.am
@@ -21,14 +21,6 @@ AM_CFLAGS += \
 	$(LLVM_CFLAGS) \
 	-I$(top_srcdir)/src/gallium/drivers/radeon/
 
-libr600_la_SOURCES += \
-	$(LLVM_C_SOURCES)
-
-endif
-
-if USE_R600_LLVM_COMPILER
-AM_CFLAGS += \
-	-DR600_USE_LLVM
 endif
 
 if HAVE_GALLIUM_COMPUTE
diff --git a/src/gallium/drivers/r600/Makefile.sources b/src/gallium/drivers/r600/Makefile.sources
index 024dea3a002..8bf8083bbab 100644
--- a/src/gallium/drivers/r600/Makefile.sources
+++ b/src/gallium/drivers/r600/Makefile.sources
@@ -64,7 +64,3 @@ CXX_SOURCES = \
 	sb/sb_shader.h \
 	sb/sb_ssa_builder.cpp \
 	sb/sb_valtable.cpp
-
-LLVM_C_SOURCES = \
-	r600_llvm.c \
-	r600_llvm.h
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 2a1b2519ec7..f4b669000dc 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -192,6 +192,69 @@ static const struct u_resource_vtbl r600_global_buffer_vtbl =
 	r600_compute_global_transfer_inline_write /* transfer_inline_write */
 };
 
+/* We need to define these R600 registers here, because we can't include
+ * evergreend.h and r600d.h.
+ */
+#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
+#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
+
+#ifdef HAVE_OPENCL
+
+static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
+					   struct r600_bytecode *bc,
+					   uint64_t symbol_offset,
+					   boolean *use_kill)
+{
+       unsigned i;
+       const unsigned char *config =
+               radeon_shader_binary_config_start(binary, symbol_offset);
+
+       for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
+               unsigned reg =
+                       util_le32_to_cpu(*(uint32_t*)(config + i));
+               unsigned value =
+                       util_le32_to_cpu(*(uint32_t*)(config + i + 4));
+               switch (reg) {
+               /* R600 / R700 */
+               case R_028850_SQ_PGM_RESOURCES_PS:
+               case R_028868_SQ_PGM_RESOURCES_VS:
+               /* Evergreen / Northern Islands */
+               case R_028844_SQ_PGM_RESOURCES_PS:
+               case R_028860_SQ_PGM_RESOURCES_VS:
+               case R_0288D4_SQ_PGM_RESOURCES_LS:
+                       bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
+                       bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
+                       break;
+               case R_02880C_DB_SHADER_CONTROL:
+                       *use_kill = G_02880C_KILL_ENABLE(value);
+                       break;
+               case R_0288E8_SQ_LDS_ALLOC:
+                       bc->nlds_dw = value;
+                       break;
+               }
+       }
+}
+
+static unsigned r600_create_shader(struct r600_bytecode *bc,
+				   const struct radeon_shader_binary *binary,
+				   boolean *use_kill)
+
+{
+	assert(binary->code_size % 4 == 0);
+	bc->bytecode = CALLOC(1, binary->code_size);
+	memcpy(bc->bytecode, binary->code, binary->code_size);
+	bc->ndw = binary->code_size / 4;
+
+	r600_shader_binary_read_config(binary, bc, 0, use_kill);
+	return 0;
+}
+
+#endif
+
+static void r600_destroy_shader(struct r600_bytecode *bc)
+{
+	FREE(bc->bytecode);
+}
 
 void *evergreen_create_compute_state(
 	struct pipe_context *ctx_,
@@ -236,13 +299,11 @@ void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
 	if (!shader)
 		return;
 
-#ifdef HAVE_OPENCL
 	radeon_shader_binary_clean(&shader->binary);
 	r600_destroy_shader(&shader->bc);
 
 	/* TODO destroy shader->code_bo, shader->const_bo
 	 * we'll need something like r600_buffer_free */
-#endif
 	FREE(shader);
 }
 
diff --git a/src/gallium/drivers/r600/evergreen_compute_internal.h b/src/gallium/drivers/r600/evergreen_compute_internal.h
index c8998d00f5a..e6ff7609aea 100644
--- a/src/gallium/drivers/r600/evergreen_compute_internal.h
+++ b/src/gallium/drivers/r600/evergreen_compute_internal.h
@@ -26,6 +26,10 @@
 #define EVERGREEN_COMPUTE_INTERNAL_H
 
 #include "r600_asm.h"
+#ifdef HAVE_OPENCL
+#include "radeon/radeon_llvm.h"
+#include <llvm-c/Core.h>
+#endif
 
 struct r600_pipe_compute {
 	struct r600_context *ctx;
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 4951297df42..7a6f957945b 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -57,18 +57,11 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 
 		/* The number of dwords all the dirty states would take. */
 		mask = ctx->dirty_atoms;
-		while (mask != 0) {
+		while (mask != 0)
 			num_dw += ctx->atoms[u_bit_scan64(&mask)]->num_dw;
-			if (ctx->screen->b.trace_bo) {
-				num_dw += R600_TRACE_CS_DWORDS;
-			}
-		}
 
 		/* The upper-bound of how much space a draw command would take. */
 		num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS;
-		if (ctx->screen->b.trace_bo) {
-			num_dw += R600_TRACE_CS_DWORDS;
-		}
 	}
 
 	/* Count in queries_suspend. */
@@ -273,7 +266,7 @@ void r600_context_gfx_flush(void *context, unsigned flags,
 	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
 
 	/* Flush the CS. */
-	ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++);
+	ctx->b.ws->cs_flush(cs, flags, fence);
 
 	r600_begin_new_cs(ctx);
 }
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
deleted file mode 100644
index 7eab29c6eb4..00000000000
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ /dev/null
@@ -1,943 +0,0 @@
-#include "r600_llvm.h"
-
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_intr.h"
-#include "gallivm/lp_bld_gather.h"
-#include "tgsi/tgsi_parse.h"
-#include "util/list.h"
-#include "util/u_memory.h"
-
-#include "evergreend.h"
-#include "r600_asm.h"
-#include "r600_sq.h"
-#include "r600_opcodes.h"
-#include "r600_shader.h"
-#include "r600_pipe.h"
-#include "radeon_llvm.h"
-#include "radeon_llvm_emit.h"
-#include "radeon_elf_util.h"
-
-#include <stdio.h>
-
-#if defined R600_USE_LLVM || defined HAVE_OPENCL
-
-#define CONSTANT_BUFFER_0_ADDR_SPACE 8
-#define CONSTANT_BUFFER_1_ADDR_SPACE (CONSTANT_BUFFER_0_ADDR_SPACE + R600_BUFFER_INFO_CONST_BUFFER)
-#define LLVM_R600_BUFFER_INFO_CONST_BUFFER \
-	(CONSTANT_BUFFER_0_ADDR_SPACE + R600_BUFFER_INFO_CONST_BUFFER)
-
-static LLVMValueRef llvm_load_const_buffer(
-	struct lp_build_tgsi_context * bld_base,
-	LLVMValueRef OffsetValue,
-	unsigned ConstantAddressSpace)
-{
-	LLVMValueRef offset[2] = {
-		LLVMConstInt(LLVMInt64TypeInContext(bld_base->base.gallivm->context), 0, false),
-		OffsetValue
-	};
-
-	LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base->base.elem_type, 4), 1024),
-							ConstantAddressSpace);
-	LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base->base.gallivm->builder, lp_build_const_int32(bld_base->base.gallivm, 0), const_ptr_type, "");
-	LLVMValueRef ptr = LLVMBuildGEP(bld_base->base.gallivm->builder, const_ptr, offset, 2, "");
-	return LLVMBuildLoad(bld_base->base.gallivm->builder, ptr, "");
-}
-
-static LLVMValueRef llvm_fetch_const(
-	struct lp_build_tgsi_context * bld_base,
-	const struct tgsi_full_src_register *reg,
-	enum tgsi_opcode_type type,
-	unsigned swizzle)
-{
-	LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, reg->Register.Index);
-	if (reg->Register.Indirect) {
-		struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-		LLVMValueRef index = LLVMBuildLoad(bld_base->base.gallivm->builder, bld->addr[reg->Indirect.Index][reg->Indirect.Swizzle], "");
-		offset = LLVMBuildAdd(bld_base->base.gallivm->builder, offset, index, "");
-	}
-	unsigned ConstantAddressSpace = CONSTANT_BUFFER_0_ADDR_SPACE ;
-	if (reg->Register.Dimension) {
-		ConstantAddressSpace += reg->Dimension.Index;
-	}
-	LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, ConstantAddressSpace);
-	LLVMValueRef cval = LLVMBuildExtractElement(bld_base->base.gallivm->builder, cvecval, lp_build_const_int32(bld_base->base.gallivm, swizzle), "");
-	return bitcast(bld_base, type, cval);
-}
-
-static void llvm_load_system_value(
-		struct radeon_llvm_context * ctx,
-		unsigned index,
-		const struct tgsi_full_declaration *decl)
-{
-	unsigned chan;
-
-	switch (decl->Semantic.Name) {
-	case TGSI_SEMANTIC_INSTANCEID: chan = 3; break;
-	case TGSI_SEMANTIC_VERTEXID: chan = 0; break;
-	default: assert(!"unknown system value");
-	}
-
-	ctx->system_values[index] = LLVMBuildExtractElement(ctx->gallivm.builder,
-		LLVMGetParam(ctx->main_fn, 0), lp_build_const_int32(&(ctx->gallivm), chan),
-		"");
-}
-
-static LLVMValueRef
-llvm_load_input_vector(
-	struct radeon_llvm_context * ctx, unsigned location, unsigned ijregs,
-	boolean interp)
-{
-		LLVMTypeRef VecType;
-		LLVMValueRef Args[3] = {
-			lp_build_const_int32(&(ctx->gallivm), location)
-		};
-		unsigned ArgCount = 1;
-		if (interp) {
-			VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 2);
-			LLVMValueRef IJIndex = LLVMGetParam(ctx->main_fn, ijregs / 2);
-			Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex,
-				lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2)), "");
-			Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex,
-				lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2) + 1), "");
-			LLVMValueRef HalfVec[2] = {
-				lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy",
-					VecType, Args, ArgCount, LLVMReadNoneAttribute),
-				lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw",
-					VecType, Args, ArgCount, LLVMReadNoneAttribute)
-			};
-			LLVMValueRef MaskInputs[4] = {
-				lp_build_const_int32(&(ctx->gallivm), 0),
-				lp_build_const_int32(&(ctx->gallivm), 1),
-				lp_build_const_int32(&(ctx->gallivm), 2),
-				lp_build_const_int32(&(ctx->gallivm), 3)
-			};
-			LLVMValueRef Mask = LLVMConstVector(MaskInputs, 4);
-			return LLVMBuildShuffleVector(ctx->gallivm.builder, HalfVec[0], HalfVec[1],
-				Mask, "");
-		} else {
-			VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 4);
-			return lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const",
-				VecType, Args, ArgCount, LLVMReadNoneAttribute);
-		}
-}
-
-static LLVMValueRef
-llvm_face_select_helper(
-	struct radeon_llvm_context * ctx,
-	LLVMValueRef face, LLVMValueRef front_color, LLVMValueRef back_color)
-{
-	const struct lp_build_context * bb = &ctx->soa.bld_base.base;
-	LLVMValueRef is_front = LLVMBuildFCmp(
-		bb->gallivm->builder, LLVMRealUGT, face,
-		lp_build_const_float(bb->gallivm, 0.0f),	"");
-	return LLVMBuildSelect(bb->gallivm->builder, is_front,
-		front_color, back_color, "");
-}
-
-static void llvm_load_input(
-	struct radeon_llvm_context * ctx,
-	unsigned input_index,
-	const struct tgsi_full_declaration *decl)
-{
-	const struct r600_shader_io * input = &ctx->r600_inputs[input_index];
-	unsigned chan;
-	int two_side = (ctx->two_side && input->name == TGSI_SEMANTIC_COLOR);
-	LLVMValueRef v;
-	boolean require_interp_intrinsic = ctx->chip_class >= EVERGREEN &&
-		ctx->type == TGSI_PROCESSOR_FRAGMENT;
-
-	if (require_interp_intrinsic && input->spi_sid) {
-		v = llvm_load_input_vector(ctx, input->lds_pos, input->ij_index,
-			(input->interpolate > 0));
-	} else
-		v = LLVMGetParam(ctx->main_fn, input->gpr);
-
-	if (two_side) {
-		struct r600_shader_io * back_input =
-			&ctx->r600_inputs[input->back_color_input];
-		LLVMValueRef v2;
-		LLVMValueRef face = LLVMGetParam(ctx->main_fn, ctx->face_gpr);
-		face = LLVMBuildExtractElement(ctx->gallivm.builder, face,
-			lp_build_const_int32(&(ctx->gallivm), 0), "");
-
-		if (require_interp_intrinsic && back_input->spi_sid)
-			v2 = llvm_load_input_vector(ctx, back_input->lds_pos,
-				back_input->ij_index, (back_input->interpolate > 0));
-		else
-			v2 = LLVMGetParam(ctx->main_fn, back_input->gpr);
-		v = llvm_face_select_helper(ctx, face, v, v2);
-	}
-
-	for (chan = 0; chan < 4; chan++) {
-		unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
-
-		ctx->inputs[soa_index] = LLVMBuildExtractElement(ctx->gallivm.builder, v,
-			lp_build_const_int32(&(ctx->gallivm), chan), "");
-
-		if (input->name == TGSI_SEMANTIC_POSITION &&
-				ctx->type == TGSI_PROCESSOR_FRAGMENT && chan == 3) {
-		/* RCP for fragcoord.w */
-		ctx->inputs[soa_index] = LLVMBuildFDiv(ctx->gallivm.builder,
-				lp_build_const_float(&(ctx->gallivm), 1.0f),
-				ctx->inputs[soa_index], "");
-	}
-	}
-}
-
-static void llvm_emit_prologue(struct lp_build_tgsi_context * bld_base)
-{
-	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-	radeon_llvm_shader_type(ctx->main_fn, ctx->type);
-
-}
-
-static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
-{
-	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-	struct lp_build_context * base = &bld_base->base;
-	struct pipe_stream_output_info * so = ctx->stream_outputs;
-	unsigned i;
-	unsigned next_pos = 60;
-	unsigned next_param = 0;
-
-	unsigned color_count = 0;
-	boolean has_color = false;
-
-	if (ctx->type == TGSI_PROCESSOR_VERTEX && so->num_outputs) {
-		for (i = 0; i < so->num_outputs; i++) {
-			unsigned register_index = so->output[i].register_index;
-			unsigned start_component = so->output[i].start_component;
-			unsigned num_components = so->output[i].num_components;
-			unsigned dst_offset = so->output[i].dst_offset;
-			unsigned chan;
-			LLVMValueRef elements[4];
-			if (dst_offset < start_component) {
-				for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-					elements[chan] = LLVMBuildLoad(base->gallivm->builder,
-						ctx->soa.outputs[register_index][(chan + start_component) % TGSI_NUM_CHANNELS], "");
-				}
-				start_component = 0;
-			} else {
-				for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-					elements[chan] = LLVMBuildLoad(base->gallivm->builder,
-						ctx->soa.outputs[register_index][chan], "");
-				}
-			}
-			LLVMValueRef output = lp_build_gather_values(base->gallivm, elements, 4);
-			LLVMValueRef args[4];
-			args[0] = output;
-			args[1] = lp_build_const_int32(base->gallivm, dst_offset - start_component);
-			args[2] = lp_build_const_int32(base->gallivm, so->output[i].output_buffer);
-			args[3] = lp_build_const_int32(base->gallivm, ((1 << num_components) - 1) << start_component);
-			lp_build_intrinsic(base->gallivm->builder, "llvm.R600.store.stream.output",
-				LLVMVoidTypeInContext(base->gallivm->context), args, 4, 0);
-		}
-	}
-
-	/* Add the necessary export instructions */
-	for (i = 0; i < ctx->output_reg_count; i++) {
-		unsigned chan;
-		LLVMValueRef elements[4];
-		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-			elements[chan] = LLVMBuildLoad(base->gallivm->builder,
-				ctx->soa.outputs[i][chan], "");
-		}
-		if (ctx->alpha_to_one && ctx->type == TGSI_PROCESSOR_FRAGMENT && ctx->r600_outputs[i].name == TGSI_SEMANTIC_COLOR)
-			elements[3] = lp_build_const_float(base->gallivm, 1.0f);
-		LLVMValueRef output = lp_build_gather_values(base->gallivm, elements, 4);
-
-		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
-			switch (ctx->r600_outputs[i].name) {
-			case TGSI_SEMANTIC_POSITION:
-			case TGSI_SEMANTIC_PSIZE: {
-				LLVMValueRef args[3];
-				args[0] = output;
-				args[1] = lp_build_const_int32(base->gallivm, next_pos++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				break;
-			}
-			case TGSI_SEMANTIC_CLIPVERTEX: {
-				LLVMValueRef args[3];
-				unsigned reg_index;
-				LLVMValueRef adjusted_elements[4];
-				for (reg_index = 0; reg_index < 2; reg_index ++) {
-					for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-						LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, reg_index * 4 + chan);
-						LLVMValueRef base_vector = llvm_load_const_buffer(bld_base, offset, CONSTANT_BUFFER_1_ADDR_SPACE);
-						args[0] = output;
-						args[1] = base_vector;
-						adjusted_elements[chan] = lp_build_intrinsic(base->gallivm->builder,
-							"llvm.AMDGPU.dp4", bld_base->base.elem_type,
-							args, 2, LLVMReadNoneAttribute);
-					}
-					args[0] = lp_build_gather_values(base->gallivm,
-						adjusted_elements, 4);
-					args[1] = lp_build_const_int32(base->gallivm, next_pos++);
-					args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-					lp_build_intrinsic(
-						base->gallivm->builder,
-						"llvm.R600.store.swizzle",
-						LLVMVoidTypeInContext(base->gallivm->context),
-						args, 3, 0);
-				}
-				break;
-			}
-			case TGSI_SEMANTIC_CLIPDIST : {
-				LLVMValueRef args[3];
-				args[0] = output;
-				args[1] = lp_build_const_int32(base->gallivm, next_pos++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				args[1] = lp_build_const_int32(base->gallivm, next_param++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				break;
-			}
-			case TGSI_SEMANTIC_FOG: {
-				elements[0] = LLVMBuildLoad(base->gallivm->builder,
-					ctx->soa.outputs[i][0], "");
-				elements[1] = elements[2] = lp_build_const_float(base->gallivm, 0.0f);
-				elements[3] = lp_build_const_float(base->gallivm, 1.0f);
-
-				LLVMValueRef args[3];
-				args[0] = lp_build_gather_values(base->gallivm, elements, 4);
-				args[1] = lp_build_const_int32(base->gallivm, next_param++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				break;
-			}
-			default: {
-				LLVMValueRef args[3];
-				args[0] = output;
-				args[1] = lp_build_const_int32(base->gallivm, next_param++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				break;
-			}
-			}
-		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-			switch (ctx->r600_outputs[i].name) {
-			case TGSI_SEMANTIC_COLOR:
-				has_color = true;
-				if ( color_count < ctx->color_buffer_count) {
-					LLVMValueRef args[3];
-					args[0] = output;
-					if (ctx->fs_color_all) {
-						for (unsigned j = 0; j < ctx->color_buffer_count; j++) {
-							args[1] = lp_build_const_int32(base->gallivm, j);
-							args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL);
-							lp_build_intrinsic(
-								base->gallivm->builder,
-								"llvm.R600.store.swizzle",
-								LLVMVoidTypeInContext(base->gallivm->context),
-								args, 3, 0);
-						}
-					} else {
-						args[1] = lp_build_const_int32(base->gallivm, color_count++);
-						args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL);
-						lp_build_intrinsic(
-							base->gallivm->builder,
-							"llvm.R600.store.swizzle",
-							LLVMVoidTypeInContext(base->gallivm->context),
-							args, 3, 0);
-					}
-				}
-				break;
-			case TGSI_SEMANTIC_POSITION:
-				lp_build_intrinsic_unary(
-					base->gallivm->builder,
-					"llvm.R600.store.pixel.depth",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					LLVMBuildLoad(base->gallivm->builder, ctx->soa.outputs[i][2], ""));
-				break;
-			case TGSI_SEMANTIC_STENCIL:
-				lp_build_intrinsic_unary(
-					base->gallivm->builder,
-					"llvm.R600.store.pixel.stencil",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					LLVMBuildLoad(base->gallivm->builder, ctx->soa.outputs[i][1], ""));
-				break;
-			}
-		}
-	}
-	// Add dummy exports
-	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
-		if (!next_param) {
-			lp_build_intrinsic_unary(base->gallivm->builder, "llvm.R600.store.dummy",
-				LLVMVoidTypeInContext(base->gallivm->context),
-				lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM));
-		}
-		if (!(next_pos-60)) {
-			lp_build_intrinsic_unary(base->gallivm->builder, "llvm.R600.store.dummy",
-				LLVMVoidTypeInContext(base->gallivm->context),
-				lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS));
-		}
-	}
-	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-		if (!has_color) {
-			lp_build_intrinsic_unary(base->gallivm->builder, "llvm.R600.store.dummy",
-				LLVMVoidTypeInContext(base->gallivm->context),
-				lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL));
-		}
-	}
-
-}
-
-static void llvm_emit_tex(
-	const struct lp_build_tgsi_action * action,
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	struct gallivm_state * gallivm = bld_base->base.gallivm;
-	LLVMValueRef args[7];
-	unsigned c, sampler_src;
-	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-
-	if (emit_data->inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
-		switch (emit_data->inst->Instruction.Opcode) {
-		case TGSI_OPCODE_TXQ: {
-			struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-			ctx->uses_tex_buffers = true;
-			bool isEgPlus = (ctx->chip_class >= EVERGREEN);
-			LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm,
-				isEgPlus ? 0 : 1);
-			LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset,
-				LLVM_R600_BUFFER_INFO_CONST_BUFFER);
-			if (!isEgPlus) {
-				LLVMValueRef maskval[4] = {
-					lp_build_const_int32(gallivm, 1),
-					lp_build_const_int32(gallivm, 2),
-					lp_build_const_int32(gallivm, 3),
-					lp_build_const_int32(gallivm, 0),
-				};
-				LLVMValueRef mask = LLVMConstVector(maskval, 4);
-				cvecval = LLVMBuildShuffleVector(gallivm->builder, cvecval, cvecval,
-					mask, "");
-			}
-			emit_data->output[0] = cvecval;
-			return;
-		}
-		case TGSI_OPCODE_TXF: {
-			args[0] = LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], lp_build_const_int32(gallivm, 0), "");
-			args[1] = lp_build_const_int32(gallivm, R600_MAX_CONST_BUFFERS);
-			emit_data->output[0] = lp_build_intrinsic(gallivm->builder,
-							"llvm.R600.load.texbuf",
-							emit_data->dst_type, args, 2, LLVMReadNoneAttribute);
-			if (ctx->chip_class >= EVERGREEN)
-				return;
-			ctx->uses_tex_buffers = true;
-			LLVMDumpValue(emit_data->output[0]);
-			emit_data->output[0] = LLVMBuildBitCast(gallivm->builder,
-				emit_data->output[0], LLVMVectorType(bld_base->base.int_elem_type, 4),
-				"");
-			LLVMValueRef Mask = llvm_load_const_buffer(bld_base,
-				lp_build_const_int32(gallivm, 0),
-				LLVM_R600_BUFFER_INFO_CONST_BUFFER);
-			Mask = LLVMBuildBitCast(gallivm->builder, Mask,
-				LLVMVectorType(bld_base->base.int_elem_type, 4), "");
-			emit_data->output[0] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_AND,
-				emit_data->output[0],
-				Mask);
-			LLVMValueRef WComponent = LLVMBuildExtractElement(gallivm->builder,
-				emit_data->output[0], lp_build_const_int32(gallivm, 3), "");
-			Mask = llvm_load_const_buffer(bld_base, lp_build_const_int32(gallivm, 1),
-				LLVM_R600_BUFFER_INFO_CONST_BUFFER);
-			Mask = LLVMBuildExtractElement(gallivm->builder, Mask,
-				lp_build_const_int32(gallivm, 0), "");
-			Mask = LLVMBuildBitCast(gallivm->builder, Mask,
-				bld_base->base.int_elem_type, "");
-			WComponent = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_OR,
-				WComponent, Mask);
-			emit_data->output[0] = LLVMBuildInsertElement(gallivm->builder,
-				emit_data->output[0], WComponent, lp_build_const_int32(gallivm, 3), "");
-			emit_data->output[0] = LLVMBuildBitCast(gallivm->builder,
-				emit_data->output[0], LLVMVectorType(bld_base->base.elem_type, 4), "");
-		}
-			return;
-		default:
-			break;
-		}
-	}
-
-	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TEX ||
-		emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
-		LLVMValueRef Vector[4] = {
-			LLVMBuildExtractElement(gallivm->builder, emit_data->args[0],
-				lp_build_const_int32(gallivm, 0), ""),
-			LLVMBuildExtractElement(gallivm->builder, emit_data->args[0],
-				lp_build_const_int32(gallivm, 1), ""),
-			LLVMBuildExtractElement(gallivm->builder, emit_data->args[0],
-				lp_build_const_int32(gallivm, 2), ""),
-			LLVMBuildExtractElement(gallivm->builder, emit_data->args[0],
-				lp_build_const_int32(gallivm, 3), ""),
-		};
-		switch (emit_data->inst->Texture.Texture) {
-		case TGSI_TEXTURE_2D:
-		case TGSI_TEXTURE_RECT:
-			Vector[2] = Vector[3] = LLVMGetUndef(bld_base->base.elem_type);
-			break;
-		case TGSI_TEXTURE_1D:
-			Vector[1] = Vector[2] = Vector[3] = LLVMGetUndef(bld_base->base.elem_type);
-			break;
-		default:
-			break;
-		}
-		args[0] = lp_build_gather_values(gallivm, Vector, 4);
-	} else {
-		args[0] = emit_data->args[0];
-	}
-
-	assert(emit_data->arg_count + 2 <= Elements(args));
-
-	for (c = 1; c < emit_data->arg_count; ++c)
-		args[c] = emit_data->args[c];
-
-	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
-		args[1] = LLVMBuildShl(gallivm->builder, args[1], lp_build_const_int32(gallivm, 1), "");
-		args[2] = LLVMBuildShl(gallivm->builder, args[2], lp_build_const_int32(gallivm, 1), "");
-		args[3] = LLVMBuildShl(gallivm->builder, args[3], lp_build_const_int32(gallivm, 1), "");
-	}
-
-	sampler_src = emit_data->inst->Instruction.NumSrcRegs-1;
-
-	args[c++] = lp_build_const_int32(gallivm,
-					emit_data->inst->Src[sampler_src].Register.Index + R600_MAX_CONST_BUFFERS);
-	args[c++] = lp_build_const_int32(gallivm,
-					emit_data->inst->Src[sampler_src].Register.Index);
-	args[c++] = lp_build_const_int32(gallivm,
-					emit_data->inst->Texture.Texture);
-
-	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
-		(emit_data->inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
-		emit_data->inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
-
-		switch (emit_data->inst->Texture.Texture) {
-		case TGSI_TEXTURE_2D_MSAA:
-			args[6] = lp_build_const_int32(gallivm, TGSI_TEXTURE_2D);
-			break;
-		case TGSI_TEXTURE_2D_ARRAY_MSAA:
-			args[6] = lp_build_const_int32(gallivm, TGSI_TEXTURE_2D_ARRAY);
-			break;
-		default:
-			break;
-		}
-
-		if (ctx->has_compressed_msaa_texturing) {
-			LLVMValueRef ldptr_args[10] = {
-				args[0], // Coord
-				args[1], // Offset X
-				args[2], // Offset Y
-				args[3], // Offset Z
-				args[4],
-				args[5],
-				lp_build_const_int32(gallivm, 1),
-				lp_build_const_int32(gallivm, 1),
-				lp_build_const_int32(gallivm, 1),
-				lp_build_const_int32(gallivm, 1)
-			};
-			LLVMValueRef ptr = lp_build_intrinsic(gallivm->builder,
-				"llvm.R600.ldptr",
-				emit_data->dst_type, ldptr_args, 10, LLVMReadNoneAttribute);
-			LLVMValueRef Tmp = LLVMBuildExtractElement(gallivm->builder, args[0],
-				lp_build_const_int32(gallivm, 3), "");
-			Tmp = LLVMBuildMul(gallivm->builder, Tmp,
-				lp_build_const_int32(gallivm, 4), "");
-			LLVMValueRef ResX = LLVMBuildExtractElement(gallivm->builder, ptr,
-				lp_build_const_int32(gallivm, 0), "");
-			ResX = LLVMBuildBitCast(gallivm->builder, ResX,
-				bld_base->base.int_elem_type, "");
-			Tmp = LLVMBuildLShr(gallivm->builder, ResX, Tmp, "");
-			Tmp = LLVMBuildAnd(gallivm->builder, Tmp,
-				lp_build_const_int32(gallivm, 0xF), "");
-			args[0] = LLVMBuildInsertElement(gallivm->builder, args[0], Tmp,
-				lp_build_const_int32(gallivm, 3), "");
-			args[c++] = lp_build_const_int32(gallivm,
-				emit_data->inst->Texture.Texture);
-		}
-	}
-
-	emit_data->output[0] = lp_build_intrinsic(gallivm->builder,
-					action->intr_name,
-					emit_data->dst_type, args, c, LLVMReadNoneAttribute);
-
-	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
-		((emit_data->inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-		emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
-		if (emit_data->inst->Dst[0].Register.WriteMask & 4) {
-			LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, 0);
-			LLVMValueRef ZLayer = LLVMBuildExtractElement(gallivm->builder,
-				llvm_load_const_buffer(bld_base, offset, LLVM_R600_BUFFER_INFO_CONST_BUFFER),
-				lp_build_const_int32(gallivm, 0), "");
-
-			emit_data->output[0] = LLVMBuildInsertElement(gallivm->builder, emit_data->output[0], ZLayer, lp_build_const_int32(gallivm, 2), "");
-			struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-			ctx->has_txq_cube_array_z_comp = true;
-		}
-}
-
-static void emit_cndlt(
-		const struct lp_build_tgsi_action * action,
-		struct lp_build_tgsi_context * bld_base,
-		struct lp_build_emit_data * emit_data)
-{
-	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-	LLVMValueRef float_zero = lp_build_const_float(
-		bld_base->base.gallivm, 0.0f);
-	LLVMValueRef cmp = LLVMBuildFCmp(
-		builder, LLVMRealULT, emit_data->args[0], float_zero, "");
-	emit_data->output[emit_data->chan] = LLVMBuildSelect(builder,
-		cmp, emit_data->args[1], emit_data->args[2], "");
-}
-
-static void dp_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	struct lp_build_context * base = &bld_base->base;
-	unsigned chan;
-	LLVMValueRef elements[2][4];
-	unsigned opcode = emit_data->inst->Instruction.Opcode;
-	unsigned dp_components = (opcode == TGSI_OPCODE_DP2 ? 2 :
-					(opcode == TGSI_OPCODE_DP3 ? 3 : 4));
-	for (chan = 0 ; chan < dp_components; chan++) {
-		elements[0][chan] = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 0, chan);
-		elements[1][chan] = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 1, chan);
-	}
-
-	for ( ; chan < 4; chan++) {
-		elements[0][chan] = base->zero;
-		elements[1][chan] = base->zero;
-	}
-
-	 /* Fix up for DPH */
-	if (opcode == TGSI_OPCODE_DPH) {
-		elements[0][TGSI_CHAN_W] = base->one;
-	}
-
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-							elements[0], 4);
-	emit_data->args[1] = lp_build_gather_values(bld_base->base.gallivm,
-							elements[1], 4);
-	emit_data->arg_count = 2;
-
-	emit_data->dst_type = base->elem_type;
-}
-
-static struct lp_build_tgsi_action dot_action = {
-	.fetch_args = dp_fetch_args,
-	.emit = build_tgsi_intrinsic_nomem,
-	.intr_name = "llvm.AMDGPU.dp4"
-};
-
-static void txd_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-
-	LLVMValueRef coords[4];
-	unsigned chan, src;
-	for (src = 0; src < 3; src++) {
-		for (chan = 0; chan < 4; chan++)
-			coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan);
-
-		emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm,
-				coords, 4);
-	}
-	emit_data->arg_count = 3;
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-}
-
-
-static void txp_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	LLVMValueRef src_w;
-	unsigned chan;
-	LLVMValueRef coords[5];
-
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-	src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-
-	for (chan = 0; chan < 3; chan++ ) {
-		LLVMValueRef arg = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 0, chan);
-		coords[chan] = lp_build_emit_llvm_binary(bld_base,
-					TGSI_OPCODE_DIV, arg, src_w);
-	}
-	coords[3] = bld_base->base.one;
-
-	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
-	}
-
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->arg_count = 1;
-}
-
-static void tex_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-
-	LLVMValueRef coords[5];
-	unsigned chan;
-	for (chan = 0; chan < 4; chan++) {
-		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
-	}
-
-	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
-		/* These instructions have additional operand that should be packed
-		 * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords.
-		 * That operand should be passed as a float value in the args array
-		 * right after the coord vector. After packing it's not used anymore,
-		 * that's why arg_count is not increased */
-		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
-	}
-
-	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
-	}
-
-	emit_data->arg_count = 1;
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-}
-
-static void txf_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-	const struct tgsi_texture_offset * off = inst->TexOffsets;
-	LLVMTypeRef offset_type = bld_base->int_bld.elem_type;
-
-	/* fetch tex coords */
-	tex_fetch_args(bld_base, emit_data);
-
-	/* fetch tex offsets */
-	if (inst->Texture.NumOffsets) {
-		assert(inst->Texture.NumOffsets == 1);
-
-		emit_data->args[1] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleX],
-			offset_type);
-		emit_data->args[2] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleY],
-			offset_type);
-		emit_data->args[3] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleZ],
-			offset_type);
-	} else {
-		emit_data->args[1] = bld_base->int_bld.zero;
-		emit_data->args[2] = bld_base->int_bld.zero;
-		emit_data->args[3] = bld_base->int_bld.zero;
-	}
-
-	emit_data->arg_count = 4;
-}
-
-LLVMModuleRef r600_tgsi_llvm(
-	struct radeon_llvm_context * ctx,
-	const struct tgsi_token * tokens)
-{
-	struct tgsi_shader_info shader_info;
-	struct lp_build_tgsi_context * bld_base = &ctx->soa.bld_base;
-	radeon_llvm_context_init(ctx, "r600--");
-	LLVMTypeRef Arguments[32];
-	unsigned ArgumentsCount = 0;
-	for (unsigned i = 0; i < ctx->inputs_count; i++)
-		Arguments[ArgumentsCount++] = LLVMVectorType(bld_base->base.elem_type, 4);
-	radeon_llvm_create_func(ctx, NULL, 0, Arguments, ArgumentsCount);
-	for (unsigned i = 0; i < ctx->inputs_count; i++) {
-		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
-		LLVMAddAttribute(P, LLVMInRegAttribute);
-	}
-	tgsi_scan_shader(tokens, &shader_info);
-
-	bld_base->info = &shader_info;
-	bld_base->userdata = ctx;
-	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const;
-	bld_base->emit_prologue = llvm_emit_prologue;
-	bld_base->emit_epilogue = llvm_emit_epilogue;
-	ctx->load_input = llvm_load_input;
-	ctx->load_system_value = llvm_load_system_value;
-
-	bld_base->op_actions[TGSI_OPCODE_DP2] = dot_action;
-	bld_base->op_actions[TGSI_OPCODE_DP3] = dot_action;
-	bld_base->op_actions[TGSI_OPCODE_DP4] = dot_action;
-	bld_base->op_actions[TGSI_OPCODE_DPH] = dot_action;
-	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
-	bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_DDX].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
-	bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_DDY].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TEX].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TEX2].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb";
-	bld_base->op_actions[TGSI_OPCODE_TXB].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb";
-	bld_base->op_actions[TGSI_OPCODE_TXB2].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd";
-	bld_base->op_actions[TGSI_OPCODE_TXD].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf";
-	bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl";
-	bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl";
-	bld_base->op_actions[TGSI_OPCODE_TXL2].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TXP].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq";
-	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cndlt;
-
-	lp_build_tgsi_llvm(bld_base, tokens);
-
-	LLVMBuildRetVoid(bld_base->base.gallivm->builder);
-	radeon_llvm_finalize_module(ctx);
-
-	return ctx->gallivm.module;
-}
-
-/* We need to define these R600 registers here, because we can't include
- * evergreend.h and r600d.h.
- */
-#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
-#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
-
-void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
-					struct r600_bytecode *bc,
-					uint64_t symbol_offset,
-					boolean *use_kill)
-{
-	unsigned i;
-	const unsigned char *config =
-		radeon_shader_binary_config_start(binary, symbol_offset);
-
-	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
-		unsigned reg =
-			util_le32_to_cpu(*(uint32_t*)(config + i));
-		unsigned value =
-			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
-		switch (reg) {
-		/* R600 / R700 */
-		case R_028850_SQ_PGM_RESOURCES_PS:
-		case R_028868_SQ_PGM_RESOURCES_VS:
-		/* Evergreen / Northern Islands */
-		case R_028844_SQ_PGM_RESOURCES_PS:
-		case R_028860_SQ_PGM_RESOURCES_VS:
-		case R_0288D4_SQ_PGM_RESOURCES_LS:
-			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
-			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
-			break;
-		case R_02880C_DB_SHADER_CONTROL:
-			*use_kill = G_02880C_KILL_ENABLE(value);
-			break;
-		case R_0288E8_SQ_LDS_ALLOC:
-			bc->nlds_dw = value;
-			break;
-		}
-	}
-
-}
-
-unsigned r600_create_shader(struct r600_bytecode *bc,
-		const struct radeon_shader_binary *binary,
-		boolean *use_kill)
-
-{
-	assert(binary->code_size % 4 == 0);
-	bc->bytecode = CALLOC(1, binary->code_size);
-	memcpy(bc->bytecode, binary->code, binary->code_size);
-	bc->ndw = binary->code_size / 4;
-
-	r600_shader_binary_read_config(binary, bc, 0, use_kill);
-
-	return 0;
-}
-
-void r600_destroy_shader(struct r600_bytecode *bc)
-{
-	FREE(bc->bytecode);
-}
-
-unsigned r600_llvm_compile(
-	LLVMModuleRef mod,
-	enum radeon_family family,
-	struct r600_bytecode *bc,
-	boolean *use_kill,
-	unsigned dump,
-	struct pipe_debug_callback *debug)
-{
-	unsigned r;
-	struct radeon_shader_binary binary;
-	const char * gpu_family = r600_get_llvm_processor_name(family);
-
-	radeon_shader_binary_init(&binary);
-	if (dump)
-		LLVMDumpModule(mod);
-	r = radeon_llvm_compile(mod, &binary, gpu_family, NULL, debug);
-
-	r = r600_create_shader(bc, &binary, use_kill);
-
-	radeon_shader_binary_clean(&binary);
-
-	return r;
-}
-
-#endif
diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h
deleted file mode 100644
index 3f7fc4bef7e..00000000000
--- a/src/gallium/drivers/r600/r600_llvm.h
+++ /dev/null
@@ -1,42 +0,0 @@
-
-#ifndef R600_LLVM_H
-#define R600_LLVM_H
-
-#if defined R600_USE_LLVM || defined HAVE_OPENCL
-
-#include "radeon/radeon_llvm.h"
-#include <llvm-c/Core.h>
-
-struct pipe_debug_callback;
-struct r600_bytecode;
-struct r600_shader_ctx;
-struct radeon_llvm_context;
-struct radeon_shader_binary;
-enum radeon_family;
-
-LLVMModuleRef r600_tgsi_llvm(
-	struct radeon_llvm_context * ctx,
-	const struct tgsi_token * tokens);
-
-unsigned r600_llvm_compile(
-	LLVMModuleRef mod,
-	enum radeon_family family,
-	struct r600_bytecode *bc,
-	boolean *use_kill,
-	unsigned dump,
-	struct pipe_debug_callback *debug);
-
-unsigned r600_create_shader(struct r600_bytecode *bc,
-		const struct radeon_shader_binary *binary,
-		boolean *use_kill);
-
-void r600_destroy_shader(struct r600_bytecode *bc);
-
-void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
-		struct r600_bytecode *bc,
-		uint64_t symbol_offset,
-		boolean *use_kill);
-
-#endif /* defined R600_USE_LLVM || defined HAVE_OPENCL */
-
-#endif /* R600_LLVM_H */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 7018088d204..b8011917907 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -43,9 +43,6 @@
 
 static const struct debug_named_value r600_debug_options[] = {
 	/* features */
-#if defined(R600_USE_LLVM)
-	{ "llvm", DBG_LLVM, "Enable the LLVM shader compiler" },
-#endif
 	{ "nocpdma", DBG_NO_CP_DMA, "Disable CP DMA" },
 
 	/* shader backend */
@@ -187,9 +184,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen,
 	}
 
 	rctx->b.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
-				       r600_context_gfx_flush, rctx,
-				       rscreen->b.trace_bo ?
-					       rscreen->b.trace_bo->buf : NULL);
+				       r600_context_gfx_flush, rctx);
 	rctx->b.gfx.flush = r600_context_gfx_flush;
 
 	rctx->allocator_fetch_shader = u_suballocator_create(&rctx->b.b, 64 * 1024, 256,
@@ -622,8 +617,6 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
 		rscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS | DBG_TCS | DBG_TES;
 	if (!debug_get_bool_option("R600_HYPERZ", TRUE))
 		rscreen->b.debug_flags |= DBG_NO_HYPERZ;
-	if (debug_get_bool_option("R600_LLVM", FALSE))
-		rscreen->b.debug_flags |= DBG_LLVM;
 
 	if (rscreen->b.family == CHIP_UNKNOWN) {
 		fprintf(stderr, "r600: Unknown chipset 0x%04X\n", rscreen->b.info.pci_id);
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index f8a20398355..cd0052a519f 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -28,8 +28,6 @@
 
 #include "radeon/r600_pipe_common.h"
 #include "radeon/r600_cs.h"
-
-#include "r600_llvm.h"
 #include "r600_public.h"
 
 #include "util/u_suballoc.h"
@@ -60,7 +58,6 @@
 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS	16
 #define R600_MAX_DRAW_CS_DWORDS		58
-#define R600_TRACE_CS_DWORDS		7
 
 #define R600_MAX_USER_CONST_BUFFERS 13
 #define R600_MAX_DRIVER_CONST_BUFFERS 3
@@ -244,7 +241,6 @@ struct r600_gs_rings_state {
 
 /* This must start from 16. */
 /* features */
-#define DBG_LLVM		(1 << 29)
 #define DBG_NO_CP_DMA		(1 << 30)
 /* shader backend */
 #define DBG_NO_SB		(1 << 21)
@@ -571,15 +567,10 @@ static inline void r600_mark_atom_dirty(struct r600_context *rctx,
 	r600_set_atom_dirty(rctx, atom, true);
 }
 
-void r600_trace_emit(struct r600_context *rctx);
-
 static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
 {
 	atom->emit(&rctx->b, atom);
 	r600_set_atom_dirty(rctx, atom, false);
-	if (rctx->screen->b.trace_bo) {
-		r600_trace_emit(rctx);
-	}
 }
 
 static inline void r600_set_cso_state(struct r600_context *rctx,
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index df40f94bdcf..77658f53551 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -21,7 +21,6 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #include "r600_sq.h"
-#include "r600_llvm.h"
 #include "r600_formats.h"
 #include "r600_opcodes.h"
 #include "r600_shader.h"
@@ -194,10 +193,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 	/* disable SB for shaders using doubles */
 	use_sb &= !shader->shader.uses_doubles;
 
-	/* Check if the bytecode has already been built.  When using the llvm
-	 * backend, r600_shader_from_tgsi() will take care of building the
-	 * bytecode.
-	 */
+	/* Check if the bytecode has already been built. */
 	if (!shader->shader.bc.bytecode) {
 		r = r600_bytecode_build(&shader->shader.bc);
 		if (r) {
@@ -332,7 +328,6 @@ struct r600_shader_ctx {
 	uint32_t				*literals;
 	uint32_t				nliterals;
 	uint32_t				max_driver_temp_used;
-	boolean use_llvm;
 	/* needed for evergreen interpolation */
 	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
 	/* evergreen/cayman also store sample mask in face register */
@@ -661,11 +656,9 @@ static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
 		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
 		if (ctx->shader->input[index].interpolate > 0) {
 			evergreen_interp_assign_ij_index(ctx, index);
-			if (!ctx->use_llvm)
-				r = evergreen_interp_alu(ctx, index);
+			r = evergreen_interp_alu(ctx, index);
 		} else {
-			if (!ctx->use_llvm)
-				r = evergreen_interp_flat(ctx, index);
+			r = evergreen_interp_flat(ctx, index);
 		}
 	}
 	return r;
@@ -2936,22 +2929,16 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	int i, j, k, r = 0;
 	int next_param_base = 0, next_clip_base;
 	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
-	/* Declarations used by llvm code */
-	bool use_llvm = false;
 	bool indirect_gprs;
 	bool ring_outputs = false;
 	bool lds_outputs = false;
 	bool lds_inputs = false;
 	bool pos_emitted = false;
 
-#ifdef R600_USE_LLVM
-	use_llvm = rscreen->b.debug_flags & DBG_LLVM;
-#endif
 	ctx.bc = &shader->bc;
 	ctx.shader = shader;
 	ctx.native_integers = true;
 
-
 	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
 			   rscreen->has_compressed_msaa_texturing);
 	ctx.tokens = tokens;
@@ -3043,19 +3030,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		ctx.file_offset[i] = 0;
 	}
 
-#ifdef R600_USE_LLVM
-	if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
-		fprintf(stderr, "Warning: R600 LLVM backend does not support "
-				"indirect adressing.  Falling back to TGSI "
-				"backend.\n");
-		use_llvm = 0;
-	}
-#endif
 	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
-		if (!use_llvm) {
-			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
-		}
+		r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
 	}
 	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
 		if (ctx.bc->chip_class >= EVERGREEN)
@@ -3085,16 +3062,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		if (add_tess_inout)
 			ctx.file_offset[TGSI_FILE_INPUT]+=2;
 	}
-	ctx.use_llvm = use_llvm;
 
-	if (use_llvm) {
-		ctx.file_offset[TGSI_FILE_OUTPUT] =
-			ctx.file_offset[TGSI_FILE_INPUT];
-	} else {
-	   ctx.file_offset[TGSI_FILE_OUTPUT] =
+	ctx.file_offset[TGSI_FILE_OUTPUT] =
 			ctx.file_offset[TGSI_FILE_INPUT] +
 			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
-	}
 	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
 						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
 
@@ -3234,71 +3205,12 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		}
 	}
 
-/* LLVM backend setup */
-#ifdef R600_USE_LLVM
-	if (use_llvm) {
-		struct radeon_llvm_context radeon_llvm_ctx;
-		LLVMModuleRef mod;
-		bool dump = r600_can_dump_shader(&rscreen->b,
-						 tgsi_get_processor_type(tokens));
-		boolean use_kill = false;
-
-		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
-		radeon_llvm_ctx.type = ctx.type;
-		radeon_llvm_ctx.two_side = shader->two_side;
-		radeon_llvm_ctx.face_gpr = ctx.face_gpr;
-		radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
-		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
-		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
-		radeon_llvm_ctx.color_buffer_count = max_color_exports;
-		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
-		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
-		radeon_llvm_ctx.stream_outputs = &so;
-		radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
-		radeon_llvm_ctx.has_compressed_msaa_texturing =
-			ctx.bc->has_compressed_msaa_texturing;
-		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
-		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
-		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
-
-		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill,
-				      dump, &rctx->b.debug)) {
-			radeon_llvm_dispose(&radeon_llvm_ctx);
-			use_llvm = 0;
-			fprintf(stderr, "R600 LLVM backend failed to compile "
-				"shader.  Falling back to TGSI\n");
-		} else {
-			ctx.file_offset[TGSI_FILE_OUTPUT] =
-					ctx.file_offset[TGSI_FILE_INPUT];
-		}
-		if (use_kill)
-			ctx.shader->uses_kill = use_kill;
-		radeon_llvm_dispose(&radeon_llvm_ctx);
-	}
-#endif
-/* End of LLVM backend setup */
-
 	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
 		shader->nr_ps_max_color_exports = 8;
 
-	if (!use_llvm) {
-		if (ctx.fragcoord_input >= 0) {
-			if (ctx.bc->chip_class == CAYMAN) {
-				for (j = 0 ; j < 4; j++) {
-					struct r600_bytecode_alu alu;
-					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-					alu.op = ALU_OP1_RECIP_IEEE;
-					alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
-					alu.src[0].chan = 3;
-
-					alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
-					alu.dst.chan = j;
-					alu.dst.write = (j == 3);
-					alu.last = 1;
-					if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
-						return r;
-				}
-			} else {
+	if (ctx.fragcoord_input >= 0) {
+		if (ctx.bc->chip_class == CAYMAN) {
+			for (j = 0 ; j < 4; j++) {
 				struct r600_bytecode_alu alu;
 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 				alu.op = ALU_OP1_RECIP_IEEE;
@@ -3306,87 +3218,100 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 				alu.src[0].chan = 3;
 
 				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
-				alu.dst.chan = 3;
-				alu.dst.write = 1;
+				alu.dst.chan = j;
+				alu.dst.write = (j == 3);
 				alu.last = 1;
 				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
 					return r;
 			}
-		}
-
-		if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
+		} else {
 			struct r600_bytecode_alu alu;
-			int r;
-
-			/* GS thread with no output workaround - emit a cut at start of GS */
-			if (ctx.bc->chip_class == R600)
-				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = ALU_OP1_RECIP_IEEE;
+			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
+			alu.src[0].chan = 3;
 
-			for (j = 0; j < 4; j++) {
-				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-				alu.op = ALU_OP1_MOV;
-				alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
-				alu.src[0].value = 0;
-				alu.dst.sel = ctx.gs_export_gpr_tregs[j];
-				alu.dst.write = 1;
-				alu.last = 1;
-				r = r600_bytecode_add_alu(ctx.bc, &alu);
-				if (r)
-					return r;
-			}
+			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
+			alu.dst.chan = 3;
+			alu.dst.write = 1;
+			alu.last = 1;
+			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
+				return r;
 		}
+	}
+
+	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
+		struct r600_bytecode_alu alu;
+		int r;
 
-		if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
-			r600_fetch_tess_io_info(&ctx);
+		/* GS thread with no output workaround - emit a cut at start of GS */
+		if (ctx.bc->chip_class == R600)
+			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
 
-		if (shader->two_side && ctx.colors_used) {
-			if ((r = process_twoside_color_inputs(&ctx)))
+		for (j = 0; j < 4; j++) {
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = ALU_OP1_MOV;
+			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
+			alu.src[0].value = 0;
+			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
+			alu.dst.write = 1;
+			alu.last = 1;
+			r = r600_bytecode_add_alu(ctx.bc, &alu);
+			if (r)
 				return r;
 		}
+	}
 
-		tgsi_parse_init(&ctx.parse, tokens);
-		while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
-			tgsi_parse_token(&ctx.parse);
-			switch (ctx.parse.FullToken.Token.Type) {
-			case TGSI_TOKEN_TYPE_INSTRUCTION:
-				r = tgsi_is_supported(&ctx);
-				if (r)
-					goto out_err;
-				ctx.max_driver_temp_used = 0;
-				/* reserve first tmp for everyone */
-				r600_get_temp(&ctx);
+	if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
+		r600_fetch_tess_io_info(&ctx);
+
+	if (shader->two_side && ctx.colors_used) {
+		if ((r = process_twoside_color_inputs(&ctx)))
+			return r;
+	}
 
-				opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
-				if ((r = tgsi_split_constant(&ctx)))
+	tgsi_parse_init(&ctx.parse, tokens);
+	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
+		tgsi_parse_token(&ctx.parse);
+		switch (ctx.parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			r = tgsi_is_supported(&ctx);
+			if (r)
+				goto out_err;
+			ctx.max_driver_temp_used = 0;
+			/* reserve first tmp for everyone */
+			r600_get_temp(&ctx);
+
+			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
+			if ((r = tgsi_split_constant(&ctx)))
+				goto out_err;
+			if ((r = tgsi_split_literal_constant(&ctx)))
+				goto out_err;
+			if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
+				if ((r = tgsi_split_gs_inputs(&ctx)))
 					goto out_err;
-				if ((r = tgsi_split_literal_constant(&ctx)))
+			} else if (lds_inputs) {
+				if ((r = tgsi_split_lds_inputs(&ctx)))
 					goto out_err;
-				if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
-					if ((r = tgsi_split_gs_inputs(&ctx)))
-						goto out_err;
-				} else if (lds_inputs) {
-					if ((r = tgsi_split_lds_inputs(&ctx)))
-						goto out_err;
-				}
-				if (ctx.bc->chip_class == CAYMAN)
-					ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
-				else if (ctx.bc->chip_class >= EVERGREEN)
-					ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
-				else
-					ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
-				r = ctx.inst_info->process(&ctx);
+			}
+			if (ctx.bc->chip_class == CAYMAN)
+				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
+			else if (ctx.bc->chip_class >= EVERGREEN)
+				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
+			else
+				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
+			r = ctx.inst_info->process(&ctx);
+			if (r)
+				goto out_err;
+
+			if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) {
+				r = r600_store_tcs_output(&ctx);
 				if (r)
 					goto out_err;
-
-				if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) {
-					r = r600_store_tcs_output(&ctx);
-					if (r)
-						goto out_err;
-				}
-				break;
-			default:
-				break;
 			}
+			break;
+		default:
+			break;
 		}
 	}
 
@@ -3437,8 +3362,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 				alu.dst.write = (j == ochan);
 				if (j == 3)
 					alu.last = 1;
-				if (!use_llvm)
-					r = r600_bytecode_add_alu(ctx.bc, &alu);
+				r = r600_bytecode_add_alu(ctx.bc, &alu);
 				if (r)
 					return r;
 			}
@@ -3446,7 +3370,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	}
 
 	/* Add stream outputs. */
-	if (!use_llvm && so.num_outputs) {
+	if (so.num_outputs) {
 		bool emit = false;
 		if (!lds_outputs && !ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX)
 			emit = true;
@@ -3709,31 +3633,27 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 			}
 		}
 		/* add output to bytecode */
-		if (!use_llvm) {
-			for (i = 0; i < noutput; i++) {
-				r = r600_bytecode_add_output(ctx.bc, &output[i]);
-				if (r)
-					goto out_err;
-			}
+		for (i = 0; i < noutput; i++) {
+			r = r600_bytecode_add_output(ctx.bc, &output[i]);
+			if (r)
+				goto out_err;
 		}
 	}
 
 	/* add program end */
-	if (!use_llvm) {
-		if (ctx.bc->chip_class == CAYMAN)
-			cm_bytecode_add_cf_end(ctx.bc);
-		else {
-			const struct cf_op_info *last = NULL;
+	if (ctx.bc->chip_class == CAYMAN)
+		cm_bytecode_add_cf_end(ctx.bc);
+	else {
+		const struct cf_op_info *last = NULL;
 
-			if (ctx.bc->cf_last)
-				last = r600_isa_cf(ctx.bc->cf_last->op);
+		if (ctx.bc->cf_last)
+			last = r600_isa_cf(ctx.bc->cf_last->op);
 
-			/* alu clause instructions don't have EOP bit, so add NOP */
-			if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS)
-				r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
+		/* alu clause instructions don't have EOP bit, so add NOP */
+		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS)
+			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
 
-			ctx.bc->cf_last->end_of_program = 1;
-		}
+		ctx.bc->cf_last->end_of_program = 1;
 	}
 
 	/* check GPR limit - we have 124 = 128 - 4
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 2211e07ceba..df41d3f028d 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -2029,10 +2029,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT);
 	}
 
-	if (rctx->screen->b.trace_bo) {
-		r600_trace_emit(rctx);
-	}
-
 	/* Set the depth buffer as dirty. */
 	if (rctx->framebuffer.state.zsbuf) {
 		struct pipe_surface *surf = rctx->framebuffer.state.zsbuf;
@@ -2927,22 +2923,3 @@ void r600_init_common_state_functions(struct r600_context *rctx)
 	rctx->b.set_occlusion_query_state = r600_set_occlusion_query_state;
 	rctx->b.need_gfx_cs_space = r600_need_gfx_cs_space;
 }
-
-void r600_trace_emit(struct r600_context *rctx)
-{
-	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
-	uint64_t va;
-	uint32_t reloc;
-
-	va = rscreen->b.trace_bo->gpu_address;
-	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rscreen->b.trace_bo,
-				      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
-	radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
-	radeon_emit(cs, va & 0xFFFFFFFFUL);
-	radeon_emit(cs, (va >> 32UL) & 0xFFUL);
-	radeon_emit(cs, cs->cdw);
-	radeon_emit(cs, rscreen->b.cs_count);
-	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-	radeon_emit(cs, reloc);
-}
diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp
index 556a05da395..3dd3a4815ba 100644
--- a/src/gallium/drivers/r600/sb/sb_expr.cpp
+++ b/src/gallium/drivers/r600/sb/sb_expr.cpp
@@ -598,9 +598,13 @@ bool expr_handler::fold_assoc(alu_node *n) {
 
 	unsigned op = n->bc.op;
 	bool allow_neg = false, cur_neg = false;
+	bool distribute_neg = false;
 
 	switch(op) {
 	case ALU_OP2_ADD:
+		distribute_neg = true;
+		allow_neg = true;
+		break;
 	case ALU_OP2_MUL:
 	case ALU_OP2_MUL_IEEE:
 		allow_neg = true;
@@ -632,7 +636,7 @@ bool expr_handler::fold_assoc(alu_node *n) {
 		if (v1->is_const()) {
 			literal arg = v1->get_const_value();
 			apply_alu_src_mod(a->bc, 1, arg);
-			if (cur_neg)
+			if (cur_neg && distribute_neg)
 				arg.f = -arg.f;
 
 			if (a == n)
@@ -660,7 +664,7 @@ bool expr_handler::fold_assoc(alu_node *n) {
 		if (v0->is_const()) {
 			literal arg = v0->get_const_value();
 			apply_alu_src_mod(a->bc, 0, arg);
-			if (cur_neg)
+			if (cur_neg && distribute_neg)
 				arg.f = -arg.f;
 
 			if (last_arg == 0) {
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index ea028272ccd..eed9d83ee49 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -229,7 +229,7 @@ static void r600_flush_dma_ring(void *ctx, unsigned flags,
 	struct radeon_winsys_cs *cs = rctx->dma.cs;
 
 	if (cs->cdw)
-		rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence, 0);
+		rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
 	if (fence)
 		rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 }
@@ -318,7 +318,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	if (rscreen->info.has_sdma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
 		rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
 						   r600_flush_dma_ring,
-						   rctx, NULL);
+						   rctx);
 		rctx->dma.flush = r600_flush_dma_ring;
 	}
 
@@ -379,7 +379,6 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "tex", DBG_TEX, "Print texture info" },
 	{ "compute", DBG_COMPUTE, "Print compute info" },
 	{ "vm", DBG_VM, "Print virtual addresses when creating resources" },
-	{ "trace_cs", DBG_TRACE_CS, "Trace cs and write rlockup_<csid>.c file with faulty cs" },
 	{ "info", DBG_INFO, "Print driver information" },
 
 	/* shaders */
@@ -893,19 +892,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 	pipe_mutex_init(rscreen->aux_context_lock);
 	pipe_mutex_init(rscreen->gpu_load_mutex);
 
-	if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) ||
-	     rscreen->info.drm_major == 3) &&
-	    (rscreen->debug_flags & DBG_TRACE_CS)) {
-		rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b,
-										PIPE_BIND_CUSTOM,
-										PIPE_USAGE_STAGING,
-										4096);
-		if (rscreen->trace_bo) {
-			rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->buf, NULL,
-									PIPE_TRANSFER_UNSYNCHRONIZED);
-		}
-	}
-
 	if (rscreen->debug_flags & DBG_INFO) {
 		printf("pci_id = 0x%x\n", rscreen->info.pci_id);
 		printf("family = %i (%s)\n", rscreen->info.family,
@@ -951,9 +937,6 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen)
 	pipe_mutex_destroy(rscreen->aux_context_lock);
 	rscreen->aux_context->destroy(rscreen->aux_context);
 
-	if (rscreen->trace_bo)
-		pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL);
-
 	rscreen->ws->destroy(rscreen->ws);
 	FREE(rscreen);
 }
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index cf8dcf7ea88..381ad21a4e3 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -61,7 +61,7 @@
 /* gap - reuse */
 #define DBG_COMPUTE		(1 << 2)
 #define DBG_VM			(1 << 3)
-#define DBG_TRACE_CS		(1 << 4)
+/* gap - reuse */
 /* shader logging */
 #define DBG_FS			(1 << 5)
 #define DBG_VS			(1 << 6)
@@ -303,10 +303,6 @@ struct r600_common_screen {
 	struct pipe_context		*aux_context;
 	pipe_mutex			aux_context_lock;
 
-	struct r600_resource		*trace_bo;
-	uint32_t			*trace_ptr;
-	unsigned			cs_count;
-
 	/* This must be in the screen, because UE4 uses one context for
 	 * compilation and another one for rendering.
 	 */
@@ -610,6 +606,8 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 				   struct r600_atom *fb_state,
 				   unsigned *buffers, unsigned *dirty_cbufs,
 				   const union pipe_color_union *color);
+void r600_texture_disable_dcc(struct r600_common_screen *rscreen,
+			      struct r600_texture *rtex);
 void r600_init_screen_texture_functions(struct r600_common_screen *rscreen);
 void r600_init_context_texture_functions(struct r600_common_context *rctx);
 
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 115c7289c4c..7322f3ee985 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -201,9 +201,11 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
 
 static int r600_setup_surface(struct pipe_screen *screen,
 			      struct r600_texture *rtex,
-			      unsigned pitch_in_bytes_override)
+			      unsigned pitch_in_bytes_override,
+			      unsigned offset)
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
+	unsigned i;
 	int r;
 
 	r = rscreen->ws->surface_init(rscreen->ws, &rtex->surface);
@@ -225,6 +227,11 @@ static int r600_setup_surface(struct pipe_screen *screen,
 			rtex->surface.stencil_level[0].offset = rtex->surface.level[0].slice_size;
 		}
 	}
+
+	if (offset) {
+		for (i = 0; i < Elements(rtex->surface.level); ++i)
+			rtex->surface.level[i].offset += offset;
+	}
 	return 0;
 }
 
@@ -290,8 +297,8 @@ static void r600_texture_disable_cmask(struct r600_common_screen *rscreen,
 	p_atomic_inc(&rscreen->compressed_colortex_counter);
 }
 
-static void r600_texture_disable_dcc(struct r600_common_screen *rscreen,
-				     struct r600_texture *rtex)
+void r600_texture_disable_dcc(struct r600_common_screen *rscreen,
+			      struct r600_texture *rtex)
 {
 	struct r600_common_context *rctx =
 		(struct r600_common_context *)rscreen->aux_context;
@@ -366,6 +373,8 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 
 	return rscreen->ws->buffer_get_handle(res->buf,
 					      rtex->surface.level[0].pitch_bytes,
+					      rtex->surface.level[0].offset,
+					      rtex->surface.level[0].slice_size,
 					      whandle);
 }
 
@@ -629,8 +638,14 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	    rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38)
 		return 0;
 
-	/* Overalign HTILE on Stoney to fix piglit/depthstencil-render-miplevels 585. */
-	if (rscreen->family == CHIP_STONEY)
+	/* Overalign HTILE on P2 configs to work around GPU hangs in
+	 * piglit/depthstencil-render-miplevels 585.
+	 *
+	 * This has been confirmed to help Kabini & Stoney, where the hangs
+	 * are always reproducible. I think I have seen the test hang
+	 * on Carrizo too, though it was very rare there.
+	 */
+	if (rscreen->chip_class >= CIK && num_pipes < 4)
 		num_pipes = 4;
 
 	switch (num_pipes) {
@@ -791,6 +806,7 @@ static struct r600_texture *
 r600_texture_create_object(struct pipe_screen *screen,
 			   const struct pipe_resource *base,
 			   unsigned pitch_in_bytes_override,
+			   unsigned offset,
 			   struct pb_buffer *buf,
 			   struct radeon_surf *surface)
 {
@@ -812,7 +828,7 @@ r600_texture_create_object(struct pipe_screen *screen,
 	rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format));
 
 	rtex->surface = *surface;
-	if (r600_setup_surface(screen, rtex, pitch_in_bytes_override)) {
+	if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) {
 		FREE(rtex);
 		return NULL;
 	}
@@ -979,7 +995,7 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
 	if (r) {
 		return NULL;
 	}
-	return (struct pipe_resource *)r600_texture_create_object(screen, templ,
+	return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0,
 								  0, NULL, &surface);
 }
 
@@ -990,7 +1006,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
 	struct pb_buffer *buf = NULL;
-	unsigned stride = 0;
+	unsigned stride = 0, offset = 0;
 	unsigned array_mode;
 	struct radeon_surf surface;
 	int r;
@@ -1002,7 +1018,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
 	      templ->depth0 != 1 || templ->last_level != 0)
 		return NULL;
 
-	buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride);
+	buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride, &offset);
 	if (!buf)
 		return NULL;
 
@@ -1029,8 +1045,8 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
 	if (metadata.scanout)
 		surface.flags |= RADEON_SURF_SCANOUT;
 
-	rtex = r600_texture_create_object(screen, templ,
-					  stride, buf, &surface);
+	rtex = r600_texture_create_object(screen, templ, stride,
+					  offset, buf, &surface);
 	if (!rtex)
 		return NULL;
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index bdee2f8020a..0a164bba307 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -51,24 +51,8 @@ struct radeon_llvm_loop {
 };
 
 struct radeon_llvm_context {
-
 	struct lp_build_tgsi_soa_context soa;
 
-	unsigned chip_class;
-	unsigned type;
-	unsigned face_gpr;
-	unsigned two_side;
-	unsigned inputs_count;
-	struct r600_shader_io * r600_inputs;
-	struct r600_shader_io * r600_outputs;
-	struct pipe_stream_output_info *stream_outputs;
-	unsigned color_buffer_count;
-	unsigned fs_color_all;
-	unsigned alpha_to_one;
-	unsigned has_txq_cube_array_z_comp;
-	unsigned uses_tex_buffers;
-	unsigned has_compressed_msaa_texturing;
-
 	/*=== Front end configuration ===*/
 
 	/* Instructions that are not described by any of the TGSI opcodes. */
@@ -90,7 +74,6 @@ struct radeon_llvm_context {
 	  */
 	LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
 	LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS];
-	unsigned output_reg_count;
 
 	/** This pointer is used to contain the temporary values.
 	  * The amount of temporary used in tgsi can't be bound to a max value and
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index c74397fb5c9..fb883cb585e 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -363,9 +363,6 @@ static void emit_declaration(
 					ctx->soa.bld_base.base.elem_type, "");
 			}
 		}
-
-		ctx->output_reg_count = MAX2(ctx->output_reg_count,
-							 decl->Range.Last + 1);
 		break;
 	}
 
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index b8efc58eaab..233f46091a4 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -92,7 +92,7 @@ struct ruvd_decoder {
 /* flush IB to the hardware */
 static void flush(struct ruvd_decoder *dec)
 {
-	dec->ws->cs_flush(dec->cs, RADEON_FLUSH_ASYNC, NULL, 0);
+	dec->ws->cs_flush(dec->cs, RADEON_FLUSH_ASYNC, NULL);
 }
 
 /* add a new set register command to the IB */
@@ -1142,7 +1142,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 	dec->stream_handle = rvid_alloc_stream_handle();
 	dec->screen = context->screen;
 	dec->ws = ws;
-	dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL);
+	dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL);
 	if (!dec->cs) {
 		RVID_ERR("Can't get command submission context.\n");
 		goto error;
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 087d9422c04..2ab74e9eb6c 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -56,7 +56,7 @@
  */
 static void flush(struct rvce_encoder *enc)
 {
-	enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0);
+	enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL);
 	enc->task_info_idx = 0;
 	enc->bs_idx = 0;
 }
@@ -429,7 +429,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 	enc->screen = context->screen;
 	enc->ws = ws;
-	enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL);
+	enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc);
 	if (!enc->cs) {
 		RVID_ERR("Can't get command submission context.\n");
 		goto error;
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index b8a065957a7..d35e963133e 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -515,7 +515,7 @@ struct radeon_winsys {
      */
     struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws,
                                             struct winsys_handle *whandle,
-                                            unsigned *stride);
+                                            unsigned *stride, unsigned *offset);
 
     /**
      * Get a winsys buffer from a user pointer. The resulting buffer can't
@@ -546,7 +546,8 @@ struct radeon_winsys {
      * \return          TRUE on success.
      */
     boolean (*buffer_get_handle)(struct pb_buffer *buf,
-                                 unsigned stride,
+                                 unsigned stride, unsigned offset,
+                                 unsigned slice_size,
                                  struct winsys_handle *whandle);
 
     /**
@@ -592,14 +593,12 @@ struct radeon_winsys {
      * \param ring_type The ring type (GFX, DMA, UVD)
      * \param flush     Flush callback function associated with the command stream.
      * \param user      User pointer that will be passed to the flush callback.
-     * \param trace_buf Trace buffer when tracing is enabled
      */
     struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx,
                                           enum ring_type ring_type,
                                           void (*flush)(void *ctx, unsigned flags,
 							struct pipe_fence_handle **fence),
-                                          void *flush_ctx,
-                                          struct pb_buffer *trace_buf);
+                                          void *flush_ctx);
 
     /**
      * Destroy a command stream.
@@ -672,12 +671,10 @@ struct radeon_winsys {
      * \param flags,      RADEON_FLUSH_ASYNC or 0.
      * \param fence       Pointer to a fence. If non-NULL, a fence is inserted
      *                    after the CS and is returned through this parameter.
-     * \param cs_trace_id A unique identifier of the cs, used for tracing.
      */
     void (*cs_flush)(struct radeon_winsys_cs *cs,
                      unsigned flags,
-                     struct pipe_fence_handle **fence,
-                     uint32_t cs_trace_id);
+                     struct pipe_fence_handle **fence);
 
     /**
      * Return TRUE if a buffer is referenced by a command stream.
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index f9a6de48f6b..e0dbec5fb79 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -325,8 +325,8 @@ static void si_blit_decompress_color(struct pipe_context *ctx,
 }
 
 static void
-si_decompress_color_textures(struct si_context *sctx,
-			     struct si_textures_info *textures)
+si_decompress_sampler_color_textures(struct si_context *sctx,
+				     struct si_textures_info *textures)
 {
 	unsigned i;
 	unsigned mask = textures->compressed_colortex_mask;
@@ -350,6 +350,33 @@ si_decompress_color_textures(struct si_context *sctx,
 	}
 }
 
+static void
+si_decompress_image_color_textures(struct si_context *sctx,
+				   struct si_images_info *images)
+{
+	unsigned i;
+	unsigned mask = images->compressed_colortex_mask;
+
+	while (mask) {
+		const struct pipe_image_view *view;
+		struct r600_texture *tex;
+
+		i = u_bit_scan(&mask);
+
+		view = &images->views[i];
+		assert(view->resource->target != PIPE_BUFFER);
+
+		tex = (struct r600_texture *)view->resource;
+		if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
+			continue;
+
+		si_blit_decompress_color(&sctx->b.b, tex,
+					 view->u.tex.level, view->u.tex.level,
+					 0, util_max_layer(&tex->resource.b.b, view->u.tex.level),
+					 false);
+	}
+}
+
 void si_decompress_textures(struct si_context *sctx)
 {
 	unsigned compressed_colortex_counter;
@@ -370,7 +397,10 @@ void si_decompress_textures(struct si_context *sctx)
 			si_flush_depth_textures(sctx, &sctx->samplers[i]);
 		}
 		if (sctx->samplers[i].compressed_colortex_mask) {
-			si_decompress_color_textures(sctx, &sctx->samplers[i]);
+			si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]);
+		}
+		if (sctx->images[i].compressed_colortex_mask) {
+			si_decompress_image_color_textures(sctx, &sctx->images[i]);
 		}
 	}
 }
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index d12b3e6b28a..815b87bbd7e 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -64,7 +64,8 @@
 #include "util/u_upload_mgr.h"
 
 
-/* NULL image and buffer descriptor.
+/* NULL image and buffer descriptor for textures (alpha = 1) and images
+ * (alpha = 0).
  *
  * For images, all fields must be zero except for the swizzle, which
  * supports arbitrary combinations of 0s and 1s. The texture type must be
@@ -74,7 +75,7 @@
  *
  * This is the only reason why the buffer descriptor must be in words [4:7].
  */
-static uint32_t null_descriptor[8] = {
+static uint32_t null_texture_descriptor[8] = {
 	0,
 	0,
 	0,
@@ -84,10 +85,20 @@ static uint32_t null_descriptor[8] = {
 	 * descriptor */
 };
 
+static uint32_t null_image_descriptor[8] = {
+	0,
+	0,
+	0,
+	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
+	/* the rest must contain zeros, which is also used by the buffer
+	 * descriptor */
+};
+
 static void si_init_descriptors(struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
-				unsigned num_elements)
+				unsigned num_elements,
+				const uint32_t *null_descriptor)
 {
 	int i;
 
@@ -100,10 +111,12 @@ static void si_init_descriptors(struct si_descriptors *desc,
 	desc->shader_userdata_offset = shader_userdata_index * 4;
 
 	/* Initialize the array to NULL descriptors if the element size is 8. */
-	if (element_dw_size % 8 == 0)
+	if (null_descriptor) {
+		assert(element_dw_size % 8 == 0);
 		for (i = 0; i < num_elements * element_dw_size / 8; i++)
-			memcpy(desc->list + i*8, null_descriptor,
-			       sizeof(null_descriptor));
+			memcpy(desc->list + i * 8, null_descriptor,
+			       8 * 4);
+	}
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
@@ -210,7 +223,7 @@ static void si_set_sampler_view(struct si_context *sctx,
 		} else {
 			/* Disable FMASK and bind sampler state in [12:15]. */
 			memcpy(views->desc.list + slot*16 + 8,
-			       null_descriptor, 4*4);
+			       null_texture_descriptor, 4*4);
 
 			if (views->sampler_states[slot])
 				memcpy(views->desc.list + slot*16 + 12,
@@ -220,9 +233,9 @@ static void si_set_sampler_view(struct si_context *sctx,
 		views->desc.enabled_mask |= 1llu << slot;
 	} else {
 		pipe_sampler_view_reference(&views->views[slot], NULL);
-		memcpy(views->desc.list + slot*16, null_descriptor, 8*4);
+		memcpy(views->desc.list + slot*16, null_texture_descriptor, 8*4);
 		/* Only clear the lower dwords of FMASK. */
-		memcpy(views->desc.list + slot*16 + 8, null_descriptor, 4*4);
+		memcpy(views->desc.list + slot*16 + 8, null_texture_descriptor, 4*4);
 		views->desc.enabled_mask &= ~(1llu << slot);
 	}
 
@@ -301,6 +314,160 @@ si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers)
 	}
 }
 
+/* IMAGE VIEWS */
+
+static void
+si_release_image_views(struct si_images_info *images)
+{
+	unsigned i;
+
+	for (i = 0; i < SI_NUM_IMAGES; ++i) {
+		struct pipe_image_view *view = &images->views[i];
+
+		pipe_resource_reference(&view->resource, NULL);
+	}
+
+	si_release_descriptors(&images->desc);
+}
+
+static void
+si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images)
+{
+	uint mask = images->desc.enabled_mask;
+
+	/* Add buffers to the CS. */
+	while (mask) {
+		int i = u_bit_scan(&mask);
+		struct pipe_image_view *view = &images->views[i];
+
+		assert(view->resource);
+
+		si_sampler_view_add_buffer(sctx, view->resource);
+	}
+
+	if (images->desc.buffer) {
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  images->desc.buffer,
+					  RADEON_USAGE_READ,
+					  RADEON_PRIO_DESCRIPTORS);
+	}
+}
+
+static void
+si_disable_shader_image(struct si_images_info *images, unsigned slot)
+{
+	if (images->desc.enabled_mask & (1llu << slot)) {
+		pipe_resource_reference(&images->views[slot].resource, NULL);
+		images->compressed_colortex_mask &= ~(1 << slot);
+
+		memcpy(images->desc.list + slot*8, null_image_descriptor, 8*4);
+		images->desc.enabled_mask &= ~(1llu << slot);
+		images->desc.list_dirty = true;
+	}
+}
+
+static void
+si_set_shader_images(struct pipe_context *pipe, unsigned shader,
+		     unsigned start_slot, unsigned count,
+		     struct pipe_image_view *views)
+{
+	struct si_context *ctx = (struct si_context *)pipe;
+	struct si_screen *screen = ctx->screen;
+	struct si_images_info *images = &ctx->images[shader];
+	unsigned i, slot;
+
+	assert(shader < SI_NUM_SHADERS);
+
+	if (!count)
+		return;
+
+	assert(start_slot + count <= SI_NUM_IMAGES);
+
+	for (i = 0, slot = start_slot; i < count; ++i, ++slot) {
+		struct r600_resource *res;
+
+		if (!views || !views[i].resource) {
+			si_disable_shader_image(images, slot);
+			continue;
+		}
+
+		res = (struct r600_resource *)views[i].resource;
+		util_copy_image_view(&images->views[slot], &views[i]);
+
+		si_sampler_view_add_buffer(ctx, &res->b.b);
+
+		if (res->b.b.target == PIPE_BUFFER) {
+			si_make_buffer_descriptor(screen, res,
+						  views[i].format,
+						  views[i].u.buf.first_element,
+						  views[i].u.buf.last_element,
+						  images->desc.list + slot * 8);
+			images->compressed_colortex_mask &= ~(1 << slot);
+		} else {
+			static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
+			struct r600_texture *tex = (struct r600_texture *)res;
+			unsigned level;
+			unsigned width, height, depth;
+
+			assert(!tex->is_depth);
+			assert(tex->fmask.size == 0);
+
+			if (tex->dcc_offset &&
+			    views[i].access & PIPE_IMAGE_ACCESS_WRITE)
+				r600_texture_disable_dcc(&screen->b, tex);
+
+			if (is_compressed_colortex(tex)) {
+				images->compressed_colortex_mask |= 1 << slot;
+			} else {
+				images->compressed_colortex_mask &= ~(1 << slot);
+			}
+
+			/* Always force the base level to the selected level.
+			 *
+			 * This is required for 3D textures, where otherwise
+			 * selecting a single slice for non-layered bindings
+			 * fails. It doesn't hurt the other targets.
+			 */
+			level = views[i].u.tex.level;
+			width = u_minify(res->b.b.width0, level);
+			height = u_minify(res->b.b.height0, level);
+			depth = u_minify(res->b.b.depth0, level);
+
+			si_make_texture_descriptor(screen, tex, false, res->b.b.target,
+						   views[i].format, swizzle,
+						   level, 0, 0,
+						   views[i].u.tex.first_layer, views[i].u.tex.last_layer,
+						   width, height, depth,
+						   images->desc.list + slot * 8,
+						   NULL);
+		}
+
+		images->desc.enabled_mask |= 1llu << slot;
+		images->desc.list_dirty = true;
+	}
+}
+
+static void
+si_images_update_compressed_colortex_mask(struct si_images_info *images)
+{
+	uint64_t mask = images->desc.enabled_mask;
+
+	while (mask) {
+		int i = u_bit_scan64(&mask);
+		struct pipe_resource *res = images->views[i].resource;
+
+		if (res && res->target != PIPE_BUFFER) {
+			struct r600_texture *rtex = (struct r600_texture *)res;
+
+			if (is_compressed_colortex(rtex)) {
+				images->compressed_colortex_mask |= 1 << i;
+			} else {
+				images->compressed_colortex_mask &= ~(1 << i);
+			}
+		}
+	}
+}
+
 /* SAMPLER STATES */
 
 static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
@@ -351,7 +518,7 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers,
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
 	si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
-			    num_buffers);
+			    num_buffers, NULL);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
@@ -804,6 +971,7 @@ void si_update_compressed_colortex_masks(struct si_context *sctx)
 {
 	for (int i = 0; i < SI_NUM_SHADERS; ++i) {
 		si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]);
+		si_images_update_compressed_colortex_mask(&sctx->images[i]);
 	}
 }
 
@@ -925,6 +1093,28 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 			}
 		}
 	}
+
+	/* Shader images */
+	for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
+		struct si_images_info *images = &sctx->images[shader];
+		unsigned mask = images->desc.enabled_mask;
+
+		while (mask) {
+			unsigned i = u_bit_scan(&mask);
+
+			if (images->views[i].resource == buf) {
+				si_desc_reset_buffer_offset(
+					ctx, images->desc.list + i * 8 + 4,
+					old_va, buf);
+				images->desc.list_dirty = true;
+
+				radeon_add_to_buffer_list(
+					&sctx->b, &sctx->b.gfx, rbuffer,
+					RADEON_USAGE_READWRITE,
+					RADEON_PRIO_SAMPLER_BUFFER);
+			}
+		}
+	}
 }
 
 /* SHADER USER DATA */
@@ -1055,6 +1245,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
 
 		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
 		si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->images[i].desc, base, false);
 	}
 	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
 }
@@ -1074,14 +1265,20 @@ void si_init_all_descriptors(struct si_context *sctx)
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
 
 		si_init_descriptors(&sctx->samplers[i].views.desc,
-				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS);
+				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
+				    null_texture_descriptor);
+
+		si_init_descriptors(&sctx->images[i].desc,
+				    SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
+				    null_image_descriptor);
 	}
 
 	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-			    4, SI_NUM_VERTEX_BUFFERS);
+			    4, SI_NUM_VERTEX_BUFFERS, NULL);
 
 	/* Set pipe_context functions. */
 	sctx->b.b.bind_sampler_states = si_bind_sampler_states;
+	sctx->b.b.set_shader_images = si_set_shader_images;
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
 	sctx->b.b.set_sampler_views = si_set_sampler_views;
 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
@@ -1105,7 +1302,8 @@ bool si_upload_shader_descriptors(struct si_context *sctx)
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
 		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
-		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc))
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
+		    !si_upload_descriptors(sctx, &sctx->images[i].desc))
 			return false;
 	}
 	return si_upload_vertex_buffer_descriptors(sctx);
@@ -1119,6 +1317,7 @@ void si_release_all_descriptors(struct si_context *sctx)
 		si_release_buffer_resources(&sctx->const_buffers[i]);
 		si_release_buffer_resources(&sctx->rw_buffers[i]);
 		si_release_sampler_views(&sctx->samplers[i].views);
+		si_release_image_views(&sctx->images[i]);
 	}
 	si_release_descriptors(&sctx->vertex_buffers);
 }
@@ -1131,6 +1330,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
 		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
 		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
+		si_image_views_begin_new_cs(sctx, &sctx->images[i]);
 	}
 	si_vertex_buffers_begin_new_cs(sctx);
 	si_shader_userdata_begin_new_cs(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index b5a4034cc12..8c900a4ecb6 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -118,8 +118,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	}
 
 	/* Flush the CS. */
-	ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
-		     ctx->screen->b.cs_count++);
+	ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
 
 	if (fence)
 		ws->fence_reference(fence, ctx->last_gfx_fence);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 8b50a49cba0..dd1103eed06 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -140,9 +140,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		sctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
-	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
-				       sctx, sscreen->b.trace_bo ?
-					       sscreen->b.trace_bo->buf : NULL);
+	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX,
+				       si_context_gfx_flush, sctx);
 	sctx->b.gfx.flush = si_context_gfx_flush;
 
 	/* Border colors. */
@@ -539,8 +538,9 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
 	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
 		return 0;
+	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+		return HAVE_LLVM >= 0x0309 ? SI_NUM_IMAGES : 0;
 	}
 	return 0;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 0fef5f72098..6d0d687fe4c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -141,6 +141,12 @@ struct si_textures_info {
 	uint32_t			compressed_colortex_mask;
 };
 
+struct si_images_info {
+	struct si_descriptors		desc;
+	struct pipe_image_view		views[SI_NUM_IMAGES];
+	uint32_t			compressed_colortex_mask;
+};
+
 struct si_framebuffer {
 	struct r600_atom		atom;
 	struct pipe_framebuffer_state	state;
@@ -251,6 +257,7 @@ struct si_context {
 	struct si_buffer_resources	const_buffers[SI_NUM_SHADERS];
 	struct si_buffer_resources	rw_buffers[SI_NUM_SHADERS];
 	struct si_textures_info		samplers[SI_NUM_SHADERS];
+	struct si_images_info		images[SI_NUM_SHADERS];
 
 	/* other shader resources */
 	struct pipe_constant_buffer	null_const_buf; /* used for set_constant_buffer(NULL) on CIK */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 8c1151aa493..9eb531f8d80 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -40,6 +40,7 @@
 #include "util/u_memory.h"
 #include "util/u_pstipple.h"
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_dump.h"
 
@@ -99,6 +100,7 @@ struct si_shader_context
 	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
 	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
 	LLVMValueRef fmasks[SI_NUM_USER_SAMPLERS];
+	LLVMValueRef images[SI_NUM_IMAGES];
 	LLVMValueRef so_buffers[4];
 	LLVMValueRef esgs_ring;
 	LLVMValueRef gsvs_ring[4];
@@ -530,6 +532,37 @@ static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 }
 
 /**
+ * Like get_indirect_index, but restricts the return value to a (possibly
+ * undefined) value inside [0..num).
+ */
+static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
+					       const struct tgsi_ind_register *ind,
+					       int rel_index, unsigned num)
+{
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
+	LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
+	LLVMValueRef cc;
+
+	if (util_is_power_of_two(num)) {
+		result = LLVMBuildAnd(builder, result, c_max, "");
+	} else {
+		/* In theory, this MAX pattern should result in code that is
+		 * as good as the bit-wise AND above.
+		 *
+		 * In practice, LLVM generates worse code (at the time of
+		 * writing), because its value tracking is not strong enough.
+		 */
+		cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
+		result = LLVMBuildSelect(builder, cc, result, c_max, "");
+	}
+
+	return result;
+}
+
+
+/**
  * Calculate a dword address given an input or output register and a stride.
  */
 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
@@ -2656,10 +2689,90 @@ static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
 	ctx->return_value = ret;
 }
 
+/**
+ * Given a v8i32 resource descriptor for a buffer, extract the size of the
+ * buffer in number of elements and return it as an i32.
+ */
+static LLVMValueRef get_buffer_size(
+	struct lp_build_tgsi_context *bld_base,
+	LLVMValueRef descriptor)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef size =
+		LLVMBuildExtractElement(builder, descriptor,
+					lp_build_const_int32(gallivm, 6), "");
+
+	if (ctx->screen->b.chip_class >= VI) {
+		/* On VI, the descriptor contains the size in bytes,
+		 * but TXQ must return the size in elements.
+		 * The stride is always non-zero for resources using TXQ.
+		 */
+		LLVMValueRef stride =
+			LLVMBuildExtractElement(builder, descriptor,
+						lp_build_const_int32(gallivm, 5), "");
+		stride = LLVMBuildLShr(builder, stride,
+				       lp_build_const_int32(gallivm, 16), "");
+		stride = LLVMBuildAnd(builder, stride,
+				      lp_build_const_int32(gallivm, 0x3FFF), "");
+
+		size = LLVMBuildUDiv(builder, size, stride, "");
+	}
+
+	return size;
+}
+
+/**
+ * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
+ * intrinsic names).
+ */
+static void build_int_type_name(
+	LLVMTypeRef type,
+	char *buf, unsigned bufsize)
+{
+	assert(bufsize >= 6);
+
+	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
+		snprintf(buf, bufsize, "v%ui32",
+			 LLVMGetVectorSize(type));
+	else
+		strcpy(buf, "i32");
+}
+
 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 				struct lp_build_tgsi_context *bld_base,
 				struct lp_build_emit_data *emit_data);
 
+/* Prevent optimizations (at least of memory accesses) across the current
+ * point in the program by emitting empty inline assembly that is marked as
+ * having side effects.
+ */
+static void emit_optimization_barrier(struct si_shader_context *ctx)
+{
+	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+	LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
+	LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
+	LLVMBuildCall(builder, inlineasm, NULL, 0, "");
+}
+
+static void membar_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+
+	/* Since memoryBarrier only makes guarantees about atomics and
+	 * coherent image accesses (which bypass TC L1), we do not need to emit
+	 * any special cache handling here.
+	 *
+	 * We do have to prevent LLVM from re-ordering loads across
+	 * the barrier though.
+	 */
+	emit_optimization_barrier(ctx);
+}
+
 static bool tgsi_is_array_sampler(unsigned target)
 {
 	return target == TGSI_TEXTURE_1D_ARRAY ||
@@ -2671,6 +2784,459 @@ static bool tgsi_is_array_sampler(unsigned target)
 	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
 }
 
+static bool tgsi_is_array_image(unsigned target)
+{
+	return target == TGSI_TEXTURE_3D ||
+	       target == TGSI_TEXTURE_CUBE ||
+	       target == TGSI_TEXTURE_1D_ARRAY ||
+	       target == TGSI_TEXTURE_2D_ARRAY ||
+	       target == TGSI_TEXTURE_CUBE_ARRAY ||
+	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
+}
+
+/**
+ * Given a 256-bit resource descriptor, force the DCC enable bit to off.
+ *
+ * At least on Tonga, executing image stores on images with DCC enabled and
+ * non-trivial can eventually lead to lockups. This can occur when an
+ * application binds an image as read-only but then uses a shader that writes
+ * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
+ * program termination) in this case, but it doesn't cost much to be a bit
+ * nicer: disabling DCC in the shader still leads to undefined results but
+ * avoids the lockup.
+ */
+static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
+				  LLVMValueRef rsrc)
+{
+	if (ctx->screen->b.chip_class <= CIK) {
+		return rsrc;
+	} else {
+		LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+		LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
+		LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
+		LLVMValueRef tmp;
+
+		tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
+		tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
+		return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
+	}
+}
+
+/**
+ * Load the resource descriptor for \p image.
+ */
+static void
+image_fetch_rsrc(
+	struct lp_build_tgsi_context *bld_base,
+	const struct tgsi_full_src_register *image,
+	bool dcc_off,
+	LLVMValueRef *rsrc)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+
+	assert(image->Register.File == TGSI_FILE_IMAGE);
+
+	if (!image->Register.Indirect) {
+		/* Fast path: use preloaded resources */
+		*rsrc = ctx->images[image->Register.Index];
+	} else {
+		/* Indexing and manual load */
+		LLVMValueRef ind_index;
+		LLVMValueRef rsrc_ptr;
+		LLVMValueRef tmp;
+
+		/* From the GL_ARB_shader_image_load_store extension spec:
+		 *
+		 *    If a shader performs an image load, store, or atomic
+		 *    operation using an image variable declared as an array,
+		 *    and if the index used to select an individual element is
+		 *    negative or greater than or equal to the size of the
+		 *    array, the results of the operation are undefined but may
+		 *    not lead to termination.
+		 */
+		ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
+						       image->Register.Index,
+						       SI_NUM_IMAGES);
+
+		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
+		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
+		if (dcc_off)
+			tmp = force_dcc_off(ctx, tmp);
+		*rsrc = tmp;
+	}
+}
+
+static LLVMValueRef image_fetch_coords(
+		struct lp_build_tgsi_context *bld_base,
+		const struct tgsi_full_instruction *inst,
+		unsigned src)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	unsigned target = inst->Memory.Texture;
+	int sample;
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &sample);
+	LLVMValueRef coords[4];
+	LLVMValueRef tmp;
+	int chan;
+
+	for (chan = 0; chan < num_coords; ++chan) {
+		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
+		tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+		coords[chan] = tmp;
+	}
+
+	if (num_coords == 1)
+		return coords[0];
+
+	if (num_coords == 3) {
+		/* LLVM has difficulties lowering 3-element vectors. */
+		coords[3] = bld_base->uint_bld.undef;
+		num_coords = 4;
+	}
+
+	return lp_build_gather_values(gallivm, coords, num_coords);
+}
+
+/**
+ * Append the extra mode bits that are used by image load and store.
+ */
+static void image_append_args(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data * emit_data,
+		unsigned target,
+		bool atomic)
+{
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
+	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
+
+	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
+	emit_data->args[emit_data->arg_count++] =
+		tgsi_is_array_image(target) ? i1true : i1false; /* da */
+	if (!atomic) {
+		emit_data->args[emit_data->arg_count++] =
+			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
+			i1true : i1false; /* glc */
+	}
+	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
+}
+
+/**
+ * Append the resource and indexing arguments for buffer intrinsics.
+ *
+ * \param rsrc the 256 bit resource
+ * \param index index into the buffer
+ */
+static void buffer_append_args(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data *emit_data,
+		LLVMValueRef rsrc,
+		LLVMValueRef index,
+		bool atomic)
+{
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
+	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
+	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
+
+	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
+	rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
+	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
+
+	emit_data->args[emit_data->arg_count++] = rsrc;
+	emit_data->args[emit_data->arg_count++] = index; /* vindex */
+	emit_data->args[emit_data->arg_count++] = bld_base->uint_bld.zero; /* voffset */
+	if (!atomic) {
+		emit_data->args[emit_data->arg_count++] =
+			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
+			i1true : i1false; /* glc */
+	}
+	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
+}
+
+static void load_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef coords;
+	LLVMValueRef rsrc;
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+
+	image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
+	coords = image_fetch_coords(bld_base, inst, 1);
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		buffer_append_args(ctx, emit_data, rsrc, coords, false);
+	} else {
+		emit_data->args[0] = coords;
+		emit_data->args[1] = rsrc;
+		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
+		emit_data->arg_count = 3;
+
+		image_append_args(ctx, emit_data, target, false);
+	}
+}
+
+static void load_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	char intrinsic_name[32];
+	char coords_type[8];
+
+	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
+		emit_optimization_barrier(ctx);
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
+			builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
+			emit_data->args, emit_data->arg_count,
+			LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+	} else {
+		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
+				    coords_type, sizeof(coords_type));
+
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.image.load.%s", coords_type);
+
+		emit_data->output[emit_data->chan] =
+			lp_build_intrinsic(
+				builder, intrinsic_name, emit_data->dst_type,
+				emit_data->args, emit_data->arg_count,
+				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+	}
+}
+
+static void store_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	struct tgsi_full_src_register image;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef chans[4];
+	LLVMValueRef data;
+	LLVMValueRef coords;
+	LLVMValueRef rsrc;
+	unsigned chan;
+
+	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
+
+	image = tgsi_full_src_register_from_dst(&inst->Dst[0]);
+	coords = image_fetch_coords(bld_base, inst, 0);
+
+	for (chan = 0; chan < 4; ++chan) {
+		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
+	}
+	data = lp_build_gather_values(gallivm, chans, 4);
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		image_fetch_rsrc(bld_base, &image, false, &rsrc);
+		emit_data->args[0] = data;
+		emit_data->arg_count = 1;
+
+		buffer_append_args(ctx, emit_data, rsrc, coords, false);
+	} else {
+		emit_data->args[0] = data;
+		emit_data->args[1] = coords;
+		image_fetch_rsrc(bld_base, &image, true, &emit_data->args[2]);
+		emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
+		emit_data->arg_count = 4;
+
+		image_append_args(ctx, emit_data, target, false);
+	}
+}
+
+static void store_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	char intrinsic_name[32];
+	char coords_type[8];
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
+			builder, "llvm.amdgcn.buffer.store.format.v4f32",
+			emit_data->dst_type, emit_data->args, emit_data->arg_count,
+			LLVMNoUnwindAttribute);
+	} else {
+		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
+				    coords_type, sizeof(coords_type));
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.image.store.%s", coords_type);
+
+		emit_data->output[emit_data->chan] =
+			lp_build_intrinsic(
+				builder, intrinsic_name, emit_data->dst_type,
+				emit_data->args, emit_data->arg_count,
+				LLVMNoUnwindAttribute);
+	}
+}
+
+static void atomic_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef data1, data2;
+	LLVMValueRef coords;
+	LLVMValueRef rsrc;
+	LLVMValueRef tmp;
+
+	emit_data->dst_type = bld_base->base.elem_type;
+
+	image_fetch_rsrc(bld_base, &inst->Src[0], target != TGSI_TEXTURE_BUFFER,
+			 &rsrc);
+	coords = image_fetch_coords(bld_base, inst, 1);
+
+	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
+	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+
+	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
+		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+	}
+
+	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
+	 * of arguments, which is reversed relative to TGSI (and GLSL)
+	 */
+	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
+		emit_data->args[emit_data->arg_count++] = data2;
+	emit_data->args[emit_data->arg_count++] = data1;
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		buffer_append_args(ctx, emit_data, rsrc, coords, true);
+	} else {
+		emit_data->args[emit_data->arg_count++] = coords;
+		emit_data->args[emit_data->arg_count++] = rsrc;
+
+		image_append_args(ctx, emit_data, target, true);
+	}
+}
+
+static void atomic_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	char intrinsic_name[40];
+	LLVMValueRef tmp;
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
+	} else {
+		char coords_type[8];
+
+		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
+				    coords_type, sizeof(coords_type));
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.image.atomic.%s.%s",
+			 action->intr_name, coords_type);
+	}
+
+	tmp = lp_build_intrinsic(
+		builder, intrinsic_name, bld_base->uint_bld.elem_type,
+		emit_data->args, emit_data->arg_count,
+		LLVMNoUnwindAttribute);
+	emit_data->output[emit_data->chan] =
+		LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
+}
+
+static void resq_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const struct tgsi_full_src_register *reg = &inst->Src[0];
+	unsigned tex_target = inst->Memory.Texture;
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+
+	if (tex_target == TGSI_TEXTURE_BUFFER) {
+		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
+		emit_data->arg_count = 1;
+	} else {
+		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
+		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
+		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
+		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
+		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
+		emit_data->args[5] = tgsi_is_array_image(tex_target) ?
+			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
+		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
+		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
+		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
+		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
+		emit_data->arg_count = 10;
+	}
+}
+
+static void resq_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef out;
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		out = get_buffer_size(bld_base, emit_data->args[0]);
+	} else {
+		out = lp_build_intrinsic(
+			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
+			emit_data->args, emit_data->arg_count,
+			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
+
+		/* Divide the number of layers by 6 to get the number of cubes. */
+		if (target == TGSI_TEXTURE_CUBE_ARRAY) {
+			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
+			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
+
+			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
+			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
+			z = LLVMBuildSDiv(builder, z, imm6, "");
+			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
+			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
+		}
+	}
+
+	emit_data->output[emit_data->chan] = out;
+}
+
 static void set_tex_fetch_args(struct si_shader_context *ctx,
 			       struct lp_build_emit_data *emit_data,
 			       unsigned opcode, unsigned target,
@@ -2836,26 +3402,7 @@ static void tex_fetch_args(
 		if (target == TGSI_TEXTURE_BUFFER) {
 			/* Read the size from the buffer descriptor directly. */
 			LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
-			LLVMValueRef size = LLVMBuildExtractElement(builder, res,
-							lp_build_const_int32(gallivm, 6), "");
-
-			if (ctx->screen->b.chip_class >= VI) {
-				/* On VI, the descriptor contains the size in bytes,
-				 * but TXQ must return the size in elements.
-				 * The stride is always non-zero for resources using TXQ.
-				 */
-				LLVMValueRef stride =
-					LLVMBuildExtractElement(builder, res,
-								lp_build_const_int32(gallivm, 5), "");
-				stride = LLVMBuildLShr(builder, stride,
-						       lp_build_const_int32(gallivm, 16), "");
-				stride = LLVMBuildAnd(builder, stride,
-						      lp_build_const_int32(gallivm, 0x3FFF), "");
-
-				size = LLVMBuildUDiv(builder, size, stride, "");
-			}
-
-			emit_data->args[0] = size;
+			emit_data->args[0] = get_buffer_size(bld_base, res);
 			return;
 		}
 
@@ -3236,14 +3783,9 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 		return;
 	}
 
-	if (LLVMGetTypeKind(LLVMTypeOf(emit_data->args[0])) == LLVMVectorTypeKind)
-		sprintf(type, ".v%ui32",
-			LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
-	else
-		strcpy(type, ".i32");
-
 	/* Add the type and suffixes .c, .o if needed. */
-	sprintf(intr_name, "%s%s%s%s%s",
+	build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
+	sprintf(intr_name, "%s%s%s%s.%s",
 		name, is_shadow ? ".c" : "", infix,
 		has_offset ? ".o" : "", type);
 
@@ -3865,8 +4407,8 @@ static void create_function(struct si_shader_context *ctx)
 	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
 	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
 	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
-	params[SI_PARAM_UNUSED] = LLVMPointerType(ctx->i32, CONST_ADDR_SPACE);
-	last_array_pointer = SI_PARAM_UNUSED;
+	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
+	last_array_pointer = SI_PARAM_IMAGES;
 
 	switch (ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
@@ -4153,6 +4695,34 @@ static void preload_samplers(struct si_shader_context *ctx)
 	}
 }
 
+static void preload_images(struct si_shader_context *ctx)
+{
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	struct tgsi_shader_info *info = &ctx->shader->selector->info;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
+	LLVMValueRef res_ptr;
+	unsigned i;
+
+	if (num_images == 0)
+		return;
+
+	res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
+
+	for (i = 0; i < num_images; ++i) {
+		/* Rely on LLVM to shrink the load for buffer resources. */
+		LLVMValueRef rsrc =
+			build_indexed_load_const(ctx, res_ptr,
+						 lp_build_const_int32(gallivm, i));
+
+		if (info->images_writemask & (1 << i) &&
+		    !(info->images_buffers & (1 << i)))
+			rsrc = force_dcc_off(ctx, rsrc);
+
+		ctx->images[i] = rsrc;
+	}
+}
+
 static void preload_streamout_buffers(struct si_shader_context *ctx)
 {
 	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
@@ -4792,6 +5362,7 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       LLVMTargetMachineRef tm)
 {
 	struct lp_build_tgsi_context *bld_base;
+	struct lp_build_tgsi_action tmpl = {};
 
 	memset(ctx, 0, sizeof(*ctx));
 	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
@@ -4839,6 +5410,38 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
 
+	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
+	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
+	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
+
+	tmpl.fetch_args = atomic_fetch_args;
+	tmpl.emit = atomic_emit;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
+	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
+	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
+	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
+	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
+	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
+
+	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
+
 	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
@@ -4926,6 +5529,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 	create_function(&ctx);
 	preload_constants(&ctx);
 	preload_samplers(&ctx);
+	preload_images(&ctx);
 	preload_streamout_buffers(&ctx);
 	preload_ring_buffers(&ctx);
 
@@ -5383,7 +5987,7 @@ static bool si_compile_tcs_epilog(struct si_screen *sscreen,
 	last_array_pointer = SI_PARAM_RW_BUFFERS;
 	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
 	params[SI_PARAM_SAMPLERS] = ctx.i64;
-	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_IMAGES] = ctx.i64;
 	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
 	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
 	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
@@ -5633,7 +6237,7 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen,
 	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
 	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
 	params[SI_PARAM_SAMPLERS] = ctx.i64;
-	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_IMAGES] = ctx.i64;
 	params[SI_PARAM_ALPHA_REF] = ctx.f32;
 	last_array_pointer = -1;
 	last_sgpr = SI_PARAM_ALPHA_REF;
@@ -5897,12 +6501,15 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	struct si_shader *mainp = shader->selector->main_shader_part;
 	int r;
 
-	/* LS and ES are always compiled on demand. */
+	/* LS, ES, VS are compiled on demand if the main part hasn't been
+	 * compiled for that stage.
+	 */
 	if (!mainp ||
 	    (shader->selector->type == PIPE_SHADER_VERTEX &&
-	     (shader->key.vs.as_es || shader->key.vs.as_ls)) ||
+	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
+	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
 	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
-	     shader->key.tes.as_es)) {
+	     shader->key.tes.as_es != mainp->key.tes.as_es)) {
 		/* Monolithic shader (compiled as a whole, has many variants,
 		 * may take a long time to compile).
 		 */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index de23e642fe4..8059edf6395 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -80,7 +80,7 @@ struct radeon_shader_reloc;
 #define SI_SGPR_RW_BUFFERS	0  /* rings (& stream-out, VS only) */
 #define SI_SGPR_CONST_BUFFERS	2
 #define SI_SGPR_SAMPLERS	4  /* images & sampler states interleaved */
-/* TODO: gap */
+#define SI_SGPR_IMAGES		6
 #define SI_SGPR_VERTEX_BUFFERS	8  /* VS only */
 #define SI_SGPR_BASE_VERTEX	10 /* VS only */
 #define SI_SGPR_START_INSTANCE	11 /* VS only */
@@ -104,7 +104,7 @@ struct radeon_shader_reloc;
 #define SI_PARAM_RW_BUFFERS	0
 #define SI_PARAM_CONST_BUFFERS	1
 #define SI_PARAM_SAMPLERS	2
-#define SI_PARAM_UNUSED		3 /* TODO: use */
+#define SI_PARAM_IMAGES		3
 
 /* VS only parameters */
 #define SI_PARAM_VERTEX_BUFFERS	4
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index f823af188c7..1245f56c08a 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2797,7 +2797,7 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
  * Build the sampler view descriptor for a buffer texture.
  * @param state 256-bit descriptor; only the high 128 bits are filled in
  */
-static void
+void
 si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
 			  enum pipe_format format,
 			  unsigned first_element, unsigned last_element,
@@ -2838,9 +2838,10 @@ si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
 /**
  * Build the sampler view descriptor for a texture.
  */
-static void
+void
 si_make_texture_descriptor(struct si_screen *screen,
 			   struct r600_texture *tex,
+			   bool sampler,
 			   enum pipe_texture_target target,
 			   enum pipe_format pipe_format,
 			   const unsigned char state_swizzle[4],
@@ -2855,7 +2856,7 @@ si_make_texture_descriptor(struct si_screen *screen,
 	const struct util_format_description *desc;
 	unsigned char swizzle[4];
 	int first_non_void;
-	unsigned num_format, data_format;
+	unsigned num_format, data_format, type;
 	uint32_t pitch;
 	uint64_t va;
 
@@ -2973,12 +2974,30 @@ si_make_texture_descriptor(struct si_screen *screen,
 		data_format = 0;
 	}
 
-	if (res->target == PIPE_TEXTURE_1D_ARRAY) {
+	if (!sampler &&
+	    (res->target == PIPE_TEXTURE_CUBE ||
+	     res->target == PIPE_TEXTURE_CUBE_ARRAY ||
+	     res->target == PIPE_TEXTURE_3D)) {
+		/* For the purpose of shader images, treat cube maps and 3D
+		 * textures as 2D arrays. For 3D textures, the address
+		 * calculations for mipmaps are different, so we rely on the
+		 * caller to effectively disable mipmaps.
+		 */
+		type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+
+		assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
+	} else {
+		type = si_tex_dim(res->target, target, res->nr_samples);
+	}
+
+	if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
 	        height = 1;
 		depth = res->array_size;
-	} else if (res->target == PIPE_TEXTURE_2D_ARRAY) {
-		depth = res->array_size;
-	} else if (res->target == PIPE_TEXTURE_CUBE_ARRAY)
+	} else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
+		   type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
+		if (sampler || res->target != PIPE_TEXTURE_3D)
+			depth = res->array_size;
+	} else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
 		depth = res->array_size / 6;
 
 	pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format);
@@ -3001,7 +3020,7 @@ si_make_texture_descriptor(struct si_screen *screen,
 					last_level) |
 		    S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level, false)) |
 		    S_008F1C_POW2_PAD(res->last_level > 0) |
-		    S_008F1C_TYPE(si_tex_dim(res->target, target, res->nr_samples)));
+		    S_008F1C_TYPE(type));
 	state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1));
 	state[5] = (S_008F24_BASE_ARRAY(first_layer) |
 		    S_008F24_LAST_ARRAY(last_layer));
@@ -3155,7 +3174,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 	    state->target == PIPE_TEXTURE_CUBE)
 		last_layer = state->u.tex.first_layer;
 
-	si_make_texture_descriptor(sctx->screen, tmp, state->target,
+	si_make_texture_descriptor(sctx->screen, tmp, true, state->target,
 				   state->format, state_swizzle,
 				   base_level, first_level, last_level,
 				   state->u.tex.first_layer, last_layer,
@@ -3503,6 +3522,52 @@ static void si_texture_barrier(struct pipe_context *ctx)
 			 SI_CONTEXT_FLUSH_AND_INV_CB;
 }
 
+static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	/* Subsequent commands must wait for all shader invocations to
+	 * complete. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+
+	if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
+		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+				 SI_CONTEXT_INV_VMEM_L1;
+
+	if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
+		     PIPE_BARRIER_SHADER_BUFFER |
+		     PIPE_BARRIER_TEXTURE |
+		     PIPE_BARRIER_IMAGE |
+		     PIPE_BARRIER_STREAMOUT_BUFFER)) {
+		/* As far as I can tell, L1 contents are written back to L2
+		 * automatically at end of shader, but the contents of other
+		 * L1 caches might still be stale. */
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1;
+	}
+
+	if (flags & PIPE_BARRIER_INDEX_BUFFER) {
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1;
+
+		/* Indices are read through TC L2 since VI. */
+		if (sctx->screen->b.chip_class <= CIK)
+			sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+	}
+
+	if (flags & PIPE_BARRIER_FRAMEBUFFER)
+		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+
+	if (flags & (PIPE_BARRIER_MAPPED_BUFFER |
+		     PIPE_BARRIER_FRAMEBUFFER |
+		     PIPE_BARRIER_INDIRECT_BUFFER)) {
+		/* Not sure if INV_GLOBAL_L2 is the best thing here.
+		 *
+		 * We need to make sure that TC L1 & L2 are written back to
+		 * memory, because neither CPU accesses nor CB fetches consider
+		 * TC, but there's no need to invalidate any TC cache lines. */
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+	}
+}
+
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 {
 	struct pipe_blend_state blend;
@@ -3583,6 +3648,7 @@ void si_init_state_functions(struct si_context *sctx)
 	sctx->b.b.set_index_buffer = si_set_index_buffer;
 
 	sctx->b.b.texture_barrier = si_texture_barrier;
+	sctx->b.b.memory_barrier = si_memory_barrier;
 	sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
 	sctx->b.b.set_min_samples = si_set_min_samples;
 	sctx->b.b.set_tess_state = si_set_tess_state;
@@ -3637,7 +3703,8 @@ static void si_query_opaque_metadata(struct r600_common_screen *rscreen,
 	/* TILE_MODE_INDEX is ambiguous without a PCI ID. */
 	md->metadata[1] = (ATI_VENDOR_ID << 16) | rscreen->info.pci_id;
 
-	si_make_texture_descriptor(sscreen, rtex, res->target, res->format,
+	si_make_texture_descriptor(sscreen, rtex, true,
+				   res->target, res->format,
 				   swizzle, 0, 0, res->last_level, 0,
 				   is_array ? res->array_size - 1 : 0,
 				   res->width0, res->height0, res->depth0,
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 60c34f19e55..c4d6b9d9eee 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -158,6 +158,8 @@ struct si_shader_data {
 #define SI_DRIVER_STATE_CONST_BUF	SI_NUM_USER_CONST_BUFFERS
 #define SI_NUM_CONST_BUFFERS		(SI_DRIVER_STATE_CONST_BUF + 1)
 
+#define SI_NUM_IMAGES			16
+
 /* Read-write buffer slots.
  *
  * Ring buffers:        0..1
@@ -272,6 +274,23 @@ unsigned cik_tile_split(unsigned tile_split);
 unsigned si_array_mode(unsigned mode);
 uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex);
 unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil);
+void
+si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
+			  enum pipe_format format,
+			  unsigned first_element, unsigned last_element,
+			  uint32_t *state);
+void
+si_make_texture_descriptor(struct si_screen *screen,
+			   struct r600_texture *tex,
+			   bool sampler,
+			   enum pipe_texture_target target,
+			   enum pipe_format pipe_format,
+			   const unsigned char state_swizzle[4],
+			   unsigned base_level, unsigned first_level, unsigned last_level,
+			   unsigned first_layer, unsigned last_layer,
+			   unsigned width, unsigned height, unsigned depth,
+			   uint32_t *state,
+			   uint32_t *fmask_state);
 struct pipe_sampler_view *
 si_create_sampler_view_custom(struct pipe_context *ctx,
 			      struct pipe_resource *texture,
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 5fe1f7960f3..02489583423 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -794,9 +794,15 @@ static void si_shader_ps(struct si_shader *shader)
 	 * - the shader uses at least 2 VMEM instructions, or
 	 * - the code size is at least 50 2-dword instructions or 100 1-dword
 	 *   instructions.
+	 *
+	 * Shaders with side effects that must execute independently of the
+	 * depth test require LATE_Z.
 	 */
-	if (info->num_memory_instructions >= 2 ||
-	    shader->binary.code_size > 100*4)
+	if (info->writes_memory &&
+	    !info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])
+		shader->z_order = V_02880C_LATE_Z;
+	else if (info->num_memory_instructions >= 2 ||
+	         shader->binary.code_size > 100*4)
 		shader->z_order = V_02880C_EARLY_Z_THEN_RE_Z;
 	else
 		shader->z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
@@ -1042,6 +1048,31 @@ static int si_shader_select(struct pipe_context *ctx,
 	return si_shader_select_with_key(ctx, state, &key);
 }
 
+static void si_parse_next_shader_property(const struct tgsi_shader_info *info,
+					  union si_shader_key *key)
+{
+	unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER];
+
+	switch (info->processor) {
+	case TGSI_PROCESSOR_VERTEX:
+		switch (next_shader) {
+		case TGSI_PROCESSOR_GEOMETRY:
+			key->vs.as_es = 1;
+			break;
+		case TGSI_PROCESSOR_TESS_CTRL:
+		case TGSI_PROCESSOR_TESS_EVAL:
+			key->vs.as_ls = 1;
+			break;
+		}
+		break;
+
+	case TGSI_PROCESSOR_TESS_EVAL:
+		if (next_shader == TGSI_PROCESSOR_GEOMETRY)
+			key->tes.as_es = 1;
+		break;
+	}
+}
+
 static void *si_create_shader_selector(struct pipe_context *ctx,
 				       const struct pipe_shader_state *state)
 {
@@ -1157,6 +1188,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 	if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])
 		sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1);
 
+	if (sel->info.writes_memory)
+		sel->db_shader_control |= S_02880C_EXEC_ON_HIER_FAIL(1) |
+					  S_02880C_EXEC_ON_NOOP(1);
+
 	/* Compile the main shader part for use with a prolog and/or epilog. */
 	if (sel->type != PIPE_SHADER_GEOMETRY &&
 	    !sscreen->use_monolithic_shaders) {
@@ -1167,6 +1202,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 			goto error;
 
 		shader->selector = sel;
+		si_parse_next_shader_property(&sel->info, &shader->key);
 
 		tgsi_binary = si_get_tgsi_binary(sel);
 
@@ -1202,6 +1238,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		union si_shader_key key;
 
 		memset(&key, 0, sizeof(key));
+		si_parse_next_shader_property(&sel->info, &key);
 
 		/* Set reasonable defaults, so that the shader key doesn't
 		 * cause any code to be eliminated.
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index da4281490ae..896dcdf59d0 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -247,6 +247,7 @@ struct pipe_context *svga_context_create(struct pipe_screen *screen,
           sizeof(svga->state.hw_draw.default_constbuf_size));
    memset(svga->state.hw_draw.enabled_constbufs, 0,
           sizeof(svga->state.hw_draw.enabled_constbufs));
+   svga->state.hw_draw.ib = NULL;
 
    /* Create a no-operation blend state which we will bind whenever the
     * requested blend state is impossible (e.g. due to having an integer
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 1976f98e5c1..ead47c07980 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -55,16 +55,21 @@
 #define SVGA_QUERY_COMMAND_BUFFER_SIZE     (PIPE_QUERY_DRIVER_SPECIFIC + 7)
 #define SVGA_QUERY_FLUSH_TIME              (PIPE_QUERY_DRIVER_SPECIFIC + 8)
 #define SVGA_QUERY_SURFACE_WRITE_FLUSHES   (PIPE_QUERY_DRIVER_SPECIFIC + 9)
+#define SVGA_QUERY_NUM_READBACKS           (PIPE_QUERY_DRIVER_SPECIFIC + 10)
+#define SVGA_QUERY_NUM_RESOURCE_UPDATES    (PIPE_QUERY_DRIVER_SPECIFIC + 11)
+#define SVGA_QUERY_NUM_BUFFER_UPLOADS      (PIPE_QUERY_DRIVER_SPECIFIC + 12)
+#define SVGA_QUERY_NUM_CONST_BUF_UPDATES   (PIPE_QUERY_DRIVER_SPECIFIC + 13)
+#define SVGA_QUERY_NUM_CONST_UPDATES       (PIPE_QUERY_DRIVER_SPECIFIC + 14)
 
 /* running total counters */
-#define SVGA_QUERY_MEMORY_USED             (PIPE_QUERY_DRIVER_SPECIFIC + 10)
-#define SVGA_QUERY_NUM_SHADERS             (PIPE_QUERY_DRIVER_SPECIFIC + 11)
-#define SVGA_QUERY_NUM_RESOURCES           (PIPE_QUERY_DRIVER_SPECIFIC + 12)
-#define SVGA_QUERY_NUM_STATE_OBJECTS       (PIPE_QUERY_DRIVER_SPECIFIC + 13)
-#define SVGA_QUERY_NUM_SURFACE_VIEWS       (PIPE_QUERY_DRIVER_SPECIFIC + 14)
-#define SVGA_QUERY_NUM_GENERATE_MIPMAP     (PIPE_QUERY_DRIVER_SPECIFIC + 15)
+#define SVGA_QUERY_MEMORY_USED             (PIPE_QUERY_DRIVER_SPECIFIC + 15)
+#define SVGA_QUERY_NUM_SHADERS             (PIPE_QUERY_DRIVER_SPECIFIC + 16)
+#define SVGA_QUERY_NUM_RESOURCES           (PIPE_QUERY_DRIVER_SPECIFIC + 17)
+#define SVGA_QUERY_NUM_STATE_OBJECTS       (PIPE_QUERY_DRIVER_SPECIFIC + 18)
+#define SVGA_QUERY_NUM_SURFACE_VIEWS       (PIPE_QUERY_DRIVER_SPECIFIC + 19)
+#define SVGA_QUERY_NUM_GENERATE_MIPMAP     (PIPE_QUERY_DRIVER_SPECIFIC + 20)
 /*SVGA_QUERY_MAX has to be last because it is size of an array*/
-#define SVGA_QUERY_MAX                     (PIPE_QUERY_DRIVER_SPECIFIC + 16)
+#define SVGA_QUERY_MAX                     (PIPE_QUERY_DRIVER_SPECIFIC + 21)
 
 /**
  * Maximum supported number of constant buffers per shader
@@ -499,20 +504,25 @@ struct svga_context
 
    /** performance / info queries for HUD */
    struct {
-      uint64_t num_draw_calls;       /**< SVGA_QUERY_DRAW_CALLS */
-      uint64_t num_fallbacks;        /**< SVGA_QUERY_NUM_FALLBACKS */
-      uint64_t num_flushes;          /**< SVGA_QUERY_NUM_FLUSHES */
-      uint64_t num_validations;      /**< SVGA_QUERY_NUM_VALIDATIONS */
-      uint64_t map_buffer_time;      /**< SVGA_QUERY_MAP_BUFFER_TIME */
-      uint64_t num_resources_mapped; /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */
-      uint64_t command_buffer_size;  /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */
-      uint64_t flush_time;           /**< SVGA_QUERY_FLUSH_TIME */
-      uint64_t surface_write_flushes; /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */
-      uint64_t num_shaders;          /**< SVGA_QUERY_NUM_SHADERS */
-      uint64_t num_state_objects;    /**< SVGA_QUERY_NUM_STATE_OBJECTS */
-      uint64_t num_surface_views;    /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
-      uint64_t num_bytes_uploaded;   /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
-      uint64_t num_generate_mipmap;  /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */
+      uint64_t num_draw_calls;          /**< SVGA_QUERY_DRAW_CALLS */
+      uint64_t num_fallbacks;           /**< SVGA_QUERY_NUM_FALLBACKS */
+      uint64_t num_flushes;             /**< SVGA_QUERY_NUM_FLUSHES */
+      uint64_t num_validations;         /**< SVGA_QUERY_NUM_VALIDATIONS */
+      uint64_t map_buffer_time;         /**< SVGA_QUERY_MAP_BUFFER_TIME */
+      uint64_t num_resources_mapped;    /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */
+      uint64_t command_buffer_size;     /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */
+      uint64_t flush_time;              /**< SVGA_QUERY_FLUSH_TIME */
+      uint64_t surface_write_flushes;   /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */
+      uint64_t num_readbacks;           /**< SVGA_QUERY_NUM_READBACKS */
+      uint64_t num_resource_updates;    /**< SVGA_QUERY_NUM_RESOURCE_UPDATES */
+      uint64_t num_buffer_uploads;      /**< SVGA_QUERY_NUM_BUFFER_UPLOADS */
+      uint64_t num_const_buf_updates;   /**< SVGA_QUERY_NUM_CONST_BUF_UPDATES */
+      uint64_t num_const_updates;       /**< SVGA_QUERY_NUM_CONST_UPDATES */
+      uint64_t num_shaders;             /**< SVGA_QUERY_NUM_SHADERS */
+      uint64_t num_state_objects;       /**< SVGA_QUERY_NUM_STATE_OBJECTS */
+      uint64_t num_surface_views;       /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
+      uint64_t num_bytes_uploaded;      /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
+      uint64_t num_generate_mipmap;     /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */
    } hud;
 
    /** The currently bound stream output targets */
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index fe6cf71a6e5..0b9ea889afa 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -458,6 +458,14 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
       ret = svga_rebind_shaders(svga);
       if (ret != PIPE_OK)
          return ret;
+
+      /* Rebind stream output targets */
+      ret = svga_rebind_stream_output_targets(svga);
+      if (ret != PIPE_OK)
+         return ret;
+
+      /* Force rebinding the index buffer when needed */
+      svga->state.hw_draw.ib = NULL;
    }
 
    ret = validate_sampler_resources(svga);
diff --git a/src/gallium/drivers/svga/svga_pipe_misc.c b/src/gallium/drivers/svga/svga_pipe_misc.c
index af9356d7c75..a26e577d8f7 100644
--- a/src/gallium/drivers/svga/svga_pipe_misc.c
+++ b/src/gallium/drivers/svga/svga_pipe_misc.c
@@ -254,10 +254,13 @@ svga_set_debug_callback(struct pipe_context *pipe,
 {
    struct svga_context *svga = svga_context(pipe);
 
-   if (cb)
+   if (cb) {
       svga->debug.callback = *cb;
-   else
+      svga->swc->debug_callback = &svga->debug.callback;
+   } else {
       memset(&svga->debug.callback, 0, sizeof(svga->debug.callback));
+      svga->swc->debug_callback = NULL;
+   }
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index 845f4ef3a1c..88f41eadc1d 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -72,11 +72,14 @@ struct svga_query {
 
 /** cast wrapper */
 static inline struct svga_query *
-svga_query( struct pipe_query *q )
+svga_query(struct pipe_query *q)
 {
    return (struct svga_query *)q;
 }
 
+/**
+ * VGPU9
+ */
 
 static boolean
 svga_get_query_result(struct pipe_context *pipe,
@@ -736,6 +739,11 @@ svga_create_query(struct pipe_context *pipe,
    case SVGA_QUERY_NUM_STATE_OBJECTS:
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_GENERATE_MIPMAP:
+   case SVGA_QUERY_NUM_READBACKS:
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+   case SVGA_QUERY_NUM_CONST_UPDATES:
       break;
    default:
       assert(!"unexpected query type in svga_create_query()");
@@ -808,6 +816,11 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_NUM_STATE_OBJECTS:
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_GENERATE_MIPMAP:
+   case SVGA_QUERY_NUM_READBACKS:
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+   case SVGA_QUERY_NUM_CONST_UPDATES:
       /* nothing */
       break;
    default:
@@ -899,6 +912,21 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_SURFACE_WRITE_FLUSHES:
       sq->begin_count = svga->hud.surface_write_flushes;
       break;
+   case SVGA_QUERY_NUM_READBACKS:
+      sq->begin_count = svga->hud.num_readbacks;
+      break;
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+      sq->begin_count = svga->hud.num_resource_updates;
+      break;
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+      sq->begin_count = svga->hud.num_buffer_uploads;
+      break;
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+      sq->begin_count = svga->hud.num_const_buf_updates;
+      break;
+   case SVGA_QUERY_NUM_CONST_UPDATES:
+      sq->begin_count = svga->hud.num_const_updates;
+      break;
    case SVGA_QUERY_MEMORY_USED:
    case SVGA_QUERY_NUM_SHADERS:
    case SVGA_QUERY_NUM_RESOURCES:
@@ -1002,6 +1030,21 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_SURFACE_WRITE_FLUSHES:
       sq->end_count = svga->hud.surface_write_flushes;
       break;
+   case SVGA_QUERY_NUM_READBACKS:
+      sq->end_count = svga->hud.num_readbacks;
+      break;
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+      sq->end_count = svga->hud.num_resource_updates;
+      break;
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+      sq->end_count = svga->hud.num_buffer_uploads;
+      break;
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+      sq->end_count = svga->hud.num_const_buf_updates;
+      break;
+   case SVGA_QUERY_NUM_CONST_UPDATES:
+      sq->end_count = svga->hud.num_const_updates;
+      break;
    case SVGA_QUERY_MEMORY_USED:
    case SVGA_QUERY_NUM_SHADERS:
    case SVGA_QUERY_NUM_RESOURCES:
@@ -1103,6 +1146,11 @@ svga_get_query_result(struct pipe_context *pipe,
    case SVGA_QUERY_COMMAND_BUFFER_SIZE:
    case SVGA_QUERY_FLUSH_TIME:
    case SVGA_QUERY_SURFACE_WRITE_FLUSHES:
+   case SVGA_QUERY_NUM_READBACKS:
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+   case SVGA_QUERY_NUM_CONST_UPDATES:
       vresult->u64 = sq->end_count - sq->begin_count;
       break;
    /* These are running total counters */
diff --git a/src/gallium/drivers/svga/svga_pipe_streamout.c b/src/gallium/drivers/svga/svga_pipe_streamout.c
index 3f443c44eee..1318b5565ce 100644
--- a/src/gallium/drivers/svga/svga_pipe_streamout.c
+++ b/src/gallium/drivers/svga/svga_pipe_streamout.c
@@ -311,6 +311,25 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
    svga->num_so_targets = num_targets;
 }
 
+/**
+ * Rebind stream output target surfaces
+ */
+enum pipe_error
+svga_rebind_stream_output_targets(struct svga_context *svga)
+{
+   struct svga_winsys_context *swc = svga->swc;
+   enum pipe_error ret;
+   unsigned i;
+
+   for (i = 0; i < svga->num_so_targets; i++) {
+      ret = swc->resource_rebind(swc, svga->so_surfaces[i], NULL, SVGA_RELOC_WRITE);
+      if (ret != PIPE_OK)
+         return ret;
+   }
+
+   return PIPE_OK;
+}
+
 void
 svga_init_stream_output_functions(struct svga_context *svga)
 {
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
index a8ffcc7f680..9ecb97509c2 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -109,6 +109,8 @@ svga_buffer_transfer_map(struct pipe_context *pipe,
             assert(ret == PIPE_OK);
          }
 
+         svga->hud.num_readbacks++;
+
          svga_context_finish(svga);
 
          sbuf->dirty = FALSE;
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 7f7ceab0aa5..1121b780af1 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -311,6 +311,8 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
+   svga->hud.num_resource_updates++;
+
    return PIPE_OK;
 }
 
@@ -385,6 +387,8 @@ svga_buffer_upload_command(struct svga_context *svga,
    swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
+   svga->hud.num_buffer_uploads++;
+
    return PIPE_OK;
 }
 
@@ -433,6 +437,7 @@ svga_buffer_upload_flush(struct svga_context *svga,
          assert(box->x + box->w <= sbuf->b.b.width0);
 
          svga->hud.num_bytes_uploaded += box->w;
+         svga->hud.num_buffer_uploads++;
       }
    }
    else {
@@ -460,6 +465,7 @@ svga_buffer_upload_flush(struct svga_context *svga,
          assert(box->x + box->w <= sbuf->b.b.width0);
 
          svga->hud.num_bytes_uploaded += box->w;
+         svga->hud.num_buffer_uploads++;
       }
    }
 
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index 1edb41dabee..db730802c7a 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -448,6 +448,8 @@ svga_texture_transfer_map(struct pipe_context *pipe,
             ret = readback_image_vgpu9(svga, surf, st->slice, transfer->level);
          }
 
+         svga->hud.num_readbacks++;
+
          assert(ret == PIPE_OK);
          (void) ret;
 
@@ -681,6 +683,8 @@ svga_texture_transfer_unmap(struct pipe_context *pipe,
          ret = update_image_vgpu9(svga, surf, &box, st->slice, transfer->level);
       }
 
+      svga->hud.num_resource_updates++;
+
       assert(ret == PIPE_OK);
       (void) ret;
    }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index bcc512041f7..c0873c0c65a 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -837,6 +837,16 @@ svga_get_driver_query_info(struct pipe_screen *screen,
             PIPE_DRIVER_QUERY_TYPE_MICROSECONDS),
       QUERY("surface-write-flushes", SVGA_QUERY_SURFACE_WRITE_FLUSHES,
             PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-readbacks", SVGA_QUERY_NUM_READBACKS,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-resource-updates", SVGA_QUERY_NUM_RESOURCE_UPDATES,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-buffer-uploads", SVGA_QUERY_NUM_BUFFER_UPLOADS,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-const-buf-updates", SVGA_QUERY_NUM_CONST_BUF_UPDATES,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-const-updates", SVGA_QUERY_NUM_CONST_UPDATES,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
 
       /* running total counters */
       QUERY("memory-used", SVGA_QUERY_MEMORY_USED,
diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c
index 5c99e16d976..78eb3f65b61 100644
--- a/src/gallium/drivers/svga/svga_shader.c
+++ b/src/gallium/drivers/svga/svga_shader.c
@@ -180,18 +180,18 @@ svga_init_shader_key_common(const struct svga_context *svga, unsigned shader,
          assert(view->texture);
          assert(view->texture->target < (1 << 4)); /* texture_target:4 */
 
-         key->tex[i].texture_target = view->texture->target;
-
          /* 1D/2D array textures with one slice are treated as non-arrays
           * by the SVGA3D device.  Convert the texture type here so that
           * we emit the right TEX/SAMPLE instruction in the shader.
           */
-         if (view->texture->array_size == 1) {
-            if (view->texture->target == PIPE_TEXTURE_1D_ARRAY) {
-               key->tex[i].texture_target = PIPE_TEXTURE_1D;
+         if (view->texture->target == PIPE_TEXTURE_1D_ARRAY ||
+             view->texture->target == PIPE_TEXTURE_2D_ARRAY) {
+            if (view->texture->array_size == 1) {
+               key->tex[i].is_array = 0;
             }
-            else if (view->texture->target == PIPE_TEXTURE_2D_ARRAY) {
-               key->tex[i].texture_target = PIPE_TEXTURE_2D;
+            else {
+               assert(view->texture->array_size > 1);
+               key->tex[i].is_array = 1;
             }
          }
 
@@ -207,8 +207,6 @@ svga_init_shader_key_common(const struct svga_context *svga, unsigned shader,
          key->tex[i].swizzle_g = view->swizzle_g;
          key->tex[i].swizzle_b = view->swizzle_b;
          key->tex[i].swizzle_a = view->swizzle_a;
-
-         key->tex[i].return_type = svga_get_texture_datatype(view->format);
       }
    }
    key->num_textures = svga->curr.num_sampler_views[shader];
diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h
index f49fdb46d0e..3f915740b1f 100644
--- a/src/gallium/drivers/svga/svga_shader.h
+++ b/src/gallium/drivers/svga/svga_shader.h
@@ -98,14 +98,13 @@ struct svga_compile_key
       unsigned compare_func:3;
       unsigned unnormalized:1;
       unsigned width_height_idx:5; /**< texture unit */
-      unsigned texture_target:4;   /**< PIPE_TEXTURE_x */
+      unsigned is_array:1;
       unsigned texture_msaa:1;    /**< A multisample texture? */
       unsigned sprite_texgen:1;
       unsigned swizzle_r:3;
       unsigned swizzle_g:3;
       unsigned swizzle_b:3;
       unsigned swizzle_a:3;
-      unsigned return_type:3;  /**< TGSI_RETURN_TYPE_x */
    } tex[PIPE_MAX_SAMPLERS];
    /* Note: svga_compile_keys_equal() depends on the variable-size
     * tex[] array being at the end of this structure.
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index 8ab1693088a..5ae0382cd45 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -301,6 +301,8 @@ emit_const(struct svga_context *svga, unsigned shader, unsigned i,
          return ret;
 
       memcpy(svga->state.hw_draw.cb[shader][i], value, 4 * sizeof(float));
+
+      svga->hud.num_const_updates++;
    }
 
    return ret;
@@ -420,6 +422,9 @@ emit_const_range(struct svga_context *svga,
                 (j - i) * 4 * sizeof(float));
 
          i = j + 1;
+
+         svga->hud.num_const_updates++;
+
       } else {
          ++i;
       }
@@ -549,6 +554,7 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
    void *src_map = NULL, *dst_map;
    unsigned offset;
    const struct svga_shader_variant *variant;
+   unsigned alloc_buf_size;
 
    assert(shader == PIPE_SHADER_VERTEX ||
           shader == PIPE_SHADER_GEOMETRY ||
@@ -613,7 +619,16 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
     */
    new_buf_size = align(new_buf_size, 16);
 
-   u_upload_alloc(svga->const0_upload, 0, new_buf_size,
+   /* Constant buffer size in the upload buffer must be in multiples of 256.
+    * In order to maximize the chance of merging the upload buffer chunks
+    * when svga_buffer_add_range() is called,
+    * the allocate buffer size needs to be in multiples of 256 as well.
+    * Otherwise, since there is gap between each dirty range of the upload buffer,
+    * each dirty range will end up in its own UPDATE_GB_IMAGE command.
+    */
+   alloc_buf_size = align(new_buf_size, CONST0_UPLOAD_ALIGNMENT);
+
+   u_upload_alloc(svga->const0_upload, 0, alloc_buf_size,
                   CONST0_UPLOAD_ALIGNMENT, &offset,
                   &dst_buffer, &dst_map);
    if (!dst_map) {
@@ -664,6 +679,8 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
 
    pipe_resource_reference(&dst_buffer, NULL);
 
+   svga->hud.num_const_buf_updates++;
+
    return ret;
 }
 
@@ -732,6 +749,8 @@ emit_consts_vgpu10(struct svga_context *svga, unsigned shader)
                                                   size);
       if (ret != PIPE_OK)
          return ret;
+
+      svga->hud.num_const_buf_updates++;
    }
 
    svga->state.hw_draw.enabled_constbufs[shader] = enabled_constbufs;
diff --git a/src/gallium/drivers/svga/svga_streamout.h b/src/gallium/drivers/svga/svga_streamout.h
index da0c4457d2e..1daa1ad5352 100644
--- a/src/gallium/drivers/svga/svga_streamout.h
+++ b/src/gallium/drivers/svga/svga_streamout.h
@@ -47,4 +47,7 @@ void
 svga_delete_stream_output(struct svga_context *svga,
                           struct svga_stream_output *streamout);
 
+enum pipe_error
+svga_rebind_stream_output_targets(struct svga_context *svga);
+
 #endif /* SVGA_STREAMOUT_H */
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
index ca4009b9e38..204b814a964 100644
--- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -517,15 +517,15 @@ vs30_output(struct svga_shader_emitter *emit,
 static ubyte
 svga_tgsi_sampler_type(const struct svga_shader_emitter *emit, int idx)
 {
-   switch (emit->key.tex[idx].texture_target) {
-   case PIPE_TEXTURE_1D:
+   switch (emit->sampler_target[idx]) {
+   case TGSI_TEXTURE_1D:
       return SVGA3DSAMP_2D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_RECT:
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
       return SVGA3DSAMP_2D;
-   case PIPE_TEXTURE_3D:
+   case TGSI_TEXTURE_3D:
       return SVGA3DSAMP_VOLUME;
-   case PIPE_TEXTURE_CUBE:
+   case TGSI_TEXTURE_CUBE:
       return SVGA3DSAMP_CUBE;
    }
 
@@ -585,6 +585,14 @@ svga_translate_decl_sm30( struct svga_shader_emitter *emit,
             ok = ps30_output( emit, decl->Semantic, idx );
          break;
 
+      case TGSI_FILE_SAMPLER_VIEW:
+         {
+            unsigned unit = decl->Range.First;
+            assert(decl->Range.First == decl->Range.Last);
+            emit->sampler_target[unit] = decl->SamplerView.Resource;
+         }
+         break;
+
       default:
          /* don't need to declare other vars */
          ok = TRUE;
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 83f0c8bd4d0..7a593ba6e9d 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -136,6 +136,8 @@ struct svga_shader_emitter
    int current_arl;
 
    unsigned pstipple_sampler_unit;
+
+   uint8_t sampler_target[PIPE_MAX_SAMPLERS];
 };
 
 
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 489e68f88e8..3188c411863 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -3849,7 +3849,7 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit,
 
       if (new_tokens) {
          /* Setup texture state for stipple */
-         emit->key.tex[unit].texture_target = PIPE_TEXTURE_2D;
+         emit->sampler_target[unit] = TGSI_TEXTURE_2D;
          emit->key.tex[unit].swizzle_r = TGSI_SWIZZLE_X;
          emit->key.tex[unit].swizzle_g = TGSI_SWIZZLE_Y;
          emit->key.tex[unit].swizzle_b = TGSI_SWIZZLE_Z;
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index 0c5afeb4cf9..0d5628251df 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -134,6 +134,8 @@ struct svga_shader_emitter_v10
 
    /* Samplers */
    unsigned num_samplers;
+   ubyte sampler_target[PIPE_MAX_SAMPLERS];  /**< TGSI_TEXTURE_x */
+   ubyte sampler_return_type[PIPE_MAX_SAMPLERS];  /**< TGSI_RETURN_TYPE_x */
 
    /* Address regs (really implemented with temps) */
    unsigned num_address_regs;
@@ -2312,9 +2314,13 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
       return TRUE;
 
    case TGSI_FILE_SAMPLER_VIEW:
-      /* Not used at this time, but maybe in the future.
-       * See emit_resource_declarations().
-       */
+      {
+         unsigned unit = decl->Range.First;
+         assert(decl->Range.First == decl->Range.Last);
+         emit->sampler_target[unit] = decl->SamplerView.Resource;
+         /* Note: we can ignore YZW return types for now */
+         emit->sampler_return_type[unit] = decl->SamplerView.ReturnTypeX;
+      }
       return TRUE;
 
    default:
@@ -2854,7 +2860,7 @@ emit_constant_declaration(struct svga_shader_emitter_v10 *emit)
 
    /* Texture buffer sizes */
    for (i = 0; i < emit->num_samplers; i++) {
-      if (emit->key.tex[i].texture_target == PIPE_BUFFER) {
+      if (emit->sampler_target[i] == TGSI_TEXTURE_BUFFER) {
          emit->texture_buffer_size_index[i] = total_consts++;
       }
    }
@@ -2918,30 +2924,44 @@ emit_sampler_declarations(struct svga_shader_emitter_v10 *emit)
 
 
 /**
- * Translate PIPE_TEXTURE_x to VGAPU10_RESOURCE_DIMENSION_x.
+ * Translate TGSI_TEXTURE_x to VGAPU10_RESOURCE_DIMENSION_x.
  */
 static unsigned
-pipe_texture_to_resource_dimension(unsigned target, bool msaa)
+tgsi_texture_to_resource_dimension(unsigned target, boolean is_array)
 {
    switch (target) {
-   case PIPE_BUFFER:
+   case TGSI_TEXTURE_BUFFER:
       return VGPU10_RESOURCE_DIMENSION_BUFFER;
-   case PIPE_TEXTURE_1D:
+   case TGSI_TEXTURE_1D:
       return VGPU10_RESOURCE_DIMENSION_TEXTURE1D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_RECT:
-      return msaa ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS
-         : VGPU10_RESOURCE_DIMENSION_TEXTURE2D;
-   case PIPE_TEXTURE_3D:
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURE2D;
+   case TGSI_TEXTURE_3D:
       return VGPU10_RESOURCE_DIMENSION_TEXTURE3D;
-   case PIPE_TEXTURE_CUBE:
+   case TGSI_TEXTURE_CUBE:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURECUBE;
+   case TGSI_TEXTURE_SHADOW1D:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURE1D;
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURE2D;
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_SHADOW1D_ARRAY:
+      return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE1DARRAY
+         : VGPU10_RESOURCE_DIMENSION_TEXTURE1D;
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_SHADOW2D_ARRAY:
+      return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DARRAY
+         : VGPU10_RESOURCE_DIMENSION_TEXTURE2D;
+   case TGSI_TEXTURE_SHADOWCUBE:
       return VGPU10_RESOURCE_DIMENSION_TEXTURECUBE;
-   case PIPE_TEXTURE_1D_ARRAY:
-      return VGPU10_RESOURCE_DIMENSION_TEXTURE1DARRAY;
-   case PIPE_TEXTURE_2D_ARRAY:
-      return msaa ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMSARRAY
-         : VGPU10_RESOURCE_DIMENSION_TEXTURE2DARRAY;
-   case PIPE_TEXTURE_CUBE_ARRAY:
+   case TGSI_TEXTURE_2D_MSAA:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS;
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMSARRAY
+         : VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS;
+   case TGSI_TEXTURE_CUBE_ARRAY:
       return VGPU10_RESOURCE_DIMENSION_TEXTURECUBEARRAY;
    default:
       assert(!"Unexpected resource type");
@@ -2993,8 +3013,8 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit)
       opcode0.value = 0;
       opcode0.opcodeType = VGPU10_OPCODE_DCL_RESOURCE;
       opcode0.resourceDimension =
-         pipe_texture_to_resource_dimension(emit->key.tex[i].texture_target,
-                                            emit->key.tex[i].texture_msaa);
+         tgsi_texture_to_resource_dimension(emit->sampler_target[i],
+                                            emit->key.tex[i].is_array);
       operand0.value = 0;
       operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
       operand0.operandType = VGPU10_OPERAND_TYPE_RESOURCE;
@@ -3008,10 +3028,10 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit)
       STATIC_ASSERT(VGPU10_RETURN_TYPE_SINT == TGSI_RETURN_TYPE_SINT + 1);
       STATIC_ASSERT(VGPU10_RETURN_TYPE_UINT == TGSI_RETURN_TYPE_UINT + 1);
       STATIC_ASSERT(VGPU10_RETURN_TYPE_FLOAT == TGSI_RETURN_TYPE_FLOAT + 1);
-      assert(emit->key.tex[i].return_type <= TGSI_RETURN_TYPE_FLOAT);
-      rt = emit->key.tex[i].return_type + 1;
+      assert(emit->sampler_return_type[i] <= TGSI_RETURN_TYPE_FLOAT);
+      rt = emit->sampler_return_type[i] + 1;
 #else
-      switch (emit->key.tex[i].return_type) {
+      switch (emit->sampler_return_type[i]) {
          case TGSI_RETURN_TYPE_UNORM: rt = VGPU10_RETURN_TYPE_UNORM; break;
          case TGSI_RETURN_TYPE_SNORM: rt = VGPU10_RETURN_TYPE_SNORM; break;
          case TGSI_RETURN_TYPE_SINT:  rt = VGPU10_RETURN_TYPE_SINT;  break;
@@ -5024,7 +5044,7 @@ end_tex_swizzle(struct svga_shader_emitter_v10 *emit,
       unsigned swz_b = emit->key.tex[swz->unit].swizzle_b;
       unsigned swz_a = emit->key.tex[swz->unit].swizzle_a;
       unsigned writemask_0 = 0, writemask_1 = 0;
-      boolean int_tex = is_integer_type(emit->key.tex[swz->unit].return_type);
+      boolean int_tex = is_integer_type(emit->sampler_return_type[swz->unit]);
 
       /* Swizzle w/out zero/one terms */
       struct tgsi_full_src_register src_swizzled =
@@ -5131,7 +5151,7 @@ is_valid_tex_instruction(struct svga_shader_emitter_v10 *emit,
    boolean valid = TRUE;
 
    if (tgsi_is_shadow_target(target) &&
-       is_integer_type(emit->key.tex[unit].return_type)) {
+       is_integer_type(emit->sampler_return_type[unit])) {
       debug_printf("Invalid SAMPLE_C with an integer texture!\n");
       valid = FALSE;
    }
@@ -5528,7 +5548,7 @@ emit_txq(struct svga_shader_emitter_v10 *emit,
 {
    const uint unit = inst->Src[1].Register.Index;
 
-   if (emit->key.tex[unit].texture_target == PIPE_BUFFER) {
+   if (emit->sampler_target[unit] == TGSI_TEXTURE_BUFFER) {
       /* RESINFO does not support querying texture buffers, so we instead
        * store texture buffer sizes in shader constants, then copy them to
        * implement TXQ instead of emitting RESINFO.
@@ -6617,7 +6637,7 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit,
    emit->fs.pstipple_sampler_unit = unit;
 
    /* Setup texture state for stipple */
-   emit->key.tex[unit].texture_target = PIPE_TEXTURE_2D;
+   emit->sampler_target[unit] = TGSI_TEXTURE_2D;
    emit->key.tex[unit].swizzle_r = TGSI_SWIZZLE_X;
    emit->key.tex[unit].swizzle_g = TGSI_SWIZZLE_Y;
    emit->key.tex[unit].swizzle_b = TGSI_SWIZZLE_Z;
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index 0ad6b5e6c76..7da2c4e77ca 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -48,6 +48,7 @@ struct svga_winsys_screen;
 struct svga_winsys_buffer;
 struct pipe_screen;
 struct pipe_context;
+struct pipe_debug_callback;
 struct pipe_fence_handle;
 struct pipe_resource;
 struct svga_region;
@@ -286,6 +287,9 @@ struct svga_winsys_context
                       struct svga_winsys_surface *surface,
                       struct svga_winsys_gb_shader *shader,
                       unsigned flags);
+
+   /** To report perf/conformance/etc issues to the state tracker */
+   struct pipe_debug_callback *debug_callback;
 };
 
 
diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
index c8cb145d334..78b8fdf619b 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -129,7 +129,7 @@ swr_transfer_map(struct pipe_context *pipe,
                swr_fence_submit(swr_context(pipe), screen->flush_fence);
 
             swr_fence_finish(pipe->screen, screen->flush_fence, 0);
-            swr_resource_unused(pipe, spr);
+            swr_resource_unused(resource);
          }
       }
    }
@@ -206,8 +206,8 @@ swr_resource_copy(struct pipe_context *pipe,
    swr_store_dirty_resource(pipe, dst, SWR_TILE_RESOLVED);
 
    swr_fence_finish(pipe->screen, screen->flush_fence, 0);
-   swr_resource_unused(pipe, swr_resource(src));
-   swr_resource_unused(pipe, swr_resource(dst));
+   swr_resource_unused(src);
+   swr_resource_unused(dst);
 
    if ((dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER)
        || (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER)) {
@@ -293,6 +293,7 @@ static void
 swr_destroy(struct pipe_context *pipe)
 {
    struct swr_context *ctx = swr_context(pipe);
+   struct swr_screen *screen = swr_screen(pipe->screen);
 
    if (ctx->blitter)
       util_blitter_destroy(ctx->blitter);
@@ -306,6 +307,9 @@ swr_destroy(struct pipe_context *pipe)
 
    swr_destroy_scratch_buffers(ctx);
 
+   assert(screen);
+   screen->pipe = NULL;
+
    FREE(ctx);
 }
 
@@ -324,9 +328,10 @@ swr_render_condition(struct pipe_context *pipe,
 }
 
 struct pipe_context *
-swr_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
+swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
 {
    struct swr_context *ctx = CALLOC_STRUCT(swr_context);
+   struct swr_screen *screen = swr_screen(p_screen);
    ctx->blendJIT =
       new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>;
 
@@ -347,7 +352,8 @@ swr_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
    if (ctx->swrContext == NULL)
       goto fail;
 
-   ctx->pipe.screen = screen;
+   screen->pipe = &ctx->pipe;
+   ctx->pipe.screen = p_screen;
    ctx->pipe.destroy = swr_destroy;
    ctx->pipe.priv = priv;
    ctx->pipe.create_surface = swr_create_surface;
diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h
index 2fdc7683cb8..59cf0284461 100644
--- a/src/gallium/drivers/swr/swr_resource.h
+++ b/src/gallium/drivers/swr/swr_resource.h
@@ -54,9 +54,6 @@ struct swr_resource {
    unsigned mip_offsets[PIPE_MAX_TEXTURE_LEVELS];
 
    enum swr_resource_status status;
-
-   /* pipe_context to which resource is currently bound. */
-   struct pipe_context *bound_to_context;
 };
 
 
@@ -120,24 +117,21 @@ swr_resource_status & operator|=(enum swr_resource_status & a,
 }
 
 static INLINE void
-swr_resource_read(struct pipe_context *pipe, struct swr_resource *resource)
+swr_resource_read(struct pipe_resource *resource)
 {
-   resource->status |= SWR_RESOURCE_READ;
-   resource->bound_to_context = pipe;
+   swr_resource(resource)->status |= SWR_RESOURCE_READ;
 }
 
 static INLINE void
-swr_resource_write(struct pipe_context *pipe, struct swr_resource *resource)
+swr_resource_write(struct pipe_resource *resource)
 {
-   resource->status |= SWR_RESOURCE_WRITE;
-   resource->bound_to_context = pipe;
+   swr_resource(resource)->status |= SWR_RESOURCE_WRITE;
 }
 
 static INLINE void
-swr_resource_unused(struct pipe_context *pipe, struct swr_resource *resource)
+swr_resource_unused(struct pipe_resource *resource)
 {
-   resource->status = SWR_RESOURCE_UNUSED;
-   resource->bound_to_context = nullptr;
+   swr_resource(resource)->status = SWR_RESOURCE_UNUSED;
 }
 
 #endif
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp
index e46df47570f..f9e52be2367 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -620,7 +620,7 @@ swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt)
 {
    struct swr_screen *screen = swr_screen(p_screen);
    struct swr_resource *spr = swr_resource(pt);
-   struct pipe_context *pipe = spr->bound_to_context;
+   struct pipe_context *pipe = screen->pipe;
 
    /* Only wait on fence if the resource is being used */
    if (pipe && spr->status) {
@@ -630,7 +630,7 @@ swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt)
          swr_fence_submit(swr_context(pipe), screen->flush_fence);
 
       swr_fence_finish(p_screen, screen->flush_fence, 0);
-      swr_resource_unused(pipe, spr);
+      swr_resource_unused(pt);
    }
 
    /*
@@ -661,11 +661,11 @@ swr_flush_frontbuffer(struct pipe_screen *p_screen,
    struct swr_screen *screen = swr_screen(p_screen);
    struct sw_winsys *winsys = screen->winsys;
    struct swr_resource *spr = swr_resource(resource);
-   struct pipe_context *pipe = spr->bound_to_context;
+   struct pipe_context *pipe = screen->pipe;
 
    if (pipe) {
       swr_fence_finish(p_screen, screen->flush_fence, 0);
-      swr_resource_unused(pipe, spr);
+      swr_resource_unused(resource);
       SwrEndFrame(swr_context(pipe)->swrContext);
    }
 
diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h
index a96dc44cf66..0c82a2eff7a 100644
--- a/src/gallium/drivers/swr/swr_screen.h
+++ b/src/gallium/drivers/swr/swr_screen.h
@@ -32,6 +32,7 @@ struct sw_winsys;
 
 struct swr_screen {
    struct pipe_screen base;
+   struct pipe_context *pipe;
 
    struct pipe_fence_handle *flush_fence;
 
diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp
index 47ee3cb2664..e7bf3618a7d 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -646,24 +646,24 @@ swr_update_resource_status(struct pipe_context *pipe,
    if (fb->nr_cbufs)
       for (uint32_t i = 0; i < fb->nr_cbufs; ++i)
          if (fb->cbufs[i])
-            swr_resource_write(pipe, swr_resource(fb->cbufs[i]->texture));
+            swr_resource_write(fb->cbufs[i]->texture);
 
    /* depth/stencil target */
    if (fb->zsbuf)
-      swr_resource_write(pipe, swr_resource(fb->zsbuf->texture));
+      swr_resource_write(fb->zsbuf->texture);
 
    /* VBO vertex buffers */
    for (uint32_t i = 0; i < ctx->num_vertex_buffers; i++) {
       struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i];
       if (!vb->user_buffer)
-         swr_resource_read(pipe, swr_resource(vb->buffer));
+         swr_resource_read(vb->buffer);
    }
 
    /* VBO index buffer */
    if (p_draw_info && p_draw_info->indexed) {
       struct pipe_index_buffer *ib = &ctx->index_buffer;
       if (!ib->user_buffer)
-         swr_resource_read(pipe, swr_resource(ib->buffer));
+         swr_resource_read(ib->buffer);
    }
 
    /* texture sampler views */
@@ -671,7 +671,7 @@ swr_update_resource_status(struct pipe_context *pipe,
       struct pipe_sampler_view *view =
          ctx->sampler_views[PIPE_SHADER_FRAGMENT][i];
       if (view)
-         swr_resource_read(pipe, swr_resource(view->texture));
+         swr_resource_read(view->texture);
    }
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index a13e309985a..49a314cdb25 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -62,7 +62,7 @@ vc4_nir_get_dst_color(nir_builder *b, int sample)
         load->num_components = 1;
         load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT + sample;
         load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-        nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+        nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
         nir_builder_instr_insert(b, &load->instr);
         return &load->dest.ssa;
 }
@@ -627,7 +627,7 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
                         nir_intrinsic_instr_create(b->shader,
                                                    nir_intrinsic_load_sample_mask_in);
                 load->num_components = 1;
-                nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+                nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
                 nir_builder_instr_insert(b, &load->instr);
 
                 nir_ssa_def *bitmask = &load->dest.ssa;
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index d47e3bf52b0..d08ad588e5b 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -183,7 +183,7 @@ vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b,
          * with an offset value of 0.
          */
         assert(nir_src_as_const_value(intr->src[0]) &&
-               nir_src_as_const_value(intr->src[0])->u[0] == 0);
+               nir_src_as_const_value(intr->src[0])->u32[0] == 0);
 
         /* Generate dword loads for the VPM values (Since these intrinsics may
          * be reordered, the actual reads will be generated at the top of the
@@ -197,7 +197,7 @@ vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b,
                 intr_comp->num_components = 1;
                 intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
                 intr_comp->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
                 nir_builder_instr_insert(b, &intr_comp->instr);
 
                 vpm_reads[i] = &intr_comp->dest.ssa;
@@ -256,7 +256,7 @@ vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
          * with an offset value of 0.
          */
         assert(nir_src_as_const_value(intr->src[0]) &&
-               nir_src_as_const_value(intr->src[0])->u[0] == 0);
+               nir_src_as_const_value(intr->src[0])->u32[0] == 0);
 
         /* Generate scalar loads equivalent to the original VEC4. */
         nir_ssa_def *dests[4];
@@ -267,7 +267,7 @@ vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
                 intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
                 intr_comp->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
 
-                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
                 nir_builder_instr_insert(b, &intr_comp->instr);
 
                 dests[i] = &intr_comp->dest.ssa;
@@ -339,7 +339,7 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
          * with an offset value of 0.
          */
         assert(nir_src_as_const_value(intr->src[1]) &&
-               nir_src_as_const_value(intr->src[1])->u[0] == 0);
+               nir_src_as_const_value(intr->src[1])->u32[0] == 0);
 
         b->cursor = nir_before_instr(&intr->instr);
 
@@ -378,7 +378,7 @@ vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
                 nir_intrinsic_instr *intr_comp =
                         nir_intrinsic_instr_create(c->s, intr->intrinsic);
                 intr_comp->num_components = 1;
-                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
 
                 /* Convert the uniform (not user_clip_plane) offset to bytes.
                  * If it happens to be a constant, constant-folding will clean
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
index f6ba5b802ad..a2d89ef3349 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -123,7 +123,7 @@ vc4_nir_lower_txf_ms_instr(struct vc4_compile *c, nir_builder *b,
 
         txf->src[0].src_type = nir_tex_src_coord;
         txf->src[0].src = nir_src_for_ssa(nir_vec2(b, addr, nir_imm_int(b, 0)));
-        nir_ssa_dest_init(&txf->instr, &txf->dest, 4, NULL);
+        nir_ssa_dest_init(&txf->instr, &txf->dest, 4, 32, NULL);
         nir_builder_instr_insert(b, &txf->instr);
         nir_ssa_def_rewrite_uses(&txf_ms->dest.ssa,
                                  nir_src_for_ssa(&txf->dest.ssa));
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index f5826d85174..71a1ebbb313 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -118,7 +118,7 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
         intr->const_index[0] = (VC4_NIR_STATE_UNIFORM_OFFSET + contents) * 4;
         intr->num_components = 1;
         intr->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-        nir_ssa_dest_init(&intr->instr, &intr->dest, 1, NULL);
+        nir_ssa_dest_init(&intr->instr, &intr->dest, 1, 32, NULL);
         nir_builder_instr_insert(b, &intr->instr);
         return &intr->dest.ssa;
 }
@@ -885,7 +885,9 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
         struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
 
-        if (nir_op_infos[compare_instr->op].input_types[0] == nir_type_float)
+        unsigned unsized_type =
+                nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
+        if (unsized_type == nir_type_float)
                 qir_SF(c, qir_FSUB(c, src0, src1));
         else
                 qir_SF(c, qir_SUB(c, src0, src1));
@@ -1519,7 +1521,7 @@ ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
 {
         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
         for (int i = 0; i < instr->def.num_components; i++)
-                qregs[i] = qir_uniform_ui(c, instr->value.u[i]);
+                qregs[i] = qir_uniform_ui(c, instr->value.u32[i]);
 
         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
 }
@@ -1553,7 +1555,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 assert(instr->num_components == 1);
                 const_offset = nir_src_as_const_value(instr->src[0]);
                 if (const_offset) {
-                        offset = instr->const_index[0] + const_offset->u[0];
+                        offset = instr->const_index[0] + const_offset->u32[0];
                         assert(offset % 4 == 0);
                         /* We need dwords */
                         offset = offset / 4;
@@ -1584,7 +1586,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 const_offset = nir_src_as_const_value(instr->src[0]);
                 assert(const_offset && "vc4 doesn't support indirect inputs");
                 if (instr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT) {
-                        assert(const_offset->u[0] == 0);
+                        assert(const_offset->u32[0] == 0);
                         /* Reads of the per-sample color need to be done in
                          * order.
                          */
@@ -1598,7 +1600,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                         }
                         *dest = c->color_reads[sample_index];
                 } else {
-                        offset = instr->const_index[0] + const_offset->u[0];
+                        offset = instr->const_index[0] + const_offset->u32[0];
                         *dest = c->inputs[offset];
                 }
                 break;
@@ -1606,7 +1608,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_store_output:
                 const_offset = nir_src_as_const_value(instr->src[1]);
                 assert(const_offset && "vc4 doesn't support indirect outputs");
-                offset = instr->const_index[0] + const_offset->u[0];
+                offset = instr->const_index[0] + const_offset->u32[0];
 
                 /* MSAA color outputs are the only case where we have an
                  * output that's not lowered to being a store of a single 32
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index bdd76ab1f81..8257b4a7142 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -360,6 +360,14 @@ enum pipe_flush_flags
 #define PIPE_BARRIER_MAPPED_BUFFER     (1 << 0)
 #define PIPE_BARRIER_SHADER_BUFFER     (1 << 1)
 #define PIPE_BARRIER_QUERY_BUFFER      (1 << 2)
+#define PIPE_BARRIER_VERTEX_BUFFER     (1 << 3)
+#define PIPE_BARRIER_INDEX_BUFFER      (1 << 4)
+#define PIPE_BARRIER_CONSTANT_BUFFER   (1 << 5)
+#define PIPE_BARRIER_INDIRECT_BUFFER   (1 << 6)
+#define PIPE_BARRIER_TEXTURE           (1 << 7)
+#define PIPE_BARRIER_IMAGE             (1 << 8)
+#define PIPE_BARRIER_FRAMEBUFFER       (1 << 9)
+#define PIPE_BARRIER_STREAMOUT_BUFFER  (1 << 10)
 
 /**
  * Resource binding flags -- state tracker must specify in advance all
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 7a34841088a..5cc18a293d3 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -117,6 +117,12 @@ enum tgsi_file_type {
 #define TGSI_CYLINDRICAL_WRAP_Z (1 << 2)
 #define TGSI_CYLINDRICAL_WRAP_W (1 << 3)
 
+#define TGSI_MEMORY_TYPE_GLOBAL        0 /* OpenCL global              */
+#define TGSI_MEMORY_TYPE_SHARED        1 /* OpenCL local / GLSL shared */
+#define TGSI_MEMORY_TYPE_PRIVATE       2 /* OpenCL private             */
+#define TGSI_MEMORY_TYPE_INPUT         3 /* OpenCL kernel input params */
+#define TGSI_MEMORY_TYPE_COUNT         4
+
 struct tgsi_declaration
 {
    unsigned Type        : 4;  /**< TGSI_TOKEN_TYPE_DECLARATION */
@@ -130,8 +136,8 @@ struct tgsi_declaration
    unsigned Local       : 1;  /**< optimize as subroutine local variable? */
    unsigned Array       : 1;  /**< extra array info? */
    unsigned Atomic      : 1;  /**< atomic only? for TGSI_FILE_BUFFER */
-   unsigned Shared      : 1;  /**< shared storage for TGSI_FILE_MEMORY */
-   unsigned Padding     : 4;
+   unsigned MemType     : 2;  /**< TGSI_MEMORY_TYPE_x for TGSI_FILE_MEMORY */
+   unsigned Padding     : 3;
 };
 
 struct tgsi_declaration_range
@@ -231,15 +237,6 @@ struct tgsi_declaration_array {
    unsigned Padding : 22;
 };
 
-/*
- * Special resources that don't need to be declared.  They map to the
- * GLOBAL/LOCAL/PRIVATE/INPUT compute memory spaces.
- */
-#define TGSI_RESOURCE_GLOBAL	0x7fff
-#define TGSI_RESOURCE_LOCAL	0x7ffe
-#define TGSI_RESOURCE_PRIVATE	0x7ffd
-#define TGSI_RESOURCE_INPUT	0x7ffc
-
 #define TGSI_IMM_FLOAT32   0
 #define TGSI_IMM_UINT32    1
 #define TGSI_IMM_INT32     2
@@ -278,7 +275,8 @@ union tgsi_immediate_data
 #define TGSI_PROPERTY_NUM_CLIPDIST_ENABLED   15
 #define TGSI_PROPERTY_NUM_CULLDIST_ENABLED   16
 #define TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL 17
-#define TGSI_PROPERTY_COUNT                  18
+#define TGSI_PROPERTY_NEXT_SHADER            18
+#define TGSI_PROPERTY_COUNT                  19
 
 struct tgsi_property {
    unsigned Type         : 4;  /**< TGSI_TOKEN_TYPE_PROPERTY */
diff --git a/src/gallium/include/state_tracker/drm_driver.h b/src/gallium/include/state_tracker/drm_driver.h
index 959a7625e30..fefab11cccd 100644
--- a/src/gallium/include/state_tracker/drm_driver.h
+++ b/src/gallium/include/state_tracker/drm_driver.h
@@ -26,6 +26,11 @@ struct winsys_handle
     */
    unsigned type;
    /**
+    * Input for texture_get_handle, allows to export the offset
+    * of a specific layer of an array texture.
+    */
+   unsigned layer;
+   /**
     * Input to texture_from_handle.
     * Output for texture_get_handle.
     */
@@ -35,6 +40,11 @@ struct winsys_handle
     * Output for texture_get_handle.
     */
    unsigned stride;
+   /**
+    * Input to texture_from_handle.
+    * Output for texture_get_handle.
+    */
+   unsigned offset;
 };
 
 
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index 7f7fbc47e6d..fb0a1802cf2 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -534,6 +534,7 @@ dri2_allocate_textures(struct dri_context *ctx,
          templ.bind = bind;
          whandle.handle = buf->name;
          whandle.stride = buf->pitch;
+         whandle.offset = 0;
          if (screen->can_share_buffer)
             whandle.type = DRM_API_HANDLE_TYPE_SHARED;
          else
@@ -756,6 +757,7 @@ dri2_create_image_from_winsys(__DRIscreen *_screen,
    templ.array_size = 1;
 
    whandle->stride = pitch * util_format_get_blocksize(pf);
+   whandle->offset = 0;
 
    img->texture = screen->base.screen->resource_from_handle(screen->base.screen,
          &templ, whandle, PIPE_HANDLE_USAGE_READ_WRITE);
diff --git a/src/gallium/state_trackers/omx/vid_dec.c b/src/gallium/state_trackers/omx/vid_dec.c
index 5584348761e..108a46029e0 100644
--- a/src/gallium/state_trackers/omx/vid_dec.c
+++ b/src/gallium/state_trackers/omx/vid_dec.c
@@ -140,7 +140,7 @@ static OMX_ERRORTYPE vid_dec_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
 
    r = omx_base_filter_Constructor(comp, name);
    if (r)
-	return r;
+      return r;
 
    priv->profile = PIPE_VIDEO_PROFILE_UNKNOWN;
 
@@ -268,7 +268,7 @@ static OMX_ERRORTYPE vid_dec_SetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i
       r = checkHeader(param, sizeof(OMX_PARAM_COMPONENTROLETYPE));
       if (r)
          return r;
- 
+
       if (!strcmp((char *)role->cRole, OMX_VID_DEC_MPEG2_ROLE)) {
          priv->profile = PIPE_VIDEO_PROFILE_MPEG2_MAIN;
       } else if (!strcmp((char *)role->cRole, OMX_VID_DEC_AVC_ROLE)) {
@@ -321,7 +321,7 @@ static OMX_ERRORTYPE vid_dec_GetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i
          strcpy((char *)role->cRole, OMX_VID_DEC_MPEG2_ROLE);
       else if (priv->profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH)
          strcpy((char *)role->cRole, OMX_VID_DEC_AVC_ROLE);
- 
+
       break;
    }
 
@@ -419,6 +419,7 @@ static OMX_ERRORTYPE vid_dec_DecodeBuffer(omx_base_PortType *port, OMX_BUFFERHEA
    priv->in_buffers[i] = buf;
    priv->sizes[i] = buf->nFilledLen;
    priv->inputs[i] = buf->pBuffer;
+   priv->timestamps[i] = buf->nTimeStamp;
 
    while (priv->num_in_buffers > (!!(buf->nFlags & OMX_BUFFERFLAG_EOS) ? 0 : 1)) {
       bool eos = !!(priv->in_buffers[0]->nFlags & OMX_BUFFERFLAG_EOS);
@@ -469,12 +470,13 @@ static OMX_ERRORTYPE vid_dec_DecodeBuffer(omx_base_PortType *port, OMX_BUFFERHEA
          priv->in_buffers[0] = priv->in_buffers[1];
          priv->sizes[0] = priv->sizes[1] - delta;
          priv->inputs[0] = priv->inputs[1] + delta;
+         priv->timestamps[0] = priv->timestamps[1];
       }
 
       if (r)
          return r;
    }
- 
+
    return OMX_ErrorNone;
 }
 
@@ -513,7 +515,7 @@ static void vid_dec_FillOutput(vid_dec_PrivateType *priv, struct pipe_video_buff
 
    box.width = def->nFrameWidth / 2;
    box.height = def->nFrameHeight / 2;
- 
+
    src = priv->pipe->transfer_map(priv->pipe, views[1]->texture, 0,
                                   PIPE_TRANSFER_READ, &box, &transfer);
    util_copy_rect(dst, views[1]->texture->format, def->nStride, 0, 0,
@@ -526,9 +528,13 @@ static void vid_dec_FrameDecoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE*
 {
    vid_dec_PrivateType *priv = comp->pComponentPrivate;
    bool eos = !!(input->nFlags & OMX_BUFFERFLAG_EOS);
+   OMX_TICKS timestamp;
 
-   if (!input->pInputPortPrivate)
-      input->pInputPortPrivate = priv->Flush(priv);
+   if (!input->pInputPortPrivate) {
+      input->pInputPortPrivate = priv->Flush(priv, &timestamp);
+      if (timestamp != OMX_VID_DEC_TIMESTAMP_INVALID)
+         input->nTimeStamp = timestamp;
+   }
 
    if (input->pInputPortPrivate) {
       if (output->pInputPortPrivate) {
@@ -539,6 +545,7 @@ static void vid_dec_FrameDecoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE*
          vid_dec_FillOutput(priv, input->pInputPortPrivate, output);
       }
       output->nFilledLen = output->nAllocLen;
+      output->nTimeStamp = input->nTimeStamp;
    }
 
    if (eos && input->pInputPortPrivate)
diff --git a/src/gallium/state_trackers/omx/vid_dec.h b/src/gallium/state_trackers/omx/vid_dec.h
index 3b39826b8a7..649d745dfc0 100644
--- a/src/gallium/state_trackers/omx/vid_dec.h
+++ b/src/gallium/state_trackers/omx/vid_dec.h
@@ -59,6 +59,8 @@
 #define OMX_VID_DEC_AVC_NAME "OMX.mesa.video_decoder.avc"
 #define OMX_VID_DEC_AVC_ROLE "video_decoder.avc"
 
+#define OMX_VID_DEC_TIMESTAMP_INVALID ((OMX_TICKS) -1)
+
 struct vl_vlc;
 
 DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType)
@@ -69,7 +71,7 @@ DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType)
    struct pipe_video_codec *codec; \
    void (*Decode)(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left); \
    void (*EndFrame)(vid_dec_PrivateType *priv); \
-   struct pipe_video_buffer *(*Flush)(vid_dec_PrivateType *priv); \
+   struct pipe_video_buffer *(*Flush)(vid_dec_PrivateType *priv, OMX_TICKS *timestamp); \
    struct pipe_video_buffer *target, *shadow; \
    union { \
       struct { \
@@ -100,6 +102,9 @@ DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType)
    OMX_BUFFERHEADERTYPE *in_buffers[2]; \
    const void *inputs[2]; \
    unsigned sizes[2]; \
+   OMX_TICKS timestamps[2]; \
+   OMX_TICKS timestamp; \
+   bool first_buf_in_frame; \
    bool frame_finished; \
    bool frame_started; \
    unsigned bytes_left; \
diff --git a/src/gallium/state_trackers/omx/vid_dec_h264.c b/src/gallium/state_trackers/omx/vid_dec_h264.c
index b4536828909..9aab6d1a1a3 100644
--- a/src/gallium/state_trackers/omx/vid_dec_h264.c
+++ b/src/gallium/state_trackers/omx/vid_dec_h264.c
@@ -45,6 +45,7 @@
 struct dpb_list {
    struct list_head list;
    struct pipe_video_buffer *buffer;
+   OMX_TICKS timestamp;
    unsigned poc;
 };
 
@@ -82,7 +83,7 @@ static const uint8_t Default_8x8_Inter[64] = {
 
 static void vid_dec_h264_Decode(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left);
 static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv);
-static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv);
+static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp);
 
 void vid_dec_h264_Init(vid_dec_PrivateType *priv)
 {
@@ -91,9 +92,10 @@ void vid_dec_h264_Init(vid_dec_PrivateType *priv)
    priv->Decode = vid_dec_h264_Decode;
    priv->EndFrame = vid_dec_h264_EndFrame;
    priv->Flush = vid_dec_h264_Flush;
-   
+
    LIST_INITHEAD(&priv->codec_data.h264.dpb_list);
    priv->picture.h264.field_order_cnt[0] = priv->picture.h264.field_order_cnt[1] = INT_MAX;
+   priv->first_buf_in_frame = true;
 }
 
 static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv)
@@ -104,6 +106,9 @@ static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv)
       return;
 
    vid_dec_NeedTarget(priv);
+   if (priv->first_buf_in_frame)
+      priv->timestamp = priv->timestamps[0];
+   priv->first_buf_in_frame = false;
 
    priv->picture.h264.num_ref_frames = priv->picture.h264.pps->sps->max_num_ref_frames;
 
@@ -127,7 +132,8 @@ static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv)
    priv->frame_started = true;
 }
 
-static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv)
+static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv,
+                                                    OMX_TICKS *timestamp)
 {
    struct dpb_list *entry, *result = NULL;
    struct pipe_video_buffer *buf;
@@ -146,6 +152,8 @@ static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv)
       return NULL;
 
    buf = result->buffer;
+   if (timestamp)
+      *timestamp = result->timestamp;
 
    --priv->codec_data.h264.dpb_num;
    LIST_DEL(&result->list);
@@ -159,6 +167,7 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv)
    struct dpb_list *entry;
    struct pipe_video_buffer *tmp;
    bool top_field_first;
+   OMX_TICKS timestamp;
 
    if (!priv->frame_started)
       return;
@@ -181,7 +190,9 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv)
    if (!entry)
       return;
 
+   priv->first_buf_in_frame = true;
    entry->buffer = priv->target;
+   entry->timestamp = priv->timestamp;
    entry->poc = MIN2(priv->picture.h264.field_order_cnt[0], priv->picture.h264.field_order_cnt[1]);
    LIST_ADDTAIL(&entry->list, &priv->codec_data.h264.dpb_list);
    ++priv->codec_data.h264.dpb_num;
@@ -192,7 +203,8 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv)
       return;
 
    tmp = priv->in_buffers[0]->pInputPortPrivate;
-   priv->in_buffers[0]->pInputPortPrivate = vid_dec_h264_Flush(priv);
+   priv->in_buffers[0]->pInputPortPrivate = vid_dec_h264_Flush(priv, &timestamp);
+   priv->in_buffers[0]->nTimeStamp = timestamp;
    priv->target = tmp;
    priv->frame_finished = priv->in_buffers[0]->pInputPortPrivate != NULL;
 }
@@ -829,7 +841,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
          priv->picture.h264.field_order_cnt[0] = expectedPicOrderCnt + priv->codec_data.h264.delta_pic_order_cnt[0];
          priv->picture.h264.field_order_cnt[1] = priv->picture.h264.field_order_cnt[0] +
             sps->offset_for_top_to_bottom_field + priv->codec_data.h264.delta_pic_order_cnt[1];
-         
+
       } else if (!priv->picture.h264.bottom_field_flag)
          priv->picture.h264.field_order_cnt[0] = expectedPicOrderCnt + priv->codec_data.h264.delta_pic_order_cnt[0];
       else
@@ -859,7 +871,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
       if (!priv->picture.h264.field_pic_flag) {
          priv->picture.h264.field_order_cnt[0] = tempPicOrderCnt;
          priv->picture.h264.field_order_cnt[1] = tempPicOrderCnt;
-         
+
       } else if (!priv->picture.h264.bottom_field_flag)
          priv->picture.h264.field_order_cnt[0] = tempPicOrderCnt;
       else
@@ -876,7 +888,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
 
    priv->picture.h264.num_ref_idx_l0_active_minus1 = pps->num_ref_idx_l0_default_active_minus1;
    priv->picture.h264.num_ref_idx_l1_active_minus1 = pps->num_ref_idx_l1_default_active_minus1;
- 
+
    if (slice_type == PIPE_H264_SLICE_TYPE_P ||
        slice_type == PIPE_H264_SLICE_TYPE_SP ||
        slice_type == PIPE_H264_SLICE_TYPE_B) {
diff --git a/src/gallium/state_trackers/omx/vid_dec_mpeg12.c b/src/gallium/state_trackers/omx/vid_dec_mpeg12.c
index bef83ecd85a..7b2df8f40fa 100644
--- a/src/gallium/state_trackers/omx/vid_dec_mpeg12.c
+++ b/src/gallium/state_trackers/omx/vid_dec_mpeg12.c
@@ -61,7 +61,7 @@ static uint8_t default_non_intra_matrix[64] = {
 
 static void vid_dec_mpeg12_Decode(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left);
 static void vid_dec_mpeg12_EndFrame(vid_dec_PrivateType *priv);
-static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv);
+static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp);
 
 void vid_dec_mpeg12_Init(vid_dec_PrivateType *priv)
 {
@@ -131,10 +131,12 @@ static void vid_dec_mpeg12_EndFrame(vid_dec_PrivateType *priv)
    priv->in_buffers[0]->pInputPortPrivate = done;
 }
 
-static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv)
+static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp)
 {
    struct pipe_video_buffer *result = priv->picture.mpeg12.ref[1];
    priv->picture.mpeg12.ref[1] = NULL;
+   if (timestamp)
+      *timestamp = OMX_VID_DEC_TIMESTAMP_INVALID;
    return result;
 }
 
diff --git a/src/gallium/state_trackers/omx/vid_enc.c b/src/gallium/state_trackers/omx/vid_enc.c
index df22a97a42c..4505fe1a693 100644
--- a/src/gallium/state_trackers/omx/vid_enc.c
+++ b/src/gallium/state_trackers/omx/vid_enc.c
@@ -179,7 +179,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
    if (!screen->get_video_param(screen, PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
                                 PIPE_VIDEO_ENTRYPOINT_ENCODE, PIPE_VIDEO_CAP_SUPPORTED))
       return OMX_ErrorBadParameter;
- 
+
    priv->stacked_frames_num = screen->get_video_param(screen,
                                 PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
                                 PIPE_VIDEO_ENTRYPOINT_ENCODE,
@@ -242,7 +242,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
 
    port->Port_AllocateBuffer = vid_enc_AllocateOutBuffer;
    port->Port_FreeBuffer = vid_enc_FreeOutBuffer;
- 
+
    priv->bitrate.eControlRate = OMX_Video_ControlRateDisable;
    priv->bitrate.nTargetBitrate = 0;
 
@@ -253,7 +253,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
    priv->profile_level.eProfile = OMX_VIDEO_AVCProfileBaseline;
    priv->profile_level.eLevel = OMX_VIDEO_AVCLevel42;
 
-   priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; 
+   priv->force_pic_type.IntraRefreshVOP = OMX_FALSE;
    priv->frame_num = 0;
    priv->pic_order_cnt = 0;
    priv->restricted_b_frames = debug_get_bool_option("OMX_USE_RESTRICTED_B_FRAMES", FALSE);
@@ -380,7 +380,7 @@ static OMX_ERRORTYPE vid_enc_SetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i
 
          port = (omx_base_video_PortType *)priv->ports[OMX_BASE_FILTER_OUTPUTPORT_INDEX];
          port->sPortParam.nBufferSize = framesize * 512 / (16*16);
-      
+
          priv->frame_rate = def->format.video.xFramerate;
 
          priv->callbacks->EventHandler(comp, priv->callbackData, OMX_EventPortSettingsChanged,
@@ -532,10 +532,10 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
    vid_enc_PrivateType *priv = comp->pComponentPrivate;
    OMX_ERRORTYPE r;
    int i;
- 
+
    if (!config)
       return OMX_ErrorBadParameter;
-                         
+
    switch(idx) {
    case OMX_IndexConfigVideoIntraVOPRefresh: {
       OMX_CONFIG_INTRAREFRESHVOPTYPE *type = config;
@@ -543,9 +543,9 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
       r = checkHeader(config, sizeof(OMX_CONFIG_INTRAREFRESHVOPTYPE));
       if (r)
          return r;
-      
+
       priv->force_pic_type = *type;
-      
+
       break;
    }
    case OMX_IndexConfigCommonScale: {
@@ -568,11 +568,11 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
       priv->scale = *scale;
       if (priv->scale.xWidth != 0xffffffff && priv->scale.xHeight != 0xffffffff) {
          struct pipe_video_buffer templat = {};
- 
+
          templat.buffer_format = PIPE_FORMAT_NV12;
          templat.chroma_format = PIPE_VIDEO_CHROMA_FORMAT_420;
-         templat.width = priv->scale.xWidth; 
-         templat.height = priv->scale.xHeight; 
+         templat.width = priv->scale.xWidth;
+         templat.height = priv->scale.xHeight;
          templat.interlaced = false;
          for (i = 0; i < OMX_VID_ENC_NUM_SCALING_BUFFERS; ++i) {
             priv->scale_buffer[i] = priv->s_pipe->create_video_buffer(priv->s_pipe, &templat);
@@ -615,7 +615,7 @@ static OMX_ERRORTYPE vid_enc_GetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
    default:
       return omx_base_component_GetConfig(handle, idx, config);
    }
-   
+
    return OMX_ErrorNone;
 }
 
@@ -1010,10 +1010,10 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
    switch (priv->bitrate.eControlRate) {
    case OMX_Video_ControlRateVariable:
       rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_VARIABLE;
-      break; 
+      break;
    case OMX_Video_ControlRateConstant:
       rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_CONSTANT;
-      break; 
+      break;
    case OMX_Video_ControlRateVariableSkipFrames:
       rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_VARIABLE_SKIP;
       break;
@@ -1023,8 +1023,8 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
    default:
       rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_DISABLE;
       break;
-   } 
-      
+   }
+
    rate_ctrl->frame_rate_den = OMX_VID_ENC_CONTROL_FRAME_RATE_DEN_DEFAULT;
    rate_ctrl->frame_rate_num = ((priv->frame_rate) >> 16) * rate_ctrl->frame_rate_den;
 
@@ -1035,7 +1035,7 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
          rate_ctrl->target_bitrate = priv->bitrate.nTargetBitrate;
       else
          rate_ctrl->target_bitrate = OMX_VID_ENC_BITRATE_MAX;
-      rate_ctrl->peak_bitrate = rate_ctrl->target_bitrate;    
+      rate_ctrl->peak_bitrate = rate_ctrl->target_bitrate;
       if (rate_ctrl->target_bitrate < OMX_VID_ENC_BITRATE_MEDIAN)
          rate_ctrl->vbv_buffer_size = MIN2((rate_ctrl->target_bitrate * 2.75), OMX_VID_ENC_BITRATE_MEDIAN);
       else
@@ -1051,7 +1051,7 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
       rate_ctrl->peak_bits_picture_integer = rate_ctrl->target_bits_picture;
       rate_ctrl->peak_bits_picture_fraction = 0;
    }
-   
+
    picture->quant_i_frames = priv->quant.nQpI;
    picture->quant_p_frames = priv->quant.nQpP;
    picture->quant_b_frames = priv->quant.nQpB;
@@ -1069,7 +1069,7 @@ static void enc_HandleTask(omx_base_PortType *port, struct encode_task *task,
    unsigned size = priv->ports[OMX_BASE_FILTER_OUTPUTPORT_INDEX]->sPortParam.nBufferSize;
    struct pipe_video_buffer *vbuf = task->buf;
    struct pipe_h264_enc_picture_desc picture = {};
- 
+
    /* -------------- scale input image --------- */
    enc_ScaleInput(port, &vbuf, &size);
    priv->s_pipe->flush(priv->s_pipe, NULL, 0);
@@ -1160,7 +1160,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
        priv->force_pic_type.IntraRefreshVOP) {
       enc_ClearBframes(port, inp);
       picture_type = PIPE_H264_ENC_PICTURE_TYPE_IDR;
-      priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; 
+      priv->force_pic_type.IntraRefreshVOP = OMX_FALSE;
       priv->frame_num = 0;
    } else if (priv->codec->profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE ||
               !(priv->pic_order_cnt % OMX_VID_ENC_P_PERIOD_DEFAULT) ||
@@ -1169,7 +1169,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
    } else {
       picture_type = PIPE_H264_ENC_PICTURE_TYPE_B;
    }
-   
+
    task->pic_order_cnt = priv->pic_order_cnt++;
 
    if (picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
@@ -1245,7 +1245,7 @@ static void vid_enc_BufferEncoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE*
    output->pBuffer = priv->t_pipe->transfer_map(priv->t_pipe, outp->bitstream, 0,
                                                 PIPE_TRANSFER_READ_WRITE,
                                                 &box, &outp->transfer);
- 
+
    /* ------------- get size of result ----------------- */
 
    priv->codec->get_feedback(priv->codec, task->feedback, &size);
diff --git a/src/gallium/tests/graw/quad-tex.c b/src/gallium/tests/graw/quad-tex.c
index 5f90166830f..8a9d1b80f1a 100644
--- a/src/gallium/tests/graw/quad-tex.c
+++ b/src/gallium/tests/graw/quad-tex.c
@@ -92,6 +92,7 @@ static void set_fragment_shader( void )
       "DCL OUT[0], COLOR\n"
       "DCL TEMP[0]\n"
       "DCL SAMP[0]\n"
+      "DCL SVIEW[0], 2D, FLOAT\n"
       "  0: TXP TEMP[0], IN[0], SAMP[0], 2D\n"
       "  1: MOV OUT[0], TEMP[0]\n"
       "  2: END\n";
diff --git a/src/gallium/tests/graw/tex-srgb.c b/src/gallium/tests/graw/tex-srgb.c
index af989d72a2e..3b43bcbf73e 100644
--- a/src/gallium/tests/graw/tex-srgb.c
+++ b/src/gallium/tests/graw/tex-srgb.c
@@ -108,6 +108,7 @@ static void set_fragment_shader( void )
       "DCL OUT[0], COLOR\n"
       "DCL TEMP[0]\n"
       "DCL SAMP[0]\n"
+      "DCL SVIEW[0], 2D, FLOAT\n"
       "  0: TXP TEMP[0], IN[0], SAMP[0], 2D\n"
       "  1: MOV OUT[0], TEMP[0]\n"
       "  2: END\n";
diff --git a/src/gallium/tests/graw/tex-swizzle.c b/src/gallium/tests/graw/tex-swizzle.c
index e45b848b48e..8b472c9364c 100644
--- a/src/gallium/tests/graw/tex-swizzle.c
+++ b/src/gallium/tests/graw/tex-swizzle.c
@@ -89,6 +89,7 @@ static void set_fragment_shader(void)
       "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
       "DCL OUT[0], COLOR\n"
       "DCL SAMP[0]\n"
+      "DCL SVIEW[0], 2D, FLOAT\n"
       "  0: TXP OUT[0], IN[0], SAMP[0], 2D\n"
       "  2: END\n";
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index b670f263329..c79bed45753 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -519,7 +519,8 @@ amdgpu_bo_create(struct radeon_winsys *rws,
 
 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
                                                struct winsys_handle *whandle,
-                                               unsigned *stride)
+                                               unsigned *stride,
+                                               unsigned *offset)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_winsys_bo *bo;
@@ -587,6 +588,8 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
 
    if (stride)
       *stride = whandle->stride;
+   if (offset)
+      *offset = whandle->offset;
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
       ws->allocated_vram += align(bo->base.size, ws->gart_page_size);
@@ -609,7 +612,8 @@ error:
 }
 
 static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
-                                    unsigned stride,
+                                    unsigned stride, unsigned offset,
+                                    unsigned slice_size,
                                     struct winsys_handle *whandle)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
@@ -637,6 +641,8 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
       return FALSE;
 
    whandle->stride = stride;
+   whandle->offset = offset;
+   whandle->offset += slice_size * whandle->layer;
    bo->is_shared = true;
    return TRUE;
 }
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 83da740f649..a9fc55f4a5a 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -335,8 +335,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
                  enum ring_type ring_type,
                  void (*flush)(void *ctx, unsigned flags,
                                struct pipe_fence_handle **fence),
-                 void *flush_ctx,
-                 struct pb_buffer *trace_buf)
+                 void *flush_ctx)
 {
    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
    struct amdgpu_cs *cs;
@@ -609,8 +608,7 @@ DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE)
 
 static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
                             unsigned flags,
-                            struct pipe_fence_handle **fence,
-                            uint32_t cs_trace_id)
+                            struct pipe_fence_handle **fence)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
    struct amdgpu_winsys *ws = cs->ctx->ws;
diff --git a/src/gallium/winsys/radeon/drm/Makefile.am b/src/gallium/winsys/radeon/drm/Makefile.am
index 0320aca01f9..b413b0b93a0 100644
--- a/src/gallium/winsys/radeon/drm/Makefile.am
+++ b/src/gallium/winsys/radeon/drm/Makefile.am
@@ -8,5 +8,3 @@ AM_CFLAGS = \
 noinst_LTLIBRARIES = libradeonwinsys.la
 
 libradeonwinsys_la_SOURCES = $(C_SOURCES)
-
-EXTRA_DIST = $(TOOLS_HDR)
diff --git a/src/gallium/winsys/radeon/drm/Makefile.sources b/src/gallium/winsys/radeon/drm/Makefile.sources
index a00c84d35b3..2762c91e216 100644
--- a/src/gallium/winsys/radeon/drm/Makefile.sources
+++ b/src/gallium/winsys/radeon/drm/Makefile.sources
@@ -2,12 +2,8 @@ C_SOURCES := \
 	radeon_drm_bo.c \
 	radeon_drm_bo.h \
 	radeon_drm_cs.c \
-	radeon_drm_cs_dump.c \
 	radeon_drm_cs.h \
 	radeon_drm_public.h \
 	radeon_drm_surface.c \
 	radeon_drm_winsys.c \
 	radeon_drm_winsys.h
-
-TOOLS_HDR := \
-	radeon_ctx.h
diff --git a/src/gallium/winsys/radeon/drm/radeon_ctx.h b/src/gallium/winsys/radeon/drm/radeon_ctx.h
deleted file mode 100644
index 5618b3a8d00..00000000000
--- a/src/gallium/winsys/radeon/drm/radeon_ctx.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright 2011 Jerome Glisse <[email protected]>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *      Jérôme Glisse
- */
-#ifndef RADEON_CTX_H
-#define RADEON_CTX_H
-
-#define _FILE_OFFSET_BITS 64
-#include <sys/mman.h>
-
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include "xf86drm.h"
-#include "radeon_drm.h"
-
-struct ctx {
-    int                         fd;
-};
-
-struct bo {
-    uint32_t                    handle;
-    uint32_t                    alignment;
-    uint64_t                    size;
-    uint64_t                    va;
-    void                        *ptr;
-};
-
-static void ctx_init(struct ctx *ctx)
-{
-    ctx->fd = drmOpen("radeon", NULL);
-    if (ctx->fd < 0) {
-        fprintf(stderr, "failed to open radeon drm device file\n");
-        exit(-1);
-    }
-}
-
-static void bo_wait(struct ctx *ctx, struct bo *bo)
-{
-    struct drm_radeon_gem_wait_idle args;
-    void *ptr;
-    int r;
-
-    /* Zero out args to make valgrind happy */
-    memset(&args, 0, sizeof(args));
-    args.handle = bo->handle;
-    do {
-        r = drmCommandWrite(ctx->fd, DRM_RADEON_GEM_WAIT_IDLE, &args, sizeof(args));
-    } while (r == -EBUSY);
-}
-
-
-static void ctx_cs(struct ctx *ctx, uint32_t *cs, uint32_t cs_flags[2], unsigned ndw,
-                   struct bo **bo, uint32_t *bo_relocs, unsigned nbo)
-{
-    struct drm_radeon_cs args;
-    struct drm_radeon_cs_chunk chunks[3];
-    uint64_t chunk_array[3];
-    unsigned i;
-    int r;
-
-    /* update handle */
-    for (i = 0; i < nbo; i++) {
-        bo_relocs[i*4+0] = bo[i]->handle;
-    }
-
-    args.num_chunks = 2;
-    if (cs_flags[0] || cs_flags[1]) {
-        /* enable RADEON_CHUNK_ID_FLAGS */
-        args.num_chunks = 3;
-    }
-    args.chunks = (uint64_t)(uintptr_t)chunk_array;
-    chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
-    chunks[0].length_dw = ndw;
-    chunks[0].chunk_data = (uintptr_t)cs;
-    chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
-    chunks[1].length_dw = nbo * 4;
-    chunks[1].chunk_data = (uintptr_t)bo_relocs;
-    chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
-    chunks[2].length_dw = 2;
-    chunks[2].chunk_data = (uintptr_t)cs_flags;
-    chunk_array[0] = (uintptr_t)&chunks[0];
-    chunk_array[1] = (uintptr_t)&chunks[1];
-    chunk_array[2] = (uintptr_t)&chunks[2];
-
-    fprintf(stderr, "emiting cs %ddw with %d bo\n", ndw, nbo);
-    r = drmCommandWriteRead(ctx->fd, DRM_RADEON_CS, &args, sizeof(args));
-    if (r) {
-        fprintf(stderr, "cs submission failed with %d\n", r);
-        return;
-    }
-}
-
-static void bo_map(struct ctx *ctx, struct bo *bo)
-{
-    struct drm_radeon_gem_mmap args;
-    void *ptr;
-    int r;
-
-    /* Zero out args to make valgrind happy */
-    memset(&args, 0, sizeof(args));
-    args.handle = bo->handle;
-    args.offset = 0;
-    args.size = (uint64_t)bo->size;
-    r = drmCommandWriteRead(ctx->fd, DRM_RADEON_GEM_MMAP, &args, sizeof(args));
-    if (r) {
-        fprintf(stderr, "error mapping %p 0x%08X (error = %d)\n", bo, bo->handle, r);
-        exit(-1);
-    }
-    ptr = mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED, ctx->fd, args.addr_ptr);
-    if (ptr == MAP_FAILED) {
-        fprintf(stderr, "%s failed to map bo\n", __func__);
-        exit(-1);
-    }
-    bo->ptr = ptr;
-}
-
-static void bo_va(struct ctx *ctx, struct bo *bo)
-{
-    struct drm_radeon_gem_va args;
-    int r;
-
-    args.handle = bo->handle;
-    args.vm_id = 0;
-    args.operation = RADEON_VA_MAP;
-    args.flags = RADEON_VM_PAGE_READABLE | RADEON_VM_PAGE_WRITEABLE | RADEON_VM_PAGE_SNOOPED;
-    args.offset = bo->va;
-    r = drmCommandWriteRead(ctx->fd, DRM_RADEON_GEM_VA, &args, sizeof(args));
-    if (r && args.operation == RADEON_VA_RESULT_ERROR) {
-        fprintf(stderr, "radeon: Failed to allocate virtual address for buffer:\n");
-        fprintf(stderr, "radeon:    size      : %d bytes\n", bo->size);
-        fprintf(stderr, "radeon:    alignment : %d bytes\n", bo->alignment);
-        fprintf(stderr, "radeon:    va        : 0x%016llx\n", (unsigned long long)bo->va);
-        exit(-1);
-    }
-}
-
-static struct bo *bo_new(struct ctx *ctx, unsigned ndw, uint32_t *data, uint64_t va, uint32_t alignment)
-{
-    struct drm_radeon_gem_create args;
-    struct bo *bo;
-    int r;
-
-    bo = calloc(1, sizeof(*bo));
-    if (bo == NULL) {
-        fprintf(stderr, "failed to malloc bo struct\n");
-        exit(-1);
-    }
-    bo->size = ndw * 4ULL;
-    bo->va = va;
-    bo->alignment = alignment;
-
-    args.size = bo->size;
-    args.alignment = bo->alignment;
-    args.initial_domain = RADEON_GEM_DOMAIN_GTT;
-    args.flags = 0;
-    args.handle = 0;
-
-    r = drmCommandWriteRead(ctx->fd, DRM_RADEON_GEM_CREATE, &args, sizeof(args));
-    bo->handle = args.handle;
-    if (r) {
-        fprintf(stderr, "Failed to allocate :\n");
-        fprintf(stderr, "   size      : %d bytes\n", bo->size);
-        fprintf(stderr, "   alignment : %d bytes\n", bo->alignment);
-        free(bo);
-        exit(-1);
-    }
-
-    if (data) {
-        bo_map(ctx, bo);
-        memcpy(bo->ptr, data, bo->size);
-    }
-
-    if (va) {
-        bo_va(ctx, bo);
-    }
-
-    return bo;
-}
-
-
-#endif
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 978df52447e..08856dff430 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -851,7 +851,8 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
 
 static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws,
                                                       struct winsys_handle *whandle,
-                                                      unsigned *stride)
+                                                      unsigned *stride,
+                                                      unsigned *offset)
 {
     struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
     struct radeon_bo *bo;
@@ -941,6 +942,8 @@ done:
 
     if (stride)
         *stride = whandle->stride;
+    if (offset)
+        *offset = whandle->offset;
 
     if (ws->info.has_virtual_memory && !bo->va) {
         struct drm_radeon_gem_va va;
@@ -991,7 +994,8 @@ fail:
 }
 
 static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
-                                           unsigned stride,
+                                           unsigned stride, unsigned offset,
+                                           unsigned slice_size,
                                            struct winsys_handle *whandle)
 {
     struct drm_gem_flink flink;
@@ -1025,6 +1029,9 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
     }
 
     whandle->stride = stride;
+    whandle->offset = offset;
+    whandle->offset += slice_size * whandle->layer;
+
     return TRUE;
 }
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 155a13008a4..b50e19c0381 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -168,8 +168,7 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                      enum ring_type ring_type,
                      void (*flush)(void *ctx, unsigned flags,
                                    struct pipe_fence_handle **fence),
-                     void *flush_ctx,
-                     struct pb_buffer *trace_buf)
+                     void *flush_ctx)
 {
     struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
     struct radeon_drm_cs *cs;
@@ -183,7 +182,6 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
     cs->ws = ws;
     cs->flush_cs = flush;
     cs->flush_data = flush_ctx;
-    cs->trace_buf = (struct radeon_bo*)trace_buf;
 
     if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
         FREE(cs);
@@ -439,10 +437,6 @@ void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs
         }
     }
 
-    if (cs->trace_buf) {
-        radeon_dump_cs_on_lockup(cs, csc);
-    }
-
     for (i = 0; i < csc->crelocs; i++)
         p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
 
@@ -467,8 +461,7 @@ DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
 
 static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                                 unsigned flags,
-                                struct pipe_fence_handle **fence,
-                                uint32_t cs_trace_id)
+                                struct pipe_fence_handle **fence)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_cs_context *tmp;
@@ -520,8 +513,6 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
     cs->csc = cs->cst;
     cs->cst = tmp;
 
-    cs->cst->cs_trace_id = cs_trace_id;
-
     /* If the CS is not empty or overflowed, emit it in a separate thread. */
     if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
         unsigned i, crelocs;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index 81f66f56d99..4ffa91ae8b2 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -43,8 +43,6 @@ struct radeon_cs_context {
     uint64_t                    chunk_array[3];
     uint32_t                    flags[2];
 
-    uint32_t                    cs_trace_id;
-
     /* Buffers. */
     unsigned                    nrelocs;
     unsigned                    crelocs;
@@ -80,7 +78,6 @@ struct radeon_drm_cs {
     void *flush_data;
 
     pipe_semaphore flush_completed;
-    struct radeon_bo                    *trace_buf;
 };
 
 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo);
@@ -126,6 +123,4 @@ void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs);
 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws);
 void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs_context *csc);
 
-void radeon_dump_cs_on_lockup(struct radeon_drm_cs *cs, struct radeon_cs_context *csc);
-
 #endif
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c
deleted file mode 100644
index 99585956a49..00000000000
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright © 2013 Jérôme Glisse
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
- * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- */
-/*
- * Authors:
- *      Jérôme Glisse <[email protected]>
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <inttypes.h>
-#include <xf86drm.h>
-#include "radeon_drm_cs.h"
-#include "radeon_drm_bo.h"
-
-#define RADEON_CS_DUMP_AFTER_MS_TIMEOUT         500
-
-void radeon_dump_cs_on_lockup(struct radeon_drm_cs *cs, struct radeon_cs_context *csc)
-{
-    struct drm_radeon_gem_busy args;
-    FILE *dump;
-    unsigned i, lockup;
-    uint32_t *ptr;
-    char fname[32];
-
-    /* only dump the first cs to cause a lockup */
-    if (!csc->crelocs) {
-        /* can not determine if there was a lockup if no bo were use by
-         * the cs and most likely in such case no lockup occurs
-         */
-        return;
-    }
-
-    memset(&args, 0, sizeof(args));
-    args.handle = csc->relocs_bo[0].bo->handle;
-    for (i = 0; i < RADEON_CS_DUMP_AFTER_MS_TIMEOUT; i++) {
-        usleep(1);
-        lockup = drmCommandWriteRead(csc->fd, DRM_RADEON_GEM_BUSY, &args, sizeof(args));
-        if (!lockup) {
-            break;
-        }
-    }
-    if (!lockup || i < RADEON_CS_DUMP_AFTER_MS_TIMEOUT) {
-        return;
-    }
-
-    ptr = radeon_bo_do_map(cs->trace_buf);
-    fprintf(stderr, "timeout on cs lockup likely happen at cs 0x%08x dw 0x%08x\n", ptr[1], ptr[0]);
-
-    if (csc->cs_trace_id != ptr[1]) {
-        return;
-    }
-
-    /* ok we are most likely facing a lockup write the standalone replay file */
-    snprintf(fname, sizeof(fname), "rlockup_0x%08x.c", csc->cs_trace_id);
-    dump = fopen(fname, "w");
-    if (dump == NULL) {
-        return;
-    }
-    fprintf(dump, "/* To build this file you will need to copy radeon_ctx.h\n");
-    fprintf(dump, " * in same directory. You can find radeon_ctx.h in mesa tree :\n");
-    fprintf(dump, " * mesa/src/gallium/winsys/radeon/drm/radeon_ctx.h\n");
-    fprintf(dump, " * Build with :\n");
-    fprintf(dump, " * gcc -O0 -g `pkg-config --cflags --libs libdrm` %s -o rlockup_0x%08x \n", fname, csc->cs_trace_id);
-    fprintf(dump, " */\n");
-    fprintf(dump, " /* timeout on cs lockup likely happen at cs 0x%08x dw 0x%08x*/\n", ptr[1], ptr[0]);
-    fprintf(dump, "#include <stdio.h>\n");
-    fprintf(dump, "#include <stdint.h>\n");
-    fprintf(dump, "#include \"radeon_ctx.h\"\n");
-    fprintf(dump, "\n");
-    fprintf(dump, "#define ARRAY_SIZE(x)  (sizeof(x)/sizeof(x[0]))\n");
-    fprintf(dump, "\n");
-
-    for (i = 0; i < csc->crelocs; i++) {
-        unsigned j, ndw = (csc->relocs_bo[i].bo->base.size + 3) >> 2;
-
-        ptr = radeon_bo_do_map(csc->relocs_bo[i].bo);
-        if (ptr) {
-            fprintf(dump, "static uint32_t bo_%04d_data[%d] = {\n   ", i, ndw);
-            for (j = 0; j < ndw; j++) {
-                if (j && !(j % 8)) {
-                    uint32_t offset = (j - 8) << 2;
-                    fprintf(dump, "  /* [0x%08x] va[0x%016"PRIx64"] */\n   ", offset, offset + csc->relocs_bo[i].bo->va);
-                }
-                fprintf(dump, " 0x%08x,", ptr[j]);
-            }
-            fprintf(dump, "};\n\n");
-        }
-    }
-
-    fprintf(dump, "static uint32_t bo_relocs[%d] = {\n", csc->crelocs * 4);
-    for (i = 0; i < csc->crelocs; i++) {
-        fprintf(dump, "    0x%08x, 0x%08x, 0x%08x, 0x%08x,\n",
-                0, csc->relocs[i].read_domains, csc->relocs[i].write_domain, csc->relocs[i].flags);
-    }
-    fprintf(dump, "};\n\n");
-
-    fprintf(dump, "/* cs %d dw */\n", csc->chunks[0].length_dw);
-    fprintf(dump, "static uint32_t cs[] = {\n");
-    ptr = csc->buf;
-    for (i = 0; i < csc->chunks[0].length_dw; i++) {
-        fprintf(dump, "    0x%08x,\n", ptr[i]);
-    }
-    fprintf(dump, "};\n\n");
-
-    fprintf(dump, "static uint32_t cs_flags[2] = {\n");
-    fprintf(dump, "    0x%08x,\n", csc->flags[0]);
-    fprintf(dump, "    0x%08x,\n", csc->flags[1]);
-    fprintf(dump, "};\n\n");
-
-    fprintf(dump, "int main(int argc, char *argv[])\n");
-    fprintf(dump, "{\n");
-    fprintf(dump, "    struct bo *bo[%d];\n", csc->crelocs);
-    fprintf(dump, "    struct ctx ctx;\n");
-    fprintf(dump, "\n");
-    fprintf(dump, "    ctx_init(&ctx);\n");
-    fprintf(dump, "\n");
-
-    for (i = 0; i < csc->crelocs; i++) {
-        unsigned ndw = (csc->relocs_bo[i].bo->base.size + 3) >> 2;
-        uint32_t *ptr;
-
-        ptr = radeon_bo_do_map(csc->relocs_bo[i].bo);
-        if (ptr) {
-            fprintf(dump, "    bo[%d] = bo_new(&ctx, %d, bo_%04d_data, 0x%016"PRIx64", 0x%08x);\n",
-                    i, ndw, i, csc->relocs_bo[i].bo->va, csc->relocs_bo[i].bo->base.alignment);
-        } else {
-            fprintf(dump, "    bo[%d] = bo_new(&ctx, %d, NULL, 0x%016"PRIx64", 0x%08x);\n",
-                    i, ndw, csc->relocs_bo[i].bo->va, csc->relocs_bo[i].bo->base.alignment);
-        }
-    }
-    fprintf(dump, "\n");
-    fprintf(dump, "    ctx_cs(&ctx, cs, cs_flags, ARRAY_SIZE(cs), bo, bo_relocs, %d);\n", csc->crelocs);
-    fprintf(dump, "\n");
-    fprintf(dump, "    fprintf(stderr, \"waiting for cs execution to end ....\\n\");\n");
-    fprintf(dump, "    bo_wait(&ctx, bo[0]);\n");
-    fprintf(dump, "}\n");
-    fclose(dump);
-}
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_dri.c b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
index 01bb0e2d753..baa22a90beb 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_dri.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
@@ -357,6 +357,7 @@ vmw_drm_surface_get_handle(struct svga_winsys_screen *sws,
     vsrf = vmw_svga_winsys_surface(surface);
     whandle->handle = vsrf->sid;
     whandle->stride = stride;
+    whandle->offset = 0;
 
     switch (whandle->type) {
     case DRM_API_HANDLE_TYPE_SHARED:
diff --git a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
index 1e859717f1c..9aaee8844be 100644
--- a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
@@ -309,17 +309,20 @@ kms_sw_displaytarget_get_handle(struct sw_winsys *winsys,
    case DRM_API_HANDLE_TYPE_KMS:
       whandle->handle = kms_sw_dt->handle;
       whandle->stride = kms_sw_dt->stride;
+      whandle->offset = 0;
       return TRUE;
    case DRM_API_HANDLE_TYPE_FD:
       if (!drmPrimeHandleToFD(kms_sw->fd, kms_sw_dt->handle,
                              DRM_CLOEXEC, (int*)&whandle->handle)) {
          whandle->stride = kms_sw_dt->stride;
+         whandle->offset = 0;
          return TRUE;
       }
       /* fallthrough */
    default:
       whandle->handle = 0;
       whandle->stride = 0;
+      whandle->offset = 0;
       return FALSE;
    }
 }
diff --git a/src/intel/vulkan/anv_meta_blit.c b/src/intel/vulkan/anv_meta_blit.c
index e23b6978819..218499a8787 100644
--- a/src/intel/vulkan/anv_meta_blit.c
+++ b/src/intel/vulkan/anv_meta_blit.c
@@ -100,7 +100,7 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim)
    tex->texture = nir_deref_var_create(tex, sampler);
    tex->sampler = nir_deref_var_create(tex, sampler);
 
-   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, "tex");
+   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
    nir_builder_instr_insert(&b, &tex->instr);
 
    nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out,
diff --git a/src/intel/vulkan/anv_meta_blit2d.c b/src/intel/vulkan/anv_meta_blit2d.c
index 4a0bed1a335..87c3358f045 100644
--- a/src/intel/vulkan/anv_meta_blit2d.c
+++ b/src/intel/vulkan/anv_meta_blit2d.c
@@ -455,7 +455,7 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim)
    tex->texture = nir_deref_var_create(tex, sampler);
    tex->sampler = NULL;
 
-   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, "tex");
+   nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
    nir_builder_instr_insert(&b, &tex->instr);
 
    nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out,
diff --git a/src/intel/vulkan/anv_meta_resolve.c b/src/intel/vulkan/anv_meta_resolve.c
index f50af52ece5..3e7c7d39ba5 100644
--- a/src/intel/vulkan/anv_meta_resolve.c
+++ b/src/intel/vulkan/anv_meta_resolve.c
@@ -164,7 +164,7 @@ build_nir_fs(uint32_t num_samples)
       tex->dest_type = nir_type_float;
       tex->is_array = false;
       tex->coord_components = 3;
-      nir_ssa_dest_init(&tex->instr, &tex->dest, /*num_components*/ 4, "tex");
+      nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
       nir_builder_instr_insert(&b, &tex->instr);
 
       accum = nir_fadd(&b, accum, &tex->dest.ssa);
diff --git a/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c b/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c
index 46bc5d23a4e..234855c3779 100644
--- a/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c
+++ b/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c
@@ -85,7 +85,7 @@ apply_dynamic_offsets_block(nir_block *block, void *void_state)
       offset_load->src[0] = nir_src_for_ssa(nir_imul(b, res_intrin->src[0].ssa,
                                                      nir_imm_int(b, 8)));
 
-      nir_ssa_dest_init(&offset_load->instr, &offset_load->dest, 2, NULL);
+      nir_ssa_dest_init(&offset_load->instr, &offset_load->dest, 2, 32, NULL);
       nir_builder_instr_insert(b, &offset_load->instr);
 
       nir_src *offset_src = nir_get_io_offset_src(intrin);
@@ -107,7 +107,8 @@ apply_dynamic_offsets_block(nir_block *block, void *void_state)
          /* It's a load, we need a phi node */
          nir_phi_instr *phi = nir_phi_instr_create(b->shader);
          nir_ssa_dest_init(&phi->instr, &phi->dest,
-                           intrin->num_components, NULL);
+                           intrin->num_components,
+                           intrin->dest.ssa.bit_size, NULL);
 
          nir_phi_src *src1 = ralloc(phi, nir_phi_src);
          struct exec_node *tnode = exec_list_get_tail(&if_stmt->then_list);
@@ -117,7 +118,7 @@ apply_dynamic_offsets_block(nir_block *block, void *void_state)
 
          b->cursor = nir_after_cf_list(&if_stmt->else_list);
          nir_ssa_def *zero = nir_build_imm(b, intrin->num_components,
-            (nir_const_value) { .u = { 0, 0, 0, 0 } });
+            (nir_const_value) { .u32 = { 0, 0, 0, 0 } });
 
          nir_phi_src *src2 = ralloc(phi, nir_phi_src);
          struct exec_node *enode = exec_list_get_tail(&if_stmt->else_list);
diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index eeb9b97f554..ef81afaf552 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -119,7 +119,7 @@ lower_res_index_intrinsic(nir_intrinsic_instr *intrin,
 
    nir_ssa_def *block_index;
    if (const_block_idx) {
-      block_index = nir_imm_int(b, surface_index + const_block_idx->u[0]);
+      block_index = nir_imm_int(b, surface_index + const_block_idx->u32[0]);
    } else {
       block_index = nir_iadd(b, nir_imm_int(b, surface_index),
                              nir_ssa_for_src(b, intrin->src[0], 1));
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index 0066f7f9184..6761238b014 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -597,6 +597,7 @@ blitframebuffer_texture(struct gl_context *ctx,
                         GLenum filter, GLint flipX, GLint flipY,
                         GLboolean glsl_version, GLboolean do_depth)
 {
+   struct save_state *save = &ctx->Meta->Save[ctx->Meta->SaveStackDepth - 1];
    int att_index = do_depth ? BUFFER_DEPTH : readFb->_ColorReadBufferIndex;
    const struct gl_renderbuffer_attachment *readAtt =
       &readFb->Attachment[att_index];
@@ -709,7 +710,7 @@ blitframebuffer_texture(struct gl_context *ctx,
    fb_tex_blit.samp_obj = _mesa_meta_setup_sampler(ctx, texObj, target, filter,
                                                    srcLevel);
 
-   /* Always do our blits with no net sRGB decode or encode.
+   /* For desktop GL, we do our blits with no net sRGB decode or encode.
     *
     * However, if both the src and dst can be srgb decode/encoded, enable them
     * so that we do any blending (from scaling or from MSAA resolves) in the
@@ -723,18 +724,42 @@ blitframebuffer_texture(struct gl_context *ctx,
     *      scissor test."
     *
     * The GL 4.4 specification disagrees and says that the sRGB part of the
-    * fragment pipeline applies, but this was found to break applications.
+    * fragment pipeline applies, but this was found to break applications
+    * (such as Left 4 Dead 2).
+    *
+    * However, for ES 3.0, we follow the specification and perform sRGB
+    * decoding and encoding.  The specification has always been clear in
+    * the ES world, and hasn't changed over time.
     */
    if (ctx->Extensions.EXT_texture_sRGB_decode) {
-      if (_mesa_get_format_color_encoding(rb->Format) == GL_SRGB &&
-          drawFb->Visual.sRGBCapable) {
+      bool src_srgb = _mesa_get_format_color_encoding(rb->Format) == GL_SRGB;
+      if (save->API == API_OPENGLES2 && ctx->Version >= 30) {
+         /* From the ES 3.0.4 specification, page 198:
+          * "When values are taken from the read buffer, if the value of
+          *  FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING for the framebuffer
+          *  attachment corresponding to the read buffer is SRGB (see section
+          *  6.1.13), the red, green, and blue components are converted from
+          *  the non-linear sRGB color space according to equation 3.24.
+          *
+          *  When values are written to the draw buffers, blit operations
+          *  bypass the fragment pipeline. The only fragment operations which
+          *  affect a blit are the pixel ownership test, the scissor test,
+          *  and sRGB conversion (see section 4.1.8)."
+          */
          _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
-                                       GL_DECODE_EXT);
-         _mesa_set_framebuffer_srgb(ctx, GL_TRUE);
+                                       src_srgb ? GL_DECODE_EXT
+                                                : GL_SKIP_DECODE_EXT);
+         _mesa_set_framebuffer_srgb(ctx, drawFb->Visual.sRGBCapable);
       } else {
-         _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
-                                       GL_SKIP_DECODE_EXT);
-         /* set_framebuffer_srgb was set by _mesa_meta_begin(). */
+         if (src_srgb && drawFb->Visual.sRGBCapable) {
+            _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
+                                          GL_DECODE_EXT);
+            _mesa_set_framebuffer_srgb(ctx, GL_TRUE);
+         } else {
+            _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
+                                          GL_SKIP_DECODE_EXT);
+            /* set_framebuffer_srgb was set by _mesa_meta_begin(). */
+         }
       }
    }
 
diff --git a/src/mesa/drivers/common/meta_copy_image.c b/src/mesa/drivers/common/meta_copy_image.c
index 18b9681b710..9402a4652eb 100644
--- a/src/mesa/drivers/common/meta_copy_image.c
+++ b/src/mesa/drivers/common/meta_copy_image.c
@@ -269,6 +269,9 @@ _mesa_meta_CopyImageSubData_uncompressed(struct gl_context *ctx,
    if (status != GL_FRAMEBUFFER_COMPLETE)
       goto meta_end;
 
+   /* Explicitly disable sRGB encoding */
+   ctx->DrawBuffer->Visual.sRGBCapable = false;
+
    /* Since we've bound a new draw framebuffer, we need to update its
     * derived state -- _Xmin, etc -- for BlitFramebuffer's clipping to
     * be correct.
diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index dfd3327dd55..62c3fce4249 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -263,6 +263,9 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims,
    if (status != GL_FRAMEBUFFER_COMPLETE)
       goto fail;
 
+   /* Explicitly disable sRGB encoding */
+   ctx->DrawBuffer->Visual.sRGBCapable = false;
+
    _mesa_update_state(ctx);
 
    if (_mesa_meta_BlitFramebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
@@ -420,6 +423,9 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    if (status != GL_FRAMEBUFFER_COMPLETE)
       goto fail;
 
+   /* Explicitly disable sRGB encoding */
+   ctx->DrawBuffer->Visual.sRGBCapable = false;
+
    _mesa_update_state(ctx);
 
    if (_mesa_meta_BlitFramebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index 4497eab3bf0..38a32361f0b 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -115,12 +115,11 @@ brw_blorp_surface_info::set(struct brw_context *brw,
       this->brw_surfaceformat = BRW_SURFACEFORMAT_R16_UNORM;
       break;
    default: {
-      mesa_format linear_format = _mesa_get_srgb_format_linear(format);
       if (is_render_target) {
-         assert(brw->format_supported_as_render_target[linear_format]);
-         this->brw_surfaceformat = brw->render_target_format[linear_format];
+         assert(brw->format_supported_as_render_target[format]);
+         this->brw_surfaceformat = brw->render_target_format[format];
       } else {
-         this->brw_surfaceformat = brw_format_for_mesa_format(linear_format);
+         this->brw_surfaceformat = brw_format_for_mesa_format(format);
       }
       break;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h
index a04a1dfa719..f04e1969351 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -46,7 +46,8 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
                         float src_x1, float src_y1,
                         float dst_x0, float dst_y0,
                         float dst_x1, float dst_y1,
-                        GLenum filter, bool mirror_x, bool mirror_y);
+                        GLenum filter, bool mirror_x, bool mirror_y,
+                        bool decode_srgb, bool encode_srgb);
 
 #ifdef __cplusplus
 } /* end extern "C" */
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 05fff91ed57..5fd25f1ffe4 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -21,6 +21,7 @@
  * IN THE SOFTWARE.
  */
 
+#include "main/context.h"
 #include "main/teximage.h"
 #include "main/fbobject.h"
 
@@ -63,7 +64,8 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
                         float src_x1, float src_y1,
                         float dst_x0, float dst_y0,
                         float dst_x1, float dst_y1,
-                        GLenum filter, bool mirror_x, bool mirror_y)
+                        GLenum filter, bool mirror_x, bool mirror_y,
+                        bool decode_srgb, bool encode_srgb)
 {
    /* Get ready to blit.  This includes depth resolving the src and dst
     * buffers if necessary.  Note: it's not necessary to do a color resolve on
@@ -89,6 +91,12 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
        dst_level, dst_layer, dst_x0, dst_y0, dst_x1, dst_y1,
        mirror_x, mirror_y);
 
+   if (!decode_srgb && _mesa_get_format_color_encoding(src_format) == GL_SRGB)
+      src_format = _mesa_get_srgb_format_linear(src_format);
+
+   if (!encode_srgb && _mesa_get_format_color_encoding(dst_format) == GL_SRGB)
+      dst_format = _mesa_get_srgb_format_linear(dst_format);
+
    brw_blorp_blit_params params(brw,
                                 src_mt, src_level, src_layer, src_format,
                                 dst_mt, dst_level, dst_layer, dst_format,
@@ -114,6 +122,8 @@ do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit,
    struct intel_mipmap_tree *src_mt = find_miptree(buffer_bit, src_irb);
    struct intel_mipmap_tree *dst_mt = find_miptree(buffer_bit, dst_irb);
 
+   const bool es3 = _mesa_is_gles3(&brw->ctx);
+
    /* Do the blit */
    brw_blorp_blit_miptrees(brw,
                            src_mt, src_irb->mt_level, src_irb->mt_layer,
@@ -122,7 +132,8 @@ do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit,
                            dst_format,
                            srcX0, srcY0, srcX1, srcY1,
                            dstX0, dstY0, dstX1, dstY1,
-                           filter, mirror_x, mirror_y);
+                           filter, mirror_x, mirror_y,
+                           es3, es3);
 
    dst_irb->need_downsample = true;
 }
@@ -289,7 +300,8 @@ brw_blorp_copytexsubimage(struct brw_context *brw,
                            dst_image->TexFormat,
                            srcX0, srcY0, srcX1, srcY1,
                            dstX0, dstY0, dstX1, dstY1,
-                           GL_NEAREST, false, mirror_y);
+                           GL_NEAREST, false, mirror_y,
+                           false, false);
 
    /* If we're copying to a packed depth stencil texture and the source
     * framebuffer has separate stencil, we need to also copy the stencil data
@@ -314,7 +326,8 @@ brw_blorp_copytexsubimage(struct brw_context *brw,
                                  dst_mt->format,
                                  srcX0, srcY0, srcX1, srcY1,
                                  dstX0, dstY0, dstX1, dstY1,
-                                 GL_NEAREST, false, mirror_y);
+                                 GL_NEAREST, false, mirror_y,
+                                 false, false);
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
index a95f51bfa4a..b32252f7b9b 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -108,6 +108,26 @@ static const struct nir_shader_compiler_options vector_nir_options = {
     */
    .fdot_replicates = true,
 
+   /* Prior to Gen6, there are no three source operations for SIMD4x2. */
+   .lower_flrp = true,
+
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+};
+
+static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
+   COMMON_OPTIONS,
+
+   /* In the vec4 backend, our dpN instruction replicates its result to all the
+    * components of a vec4.  We would like NIR to give us replicated fdot
+    * instructions because it can optimize better for us.
+    */
+   .fdot_replicates = true,
+
    .lower_pack_snorm_2x16 = true,
    .lower_pack_unorm_2x16 = true,
    .lower_unpack_snorm_2x16 = true,
@@ -160,8 +180,12 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
       if (devinfo->gen < 7)
          compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
 
-      compiler->glsl_compiler_options[i].NirOptions =
-         is_scalar ? &scalar_nir_options : &vector_nir_options;
+      if (is_scalar) {
+         compiler->glsl_compiler_options[i].NirOptions = &scalar_nir_options;
+      } else {
+         compiler->glsl_compiler_options[i].NirOptions =
+            devinfo->gen < 6 ? &vector_nir_options : &vector_nir_options_gen6;
+      }
 
       compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 0b99356b27d..5b142525752 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1294,7 +1294,7 @@ pop_if_stack(struct brw_codegen *p)
 static void
 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
 {
-   if (p->loop_stack_array_size < p->loop_stack_depth) {
+   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
       p->loop_stack_array_size *= 2;
       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
 			       p->loop_stack_array_size);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 874053c8489..33c4adc4705 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2307,17 +2307,6 @@ fs_visitor::opt_algebraic()
             progress = true;
          }
          break;
-      case SHADER_OPCODE_RCP: {
-         fs_inst *prev = (fs_inst *)inst->prev;
-         if (prev->opcode == SHADER_OPCODE_SQRT) {
-            if (inst->src[0].equals(prev->dst)) {
-               inst->opcode = SHADER_OPCODE_RSQ;
-               inst->src[0] = prev->src[0];
-               progress = true;
-            }
-         }
-         break;
-      }
       case SHADER_OPCODE_BROADCAST:
          if (is_uniform(inst->src[0])) {
             inst->opcode = BRW_OPCODE_MOV;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 2616e65fc62..ffab0a8ebd5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -654,21 +654,6 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
          }
          break;
 
-      case SHADER_OPCODE_RCP:
-         /* The hardware doesn't do math on immediate values
-          * (because why are you doing that, seriously?), but
-          * the correct answer is to just constant fold it
-          * anyway.
-          */
-         assert(i == 0);
-         if (inst->src[0].f != 0.0f) {
-            inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0] = val;
-            inst->src[0].f = 1.0f / inst->src[0].f;
-            progress = true;
-         }
-         break;
-
       case SHADER_OPCODE_UNTYPED_ATOMIC:
       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 29ef609fce3..aa4c745db69 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -518,10 +518,10 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
 
    enum opcode extract_op;
    if (src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16) {
-      assert(element->u[0] <= 1);
+      assert(element->u32[0] <= 1);
       extract_op = SHADER_OPCODE_EXTRACT_WORD;
    } else {
-      assert(element->u[0] <= 3);
+      assert(element->u32[0] <= 3);
       extract_op = SHADER_OPCODE_EXTRACT_BYTE;
    }
 
@@ -530,7 +530,7 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
 
    set_saturate(instr->dest.saturate,
-                bld.emit(extract_op, result, op0, brw_imm_ud(element->u[0])));
+                bld.emit(extract_op, result, op0, brw_imm_ud(element->u32[0])));
    return true;
 }
 
@@ -549,11 +549,11 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
       return false;
 
    nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
-   if (!value1 || fabsf(value1->f[0]) != 1.0f)
+   if (!value1 || fabsf(value1->f32[0]) != 1.0f)
       return false;
 
    nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
-   if (!value2 || fabsf(value2->f[0]) != 1.0f)
+   if (!value2 || fabsf(value2->f32[0]) != 1.0f)
       return false;
 
    fs_reg tmp = vgrf(glsl_type::int_type);
@@ -573,7 +573,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
        * surely be TRIANGLES
        */
 
-      if (value1->f[0] == -1.0f) {
+      if (value1->f32[0] == -1.0f) {
          g0.negate = true;
       }
 
@@ -601,7 +601,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
        * surely be TRIANGLES
        */
 
-      if (value1->f[0] == -1.0f) {
+      if (value1->f32[0] == -1.0f) {
          g1_6.negate = true;
       }
 
@@ -1180,7 +1180,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_extract_i8: {
       nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
       bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
-               result, op[0], brw_imm_ud(byte->u[0]));
+               result, op[0], brw_imm_ud(byte->u32[0]));
       break;
    }
 
@@ -1188,7 +1188,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_extract_i16: {
       nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
       bld.emit(SHADER_OPCODE_EXTRACT_WORD,
-               result, op[0], brw_imm_ud(word->u[0]));
+               result, op[0], brw_imm_ud(word->u32[0]));
       break;
    }
 
@@ -1215,7 +1215,7 @@ fs_visitor::nir_emit_load_const(const fs_builder &bld,
    fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, instr->def.num_components);
 
    for (unsigned i = 0; i < instr->def.num_components; i++)
-      bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i[i]));
+      bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
 
    nir_ssa_values[instr->def.index] = reg;
 }
@@ -1769,9 +1769,9 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
    const bool is_point_size = (base_offset == 0);
 
    if (offset_const != NULL && vertex_const != NULL &&
-       4 * (base_offset + offset_const->u[0]) < push_reg_count) {
-      int imm_offset = (base_offset + offset_const->u[0]) * 4 +
-                       vertex_const->u[0] * push_reg_count;
+       4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
+      int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
+                       vertex_const->u32[0] * push_reg_count;
       /* This input was pushed into registers. */
       if (is_point_size) {
          /* gl_PointSize comes in .w */
@@ -1793,7 +1793,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
       if (vertex_const) {
          /* The vertex index is constant; just select the proper URB handle. */
          icp_handle =
-            retype(brw_vec8_grf(first_icp_handle + vertex_const->i[0], 0),
+            retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0),
                    BRW_REGISTER_TYPE_UD);
       } else {
          /* The vertex index is non-constant.  We need to use indirect
@@ -1837,7 +1837,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
       if (offset_const) {
          /* Constant indexing - use global offset. */
          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
-         inst->offset = base_offset + offset_const->u[0];
+         inst->offset = base_offset + offset_const->u32[0];
          inst->base_mrf = -1;
          inst->mlen = 1;
          inst->regs_written = num_components;
@@ -1875,7 +1875,7 @@ fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
        * add_const_offset_to_base() will fold other constant offsets
        * into instr->const_index[0].
        */
-      assert(const_value->u[0] == 0);
+      assert(const_value->u32[0] == 0);
       return fs_reg();
    }
 
@@ -2193,7 +2193,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
          nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
 
          if (const_sample) {
-            unsigned msg_data = const_sample->i[0] << 4;
+            unsigned msg_data = const_sample->i32[0] << 4;
 
             emit_pixel_interpolater_send(bld,
                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
@@ -2260,8 +2260,8 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
          nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
 
          if (const_offset) {
-            unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
-            unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
+            unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
+            unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
 
             emit_pixel_interpolater_send(bld,
                                          FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
@@ -2420,7 +2420,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       fs_reg offset_reg;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       if (const_offset) {
-         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]);
+         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
       } else {
          offset_reg = vgrf(glsl_type::uint_type);
          bld.ADD(offset_reg,
@@ -2464,7 +2464,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
 
          nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
          if (const_offset) {
-            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] +
+            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
                                     4 * first_component);
          } else {
             offset_reg = vgrf(glsl_type::uint_type);
@@ -2695,8 +2695,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       if (const_offset) {
          /* Offsets are in bytes but they should always be multiples of 4 */
-         assert(const_offset->u[0] % 4 == 0);
-         src.reg_offset = const_offset->u[0] / 4;
+         assert(const_offset->u32[0] % 4 == 0);
+         src.reg_offset = const_offset->u32[0] / 4;
 
          for (unsigned j = 0; j < instr->num_components; j++) {
             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
@@ -2729,7 +2729,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       if (const_index) {
          const unsigned index = stage_prog_data->binding_table.ubo_start +
-                                const_index->u[0];
+                                const_index->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(prog_data, index);
       } else {
@@ -2762,12 +2762,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          fs_reg packed_consts = vgrf(glsl_type::float_type);
          packed_consts.type = dest.type;
 
-         struct brw_reg const_offset_reg = brw_imm_ud(const_offset->u[0] & ~15);
+         struct brw_reg const_offset_reg = brw_imm_ud(const_offset->u32[0] & ~15);
          bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
                   surf_index, const_offset_reg);
 
          for (unsigned i = 0; i < instr->num_components; i++) {
-            packed_consts.set_smear(const_offset->u[0] % 16 / 4 + i);
+            packed_consts.set_smear(const_offset->u32[0] % 16 / 4 + i);
 
             /* The std140 packing rules don't allow vectors to cross 16-byte
              * boundaries, and a reg is 32 bytes.
@@ -2790,7 +2790,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg surf_index;
       if (const_uniform_block) {
          unsigned index = stage_prog_data->binding_table.ssbo_start +
-                          const_uniform_block->u[0];
+                          const_uniform_block->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(prog_data, index);
       } else {
@@ -2809,7 +2809,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg offset_reg;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset) {
-         offset_reg = brw_imm_ud(const_offset->u[0]);
+         offset_reg = brw_imm_ud(const_offset->u32[0]);
       } else {
          offset_reg = get_nir_src(instr->src[1]);
       }
@@ -2837,7 +2837,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       assert(const_offset && "Indirect input loads not allowed");
-      src = offset(src, bld, const_offset->u[0]);
+      src = offset(src, bld, const_offset->u32[0]);
 
       for (unsigned j = 0; j < instr->num_components; j++) {
          bld.MOV(offset(dest, bld, j), offset(src, bld, j));
@@ -2854,7 +2854,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          nir_src_as_const_value(instr->src[1]);
       if (const_uniform_block) {
          unsigned index = stage_prog_data->binding_table.ssbo_start +
-                          const_uniform_block->u[0];
+                          const_uniform_block->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(prog_data, index);
       } else {
@@ -2885,7 +2885,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          fs_reg offset_reg;
          nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
          if (const_offset) {
-            offset_reg = brw_imm_ud(const_offset->u[0] + 4 * first_component);
+            offset_reg = brw_imm_ud(const_offset->u32[0] + 4 * first_component);
          } else {
             offset_reg = vgrf(glsl_type::uint_type);
             bld.ADD(offset_reg,
@@ -2913,7 +2913,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       assert(const_offset && "Indirect output stores not allowed");
-      new_dest = offset(new_dest, bld, const_offset->u[0]);
+      new_dest = offset(new_dest, bld, const_offset->u32[0]);
 
       for (unsigned j = 0; j < instr->num_components; j++) {
          bld.MOV(offset(new_dest, bld, j), offset(src, bld, j));
@@ -2954,7 +2954,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
    case nir_intrinsic_get_buffer_size: {
       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
-      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
+      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
       int reg_width = dispatch_width / 8;
 
       /* Set LOD = 0 */
@@ -3005,7 +3005,7 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
    if (const_surface) {
       unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
-                            const_surface->u[0];
+                            const_surface->u32[0];
       surface = brw_imm_ud(surf_index);
       brw_mark_surface_used(prog_data, surf_index);
    } else {
@@ -3134,7 +3134,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
          nir_const_value *const_offset =
             nir_src_as_const_value(instr->src[i].src);
          if (const_offset) {
-            tex_offset = brw_imm_ud(brw_texture_offset(const_offset->i, 3));
+            tex_offset = brw_imm_ud(brw_texture_offset(const_offset->i32, 3));
          } else {
             tex_offset = retype(src, BRW_REGISTER_TYPE_D);
          }
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 941920adccd..ab6000b0573 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -77,7 +77,7 @@ add_const_offset_to_base_block(nir_block *block, void *closure)
          nir_const_value *const_offset = nir_src_as_const_value(*offset);
 
          if (const_offset) {
-            intrin->const_index[0] += const_offset->u[0];
+            intrin->const_index[0] += const_offset->u32[0];
             b->cursor = nir_before_instr(&intrin->instr);
             nir_instr_rewrite_src(&intrin->instr, offset,
                                   nir_src_for_ssa(nir_imm_int(b, 0)));
@@ -175,7 +175,7 @@ remap_patch_urb_offsets(nir_block *block, void *closure)
          if (vertex) {
             nir_const_value *const_vertex = nir_src_as_const_value(*vertex);
             if (const_vertex) {
-               intrin->const_index[0] += const_vertex->u[0] *
+               intrin->const_index[0] += const_vertex->u32[0] *
                                          state->vue_map->num_per_vertex_slots;
             } else {
                state->b.cursor = nir_before_instr(&intrin->instr);
@@ -623,12 +623,24 @@ brw_type_for_nir_type(nir_alu_type type)
 {
    switch (type) {
    case nir_type_uint:
+   case nir_type_uint32:
       return BRW_REGISTER_TYPE_UD;
    case nir_type_bool:
    case nir_type_int:
+   case nir_type_bool32:
+   case nir_type_int32:
       return BRW_REGISTER_TYPE_D;
    case nir_type_float:
+   case nir_type_float32:
       return BRW_REGISTER_TYPE_F;
+   case nir_type_float64:
+      return BRW_REGISTER_TYPE_DF;
+   case nir_type_int64:
+   case nir_type_uint64:
+      /* TODO we should only see these in moves, so for now it's ok, but when
+       * we add actual 64-bit integer support we should fix this.
+       */
+      return BRW_REGISTER_TYPE_DF;
    default:
       unreachable("unknown type");
    }
@@ -644,12 +656,18 @@ brw_glsl_base_type_for_nir_type(nir_alu_type type)
 {
    switch (type) {
    case nir_type_float:
+   case nir_type_float32:
       return GLSL_TYPE_FLOAT;
 
+   case nir_type_float64:
+      return GLSL_TYPE_DOUBLE;
+
    case nir_type_int:
+   case nir_type_int32:
       return GLSL_TYPE_INT;
 
    case nir_type_uint:
+   case nir_type_uint32:
       return GLSL_TYPE_UINT;
 
    default:
diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
index 56e15ef935f..22eeb1a1296 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
@@ -165,7 +165,7 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
          }
 
          default:
-            if (nir_op_infos[alu->op].output_type == nir_type_bool) {
+            if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_bool) {
                /* This instructions will turn into a CMP when we actually emit
                 * them so the result will have to be resolved before it can be
                 * used.
@@ -225,7 +225,7 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
           * have to worry about resolving them.
           */
          instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
-         if (load->value.u[0] == NIR_TRUE || load->value.u[0] == NIR_FALSE) {
+         if (load->value.u32[0] == NIR_TRUE || load->value.u32[0] == NIR_FALSE) {
             instr->pass_flags |= BRW_NIR_BOOLEAN_NO_RESOLVE;
          } else {
             instr->pass_flags |= BRW_NIR_NON_BOOLEAN;
diff --git a/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
index 5ff2cba0464..6e8b1f99505 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
@@ -168,7 +168,9 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (add->op != nir_op_fadd)
          continue;
 
-      /* TODO: Maybe bail if this expression is considered "precise"? */
+      assert(add->dest.dest.is_ssa);
+      if (add->exact)
+         continue;
 
       assert(add->src[0].src.is_ssa && add->src[1].src.is_ssa);
 
@@ -201,6 +203,8 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (mul == NULL)
          continue;
 
+      unsigned bit_size = add->dest.dest.ssa.bit_size;
+
       nir_ssa_def *mul_src[2];
       mul_src[0] = mul->src[0].src.ssa;
       mul_src[1] = mul->src[1].src.ssa;
@@ -220,7 +224,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
                                                       nir_op_fabs);
             abs->src[0].src = nir_src_for_ssa(mul_src[i]);
             nir_ssa_dest_init(&abs->instr, &abs->dest.dest,
-                              mul_src[i]->num_components, NULL);
+                              mul_src[i]->num_components, bit_size, NULL);
             abs->dest.write_mask = (1 << mul_src[i]->num_components) - 1;
             nir_instr_insert_before(&add->instr, &abs->instr);
             mul_src[i] = &abs->dest.dest.ssa;
@@ -232,7 +236,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
                                                    nir_op_fneg);
          neg->src[0].src = nir_src_for_ssa(mul_src[0]);
          nir_ssa_dest_init(&neg->instr, &neg->dest.dest,
-                           mul_src[0]->num_components, NULL);
+                           mul_src[0]->num_components, bit_size, NULL);
          neg->dest.write_mask = (1 << mul_src[0]->num_components) - 1;
          nir_instr_insert_before(&add->instr, &neg->instr);
          mul_src[0] = &neg->dest.dest.ssa;
@@ -253,6 +257,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
 
       nir_ssa_dest_init(&ffma->instr, &ffma->dest.dest,
                         add->dest.dest.ssa.num_components,
+                        bit_size,
                         add->dest.dest.ssa.name);
       nir_ssa_def_rewrite_uses(&add->dest.dest.ssa,
                                nir_src_for_ssa(&ffma->dest.dest.ssa));
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 6b85eac77d6..783af78479e 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -34,6 +34,7 @@
 #define BRW_STATE_H
 
 #include "brw_context.h"
+#include "brw_defines.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -406,6 +407,59 @@ void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw);
 void
 gen7_restore_default_l3_config(struct brw_context *brw);
 
+static inline bool
+is_drawing_points(const struct brw_context *brw)
+{
+   /* Determine if the primitives *reaching the SF* are points */
+   /* _NEW_POLYGON */
+   if (brw->ctx.Polygon.FrontMode == GL_POINT ||
+       brw->ctx.Polygon.BackMode == GL_POINT) {
+      return true;
+   }
+
+   if (brw->geometry_program) {
+      /* BRW_NEW_GEOMETRY_PROGRAM */
+      return brw->geometry_program->OutputType == GL_POINTS;
+   } else if (brw->tes.prog_data) {
+      /* BRW_NEW_TES_PROG_DATA */
+      return brw->tes.prog_data->output_topology ==
+             BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+   } else {
+      /* BRW_NEW_PRIMITIVE */
+      return brw->primitive == _3DPRIM_POINTLIST;
+   }
+}
+
+static inline bool
+is_drawing_lines(const struct brw_context *brw)
+{
+   /* Determine if the primitives *reaching the SF* are points */
+   /* _NEW_POLYGON */
+   if (brw->ctx.Polygon.FrontMode == GL_LINE ||
+       brw->ctx.Polygon.BackMode == GL_LINE) {
+      return true;
+   }
+
+   if (brw->geometry_program) {
+      /* BRW_NEW_GEOMETRY_PROGRAM */
+      return brw->geometry_program->OutputType == GL_LINE_STRIP;
+   } else if (brw->tes.prog_data) {
+      /* BRW_NEW_TES_PROG_DATA */
+      return brw->tes.prog_data->output_topology ==
+             BRW_TESS_OUTPUT_TOPOLOGY_LINE;
+   } else {
+      /* BRW_NEW_PRIMITIVE */
+      switch (brw->primitive) {
+      case _3DPRIM_LINELIST:
+      case _3DPRIM_LINESTRIP:
+      case _3DPRIM_LINELOOP:
+         return true;
+      }
+   }
+   return false;
+}
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index 46667884125..b7b0a86f1c7 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -423,11 +423,12 @@ static void gen7_dump_sampler_state(struct brw_context *brw,
                 GET_BITS(samp[1], 15, 8)
                );
       batch_out(brw, name, offset, i+2, "Border Color\n"); /* FINISHME: gen8+ */
-      batch_out(brw, name, offset, i+3, "Max aniso: RATIO %d:1, TC[XYZ] Address Control: %s|%s|%s\n",
+      batch_out(brw, name, offset, i+3, "Max aniso: RATIO %d:1, TC[XYZ] Address Control: %s|%s|%s, %snormalized coords\n",
                 (GET_FIELD(samp[3], BRW_SAMPLER_MAX_ANISOTROPY) + 1) * 2,
                 sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCX_WRAP_MODE)],
                 sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCY_WRAP_MODE)],
-                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCZ_WRAP_MODE)]
+                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCZ_WRAP_MODE)],
+                (samp[3] & GEN7_SAMPLER_NON_NORMALIZED_COORDINATES) ? "non-" : ""
                );
 
       samp += 4;
diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h
index 1f27e9862a7..3e9a6ee48d2 100644
--- a/src/mesa/drivers/dri/i965/brw_util.h
+++ b/src/mesa/drivers/dri/i965/brw_util.h
@@ -34,6 +34,7 @@
 #define BRW_UTIL_H
 
 #include "brw_context.h"
+#include "main/framebuffer.h"
 
 extern GLuint brw_translate_blend_factor( GLenum factor );
 extern GLuint brw_translate_blend_equation( GLenum mode );
@@ -49,13 +50,13 @@ brw_get_line_width(struct brw_context *brw)
     * implementation-dependent maximum non-antialiased line width."
     */
    float line_width =
-      CLAMP(!brw->ctx.Multisample._Enabled && !brw->ctx.Line.SmoothFlag
+      CLAMP(!_mesa_is_multisample_enabled(&brw->ctx) && !brw->ctx.Line.SmoothFlag
             ? roundf(brw->ctx.Line.Width) : brw->ctx.Line.Width,
             0.0f, brw->ctx.Const.MaxLineWidth);
    uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
 
    /* Line width of 0 is not allowed when MSAA enabled */
-   if (brw->ctx.Multisample._Enabled) {
+   if (_mesa_is_multisample_enabled(&brw->ctx)) {
       if (line_width_u3_7 == 0)
          line_width_u3_7 = 1;
    } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5f) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 65e57ba5e62..00253438c44 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -693,17 +693,6 @@ vec4_visitor::opt_algebraic()
             break;
          }
          break;
-      case SHADER_OPCODE_RCP: {
-         vec4_instruction *prev = (vec4_instruction *)inst->prev;
-         if (prev->opcode == SHADER_OPCODE_SQRT) {
-            if (inst->src[0].equals(src_reg(prev->dst))) {
-               inst->opcode = SHADER_OPCODE_RSQ;
-               inst->src[0] = prev->src[0];
-               progress = true;
-            }
-         }
-         break;
-      }
       case SHADER_OPCODE_BROADCAST:
          if (is_uniform(inst->src[0]) ||
              inst->src[1].is_zero()) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
index d9c048e1764..e915aee3bd0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -70,8 +70,8 @@ vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       /* Make up a type...we have no way of knowing... */
       const glsl_type *const type = glsl_type::ivec(instr->num_components);
 
-      src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u[0] +
-                          instr->const_index[0] + offset->u[0],
+      src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u32[0] +
+                          instr->const_index[0] + offset->u32[0],
                     type);
       /* gl_PointSize is passed in the .w component of the VUE header */
       if (instr->const_index[0] == VARYING_SLOT_PSIZ)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 4686f2014c6..7c06f92e784 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -343,7 +343,7 @@ vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
        * add_const_offset_to_base() will fold other constant offsets
        * into instr->const_index[0].
        */
-      assert(const_value->u[0] == 0);
+      assert(const_value->u32[0] == 0);
       return src_reg();
    }
 
@@ -369,13 +369,13 @@ vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
          continue;
 
       for (unsigned j = i; j < instr->def.num_components; j++) {
-         if (instr->value.u[i] == instr->value.u[j]) {
+         if (instr->value.u32[i] == instr->value.u32[j]) {
             writemask |= 1 << j;
          }
       }
 
       reg.writemask = writemask;
-      emit(MOV(reg, brw_imm_d(instr->value.i[i])));
+      emit(MOV(reg, brw_imm_d(instr->value.i32[i])));
 
       remaining &= ~writemask;
    }
@@ -400,7 +400,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       /* We set EmitNoIndirectInput for VS */
       assert(const_offset);
 
-      src = src_reg(ATTR, instr->const_index[0] + const_offset->u[0],
+      src = src_reg(ATTR, instr->const_index[0] + const_offset->u32[0],
                     glsl_type::uvec4_type);
 
       dest = get_nir_dest(instr->dest, src.type);
@@ -414,7 +414,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       assert(const_offset);
 
-      int varying = instr->const_index[0] + const_offset->u[0];
+      int varying = instr->const_index[0] + const_offset->u32[0];
 
       src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F,
                         instr->num_components);
@@ -425,7 +425,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
    case nir_intrinsic_get_buffer_size: {
       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
-      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
+      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
 
       const unsigned index =
          prog_data->base.binding_table.ssbo_start + ssbo_index;
@@ -458,7 +458,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          nir_src_as_const_value(instr->src[1]);
       if (const_uniform_block) {
          unsigned index = prog_data->base.binding_table.ssbo_start +
-                          const_uniform_block->u[0];
+                          const_uniform_block->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(&prog_data->base, index);
       } else {
@@ -476,7 +476,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg offset_reg;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
       if (const_offset) {
-         offset_reg = brw_imm_ud(const_offset->u[0]);
+         offset_reg = brw_imm_ud(const_offset->u32[0]);
       } else {
          offset_reg = get_nir_src(instr->src[2], 1);
       }
@@ -596,7 +596,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg surf_index;
       if (const_uniform_block) {
          unsigned index = prog_data->base.binding_table.ssbo_start +
-                          const_uniform_block->u[0];
+                          const_uniform_block->u32[0];
          surf_index = brw_imm_ud(index);
 
          brw_mark_surface_used(&prog_data->base, index);
@@ -617,7 +617,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg offset_reg;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset) {
-         offset_reg = brw_imm_ud(const_offset->u[0]);
+         offset_reg = brw_imm_ud(const_offset->u32[0]);
       } else {
          offset_reg = get_nir_src(instr->src[1], 1);
       }
@@ -697,8 +697,8 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       if (const_offset) {
          /* Offsets are in bytes but they should always be multiples of 16 */
-         assert(const_offset->u[0] % 16 == 0);
-         src.reg_offset = const_offset->u[0] / 16;
+         assert(const_offset->u32[0] % 16 == 0);
+         src.reg_offset = const_offset->u32[0] / 16;
 
          emit(MOV(dest, src));
       } else {
@@ -760,7 +760,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
           * as an immediate.
           */
          const unsigned index = prog_data->base.binding_table.ubo_start +
-                                const_block_index->u[0];
+                                const_block_index->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(&prog_data->base, index);
       } else {
@@ -785,7 +785,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg offset;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset) {
-         offset = brw_imm_ud(const_offset->u[0] & ~15);
+         offset = brw_imm_ud(const_offset->u32[0] & ~15);
       } else {
          offset = get_nir_src(instr->src[1], nir_type_int, 1);
       }
@@ -800,10 +800,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
       packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
       if (const_offset) {
-         packed_consts.swizzle += BRW_SWIZZLE4(const_offset->u[0] % 16 / 4,
-                                               const_offset->u[0] % 16 / 4,
-                                               const_offset->u[0] % 16 / 4,
-                                               const_offset->u[0] % 16 / 4);
+         packed_consts.swizzle += BRW_SWIZZLE4(const_offset->u32[0] % 16 / 4,
+                                               const_offset->u32[0] % 16 / 4,
+                                               const_offset->u32[0] % 16 / 4,
+                                               const_offset->u32[0] % 16 / 4);
       }
 
       emit(MOV(dest, packed_consts));
@@ -845,7 +845,7 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
    if (const_surface) {
       unsigned surf_index = prog_data->base.binding_table.ssbo_start +
-                            const_surface->u[0];
+                            const_surface->u32[0];
       surface = brw_imm_ud(surf_index);
       brw_mark_surface_used(&prog_data->base, surf_index);
    } else {
@@ -1042,12 +1042,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
           * operand. If we can determine that one of the args is in the low
           * 16 bits, though, we can just emit a single MUL.
           */
-         if (value0 && value0->u[0] < (1 << 16)) {
+         if (value0 && value0->u32[0] < (1 << 16)) {
             if (devinfo->gen < 7)
                emit(MUL(dst, op[0], op[1]));
             else
                emit(MUL(dst, op[1], op[0]));
-         } else if (value1 && value1->u[0] < (1 << 16)) {
+         } else if (value1 && value1->u32[0] < (1 << 16)) {
             if (devinfo->gen < 7)
                emit(MUL(dst, op[1], op[0]));
             else
@@ -1793,7 +1793,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
          nir_const_value *const_offset =
             nir_src_as_const_value(instr->src[i].src);
          if (const_offset) {
-            constant_offset = brw_texture_offset(const_offset->i, 3);
+            constant_offset = brw_texture_offset(const_offset->i32, 3);
          } else {
             offset_value =
                get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index f344eaad664..0ce48b8df5f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -353,7 +353,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
       nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
       src_reg vertex_index =
-         vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0]))
+         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
                       : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
 
       dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
@@ -400,6 +400,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          }
       } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
          dst.type = BRW_REGISTER_TYPE_F;
+         unsigned swiz = BRW_SWIZZLE_WZYX;
 
          /* This is a read of gl_TessLevelOuter[], which lives in the
           * high 4 DWords of the Patch URB header, in reverse order.
@@ -412,6 +413,8 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
             dst.writemask = WRITEMASK_XYZ;
             break;
          case GL_ISOLINES:
+            /* Isolines are not reversed; swizzle .zw -> .xy */
+            swiz = BRW_SWIZZLE_ZWZW;
             dst.writemask = WRITEMASK_XY;
             return;
          default:
@@ -420,7 +423,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
          dst_reg tmp(this, glsl_type::vec4_type);
          emit_output_urb_read(tmp, 1, src_reg());
-         emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
+         emit(MOV(dst, swizzle(src_reg(tmp), swiz)));
       } else {
          emit_output_urb_read(dst, imm_offset, indirect_offset);
       }
@@ -473,8 +476,15 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
           * Patch URB Header at DWords 4-7.  However, it's reversed, so
           * instead of .xyzw we have .wzyx.
           */
-         swiz = BRW_SWIZZLE_WZYX;
-         mask = writemask_for_backwards_vector(mask);
+         if (key->tes_primitive_mode == GL_ISOLINES) {
+            /* Isolines .xy should be stored in .zw, in order. */
+            swiz = BRW_SWIZZLE4(0, 0, 0, 1);
+            mask <<= 2;
+         } else {
+            /* Other domains are reversed; store .wzyx instead of .xyzw. */
+            swiz = BRW_SWIZZLE_WZYX;
+            mask = writemask_for_backwards_vector(mask);
+         }
       }
 
       emit_urb_write(swizzle(value, swiz), mask,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
index e3c23f1a52f..7ba494fbffc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
@@ -149,9 +149,15 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
                src_reg(brw_vec8_grf(1, 0))));
       break;
    case nir_intrinsic_load_tess_level_outer:
-      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
-               swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
-                       BRW_SWIZZLE_WZYX)));
+      if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+                          BRW_SWIZZLE_ZWZW)));
+      } else {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+                          BRW_SWIZZLE_WZYX)));
+      }
       break;
    case nir_intrinsic_load_tess_level_inner:
       if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index cee139b7fd4..f5a7d4d0ef6 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -198,14 +198,14 @@ gen6_upload_blend_state(struct brw_context *brw)
       if(!is_buffer_zero_integer_format) {
          /* _NEW_MULTISAMPLE */
          blend[b].blend1.alpha_to_coverage =
-            ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage;
+            _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage;
 
 	/* From SandyBridge PRM, volume 2 Part 1, section 8.2.3, BLEND_STATE:
 	 * DWord 1, Bit 30 (AlphaToOne Enable):
 	 * "If Dual Source Blending is enabled, this bit must be disabled"
 	 */
          WARN_ONCE(ctx->Color.Blend[b]._UsesDualSrc &&
-                   ctx->Multisample._Enabled &&
+                   _mesa_is_multisample_enabled(ctx) &&
                    ctx->Multisample.SampleAlphaToOne,
                    "HW workaround: disabling alpha to one with dual src "
                    "blending\n");
@@ -213,7 +213,7 @@ gen6_upload_blend_state(struct brw_context *brw)
             blend[b].blend1.alpha_to_one = false;
 	 else
 	    blend[b].blend1.alpha_to_one =
-	       ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToOne;
+	       _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToOne;
 
          blend[b].blend1.alpha_to_coverage_dither = (brw->gen >= 7);
       }
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index 9a29366f0e0..004ecebc69e 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -174,12 +174,14 @@ upload_clip_state(struct brw_context *brw)
    else
       enable = GEN6_CLIP_ENABLE;
 
+   if (!is_drawing_points(brw) && !is_drawing_lines(brw))
+      dw2 |= GEN6_CLIP_XY_TEST;
+
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
    OUT_BATCH(dw1);
    OUT_BATCH(enable |
 	     GEN6_CLIP_MODE_NORMAL |
-	     GEN6_CLIP_XY_TEST |
 	     dw2);
    OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
              U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
@@ -195,7 +197,9 @@ const struct brw_tracked_state gen6_clip_state = {
                _NEW_TRANSFORM,
       .brw   = BRW_NEW_CONTEXT |
                BRW_NEW_FS_PROG_DATA |
+               BRW_NEW_GEOMETRY_PROGRAM |
                BRW_NEW_META_IN_PROGRESS |
+               BRW_NEW_PRIMITIVE |
                BRW_NEW_RASTERIZER_DISCARD,
    },
    .emit = upload_clip_state,
@@ -209,7 +213,9 @@ const struct brw_tracked_state gen7_clip_state = {
                _NEW_TRANSFORM,
       .brw   = BRW_NEW_CONTEXT |
                BRW_NEW_FS_PROG_DATA |
+               BRW_NEW_GEOMETRY_PROGRAM |
                BRW_NEW_META_IN_PROGRESS |
+               BRW_NEW_PRIMITIVE |
                BRW_NEW_RASTERIZER_DISCARD,
    },
    .emit = upload_clip_state,
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index 8eb620de56b..fcd313aece2 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -171,7 +171,7 @@ gen6_determine_sample_mask(struct brw_context *brw)
    /* BRW_NEW_NUM_SAMPLES */
    unsigned num_samples = brw->num_samples;
 
-   if (ctx->Multisample._Enabled) {
+   if (_mesa_is_multisample_enabled(ctx)) {
       if (ctx->Multisample.SampleCoverage) {
          coverage = ctx->Multisample.SampleCoverageValue;
          coverage_invert = ctx->Multisample.SampleCoverageInvert;
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index 17b4a7fba96..a20673282f2 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -58,10 +58,10 @@ gen6_upload_scissor_state(struct brw_context *brw)
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
       int bbox[4];
 
-      bbox[0] = 0;
-      bbox[1] = fb_width;
-      bbox[2] = 0;
-      bbox[3] = fb_height;
+      bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
+      bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
+      bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
+      bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
       _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
 
       if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 2634e6ba6fd..42f9a5ca8b6 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -147,26 +147,6 @@ get_attr_override(const struct brw_vue_map *vue_map, int urb_entry_read_offset,
 }
 
 
-static bool
-is_drawing_points(const struct brw_context *brw)
-{
-   /* Determine if the primitives *reaching the SF* are points */
-   /* _NEW_POLYGON */
-   if (brw->ctx.Polygon.FrontMode == GL_POINT ||
-       brw->ctx.Polygon.BackMode == GL_POINT) {
-      return true;
-   }
-
-   if (brw->geometry_program) {
-      /* BRW_NEW_GEOMETRY_PROGRAM */
-      return brw->geometry_program->OutputType == GL_POINTS;
-   } else {
-      /* BRW_NEW_PRIMITIVE */
-      return brw->primitive == _3DPRIM_POINTLIST;
-   }
-}
-
-
 /**
  * Create the mapping from the FS inputs we produce to the previous pipeline
  * stage (GS or VS) outputs they source from.
@@ -216,8 +196,10 @@ calculate_attr_overrides(const struct brw_context *brw,
     * This is not required on Haswell, as the hardware ignores this state
     * when drawing non-points -- although we do still need to be careful to
     * correctly set the attr overrides.
+    *
+    * _NEW_POLYGON
+    * BRW_NEW_PRIMITIVE | BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA
     */
-   /* BRW_NEW_PRIMITIVE | BRW_NEW_GEOMETRY_PROGRAM */
    bool drawing_points = is_drawing_points(brw);
 
    /* Initialize all the attr_overrides to 0.  In the loop below we'll modify
@@ -369,8 +351,9 @@ upload_sf_state(struct brw_context *brw)
        unreachable("not reached");
    }
 
-   /* _NEW_SCISSOR */
-   if (ctx->Scissor.EnableFlags)
+   /* _NEW_SCISSOR _NEW_POLYGON BRW_NEW_GEOMETRY_PROGRAM BRW_NEW_PRIMITIVE */
+   if (ctx->Scissor.EnableFlags ||
+       is_drawing_points(brw) || is_drawing_lines(brw))
       dw3 |= GEN6_SF_SCISSOR_ENABLE;
 
    /* _NEW_POLYGON */
@@ -484,6 +467,7 @@ const struct brw_tracked_state gen6_sf_state = {
                BRW_NEW_FS_PROG_DATA |
                BRW_NEW_GEOMETRY_PROGRAM |
                BRW_NEW_PRIMITIVE |
+               BRW_NEW_TES_PROG_DATA |
                BRW_NEW_VUE_MAP_GEOM_OUT,
    },
    .emit = upload_sf_state,
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index b1f13aceba4..7c98c73edf8 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -188,8 +188,9 @@ upload_sf_state(struct brw_context *brw)
       dw2 |= GEN6_SF_CULL_NONE;
    }
 
-   /* _NEW_SCISSOR */
-   if (ctx->Scissor.EnableFlags)
+   /* _NEW_SCISSOR _NEW_POLYGON BRW_NEW_GEOMETRY_PROGRAM BRW_NEW_PRIMITIVE */
+   if (ctx->Scissor.EnableFlags ||
+       is_drawing_points(brw) || is_drawing_lines(brw))
       dw2 |= GEN6_SF_SCISSOR_ENABLE;
 
    /* _NEW_LINE */
@@ -254,7 +255,8 @@ const struct brw_tracked_state gen7_sf_state = {
                _NEW_POLYGON |
                _NEW_PROGRAM |
                _NEW_SCISSOR,
-      .brw   = BRW_NEW_CONTEXT,
+      .brw   = BRW_NEW_CONTEXT |
+               BRW_NEW_PRIMITIVE,
    },
    .emit = upload_sf_state,
 };
diff --git a/src/mesa/drivers/dri/i965/gen8_blend_state.c b/src/mesa/drivers/dri/i965/gen8_blend_state.c
index 786c79ad44d..63186bd4897 100644
--- a/src/mesa/drivers/dri/i965/gen8_blend_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_blend_state.c
@@ -65,7 +65,7 @@ gen8_upload_blend_state(struct brw_context *brw)
 
    if (rb_zero_type != GL_INT && rb_zero_type != GL_UNSIGNED_INT) {
       /* _NEW_MULTISAMPLE */
-      if (ctx->Multisample._Enabled) {
+      if (_mesa_is_multisample_enabled(ctx)) {
          if (ctx->Multisample.SampleAlphaToCoverage) {
             blend[0] |= GEN8_BLEND_ALPHA_TO_COVERAGE_ENABLE;
             blend[0] |= GEN8_BLEND_ALPHA_TO_COVERAGE_DITHER_ENABLE;
@@ -183,7 +183,7 @@ gen8_upload_blend_state(struct brw_context *brw)
       * "If Dual Source Blending is enabled, this bit must be disabled."
       */
       WARN_ONCE(ctx->Color.Blend[i]._UsesDualSrc &&
-                ctx->Multisample._Enabled &&
+                _mesa_is_multisample_enabled(ctx) &&
                 ctx->Multisample.SampleAlphaToOne,
                 "HW workaround: disabling alpha to one with dual src "
                 "blending\n");
@@ -226,7 +226,7 @@ gen8_upload_ps_blend(struct brw_context *brw)
       dw1 |= GEN8_PS_BLEND_ALPHA_TEST_ENABLE;
 
    /* _NEW_MULTISAMPLE */
-   if (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage)
+   if (_mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage)
       dw1 |= GEN8_PS_BLEND_ALPHA_TO_COVERAGE_ENABLE;
 
    /* Used for implementing the following bit of GL_EXT_texture_integer:
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 93100a0708f..8aaa1a8e449 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -29,6 +29,7 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_wm.h"
+#include "main/framebuffer.h"
 
 /**
  * Helper function to emit depth related command packets.
@@ -303,7 +304,7 @@ pma_fix_enable(const struct brw_context *brw)
    const bool kill_pixel =
       brw->wm.prog_data->uses_kill ||
       brw->wm.prog_data->uses_omask ||
-      (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage) ||
+      (_mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage) ||
       ctx->Color.AlphaEnabled;
 
    /* The big formula in CACHE_MODE_1::NP PMA FIX ENABLE. */
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
index 8b6f31f3be6..2ac21f7c873 100644
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
@@ -178,7 +178,7 @@ upload_sf(struct brw_context *brw)
       dw3 |= GEN6_SF_USE_STATE_POINT_WIDTH;
 
    /* _NEW_POINT | _NEW_MULTISAMPLE */
-   if ((ctx->Point.SmoothFlag || ctx->Multisample._Enabled) &&
+   if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
        !ctx->Point.PointSprite) {
       dw3 |= GEN8_SF_SMOOTH_POINT_ENABLE;
    }
@@ -249,7 +249,7 @@ upload_raster(struct brw_context *brw)
    if (ctx->Point.SmoothFlag)
       dw1 |= GEN8_RASTER_SMOOTH_POINT_ENABLE;
 
-   if (ctx->Multisample._Enabled)
+   if (_mesa_is_multisample_enabled(ctx))
       dw1 |= GEN8_RASTER_API_MULTISAMPLE_ENABLE;
 
    if (ctx->Polygon.OffsetFill)
diff --git a/src/mesa/drivers/dri/i965/intel_copy_image.c b/src/mesa/drivers/dri/i965/intel_copy_image.c
index 08b7623e63d..ccb82b64d5f 100644
--- a/src/mesa/drivers/dri/i965/intel_copy_image.c
+++ b/src/mesa/drivers/dri/i965/intel_copy_image.c
@@ -140,9 +140,9 @@ copy_image_with_memcpy(struct brw_context *brw,
    _mesa_get_format_block_size(src_mt->format, &src_bw, &src_bh);
 
    assert(src_width % src_bw == 0);
-   assert(src_height % src_bw == 0);
+   assert(src_height % src_bh == 0);
    assert(src_x % src_bw == 0);
-   assert(src_y % src_bw == 0);
+   assert(src_y % src_bh == 0);
 
    /* If we are on the same miptree, same level, and same slice, then
     * intel_miptree_map won't let us map it twice.  We have to do things a
@@ -153,7 +153,7 @@ copy_image_with_memcpy(struct brw_context *brw,
 
    if (same_slice) {
       assert(dst_x % src_bw == 0);
-      assert(dst_y % src_bw == 0);
+      assert(dst_y % src_bh == 0);
 
       map_x1 = MIN2(src_x, dst_x);
       map_y1 = MIN2(src_y, dst_y);
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 6c233d84df9..9e84abb8d9f 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -2172,7 +2172,8 @@ intel_miptree_updownsample(struct brw_context *brw,
                               src->logical_width0, src->logical_height0,
                               0, 0,
                               dst->logical_width0, dst->logical_height0,
-                              GL_NEAREST, false, false /*mirror x, y*/);
+                              GL_NEAREST, false, false /*mirror x, y*/,
+                              false, false);
    } else if (src->format == MESA_FORMAT_S_UINT8) {
       brw_meta_stencil_updownsample(brw, src, dst);
    } else {
@@ -2194,7 +2195,8 @@ intel_miptree_updownsample(struct brw_context *brw,
                               src->logical_width0, src->logical_height0,
                               0, 0,
                               dst->logical_width0, dst->logical_height0,
-                              GL_NEAREST, false, false /*mirror x, y*/);
+                              GL_NEAREST, false, false /*mirror x, y*/,
+                              false, false /* decode/encode srgb */);
    }
 }
 
diff --git a/src/mesa/main/debug_output.c b/src/mesa/main/debug_output.c
index c2b9f053352..85f64bd459f 100644
--- a/src/mesa/main/debug_output.c
+++ b/src/mesa/main/debug_output.c
@@ -779,7 +779,7 @@ _mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname)
       break;
    case GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH:
       val = (debug->Log.NumMessages) ?
-         debug->Log.Messages[debug->Log.NextMessage].length : 0;
+         debug->Log.Messages[debug->Log.NextMessage].length + 1 : 0;
       break;
    case GL_DEBUG_GROUP_STACK_DEPTH:
       val = debug->CurrentGroup + 1;
@@ -1009,15 +1009,16 @@ _mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id,
    if (!validate_length(ctx, callerstr, length, buf))
       return; /* GL_INVALID_VALUE */
 
+   /* if length not specified, string will be null terminated: */
+   if (length < 0)
+      length = strlen(buf);
+
    _mesa_log_msg(ctx, gl_enum_to_debug_source(source),
                  gl_enum_to_debug_type(type), id,
                  gl_enum_to_debug_severity(severity),
                  length, buf);
 
    if (type == GL_DEBUG_TYPE_MARKER && ctx->Driver.EmitStringMarker) {
-      /* if length not specified, string will be null terminated: */
-      if (length < 0)
-         length = strlen(buf);
       ctx->Driver.EmitStringMarker(ctx, buf, length);
    }
 }
@@ -1188,6 +1189,9 @@ _mesa_PushDebugGroup(GLenum source, GLuint id, GLsizei length,
    if (!validate_length(ctx, callerstr, length, message))
       return; /* GL_INVALID_VALUE */
 
+   if (length < 0)
+      length = strlen(message);
+
    debug = _mesa_lock_debug_state(ctx);
    if (!debug)
       return;
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index d490918b816..bb8d4c3112b 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -3623,6 +3623,23 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
                      _mesa_enum_to_string(attachment));
          return;
       }
+
+      /* The specs are not clear about how to handle
+       * GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME with the default framebuffer,
+       * but dEQP-GLES3 expects an INVALID_ENUM error. This has also been
+       * discussed in:
+       *
+       * https://cvs.khronos.org/bugzilla/show_bug.cgi?id=12928#c1
+       * and https://bugs.freedesktop.org/show_bug.cgi?id=31947
+       */
+      if (pname == GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME) {
+         _mesa_error(ctx, GL_INVALID_ENUM,
+                     "%s(requesting GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME "
+                     "when GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE is "
+                     "GL_FRAMEBUFFER_DEFAULT is not allowed)", caller);
+         return;
+      }
+
       /* the default / window-system FBO */
       att = _mesa_get_fb0_attachment(ctx, buffer, attachment);
    }
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index d18166d528e..f69dc6cb3e6 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -983,3 +983,22 @@ _mesa_is_front_buffer_drawing(const struct gl_framebuffer *fb)
    return (fb->_NumColorDrawBuffers >= 1 &&
            fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT);
 }
+
+static inline GLuint
+_mesa_geometric_nonvalidated_samples(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->Visual.samples :
+      buffer->DefaultGeometry.NumSamples;
+}
+
+bool _mesa_is_multisample_enabled(const struct gl_context *ctx)
+{
+   /* The sample count may not be validated by the driver, but when it is set,
+    * we know that is in a valid range and no driver should ever validate a
+    * multisampled framebuffer to non-multisampled and vice-versa.
+    */
+   return ctx->Multisample.Enabled &&
+          ctx->DrawBuffer &&
+          _mesa_geometric_nonvalidated_samples(ctx->DrawBuffer) > 1;
+}
diff --git a/src/mesa/main/framebuffer.h b/src/mesa/main/framebuffer.h
index fa434d447ae..384f7498776 100644
--- a/src/mesa/main/framebuffer.h
+++ b/src/mesa/main/framebuffer.h
@@ -146,4 +146,7 @@ _mesa_is_front_buffer_reading(const struct gl_framebuffer *fb);
 extern bool
 _mesa_is_front_buffer_drawing(const struct gl_framebuffer *fb);
 
+extern bool
+_mesa_is_multisample_enabled(const struct gl_context *ctx);
+
 #endif /* FRAMEBUFFER_H */
diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c
index 6eacd424df7..1a6ae9a5f3c 100644
--- a/src/mesa/main/genmipmap.c
+++ b/src/mesa/main/genmipmap.c
@@ -79,6 +79,20 @@ bool
 _mesa_is_valid_generate_texture_mipmap_internalformat(struct gl_context *ctx,
                                                       GLenum internalformat)
 {
+   if (_mesa_is_gles3(ctx)) {
+      /* From the ES 3.2 specification's description of GenerateMipmap():
+       * "An INVALID_OPERATION error is generated if the levelbase array was
+       *  not specified with an unsized internal format from table 8.3 or a
+       *  sized internal format that is both color-renderable and
+       *  texture-filterable according to table 8.10."
+       */
+      return internalformat == GL_RGBA || internalformat == GL_RGB ||
+             internalformat == GL_LUMINANCE_ALPHA ||
+             internalformat == GL_LUMINANCE || internalformat == GL_ALPHA ||
+             (_mesa_is_es3_color_renderable(internalformat) &&
+              _mesa_is_es3_texture_filterable(internalformat));
+   }
+
    return (!_mesa_is_enum_format_integer(internalformat) &&
            !_mesa_is_depthstencil_format(internalformat) &&
            !_mesa_is_astc_format(internalformat) &&
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index cf6495885b6..96ab393c0e1 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -3556,3 +3556,86 @@ _mesa_format_from_format_and_type(GLenum format, GLenum type)
     */
    unreachable("Unsupported format");
 }
+
+/**
+ * Returns true if \p internal_format is a sized internal format that
+ * is marked "Color Renderable" in Table 8.10 of the ES 3.2 specification.
+ */
+bool
+_mesa_is_es3_color_renderable(GLenum internal_format)
+{
+   switch (internal_format) {
+   case GL_R8:
+   case GL_RG8:
+   case GL_RGB8:
+   case GL_RGB565:
+   case GL_RGBA4:
+   case GL_RGB5_A1:
+   case GL_RGBA8:
+   case GL_RGB10_A2:
+   case GL_RGB10_A2UI:
+   case GL_SRGB8_ALPHA8:
+   case GL_R16F:
+   case GL_RG16F:
+   case GL_RGBA16F:
+   case GL_R32F:
+   case GL_RG32F:
+   case GL_RGBA32F:
+   case GL_R11F_G11F_B10F:
+   case GL_R8I:
+   case GL_R8UI:
+   case GL_R16I:
+   case GL_R16UI:
+   case GL_R32I:
+   case GL_R32UI:
+   case GL_RG8I:
+   case GL_RG8UI:
+   case GL_RG16I:
+   case GL_RG16UI:
+   case GL_RG32I:
+   case GL_RG32UI:
+   case GL_RGBA8I:
+   case GL_RGBA8UI:
+   case GL_RGBA16I:
+   case GL_RGBA16UI:
+   case GL_RGBA32I:
+   case GL_RGBA32UI:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Returns true if \p internal_format is a sized internal format that
+ * is marked "Texture Filterable" in Table 8.10 of the ES 3.2 specification.
+ */
+bool
+_mesa_is_es3_texture_filterable(GLenum internal_format)
+{
+   switch (internal_format) {
+   case GL_R8:
+   case GL_R8_SNORM:
+   case GL_RG8:
+   case GL_RG8_SNORM:
+   case GL_RGB8:
+   case GL_RGB8_SNORM:
+   case GL_RGB565:
+   case GL_RGBA4:
+   case GL_RGB5_A1:
+   case GL_RGBA8:
+   case GL_RGBA8_SNORM:
+   case GL_RGB10_A2:
+   case GL_SRGB8:
+   case GL_SRGB8_ALPHA8:
+   case GL_R16F:
+   case GL_RG16F:
+   case GL_RGB16F:
+   case GL_RGBA16F:
+   case GL_R11F_G11F_B10F:
+   case GL_RGB9_E5:
+      return true;
+   default:
+      return false;
+   }
+}
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index 00d2767085d..c73f464e5f9 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -28,6 +28,7 @@
 #define GLFORMATS_H
 
 
+#include <stdbool.h>
 #include <GL/gl.h>
 
 
@@ -144,6 +145,12 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat );
 extern uint32_t
 _mesa_format_from_format_and_type(GLenum format, GLenum type);
 
+extern bool
+_mesa_is_es3_color_renderable(GLenum internal_format);
+
+extern bool
+_mesa_is_es3_texture_filterable(GLenum internal_format);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 2e43996f23a..71aae178adf 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -667,7 +667,6 @@ struct gl_list_attrib
 struct gl_multisample_attrib
 {
    GLboolean Enabled;
-   GLboolean _Enabled;   /**< true if Enabled and multisample buffer */
    GLboolean SampleAlphaToCoverage;
    GLboolean SampleAlphaToOne;
    GLboolean SampleCoverage;
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 57f13411fdf..917ae4da023 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -344,20 +344,6 @@ update_frontbit(struct gl_context *ctx)
 
 
 /**
- * Update derived multisample state.
- */
-static void
-update_multisample(struct gl_context *ctx)
-{
-   ctx->Multisample._Enabled = GL_FALSE;
-   if (ctx->Multisample.Enabled &&
-       ctx->DrawBuffer &&
-       _mesa_geometric_samples(ctx->DrawBuffer) > 0)
-      ctx->Multisample._Enabled = GL_TRUE;
-}
-
-
-/**
  * Update the ctx->VertexProgram._TwoSideEnabled flag.
  */
 static void
@@ -450,9 +436,6 @@ _mesa_update_state_locked( struct gl_context *ctx )
    if (new_state & _NEW_PIXEL)
       _mesa_update_pixel( ctx, new_state );
 
-   if (new_state & (_NEW_MULTISAMPLE | _NEW_BUFFERS))
-      update_multisample( ctx );
-
    /* ctx->_NeedEyeCoords is now up to date.
     *
     * If the truth value of this variable has changed, update for the
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 10d931c8b6b..1d9047ee6fd 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2356,7 +2356,7 @@ add_uniform_to_shader::visit_field(const glsl_type *type, const char *name,
       file = PROGRAM_UNIFORM;
    }
 
-   int index = _mesa_lookup_parameter_index(params, -1, name);
+   int index = _mesa_lookup_parameter_index(params, name);
    if (index < 0) {
       index = _mesa_add_parameter(params, file, name, size, type->gl_type,
 				  NULL, NULL);
diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index 34183d4d95f..02d84f20cd8 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -37,6 +37,99 @@
 #include "prog_statevars.h"
 
 
+/**
+ * Look for a float vector in the given parameter list.  The float vector
+ * may be of length 1, 2, 3 or 4.  If swizzleOut is non-null, we'll try
+ * swizzling to find a match.
+ * \param list  the parameter list to search
+ * \param v  the float vector to search for
+ * \param vSize  number of element in v
+ * \param posOut  returns the position of the constant, if found
+ * \param swizzleOut  returns a swizzle mask describing location of the
+ *                    vector elements if found.
+ * \return GL_TRUE if found, GL_FALSE if not found
+ */
+static GLboolean
+lookup_parameter_constant(const struct gl_program_parameter_list *list,
+                          const gl_constant_value v[], GLuint vSize,
+                          GLint *posOut, GLuint *swizzleOut)
+{
+   GLuint i;
+
+   assert(vSize >= 1);
+   assert(vSize <= 4);
+
+   if (!list) {
+      *posOut = -1;
+      return GL_FALSE;
+   }
+
+   for (i = 0; i < list->NumParameters; i++) {
+      if (list->Parameters[i].Type == PROGRAM_CONSTANT) {
+         if (!swizzleOut) {
+            /* swizzle not allowed */
+            GLuint j, match = 0;
+            for (j = 0; j < vSize; j++) {
+               if (v[j].u == list->ParameterValues[i][j].u)
+                  match++;
+            }
+            if (match == vSize) {
+               *posOut = i;
+               return GL_TRUE;
+            }
+         }
+         else {
+            /* try matching w/ swizzle */
+             if (vSize == 1) {
+                /* look for v[0] anywhere within float[4] value */
+                GLuint j;
+                for (j = 0; j < list->Parameters[i].Size; j++) {
+                   if (list->ParameterValues[i][j].u == v[0].u) {
+                      /* found it */
+                      *posOut = i;
+                      *swizzleOut = MAKE_SWIZZLE4(j, j, j, j);
+                      return GL_TRUE;
+                   }
+                }
+             }
+             else if (vSize <= list->Parameters[i].Size) {
+                /* see if we can match this constant (with a swizzle) */
+                GLuint swz[4];
+                GLuint match = 0, j, k;
+                for (j = 0; j < vSize; j++) {
+                   if (v[j].u == list->ParameterValues[i][j].u) {
+                      swz[j] = j;
+                      match++;
+                   }
+                   else {
+                      for (k = 0; k < list->Parameters[i].Size; k++) {
+                         if (v[j].u == list->ParameterValues[i][k].u) {
+                            swz[j] = k;
+                            match++;
+                            break;
+                         }
+                      }
+                   }
+                }
+                /* smear last value to remaining positions */
+                for (; j < 4; j++)
+                   swz[j] = swz[j-1];
+
+                if (match == vSize) {
+                   *posOut = i;
+                   *swizzleOut = MAKE_SWIZZLE4(swz[0], swz[1], swz[2], swz[3]);
+                   return GL_TRUE;
+                }
+             }
+         }
+      }
+   }
+
+   *posOut = -1;
+   return GL_FALSE;
+}
+
+
 struct gl_program_parameter_list *
 _mesa_new_parameter_list(void)
 {
@@ -54,17 +147,17 @@ _mesa_new_parameter_list_sized(unsigned size)
 
       /* alloc arrays */
       p->Parameters = (struct gl_program_parameter *)
-	 calloc(size, sizeof(struct gl_program_parameter));
+         calloc(size, sizeof(struct gl_program_parameter));
 
       p->ParameterValues = (gl_constant_value (*)[4])
          _mesa_align_malloc(size * 4 *sizeof(gl_constant_value), 16);
 
 
       if ((p->Parameters == NULL) || (p->ParameterValues == NULL)) {
-	 free(p->Parameters);
-	 _mesa_align_free(p->ParameterValues);
-	 free(p);
-	 p = NULL;
+         free(p->Parameters);
+         _mesa_align_free(p->ParameterValues);
+         free(p);
+         p = NULL;
       }
    }
 
@@ -191,7 +284,7 @@ _mesa_add_parameter(struct gl_program_parameter_list *paramList,
          else {
             /* silence valgrind */
             for (j = 0; j < 4; j++)
-            	paramList->ParameterValues[oldNum + i][j].f = 0;
+               paramList->ParameterValues[oldNum + i][j].f = 0;
          }
          size -= 4;
       }
@@ -228,8 +321,7 @@ _mesa_add_typed_unnamed_constant(struct gl_program_parameter_list *paramList,
    assert(size <= 4);
 
    if (swizzleOut &&
-       _mesa_lookup_parameter_constant(paramList, values,
-                                       size, &pos, swizzleOut)) {
+       lookup_parameter_constant(paramList, values, size, &pos, swizzleOut)) {
       return pos;
    }
 
@@ -264,28 +356,6 @@ _mesa_add_typed_unnamed_constant(struct gl_program_parameter_list *paramList,
    return pos;
 }
 
-/**
- * Add a new unnamed constant to the parameter list.  This will be used
- * when a fragment/vertex program contains something like this:
- *    MOV r, { 0, 1, 2, 3 };
- * If swizzleOut is non-null we'll search the parameter list for an
- * existing instance of the constant which matches with a swizzle.
- *
- * \param paramList  the parameter list
- * \param values  four float values
- * \param swizzleOut  returns swizzle mask for accessing the constant
- * \return index/position of the new parameter in the parameter list.
- * \sa _mesa_add_typed_unnamed_constant
- */
-GLint
-_mesa_add_unnamed_constant(struct gl_program_parameter_list *paramList,
-                           const gl_constant_value values[4], GLuint size,
-                           GLuint *swizzleOut)
-{
-   return _mesa_add_typed_unnamed_constant(paramList, values, size, GL_NONE,
-                                           swizzleOut);
-}
-
 
 /**
  * Add a new state reference to the parameter list.
@@ -307,8 +377,8 @@ _mesa_add_state_reference(struct gl_program_parameter_list *paramList,
    /* Check if the state reference is already in the list */
    for (index = 0; index < (GLint) paramList->NumParameters; index++) {
       if (!memcmp(paramList->Parameters[index].StateIndexes,
-		  stateTokens, STATE_LENGTH * sizeof(gl_state_index))) {
-	 return index;
+                  stateTokens, STATE_LENGTH * sizeof(gl_state_index))) {
+         return index;
       }
    }
 
@@ -323,134 +393,3 @@ _mesa_add_state_reference(struct gl_program_parameter_list *paramList,
 
    return index;
 }
-
-
-/**
- * Given a program parameter name, find its position in the list of parameters.
- * \param paramList  the parameter list to search
- * \param nameLen  length of name (in chars).
- *                 If length is negative, assume that name is null-terminated.
- * \param name  the name to search for
- * \return index of parameter in the list.
- */
-GLint
-_mesa_lookup_parameter_index(const struct gl_program_parameter_list *paramList,
-                             GLsizei nameLen, const char *name)
-{
-   GLint i;
-
-   if (!paramList)
-      return -1;
-
-   if (nameLen == -1) {
-      /* name is null-terminated */
-      for (i = 0; i < (GLint) paramList->NumParameters; i++) {
-         if (paramList->Parameters[i].Name &&
-	     strcmp(paramList->Parameters[i].Name, name) == 0)
-            return i;
-      }
-   }
-   else {
-      /* name is not null-terminated, use nameLen */
-      for (i = 0; i < (GLint) paramList->NumParameters; i++) {
-         if (paramList->Parameters[i].Name &&
-	     strncmp(paramList->Parameters[i].Name, name, nameLen) == 0
-             && strlen(paramList->Parameters[i].Name) == (size_t)nameLen)
-            return i;
-      }
-   }
-   return -1;
-}
-
-
-/**
- * Look for a float vector in the given parameter list.  The float vector
- * may be of length 1, 2, 3 or 4.  If swizzleOut is non-null, we'll try
- * swizzling to find a match.
- * \param list  the parameter list to search
- * \param v  the float vector to search for
- * \param vSize  number of element in v
- * \param posOut  returns the position of the constant, if found
- * \param swizzleOut  returns a swizzle mask describing location of the
- *                    vector elements if found.
- * \return GL_TRUE if found, GL_FALSE if not found
- */
-GLboolean
-_mesa_lookup_parameter_constant(const struct gl_program_parameter_list *list,
-                                const gl_constant_value v[], GLuint vSize,
-                                GLint *posOut, GLuint *swizzleOut)
-{
-   GLuint i;
-
-   assert(vSize >= 1);
-   assert(vSize <= 4);
-
-   if (!list) {
-      *posOut = -1;
-      return GL_FALSE;
-   }
-
-   for (i = 0; i < list->NumParameters; i++) {
-      if (list->Parameters[i].Type == PROGRAM_CONSTANT) {
-         if (!swizzleOut) {
-            /* swizzle not allowed */
-            GLuint j, match = 0;
-            for (j = 0; j < vSize; j++) {
-               if (v[j].u == list->ParameterValues[i][j].u)
-                  match++;
-            }
-            if (match == vSize) {
-               *posOut = i;
-               return GL_TRUE;
-            }
-         }
-         else {
-            /* try matching w/ swizzle */
-             if (vSize == 1) {
-                /* look for v[0] anywhere within float[4] value */
-                GLuint j;
-                for (j = 0; j < list->Parameters[i].Size; j++) {
-                   if (list->ParameterValues[i][j].u == v[0].u) {
-                      /* found it */
-                      *posOut = i;
-                      *swizzleOut = MAKE_SWIZZLE4(j, j, j, j);
-                      return GL_TRUE;
-                   }
-                }
-             }
-             else if (vSize <= list->Parameters[i].Size) {
-                /* see if we can match this constant (with a swizzle) */
-                GLuint swz[4];
-                GLuint match = 0, j, k;
-                for (j = 0; j < vSize; j++) {
-                   if (v[j].u == list->ParameterValues[i][j].u) {
-                      swz[j] = j;
-                      match++;
-                   }
-                   else {
-                      for (k = 0; k < list->Parameters[i].Size; k++) {
-                         if (v[j].u == list->ParameterValues[i][k].u) {
-                            swz[j] = k;
-                            match++;
-                            break;
-                         }
-                      }
-                   }
-                }
-                /* smear last value to remaining positions */
-                for (; j < 4; j++)
-                   swz[j] = swz[j-1];
-
-                if (match == vSize) {
-                   *posOut = i;
-                   *swizzleOut = MAKE_SWIZZLE4(swz[0], swz[1], swz[2], swz[3]);
-                   return GL_TRUE;
-                }
-             }
-         }
-      }
-   }
-
-   *posOut = -1;
-   return GL_FALSE;
-}
diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index c04d7a2e634..320f64f3f54 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -34,6 +34,7 @@
 #include "main/mtypes.h"
 #include "prog_statevars.h"
 
+#include <string.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -99,12 +100,6 @@ _mesa_new_parameter_list_sized(unsigned size);
 extern void
 _mesa_free_parameter_list(struct gl_program_parameter_list *paramList);
 
-static inline GLuint
-_mesa_num_parameters(const struct gl_program_parameter_list *list)
-{
-   return list ? list->NumParameters : 0;
-}
-
 extern void
 _mesa_reserve_parameter_storage(struct gl_program_parameter_list *paramList,
                                 unsigned reserve_slots);
@@ -121,23 +116,36 @@ _mesa_add_typed_unnamed_constant(struct gl_program_parameter_list *paramList,
                            const gl_constant_value values[4], GLuint size,
                            GLenum datatype, GLuint *swizzleOut);
 
-extern GLint
+static inline GLint
 _mesa_add_unnamed_constant(struct gl_program_parameter_list *paramList,
                            const gl_constant_value values[4], GLuint size,
-                           GLuint *swizzleOut);
+                           GLuint *swizzleOut)
+{
+   return _mesa_add_typed_unnamed_constant(paramList, values, size, GL_NONE,
+                                           swizzleOut);
+}
 
 extern GLint
 _mesa_add_state_reference(struct gl_program_parameter_list *paramList,
                           const gl_state_index stateTokens[STATE_LENGTH]);
 
-extern GLint
+
+static inline GLint
 _mesa_lookup_parameter_index(const struct gl_program_parameter_list *paramList,
-                             GLsizei nameLen, const char *name);
+                             const char *name)
+{
+   if (!paramList)
+      return -1;
 
-extern GLboolean
-_mesa_lookup_parameter_constant(const struct gl_program_parameter_list *list,
-                                const gl_constant_value v[], GLuint vSize,
-                                GLint *posOut, GLuint *swizzleOut);
+   /* name must be null-terminated */
+   for (GLint i = 0; i < (GLint) paramList->NumParameters; i++) {
+      if (paramList->Parameters[i].Name &&
+         strcmp(paramList->Parameters[i].Name, name) == 0)
+         return i;
+   }
+
+   return -1;
+}
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c
index db53377d705..03ece6711c2 100644
--- a/src/mesa/program/prog_statevars.c
+++ b/src/mesa/program/prog_statevars.c
@@ -502,7 +502,7 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[],
                minImplSize = ctx->Const.MinPointSizeAA;
                maxImplSize = ctx->Const.MaxPointSize;
             }
-            else if (ctx->Point.SmoothFlag || ctx->Multisample._Enabled) {
+            else if (ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) {
                minImplSize = ctx->Const.MinPointSizeAA;
                maxImplSize = ctx->Const.MaxPointSizeAA;
             }
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index 1f916ab9299..16b79c94c84 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -142,7 +142,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
       load->num_components = 4;
       load->variables[0] = nir_deref_var_create(load, c->input_vars[prog_src->Index]);
 
-      nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
       nir_builder_instr_insert(b, &load->instr);
 
       src.src = nir_src_for_ssa(&load->dest.ssa);
@@ -171,7 +171,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
 
          nir_intrinsic_instr *load =
             nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_var);
-         nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+         nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
          load->num_components = 4;
 
          load->variables[0] = nir_deref_var_create(load, c->parameters);
@@ -246,7 +246,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
          } else {
             assert(swizzle != SWIZZLE_NIL);
             nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_fmov);
-            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, NULL);
+            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, 32, NULL);
             mov->dest.write_mask = 0x1;
             mov->src[0] = src;
             mov->src[0].swizzle[0] = swizzle;
@@ -676,7 +676,7 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src,
 
    assert(src_number == num_srcs);
 
-   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, 32, NULL);
    nir_builder_instr_insert(b, &instr->instr);
 
    /* Resolve the writemask on the texture op. */
@@ -974,7 +974,7 @@ setup_registers_and_variables(struct ptn_compile *c)
                nir_intrinsic_instr_create(shader, nir_intrinsic_load_var);
             load_x->num_components = 1;
             load_x->variables[0] = nir_deref_var_create(load_x, var);
-            nir_ssa_dest_init(&load_x->instr, &load_x->dest, 1, NULL);
+            nir_ssa_dest_init(&load_x->instr, &load_x->dest, 1, 32, NULL);
             nir_builder_instr_insert(b, &load_x->instr);
 
             nir_ssa_def *f001 = nir_vec4(b, &load_x->dest.ssa, nir_imm_float(b, 0.0),
diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index c20cadf508f..366163e42df 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -31,6 +31,7 @@
   */
  
 #include "main/macros.h"
+#include "main/framebuffer.h"
 #include "st_context.h"
 #include "st_atom.h"
 #include "st_debug.h"
@@ -235,12 +236,12 @@ static void update_raster_state( struct st_context *st )
    raster->line_stipple_factor = ctx->Line.StippleFactor - 1;
 
    /* _NEW_MULTISAMPLE */
-   raster->multisample = ctx->Multisample._Enabled;
+   raster->multisample = _mesa_is_multisample_enabled(ctx);
 
    /* _NEW_MULTISAMPLE | _NEW_BUFFERS */
    raster->force_persample_interp =
          !st->force_persample_in_shader &&
-         ctx->Multisample._Enabled &&
+         _mesa_is_multisample_enabled(ctx) &&
          ctx->Multisample.SampleShading &&
          ctx->Multisample.MinSampleShadingValue *
          ctx->DrawBuffer->Visual.samples > 1;
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index ff90bd61d5b..709f0cbcb91 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -74,7 +74,7 @@ update_fp( struct st_context *st )
    /* _NEW_MULTISAMPLE | _NEW_BUFFERS */
    key.persample_shading =
       st->force_persample_in_shader &&
-      st->ctx->Multisample._Enabled &&
+      _mesa_is_multisample_enabled(st->ctx) &&
       st->ctx->Multisample.SampleShading &&
       st->ctx->Multisample.MinSampleShadingValue *
       _mesa_geometric_samples(st->ctx->DrawBuffer) > 1;
diff --git a/src/mesa/state_tracker/st_cb_bitmap.h b/src/mesa/state_tracker/st_cb_bitmap.h
index 4d1ae222b81..323158ea11d 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.h
+++ b/src/mesa/state_tracker/st_cb_bitmap.h
@@ -49,7 +49,7 @@ st_flush_bitmap_cache(struct st_context *st);
 
 extern const struct tgsi_token *
 st_get_bitmap_shader(const struct tgsi_token *tokens,
-                     unsigned sampler_index,
+                     unsigned tex_target, unsigned sampler_index,
                      bool use_texcoord, bool swizzle_xxxx);
 
 #endif /* ST_CB_BITMAP_H */
diff --git a/src/mesa/state_tracker/st_cb_bitmap_shader.c b/src/mesa/state_tracker/st_cb_bitmap_shader.c
index cddea36d4f6..7ce078d5008 100644
--- a/src/mesa/state_tracker/st_cb_bitmap_shader.c
+++ b/src/mesa/state_tracker/st_cb_bitmap_shader.c
@@ -36,6 +36,7 @@ struct tgsi_bitmap_transform {
    struct tgsi_transform_context base;
    struct tgsi_shader_info info;
    unsigned sampler_index;
+   unsigned tex_target;
    bool use_texcoord;
    bool swizzle_xxxx;
    bool first_instruction_emitted;
@@ -52,8 +53,9 @@ transform_instr(struct tgsi_transform_context *tctx,
 		struct tgsi_full_instruction *current_inst)
 {
    struct tgsi_bitmap_transform *ctx = tgsi_bitmap_transform(tctx);
-   struct tgsi_full_declaration decl;
    struct tgsi_full_instruction inst;
+   unsigned tgsi_tex_target = ctx->tex_target == PIPE_TEXTURE_2D
+      ? TGSI_TEXTURE_2D : TGSI_TEXTURE_RECT;
    unsigned i, semantic;
    int texcoord_index = -1;
 
@@ -66,9 +68,7 @@ transform_instr(struct tgsi_transform_context *tctx,
 
    /* Add TEMP[0] if it's missing. */
    if (ctx->info.file_max[TGSI_FILE_TEMPORARY] == -1) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_TEMPORARY;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_temp_decl(tctx, 0);
    }
 
    /* Add TEXCOORD[0] if it's missing. */
@@ -83,45 +83,23 @@ transform_instr(struct tgsi_transform_context *tctx,
    }
 
    if (texcoord_index == -1) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_INPUT;
-      decl.Declaration.Semantic = 1;
-      decl.Semantic.Name = semantic;
-      decl.Declaration.Interpolate = 1;
-      decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
-      decl.Range.First = decl.Range.Last = ctx->info.num_inputs;
       texcoord_index = ctx->info.num_inputs;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_input_decl(tctx, texcoord_index,
+                                semantic, 0, TGSI_INTERPOLATE_PERSPECTIVE);
    }
 
    /* Declare the sampler. */
-   decl = tgsi_default_full_declaration();
-   decl.Declaration.File = TGSI_FILE_SAMPLER;
-   decl.Range.First = decl.Range.Last = ctx->sampler_index;
-   tctx->emit_declaration(tctx, &decl);
+   tgsi_transform_sampler_decl(tctx, ctx->sampler_index);
 
-   /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
-   inst = tgsi_default_full_instruction();
-   inst.Instruction.Opcode = TGSI_OPCODE_TEX;
-   inst.Instruction.Texture = 1;
-   inst.Texture.Texture = TGSI_TEXTURE_2D;
-
-   inst.Instruction.NumDstRegs = 1;
-   inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
-   inst.Dst[0].Register.Index = 0;
-   inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
-
-   inst.Instruction.NumSrcRegs = 2;
-   inst.Src[0].Register.File  = TGSI_FILE_INPUT;
-   inst.Src[0].Register.Index = texcoord_index;
-   inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X;
-   inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_Y;
-   inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_Z;
-   inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_W;
-   inst.Src[1].Register.File  = TGSI_FILE_SAMPLER;
-   inst.Src[1].Register.Index = ctx->sampler_index;
+   /* Declare the sampler view. */
+   tgsi_transform_sampler_view_decl(tctx, ctx->sampler_index,
+                                    tgsi_tex_target, TGSI_RETURN_TYPE_FLOAT);
 
-   tctx->emit_instruction(tctx, &inst);
+   /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
+   tgsi_transform_tex_inst(tctx,
+                           TGSI_FILE_TEMPORARY, 0,
+                           TGSI_FILE_INPUT, texcoord_index,
+                           tgsi_tex_target, ctx->sampler_index);
 
    /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */
    inst = tgsi_default_full_instruction();
@@ -150,15 +128,19 @@ transform_instr(struct tgsi_transform_context *tctx,
 
 const struct tgsi_token *
 st_get_bitmap_shader(const struct tgsi_token *tokens,
-                     unsigned sampler_index,
+                     unsigned tex_target, unsigned sampler_index,
                      bool use_texcoord, bool swizzle_xxxx)
 {
    struct tgsi_bitmap_transform ctx;
    struct tgsi_token *newtoks;
    int newlen;
 
+   assert(tex_target == PIPE_TEXTURE_2D ||
+          tex_target == PIPE_TEXTURE_RECT);
+
    memset(&ctx, 0, sizeof(ctx));
    ctx.base.transform_instruction = transform_instr;
+   ctx.tex_target = tex_target;
    ctx.sampler_index = sampler_index;
    ctx.use_texcoord = use_texcoord;
    ctx.swizzle_xxxx = swizzle_xxxx;
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 51d4ae51918..09f4d8e00d1 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -142,11 +142,21 @@ get_drawpix_z_stencil_program(struct st_context *st,
       out_color = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
 
       depth_sampler = ureg_DECL_sampler(ureg, 0);
+      ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D,
+                             TGSI_RETURN_TYPE_FLOAT,
+                             TGSI_RETURN_TYPE_FLOAT,
+                             TGSI_RETURN_TYPE_FLOAT,
+                             TGSI_RETURN_TYPE_FLOAT);
       out_depth = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
    }
 
    if (write_stencil) {
       stencil_sampler = ureg_DECL_sampler(ureg, 1);
+      ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D,
+                             TGSI_RETURN_TYPE_UINT,
+                             TGSI_RETURN_TYPE_UINT,
+                             TGSI_RETURN_TYPE_UINT,
+                             TGSI_RETURN_TYPE_UINT);
       out_stencil = ureg_DECL_output(ureg, TGSI_SEMANTIC_STENCIL, 0);
    }
 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h
index f1fb32dd6cf..24526d55402 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.h
+++ b/src/mesa/state_tracker/st_cb_drawpixels.h
@@ -46,6 +46,6 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord,
                       bool scale_and_bias, unsigned scale_const,
                       unsigned bias_const, bool pixel_maps,
                       unsigned drawpix_sampler, unsigned pixelmap_sampler,
-                      unsigned texcoord_const);
+                      unsigned texcoord_const, unsigned tex_target);
 
 #endif /* ST_CB_DRAWPIXELS_H */
diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
index 749b46cfbf7..35a9da0643d 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels_shader.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
@@ -43,6 +43,7 @@ struct tgsi_drawpix_transform {
    unsigned drawpix_sampler;
    unsigned pixelmap_sampler;
    unsigned texcoord_const;
+   unsigned tex_target;
 };
 
 static inline struct tgsi_drawpix_transform *
@@ -72,8 +73,8 @@ transform_instr(struct tgsi_transform_context *tctx,
 		struct tgsi_full_instruction *current_inst)
 {
    struct tgsi_drawpix_transform *ctx = tgsi_drawpix_transform(tctx);
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
+   const unsigned tgsi_tex_target = ctx->tex_target == PIPE_TEXTURE_2D
+      ? TGSI_TEXTURE_2D : TGSI_TEXTURE_RECT;
    unsigned i, sem_texcoord = ctx->use_texcoord ? TGSI_SEMANTIC_TEXCOORD :
                                                   TGSI_SEMANTIC_GENERIC;
    int texcoord_index = -1;
@@ -86,33 +87,21 @@ transform_instr(struct tgsi_transform_context *tctx,
    /* Add scale and bias constants. */
    if (ctx->scale_and_bias) {
       if (ctx->info.const_file_max[0] < (int)ctx->scale_const) {
-         decl = tgsi_default_full_declaration();
-         decl.Declaration.File = TGSI_FILE_CONSTANT;
-         decl.Range.First = decl.Range.Last = ctx->scale_const;
-         tctx->emit_declaration(tctx, &decl);
+         tgsi_transform_const_decl(tctx, ctx->scale_const, ctx->scale_const);
       }
 
       if (ctx->info.const_file_max[0] < (int)ctx->bias_const) {
-         decl = tgsi_default_full_declaration();
-         decl.Declaration.File = TGSI_FILE_CONSTANT;
-         decl.Range.First = decl.Range.Last = ctx->bias_const;
-         tctx->emit_declaration(tctx, &decl);
+         tgsi_transform_const_decl(tctx, ctx->bias_const, ctx->bias_const);
       }
    }
 
    if (ctx->info.const_file_max[0] < (int)ctx->texcoord_const) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_CONSTANT;
-      decl.Range.First = decl.Range.Last = ctx->texcoord_const;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_const_decl(tctx, ctx->texcoord_const, ctx->texcoord_const);
    }
 
    /* Add a new temp. */
    ctx->color_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1;
-   decl = tgsi_default_full_declaration();
-   decl.Declaration.File = TGSI_FILE_TEMPORARY;
-   decl.Range.First = decl.Range.Last = ctx->color_temp;
-   tctx->emit_declaration(tctx, &decl);
+   tgsi_transform_temp_decl(tctx, ctx->color_temp);
 
    /* Add TEXCOORD[texcoord_slot] if it's missing. */
    for (i = 0; i < ctx->info.num_inputs; i++) {
@@ -124,75 +113,51 @@ transform_instr(struct tgsi_transform_context *tctx,
    }
 
    if (texcoord_index == -1) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_INPUT;
-      decl.Declaration.Semantic = 1;
-      decl.Semantic.Name = sem_texcoord;
-      decl.Declaration.Interpolate = 1;
-      decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
-      decl.Range.First = decl.Range.Last = ctx->info.num_inputs;
       texcoord_index = ctx->info.num_inputs;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_input_decl(tctx, texcoord_index, sem_texcoord, 0,
+                                TGSI_INTERPOLATE_PERSPECTIVE);
    }
 
    /* Declare the drawpix sampler if it's missing. */
    if (!(ctx->info.samplers_declared & (1 << ctx->drawpix_sampler))) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_SAMPLER;
-      decl.Range.First = decl.Range.Last = ctx->drawpix_sampler;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_sampler_decl(tctx, ctx->drawpix_sampler);
+
+      /* emit sampler view declaration */
+      tgsi_transform_sampler_view_decl(tctx, ctx->drawpix_sampler,
+                                       tgsi_tex_target, TGSI_RETURN_TYPE_FLOAT);
    }
 
    /* Declare the pixel map sampler if it's missing. */
    if (ctx->pixel_maps &&
        !(ctx->info.samplers_declared & (1 << ctx->pixelmap_sampler))) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_SAMPLER;
-      decl.Range.First = decl.Range.Last = ctx->pixelmap_sampler;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_sampler_decl(tctx, ctx->pixelmap_sampler);
+
+      /* emit sampler view declaration */
+      tgsi_transform_sampler_view_decl(tctx, ctx->pixelmap_sampler,
+                                       TGSI_TEXTURE_2D, TGSI_RETURN_TYPE_FLOAT);
    }
 
    /* Get initial pixel color from the texture.
     * TEX temp, fragment.texcoord[0], texture[0], 2D;
     */
-   inst = tgsi_default_full_instruction();
-   inst.Instruction.Opcode = TGSI_OPCODE_TEX;
-   inst.Instruction.Texture = 1;
-   inst.Texture.Texture = TGSI_TEXTURE_2D;
-
-   inst.Instruction.NumDstRegs = 1;
-   inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
-   inst.Dst[0].Register.Index = ctx->color_temp;
-   inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
-
-   inst.Instruction.NumSrcRegs = 2;
-   SET_SRC(&inst, 0, TGSI_FILE_INPUT, texcoord_index, X, Y, Z, W);
-   inst.Src[1].Register.File  = TGSI_FILE_SAMPLER;
-   inst.Src[1].Register.Index = ctx->drawpix_sampler;
-
-   tctx->emit_instruction(tctx, &inst);
+   tgsi_transform_tex_inst(tctx, TGSI_FILE_TEMPORARY, ctx->color_temp,
+                           TGSI_FILE_INPUT, texcoord_index,
+                           tgsi_tex_target, ctx->drawpix_sampler);
 
    /* Apply the scale and bias. */
    if (ctx->scale_and_bias) {
       /* MAD temp, temp, scale, bias; */
-      inst = tgsi_default_full_instruction();
-      inst.Instruction.Opcode = TGSI_OPCODE_MAD;
-
-      inst.Instruction.NumDstRegs = 1;
-      inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
-      inst.Dst[0].Register.Index = ctx->color_temp;
-      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
-
-      inst.Instruction.NumSrcRegs = 3;
-      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Z, W);
-      SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, ctx->scale_const, X, Y, Z, W);
-      SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, ctx->bias_const, X, Y, Z, W);
-
-      tctx->emit_instruction(tctx, &inst);
+      tgsi_transform_op3_inst(tctx, TGSI_OPCODE_MAD,
+                              TGSI_FILE_TEMPORARY, ctx->color_temp,
+                              TGSI_WRITEMASK_XYZW,
+                              TGSI_FILE_TEMPORARY, ctx->color_temp,
+                              TGSI_FILE_CONSTANT, ctx->scale_const,
+                              TGSI_FILE_CONSTANT, ctx->bias_const);
    }
 
    if (ctx->pixel_maps) {
       /* do four pixel map look-ups with two TEX instructions: */
+      struct tgsi_full_instruction inst;
 
       /* TEX temp.xy, temp.xyyy, texture[1], 2D; */
       inst = tgsi_default_full_instruction();
@@ -250,12 +215,15 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord,
                       bool scale_and_bias, unsigned scale_const,
                       unsigned bias_const, bool pixel_maps,
                       unsigned drawpix_sampler, unsigned pixelmap_sampler,
-                      unsigned texcoord_const)
+                      unsigned texcoord_const, unsigned tex_target)
 {
    struct tgsi_drawpix_transform ctx;
    struct tgsi_token *newtoks;
    int newlen;
 
+   assert(tex_target == PIPE_TEXTURE_2D ||
+          tex_target == PIPE_TEXTURE_RECT);
+
    memset(&ctx, 0, sizeof(ctx));
    ctx.base.transform_instruction = transform_instr;
    ctx.use_texcoord = use_texcoord;
@@ -266,9 +234,10 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord,
    ctx.drawpix_sampler = drawpix_sampler;
    ctx.pixelmap_sampler = pixelmap_sampler;
    ctx.texcoord_const = texcoord_const;
+   ctx.tex_target = tex_target;
    tgsi_scan_shader(tokens, &ctx.info);
 
-   newlen = tgsi_num_tokens(tokens) + 30;
+   newlen = tgsi_num_tokens(tokens) + 60;
    newtoks = tgsi_alloc_tokens(newlen);
    if (!newtoks)
       return NULL;
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 82ab914503b..ff570e0e444 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -387,6 +387,7 @@ st_update_renderbuffer_surface(struct st_context *st,
 {
    struct pipe_context *pipe = st->pipe;
    struct pipe_resource *resource = strb->texture;
+   struct st_texture_object *stTexObj = NULL;
    unsigned rtt_width = strb->Base.Width;
    unsigned rtt_height = strb->Base.Height;
    unsigned rtt_depth = strb->Base.Depth;
@@ -398,9 +399,18 @@ st_update_renderbuffer_surface(struct st_context *st,
     */
    boolean enable_srgb = (st->ctx->Color.sRGBEnabled &&
          _mesa_get_format_color_encoding(strb->Base.Format) == GL_SRGB);
-   enum pipe_format format = (enable_srgb) ?
-      util_format_srgb(resource->format) :
-      util_format_linear(resource->format);
+   enum pipe_format format = resource->format;
+
+   if (strb->is_rtt) {
+      stTexObj = st_texture_object(strb->Base.TexImage->TexObject);
+      if (stTexObj->surface_based)
+         format = stTexObj->surface_format;
+   }
+
+   format = (enable_srgb) ?
+      util_format_srgb(format) :
+      util_format_linear(format);
+
    unsigned first_layer, last_layer, level;
 
    if (resource->target == PIPE_TEXTURE_1D_ARRAY) {
@@ -431,8 +441,8 @@ st_update_renderbuffer_surface(struct st_context *st,
 
    /* Adjust for texture views */
    if (strb->is_rtt && resource->array_size > 1 &&
-       strb->Base.TexImage->TexObject->Immutable) {
-      struct gl_texture_object *tex = strb->Base.TexImage->TexObject;
+       stTexObj->base.Immutable) {
+      struct gl_texture_object *tex = &stTexObj->base;
       first_layer += tex->MinLayer;
       if (!strb->rtt_layered)
          last_layer += tex->MinLayer;
@@ -492,8 +502,6 @@ st_render_texture(struct gl_context *ctx,
 
    st_update_renderbuffer_surface(st, strb);
 
-   strb->Base.Format = st_pipe_format_to_mesa_format(pt->format);
-
    /* Invalidate buffer state so that the pipe's framebuffer state
     * gets updated.
     * That's where the new renderbuffer (which we just created) gets
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index bffa4d026cb..460c1790663 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -2886,10 +2886,13 @@ st_finalize_texture(struct gl_context *ctx,
          /* Need to import images in main memory or held in other textures.
           */
          if (stImage && stObj->pt != stImage->pt) {
+            GLuint depth = stObj->depth0;
+            if (stObj->base.Target == GL_TEXTURE_3D)
+               depth = u_minify(depth, level);
             if (level == 0 ||
                 (stImage->base.Width == u_minify(stObj->width0, level) &&
                  stImage->base.Height == u_minify(stObj->height0, level) &&
-                 stImage->base.Depth == u_minify(stObj->depth0, level))) {
+                 stImage->base.Depth == depth)) {
                /* src image fits expected dest mipmap level size */
                copy_image_data_to_texture(st, stObj, level, stImage);
             }
diff --git a/src/mesa/state_tracker/st_cb_texturebarrier.c b/src/mesa/state_tracker/st_cb_texturebarrier.c
index 2de150ba13a..fecba65a64d 100644
--- a/src/mesa/state_tracker/st_cb_texturebarrier.c
+++ b/src/mesa/state_tracker/st_cb_texturebarrier.c
@@ -63,16 +63,54 @@ st_MemoryBarrier(struct gl_context *ctx, GLbitfield barriers)
    struct pipe_context *pipe = st_context(ctx)->pipe;
    unsigned flags = 0;
 
+   if (barriers & GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT)
+      flags |= PIPE_BARRIER_VERTEX_BUFFER;
+   if (barriers & GL_ELEMENT_ARRAY_BARRIER_BIT)
+      flags |= PIPE_BARRIER_INDEX_BUFFER;
+   if (barriers & GL_UNIFORM_BARRIER_BIT)
+      flags |= PIPE_BARRIER_CONSTANT_BUFFER;
+   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
+      flags |= PIPE_BARRIER_TEXTURE;
+   if (barriers & GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
+      flags |= PIPE_BARRIER_IMAGE;
+   if (barriers & GL_COMMAND_BARRIER_BIT)
+      flags |= PIPE_BARRIER_INDIRECT_BUFFER;
+   if (barriers & GL_PIXEL_BUFFER_BARRIER_BIT) {
+      /* The PBO may be
+       *  (1) bound as a texture for PBO uploads, or
+       *  (2) accessed by the CPU via transfer ops.
+       * For case (2), we assume automatic flushing by the driver.
+       */
+      flags |= PIPE_BARRIER_TEXTURE;
+   }
+   /* GL_TEXTURE_UPDATE_BARRIER_BIT:
+    * Texture updates translate to:
+    *  (1) texture transfers to/from the CPU,
+    *  (2) texture as blit destination, or
+    *  (3) texture as framebuffer.
+    * In all cases, we assume the driver does the required flushing
+    * automatically.
+    */
+   /* GL_BUFFER_UPDATE_BARRIER_BIT:
+    * Buffer updates translate to
+    *  (1) buffer transfers to/from the CPU,
+    *  (2) resource copies and clears.
+    * In all cases, we assume the driver does the required flushing
+    * automatically.
+    */
    if (barriers & GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT)
       flags |= PIPE_BARRIER_MAPPED_BUFFER;
+   if (barriers & GL_QUERY_BUFFER_BARRIER_BIT)
+      flags |= PIPE_BARRIER_QUERY_BUFFER;
+   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
+      flags |= PIPE_BARRIER_FRAMEBUFFER;
+   if (barriers & GL_TRANSFORM_FEEDBACK_BARRIER_BIT)
+      flags |= PIPE_BARRIER_STREAMOUT_BUFFER;
    if (barriers & GL_ATOMIC_COUNTER_BARRIER_BIT)
       flags |= PIPE_BARRIER_SHADER_BUFFER;
    if (barriers & GL_SHADER_STORAGE_BARRIER_BIT)
       flags |= PIPE_BARRIER_SHADER_BUFFER;
 
-   if (barriers & GL_QUERY_BUFFER_BARRIER_BIT)
-      flags |= PIPE_BARRIER_QUERY_BUFFER;
-
    if (flags && pipe->memory_barrier)
       pipe->memory_barrier(pipe, flags);
 }
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 3666ece8ee7..2fdaba073a2 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -253,6 +253,13 @@ void st_init_limits(struct pipe_screen *screen,
       pc->MaxLocalParams = MIN2(pc->MaxParameters, MAX_PROGRAM_LOCAL_PARAMS);
       pc->MaxEnvParams = MIN2(pc->MaxParameters, MAX_PROGRAM_ENV_PARAMS);
 
+      if (screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_INTEGERS)) {
+         pc->LowInt.RangeMin = 31;
+         pc->LowInt.RangeMax = 30;
+         pc->LowInt.Precision = 0;
+         pc->MediumInt = pc->HighInt = pc->LowInt;
+      }
+
       options->EmitNoNoise = TRUE;
 
       /* TODO: make these more fine-grained if anyone needs it */
@@ -783,6 +790,7 @@ void st_init_extensions(struct pipe_screen *screen,
    extensions->ARB_fragment_shader = GL_TRUE;
    extensions->ARB_half_float_vertex = GL_TRUE;
    extensions->ARB_internalformat_query = GL_TRUE;
+   extensions->ARB_internalformat_query2 = GL_TRUE;
    extensions->ARB_map_buffer_range = GL_TRUE;
    extensions->ARB_texture_border_clamp = GL_TRUE; /* XXX temp */
    extensions->ARB_texture_cube_map = GL_TRUE;
diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index 5392c23ec00..9a280fc004b 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -1114,12 +1114,12 @@ static const struct format_mapping format_map[] = {
    },
    {
       { GL_RGB10_A2, 0 },
-      { PIPE_FORMAT_B10G10R10A2_UNORM, PIPE_FORMAT_R10G10B10A2_UNORM,
+      { PIPE_FORMAT_R10G10B10A2_UNORM, PIPE_FORMAT_B10G10R10A2_UNORM,
         DEFAULT_RGBA_FORMATS }
    },
    {
       { 4, GL_RGBA, GL_RGBA8, 0 },
-      { DEFAULT_RGBA_FORMATS }
+      { PIPE_FORMAT_R8G8B8A8_UNORM, DEFAULT_RGBA_FORMATS }
    },
    {
       { GL_BGRA, 0 },
@@ -1127,7 +1127,7 @@ static const struct format_mapping format_map[] = {
    },
    {
       { 3, GL_RGB, GL_RGB8, 0 },
-      { DEFAULT_RGB_FORMATS }
+      { PIPE_FORMAT_R8G8B8X8_UNORM, DEFAULT_RGB_FORMATS }
    },
    {
       { GL_RGB12, GL_RGB16, 0 },
@@ -1309,7 +1309,7 @@ static const struct format_mapping format_map[] = {
    },
    {
       { GL_SRGB_ALPHA_EXT, GL_SRGB8_ALPHA8_EXT, 0 },
-      { DEFAULT_SRGBA_FORMATS }
+      { PIPE_FORMAT_R8G8B8A8_SRGB, DEFAULT_SRGBA_FORMATS }
    },
    {
       { GL_COMPRESSED_SRGB_EXT, GL_COMPRESSED_SRGB_S3TC_DXT1_EXT, 0 },
@@ -2022,20 +2022,10 @@ static const struct exact_format_mapping rgbx8888_tbl[] =
    { 0,           0,                              0                          }
 };
 
-static const struct exact_format_mapping rgba1010102_tbl[] =
-{
-   { GL_BGRA,     GL_UNSIGNED_INT_2_10_10_10_REV, PIPE_FORMAT_B10G10R10A2_UNORM },
-   /* No Mesa formats for these Gallium formats:
-   { GL_RGBA,     GL_UNSIGNED_INT_2_10_10_10_REV, PIPE_FORMAT_R10G10B10A2_UNORM },
-   { GL_ABGR_EXT, GL_UNSIGNED_INT_10_10_10_2,     PIPE_FORMAT_R10G10B10A2_UNORM },
-   { GL_ABGR_EXT, GL_UNSIGNED_INT,                PIPE_FORMAT_R10G10B10A2_UNORM },
-   */
-   { 0,           0,                              0                             }
-};
-
 /**
- * If there is an exact pipe_format match for {internalFormat, format, type}
- * return that, otherwise return PIPE_FORMAT_NONE so we can do fuzzy matching.
+ * For unsized/base internal formats, we may choose a convenient effective
+ * internal format for {format, type}. If one exists, return that, otherwise
+ * return PIPE_FORMAT_NONE.
  */
 static enum pipe_format
 find_exact_format(GLint internalFormat, GLenum format, GLenum type)
@@ -2049,17 +2039,12 @@ find_exact_format(GLint internalFormat, GLenum format, GLenum type)
    switch (internalFormat) {
    case 4:
    case GL_RGBA:
-   case GL_RGBA8:
       tbl = rgba8888_tbl;
       break;
    case 3:
    case GL_RGB:
-   case GL_RGB8:
       tbl = rgbx8888_tbl;
       break;
-   case GL_RGB10_A2:
-      tbl = rgba1010102_tbl;
-      break;
    default:
       return PIPE_FORMAT_NONE;
    }
@@ -2216,7 +2201,15 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
    enum pipe_format pFormat;
    mesa_format mFormat;
    unsigned bindings;
-   enum pipe_texture_target pTarget = gl_target_to_pipe(target);
+   bool is_renderbuffer = false;
+   enum pipe_texture_target pTarget;
+
+   if (target == GL_RENDERBUFFER) {
+      pTarget = PIPE_TEXTURE_2D;
+      is_renderbuffer = true;
+   } else {
+      pTarget = gl_target_to_pipe(target);
+   }
 
    if (target == GL_TEXTURE_1D || target == GL_TEXTURE_1D_ARRAY) {
       /* We don't do compression for these texture targets because of
@@ -2234,7 +2227,7 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
    bindings = PIPE_BIND_SAMPLER_VIEW;
    if (_mesa_is_depth_or_stencil_format(internalFormat))
       bindings |= PIPE_BIND_DEPTH_STENCIL;
-   else if (internalFormat == 3 || internalFormat == 4 ||
+   else if (is_renderbuffer || internalFormat == 3 || internalFormat == 4 ||
             internalFormat == GL_RGB || internalFormat == GL_RGBA ||
             internalFormat == GL_RGB8 || internalFormat == GL_RGBA8 ||
             internalFormat == GL_BGRA ||
@@ -2267,19 +2260,21 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
          if (pFormat != PIPE_FORMAT_NONE)
             return st_pipe_format_to_mesa_format(pFormat);
 
-         /* try choosing format again, this time without render target bindings */
-         pFormat = st_choose_matching_format(st, PIPE_BIND_SAMPLER_VIEW,
-                                             format, type,
-                                             ctx->Unpack.SwapBytes);
-         if (pFormat != PIPE_FORMAT_NONE)
-            return st_pipe_format_to_mesa_format(pFormat);
+         if (!is_renderbuffer) {
+            /* try choosing format again, this time without render target bindings */
+            pFormat = st_choose_matching_format(st, PIPE_BIND_SAMPLER_VIEW,
+                                                format, type,
+                                                ctx->Unpack.SwapBytes);
+            if (pFormat != PIPE_FORMAT_NONE)
+               return st_pipe_format_to_mesa_format(pFormat);
+         }
       }
    }
 
    pFormat = st_choose_format(st, internalFormat, format, type,
                               pTarget, 0, bindings, ctx->Mesa_DXTn);
 
-   if (pFormat == PIPE_FORMAT_NONE) {
+   if (pFormat == PIPE_FORMAT_NONE && !is_renderbuffer) {
       /* try choosing format again, this time without render target bindings */
       pFormat = st_choose_format(st, internalFormat, format, type,
                                  pTarget, 0, PIPE_BIND_SAMPLER_VIEW,
@@ -2357,6 +2352,7 @@ void
 st_QueryInternalFormat(struct gl_context *ctx, GLenum target,
                        GLenum internalFormat, GLenum pname, GLint *params)
 {
+   struct st_context *st = st_context(ctx);
    /* The API entry-point gives us a temporary params buffer that is non-NULL
     * and guaranteed to have at least 16 elements.
     */
@@ -2374,7 +2370,30 @@ st_QueryInternalFormat(struct gl_context *ctx, GLenum target,
       params[0] = (GLint) num_samples;
       break;
    }
-
+   case GL_INTERNALFORMAT_PREFERRED: {
+      params[0] = GL_NONE;
+
+      /* We need to resolve an internal format that is compatible with
+       * the passed internal format, and optimal to the driver. By now,
+       * we just validate that the passed internal format is supported by
+       * the driver, and if so return the same internal format, otherwise
+       * return GL_NONE.
+       */
+      uint usage;
+      if (_mesa_is_depth_or_stencil_format(internalFormat))
+         usage = PIPE_BIND_DEPTH_STENCIL;
+      else
+         usage = PIPE_BIND_RENDER_TARGET;
+      enum pipe_format pformat = st_choose_format(st,
+                                                  internalFormat,
+                                                  GL_NONE,
+                                                  GL_NONE,
+                                                  PIPE_TEXTURE_2D, 1,
+                                                  usage, FALSE);
+      if (pformat)
+         params[0] = internalFormat;
+      break;
+   }
    default:
       /* For the rest of the pnames, we call back the Mesa's default
        * function for drivers that don't implement ARB_internalformat_query2.
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 18414055549..06b4bb41a9b 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -6345,7 +6345,7 @@ st_translate_program(
    }
 
    if (program->use_shared_memory)
-      t->shared_memory = ureg_DECL_shared_memory(ureg);
+      t->shared_memory = ureg_DECL_memory(ureg, TGSI_MEMORY_TYPE_SHARED);
 
    for (i = 0; i < program->shader->NumImages; i++) {
       if (program->images_used & (1 << i)) {
@@ -6370,6 +6370,42 @@ st_translate_program(
                        t->insn[t->labels[i].branch_target]);
    }
 
+   /* Set the next shader stage hint for VS and TES. */
+   switch (procType) {
+   case TGSI_PROCESSOR_VERTEX:
+   case TGSI_PROCESSOR_TESS_EVAL:
+      if (program->shader_program->SeparateShader)
+         break;
+
+      for (i = program->shader->Stage+1; i <= MESA_SHADER_FRAGMENT; i++) {
+         if (program->shader_program->_LinkedShaders[i]) {
+            unsigned next;
+
+            switch (i) {
+            case MESA_SHADER_TESS_CTRL:
+               next = TGSI_PROCESSOR_TESS_CTRL;
+               break;
+            case MESA_SHADER_TESS_EVAL:
+               next = TGSI_PROCESSOR_TESS_EVAL;
+               break;
+            case MESA_SHADER_GEOMETRY:
+               next = TGSI_PROCESSOR_GEOMETRY;
+               break;
+            case MESA_SHADER_FRAGMENT:
+               next = TGSI_PROCESSOR_FRAGMENT;
+               break;
+            default:
+               assert(0);
+               continue;
+            }
+
+            ureg_set_next_shader_processor(ureg, next);
+            break;
+         }
+      }
+      break;
+   }
+
 out:
    if (t) {
       free(t->arrays);
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 8772efb0944..7a686b199d5 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -241,43 +241,75 @@ src_register( struct st_translate *t,
  * Map mesa texture target to TGSI texture target.
  */
 unsigned
-st_translate_texture_target( GLuint textarget,
-                          GLboolean shadow )
+st_translate_texture_target(GLuint textarget, GLboolean shadow)
 {
    if (shadow) {
-      switch( textarget ) {
-      case TEXTURE_1D_INDEX:   return TGSI_TEXTURE_SHADOW1D;
-      case TEXTURE_2D_INDEX:   return TGSI_TEXTURE_SHADOW2D;
-      case TEXTURE_RECT_INDEX: return TGSI_TEXTURE_SHADOWRECT;
-      case TEXTURE_1D_ARRAY_INDEX: return TGSI_TEXTURE_SHADOW1D_ARRAY;
-      case TEXTURE_2D_ARRAY_INDEX: return TGSI_TEXTURE_SHADOW2D_ARRAY;
-      case TEXTURE_CUBE_INDEX: return TGSI_TEXTURE_SHADOWCUBE;
-      case TEXTURE_CUBE_ARRAY_INDEX: return TGSI_TEXTURE_SHADOWCUBE_ARRAY;
-      default: break;
+      switch (textarget) {
+      case TEXTURE_1D_INDEX:
+         return TGSI_TEXTURE_SHADOW1D;
+      case TEXTURE_2D_INDEX:
+         return TGSI_TEXTURE_SHADOW2D;
+      case TEXTURE_RECT_INDEX:
+         return TGSI_TEXTURE_SHADOWRECT;
+      case TEXTURE_1D_ARRAY_INDEX:
+         return TGSI_TEXTURE_SHADOW1D_ARRAY;
+      case TEXTURE_2D_ARRAY_INDEX:
+         return TGSI_TEXTURE_SHADOW2D_ARRAY;
+      case TEXTURE_CUBE_INDEX:
+         return TGSI_TEXTURE_SHADOWCUBE;
+      case TEXTURE_CUBE_ARRAY_INDEX:
+         return TGSI_TEXTURE_SHADOWCUBE_ARRAY;
+      default:
+         break;
       }
    }
 
-   switch( textarget ) {
-   case TEXTURE_2D_MULTISAMPLE_INDEX: return TGSI_TEXTURE_2D_MSAA;
-   case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: return TGSI_TEXTURE_2D_ARRAY_MSAA;
-   case TEXTURE_BUFFER_INDEX: return TGSI_TEXTURE_BUFFER;
-   case TEXTURE_1D_INDEX:   return TGSI_TEXTURE_1D;
-   case TEXTURE_2D_INDEX:   return TGSI_TEXTURE_2D;
-   case TEXTURE_3D_INDEX:   return TGSI_TEXTURE_3D;
-   case TEXTURE_CUBE_INDEX: return TGSI_TEXTURE_CUBE;
-   case TEXTURE_CUBE_ARRAY_INDEX: return TGSI_TEXTURE_CUBE_ARRAY;
-   case TEXTURE_RECT_INDEX: return TGSI_TEXTURE_RECT;
-   case TEXTURE_1D_ARRAY_INDEX:   return TGSI_TEXTURE_1D_ARRAY;
-   case TEXTURE_2D_ARRAY_INDEX:   return TGSI_TEXTURE_2D_ARRAY;
-   case TEXTURE_EXTERNAL_INDEX:   return TGSI_TEXTURE_2D;
+   switch (textarget) {
+   case TEXTURE_2D_MULTISAMPLE_INDEX:
+      return TGSI_TEXTURE_2D_MSAA;
+   case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX:
+      return TGSI_TEXTURE_2D_ARRAY_MSAA;
+   case TEXTURE_BUFFER_INDEX:
+      return TGSI_TEXTURE_BUFFER;
+   case TEXTURE_1D_INDEX:
+      return TGSI_TEXTURE_1D;
+   case TEXTURE_2D_INDEX:
+      return TGSI_TEXTURE_2D;
+   case TEXTURE_3D_INDEX:
+      return TGSI_TEXTURE_3D;
+   case TEXTURE_CUBE_INDEX:
+      return TGSI_TEXTURE_CUBE;
+   case TEXTURE_CUBE_ARRAY_INDEX:
+      return TGSI_TEXTURE_CUBE_ARRAY;
+   case TEXTURE_RECT_INDEX:
+      return TGSI_TEXTURE_RECT;
+   case TEXTURE_1D_ARRAY_INDEX:
+      return TGSI_TEXTURE_1D_ARRAY;
+   case TEXTURE_2D_ARRAY_INDEX:
+      return TGSI_TEXTURE_2D_ARRAY;
+   case TEXTURE_EXTERNAL_INDEX:
+      return TGSI_TEXTURE_2D;
    default:
-      debug_assert( 0 );
+      debug_assert(!"unexpected texture target index");
       return TGSI_TEXTURE_1D;
    }
 }
 
 
 /**
+ * Translate a (1 << TEXTURE_x_INDEX) bit into a TGSI_TEXTURE_x enum.
+ */
+static unsigned
+translate_texture_index(GLbitfield texBit, bool shadow)
+{
+   int index = ffs(texBit);
+   assert(index > 0);
+   assert(index - 1 < NUM_TEXTURE_TARGETS);
+   return st_translate_texture_target(index - 1, shadow);
+}
+
+
+/**
  * Create a TGSI ureg_dst register from a Mesa dest register.
  */
 static struct ureg_dst
@@ -1128,7 +1160,16 @@ st_translate_mesa_program(
    /* texture samplers */
    for (i = 0; i < ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits; i++) {
       if (program->SamplersUsed & (1 << i)) {
+         unsigned target =
+            translate_texture_index(program->TexturesUsed[i],
+                                    !!(program->ShadowSamplers & (1 << i)));
          t->samplers[i] = ureg_DECL_sampler( ureg, i );
+         ureg_DECL_sampler_view(ureg, i, target,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT);
+
       }
    }
 
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index c9f390aa9a2..80dcfd82743 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -871,6 +871,7 @@ st_create_fp_variant(struct st_context *st,
       variant->bitmap_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1;
 
       tokens = st_get_bitmap_shader(tgsi.tokens,
+                                    st->internal_target,
                                     variant->bitmap_sampler,
                                     st->needs_texcoord_semantic,
                                     st->bitmap.tex_format ==
@@ -923,7 +924,7 @@ st_create_fp_variant(struct st_context *st,
                                      bias_const, key->pixelMaps,
                                      variant->drawpix_sampler,
                                      variant->pixelmap_sampler,
-                                     texcoord_const);
+                                     texcoord_const, st->internal_target);
 
       if (tokens) {
          if (tgsi.tokens != stfp->tgsi.tokens)
diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c
index d9aae73302c..3163b0407ea 100644
--- a/src/mesa/swrast/s_points.c
+++ b/src/mesa/swrast/s_points.c
@@ -22,7 +22,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-
+#include "main/framebuffer.h"
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "s_context.h"
@@ -257,7 +257,7 @@ smooth_point(struct gl_context *ctx, const SWvertex *vert)
    size = get_size(ctx, vert, GL_TRUE);
 
    /* alpha attenuation / fade factor */
-   if (ctx->Multisample._Enabled) {
+   if (_mesa_is_multisample_enabled(ctx)) {
       if (vert->pointSize >= ctx->Point.Threshold) {
          alphaAtten = 1.0F;
       }
diff --git a/src/mesa/swrast/s_texture.c b/src/mesa/swrast/s_texture.c
index 9ccd0e34702..d35bea96b92 100644
--- a/src/mesa/swrast/s_texture.c
+++ b/src/mesa/swrast/s_texture.c
@@ -60,7 +60,7 @@ _swrast_delete_texture_image(struct gl_context *ctx,
 }
 
 static unsigned int
-texture_slices(struct gl_texture_image *texImage)
+texture_slices(const struct gl_texture_image *texImage)
 {
    if (texImage->TexObject->Target == GL_TEXTURE_1D_ARRAY)
       return texImage->Height;
@@ -188,6 +188,7 @@ check_map_teximage(const struct gl_texture_image *texImage,
    assert(y < texImage->Height || texImage->Height == 0);
    assert(x + w <= texImage->Width);
    assert(y + h <= texImage->Height);
+   assert(slice < texture_slices(texImage));
 }
 
 /**
@@ -240,7 +241,6 @@ _swrast_map_teximage(struct gl_context *ctx,
    assert(swImage->Buffer);
    assert(swImage->Buffer == swImage->ImageSlices[0]);
 
-   assert(slice < texture_slices(texImage));
    map = swImage->ImageSlices[slice];
 
    /* apply x/y offset to map address */