Merge remote-tracking branch 'mesa-public/master' into vulkan

This pulls in Matt's big compiler refactor.
author: Jason Ekstrand <[email protected]> 2015-11-14 07:56:10 -0800
committer: Jason Ekstrand <[email protected]> 2015-11-14 07:56:10 -0800
commit: 1469ccb7464836c752fa2664c36d8fae7e80606c (patch)
tree: 6f15e2eeb7e16e4085a0c58d50a36a4c12b231a5
parent: e8f51fe4deb5082fece5f8cb167b89b0f03eb244 (diff)
parent: f94e1d97381ec787c2abbbcd5265252596217e33 (diff)
336 files changed, 9083 insertions, 4388 deletions
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 7f6b8c9ef27..b768eea789a 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -149,7 +149,7 @@ GL 4.2, GLSL 4.20:
 
 GL 4.3, GLSL 4.30:
 
-  GL_ARB_arrays_of_arrays                              started (Timothy)
+  GL_ARB_arrays_of_arrays                              DONE (i965)
   GL_ARB_ES3_compatibility                             DONE (all drivers that support GLSL 3.30)
   GL_ARB_clear_buffer_object                           DONE (all drivers)
   GL_ARB_compute_shader                                in progress (jljusten)
@@ -169,7 +169,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_texture_buffer_range                          DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
   GL_ARB_texture_query_levels                          DONE (all drivers that support GLSL 1.30)
   GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_vertex_attrib_binding                         DONE (all drivers)
 
 
@@ -177,7 +177,7 @@ GL 4.4, GLSL 4.40:
 
   GL_MAX_VERTEX_ATTRIB_STRIDE                          DONE (all drivers)
   GL_ARB_buffer_storage                                DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_clear_texture                                 DONE (i965) (gallium - in progress, VMware)
+  GL_ARB_clear_texture                                 DONE (i965, nv50, nvc0)
   GL_ARB_enhanced_layouts                              in progress (Timothy)
   - compile-time constant expressions                  in progress
   - explicit byte offsets for blocks                   in progress
@@ -209,7 +209,7 @@ GL 4.5, GLSL 4.50:
 
 These are the extensions cherry-picked to make GLES 3.1
 GLES3.1, GLSL ES 3.1
-  GL_ARB_arrays_of_arrays                              started (Timothy)
+  GL_ARB_arrays_of_arrays                              DONE (i965)
   GL_ARB_compute_shader                                in progress (jljusten)
   GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
diff --git a/docs/README.UVD b/docs/README.UVD
index 38ea8646a64..b0f4b9d8c67 100644
--- a/docs/README.UVD
+++ b/docs/README.UVD
@@ -2,8 +2,8 @@ The software may implement third party technologies (e.g. third party
 libraries) that are not licensed to you by AMD and for which you may need
 to obtain licenses from other parties.  Unless explicitly stated otherwise,
 these third party technologies are not licensed hereunder.  Such third
-party technologies include, but are not limited, to H.264, MPEG-2, MPEG-4,
-AVC, and VC-1.
+party technologies include, but are not limited, to H.264, H.265, HEVC, MPEG-2,
+MPEG-4, AVC, and VC-1.
 
 For MPEG-2 Encoding Products ANY USE OF THIS PRODUCT IN ANY MANNER OTHER
 THAN PERSONAL USE THAT COMPLIES WITH THE MPEG-2 STANDARD FOR ENCODING VIDEO
diff --git a/docs/index.html b/docs/index.html
index c8d4a5c2699..0e317fdf411 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,12 @@
 
 <h1>News</h1>
 
+<h2>November 11, 2015</h2>
+<p>
+<a href="relnotes/11.0.5.html">Mesa 11.0.5</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>October 24, 2015</h2>
 <p>
 <a href="relnotes/11.0.4.html">Mesa 11.0.4</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index d1dde4fd726..588cf0b2c83 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/11.0.5.html">11.0.5 release notes</a>
 <li><a href="relnotes/11.0.4.html">11.0.4 release notes</a>
 <li><a href="relnotes/11.0.3.html">11.0.3 release notes</a>
 <li><a href="relnotes/10.6.9.html">10.6.9 release notes</a>
diff --git a/docs/relnotes/11.0.5.html b/docs/relnotes/11.0.5.html
new file mode 100644
index 00000000000..05fa0b98325
--- /dev/null
+++ b/docs/relnotes/11.0.5.html
@@ -0,0 +1,174 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.5 Release Notes / November 11, 2015</h1>
+
+<p>
+Mesa 11.0.5 is a bug fix release which fixes bugs found since the 11.0.4 release.
+</p>
+<p>
+Mesa 11.0.5 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+8495ef5c06f7f726452462b7d408a5b40048373ff908f2283a3b4d1f49b45ee6  mesa-11.0.5.tar.gz
+9c255a2a6695fcc6ef4a279e1df0aeaf417dc142f39ee59dfb533d80494bb67a  mesa-11.0.5.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91993">Bug 91993</a> - Graphical glitch in Astromenace (open-source game).</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92214">Bug 92214</a> - Flightgear crashes during splashboot with R600 driver, LLVM 3.7.0 and mesa 11.0.2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92437">Bug 92437</a> - osmesa: Expose GL entry points for Windows build, via .def file</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92476">Bug 92476</a> - [cts] ES2-CTS.gtf.GL2ExtensionTests.egl_image.egl_image fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92623">Bug 92623</a> - Differences in prog_data ignored when caching fragment programs (causes hangs)</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alex Deucher (1):</p>
+<ul>
+  <li>radeon/uvd: don't expose HEVC on old UVD hw (v3)</li>
+</ul>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965/skl: Add GT4 PCI IDs</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.4</li>
+  <li>cherry-ignore: ignore a possible wrong nomination</li>
+  <li>Revert "mesa/glformats: Undo code changes from _mesa_base_tex_format() move"</li>
+  <li>Update version to 11.0.5</li>
+</ul>
+
+<p>Emmanuel Gil Peyrot (1):</p>
+<ul>
+  <li>gbm.h: Add a missing stddef.h include for size_t.</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>vc4: When the create ioctl fails, free our cache and try again.</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>i965: Fix is-renderable check in intel_image_target_renderbuffer_storage</li>
+</ul>
+
+<p>Ilia Mirkin (3):</p>
+<ul>
+  <li>nvc0: respect edgeflag attribute width</li>
+  <li>nouveau: set MaxDrawBuffers to the same value as MaxColorAttachments</li>
+  <li>nouveau: relax fence emit space assert</li>
+</ul>
+
+<p>Ivan Kalvachev (1):</p>
+<ul>
+  <li>r600g: Fix special negative immediate constants when using ABS modifier.</li>
+</ul>
+
+<p>Jason Ekstrand (2):</p>
+<ul>
+  <li>nir/lower_vec_to_movs: Pass the shader around directly</li>
+  <li>nir: Report progress from lower_vec_to_movs().</li>
+</ul>
+
+<p>Jose Fonseca (2):</p>
+<ul>
+  <li>gallivm: Translate all util_cpu_caps bits to LLVM attributes.</li>
+  <li>gallivm: Explicitly disable unsupported CPU features.</li>
+</ul>
+
+<p>Julien Isorce (4):</p>
+<ul>
+  <li>st/va: pass picture desc to begin and decode</li>
+  <li>nvc0: fix crash when nv50_miptree_from_handle fails</li>
+  <li>st/va: do not destroy old buffer when new one failed</li>
+  <li>st/va: add more errors checks in vlVaBufferSetNumElements and vlVaMapBuffer</li>
+</ul>
+
+<p>Kenneth Graunke (6):</p>
+<ul>
+  <li>i965: Fix missing BRW_NEW_*_PROG_DATA flagging caused by cache reuse.</li>
+  <li>nir: Report progress from nir_split_var_copies().</li>
+  <li>nir: Properly invalidate metadata in nir_split_var_copies().</li>
+  <li>nir: Properly invalidate metadata in nir_opt_copy_prop().</li>
+  <li>nir: Properly invalidate metadata in nir_lower_vec_to_movs().</li>
+  <li>nir: Properly invalidate metadata in nir_opt_remove_phis().</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>radeonsi: add register definitions for Stoney</li>
+</ul>
+
+<p>Nanley Chery (1):</p>
+<ul>
+  <li>mesa/glformats: Undo code changes from _mesa_base_tex_format() move</li>
+</ul>
+
+<p>Nicolai Hähnle (1):</p>
+<ul>
+  <li>st/mesa: fix mipmap generation for immutable textures with incomplete pyramids</li>
+</ul>
+
+<p>Nigel Stewart (1):</p>
+<ul>
+  <li>osmesa: Expose GL entry points for Windows build via DEF file.</li>
+</ul>
+
+<p>Roland Scheidegger (1):</p>
+<ul>
+  <li>gallivm: disable f16c when not using AVX</li>
+</ul>
+
+<p>Samuel Li (2):</p>
+<ul>
+  <li>radeonsi: add support for Stoney asics (v3)</li>
+  <li>radeonsi: add Stoney pci ids</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html
index 7160244fcb4..82ee3c4037b 100644
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -44,8 +44,10 @@ Note: some of the new features are only available with certain drivers.
 </p>
 
 <ul>
+<li>GL_ARB_arrays_of_arrays on i965</li>
 <li>GL_ARB_blend_func_extended on freedreno (a3xx)</li>
-<li>GL_ARB_copy_image on radeonsi</li>
+<li>GL_ARB_clear_texture on nv50, nvc0</li>
+<li>GL_ARB_copy_image on nv50, nvc0, radeonsi</li>
 <li>GL_ARB_gpu_shader_fp64 on r600 for Cypress/Cayman/Aruba chips</li>
 <li>GL_ARB_gpu_shader5 on r600 for Evergreen and later chips</li>
 <li>GL_ARB_shader_clock on i965 (gen7+)</li>
@@ -54,7 +56,8 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi</li>
 <li>GL_ARB_texture_barrier / GL_NV_texture_barrier on i965</li>
 <li>GL_ARB_texture_query_lod on softpipe</li>
-<li>GL_ARB_texture_view on radeonsi</li>
+<li>GL_ARB_texture_view on radeonsi and r600 (for evergeen and newer)</li>
+<li>GL_EXT_buffer_storage implemented for when ES 3.1 support is gained</li>
 <li>GL_EXT_draw_elements_base_vertex on all drivers</li>
 <li>GL_OES_draw_elements_base_vertex on all drivers</li>
 <li>EGL_KHR_create_context on softpipe, llvmpipe</li>
diff --git a/docs/utilities.html b/docs/utilities.html
index 9541d3082f7..5c0a4fdcd2e 100644
--- a/docs/utilities.html
+++ b/docs/utilities.html
@@ -30,6 +30,10 @@
   <dt><a href="http://www.valgrind.org">Valgrind</a></dt>
   <dd>is a very useful tool for tracking down
   memory-related problems in your code.</dd>
+
+  <dt><a href="http:scan.coverity.com/projects/mesa">Coverity</a><dt>
+  <dd>provides static code analysis of Mesa.  If you create an account
+  you can see the results and try to fix outstanding issues.</dd>
 </dl>
 
 </div>
diff --git a/docs/vmware-guest.html b/docs/vmware-guest.html
index 284c6c261d2..b495bc255ce 100644
--- a/docs/vmware-guest.html
+++ b/docs/vmware-guest.html
@@ -148,10 +148,33 @@ To get the latest code from git:
 <h2>Building the Code</h2>
 
 <ul>
-<li>Build libdrm: If you're on a 32-bit system, you should skip the --libdir configure option. Note also the comment about toolchain libdrm above. 
+<li>
+Determine where the GL-related libraries reside on your system and set
+the LIBDIR environment variable accordingly.
+<br><br>
+For 32-bit Ubuntu systems:
+<pre>
+  export LIBDIR=/usr/lib/i386-linux-gnu
+</pre>
+For 64-bit Ubuntu systems:
+<pre>
+  export LIBDIR=/usr/lib/x86_64-linux-gnu
+</pre>
+For 32-bit Fedora systems:
+<pre>
+  export LIBDIR=/usr/lib
+</pre>
+For 64-bit Fedora systems:
+<pre>
+  export LIBDIR=/usr/lib64
+</pre>
+
+</li>
+
+<li>Build libdrm:
   <pre>
   cd $TOP/drm
-  ./autogen.sh --prefix=/usr --libdir=/usr/lib64
+  ./autogen.sh --prefix=/usr --libdir=${LIBDIR}
   make
   sudo make install
   </pre>
@@ -162,12 +185,9 @@ The libxatracker library is used exclusively by the X server to do render,
 copy and video acceleration:
 <br>
 The following configure options doesn't build the EGL system.
-<br>
-As before, if you're on a 32-bit system, you should skip the --libdir
-configure option.
   <pre>
   cd $TOP/mesa
-  ./autogen.sh --prefix=/usr --libdir=/usr/lib64 --with-gallium-drivers=svga --with-dri-drivers= --enable-xa --disable-dri3
+  ./autogen.sh --prefix=/usr --libdir=${LIBDIR} --with-gallium-drivers=svga --with-dri-drivers=swrast --enable-xa --disable-dri3 --enable-glx-tls
   make
   sudo make install
   </pre>
@@ -177,25 +197,39 @@ if they're not installed in your system.  You should be told what's missing.
 <br>
 <br>
 
-<li>xf86-video-vmware: Now, once libxatracker is installed, we proceed with building and replacing the current Xorg driver. First check if your system is 32- or 64-bit. If you're building for a 32-bit system, you will not be needing the --libdir=/usr/lib64 option to autogen. 
+<li>xf86-video-vmware: Now, once libxatracker is installed, we proceed with
+building and replacing the current Xorg driver.
+First check if your system is 32- or 64-bit.
   <pre>
   cd $TOP/xf86-video-vmware
-  ./autogen.sh --prefix=/usr --libdir=/usr/lib64
+  ./autogen.sh --prefix=/usr --libdir=${LIBDIR}
   make
   sudo make install
   </pre>
+
 <li>vmwgfx kernel module. First make sure that any old version of this kernel module is removed from the system by issuing
-  <pre>
+<pre>
   sudo rm /lib/modules/`uname -r`/kernel/drivers/gpu/drm/vmwgfx.ko*
-  </pre>
-Then 
-  <pre>
+</pre>
+Build and install:
+<pre>
   cd $TOP/vmwgfx
   make
   sudo make install
-  sudo cp 00-vmwgfx.rules /etc/udev/rules.d
-  sudo depmod -ae
-  </pre>
+  sudo depmod -a
+</pre>
+If you're using a Ubuntu OS:
+<pre>
+  sudo update-initramfs -u
+</pre>
+If you're using a Fedora OS:
+<pre>
+  sudo dracut --force
+</pre>
+Add 'vmwgfx' to the /etc/modules file:
+<pre>
+  echo vmwgfx | sudo tee -a /etc/modules
+</pre>
 
 Note: some distros put DRM kernel drivers in different directories.
 For example, sometimes vmwgfx.ko might be found in
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index 0d161f617a1..a635c758da1 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -703,18 +703,10 @@ dri2_wl_swap_buffers_with_damage(_EGLDriver *drv,
    dri2_surf->dx = 0;
    dri2_surf->dy = 0;
 
-   if (n_rects == 0) {
-      wl_surface_damage(dri2_surf->wl_win->surface,
-                        0, 0, INT32_MAX, INT32_MAX);
-   } else {
-      for (i = 0; i < n_rects; i++) {
-         const int *rect = &rects[i * 4];
-         wl_surface_damage(dri2_surf->wl_win->surface,
-                           rect[0],
-                           dri2_surf->base.Height - rect[1] - rect[3],
-                           rect[2], rect[3]);
-      }
-   }
+   /* We deliberately ignore the damage region and post maximum damage, due to
+    * https://bugs.freedesktop.org/78190 */
+   wl_surface_damage(dri2_surf->wl_win->surface,
+                     0, 0, INT32_MAX, INT32_MAX);
 
    if (dri2_dpy->is_different_gpu) {
       _EGLContext *ctx = _eglGetCurrentContext();
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 9df4e265b5b..82ef5ecfce4 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -349,7 +349,8 @@ VL_SOURCES := \
 
 # XXX: Nuke this as our dri targets no longer depend on VL.
 VL_WINSYS_SOURCES := \
-	vl/vl_winsys_dri.c
+	vl/vl_winsys_dri.c \
+	vl/vl_winsys_drm.c
 
 VL_STUB_SOURCES := \
 	vl/vl_stubs.c
@@ -378,7 +379,9 @@ GALLIVM_SOURCES := \
 	gallivm/lp_bld_flow.h \
 	gallivm/lp_bld_format_aos_array.c \
 	gallivm/lp_bld_format_aos.c \
+	gallivm/lp_bld_format_cached.c \
 	gallivm/lp_bld_format_float.c \
+	gallivm/lp_bld_format.c \
 	gallivm/lp_bld_format.h \
 	gallivm/lp_bld_format_soa.c \
 	gallivm/lp_bld_format_srgb.c \
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index b1e1bcbee04..8435991fb6b 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -625,6 +625,7 @@ generate_vs(struct draw_llvm_variant *variant,
                      inputs,
                      outputs,
                      context_ptr,
+                     NULL,
                      draw_sampler,
                      &llvm->draw->vs.vertex_shader->info,
                      NULL);
@@ -749,7 +750,8 @@ generate_fetch(struct gallivm_state *gallivm,
                                     lp_float32_vec4_type(),
                                     FALSE,
                                     map_ptr,
-                                    zero, zero, zero);
+                                    zero, zero, zero,
+                                    NULL);
       LLVMBuildStore(builder, val, temp_ptr);
    }
    lp_build_endif(&if_ctx);
@@ -2193,6 +2195,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
                      NULL,
                      outputs,
                      context_ptr,
+                     NULL,
                      sampler,
                      &llvm->draw->gs.geometry_shader->info,
                      (const struct lp_build_tgsi_gs_iface *)&gs_iface);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.c b/src/gallium/auxiliary/gallivm/lp_bld_format.c
new file mode 100644
index 00000000000..a82fd8feee8
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.c
@@ -0,0 +1,56 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "lp_bld_format.h"
+
+
+
+LLVMTypeRef
+lp_build_format_cache_type(struct gallivm_state *gallivm)
+{
+   LLVMTypeRef elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_COUNT];
+   LLVMTypeRef s;
+
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_DATA] =
+         LLVMArrayType(LLVMInt32TypeInContext(gallivm->context),
+                       LP_BUILD_FORMAT_CACHE_SIZE * 16);
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_TAGS] =
+         LLVMArrayType(LLVMInt64TypeInContext(gallivm->context),
+                       LP_BUILD_FORMAT_CACHE_SIZE);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL] =
+         LLVMInt64TypeInContext(gallivm->context);
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS] =
+         LLVMInt64TypeInContext(gallivm->context);
+#endif
+
+   s = LLVMStructTypeInContext(gallivm->context, elem_types,
+                               LP_BUILD_FORMAT_CACHE_MEMBER_COUNT, 0);
+
+   return s;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 969f1f6cc94..5c866f420bd 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -44,6 +44,45 @@ struct lp_type;
 struct lp_build_context;
 
 
+#define LP_BUILD_FORMAT_CACHE_DEBUG 0
+/*
+ * Block cache
+ *
+ * Optional block cache to be used when unpacking big pixel blocks.
+ * Must be a power of 2
+ */
+
+#define LP_BUILD_FORMAT_CACHE_SIZE 128
+
+/*
+ * Note: cache_data needs 16 byte alignment.
+ */
+struct lp_build_format_cache
+{
+   PIPE_ALIGN_VAR(16) uint32_t cache_data[LP_BUILD_FORMAT_CACHE_SIZE][4][4];
+   uint64_t cache_tags[LP_BUILD_FORMAT_CACHE_SIZE];
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   uint64_t cache_access_total;
+   uint64_t cache_access_miss;
+#endif
+};
+
+
+enum {
+   LP_BUILD_FORMAT_CACHE_MEMBER_DATA = 0,
+   LP_BUILD_FORMAT_CACHE_MEMBER_TAGS,
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL,
+   LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS,
+#endif
+   LP_BUILD_FORMAT_CACHE_MEMBER_COUNT
+};
+
+
+LLVMTypeRef
+lp_build_format_cache_type(struct gallivm_state *gallivm);
+
+
 /*
  * AoS
  */
@@ -66,7 +105,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offset,
                         LLVMValueRef i,
-                        LLVMValueRef j);
+                        LLVMValueRef j,
+                        LLVMValueRef cache);
 
 LLVMValueRef
 lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
@@ -107,13 +147,13 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                         LLVMValueRef offsets,
                         LLVMValueRef i,
                         LLVMValueRef j,
+                        LLVMValueRef cache,
                         LLVMValueRef rgba_out[4]);
 
 /*
  * YUV
  */
 
-
 LLVMValueRef
 lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
                                    const struct util_format_description *format_desc,
@@ -123,6 +163,18 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
                                    LLVMValueRef i,
                                    LLVMValueRef j);
 
+
+LLVMValueRef
+lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
+                             const struct util_format_description *format_desc,
+                             unsigned n,
+                             LLVMValueRef base_ptr,
+                             LLVMValueRef offset,
+                             LLVMValueRef i,
+                             LLVMValueRef j,
+                             LLVMValueRef cache);
+
+
 /*
  * special float formats
  */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index ddf3ad1dfc6..a41b30bbb96 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -370,7 +370,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offset,
                         LLVMValueRef i,
-                        LLVMValueRef j)
+                        LLVMValueRef j,
+                        LLVMValueRef cache)
 {
    LLVMBuilderRef builder = gallivm->builder;
    unsigned num_pixels = type.length / 4;
@@ -503,6 +504,34 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
    }
 
    /*
+    * s3tc rgb formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_cached_texels(gallivm,
+                                         format_desc,
+                                         num_pixels,
+                                         base_ptr,
+                                         offset,
+                                         i, j,
+                                         cache);
+
+      lp_build_conv(gallivm,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+       return tmp;
+   }
+
+   /*
     * Fallback to util_format_description::fetch_rgba_8unorm().
     */
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
new file mode 100644
index 00000000000..b683e7f960c
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
@@ -0,0 +1,374 @@
+/**************************************************************************
+ *
+ * Copyright 2015 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_bld_format.h"
+#include "lp_bld_type.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_const.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_swizzle.h"
+
+#include "util/u_math.h"
+
+
+/**
+ * @file
+ * Complex block-compression based formats are handled here by using a cache,
+ * so re-decoding of every pixel is not required.
+ * Especially for bilinear filtering, texel reuse is very high hence even
+ * a small cache helps.
+ * The elements in the cache are the decoded blocks - currently things
+ * are restricted to formats which are 4x4 block based, and the decoded
+ * texels must fit into 4x8 bits.
+ * The cache is direct mapped so hitrates aren't all that great and cache
+ * thrashing could happen.
+ *
+ * @author Roland Scheidegger <[email protected]>
+ */
+
+
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+static void
+update_cache_access(struct gallivm_state *gallivm,
+                    LLVMValueRef ptr,
+                    unsigned count,
+                    unsigned index)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef member_ptr, cache_access;
+
+   assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
+          index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+
+   member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
+   cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
+   cache_access = LLVMBuildAdd(builder, cache_access,
+                               LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
+                                                                   count, 0), "");
+   LLVMBuildStore(builder, cache_access, member_ptr);
+}
+#endif
+
+
+static void
+store_cached_block(struct gallivm_state *gallivm,
+                   LLVMValueRef *col,
+                   LLVMValueRef tag_value,
+                   LLVMValueRef hash_index,
+                   LLVMValueRef cache)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef ptr, indices[3];
+   LLVMTypeRef type_ptr4x32;
+   unsigned count;
+
+   type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
+   indices[0] = lp_build_const_int32(gallivm, 0);
+   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
+   indices[2] = hash_index;
+   ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), "");
+   LLVMBuildStore(builder, tag_value, ptr);
+
+   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
+   hash_index = LLVMBuildMul(builder, hash_index,
+                             lp_build_const_int32(gallivm, 16), "");
+   for (count = 0; count < 4; count++) {
+      indices[2] = hash_index;
+      ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), "");
+      ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
+      LLVMBuildStore(builder, col[count], ptr);
+      hash_index = LLVMBuildAdd(builder, hash_index,
+                                lp_build_const_int32(gallivm, 4), "");
+   }
+}
+
+
+static LLVMValueRef
+lookup_cached_pixel(struct gallivm_state *gallivm,
+                    LLVMValueRef ptr,
+                    LLVMValueRef index)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef member_ptr, indices[3];
+
+   indices[0] = lp_build_const_int32(gallivm, 0);
+   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
+   indices[2] = index;
+   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   return LLVMBuildLoad(builder, member_ptr, "cache_data");
+}
+
+
+static LLVMValueRef
+lookup_tag_data(struct gallivm_state *gallivm,
+                LLVMValueRef ptr,
+                LLVMValueRef index)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef member_ptr, indices[3];
+
+   indices[0] = lp_build_const_int32(gallivm, 0);
+   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
+   indices[2] = index;
+   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   return LLVMBuildLoad(builder, member_ptr, "tag_data");
+}
+
+
+static void
+update_cached_block(struct gallivm_state *gallivm,
+                    const struct util_format_description *format_desc,
+                    LLVMValueRef ptr_addr,
+                    LLVMValueRef hash_index,
+                    LLVMValueRef cache)
+
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+   LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+   LLVMValueRef function;
+   LLVMValueRef tag_value, tmp_ptr;
+   LLVMValueRef col[4];
+   unsigned i, j;
+
+   /*
+    * Use format_desc->fetch_rgba_8unorm() for each pixel in the block.
+    * This doesn't actually make any sense whatsoever, someone would need
+    * to write a function doing this for all pixels in a block (either as
+    * an external c function or with generated code). Don't ask.
+    */
+
+   {
+      /*
+       * Function to call looks like:
+       *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+       */
+      LLVMTypeRef ret_type;
+      LLVMTypeRef arg_types[4];
+      LLVMTypeRef function_type;
+
+      assert(format_desc->fetch_rgba_8unorm);
+
+      ret_type = LLVMVoidTypeInContext(gallivm->context);
+      arg_types[0] = pi8t;
+      arg_types[1] = pi8t;
+      arg_types[2] = i32t;
+      arg_types[3] = i32t;
+      function_type = LLVMFunctionType(ret_type, arg_types,
+                                       Elements(arg_types), 0);
+
+      /* make const pointer for the C fetch_rgba_8unorm function */
+      function = lp_build_const_int_pointer(gallivm,
+         func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
+
+      /* cast the callee pointer to the function's type */
+      function = LLVMBuildBitCast(builder, function,
+                                  LLVMPointerType(function_type, 0),
+                                  "cast callee");
+   }
+
+   tmp_ptr = lp_build_array_alloca(gallivm, i32x4,
+                                   lp_build_const_int32(gallivm, 16),
+                                   "tmp_decode_store");
+   tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
+
+   /*
+    * Invoke format_desc->fetch_rgba_8unorm() for each pixel.
+    * This is going to be really really slow.
+    * Note: the block store format is actually
+    * x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ...
+    */
+   for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j) {
+         LLVMValueRef args[4];
+         LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4);
+
+         /*
+          * Note we actually supply a pointer to the start of the block,
+          * not the start of the texture.
+          */
+         args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, "");
+         args[1] = ptr_addr;
+         args[2] = LLVMConstInt(i32t, i, 0);
+         args[3] = LLVMConstInt(i32t, j, 0);
+         LLVMBuildCall(builder, function, args, Elements(args), "");
+      }
+   }
+
+   /* Finally store the block - pointless mem copy + update tag. */
+   tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), "");
+   for (i = 0; i < 4; ++i) {
+      LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i);
+      LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, "");
+      col[i] = LLVMBuildLoad(builder, ptr, "");
+   }
+
+   tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
+                                 LLVMInt64TypeInContext(gallivm->context), "");
+   store_cached_block(gallivm, col, tag_value, hash_index, cache);
+}
+
+
+/*
+ * Do a cached lookup.
+ *
+ * Returns (vectors of) 4x8 rgba aos value
+ */
+LLVMValueRef
+lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
+                             const struct util_format_description *format_desc,
+                             unsigned n,
+                             LLVMValueRef base_ptr,
+                             LLVMValueRef offset,
+                             LLVMValueRef i,
+                             LLVMValueRef j,
+                             LLVMValueRef cache)
+
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned count, low_bit, log2size;
+   LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
+   LLVMValueRef ij_index, hash_index, hash_mask, block_index;
+   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
+   struct lp_type type;
+   struct lp_build_context bld32;
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   assert(format_desc->block.width == 4);
+   assert(format_desc->block.height == 4);
+
+   lp_build_context_init(&bld32, gallivm, type);
+
+   /*
+    * compute hash - we use direct mapped cache, the hash function could
+    *                be better but it needs to be simple
+    * per-element:
+    *    compare offset with offset stored at tag (hash)
+    *    if not equal decode/store block, update tag
+    *    extract color from cache
+    *    assemble result vector
+    */
+
+   /* TODO: not ideal with 32bit pointers... */
+
+   low_bit = util_logbase2(format_desc->block.bits / 8);
+   log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
+   addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
+   ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
+   ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
+   /* For the hash function, first mask off the unused lowest bits. Then just
+      do some xor with address bits - only use lower 32bits */
+   ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
+   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
+                                 lp_build_const_int_vec(gallivm, type, low_bit), "");
+   /* This only really makes sense for size 64,128,256 */
+   hash_index = ptr_addrtrunc;
+   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
+                                 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
+   hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
+   tmp = LLVMBuildLShr(builder, hash_index,
+                       lp_build_const_int_vec(gallivm, type, log2size), "");
+   hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
+
+   hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
+   hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
+   ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
+   ij_index = LLVMBuildAdd(builder, ij_index, j, "");
+   block_index = LLVMBuildShl(builder, hash_index,
+                              lp_build_const_int_vec(gallivm, type, 4), "");
+   block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
+
+   if (n > 1) {
+      color = LLVMGetUndef(LLVMVectorType(i32t, n));
+      for (count = 0; count < n; count++) {
+         LLVMValueRef index, cond, colorx;
+         LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
+         struct lp_build_if_state if_ctx;
+
+         index = lp_build_const_int32(gallivm, count);
+         offsetx = LLVMBuildExtractElement(builder, offset, index, "");
+         addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
+         addrx = LLVMBuildAdd(builder, addrx, addr, "");
+         block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
+         hash_indexx = LLVMBuildLShr(builder, block_indexx,
+                                     lp_build_const_int32(gallivm, 4), "");
+         offset_stored = lookup_tag_data(gallivm, cache, hash_indexx);
+         cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
+
+         lp_build_if(&if_ctx, gallivm, cond);
+         {
+            ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
+                                          LLVMPointerType(i8t, 0), "");
+            update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+            update_cache_access(gallivm, cache, 1,
+                                LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+#endif
+         }
+         lp_build_endif(&if_ctx);
+
+         colorx = lookup_cached_pixel(gallivm, cache, block_indexx);
+
+         color = LLVMBuildInsertElement(builder, color, colorx,
+                                        lp_build_const_int32(gallivm, count), "");
+      }
+   }
+   else {
+      LLVMValueRef cond;
+      struct lp_build_if_state if_ctx;
+
+      tmp = LLVMBuildZExt(builder, offset, i64t, "");
+      addr = LLVMBuildAdd(builder, tmp, addr, "");
+      offset_stored = lookup_tag_data(gallivm, cache, hash_index);
+      cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
+
+      lp_build_if(&if_ctx, gallivm, cond);
+      {
+         tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
+         update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+         update_cache_access(gallivm, cache, 1,
+                             LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+#endif
+      }
+      lp_build_endif(&if_ctx);
+
+      color = lookup_cached_pixel(gallivm, cache, block_index);
+   }
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   update_cache_access(gallivm, cache, n,
+                       LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
+#endif
+   return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index afaabc08790..8bae94af3d7 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -346,6 +346,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
  *              these will always be (0,0).  For compressed formats, i will
  *              be in [0, block_width-1] and j will be in [0, block_height-1].
+ * \param cache  optional value pointing to a lp_build_format_cache structure
  */
 void
 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
@@ -355,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                         LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j,
+                        LLVMValueRef cache,
                         LLVMValueRef rgba_out[4])
 {
    LLVMBuilderRef builder = gallivm->builder;
@@ -473,7 +475,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
       tmp_type.norm = TRUE;
 
       tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
-                                    TRUE, base_ptr, offset, i, j);
+                                    TRUE, base_ptr, offset, i, j, cache);
 
       lp_build_rgba8_to_fi32_soa(gallivm,
                                 type,
@@ -483,6 +485,39 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
       return;
    }
 
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC &&
+       /* non-srgb case is already handled above */
+       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
+       type.floating && type.width == 32 &&
+       (type.length == 1 || (type.length % 4 == 0)) &&
+       cache) {
+      const struct util_format_description *format_decompressed;
+      const struct util_format_description *flinear_desc;
+      LLVMValueRef packed;
+      flinear_desc = util_format_description(util_format_linear(format_desc->format));
+      packed = lp_build_fetch_cached_texels(gallivm,
+                                            flinear_desc,
+                                            type.length,
+                                            base_ptr,
+                                            offset,
+                                            i, j,
+                                            cache);
+      packed = LLVMBuildBitCast(builder, packed,
+                                lp_build_int_vec_type(gallivm, type), "");
+      /*
+       * The values are now packed so they match ordinary srgb RGBA8 format,
+       * hence need to use matching format for unpack.
+       */
+      format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
+
+      lp_build_unpack_rgba_soa(gallivm,
+                               format_decompressed,
+                               type,
+                               packed, rgba_out);
+
+      return;
+   }
+
    /*
     * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
     *
@@ -524,7 +559,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
          /* Get a single float[4]={R,G,B,A} pixel */
          tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                        TRUE, base_ptr, offset_elem,
-                                       i_elem, j_elem);
+                                       i_elem, j_elem, cache);
 
          /*
           * Insert the AoS tmp value channels into the SoA result vectors at
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index eba758da6ae..a6f0eff42f6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -99,6 +99,7 @@ struct lp_sampler_params
    unsigned sampler_index;
    unsigned sample_key;
    LLVMValueRef context_ptr;
+   LLVMValueRef thread_data_ptr;
    const LLVMValueRef *coords;
    const LLVMValueRef *offsets;
    LLVMValueRef lod;
@@ -267,6 +268,17 @@ struct lp_sampler_dynamic_state
                    struct gallivm_state *gallivm,
                    LLVMValueRef context_ptr,
                    unsigned sampler_unit);
+
+   /** 
+    * Obtain texture cache (returns ptr to lp_build_format_cache).
+    *
+    * It's optional: no caching will be done if it's NULL.
+    */
+   LLVMValueRef
+   (*cache_ptr)(const struct lp_sampler_dynamic_state *state,
+                struct gallivm_state *gallivm,
+                LLVMValueRef thread_data_ptr,
+                unsigned unit);
 };
 
 
@@ -356,6 +368,7 @@ struct lp_build_sample_context
    LLVMValueRef img_stride_array;
    LLVMValueRef base_ptr;
    LLVMValueRef mip_offsets;
+   LLVMValueRef cache;
 
    /** Integer vector with texture width, height, depth */
    LLVMValueRef int_size;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index d7fde810a76..729c5b8f6ef 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -593,7 +593,8 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                                       TRUE,
                                       data_ptr, offset,
                                       x_subcoord,
-                                      y_subcoord);
+                                      y_subcoord,
+                                      bld->cache);
    }
 
    *colors = rgba8;
@@ -933,7 +934,8 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                                TRUE,
                                                data_ptr, offset[k][j][i],
                                                x_subcoord[i],
-                                               y_subcoord[j]);
+                                               y_subcoord[j],
+                                               bld->cache);
             }
 
             neighbors[k][j][i] = rgba8;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 26bfa0d2677..e21933ffc85 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -161,6 +161,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                            bld->texel_type,
                            data_ptr, offset,
                            i, j,
+                           bld->cache,
                            texel_out);
 
    /*
@@ -2389,6 +2390,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
                            bld->texel_type,
                            bld->base_ptr, offset,
                            i, j,
+                           bld->cache,
                            colors_out);
 
    if (out_of_bound_ret_zero) {
@@ -2442,6 +2444,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                          unsigned texture_index,
                          unsigned sampler_index,
                          LLVMValueRef context_ptr,
+                         LLVMValueRef thread_data_ptr,
                          const LLVMValueRef *coords,
                          const LLVMValueRef *offsets,
                          const struct lp_derivatives *derivs, /* optional */
@@ -2707,6 +2710,11 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                                                 context_ptr, texture_index);
    /* Note that mip_offsets is an array[level] of offsets to texture images */
 
+   if (dynamic_state->cache_ptr && thread_data_ptr) {
+      bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
+                                           thread_data_ptr, texture_index);
+   }
+
    /* width, height, depth as single int vector */
    if (dims <= 1) {
       bld.int_size = tex_width;
@@ -2883,6 +2891,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
          bld4.base_ptr = bld.base_ptr;
          bld4.mip_offsets = bld.mip_offsets;
          bld4.int_size = bld.int_size;
+         bld4.cache = bld.cache;
 
          bld4.vector_width = lp_type_width(type4);
 
@@ -3081,12 +3090,14 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
    LLVMValueRef offsets[3] = { NULL };
    LLVMValueRef lod = NULL;
    LLVMValueRef context_ptr;
+   LLVMValueRef thread_data_ptr = NULL;
    LLVMValueRef texel_out[4];
    struct lp_derivatives derivs;
    struct lp_derivatives *deriv_ptr = NULL;
    unsigned num_param = 0;
    unsigned i, num_coords, num_derivs, num_offsets, layer;
    enum lp_sampler_lod_control lod_control;
+   boolean need_cache = FALSE;
 
    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                     LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3094,8 +3105,19 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
    get_target_info(static_texture_state->target,
                    &num_coords, &num_derivs, &num_offsets, &layer);
 
+   if (dynamic_state->cache_ptr) {
+      const struct util_format_description *format_desc;
+      format_desc = util_format_description(static_texture_state->format);
+      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+         need_cache = TRUE;
+      }
+   }
+
    /* "unpack" arguments */
    context_ptr = LLVMGetParam(function, num_param++);
+   if (need_cache) {
+      thread_data_ptr = LLVMGetParam(function, num_param++);
+   }
    for (i = 0; i < num_coords; i++) {
       coords[i] = LLVMGetParam(function, num_param++);
    }
@@ -3146,6 +3168,7 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
                             texture_index,
                             sampler_index,
                             context_ptr,
+                            thread_data_ptr,
                             coords,
                             offsets,
                             deriv_ptr,
@@ -3189,6 +3212,7 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
    const LLVMValueRef *offsets = params->offsets;
    const struct lp_derivatives *derivs = params->derivs;
    enum lp_sampler_lod_control lod_control;
+   boolean need_cache = FALSE;
 
    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                     LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3196,6 +3220,17 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
    get_target_info(static_texture_state->target,
                    &num_coords, &num_derivs, &num_offsets, &layer);
 
+   if (dynamic_state->cache_ptr) {
+      const struct util_format_description *format_desc;
+      format_desc = util_format_description(static_texture_state->format);
+      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+         /*
+          * This is not 100% correct, if we have cache but the
+          * util_format_s3tc_prefer is true the cache won't get used
+          * regardless (could hook up the block decode there...) */
+         need_cache = TRUE;
+      }
+   }
    /*
     * texture function matches are found by name.
     * Thus the name has to include both the texture and sampler unit
@@ -3221,6 +3256,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
        */
 
       arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
+      if (need_cache) {
+         arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
+      }
       for (i = 0; i < num_coords; i++) {
          arg_types[num_param++] = LLVMTypeOf(coords[0]);
          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
@@ -3280,6 +3318,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
 
    num_args = 0;
    args[num_args++] = params->context_ptr;
+   if (need_cache) {
+      args[num_args++] = params->thread_data_ptr;
+   }
    for (i = 0; i < num_coords; i++) {
       args[num_args++] = coords[i];
    }
@@ -3384,6 +3425,7 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
                                params->texture_index,
                                params->sampler_index,
                                params->context_ptr,
+                               params->thread_data_ptr,
                                params->coords,
                                params->offsets,
                                params->derivs,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 2ca9c6194b3..cc4549778a3 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -230,6 +230,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
                   const LLVMValueRef (*inputs)[4],
                   LLVMValueRef (*outputs)[4],
                   LLVMValueRef context_ptr,
+                  LLVMValueRef thread_data_ptr,
                   struct lp_build_sampler_soa *sampler,
                   const struct tgsi_shader_info *info,
                   const struct lp_build_tgsi_gs_iface *gs_iface);
@@ -447,6 +448,7 @@ struct lp_build_tgsi_soa_context
    const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS];
    LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS];
    LLVMValueRef context_ptr;
+   LLVMValueRef thread_data_ptr;
 
    const struct lp_build_sampler_soa *sampler;
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index fae604e2f9c..7d2cd9a9e73 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -2321,6 +2321,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    params.texture_index = unit;
    params.sampler_index = unit;
    params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
    params.coords = coords;
    params.offsets = offsets;
    params.lod = lod;
@@ -2488,6 +2489,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
    params.texture_index = texture_unit;
    params.sampler_index = sampler_unit;
    params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
    params.coords = coords;
    params.offsets = offsets;
    params.lod = lod;
@@ -2608,6 +2610,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
    params.texture_index = unit;
    params.sampler_index = unit;
    params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
    params.coords = coords;
    params.offsets = offsets;
    params.derivs = NULL;
@@ -3858,6 +3861,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
                   const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
                   LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
                   LLVMValueRef context_ptr,
+                  LLVMValueRef thread_data_ptr,
                   struct lp_build_sampler_soa *sampler,
                   const struct tgsi_shader_info *info,
                   const struct lp_build_tgsi_gs_iface *gs_iface)
@@ -3893,6 +3897,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.info = info;
    bld.indirect_files = info->indirect_files;
    bld.context_ptr = context_ptr;
+   bld.thread_data_ptr = thread_data_ptr;
 
    /*
     * If the number of temporaries is rather large then we just
diff --git a/src/gallium/auxiliary/hud/hud_cpu.c b/src/gallium/auxiliary/hud/hud_cpu.c
index cd20deec9bd..c06e7770d63 100644
--- a/src/gallium/auxiliary/hud/hud_cpu.c
+++ b/src/gallium/auxiliary/hud/hud_cpu.c
@@ -33,6 +33,58 @@
 #include "util/u_memory.h"
 #include <stdio.h>
 #include <inttypes.h>
+#ifdef PIPE_OS_WINDOWS
+#include <windows.h>
+#endif
+
+
+#ifdef PIPE_OS_WINDOWS
+
+static inline uint64_t
+filetime_to_scalar(FILETIME ft)
+{
+   ULARGE_INTEGER uli;
+   uli.LowPart = ft.dwLowDateTime;
+   uli.HighPart = ft.dwHighDateTime;
+   return uli.QuadPart;
+}
+
+static boolean
+get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time)
+{
+   SYSTEM_INFO sysInfo;
+   FILETIME ftNow, ftCreation, ftExit, ftKernel, ftUser;
+
+   GetSystemInfo(&sysInfo);
+   assert(sysInfo.dwNumberOfProcessors >= 1);
+   if (cpu_index != ALL_CPUS && cpu_index >= sysInfo.dwNumberOfProcessors) {
+      /* Tell hud_get_num_cpus there are only this many CPUs. */
+      return FALSE;
+   }
+
+   /* Get accumulated user and sys time for all threads */
+   if (!GetProcessTimes(GetCurrentProcess(), &ftCreation, &ftExit,
+                        &ftKernel, &ftUser))
+      return FALSE;
+
+   GetSystemTimeAsFileTime(&ftNow);
+
+   *busy_time = filetime_to_scalar(ftUser) + filetime_to_scalar(ftKernel);
+   *total_time = filetime_to_scalar(ftNow) - filetime_to_scalar(ftCreation);
+
+   /* busy_time already has the time accross all cpus.
+    * XXX: if we want 100% to mean one CPU, 200% two cpus, eliminate the
+    * following line.
+    */
+   *total_time *= sysInfo.dwNumberOfProcessors;
+
+   /* XXX: we ignore cpu_index, i.e, we assume that the individual CPU usage
+    * and the system usage are one and the same.
+    */
+   return TRUE;
+}
+
+#else
 
 static boolean
 get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time)
@@ -81,6 +133,8 @@ get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time)
    fclose(f);
    return FALSE;
 }
+#endif
+
 
 struct cpu_info {
    unsigned cpu_index;
diff --git a/src/gallium/auxiliary/indices/u_indices.c b/src/gallium/auxiliary/indices/u_indices.c
index c25594b4b7a..436f8f008cb 100644
--- a/src/gallium/auxiliary/indices/u_indices.c
+++ b/src/gallium/auxiliary/indices/u_indices.c
@@ -68,17 +68,18 @@ static void translate_memcpy_uint( const void *in,
  * \param out_nr  returns number of new vertices
  * \param out_translate  returns the translation function to use by the caller
  */
-int u_index_translator( unsigned hw_mask,
-                        unsigned prim,
-                        unsigned in_index_size,
-                        unsigned nr,
-                        unsigned in_pv,
-                        unsigned out_pv,
-                        unsigned prim_restart,
-                        unsigned *out_prim,
-                        unsigned *out_index_size,
-                        unsigned *out_nr,
-                        u_translate_func *out_translate )
+enum indices_mode
+u_index_translator(unsigned hw_mask,
+                   unsigned prim,
+                   unsigned in_index_size,
+                   unsigned nr,
+                   unsigned in_pv,
+                   unsigned out_pv,
+                   unsigned prim_restart,
+                   unsigned *out_prim,
+                   unsigned *out_index_size,
+                   unsigned *out_nr,
+                   u_translate_func *out_translate)
 {
    unsigned in_idx;
    unsigned out_idx;
@@ -204,17 +205,17 @@ int u_index_translator( unsigned hw_mask,
  * \param out_nr  returns new number of vertices to draw
  * \param out_generate  returns pointer to the generator function
  */
-int u_index_generator( unsigned hw_mask,
-                       unsigned prim,
-                       unsigned start,
-                       unsigned nr,
-                       unsigned in_pv,
-                       unsigned out_pv,
-                       unsigned *out_prim,
-                       unsigned *out_index_size,
-                       unsigned *out_nr,
-                       u_generate_func *out_generate )
-
+enum indices_mode
+u_index_generator(unsigned hw_mask,
+                  unsigned prim,
+                  unsigned start,
+                  unsigned nr,
+                  unsigned in_pv,
+                  unsigned out_pv,
+                  unsigned *out_prim,
+                  unsigned *out_index_size,
+                  unsigned *out_nr,
+                  u_generate_func *out_generate)
 {
    unsigned out_idx;
 
diff --git a/src/gallium/auxiliary/indices/u_indices.h b/src/gallium/auxiliary/indices/u_indices.h
index e01201e4b04..4483eb81337 100644
--- a/src/gallium/auxiliary/indices/u_indices.h
+++ b/src/gallium/auxiliary/indices/u_indices.h
@@ -67,66 +67,68 @@ typedef void (*u_generate_func)( unsigned start,
 /* Return codes describe the translate/generate operation.  Caller may
  * be able to reuse translated indices under some circumstances.
  */
-#define U_TRANSLATE_ERROR  -1
-#define U_TRANSLATE_NORMAL  1
-#define U_TRANSLATE_MEMCPY  2
-#define U_GENERATE_LINEAR   3
-#define U_GENERATE_REUSABLE 4
-#define U_GENERATE_ONE_OFF  5
-
+enum indices_mode {
+   U_TRANSLATE_ERROR = -1,
+   U_TRANSLATE_NORMAL = 1,
+   U_TRANSLATE_MEMCPY = 2,
+   U_GENERATE_LINEAR  = 3,
+   U_GENERATE_REUSABLE= 4,
+   U_GENERATE_ONE_OFF = 5,
+};
 
 void u_index_init( void );
 
-int u_index_translator( unsigned hw_mask,
-                        unsigned prim,
-                        unsigned in_index_size,
-                        unsigned nr,
-                        unsigned in_pv,   /* API */
-                        unsigned out_pv,  /* hardware */
-                        unsigned prim_restart,
-                        unsigned *out_prim,
-                        unsigned *out_index_size,
-                        unsigned *out_nr,
-                        u_translate_func *out_translate );
+enum indices_mode
+u_index_translator(unsigned hw_mask,
+                   unsigned prim,
+                   unsigned in_index_size,
+                   unsigned nr,
+                   unsigned in_pv,   /* API */
+                   unsigned out_pv,  /* hardware */
+                   unsigned prim_restart,
+                   unsigned *out_prim,
+                   unsigned *out_index_size,
+                   unsigned *out_nr,
+                   u_translate_func *out_translate);
 
 /* Note that even when generating it is necessary to know what the
  * API's PV is, as the indices generated will depend on whether it is
  * the same as hardware or not, and in the case of triangle strips,
  * whether it is first or last.
  */
-int u_index_generator( unsigned hw_mask,
-                       unsigned prim,
-                       unsigned start,
-                       unsigned nr,
-                       unsigned in_pv,   /* API */
-                       unsigned out_pv,  /* hardware */
-                       unsigned *out_prim,
-                       unsigned *out_index_size,
-                       unsigned *out_nr,
-                       u_generate_func *out_generate );
+enum indices_mode
+u_index_generator(unsigned hw_mask,
+                  unsigned prim,
+                  unsigned start,
+                  unsigned nr,
+                  unsigned in_pv,   /* API */
+                  unsigned out_pv,  /* hardware */
+                  unsigned *out_prim,
+                  unsigned *out_index_size,
+                  unsigned *out_nr,
+                  u_generate_func *out_generate);
 
 
 void u_unfilled_init( void );
 
-int u_unfilled_translator( unsigned prim,
-                           unsigned in_index_size,
-                           unsigned nr,
-                           unsigned unfilled_mode,
-                           unsigned *out_prim,
-                           unsigned *out_index_size,
-                           unsigned *out_nr,
-                           u_translate_func *out_translate );
-
-int u_unfilled_generator( unsigned prim,
-                          unsigned start,
-                          unsigned nr,
-                          unsigned unfilled_mode,
-                          unsigned *out_prim,
-                          unsigned *out_index_size,
-                          unsigned *out_nr,
-                          u_generate_func *out_generate );
-
-
-
+enum indices_mode
+u_unfilled_translator(unsigned prim,
+                      unsigned in_index_size,
+                      unsigned nr,
+                      unsigned unfilled_mode,
+                      unsigned *out_prim,
+                      unsigned *out_index_size,
+                      unsigned *out_nr,
+                      u_translate_func *out_translate);
+
+enum indices_mode
+u_unfilled_generator(unsigned prim,
+                     unsigned start,
+                     unsigned nr,
+                     unsigned unfilled_mode,
+                     unsigned *out_prim,
+                     unsigned *out_index_size,
+                     unsigned *out_nr,
+                     u_generate_func *out_generate);
 
 #endif
diff --git a/src/gallium/auxiliary/indices/u_unfilled_indices.c b/src/gallium/auxiliary/indices/u_unfilled_indices.c
index 121877a60fb..fc974f8b946 100644
--- a/src/gallium/auxiliary/indices/u_unfilled_indices.c
+++ b/src/gallium/auxiliary/indices/u_unfilled_indices.c
@@ -111,14 +111,15 @@ static unsigned nr_lines( unsigned prim,
                               
 
 
-int u_unfilled_translator( unsigned prim,
-                        unsigned in_index_size,
-                        unsigned nr,
-                        unsigned unfilled_mode,
-                        unsigned *out_prim,
-                        unsigned *out_index_size,
-                        unsigned *out_nr,
-                        u_translate_func *out_translate )
+enum indices_mode
+u_unfilled_translator(unsigned prim,
+                      unsigned in_index_size,
+                      unsigned nr,
+                      unsigned unfilled_mode,
+                      unsigned *out_prim,
+                      unsigned *out_index_size,
+                      unsigned *out_nr,
+                      u_translate_func *out_translate)
 {
    unsigned in_idx;
    unsigned out_idx;
@@ -170,14 +171,15 @@ int u_unfilled_translator( unsigned prim,
  * different front/back fill modes, that can be handled with the
  * 'draw' module.
  */
-int u_unfilled_generator( unsigned prim,
-                          unsigned start,
-                          unsigned nr,
-                          unsigned unfilled_mode,
-                          unsigned *out_prim,
-                          unsigned *out_index_size,
-                          unsigned *out_nr,
-                          u_generate_func *out_generate )
+enum indices_mode
+u_unfilled_generator(unsigned prim,
+                     unsigned start,
+                     unsigned nr,
+                     unsigned unfilled_mode,
+                     unsigned *out_prim,
+                     unsigned *out_index_size,
+                     unsigned *out_nr,
+                     u_generate_func *out_generate)
 {
    unsigned out_idx;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 89369d60f4e..fc29a2398aa 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -95,6 +95,7 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
    "TESSOUTER",
    "TESSINNER",
    "VERTICESIN",
+   "HELPER_INVOCATION",
 };
 
 const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index b7b1ece389b..fccc92c06c4 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -70,7 +70,7 @@ struct blitter_context_priv
    /* Constant state objects. */
    /* Vertex shaders. */
    void *vs; /**< Vertex shader which passes {pos, generic} to the output.*/
-   void *vs_pos_only; /**< Vertex shader which passes pos to the output.*/
+   void *vs_pos_only[4]; /**< Vertex shader which passes pos to the output.*/
    void *vs_layered; /**< Vertex shader which sets LAYER = INSTANCEID. */
 
    /* Fragment shaders. */
@@ -325,27 +325,29 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    return &ctx->base;
 }
 
-static void bind_vs_pos_only(struct blitter_context_priv *ctx)
+static void bind_vs_pos_only(struct blitter_context_priv *ctx,
+                             unsigned num_so_channels)
 {
    struct pipe_context *pipe = ctx->base.pipe;
+   int index = num_so_channels ? num_so_channels - 1 : 0;
 
-   if (!ctx->vs_pos_only) {
+   if (!ctx->vs_pos_only[index]) {
       struct pipe_stream_output_info so;
       const uint semantic_names[] = { TGSI_SEMANTIC_POSITION };
       const uint semantic_indices[] = { 0 };
 
       memset(&so, 0, sizeof(so));
       so.num_outputs = 1;
-      so.output[0].num_components = 1;
-      so.stride[0] = 1;
+      so.output[0].num_components = num_so_channels;
+      so.stride[0] = num_so_channels;
 
-      ctx->vs_pos_only =
+      ctx->vs_pos_only[index] =
          util_make_vertex_passthrough_shader_with_so(pipe, 1, semantic_names,
                                                      semantic_indices, FALSE,
                                                      &so);
    }
 
-   pipe->bind_vs_state(pipe, ctx->vs_pos_only);
+   pipe->bind_vs_state(pipe, ctx->vs_pos_only[index]);
 }
 
 static void bind_vs_passthrough(struct blitter_context_priv *ctx)
@@ -441,8 +443,9 @@ void util_blitter_destroy(struct blitter_context *blitter)
       pipe->delete_rasterizer_state(pipe, ctx->rs_discard_state);
    if (ctx->vs)
       pipe->delete_vs_state(pipe, ctx->vs);
-   if (ctx->vs_pos_only)
-      pipe->delete_vs_state(pipe, ctx->vs_pos_only);
+   for (i = 0; i < 4; i++)
+      if (ctx->vs_pos_only[i])
+         pipe->delete_vs_state(pipe, ctx->vs_pos_only[i]);
    if (ctx->vs_layered)
       pipe->delete_vs_state(pipe, ctx->vs_layered);
    pipe->delete_vertex_elements_state(pipe, ctx->velem_state);
@@ -2036,7 +2039,7 @@ void util_blitter_copy_buffer(struct blitter_context *blitter,
 
    pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb);
    pipe->bind_vertex_elements_state(pipe, ctx->velem_state_readbuf[0]);
-   bind_vs_pos_only(ctx);
+   bind_vs_pos_only(ctx, 1);
    if (ctx->has_geometry_shader)
       pipe->bind_gs_state(pipe, NULL);
    if (ctx->has_tessellation) {
@@ -2103,7 +2106,7 @@ void util_blitter_clear_buffer(struct blitter_context *blitter,
    pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb);
    pipe->bind_vertex_elements_state(pipe,
                                     ctx->velem_state_readbuf[num_channels-1]);
-   bind_vs_pos_only(ctx);
+   bind_vs_pos_only(ctx, num_channels);
    if (ctx->has_geometry_shader)
       pipe->bind_gs_state(pipe, NULL);
    if (ctx->has_tessellation) {
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 7388a499c74..702953673ba 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -70,6 +70,20 @@ void _debug_vprintf(const char *format, va_list ap)
 #endif
 }
 
+void
+_pipe_debug_message(
+   struct pipe_debug_callback *cb,
+   unsigned *id,
+   enum pipe_debug_type type,
+   const char *fmt, ...)
+{
+   va_list args;
+   va_start(args, fmt);
+   if (cb && cb->debug_message)
+      cb->debug_message(cb->data, id, type, fmt, args);
+   va_end(args);
+}
+
 
 void
 debug_disable_error_message_boxes(void)
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 926063a1918..aaf223c6f68 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -42,6 +42,7 @@
 #include "os/os_misc.h"
 
 #include "pipe/p_format.h"
+#include "pipe/p_defines.h"
 
 
 #ifdef	__cplusplus
@@ -262,6 +263,25 @@ void _debug_assert_fail(const char *expr,
    _debug_printf("error: %s\n", __msg)
 #endif
 
+/**
+ * Output a debug log message to the debug info callback.
+ */
+#define pipe_debug_message(cb, type, fmt, ...) do { \
+   static unsigned id = 0; \
+   _pipe_debug_message(cb, &id, \
+                       PIPE_DEBUG_TYPE_ ## type, \
+                       fmt, __VA_ARGS__); \
+} while (0)
+
+struct pipe_debug_callback;
+
+void
+_pipe_debug_message(
+   struct pipe_debug_callback *cb,
+   unsigned *id,
+   enum pipe_debug_type type,
+   const char *fmt, ...) _util_printf_format(4, 5);
+
 
 /**
  * Used by debug_dump_enum and debug_dump_flags to describe symbols.
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index b31ada138b8..54e9e717104 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -998,26 +998,30 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
    return PIPE_OK;
 }
 
-static boolean u_vbuf_need_minmax_index(struct u_vbuf *mgr)
+static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr)
 {
    /* See if there are any per-vertex attribs which will be uploaded or
     * translated. Use bitmasks to get the info instead of looping over vertex
     * elements. */
    return (mgr->ve->used_vb_mask &
-           ((mgr->user_vb_mask | mgr->incompatible_vb_mask |
+           ((mgr->user_vb_mask |
+             mgr->incompatible_vb_mask |
              mgr->ve->incompatible_vb_mask_any) &
-            mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask)) != 0;
+            mgr->ve->noninstance_vb_mask_any &
+            mgr->nonzero_stride_vb_mask)) != 0;
 }
 
-static boolean u_vbuf_mapping_vertex_buffer_blocks(struct u_vbuf *mgr)
+static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr)
 {
    /* Return true if there are hw buffers which don't need to be translated.
     *
     * We could query whether each buffer is busy, but that would
     * be way more costly than this. */
    return (mgr->ve->used_vb_mask &
-           (~mgr->user_vb_mask & ~mgr->incompatible_vb_mask &
-            mgr->ve->compatible_vb_mask_all & mgr->ve->noninstance_vb_mask_any &
+           (~mgr->user_vb_mask &
+            ~mgr->incompatible_vb_mask &
+            mgr->ve->compatible_vb_mask_all &
+            mgr->ve->noninstance_vb_mask_any &
             mgr->nonzero_stride_vb_mask)) != 0;
 }
 
diff --git a/src/gallium/auxiliary/vl/vl_video_buffer.c b/src/gallium/auxiliary/vl/vl_video_buffer.c
index 5e0ae0ecb8b..6cd2557a892 100644
--- a/src/gallium/auxiliary/vl/vl_video_buffer.c
+++ b/src/gallium/auxiliary/vl/vl_video_buffer.c
@@ -62,6 +62,18 @@ const enum pipe_format const_resource_formats_VUYA[3] = {
    PIPE_FORMAT_NONE
 };
 
+const enum pipe_format const_resource_formats_YUVX[3] = {
+   PIPE_FORMAT_R8G8B8X8_UNORM,
+   PIPE_FORMAT_NONE,
+   PIPE_FORMAT_NONE
+};
+
+const enum pipe_format const_resource_formats_VUYX[3] = {
+   PIPE_FORMAT_B8G8R8X8_UNORM,
+   PIPE_FORMAT_NONE,
+   PIPE_FORMAT_NONE
+};
+
 const enum pipe_format const_resource_formats_YUYV[3] = {
    PIPE_FORMAT_R8G8_R8B8_UNORM,
    PIPE_FORMAT_NONE,
@@ -102,6 +114,12 @@ vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format)
    case PIPE_FORMAT_B8G8R8A8_UNORM:
       return const_resource_formats_VUYA;
 
+   case PIPE_FORMAT_R8G8B8X8_UNORM:
+      return const_resource_formats_VUYX;
+
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return const_resource_formats_VUYX;
+
    case PIPE_FORMAT_YUYV:
       return const_resource_formats_YUYV;
 
diff --git a/src/gallium/auxiliary/vl/vl_winsys.h b/src/gallium/auxiliary/vl/vl_winsys.h
index f6b47c964f9..df01917466f 100644
--- a/src/gallium/auxiliary/vl/vl_winsys.h
+++ b/src/gallium/auxiliary/vl/vl_winsys.h
@@ -66,4 +66,10 @@ vl_screen_set_next_timestamp(struct vl_screen *vscreen, uint64_t stamp);
 void*
 vl_screen_get_private(struct vl_screen *vscreen);
 
+struct vl_screen*
+vl_drm_screen_create(int fd);
+
+void
+vl_drm_screen_destroy(struct vl_screen *vscreen);
+
 #endif
diff --git a/src/gallium/auxiliary/vl/vl_winsys_drm.c b/src/gallium/auxiliary/vl/vl_winsys_drm.c
new file mode 100644
index 00000000000..1167fcf6a90
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c
@@ -0,0 +1,77 @@
+/**************************************************************************
+ *
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <assert.h>
+
+#include "pipe/p_screen.h"
+#include "pipe-loader/pipe_loader.h"
+#include "state_tracker/drm_driver.h"
+
+#include "util/u_memory.h"
+#include "vl/vl_winsys.h"
+
+struct vl_screen*
+vl_drm_screen_create(int fd)
+{
+   struct vl_screen *vscreen;
+
+   vscreen = CALLOC_STRUCT(vl_screen);
+   if (!vscreen)
+      return NULL;
+
+#if GALLIUM_STATIC_TARGETS
+   vscreen->pscreen = dd_create_screen(fd);
+#else
+   if (pipe_loader_drm_probe_fd(&vscreen->dev, dup(fd))) {
+      vscreen->pscreen =
+         pipe_loader_create_screen(vscreen->dev, PIPE_SEARCH_DIR);
+      if (!vscreen->pscreen)
+         pipe_loader_release(&vscreen->dev, 1);
+   }
+#endif
+
+   if (!vscreen->pscreen) {
+      FREE(vscreen);
+      return NULL;
+   }
+
+   return vscreen;
+}
+
+void
+vl_drm_screen_destroy(struct vl_screen *vscreen)
+{
+   assert(vscreen);
+
+   vscreen->pscreen->destroy(vscreen->pscreen);
+
+#if !GALLIUM_STATIC_TARGETS
+   pipe_loader_release(&vscreen->dev, 1);
+#endif
+
+   FREE(vscreen);
+}
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index a7d08d2c7f9..9a32716f921 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -84,6 +84,9 @@ objects. They all follow simple, one-method binding calls, e.g.
     levels. This corresponds to GL's ``PATCH_DEFAULT_OUTER_LEVEL``.
   * ``default_inner_level`` is the default value for the inner tessellation
     levels. This corresponds to GL's ``PATCH_DEFAULT_INNER_LEVEL``.
+* ``set_debug_callback`` sets the callback to be used for reporting
+  various debug messages, eventually reported via KHR_debug and
+  similar mechanisms.
 
 
 Sampler Views
@@ -224,6 +227,10 @@ is is also possible to only clear one or the other part). While it is only
 possible to clear one surface at a time (which can include several layers),
 this surface need not be bound to the framebuffer.
 
+``clear_texture`` clears a non-PIPE_BUFFER resource's specified level
+and bounding box with a clear value provided in that resource's native
+format.
+
 ``clear_buffer`` clears a PIPE_BUFFER resource with the specified clear value
 (which may be multiple bytes in length). Logically this is a memset with a
 multi-byte element value starting at offset bytes from resource start, going
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 91fdb43cfbb..e900283f731 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -281,6 +281,8 @@ The integer capabilities:
 * ``PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS``:
   Whether copying between compressed and plain formats is supported where
   a compressed block is copied to/from a plain pixel of the same size.
+* ``PIPE_CAP_CLEAR_TEXTURE``: Whether `clear_texture` will be
+  available in contexts.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 01e18f3084e..e7b0c2f6377 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2941,6 +2941,14 @@ TGSI_SEMANTIC_VERTICESIN
 For tessellation evaluation/control shaders, this semantic label indicates the
 number of vertices provided in the input patch. Only the X value is defined.
 
+TGSI_SEMANTIC_HELPER_INVOCATION
+"""""""""""""""""""""""""""""""
+
+For fragment shaders, this semantic indicates whether the current
+invocation is covered or not. Helper invocations are created in order
+to properly compute derivatives, however it may be desirable to skip
+some of the logic in those cases. See ``gl_HelperInvocation`` documentation.
+
 
 Declaration Interpolate
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index 2853787a340..ef235734755 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index 4bbcb33614c..b5e1ddadde0 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 819f5b14a17..9f970365464 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
@@ -489,8 +490,8 @@ static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(enum adreno_r
 	return ((val) << A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__SHIFT) & A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__MASK;
 }
 
-#define REG_A4XX_RB_BLEND_RED					0x000020f3
-#define A4XX_RB_BLEND_RED_UINT__MASK				0x00007fff
+#define REG_A4XX_RB_BLEND_RED					0x000020f0
+#define A4XX_RB_BLEND_RED_UINT__MASK				0x0000ffff
 #define A4XX_RB_BLEND_RED_UINT__SHIFT				0
 static inline uint32_t A4XX_RB_BLEND_RED_UINT(uint32_t val)
 {
@@ -503,8 +504,16 @@ static inline uint32_t A4XX_RB_BLEND_RED_FLOAT(float val)
 	return ((util_float_to_half(val)) << A4XX_RB_BLEND_RED_FLOAT__SHIFT) & A4XX_RB_BLEND_RED_FLOAT__MASK;
 }
 
-#define REG_A4XX_RB_BLEND_GREEN					0x000020f4
-#define A4XX_RB_BLEND_GREEN_UINT__MASK				0x00007fff
+#define REG_A4XX_RB_BLEND_RED_F32				0x000020f1
+#define A4XX_RB_BLEND_RED_F32__MASK				0xffffffff
+#define A4XX_RB_BLEND_RED_F32__SHIFT				0
+static inline uint32_t A4XX_RB_BLEND_RED_F32(float val)
+{
+	return ((fui(val)) << A4XX_RB_BLEND_RED_F32__SHIFT) & A4XX_RB_BLEND_RED_F32__MASK;
+}
+
+#define REG_A4XX_RB_BLEND_GREEN					0x000020f2
+#define A4XX_RB_BLEND_GREEN_UINT__MASK				0x0000ffff
 #define A4XX_RB_BLEND_GREEN_UINT__SHIFT				0
 static inline uint32_t A4XX_RB_BLEND_GREEN_UINT(uint32_t val)
 {
@@ -517,8 +526,16 @@ static inline uint32_t A4XX_RB_BLEND_GREEN_FLOAT(float val)
 	return ((util_float_to_half(val)) << A4XX_RB_BLEND_GREEN_FLOAT__SHIFT) & A4XX_RB_BLEND_GREEN_FLOAT__MASK;
 }
 
-#define REG_A4XX_RB_BLEND_BLUE					0x000020f5
-#define A4XX_RB_BLEND_BLUE_UINT__MASK				0x00007fff
+#define REG_A4XX_RB_BLEND_GREEN_F32				0x000020f3
+#define A4XX_RB_BLEND_GREEN_F32__MASK				0xffffffff
+#define A4XX_RB_BLEND_GREEN_F32__SHIFT				0
+static inline uint32_t A4XX_RB_BLEND_GREEN_F32(float val)
+{
+	return ((fui(val)) << A4XX_RB_BLEND_GREEN_F32__SHIFT) & A4XX_RB_BLEND_GREEN_F32__MASK;
+}
+
+#define REG_A4XX_RB_BLEND_BLUE					0x000020f4
+#define A4XX_RB_BLEND_BLUE_UINT__MASK				0x0000ffff
 #define A4XX_RB_BLEND_BLUE_UINT__SHIFT				0
 static inline uint32_t A4XX_RB_BLEND_BLUE_UINT(uint32_t val)
 {
@@ -531,8 +548,16 @@ static inline uint32_t A4XX_RB_BLEND_BLUE_FLOAT(float val)
 	return ((util_float_to_half(val)) << A4XX_RB_BLEND_BLUE_FLOAT__SHIFT) & A4XX_RB_BLEND_BLUE_FLOAT__MASK;
 }
 
+#define REG_A4XX_RB_BLEND_BLUE_F32				0x000020f5
+#define A4XX_RB_BLEND_BLUE_F32__MASK				0xffffffff
+#define A4XX_RB_BLEND_BLUE_F32__SHIFT				0
+static inline uint32_t A4XX_RB_BLEND_BLUE_F32(float val)
+{
+	return ((fui(val)) << A4XX_RB_BLEND_BLUE_F32__SHIFT) & A4XX_RB_BLEND_BLUE_F32__MASK;
+}
+
 #define REG_A4XX_RB_BLEND_ALPHA					0x000020f6
-#define A4XX_RB_BLEND_ALPHA_UINT__MASK				0x00007fff
+#define A4XX_RB_BLEND_ALPHA_UINT__MASK				0x0000ffff
 #define A4XX_RB_BLEND_ALPHA_UINT__SHIFT				0
 static inline uint32_t A4XX_RB_BLEND_ALPHA_UINT(uint32_t val)
 {
@@ -545,6 +570,14 @@ static inline uint32_t A4XX_RB_BLEND_ALPHA_FLOAT(float val)
 	return ((util_float_to_half(val)) << A4XX_RB_BLEND_ALPHA_FLOAT__SHIFT) & A4XX_RB_BLEND_ALPHA_FLOAT__MASK;
 }
 
+#define REG_A4XX_RB_BLEND_ALPHA_F32				0x000020f7
+#define A4XX_RB_BLEND_ALPHA_F32__MASK				0xffffffff
+#define A4XX_RB_BLEND_ALPHA_F32__SHIFT				0
+static inline uint32_t A4XX_RB_BLEND_ALPHA_F32(float val)
+{
+	return ((fui(val)) << A4XX_RB_BLEND_ALPHA_F32__SHIFT) & A4XX_RB_BLEND_ALPHA_F32__MASK;
+}
+
 #define REG_A4XX_RB_ALPHA_CONTROL				0x000020f8
 #define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__MASK			0x000000ff
 #define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__SHIFT			0
@@ -2645,20 +2678,6 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val)
 
 #define REG_A4XX_UNKNOWN_20EF					0x000020ef
 
-#define REG_A4XX_UNKNOWN_20F0					0x000020f0
-
-#define REG_A4XX_UNKNOWN_20F1					0x000020f1
-
-#define REG_A4XX_UNKNOWN_20F2					0x000020f2
-
-#define REG_A4XX_UNKNOWN_20F7					0x000020f7
-#define A4XX_UNKNOWN_20F7__MASK					0xffffffff
-#define A4XX_UNKNOWN_20F7__SHIFT				0
-static inline uint32_t A4XX_UNKNOWN_20F7(float val)
-{
-	return ((fui(val)) << A4XX_UNKNOWN_20F7__SHIFT) & A4XX_UNKNOWN_20F7__MASK;
-}
-
 #define REG_A4XX_UNKNOWN_2152					0x00002152
 
 #define REG_A4XX_UNKNOWN_2153					0x00002153
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index cf5dd7b0f17..26b58718cd8 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -613,15 +613,19 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 	if (dirty & FD_DIRTY_BLEND_COLOR) {
 		struct pipe_blend_color *bcolor = &ctx->blend_color;
-		OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4);
-		OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 255.0) |
+		OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8);
+		OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 65535.0) |
 				A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0]));
-		OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 255.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0]));
+		OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 65535.0) |
 				A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]));
-		OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 255.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_GREEN_F32(bcolor->color[1]));
+		OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 65535.0) |
 				A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]));
-		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 255.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2]));
+		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 65535.0) |
 				A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]));
+		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
 	}
 
 	if (dirty & FD_DIRTY_VERTTEX) {
@@ -699,15 +703,6 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20EF, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F0, 1);
-	OUT_RING(ring, 0x00000000);
-
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F1, 1);
-	OUT_RING(ring, 0x00000000);
-
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F2, 1);
-	OUT_RING(ring, 0x00000000);
-
 	OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4);
 	OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(0) |
 			A4XX_RB_BLEND_RED_FLOAT(0.0));
@@ -718,9 +713,6 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(0x7fff) |
 			A4XX_RB_BLEND_ALPHA_FLOAT(1.0));
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F7, 1);
-	OUT_RING(ring, 0x3f800000);
-
 	OUT_PKT0(ring, REG_A4XX_UNKNOWN_2152, 1);
 	OUT_RING(ring, 0x00000000);
 
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index 906368c0efa..ca3d2ac3fca 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 490cf5beaf0..f095e3061b2 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 9f8c33263fb..56d1834ef9c 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -239,6 +239,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+	case PIPE_CAP_CLEAR_TEXTURE:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
@@ -549,6 +550,7 @@ fd_screen_create(struct fd_device *dev)
 	case 220:
 		fd2_screen_init(pscreen);
 		break;
+	case 305:
 	case 307:
 	case 320:
 	case 330:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 8c9234b3847..157dc73a3c6 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -2325,17 +2325,17 @@ emit_instructions(struct ir3_compile *ctx)
 	}
 
 	/* Setup inputs: */
-	foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) {
+	nir_foreach_variable(var, &ctx->s->inputs) {
 		setup_input(ctx, var);
 	}
 
 	/* Setup outputs: */
-	foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) {
+	nir_foreach_variable(var, &ctx->s->outputs) {
 		setup_output(ctx, var);
 	}
 
 	/* Setup variables (which should only be arrays): */
-	foreach_list_typed(nir_variable, var, node, &ctx->s->globals) {
+	nir_foreach_variable(var, &ctx->s->globals) {
 		declare_var(ctx, var);
 	}
 
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 2d2fd375656..a5b161882cd 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -253,6 +253,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 888f7aa6782..cfa2fb41152 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -475,6 +475,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index df262fa4716..ceac86abe1d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -746,7 +746,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
 
    pos_init(bld, x0, y0);
 
-   if (coeff_type.length > 4) {
+   /*
+    * Simple method (single step interpolation) may be slower if vector length
+    * is just 4, but the results are different (generally less accurate) with
+    * the other method, so always use more accurate version.
+    */
+   if (1) {
       bld->simple_interp = TRUE;
       {
          /* XXX this should use a global static table */
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 9acde4f1b06..b915c1d64ff 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,6 +36,7 @@
 #include "util/u_memory.h"
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_format.h"
 #include "lp_context.h"
 #include "lp_jit.h"
 
@@ -208,6 +209,8 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp)
       LLVMTypeRef elem_types[LP_JIT_THREAD_DATA_COUNT];
       LLVMTypeRef thread_data_type;
 
+      elem_types[LP_JIT_THREAD_DATA_CACHE] =
+            LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
       elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc);
       elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] =
             LLVMInt32TypeInContext(lc);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 097fa7dce7c..9db26f2cba9 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -43,6 +43,7 @@
 #include "lp_texture.h"
 
 
+struct lp_build_format_cache;
 struct lp_fragment_shader_variant;
 struct llvmpipe_screen;
 
@@ -189,6 +190,7 @@ enum {
 
 struct lp_jit_thread_data
 {
+   struct lp_build_format_cache *cache;
    uint64_t vis_counter;
 
    /*
@@ -201,12 +203,16 @@ struct lp_jit_thread_data
 
 
 enum {
-   LP_JIT_THREAD_DATA_COUNTER = 0,
+   LP_JIT_THREAD_DATA_CACHE = 0,
+   LP_JIT_THREAD_DATA_COUNTER,
    LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX,
    LP_JIT_THREAD_DATA_COUNT
 };
 
 
+#define lp_jit_thread_data_cache(_gallivm, _ptr) \
+   lp_build_struct_get(_gallivm, _ptr, LP_JIT_THREAD_DATA_CACHE, "cache")
+
 #define lp_jit_thread_data_counter(_gallivm, _ptr) \
    lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, "counter")
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index c726707c062..d22e50777fa 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -43,6 +43,7 @@
 #include "lp_query.h"
 #include "lp_rast.h"
 #include "lp_rast_priv.h"
+#include "gallivm/lp_bld_format.h"
 #include "gallivm/lp_bld_debug.h"
 #include "lp_scene.h"
 #include "lp_tex_sample.h"
@@ -664,6 +665,17 @@ rasterize_scene(struct lp_rasterizer_task *task,
 {
    task->scene = scene;
 
+   /* Clear the cache tags. This should not always be necessary but
+      simpler for now. */
+#if LP_USE_TEXTURE_CACHE
+   memset(task->thread_data.cache->cache_tags, 0,
+          sizeof(task->thread_data.cache->cache_tags));
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   task->thread_data.cache->cache_access_total = 0;
+   task->thread_data.cache->cache_access_miss = 0;
+#endif
+#endif
+
    if (!task->rast->no_rast && !scene->discard) {
       /* loop over scene bins, rasterize each */
       {
@@ -679,6 +691,20 @@ rasterize_scene(struct lp_rasterizer_task *task,
    }
 
 
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   {
+      uint64_t total, miss;
+      total = task->thread_data.cache->cache_access_total;
+      miss = task->thread_data.cache->cache_access_miss;
+      if (total) {
+         debug_printf("thread %d cache access %llu miss %llu hit rate %f\n",
+                 task->thread_index, (long long unsigned)total,
+                 (long long unsigned)miss,
+                 (float)(total - miss)/(float)total);
+      }
+   }
+#endif
+
    if (scene->fence) {
       lp_fence_signal(scene->fence);
    }
@@ -866,10 +892,15 @@ lp_rast_create( unsigned num_threads )
       goto no_full_scenes;
    }
 
-   for (i = 0; i < Elements(rast->tasks); i++) {
+   for (i = 0; i < MAX2(1, num_threads); i++) {
       struct lp_rasterizer_task *task = &rast->tasks[i];
       task->rast = rast;
       task->thread_index = i;
+      task->thread_data.cache = align_malloc(sizeof(struct lp_build_format_cache),
+                                             16);
+      if (!task->thread_data.cache) {
+         goto no_thread_data_cache;
+      }
    }
 
    rast->num_threads = num_threads;
@@ -885,6 +916,14 @@ lp_rast_create( unsigned num_threads )
 
    return rast;
 
+no_thread_data_cache:
+   for (i = 0; i < MAX2(1, rast->num_threads); i++) {
+      if (rast->tasks[i].thread_data.cache) {
+         align_free(rast->tasks[i].thread_data.cache);
+      }
+   }
+
+   lp_scene_queue_destroy(rast->full_scenes);
 no_full_scenes:
    FREE(rast);
 no_rast:
@@ -923,6 +962,9 @@ void lp_rast_destroy( struct lp_rasterizer *rast )
       pipe_semaphore_destroy(&rast->tasks[i].work_ready);
       pipe_semaphore_destroy(&rast->tasks[i].work_done);
    }
+   for (i = 0; i < MAX2(1, rast->num_threads); i++) {
+      align_free(rast->tasks[i].thread_data.cache);
+   }
 
    /* for synchronizing rasterization threads */
    pipe_barrier_destroy( &rast->barrier );
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index d1c50aefc84..9f5e7378ac7 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -300,6 +300,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index fd6c49aacd8..f55f6b4fa4f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -421,7 +421,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
    lp_build_tgsi_soa(gallivm, tokens, type, &mask,
                      consts_ptr, num_consts_ptr, &system_values,
                      interp->inputs,
-                     outputs, context_ptr,
+                     outputs, context_ptr, thread_data_ptr,
                      sampler, &shader->info.base, NULL);
 
    /* Alpha test */
@@ -2303,8 +2303,8 @@ generate_fragment(struct llvmpipe_context *lp,
    lp_build_name(dady_ptr, "dady");
    lp_build_name(color_ptr_ptr, "color_ptr_ptr");
    lp_build_name(depth_ptr, "depth");
-   lp_build_name(thread_data_ptr, "thread_data");
    lp_build_name(mask_input, "mask_input");
+   lp_build_name(thread_data_ptr, "thread_data");
    lp_build_name(stride_ptr, "stride_ptr");
    lp_build_name(depth_stride, "depth_stride");
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index d9abd1ae37c..0640a217874 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -44,6 +44,9 @@
 
 #include "lp_test.h"
 
+#define USE_TEXTURE_CACHE 1
+
+static struct lp_build_format_cache *cache_ptr;
 
 void
 write_tsv_header(FILE *fp)
@@ -71,7 +74,7 @@ write_tsv_row(FILE *fp,
 
 typedef void
 (*fetch_ptr_t)(void *unpacked, const void *packed,
-               unsigned i, unsigned j);
+               unsigned i, unsigned j, struct lp_build_format_cache *cache);
 
 
 static LLVMValueRef
@@ -83,7 +86,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    LLVMContextRef context = gallivm->context;
    LLVMModuleRef module = gallivm->module;
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef args[4];
+   LLVMTypeRef args[5];
    LLVMValueRef func;
    LLVMValueRef packed_ptr;
    LLVMValueRef offset = LLVMConstNull(LLVMInt32TypeInContext(context));
@@ -92,6 +95,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    LLVMValueRef j;
    LLVMBasicBlockRef block;
    LLVMValueRef rgba;
+   LLVMValueRef cache = NULL;
 
    util_snprintf(name, sizeof name, "fetch_%s_%s", desc->short_name,
                  type.floating ? "float" : "unorm8");
@@ -99,6 +103,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    args[0] = LLVMPointerType(lp_build_vec_type(gallivm, type), 0);
    args[1] = LLVMPointerType(LLVMInt8TypeInContext(context), 0);
    args[3] = args[2] = LLVMInt32TypeInContext(context);
+   args[4] = LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
 
    func = LLVMAddFunction(module, name,
                           LLVMFunctionType(LLVMVoidTypeInContext(context),
@@ -109,11 +114,15 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    i = LLVMGetParam(func, 2);
    j = LLVMGetParam(func, 3);
 
+   if (cache_ptr) {
+      cache = LLVMGetParam(func, 4);
+   }
+
    block = LLVMAppendBasicBlockInContext(context, func, "entry");
    LLVMPositionBuilderAtEnd(builder, block);
 
    rgba = lp_build_fetch_rgba_aos(gallivm, desc, type, TRUE,
-                                  packed_ptr, offset, i, j);
+                                  packed_ptr, offset, i, j, cache);
 
    LLVMBuildStore(builder, rgba, rgba_ptr);
 
@@ -170,7 +179,7 @@ test_format_float(unsigned verbose, FILE *fp,
 
                memset(unpacked, 0, sizeof unpacked);
 
-               fetch_ptr(unpacked, packed, j, i);
+               fetch_ptr(unpacked, packed, j, i, cache_ptr);
 
                for(k = 0; k < 4; ++k) {
                   if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) {
@@ -187,6 +196,11 @@ test_format_float(unsigned verbose, FILE *fp,
                   }
                }
 
+               /* Ignore errors in S3TC for now */
+               if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+                  match = TRUE;
+               }
+
                if (!match) {
                   printf("FAILED\n");
                   printf("  Packed: %02x %02x %02x %02x\n",
@@ -261,7 +275,7 @@ test_format_unorm8(unsigned verbose, FILE *fp,
 
                memset(unpacked, 0, sizeof unpacked);
 
-               fetch_ptr(unpacked, packed, j, i);
+               fetch_ptr(unpacked, packed, j, i, cache_ptr);
 
                match = TRUE;
                for(k = 0; k < 4; ++k) {
@@ -277,6 +291,11 @@ test_format_unorm8(unsigned verbose, FILE *fp,
                      match = FALSE;
                }
 
+               /* Ignore errors in S3TC as we only implement a poor man approach */
+               if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+                  match = TRUE;
+               }
+
                if (!match) {
                   printf("FAILED\n");
                   printf("  Packed: %02x %02x %02x %02x\n",
@@ -334,6 +353,10 @@ test_all(unsigned verbose, FILE *fp)
 
    util_format_s3tc_init();
 
+#if USE_TEXTURE_CACHE
+   cache_ptr = align_malloc(sizeof(struct lp_build_format_cache), 16);
+#endif
+
    for (format = 1; format < PIPE_FORMAT_COUNT; ++format) {
       const struct util_format_description *format_desc;
 
@@ -363,6 +386,9 @@ test_all(unsigned verbose, FILE *fp)
            success = FALSE;
       }
    }
+#if USE_TEXTURE_CACHE
+   align_free(cache_ptr);
+#endif
 
    return success;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index 316d1c55082..217abe963b7 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -221,6 +221,21 @@ LP_LLVM_SAMPLER_MEMBER(lod_bias,   LP_JIT_SAMPLER_LOD_BIAS, TRUE)
 LP_LLVM_SAMPLER_MEMBER(border_color, LP_JIT_SAMPLER_BORDER_COLOR, FALSE)
 
 
+#if LP_USE_TEXTURE_CACHE
+static LLVMValueRef
+lp_llvm_texture_cache_ptr(const struct lp_sampler_dynamic_state *base,
+                          struct gallivm_state *gallivm,
+                          LLVMValueRef thread_data_ptr,
+                          unsigned unit)
+{
+   /* We use the same cache for all units */
+   (void)unit;
+
+   return lp_jit_thread_data_cache(gallivm, thread_data_ptr);
+}
+#endif
+
+
 static void
 lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
 {
@@ -314,6 +329,10 @@ lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state)
    sampler->dynamic_state.base.lod_bias = lp_llvm_sampler_lod_bias;
    sampler->dynamic_state.base.border_color = lp_llvm_sampler_border_color;
 
+#if LP_USE_TEXTURE_CACHE
+   sampler->dynamic_state.base.cache_ptr = lp_llvm_texture_cache_ptr;
+#endif
+
    sampler->dynamic_state.static_state = static_state;
 
    return &sampler->base;
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index f4aff226ce1..e26d608c9eb 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -34,6 +34,10 @@
 
 struct lp_sampler_static_state;
 
+/**
+ * Whether texture cache is used for s3tc textures.
+ */
+#define LP_USE_TEXTURE_CACHE 0
 
 /**
  * Pure-LLVM texture sampling code generator.
@@ -42,5 +46,4 @@ struct lp_sampler_static_state;
 struct lp_build_sampler_soa *
 lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key);
 
-
 #endif /* LP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 7862ac8f217..82868814581 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -805,7 +805,7 @@ llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen)
 #endif
 
    screen->resource_create = llvmpipe_resource_create;
-   screen->resource_create_front = llvmpipe_resource_create_front;
+/*   screen->resource_create_front = llvmpipe_resource_create_front; */
    screen->resource_destroy = llvmpipe_resource_destroy;
    screen->resource_from_handle = llvmpipe_resource_from_handle;
    screen->resource_get_handle = llvmpipe_resource_get_handle;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index f6e93081e76..d09a0ab0610 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -389,6 +389,7 @@ enum SVSemantic
    SV_SBASE,
    SV_VERTEX_STRIDE,
    SV_INVOCATION_INFO,
+   SV_THREAD_KILL,
    SV_UNDEFINED,
    SV_LAST
 };
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index 19418c0e0f1..dca799dd9b5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -392,6 +392,12 @@ BuildUtil::mkImm(float f)
    return mkImm(u.u32);
 }
 
+ImmediateValue *
+BuildUtil::mkImm(double d)
+{
+   return new_ImmediateValue(prog, d);
+}
+
 Value *
 BuildUtil::loadImm(Value *dst, float f)
 {
@@ -399,6 +405,12 @@ BuildUtil::loadImm(Value *dst, float f)
 }
 
 Value *
+BuildUtil::loadImm(Value *dst, double d)
+{
+   return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(), mkImm(d));
+}
+
+Value *
 BuildUtil::loadImm(Value *dst, uint32_t u)
 {
    return mkOp1v(OP_MOV, TYPE_U32, dst ? dst : getScratch(), mkImm(u));
@@ -555,6 +567,12 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
    switch (i->dType) {
    case TYPE_U64: hTy = TYPE_U32; break;
    case TYPE_S64: hTy = TYPE_S32; break;
+   case TYPE_F64:
+      if (i->op == OP_MOV) {
+         hTy = TYPE_U32;
+         break;
+      }
+      /* fallthrough */
    default:
       return NULL;
    }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
index 0d544581697..8f3bf77949c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
@@ -90,12 +90,14 @@ public:
    void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
 
    ImmediateValue *mkImm(float);
+   ImmediateValue *mkImm(double);
    ImmediateValue *mkImm(uint32_t);
    ImmediateValue *mkImm(uint64_t);
 
    ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }
 
    Value *loadImm(Value *dst, float);
+   Value *loadImm(Value *dst, double);
    Value *loadImm(Value *dst, uint32_t);
    Value *loadImm(Value *dst, uint64_t);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index c0cab3299b5..b49bf9d53bc 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -96,6 +96,7 @@ struct nv50_ir_prog_info
       uint32_t tlsSpace;  /* required local memory per thread */
       uint32_t *code;
       uint32_t codeSize;
+      uint32_t instructions;
       uint8_t sourceRep;  /* NV50_PROGRAM_IR */
       const void *source;
       void *relocData;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index d712c9c300a..b163cd2db4a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1644,6 +1644,7 @@ CodeEmitterGK110::getSRegEncoding(const ValueRef& ref)
    case SV_VERTEX_COUNT:  return 0x10;
    case SV_INVOCATION_ID: return 0x11;
    case SV_YDIR:          return 0x12;
+   case SV_THREAD_KILL:   return 0x13;
    case SV_TID:           return 0x21 + SDATA(ref).sv.index;
    case SV_CTAID:         return 0x25 + SDATA(ref).sv.index;
    case SV_NTID:          return 0x29 + SDATA(ref).sv.index;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index a327d572470..e9ddd366391 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -244,6 +244,7 @@ CodeEmitterGM107::emitSYS(int pos, const Value *val)
    case SV_LANEID         : id = 0x00; break;
    case SV_VERTEX_COUNT   : id = 0x10; break;
    case SV_INVOCATION_ID  : id = 0x11; break;
+   case SV_THREAD_KILL    : id = 0x13; break;
    case SV_INVOCATION_INFO: id = 0x1d; break;
    default:
       assert(!"invalid system value");
@@ -310,9 +311,12 @@ CodeEmitterGM107::emitIMMD(int pos, int len, const ValueRef &ref)
    uint32_t val = imm->reg.data.u32;
 
    if (len == 19) {
-      if (isFloatType(insn->sType)) {
+      if (insn->sType == TYPE_F32 || insn->sType == TYPE_F16) {
          assert(!(val & 0x00000fff));
          val >>= 12;
+      } else if (insn->sType == TYPE_F64) {
+         assert(!(imm->reg.data.u64 & 0x00000fffffffffffULL));
+         val = imm->reg.data.u64 >> 44;
       }
       assert(!(val & 0xfff00000) || (val & 0xfff00000) == 0xfff00000);
       emitField( 56,   1, (val & 0x80000) >> 19);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 9f1e4b803d5..0b5288218d1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -96,9 +96,12 @@ private:
    void emitUADD(const Instruction *);
    void emitAADD(const Instruction *);
    void emitFADD(const Instruction *);
+   void emitDADD(const Instruction *);
    void emitIMUL(const Instruction *);
    void emitFMUL(const Instruction *);
+   void emitDMUL(const Instruction *);
    void emitFMAD(const Instruction *);
+   void emitDMAD(const Instruction *);
    void emitIMAD(const Instruction *);
    void emitISAD(const Instruction *);
 
@@ -438,9 +441,9 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
       return;
 
    if ((mode & 3) == 1) {
-      const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;
+      const int pos = ((mode >> 2) & 3) == 3 ? 13 : 14;
 
-      switch (i->getSrc(0)->reg.type) {
+      switch (i->sType) {
       case TYPE_U8:
          break;
       case TYPE_U16:
@@ -954,11 +957,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
          assert(0);
          break;
       }
-      code[1] |= i->src(0).mod.abs() << 20;
-      code[1] |= i->src(0).mod.neg() << 26;
-      code[1] |= i->src(1).mod.abs() << 19;
-      code[1] |= i->src(1).mod.neg() << 27;
    }
+
+   code[1] |= i->src(0).mod.abs() << 20;
+   code[1] |= i->src(0).mod.neg() << 26;
+   code[1] |= i->src(1).mod.abs() << 19;
+   code[1] |= i->src(1).mod.neg() << 27;
+
    emitForm_MAD(i);
 }
 
@@ -994,6 +999,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitDMAD(const Instruction *i)
+{
+   const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+   const int neg_add = i->src(2).mod.neg();
+
+   assert(i->encSize == 8);
+   assert(!i->saturate);
+
+   code[1] = 0x40000000;
+   code[0] = 0xe0000000;
+
+   code[1] |= neg_mul << 26;
+   code[1] |= neg_add << 27;
+
+   roundMode_MAD(i);
+
+   emitForm_MAD(i);
+}
+
+void
 CodeEmitterNV50::emitFADD(const Instruction *i)
 {
    const int neg0 = i->src(0).mod.neg();
@@ -1028,6 +1053,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitDADD(const Instruction *i)
+{
+   const int neg0 = i->src(0).mod.neg();
+   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
+   assert(!(i->src(0).mod | i->src(1).mod).abs());
+   assert(!i->saturate);
+   assert(i->encSize == 8);
+
+   code[1] = 0x60000000;
+   code[0] = 0xe0000000;
+
+   emitForm_ADD(i);
+
+   code[1] |= neg0 << 26;
+   code[1] |= neg1 << 27;
+}
+
+void
 CodeEmitterNV50::emitUADD(const Instruction *i)
 {
    const int neg0 = i->src(0).mod.neg();
@@ -1081,7 +1125,10 @@ CodeEmitterNV50::emitIMUL(const Instruction *i)
 
    if (i->encSize == 8) {
       code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
-      emitForm_MAD(i);
+      if (i->src(1).getFile() == FILE_IMMEDIATE)
+         emitForm_IMM(i);
+      else
+         emitForm_MAD(i);
    } else {
       if (i->sType == TYPE_S16)
          code[0] |= 0x8100;
@@ -1121,6 +1168,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitDMUL(const Instruction *i)
+{
+   const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   assert(!i->saturate);
+   assert(i->encSize == 8);
+
+   code[1] = 0x80000000;
+   code[0] = 0xe0000000;
+
+   if (neg)
+      code[1] |= 0x08000000;
+
+   roundMode_CVT(i->rnd);
+
+   emitForm_MAD(i);
+}
+
+void
 CodeEmitterNV50::emitIMAD(const Instruction *i)
 {
    code[0] = 0x60000000;
@@ -1136,7 +1202,10 @@ CodeEmitterNV50::emitIMAD(const Instruction *i)
    code[1] |= neg1 << 27;
    code[1] |= neg2 << 26;
 
-   emitForm_MAD(i);
+   if (i->src(1).getFile() == FILE_IMMEDIATE)
+      emitForm_IMM(i);
+   else
+      emitForm_MAD(i);
 
    if (i->flagsSrc >= 0) {
       // add with carry from $cX
@@ -1181,9 +1250,11 @@ CodeEmitterNV50::emitSET(const Instruction *i)
    code[0] = 0x30000000;
    code[1] = 0x60000000;
 
-   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
-
    switch (i->sType) {
+   case TYPE_F64:
+      code[0] = 0xe0000000;
+      code[1] = 0xe0000000;
+      break;
    case TYPE_F32: code[0] |= 0x80000000; break;
    case TYPE_S32: code[1] |= 0x0c000000; break;
    case TYPE_U32: code[1] |= 0x04000000; break;
@@ -1193,6 +1264,9 @@ CodeEmitterNV50::emitSET(const Instruction *i)
       assert(0);
       break;
    }
+
+   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
    if (i->src(0).mod.neg()) code[1] |= 0x04000000;
    if (i->src(1).mod.neg()) code[1] |= 0x08000000;
    if (i->src(0).mod.abs()) code[1] |= 0x00100000;
@@ -1756,7 +1830,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
       break;
    case OP_ADD:
    case OP_SUB:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDADD(insn);
+      else if (isFloatType(insn->dType))
          emitFADD(insn);
       else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
          emitAADD(insn);
@@ -1764,14 +1840,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
          emitUADD(insn);
       break;
    case OP_MUL:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMUL(insn);
+      else if (isFloatType(insn->dType))
          emitFMUL(insn);
       else
          emitIMUL(insn);
       break;
    case OP_MAD:
    case OP_FMA:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMAD(insn);
+      else if (isFloatType(insn->dType))
          emitFMAD(insn);
       else
          emitIMAD(insn);
@@ -1943,7 +2023,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
 {
    const Target::OpInfo &info = targ->getOpInfo(i);
 
-   if (info.minEncSize > 4)
+   if (info.minEncSize > 4 || i->dType == TYPE_F64)
       return 8;
 
    // check constraints on dst and src operands
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index fd103146c72..2a13e1086a0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -323,6 +323,14 @@ CodeEmitterNVC0::setImmediate(const Instruction *i, const int s)
    assert(imm);
    u32 = imm->reg.data.u32;
 
+   if ((code[0] & 0xf) == 0x1) {
+      // double immediate
+      uint64_t u64 = imm->reg.data.u64;
+      assert(!(u64 & 0x00000fffffffffffULL));
+      assert(!(code[1] & 0xc000));
+      code[0] |= ((u64 >> 44) & 0x3f) << 26;
+      code[1] |= 0xc000 | (u64 >> 50);
+   } else
    if ((code[0] & 0xf) == 0x2) {
       // LIMM
       code[0] |= (u32 & 0x3f) << 26;
@@ -1831,6 +1839,7 @@ CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref)
    case SV_VERTEX_COUNT:  return 0x10;
    case SV_INVOCATION_ID: return 0x11;
    case SV_YDIR:          return 0x12;
+   case SV_THREAD_KILL:   return 0x13;
    case SV_TID:           return 0x21 + SDATA(ref).sv.index;
    case SV_CTAID:         return 0x25 + SDATA(ref).sv.index;
    case SV_NTID:          return 0x29 + SDATA(ref).sv.index;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 6a7cb4224f4..08a73d79781 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -376,6 +376,7 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
    case TGSI_SEMANTIC_TESSOUTER:  return nv50_ir::SV_TESS_OUTER;
    case TGSI_SEMANTIC_TESSINNER:  return nv50_ir::SV_TESS_INNER;
    case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
+   case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL;
    default:
       assert(0);
       return nv50_ir::SV_CLOCK;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index eec502be798..75164ef0641 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -75,7 +75,7 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
    s[0] = mul->getSrc(0);
    s[1] = mul->getSrc(1);
 
-   if (isSignedType(mul->sType)) {
+   if (isSignedType(mul->sType) && highResult) {
       s[0] = bld->getSSA(fullSize);
       s[1] = bld->getSSA(fullSize);
       bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 44f74c61304..0f1dcf0dacd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -155,7 +155,7 @@ private:
    void checkSwapSrc01(Instruction *);
 
    bool isCSpaceLoad(Instruction *);
-   bool isImmd32Load(Instruction *);
+   bool isImmdLoad(Instruction *);
    bool isAttribOrSharedLoad(Instruction *);
 };
 
@@ -166,9 +166,10 @@ LoadPropagation::isCSpaceLoad(Instruction *ld)
 }
 
 bool
-LoadPropagation::isImmd32Load(Instruction *ld)
+LoadPropagation::isImmdLoad(Instruction *ld)
 {
-   if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
+   if (!ld || (ld->op != OP_MOV) ||
+       ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
       return false;
    return ld->src(0).getFile() == FILE_IMMEDIATE;
 }
@@ -201,8 +202,8 @@ LoadPropagation::checkSwapSrc01(Instruction *insn)
       else
          return;
    } else
-   if (isImmd32Load(i0)) {
-      if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
+   if (isImmdLoad(i0)) {
+      if (!isCSpaceLoad(i1) && !isImmdLoad(i1))
          insn->swapSources(0, 1);
       else
          return;
@@ -447,6 +448,7 @@ ConstantFolding::expr(Instruction *i,
 {
    struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
    struct Storage res;
+   DataType type = i->dType;
 
    memset(&res.data, 0, sizeof(res.data));
 
@@ -588,6 +590,18 @@ ConstantFolding::expr(Instruction *i,
       // The two arguments to pfetch are logically added together. Normally
       // the second argument will not be constant, but that can happen.
       res.data.u32 = a->data.u32 + b->data.u32;
+      type = TYPE_U32;
+      break;
+   case OP_MERGE:
+      switch (i->dType) {
+      case TYPE_U64:
+      case TYPE_S64:
+      case TYPE_F64:
+         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
+         break;
+      default:
+         return;
+      }
       break;
    default:
       return;
@@ -602,6 +616,8 @@ ConstantFolding::expr(Instruction *i,
    i->setSrc(1, NULL);
 
    i->getSrc(0)->reg.data = res.data;
+   i->getSrc(0)->reg.type = type;
+   i->getSrc(0)->reg.size = typeSizeof(type);
 
    switch (i->op) {
    case OP_MAD:
@@ -1148,6 +1164,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
    case type: \
       switch (i->sType) { \
+      case TYPE_F64: \
+         res.data.dst = util_iround(i->saturate ? \
+                                    CLAMP(imm0.reg.data.f64, fmin, fmax) : \
+                                    imm0.reg.data.f64); \
+         break; \
       case TYPE_F32: \
          res.data.dst = util_iround(i->saturate ? \
                                     CLAMP(imm0.reg.data.f32, fmin, fmax) : \
@@ -1185,6 +1206,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
       CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
       case TYPE_F32:
          switch (i->sType) {
+         case TYPE_F64:
+            res.data.f32 = i->saturate ?
+               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+               imm0.reg.data.f64;
+            break;
          case TYPE_F32:
             res.data.f32 = i->saturate ?
                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
@@ -1199,6 +1225,27 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
          }
          i->setSrc(0, bld.mkImm(res.data.f32));
          break;
+      case TYPE_F64:
+         switch (i->sType) {
+         case TYPE_F64:
+            res.data.f64 = i->saturate ?
+               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+               imm0.reg.data.f64;
+            break;
+         case TYPE_F32:
+            res.data.f64 = i->saturate ?
+               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+               imm0.reg.data.f32;
+            break;
+         case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
+         case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
+         case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
+         case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
+         default:
+            return;
+         }
+         i->setSrc(0, bld.mkImm(res.data.f64));
+         break;
       default:
          return;
       }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index 5f30f3d354b..0b02599dbdd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -275,6 +275,7 @@ static const char *SemanticStr[SV_LAST + 1] =
    "SBASE",
    "VERTEX_STRIDE",
    "INVOCATION_INFO",
+   "THREAD_KILL",
    "?",
    "(INVALID)"
 };
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index afc8ff1374f..4390a726d1c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -373,6 +373,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
    if (!code)
       return false;
    emit->setCodeLocation(code, binSize);
+   info->bin.instructions = 0;
 
    for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) {
       Function *fn = reinterpret_cast<Function *>(fi.get());
@@ -382,6 +383,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
       for (int b = 0; b < fn->bbCount; ++b) {
          for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) {
             emit->emitInstruction(i);
+            info->bin.instructions++;
             if (i->sType == TYPE_F64 || i->dType == TYPE_F64)
                info->io.fp64 = true;
          }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index f3ddcaa5199..94cf0f0e05e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -343,7 +343,7 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
    }
 
    if (sf == FILE_IMMEDIATE)
-      return true;
+      return ldSize <= 4;
 
 
    // Check if memory access is encodable:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 27df0eba66b..8f59d86a72f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -338,17 +338,30 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
    if (sf == FILE_IMMEDIATE) {
       Storage &reg = ld->getSrc(0)->asImm()->reg;
 
-      if (typeSizeof(i->sType) > 4)
-         return false;
-      if (opInfo[i->op].immdBits != 0xffffffff) {
-         if (i->sType == TYPE_F32) {
+      if (opInfo[i->op].immdBits != 0xffffffff || typeSizeof(i->sType) > 4) {
+         switch (i->sType) {
+         case TYPE_F64:
+            if (reg.data.u64 & 0x00000fffffffffffULL)
+               return false;
+            break;
+         case TYPE_F32:
             if (reg.data.u32 & 0xfff)
                return false;
-         } else
-         if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
+            break;
+         case TYPE_S32:
+         case TYPE_U32:
             // with u32, 0xfffff counts as 0xffffffff as well
             if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
                return false;
+            break;
+         case TYPE_U8:
+         case TYPE_S8:
+         case TYPE_U16:
+         case TYPE_S16:
+         case TYPE_F16:
+            break;
+         default:
+            return false;
          }
       } else
       if (i->op == OP_MAD || i->op == OP_FMA) {
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 72e070b5f06..68e69beb08f 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -225,21 +225,22 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx,
  * for write/read by waiting on the buffer's relevant fences.
  */
 static inline bool
-nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw)
+nouveau_buffer_sync(struct nouveau_context *nv,
+                    struct nv04_resource *buf, unsigned rw)
 {
    if (rw == PIPE_TRANSFER_READ) {
       if (!buf->fence_wr)
          return true;
       NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
                            !nouveau_fence_signalled(buf->fence_wr));
-      if (!nouveau_fence_wait(buf->fence_wr))
+      if (!nouveau_fence_wait(buf->fence_wr, &nv->debug))
          return false;
    } else {
       if (!buf->fence)
          return true;
       NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
                            !nouveau_fence_signalled(buf->fence));
-      if (!nouveau_fence_wait(buf->fence))
+      if (!nouveau_fence_wait(buf->fence, &nv->debug))
          return false;
 
       nouveau_fence_ref(NULL, &buf->fence);
@@ -478,7 +479,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
       if (unlikely(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) {
          /* Discarding was not possible, must sync because
           * subsequent transfers might use UNSYNCHRONIZED. */
-         nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE);
+         nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE);
       } else
       if (usage & PIPE_TRANSFER_DISCARD_RANGE) {
          /* The whole range is being discarded, so it doesn't matter what was
@@ -490,7 +491,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
          if (usage & PIPE_TRANSFER_DONTBLOCK)
             map = NULL;
          else
-            nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE);
+            nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE);
       } else {
          /* It is expected that the returned buffer be a representation of the
           * data in question, so we must copy it over from the buffer. */
@@ -615,7 +616,7 @@ nouveau_resource_map_offset(struct nouveau_context *nv,
    if (res->mm) {
       unsigned rw;
       rw = (flags & NOUVEAU_BO_WR) ? PIPE_TRANSFER_WRITE : PIPE_TRANSFER_READ;
-      nouveau_buffer_sync(res, rw);
+      nouveau_buffer_sync(nv, res, rw);
       if (nouveau_bo_map(res->bo, 0, NULL))
          return NULL;
    } else {
diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h
index decb2714ede..c3bbb11bd60 100644
--- a/src/gallium/drivers/nouveau/nouveau_context.h
+++ b/src/gallium/drivers/nouveau/nouveau_context.h
@@ -2,6 +2,7 @@
 #define __NOUVEAU_CONTEXT_H__
 
 #include "pipe/p_context.h"
+#include "pipe/p_state.h"
 #include <nouveau.h>
 
 #define NOUVEAU_MAX_SCRATCH_BUFS 4
@@ -14,6 +15,7 @@ struct nouveau_context {
 
    struct nouveau_client *client;
    struct nouveau_pushbuf *pushbuf;
+   struct pipe_debug_callback debug;
 
    bool vbo_dirty;
 
@@ -64,6 +66,9 @@ void
 nouveau_context_init_vdec(struct nouveau_context *);
 
 void
+nouveau_context_init(struct nouveau_context *);
+
+void
 nouveau_scratch_runout_release(struct nouveau_context *);
 
 /* This is needed because we don't hold references outside of context::scratch,
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c
index 21cf2b9ae5e..691553ae7e4 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.c
+++ b/src/gallium/drivers/nouveau/nouveau_fence.c
@@ -23,6 +23,7 @@
 #include "nouveau_screen.h"
 #include "nouveau_winsys.h"
 #include "nouveau_fence.h"
+#include "os/os_time.h"
 
 #ifdef PIPE_OS_UNIX
 #include <sched.h>
@@ -58,26 +59,6 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence)
    }
 }
 
-bool
-nouveau_fence_work(struct nouveau_fence *fence,
-                   void (*func)(void *), void *data)
-{
-   struct nouveau_fence_work *work;
-
-   if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
-      func(data);
-      return true;
-   }
-
-   work = CALLOC_STRUCT(nouveau_fence_work);
-   if (!work)
-      return false;
-   work->func = func;
-   work->data = data;
-   LIST_ADD(&work->list, &fence->work);
-   return true;
-}
-
 void
 nouveau_fence_emit(struct nouveau_fence *fence)
 {
@@ -181,11 +162,10 @@ nouveau_fence_signalled(struct nouveau_fence *fence)
    return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED;
 }
 
-bool
-nouveau_fence_wait(struct nouveau_fence *fence)
+static bool
+nouveau_fence_kick(struct nouveau_fence *fence)
 {
    struct nouveau_screen *screen = fence->screen;
-   uint32_t spins = 0;
 
    /* wtf, someone is waiting on a fence in flush_notify handler? */
    assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING);
@@ -206,11 +186,32 @@ nouveau_fence_wait(struct nouveau_fence *fence)
    if (fence == screen->fence.current)
       nouveau_fence_next(screen);
 
-   do {
-      nouveau_fence_update(screen, false);
+   nouveau_fence_update(screen, false);
+
+   return true;
+}
 
-      if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED)
+bool
+nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debug)
+{
+   struct nouveau_screen *screen = fence->screen;
+   uint32_t spins = 0;
+   int64_t start = 0;
+
+   if (debug && debug->debug_message)
+      start = os_time_get_nano();
+
+   if (!nouveau_fence_kick(fence))
+      return false;
+
+   do {
+      if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
+         if (debug && debug->debug_message)
+            pipe_debug_message(debug, PERF_INFO,
+                               "stalled %.3f ms waiting for fence",
+                               (os_time_get_nano() - start) / 1000000.f);
          return true;
+      }
       if (!spins)
          NOUVEAU_DRV_STAT(screen, any_non_kernel_fence_sync_count, 1);
       spins++;
@@ -218,6 +219,8 @@ nouveau_fence_wait(struct nouveau_fence *fence)
       if (!(spins % 8)) /* donate a few cycles */
          sched_yield();
 #endif
+
+      nouveau_fence_update(screen, false);
    } while (spins < NOUVEAU_FENCE_MAX_SPINS);
 
    debug_printf("Wait on fence %u (ack = %u, next = %u) timed out !\n",
@@ -249,3 +252,26 @@ nouveau_fence_unref_bo(void *data)
 
    nouveau_bo_ref(NULL, &bo);
 }
+
+bool
+nouveau_fence_work(struct nouveau_fence *fence,
+                   void (*func)(void *), void *data)
+{
+   struct nouveau_fence_work *work;
+
+   if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
+      func(data);
+      return true;
+   }
+
+   work = CALLOC_STRUCT(nouveau_fence_work);
+   if (!work)
+      return false;
+   work->func = func;
+   work->data = data;
+   LIST_ADD(&work->list, &fence->work);
+   p_atomic_inc(&fence->work_count);
+   if (fence->work_count > 64)
+      nouveau_fence_kick(fence);
+   return true;
+}
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h
index 2efcab2172d..f10016da826 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.h
+++ b/src/gallium/drivers/nouveau/nouveau_fence.h
@@ -11,6 +11,8 @@
 #define NOUVEAU_FENCE_STATE_FLUSHED   3
 #define NOUVEAU_FENCE_STATE_SIGNALLED 4
 
+struct pipe_debug_callback;
+
 struct nouveau_fence_work {
    struct list_head list;
    void (*func)(void *);
@@ -23,6 +25,7 @@ struct nouveau_fence {
    int state;
    int ref;
    uint32_t sequence;
+   uint32_t work_count;
    struct list_head work;
 };
 
@@ -34,7 +37,7 @@ bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **,
 bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *);
 void nouveau_fence_update(struct nouveau_screen *, bool flushed);
 void nouveau_fence_next(struct nouveau_screen *);
-bool nouveau_fence_wait(struct nouveau_fence *);
+bool nouveau_fence_wait(struct nouveau_fence *, struct pipe_debug_callback *);
 bool nouveau_fence_signalled(struct nouveau_fence *);
 
 void nouveau_fence_unref_bo(void *data); /* generic unref bo callback */
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 47603b0b7fd..a6065e45aaa 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -18,6 +18,7 @@
 
 #include "nouveau_winsys.h"
 #include "nouveau_screen.h"
+#include "nouveau_context.h"
 #include "nouveau_fence.h"
 #include "nouveau_mm.h"
 #include "nouveau_buffer.h"
@@ -75,7 +76,7 @@ nouveau_screen_fence_finish(struct pipe_screen *screen,
    if (!timeout)
       return nouveau_fence_signalled(nouveau_fence(pfence));
 
-   return nouveau_fence_wait(nouveau_fence(pfence));
+   return nouveau_fence_wait(nouveau_fence(pfence), NULL);
 }
 
 
@@ -238,3 +239,21 @@ nouveau_screen_fini(struct nouveau_screen *screen)
 
    nouveau_device_del(&screen->device);
 }
+
+static void
+nouveau_set_debug_callback(struct pipe_context *pipe,
+                           const struct pipe_debug_callback *cb)
+{
+   struct nouveau_context *context = nouveau_context(pipe);
+
+   if (cb)
+      context->debug = *cb;
+   else
+      memset(&context->debug, 0, sizeof(context->debug));
+}
+
+void
+nouveau_context_init(struct nouveau_context *context)
+{
+   context->pipe.set_debug_callback = nouveau_set_debug_callback;
+}
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.c b/src/gallium/drivers/nouveau/nouveau_vp3_video.c
index f3a64b22c57..4652e56c49a 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video.c
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.c
@@ -437,6 +437,7 @@ nouveau_vp3_screen_get_video_param(struct pipe_screen *pscreen,
       /* VP3 does not support MPEG4, VP4+ do. */
       return entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM &&
          profile >= PIPE_VIDEO_PROFILE_MPEG1 &&
+         profile < PIPE_VIDEO_PROFILE_HEVC_MAIN &&
          (!vp3 || codec != PIPE_VIDEO_FORMAT_MPEG4) &&
          firmware_present(pscreen, profile);
    case PIPE_VIDEO_CAP_NPOT_TEXTURES:
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c
index a36fd57fae7..3ed088912e2 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c
@@ -242,6 +242,7 @@ nv30_context_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    if (debug_get_bool_option("NV30_SWTNL", false))
       nv30->draw_flags |= NV30_NEW_SWTNL;
 
+   nouveau_context_init(&nv30->base);
    nv30->sample_mask = 0xffff;
    nv30_vbo_init(pipe);
    nv30_query_init(pipe);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index bdecb0a32b3..154c3d370f7 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -173,6 +173,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -353,7 +354,7 @@ nv30_screen_fence_emit(struct pipe_screen *pscreen, uint32_t *sequence)
 
    *sequence = ++screen->base.fence.sequence;
 
-   assert(PUSH_AVAIL(push) >= 3);
+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 3);
    PUSH_DATA (push, NV30_3D_FENCE_OFFSET |
               (2 /* size */ << 18) | (7 /* subchan */ << 13));
    PUSH_DATA (push, 0);
@@ -383,7 +384,7 @@ nv30_screen_destroy(struct pipe_screen *pscreen)
        * _current_ one, and remove both.
        */
       nouveau_fence_ref(screen->base.fence.current, &current);
-      nouveau_fence_wait(current);
+      nouveau_fence_wait(current, NULL);
       nouveau_fence_ref(NULL, &current);
       nouveau_fence_ref(NULL, &screen->base.fence.current);
    }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 4108f48005e..7867c2df7f3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -306,6 +306,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    }
    nv50->base.pushbuf->kick_notify = nv50_default_kick_notify;
 
+   nouveau_context_init(&nv50->base);
    nv50_init_query_functions(nv50);
    nv50_init_surface_functions(nv50);
    nv50_init_state_functions(nv50);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
index 80f92be682d..49a93bf1d91 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
@@ -203,10 +203,8 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
    F3B(B5G6R5_UNORM, B5G6R5_UNORM, C2, C1, C0, xx, UNORM, 5_6_5, TD),
    C4B(B5G5R5A1_UNORM, BGR5_A1_UNORM, C2, C1, C0, C3, UNORM, 5_5_5_1, TD),
    F3B(B5G5R5X1_UNORM, BGR5_X1_UNORM, C2, C1, C0, xx, UNORM, 5_5_5_1, TD),
-#if NOUVEAU_DRIVER != 0xc0
    C4B(B4G4R4A4_UNORM, NONE, C2, C1, C0, C3, UNORM, 4_4_4_4, T),
    F3B(B4G4R4X4_UNORM, NONE, C2, C1, C0, xx, UNORM, 4_4_4_4, T),
-#endif
    F3B(R9G9B9E5_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 9_9_9_E5, T),
 
    C4A(R10G10B10A2_UNORM, RGB10_A2_UNORM, C0, C1, C2, C3, UNORM, 10_10_10_2,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index 299629b6438..89e7a338283 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -318,7 +318,8 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
 }
 
 bool
-nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
+nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
+                       struct pipe_debug_callback *debug)
 {
    struct nv50_ir_prog_info *info;
    int ret;
@@ -406,6 +407,11 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
       prog->so = nv50_program_create_strmout_state(info,
                                                    &prog->pipe.stream_output);
 
+   pipe_debug_message(debug, SHADER_INFO,
+                      "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
+                      prog->type, info->bin.tlsSpace, prog->max_gpr,
+                      info->bin.instructions, info->bin.codeSize);
+
 out:
    FREE(info);
    return !ret;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
index 24cc96567d7..7a33eb11d6d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -106,7 +106,8 @@ struct nv50_program {
    struct nv50_stream_output_state *so;
 };
 
-bool nv50_program_translate(struct nv50_program *, uint16_t chipset);
+bool nv50_program_translate(struct nv50_program *, uint16_t chipset,
+                            struct pipe_debug_callback *);
 bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *);
 void nv50_program_destroy(struct nv50_context *, struct nv50_program *);
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
index a46e622c597..b40370a1d78 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
@@ -151,4 +151,11 @@ nv50_surface_from_buffer(struct pipe_context *pipe,
 void
 nv50_surface_destroy(struct pipe_context *, struct pipe_surface *);
 
+void
+nv50_clear_texture(struct pipe_context *pipe,
+                   struct pipe_resource *res,
+                   unsigned level,
+                   const struct pipe_box *box,
+                   const void *data);
+
 #endif /* __NV50_RESOURCE_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index a9e0c478322..f47e998ab1e 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -350,7 +351,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
        * _current_ one, and remove both.
        */
       nouveau_fence_ref(screen->base.fence.current, &current);
-      nouveau_fence_wait(current);
+      nouveau_fence_wait(current, NULL);
       nouveau_fence_ref(NULL, &current);
       nouveau_fence_ref(NULL, &screen->base.fence.current);
    }
@@ -392,7 +393,7 @@ nv50_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
    /* we need to do it after possible flush in MARK_RING */
    *sequence = ++screen->base.fence.sequence;
 
-   assert(PUSH_AVAIL(push) >= 5);
+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
    PUSH_DATA (push, NV50_FIFO_PKHDR(NV50_3D(QUERY_ADDRESS_HIGH), 4));
    PUSH_DATAh(push, screen->fence.bo->offset);
    PUSH_DATA (push, screen->fence.bo->offset);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 9b911043132..8e4b2b42bda 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -113,7 +113,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog)
 {
    if (!prog->translated) {
       prog->translated = nv50_program_translate(
-         prog, nv50->screen->base.device->chipset);
+         prog, nv50->screen->base.device->chipset, &nv50->base.debug);
       if (!prog->translated)
          return false;
    } else
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 6c8c9f0b4e6..d27f12ca94b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -727,7 +727,8 @@ nv50_sp_state_create(struct pipe_context *pipe,
       prog->pipe.stream_output = cso->stream_output;
 
    prog->translated = nv50_program_translate(
-         prog, nv50_context(pipe)->screen->base.device->chipset);
+         prog, nv50_context(pipe)->screen->base.device->chipset,
+         &nouveau_context(pipe)->debug);
 
    return (void *)prog;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 237d76d6adb..916a7d44a31 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -27,6 +27,7 @@
 #include "util/u_inlines.h"
 #include "util/u_pack_color.h"
 #include "util/u_format.h"
+#include "util/u_math.h"
 #include "util/u_surface.h"
 
 #include "tgsi/tgsi_ureg.h"
@@ -324,6 +325,9 @@ nv50_clear_render_target(struct pipe_context *pipe,
    else
       PUSH_DATA(push, 512);
 
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
+   PUSH_DATA (push, mt->ms_mode);
+
    if (!nouveau_bo_memtype(bo)) {
       BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
       PUSH_DATA (push, 0);
@@ -404,6 +408,9 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
    BEGIN_NV04(push, NV50_3D(RT_ARRAY_MODE), 1);
    PUSH_DATA (push, 512);
 
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
+   PUSH_DATA (push, mt->ms_mode);
+
    BEGIN_NV04(push, NV50_3D(VIEWPORT_HORIZ(0)), 2);
    PUSH_DATA (push, (width << 16) | dstx);
    PUSH_DATA (push, (height << 16) | dsty);
@@ -418,6 +425,80 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
 }
 
 void
+nv50_clear_texture(struct pipe_context *pipe,
+                   struct pipe_resource *res,
+                   unsigned level,
+                   const struct pipe_box *box,
+                   const void *data)
+{
+   struct pipe_surface tmpl = {{0}}, *sf;
+
+   tmpl.format = res->format;
+   tmpl.u.tex.first_layer = box->z;
+   tmpl.u.tex.last_layer = box->z + box->depth - 1;
+   tmpl.u.tex.level = level;
+   sf = pipe->create_surface(pipe, res, &tmpl);
+   if (!sf)
+      return;
+
+   if (util_format_is_depth_or_stencil(res->format)) {
+      float depth = 0;
+      uint8_t stencil = 0;
+      unsigned clear = 0;
+      const struct util_format_description *desc =
+         util_format_description(res->format);
+
+      if (util_format_has_depth(desc)) {
+         clear |= PIPE_CLEAR_DEPTH;
+         desc->unpack_z_float(&depth, 0, data, 0, 1, 1);
+      }
+      if (util_format_has_stencil(desc)) {
+         clear |= PIPE_CLEAR_STENCIL;
+         desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
+      }
+      pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil,
+                                box->x, box->y, box->width, box->height);
+   } else {
+      union pipe_color_union color;
+
+      switch (util_format_get_blocksizebits(res->format)) {
+      case 128:
+         sf->format = PIPE_FORMAT_R32G32B32A32_UINT;
+         memcpy(&color.ui, data, 128 / 8);
+         break;
+      case 64:
+         sf->format = PIPE_FORMAT_R32G32_UINT;
+         memcpy(&color.ui, data, 64 / 8);
+         memset(&color.ui[2], 0, 64 / 8);
+         break;
+      case 32:
+         sf->format = PIPE_FORMAT_R32_UINT;
+         memcpy(&color.ui, data, 32 / 8);
+         memset(&color.ui[1], 0, 96 / 8);
+         break;
+      case 16:
+         sf->format = PIPE_FORMAT_R16_UINT;
+         color.ui[0] = util_cpu_to_le32(
+            util_le16_to_cpu(*(unsigned short *)data));
+         memset(&color.ui[1], 0, 96 / 8);
+         break;
+      case 8:
+         sf->format = PIPE_FORMAT_R8_UINT;
+         color.ui[0] = util_cpu_to_le32(*(unsigned char *)data);
+         memset(&color.ui[1], 0, 96 / 8);
+         break;
+      default:
+         assert(!"Unknown texel element size");
+         return;
+      }
+
+      pipe->clear_render_target(pipe, sf, &color,
+                                box->x, box->y, box->width, box->height);
+   }
+   pipe->surface_destroy(pipe, sf);
+}
+
+void
 nv50_clear(struct pipe_context *pipe, unsigned buffers,
            const union pipe_color_union *color,
            double depth, unsigned stencil)
@@ -464,11 +545,9 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
    if (mode) {
       int zs_layers = 0, color0_layers = 0;
       if (fb->cbufs[0] && (mode & 0x3c))
-         color0_layers = fb->cbufs[0]->u.tex.last_layer -
-            fb->cbufs[0]->u.tex.first_layer + 1;
+         color0_layers = nv50_surface(fb->cbufs[0])->depth;
       if (fb->zsbuf && (mode & ~0x3c))
-         zs_layers = fb->zsbuf->u.tex.last_layer -
-            fb->zsbuf->u.tex.first_layer + 1;
+         zs_layers = nv50_surface(fb->zsbuf)->depth;
 
       for (j = 0; j < MIN2(zs_layers, color0_layers); j++) {
          BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1);
@@ -488,7 +567,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
       struct pipe_surface *sf = fb->cbufs[i];
       if (!sf || !(buffers & (PIPE_CLEAR_COLOR0 << i)))
          continue;
-      for (j = 0; j <= sf->u.tex.last_layer - sf->u.tex.first_layer; j++) {
+      for (j = 0; j < nv50_surface(sf)->depth; j++) {
          BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1);
          PUSH_DATA (push, (i << 6) | 0x3c |
                     (j << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
@@ -585,6 +664,8 @@ nv50_clear_buffer(struct pipe_context *pipe,
    PUSH_DATA (push, height);
    BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
    PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
+   PUSH_DATA (push, 0);
 
    /* NOTE: only works with D3D clear flag (5097/0x143c bit 4) */
 
@@ -1593,6 +1674,7 @@ nv50_init_surface_functions(struct nv50_context *nv50)
    pipe->resource_copy_region = nv50_resource_copy_region;
    pipe->blit = nv50_blit;
    pipe->flush_resource = nv50_flush_resource;
+   pipe->clear_texture = nv50_clear_texture;
    pipe->clear_render_target = nv50_clear_render_target;
    pipe->clear_depth_stencil = nv50_clear_depth_stencil;
    pipe->clear_buffer = nv50_clear_buffer;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 9fa6fceeefa..9aa593f919e 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -636,7 +636,7 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
        * pushbuf submit, but it's probably not a big performance difference.
        */
       if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
-         nouveau_fence_wait(buf->fence_wr);
+         nouveau_fence_wait(buf->fence_wr, &nv50->base.debug);
 
       while (instance_count--) {
          BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index e33af042620..2e7c790e9ee 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -120,7 +120,7 @@ nvc0_compute_validate_program(struct nvc0_context *nvc0)
 
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
-         prog, nvc0->screen->base.device->chipset);
+         prog, nvc0->screen->base.device->chipset, &nvc0->base.debug);
       if (!prog->translated)
          return false;
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index f7604f11788..82ed5a1864e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -309,6 +309,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    pipe->memory_barrier = nvc0_memory_barrier;
    pipe->get_sample_position = nvc0_context_get_sample_position;
 
+   nouveau_context_init(&nvc0->base);
    nvc0_init_query_functions(nvc0);
    nvc0_init_surface_functions(nvc0);
    nvc0_init_state_functions(nvc0);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 4af83c53224..39b73ecb0c2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -224,7 +224,8 @@ void nvc0_default_kick_notify(struct nouveau_pushbuf *);
 extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *);
 
 /* nvc0_program.c */
-bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
+bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset,
+                            struct pipe_debug_callback *);
 bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_library_upload(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 68048f9d6c0..43d7c7b1123 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -517,7 +517,8 @@ nvc0_program_dump(struct nvc0_program *prog)
 #endif
 
 bool
-nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
+nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
+                       struct pipe_debug_callback *debug)
 {
    struct nv50_ir_prog_info *info;
    int ret;
@@ -639,6 +640,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
       prog->tfb = nvc0_program_create_tfb_state(info,
                                                 &prog->pipe.stream_output);
 
+   pipe_debug_message(debug, SHADER_INFO,
+                      "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
+                      prog->type, info->bin.tlsSpace, prog->num_gprs,
+                      info->bin.instructions, info->bin.codeSize);
+
 out:
    FREE(info);
    return !ret;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 6ad3980911d..461fcaaf677 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -182,11 +182,12 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_COMPUTE:
-      return (class_3d == NVE4_3D_CLASS) ? 1 : 0;
+      return (class_3d <= NVE4_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
       return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
 
@@ -245,7 +246,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
          return 0;
       break;
    case PIPE_SHADER_COMPUTE:
-      if (class_3d != NVE4_3D_CLASS)
+      if (class_3d > NVE4_3D_CLASS)
          return 0;
       break;
    default:
@@ -415,7 +416,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
        * _current_ one, and remove both.
        */
       nouveau_fence_ref(screen->base.fence.current, &current);
-      nouveau_fence_wait(current);
+      nouveau_fence_wait(current, NULL);
       nouveau_fence_ref(NULL, &current);
       nouveau_fence_ref(NULL, &screen->base.fence.current);
    }
@@ -547,7 +548,7 @@ nvc0_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
    /* we need to do it after possible flush in MARK_RING */
    *sequence = ++screen->base.fence.sequence;
 
-   assert(PUSH_AVAIL(push) >= 5);
+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
    PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(NVC0_3D(QUERY_ADDRESS_HIGH), 4));
    PUSH_DATAh(push, screen->fence.bo->offset);
    PUSH_DATA (push, screen->fence.bo->offset);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 8595800592c..7e2e9992fe8 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -72,7 +72,7 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
 
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
-         prog, nvc0->screen->base.device->chipset);
+         prog, nvc0->screen->base.device->chipset, &nvc0->base.debug);
       if (!prog->translated)
          return false;
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index ba1714da010..5dce5f0e65d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -681,7 +681,8 @@ nvc0_sp_state_create(struct pipe_context *pipe,
       prog->pipe.stream_output = cso->stream_output;
 
    prog->translated = nvc0_program_translate(
-      prog, nvc0_context(pipe)->screen->base.device->chipset);
+      prog, nvc0_context(pipe)->screen->base.device->chipset,
+      &nouveau_context(pipe)->debug);
 
    return (void *)prog;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index be123349148..cdb1fc1145f 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -67,7 +67,7 @@ nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
    case 1:
       return NV50_SURFACE_FORMAT_R8_UNORM;
    case 2:
-      return NV50_SURFACE_FORMAT_R16_UNORM;
+      return NV50_SURFACE_FORMAT_RG8_UNORM;
    case 4:
       return NV50_SURFACE_FORMAT_BGRA8_UNORM;
    case 8:
@@ -319,6 +319,7 @@ nvc0_clear_render_target(struct pipe_context *pipe,
       PUSH_DATA(push, dst->u.tex.first_layer + sf->depth);
       PUSH_DATA(push, mt->layer_stride >> 2);
       PUSH_DATA(push, dst->u.tex.first_layer);
+      IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode);
    } else {
       if (res->base.target == PIPE_BUFFER) {
          PUSH_DATA(push, 262144);
@@ -334,6 +335,7 @@ nvc0_clear_render_target(struct pipe_context *pipe,
       PUSH_DATA(push, 0);
 
       IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0);
+      IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0);
 
       /* tiled textures don't have to be fenced, they're not mapped directly */
       nvc0_resource_fence(res, NOUVEAU_BO_WR);
@@ -466,6 +468,7 @@ nvc0_clear_buffer(struct pipe_context *pipe,
    PUSH_DATA (push, 0);
 
    IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0);
 
    IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
 
@@ -540,6 +543,7 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe,
    PUSH_DATA (push, (unk << 16) | (dst->u.tex.first_layer + sf->depth));
    BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1);
    PUSH_DATA (push, dst->u.tex.first_layer);
+   IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode);
 
    BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
    for (z = 0; z < sf->depth; ++z) {
@@ -1541,5 +1545,6 @@ nvc0_init_surface_functions(struct nvc0_context *nvc0)
    pipe->flush_resource = nvc0_flush_resource;
    pipe->clear_render_target = nvc0_clear_render_target;
    pipe->clear_depth_stencil = nvc0_clear_depth_stencil;
+   pipe->clear_texture = nv50_clear_texture;
    pipe->clear_buffer = nvc0_clear_buffer;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
index d459dd61c19..279c7e93cc8 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -340,8 +340,8 @@ nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage)
       return !nouveau_bo_wait(mt->base.bo, access, nvc0->base.client);
    }
    if (usage & PIPE_TRANSFER_WRITE)
-      return !mt->base.fence || nouveau_fence_wait(mt->base.fence);
-   return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr);
+      return !mt->base.fence || nouveau_fence_wait(mt->base.fence, &nvc0->base.debug);
+   return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr, &nvc0->base.debug);
 }
 
 void *
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index d5981248a86..606e25f915b 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -199,6 +199,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
         case PIPE_CAP_SHAREABLE_SHADERS:
         case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+        case PIPE_CAP_CLEAR_TEXTURE:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 6f2b7ba0db3..5743e3fe538 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -346,7 +346,7 @@ static void evergreen_emit_direct_dispatch(
 		const uint *block_layout, const uint *grid_layout)
 {
 	int i;
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 	unsigned num_waves;
 	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
@@ -417,12 +417,12 @@ static void evergreen_emit_direct_dispatch(
 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 		const uint *grid_layout)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
 	unsigned i;
 
 	/* make sure that the gfx ring is only one active */
-	if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) {
-		ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+	if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
+		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 
 	/* Initialize all the compute-related registers.
@@ -439,7 +439,7 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
 	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
 		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
-		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.gfx,
+		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
 						       (struct r600_resource*)cb->base.texture,
 						       RADEON_USAGE_READWRITE,
 						       RADEON_PRIO_SHADER_RW_BUFFER);
@@ -538,7 +538,7 @@ void evergreen_emit_cs_shader(
 	struct r600_cs_shader_state *state =
 					(struct r600_cs_shader_state*)atom;
 	struct r600_pipe_compute *shader = state->shader;
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint64_t va;
 	struct r600_resource *code_bo;
 	unsigned ngpr, nstack;
@@ -564,7 +564,7 @@ void evergreen_emit_cs_shader(
 	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
 
 	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
-	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 					      code_bo, RADEON_USAGE_READ,
 					      RADEON_PRIO_USER_SHADER));
 }
diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index 89abe92cbb4..a0f46800403 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -35,7 +35,7 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
 			       uint64_t src_offset,
 			       uint64_t size)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
 	unsigned i, ncopy, csize, sub_cmd, shift;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -64,9 +64,9 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
 	for (i = 0; i < ncopy; i++) {
 		csize = size < EG_DMA_COPY_MAX_SIZE ? size : EG_DMA_COPY_MAX_SIZE;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ,
 				      RADEON_PRIO_SDMA_BUFFER);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE,
 				      RADEON_PRIO_SDMA_BUFFER);
 		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize);
 		cs->buf[cs->cdw++] = dst_offset & 0xffffffff;
@@ -86,7 +86,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 				   struct pipe_resource *dst, uint64_t offset,
 				   unsigned size, uint32_t clear_value)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 
 	assert(size);
 	assert(rctx->screen->b.has_cp_dma);
@@ -129,7 +129,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 		}
 
 		/* This must be done after r600_need_cs_space. */
-		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 					      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
 					      RADEON_PRIO_CP_DMA);
 
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index c6702a9ca34..684eee7a355 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -666,6 +666,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 	enum pipe_format pipe_format = state->format;
 	struct radeon_surf_level *surflevel;
 	unsigned base_level, first_level, last_level;
+	unsigned dim, last_layer;
 	uint64_t va;
 
 	if (view == NULL)
@@ -679,7 +680,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 	view->base.reference.count = 1;
 	view->base.context = ctx;
 
-	if (texture->target == PIPE_BUFFER)
+	if (state->target == PIPE_BUFFER)
 		return texture_buffer_sampler_view(rctx, view, width0, height0);
 
 	swizzle[0] = state->swizzle_r;
@@ -773,12 +774,12 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 	}
 	nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks);
 
-	if (texture->target == PIPE_TEXTURE_1D_ARRAY) {
+	if (state->target == PIPE_TEXTURE_1D_ARRAY) {
 	        height = 1;
 		depth = texture->array_size;
-	} else if (texture->target == PIPE_TEXTURE_2D_ARRAY) {
+	} else if (state->target == PIPE_TEXTURE_2D_ARRAY) {
 		depth = texture->array_size;
-	} else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY)
+	} else if (state->target == PIPE_TEXTURE_CUBE_ARRAY)
 		depth = texture->array_size / 6;
 
 	va = tmp->resource.gpu_address;
@@ -790,7 +791,13 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 		view->is_stencil_sampler = true;
 
 	view->tex_resource = &tmp->resource;
-	view->tex_resource_words[0] = (S_030000_DIM(r600_tex_dim(texture->target, texture->nr_samples)) |
+
+	/* array type views and views into array types need to use layer offset */
+	dim = state->target;
+	if (state->target != PIPE_TEXTURE_CUBE)
+		dim = MAX2(state->target, texture->target);
+
+	view->tex_resource_words[0] = (S_030000_DIM(r600_tex_dim(dim, texture->nr_samples)) |
 				       S_030000_PITCH((pitch / 8) - 1) |
 				       S_030000_TEX_WIDTH(width - 1));
 	if (rscreen->b.chip_class == CAYMAN)
@@ -818,10 +825,14 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 		view->tex_resource_words[3] = (surflevel[base_level].offset + va) >> 8;
 	}
 
+	last_layer = state->u.tex.last_layer;
+	if (state->target != texture->target && depth == 1) {
+		last_layer = state->u.tex.first_layer;
+	}
 	view->tex_resource_words[4] = (word4 |
 				       S_030010_ENDIAN_SWAP(endian));
 	view->tex_resource_words[5] = S_030014_BASE_ARRAY(state->u.tex.first_layer) |
-				      S_030014_LAST_ARRAY(state->u.tex.last_layer);
+				      S_030014_LAST_ARRAY(last_layer);
 	view->tex_resource_words[6] = S_030018_TILE_SPLIT(tile_split);
 
 	if (texture->nr_samples > 1) {
@@ -860,7 +871,7 @@ evergreen_create_sampler_view(struct pipe_context *ctx,
 
 static void evergreen_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_clip_state *state = &rctx->clip_state.state;
 
 	radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP0_X, 6*4);
@@ -910,7 +921,7 @@ static void evergreen_set_scissor_states(struct pipe_context *ctx,
 
 static void evergreen_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_scissor_state *rstate = &rctx->scissor;
 	struct pipe_scissor_state *state;
 	uint32_t dirty_mask;
@@ -1514,7 +1525,7 @@ static void evergreen_get_sample_position(struct pipe_context *ctx,
 static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples, int ps_iter_samples)
 {
 
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned max_dist = 0;
 
 	switch (nr_samples) {
@@ -1555,7 +1566,7 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples,
 
 static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_framebuffer_state *state = &rctx->framebuffer.state;
 	unsigned nr_cbufs = state->nr_cbufs;
 	unsigned i, tl, br;
@@ -1580,7 +1591,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 
 		tex = (struct r600_texture *)cb->base.texture;
 		reloc = radeon_add_to_buffer_list(&rctx->b,
-					      &rctx->b.rings.gfx,
+					      &rctx->b.gfx,
 					      (struct r600_resource*)cb->base.texture,
 					      RADEON_USAGE_READWRITE,
 					      tex->surface.nsamples > 1 ?
@@ -1588,7 +1599,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 						      RADEON_PRIO_COLOR_BUFFER);
 
 		if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) {
-			cmask_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+			cmask_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 				tex->cmask_buffer, RADEON_USAGE_READWRITE,
 				RADEON_PRIO_CMASK);
 		} else {
@@ -1634,7 +1645,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 
 		if (!rctx->keep_tiling_flags) {
 			unsigned reloc = radeon_add_to_buffer_list(&rctx->b,
-							       &rctx->b.rings.gfx,
+							       &rctx->b.gfx,
 							       (struct r600_resource*)state->cbufs[0]->texture,
 							       RADEON_USAGE_READWRITE,
 							       RADEON_PRIO_COLOR_BUFFER);
@@ -1657,7 +1668,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 	if (state->zsbuf) {
 		struct r600_surface *zb = (struct r600_surface*)state->zsbuf;
 		unsigned reloc = radeon_add_to_buffer_list(&rctx->b,
-						       &rctx->b.rings.gfx,
+						       &rctx->b.gfx,
 						       (struct r600_resource*)state->zsbuf->texture,
 						       RADEON_USAGE_READWRITE,
 						       zb->base.texture->nr_samples > 1 ?
@@ -1719,7 +1730,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 
 static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a;
 	float offset_units = state->offset_units;
 	float offset_scale = state->offset_scale;
@@ -1746,7 +1757,7 @@ static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600
 
 static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom;
 	unsigned fb_colormask = (1ULL << ((unsigned)a->nr_cbufs * 4)) - 1;
 	unsigned ps_colormask = (1ULL << ((unsigned)a->nr_ps_color_outputs * 4)) - 1;
@@ -1761,7 +1772,7 @@ static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_
 
 static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_db_state *a = (struct r600_db_state*)atom;
 
 	if (a->rsurf && a->rsurf->db_htile_surface) {
@@ -1772,7 +1783,7 @@ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom
 		radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, a->rsurf->db_htile_surface);
 		radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, a->rsurf->db_preload_control);
 		radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base);
-		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer,
+		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer,
 						  RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE);
 		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
 		cs->buf[cs->cdw++] = reloc_idx;
@@ -1784,7 +1795,7 @@ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom
 
 static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_db_misc_state *a = (struct r600_db_misc_state*)atom;
 	unsigned db_render_control = 0;
 	unsigned db_count_control = 0;
@@ -1851,7 +1862,7 @@ static void evergreen_emit_vertex_buffers(struct r600_context *rctx,
 					  unsigned resource_offset,
 					  unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1886,7 +1897,7 @@ static void evergreen_emit_vertex_buffers(struct r600_context *rctx,
 		radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD7 */
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER));
 	}
 	state->dirty_mask = 0;
@@ -1910,7 +1921,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 					    unsigned reg_alu_const_cache,
 					    unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1934,7 +1945,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 		}
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
 
 		radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags);
@@ -1959,7 +1970,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 			    S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER));
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
 
 		dirty_mask &= ~(1 << buffer_index);
@@ -2007,7 +2018,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 					 struct r600_samplerview_state *state,
 					 unsigned resource_id_base, unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -2022,7 +2033,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 		radeon_emit(cs, (resource_id_base + resource_index) * 8);
 		radeon_emit_array(cs, rview->tex_resource_words, 8);
 
-		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rview->tex_resource,
+		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rview->tex_resource,
 					      RADEON_USAGE_READ,
 					      r600_get_sampler_view_priority(rview->tex_resource));
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
@@ -2066,7 +2077,7 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx,
 				unsigned border_index_reg,
 				unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = texinfo->states.dirty_mask;
 
 	while (dirty_mask) {
@@ -2119,14 +2130,14 @@ static void evergreen_emit_sample_mask(struct r600_context *rctx, struct r600_at
 	struct r600_sample_mask *s = (struct r600_sample_mask*)a;
 	uint8_t mask = s->sample_mask;
 
-	radeon_set_context_reg(rctx->b.rings.gfx.cs, R_028C3C_PA_SC_AA_MASK,
+	radeon_set_context_reg(rctx->b.gfx.cs, R_028C3C_PA_SC_AA_MASK,
 			       mask | (mask << 8) | (mask << 16) | (mask << 24));
 }
 
 static void cayman_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a)
 {
 	struct r600_sample_mask *s = (struct r600_sample_mask*)a;
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint16_t mask = s->sample_mask;
 
 	radeon_set_context_reg_seq(cs, CM_R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
@@ -2136,21 +2147,21 @@ static void cayman_emit_sample_mask(struct r600_context *rctx, struct r600_atom
 
 static void evergreen_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_cso_state *state = (struct r600_cso_state*)a;
 	struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso;
 
 	radeon_set_context_reg(cs, R_0288A4_SQ_PGM_START_FS,
 			       (shader->buffer->gpu_address + shader->offset) >> 8);
 	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->buffer,
+	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->buffer,
                                                   RADEON_USAGE_READ,
                                                   RADEON_PRIO_INTERNAL_SHADER));
 }
 
 static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a;
 
 	uint32_t v = 0, v2 = 0, primid = 0;
@@ -2189,7 +2200,7 @@ static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_
 
 static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a;
 	struct r600_resource *rbuffer;
 
@@ -2202,7 +2213,7 @@ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom
 		radeon_set_config_reg(cs, R_008C40_SQ_ESGS_RING_BASE,
 				rbuffer->gpu_address >> 8);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READWRITE,
 						      RADEON_PRIO_RINGS_STREAMOUT));
 		radeon_set_config_reg(cs, R_008C44_SQ_ESGS_RING_SIZE,
@@ -2212,7 +2223,7 @@ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom
 		radeon_set_config_reg(cs, R_008C48_SQ_GSVS_RING_BASE,
 				rbuffer->gpu_address >> 8);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READWRITE,
 						      RADEON_PRIO_RINGS_STREAMOUT));
 		radeon_set_config_reg(cs, R_008C4C_SQ_GSVS_RING_SIZE,
@@ -2362,6 +2373,8 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
 
 	r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0);
 
 	/* to avoid GPU doing any preloading of constant from random address */
@@ -2801,6 +2814,8 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx)
 
 	r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0);
 
 	/* to avoid GPU doing any preloading of constant from random address */
@@ -2940,6 +2955,19 @@ void evergreen_update_ps_state(struct pipe_context *ctx, struct r600_pipe_shader
 	db_shader_control |= S_02880C_STENCIL_EXPORT_ENABLE(stencil_export);
 	db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(mask_export);
 
+	switch (rshader->ps_conservative_z) {
+	default: /* fall through */
+	case TGSI_FS_DEPTH_LAYOUT_ANY:
+		db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_ANY_Z);
+		break;
+	case TGSI_FS_DEPTH_LAYOUT_GREATER:
+		db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+		break;
+	case TGSI_FS_DEPTH_LAYOUT_LESS:
+		db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+		break;
+	}
+
 	exports_ps = 0;
 	for (i = 0; i < rshader->noutput; i++) {
 		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION ||
@@ -3246,7 +3274,7 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
 				unsigned pitch,
 				unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
 	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
@@ -3334,9 +3362,9 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
 		}
 		size = (cheight * pitch) / 4;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rsrc->resource,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource,
 				      RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rdst->resource,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource,
 				      RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, size);
 		cs->buf[cs->cdw++] = base >> 8;
@@ -3371,7 +3399,7 @@ static void evergreen_dma_copy(struct pipe_context *ctx,
 	unsigned src_x, src_y;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (rctx->b.rings.dma.cs == NULL) {
+	if (rctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
@@ -3515,6 +3543,7 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->viewport.atom, id++, r600_emit_viewport_state, 0);
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5);
+	r600_add_atom(rctx, &rctx->b.render_cond_atom, id++);
 	r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++);
 	r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++);
 	r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23);
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 937ffcbddb9..25237c6f650 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -815,6 +815,13 @@
 #define     V_02880C_EXPORT_DB_FOUR16                  0x01
 #define     V_02880C_EXPORT_DB_TWO                     0x02
 #define   S_02880C_ALPHA_TO_MASK_DISABLE(x)            (((x) & 0x1) << 12)
+#define   S_02880C_CONSERVATIVE_Z_EXPORT(x)            (((x) & 0x03) << 16)
+#define   G_02880C_CONSERVATIVE_Z_EXPORT(x)            (((x) >> 16) & 0x03)
+#define   C_02880C_CONSERVATIVE_Z_EXPORT               0xFFFCFFFF
+#define     V_02880C_EXPORT_ANY_Z                      0
+#define     V_02880C_EXPORT_LESS_THAN_Z                1
+#define     V_02880C_EXPORT_GREATER_THAN_Z             2
+#define     V_02880C_EXPORT_RESERVED                   3
 
 #define R_028A00_PA_SU_POINT_SIZE                    0x028A00
 #define   S_028A00_HEIGHT(x)                           (((x) & 0xFFFF) << 0)
@@ -1497,6 +1504,7 @@
 #define   S_028878_UNCACHED_FIRST_INST(x)              (((x) & 0x1) << 28)
 #define   G_028878_UNCACHED_FIRST_INST(x)              (((x) >> 28) & 0x1)
 #define   C_028878_UNCACHED_FIRST_INST                 0xEFFFFFFF
+#define R_02887C_SQ_PGM_RESOURCES_2_GS                 0x02887C
 
 #define R_028890_SQ_PGM_RESOURCES_ES                 0x028890
 #define   S_028890_NUM_GPRS(x)                         (((x) & 0xFF) << 0)
@@ -1511,6 +1519,7 @@
 #define   S_028890_UNCACHED_FIRST_INST(x)              (((x) & 0x1) << 28)
 #define   G_028890_UNCACHED_FIRST_INST(x)              (((x) >> 28) & 0x1)
 #define   C_028890_UNCACHED_FIRST_INST                 0xEFFFFFFF
+#define R_028894_SQ_PGM_RESOURCES_2_ES                 0x028894
 
 #define R_028864_SQ_PGM_RESOURCES_2_VS               0x028864
 #define   S_028864_SINGLE_ROUND(x)                     (((x) & 0x3) << 0)
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index aede8408446..8a90489318e 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -87,18 +87,16 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op
 			(struct pipe_sampler_view**)rctx->samplers[PIPE_SHADER_FRAGMENT].views.views);
 	}
 
-	if ((op & R600_DISABLE_RENDER_COND) && rctx->b.current_render_cond) {
-           util_blitter_save_render_condition(rctx->blitter,
-                                              rctx->b.current_render_cond,
-                                              rctx->b.current_render_cond_cond,
-                                              rctx->b.current_render_cond_mode);
-        }
+	if (op & R600_DISABLE_RENDER_COND)
+		rctx->b.render_cond_force_off = true;
 }
 
 static void r600_blitter_end(struct pipe_context *ctx)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-        r600_resume_nontimer_queries(&rctx->b);
+
+	rctx->b.render_cond_force_off = false;
+	r600_resume_nontimer_queries(&rctx->b);
 }
 
 static unsigned u_max_sample(struct pipe_resource *r)
@@ -527,7 +525,7 @@ static void r600_copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst
 	 * Can we somehow flush the index buffer cache? Starting a new IB seems
 	 * to do the trick. */
 	if (rctx->b.chip_class <= R700)
-		rctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		rctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 }
 
 /**
@@ -604,6 +602,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds
 	} else {
 		uint32_t *map = r600_buffer_map_sync_with_rings(&rctx->b, r600_resource(dst),
 								 PIPE_TRANSFER_WRITE);
+		map += offset / 4;
 		size /= 4;
 		for (unsigned i = 0; i < size; i++)
 			*map++ = value;
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 6f11366e606..6409f0bd9f7 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -33,11 +33,16 @@
 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 			boolean count_draw_in)
 {
+	struct radeon_winsys_cs *dma = ctx->b.dma.cs;
 
-	if (!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, ctx->b.vram, ctx->b.gtt)) {
+	/* Flush the DMA IB if it's not empty. */
+	if (dma && dma->cdw)
+		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+
+	if (!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs, ctx->b.vram, ctx->b.gtt)) {
 		ctx->b.gtt = 0;
 		ctx->b.vram = 0;
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 		return;
 	}
 	/* all will be accounted once relocation are emited */
@@ -45,7 +50,7 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	ctx->b.vram = 0;
 
 	/* The number of dwords we already used in the CS so far. */
-	num_dw += ctx->b.rings.gfx.cs->cdw;
+	num_dw += ctx->b.gfx.cs->cdw;
 
 	if (count_draw_in) {
 		uint64_t mask;
@@ -75,11 +80,6 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 		num_dw += ctx->b.streamout.num_dw_for_end;
 	}
 
-	/* Count in render_condition(NULL) at the end of CS. */
-	if (ctx->b.predicate_drawing) {
-		num_dw += 3;
-	}
-
 	/* SX_MISC */
 	if (ctx->b.chip_class == R600) {
 		num_dw += 3;
@@ -92,14 +92,14 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	num_dw += 10;
 
 	/* Flush if there's not enough space. */
-	if (num_dw > ctx->b.rings.gfx.cs->max_dw) {
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+	if (num_dw > ctx->b.gfx.cs->max_dw) {
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 }
 
 void r600_flush_emit(struct r600_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned cp_coher_cntl = 0;
 	unsigned wait_until = 0;
 
@@ -246,13 +246,11 @@ void r600_context_gfx_flush(void *context, unsigned flags,
 			    struct pipe_fence_handle **fence)
 {
 	struct r600_context *ctx = context;
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
 
 	if (cs->cdw == ctx->b.initial_gfx_cs_size && !fence)
 		return;
 
-	ctx->b.rings.gfx.flushing = true;
-
 	r600_preflush_suspend_features(&ctx->b);
 
 	/* flush the framebuffer cache */
@@ -278,7 +276,6 @@ void r600_context_gfx_flush(void *context, unsigned flags,
 
 	/* Flush the CS. */
 	ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++);
-	ctx->b.rings.gfx.flushing = false;
 
 	r600_begin_new_cs(ctx);
 }
@@ -292,7 +289,7 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	ctx->b.vram = 0;
 
 	/* Begin a new CS. */
-	r600_emit_command_buffer(ctx->b.rings.gfx.cs, &ctx->start_cs_cmd);
+	r600_emit_command_buffer(ctx->b.gfx.cs, &ctx->start_cs_cmd);
 
 	/* Re-emit states. */
 	r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom);
@@ -326,6 +323,7 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	}
 	r600_mark_atom_dirty(ctx, &ctx->vertex_shader.atom);
 	r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
+	r600_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 
 	if (ctx->blend_state.cso)
 		r600_mark_atom_dirty(ctx, &ctx->blend_state.atom);
@@ -361,7 +359,7 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	ctx->last_primitive_type = -1;
 	ctx->last_start_instance = -1;
 
-	ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
+	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw;
 }
 
 /* The max number of bytes to copy per packet. */
@@ -372,7 +370,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 			     struct pipe_resource *src, uint64_t src_offset,
 			     unsigned size)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 
 	assert(size);
 	assert(rctx->screen->b.has_cp_dma);
@@ -418,9 +416,9 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 		}
 
 		/* This must be done after r600_need_cs_space. */
-		src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, (struct r600_resource*)src,
+		src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src,
 						  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
-		dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, (struct r600_resource*)dst,
+		dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst,
 						  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
 
 		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
@@ -453,7 +451,7 @@ void r600_dma_copy_buffer(struct r600_context *rctx,
 			  uint64_t src_offset,
 			  uint64_t size)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -471,9 +469,9 @@ void r600_dma_copy_buffer(struct r600_context *rctx,
 	for (i = 0; i < ncopy; i++) {
 		csize = size < R600_DMA_COPY_MAX_SIZE_DW ? size : R600_DMA_COPY_MAX_SIZE_DW;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ,
 				      RADEON_PRIO_SDMA_BUFFER);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE,
 				      RADEON_PRIO_SDMA_BUFFER);
 		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize);
 		cs->buf[cs->cdw++] = dst_offset & 0xfffffffc;
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 9f4cda2c142..bd00dcb642c 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -178,11 +178,11 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen,
 		goto fail;
 	}
 
-	rctx->b.rings.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
-					     r600_context_gfx_flush, rctx,
-					     rscreen->b.trace_bo ?
-						     rscreen->b.trace_bo->cs_buf : NULL);
-	rctx->b.rings.gfx.flush = r600_context_gfx_flush;
+	rctx->b.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
+				       r600_context_gfx_flush, rctx,
+				       rscreen->b.trace_bo ?
+					       rscreen->b.trace_bo->cs_buf : NULL);
+	rctx->b.gfx.flush = r600_context_gfx_flush;
 
 	rctx->allocator_fetch_shader = u_suballocator_create(&rctx->b.b, 64 * 1024, 256,
 							     0, PIPE_USAGE_DEFAULT, FALSE);
@@ -323,6 +323,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_SM5:
 	case PIPE_CAP_TEXTURE_QUERY_LOD:
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 		return family >= CHIP_CEDAR ? 1 : 0;
 	case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
 		return family >= CHIP_CEDAR ? 4 : 0;
@@ -338,13 +339,13 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+	case PIPE_CAP_CLEAR_TEXTURE:
 		return 0;
 
 	/* Stream output. */
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 520b03f605d..bbb55adef82 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -38,7 +38,7 @@
 
 #include "tgsi/tgsi_scan.h"
 
-#define R600_NUM_ATOMS 42
+#define R600_NUM_ATOMS 43
 
 #define R600_MAX_VIEWPORTS 16
 
@@ -116,6 +116,7 @@ struct r600_db_misc_state {
 	unsigned			log_samples;
 	unsigned			db_shader_control;
 	bool				htile_clear;
+	uint8_t				ps_conservative_z;
 };
 
 struct r600_cb_misc_state {
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index fc6335ae8bc..560197c82b5 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -2044,6 +2044,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 
 	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
 	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
 
 	if (shader->vs_as_gs_a)
 		vs_add_primid_output(&ctx, key.vs.prim_id_out);
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index c240e7110c1..2040f732bf5 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -76,6 +76,8 @@ struct r600_shader {
 	boolean			uses_tex_buffers;
 	boolean                 gs_prim_id_input;
 
+	uint8_t			ps_conservative_z;
+
 	/* Size in bytes of a data item in the ring(s) (single vertex data).
 	   Stages with only one ring items 123 will be set to 0. */
 	unsigned		ring_item_sizes[4];
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 1be3e1b4de5..c2d4abc5ea1 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -244,7 +244,7 @@ boolean r600_is_format_supported(struct pipe_screen *screen,
 
 static void r600_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a;
 	float offset_units = state->offset_units;
 	float offset_scale = state->offset_scale;
@@ -760,7 +760,7 @@ r600_create_sampler_view(struct pipe_context *ctx,
 
 static void r600_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_clip_state *state = &rctx->clip_state.state;
 
 	radeon_set_context_reg_seq(cs, R_028E20_PA_CL_UCP0_X, 6*4);
@@ -774,7 +774,7 @@ static void r600_set_polygon_stipple(struct pipe_context *ctx,
 
 static void r600_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_scissor_state *rstate = &rctx->scissor;
 	struct pipe_scissor_state *state;
 	bool do_disable_workaround = false;
@@ -1334,7 +1334,7 @@ static void r600_get_sample_position(struct pipe_context *ctx,
 
 static void r600_emit_msaa_state(struct r600_context *rctx, int nr_samples)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned max_dist = 0;
 
 	if (rctx->b.family == CHIP_R600) {
@@ -1401,7 +1401,7 @@ static void r600_emit_msaa_state(struct r600_context *rctx, int nr_samples)
 
 static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_framebuffer_state *state = &rctx->framebuffer.state;
 	unsigned nr_cbufs = state->nr_cbufs;
 	struct r600_surface **cb = (struct r600_surface**)&state->cbufs[0];
@@ -1432,7 +1432,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
 			radeon_set_context_reg(cs, R_028040_CB_COLOR0_BASE + i*4, cb[i]->cb_color_base);
 
 			reloc = radeon_add_to_buffer_list(&rctx->b,
-						      &rctx->b.rings.gfx,
+						      &rctx->b.gfx,
 						      (struct r600_resource*)cb[i]->base.texture,
 						      RADEON_USAGE_READWRITE,
 						      cb[i]->base.texture->nr_samples > 1 ?
@@ -1445,7 +1445,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
 			radeon_set_context_reg(cs, R_0280E0_CB_COLOR0_FRAG + i*4, cb[i]->cb_color_fmask);
 
 			reloc = radeon_add_to_buffer_list(&rctx->b,
-						      &rctx->b.rings.gfx,
+						      &rctx->b.gfx,
 						      cb[i]->cb_buffer_fmask,
 						      RADEON_USAGE_READWRITE,
 						      cb[i]->base.texture->nr_samples > 1 ?
@@ -1458,7 +1458,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
 			radeon_set_context_reg(cs, R_0280C0_CB_COLOR0_TILE + i*4, cb[i]->cb_color_cmask);
 
 			reloc = radeon_add_to_buffer_list(&rctx->b,
-						      &rctx->b.rings.gfx,
+						      &rctx->b.gfx,
 						      cb[i]->cb_buffer_cmask,
 						      RADEON_USAGE_READWRITE,
 						      cb[i]->base.texture->nr_samples > 1 ?
@@ -1497,7 +1497,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
 	if (state->zsbuf) {
 		struct r600_surface *surf = (struct r600_surface*)state->zsbuf;
 		unsigned reloc = radeon_add_to_buffer_list(&rctx->b,
-						       &rctx->b.rings.gfx,
+						       &rctx->b.gfx,
 						       (struct r600_resource*)state->zsbuf->texture,
 						       RADEON_USAGE_READWRITE,
 						       surf->base.texture->nr_samples > 1 ?
@@ -1570,7 +1570,7 @@ static void r600_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 
 static void r600_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom;
 
 	if (G_028808_SPECIAL_OP(a->cb_color_control) == V_028808_SPECIAL_RESOLVE_BOX) {
@@ -1600,7 +1600,7 @@ static void r600_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom
 
 static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_db_state *a = (struct r600_db_state*)atom;
 
 	if (a->rsurf && a->rsurf->db_htile_surface) {
@@ -1610,7 +1610,7 @@ static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom
 		radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(rtex->depth_clear_value));
 		radeon_set_context_reg(cs, R_028D24_DB_HTILE_SURFACE, a->rsurf->db_htile_surface);
 		radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base);
-		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer,
+		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer,
 						  RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE);
 		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
 		cs->buf[cs->cdw++] = reloc_idx;
@@ -1621,13 +1621,28 @@ static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom
 
 static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_db_misc_state *a = (struct r600_db_misc_state*)atom;
 	unsigned db_render_control = 0;
 	unsigned db_render_override =
 		S_028D10_FORCE_HIS_ENABLE0(V_028D10_FORCE_DISABLE) |
 		S_028D10_FORCE_HIS_ENABLE1(V_028D10_FORCE_DISABLE);
 
+	if (rctx->b.chip_class >= R700) {
+		switch (a->ps_conservative_z) {
+		default: /* fall through */
+		case TGSI_FS_DEPTH_LAYOUT_ANY:
+			db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_ANY_Z);
+			break;
+		case TGSI_FS_DEPTH_LAYOUT_GREATER:
+			db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_GREATER_THAN_Z);
+			break;
+		case TGSI_FS_DEPTH_LAYOUT_LESS:
+			db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_LESS_THAN_Z);
+			break;
+		}
+	}
+
 	if (a->occlusion_query_enabled) {
 		if (rctx->b.chip_class >= R700) {
 			db_render_control |= S_028D0C_R700_PERFECT_ZPASS_COUNTS(1);
@@ -1687,7 +1702,7 @@ static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom
 
 static void r600_emit_config_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_config_state *a = (struct r600_config_state*)atom;
 
 	radeon_set_config_reg(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, a->sq_gpr_resource_mgmt_1);
@@ -1696,7 +1711,7 @@ static void r600_emit_config_state(struct r600_context *rctx, struct r600_atom *
 
 static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = rctx->vertex_buffer_state.dirty_mask;
 
 	while (dirty_mask) {
@@ -1725,7 +1740,7 @@ static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom
 		radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER));
 	}
 }
@@ -1736,7 +1751,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 				       unsigned reg_alu_constbuf_size,
 				       unsigned reg_alu_const_cache)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1758,7 +1773,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 		}
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
 
 		radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 7, 0));
@@ -1774,7 +1789,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 		radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
 
 		dirty_mask &= ~(1 << buffer_index);
@@ -1810,7 +1825,7 @@ static void r600_emit_sampler_views(struct r600_context *rctx,
 				    struct r600_samplerview_state *state,
 				    unsigned resource_id_base)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1825,7 +1840,7 @@ static void r600_emit_sampler_views(struct r600_context *rctx,
 		radeon_emit(cs, (resource_id_base + resource_index) * 7);
 		radeon_emit_array(cs, rview->tex_resource_words, 7);
 
-		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rview->tex_resource,
+		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rview->tex_resource,
 					      RADEON_USAGE_READ,
 					      r600_get_sampler_view_priority(rview->tex_resource));
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
@@ -1857,7 +1872,7 @@ static void r600_emit_sampler_states(struct r600_context *rctx,
 				unsigned resource_id_base,
 				unsigned border_color_reg)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = texinfo->states.dirty_mask;
 
 	while (dirty_mask) {
@@ -1918,7 +1933,7 @@ static void r600_emit_ps_sampler_states(struct r600_context *rctx, struct r600_a
 
 static void r600_emit_seamless_cube_map(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned tmp;
 
 	tmp = S_009508_DISABLE_CUBE_ANISO(1) |
@@ -1936,26 +1951,26 @@ static void r600_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a
 	struct r600_sample_mask *s = (struct r600_sample_mask*)a;
 	uint8_t mask = s->sample_mask;
 
-	radeon_set_context_reg(rctx->b.rings.gfx.cs, R_028C48_PA_SC_AA_MASK,
+	radeon_set_context_reg(rctx->b.gfx.cs, R_028C48_PA_SC_AA_MASK,
 			       mask | (mask << 8) | (mask << 16) | (mask << 24));
 }
 
 static void r600_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_cso_state *state = (struct r600_cso_state*)a;
 	struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso;
 
 	radeon_set_context_reg(cs, R_028894_SQ_PGM_START_FS, shader->offset >> 8);
 	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->buffer,
+	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->buffer,
                                                   RADEON_USAGE_READ,
                                                   RADEON_PRIO_INTERNAL_SHADER));
 }
 
 static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a;
 
 	uint32_t v2 = 0, primid = 0;
@@ -1990,7 +2005,7 @@ static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom
 
 static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a;
 	struct r600_resource *rbuffer;
 
@@ -2002,7 +2017,7 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 		rbuffer =(struct r600_resource*)state->esgs_ring.buffer;
 		radeon_set_config_reg(cs, R_008C40_SQ_ESGS_RING_BASE, 0);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READWRITE,
 						      RADEON_PRIO_RINGS_STREAMOUT));
 		radeon_set_config_reg(cs, R_008C44_SQ_ESGS_RING_SIZE,
@@ -2011,7 +2026,7 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 		rbuffer =(struct r600_resource*)state->gsvs_ring.buffer;
 		radeon_set_config_reg(cs, R_008C48_SQ_GSVS_RING_BASE, 0);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READWRITE,
 						      RADEON_PRIO_RINGS_STREAMOUT));
 		radeon_set_config_reg(cs, R_008C4C_SQ_GSVS_RING_SIZE,
@@ -2787,6 +2802,7 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 {
 	bool dual_export;
 	unsigned db_shader_control;
+	uint8_t ps_conservative_z;
 
 	if (!rctx->ps_shader) {
 		return;
@@ -2798,6 +2814,8 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 	db_shader_control = rctx->ps_shader->current->db_shader_control |
 			    S_02880C_DUAL_EXPORT_ENABLE(dual_export);
 
+	ps_conservative_z = rctx->ps_shader->current->shader.ps_conservative_z;
+
 	/* When alpha test is enabled we can't trust the hw to make the proper
 	 * decision on the order in which ztest should be run related to fragment
 	 * shader execution.
@@ -2811,8 +2829,10 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 		db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
 	}
 
-	if (db_shader_control != rctx->db_misc_state.db_shader_control) {
+	if (db_shader_control != rctx->db_misc_state.db_shader_control ||
+		ps_conservative_z != rctx->db_misc_state.ps_conservative_z) {
 		rctx->db_misc_state.db_shader_control = db_shader_control;
+		rctx->db_misc_state.ps_conservative_z = ps_conservative_z;
 		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
@@ -2845,7 +2865,7 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 				unsigned pitch,
 				unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
 	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
@@ -2918,9 +2938,9 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 		cheight = cheight > copy_height ? copy_height : cheight;
 		size = (cheight * pitch) / 4;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rsrc->resource, RADEON_USAGE_READ,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource, RADEON_USAGE_READ,
 				      RADEON_PRIO_SDMA_TEXTURE);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rdst->resource, RADEON_USAGE_WRITE,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE,
 				      RADEON_PRIO_SDMA_TEXTURE);
 		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 1, 0, size);
 		cs->buf[cs->cdw++] = base >> 8;
@@ -2954,7 +2974,7 @@ static void r600_dma_copy(struct pipe_context *ctx,
 	unsigned src_x, src_y;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (rctx->b.rings.dma.cs == NULL) {
+	if (rctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
@@ -3086,6 +3106,7 @@ void r600_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3);
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5);
+	r600_add_atom(rctx, &rctx->b.render_cond_atom, id++);
 	r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++);
 	r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++);
 	r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23);
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 178005a8574..d629194ca6e 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -71,12 +71,12 @@ void r600_init_atom(struct r600_context *rctx,
 
 void r600_emit_cso_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	r600_emit_command_buffer(rctx->b.rings.gfx.cs, ((struct r600_cso_state*)atom)->cb);
+	r600_emit_command_buffer(rctx->b.gfx.cs, ((struct r600_cso_state*)atom)->cb);
 }
 
 void r600_emit_alphatest_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_alphatest_state *a = (struct r600_alphatest_state*)atom;
 	unsigned alpha_ref = a->sx_alpha_ref;
 
@@ -211,7 +211,7 @@ static void r600_set_blend_color(struct pipe_context *ctx,
 
 void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_blend_color *state = &rctx->blend_color.state;
 
 	radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
@@ -223,7 +223,7 @@ void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom)
 
 void r600_emit_vgt_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_vgt_state *a = (struct r600_vgt_state *)atom;
 
 	radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, a->vgt_multi_prim_ib_reset_en);
@@ -257,7 +257,7 @@ static void r600_set_stencil_ref(struct pipe_context *ctx,
 
 void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_stencil_ref_state *a = (struct r600_stencil_ref_state*)atom;
 
 	radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
@@ -709,7 +709,7 @@ static void r600_set_viewport_states(struct pipe_context *ctx,
 
 void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_viewport_state *rstate = &rctx->viewport;
 	struct pipe_viewport_state *state;
 	uint32_t dirty_mask;
@@ -1460,7 +1460,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 
 void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_clip_misc_state *state = &rctx->clip_misc_state;
 
 	radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,
@@ -1477,7 +1477,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct pipe_draw_info info = *dinfo;
 	struct pipe_index_buffer ib = {};
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
 	uint64_t mask;
 
 	if (!info.indirect && !info.count && (info.indexed || !info.count_from_stream_output)) {
@@ -1490,8 +1491,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 	}
 
 	/* make sure that the gfx ring is only one active */
-	if (rctx->b.rings.dma.cs && rctx->b.rings.dma.cs->cdw) {
-		rctx->b.rings.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+	if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) {
+		rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 
 	if (!r600_update_derived_state(rctx)) {
@@ -1663,7 +1664,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 
 	/* Draw packets. */
 	if (!info.indirect) {
-		cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, rctx->b.predicate_drawing);
+		cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, 0);
 		cs->buf[cs->cdw++] = info.instance_count;
 	}
 
@@ -1675,20 +1676,20 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		rctx->vgt_state.last_draw_was_indirect = true;
 		rctx->last_start_instance = -1;
 
-		cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, rctx->b.predicate_drawing);
+		cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, 0);
 		cs->buf[cs->cdw++] = EG_DRAW_INDEX_INDIRECT_PATCH_TABLE_BASE;
 		cs->buf[cs->cdw++] = va;
 		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
 
-		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
-		cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+		cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 							   (struct r600_resource*)info.indirect,
 							   RADEON_USAGE_READ,
                                                            RADEON_PRIO_DRAW_INDIRECT);
 	}
 
 	if (info.indexed) {
-		cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, rctx->b.predicate_drawing);
+		cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, 0);
 		cs->buf[cs->cdw++] = ib.index_size == 4 ?
 					(VGT_INDEX_32 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_32_BIT : 0)) :
 					(VGT_INDEX_16 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_16_BIT : 0));
@@ -1696,7 +1697,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		if (ib.user_buffer) {
 			unsigned size_bytes = info.count*ib.index_size;
 			unsigned size_dw = align(size_bytes, 4) / 4;
-			cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, rctx->b.predicate_drawing);
+			cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, render_cond_bit);
 			cs->buf[cs->cdw++] = info.count;
 			cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_IMMEDIATE;
 			memcpy(cs->buf+cs->cdw, ib.user_buffer, size_bytes);
@@ -1705,13 +1706,13 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 			uint64_t va = r600_resource(ib.buffer)->gpu_address + ib.offset;
 
 			if (likely(!info.indirect)) {
-				cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, rctx->b.predicate_drawing);
+				cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, render_cond_bit);
 				cs->buf[cs->cdw++] = va;
 				cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
 				cs->buf[cs->cdw++] = info.count;
 				cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
-				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
-				cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+				cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 									   (struct r600_resource*)ib.buffer,
 									   RADEON_USAGE_READ,
                                                                            RADEON_PRIO_INDEX_BUFFER);
@@ -1719,20 +1720,20 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 			else {
 				uint32_t max_size = (ib.buffer->width0 - ib.offset) / ib.index_size;
 
-				cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, rctx->b.predicate_drawing);
+				cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, 0);
 				cs->buf[cs->cdw++] = va;
 				cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
 
-				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
-				cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+				cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 									   (struct r600_resource*)ib.buffer,
 									   RADEON_USAGE_READ,
                                                                            RADEON_PRIO_INDEX_BUFFER);
 
-				cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, rctx->b.predicate_drawing);
+				cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, 0);
 				cs->buf[cs->cdw++] = max_size;
 
-				cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, rctx->b.predicate_drawing);
+				cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, render_cond_bit);
 				cs->buf[cs->cdw++] = info.indirect_offset;
 				cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
 			}
@@ -1752,17 +1753,17 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 			cs->buf[cs->cdw++] = 0; /* unused */
 
 			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
-			cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+			cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 								   t->buf_filled_size, RADEON_USAGE_READ,
 								   RADEON_PRIO_SO_FILLED_SIZE);
 		}
 
 		if (likely(!info.indirect)) {
-			cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, rctx->b.predicate_drawing);
+			cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit);
 			cs->buf[cs->cdw++] = info.count;
 		}
 		else {
-			cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, rctx->b.predicate_drawing);
+			cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, render_cond_bit);
 			cs->buf[cs->cdw++] = info.indirect_offset;
 		}
 		cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_AUTO_INDEX |
@@ -1938,7 +1939,7 @@ bool sampler_state_needs_border_color(const struct pipe_sampler_state *state)
 void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a)
 {
 
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_pipe_shader *shader = ((struct r600_shader_state*)a)->shader;
 
 	if (!shader)
@@ -1946,7 +1947,7 @@ void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a)
 
 	r600_emit_command_buffer(cs, &shader->command_buffer);
 	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->bo,
+	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->bo,
 					      RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER));
 }
 
@@ -2669,12 +2670,12 @@ void r600_init_common_state_functions(struct r600_context *rctx)
 void r600_trace_emit(struct r600_context *rctx)
 {
 	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint64_t va;
 	uint32_t reloc;
 
 	va = rscreen->b.trace_bo->gpu_address;
-	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rscreen->b.trace_bo,
+	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rscreen->b.trace_bo,
 				      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 	radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
 	radeon_emit(cs, va & 0xFFFFFFFFUL);
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 6bba88cb574..53f5ad6db6a 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -781,6 +781,14 @@
 #define   S_028D0C_COPY_CENTROID(x)                    (((x) & 0x1) << 7)
 #define   S_028D0C_COPY_SAMPLE(x)                      (((x) & 0x1) << 8)
 #define   S_028D0C_R700_PERFECT_ZPASS_COUNTS(x)        (((x) & 0x1) << 15)
+#define   S_028D0C_CONSERVATIVE_Z_EXPORT(x)            (((x) & 0x03) << 13)
+#define   G_028D0C_CONSERVATIVE_Z_EXPORT(x)            (((x) >> 13) & 0x03)
+#define   C_028D0C_CONSERVATIVE_Z_EXPORT               0xFFFF9FFF
+#define     V_028D0C_EXPORT_ANY_Z                      0
+#define     V_028D0C_EXPORT_LESS_THAN_Z                1
+#define     V_028D0C_EXPORT_GREATER_THAN_Z             2
+#define     V_028D0C_EXPORT_RESERVED                   3
+
 #define R_028D10_DB_RENDER_OVERRIDE                  0x028D10
 #define   V_028D10_FORCE_OFF                         0
 #define   V_028D10_FORCE_ENABLE                      1
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 0dc6c918331..c294e516408 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -34,11 +34,11 @@ boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
 					struct radeon_winsys_cs_handle *buf,
 					enum radeon_bo_usage usage)
 {
-	if (ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, buf, usage)) {
+	if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) {
 		return TRUE;
 	}
-	if (ctx->rings.dma.cs && ctx->rings.dma.cs->cdw &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, buf, usage)) {
+	if (ctx->dma.cs && ctx->dma.cs->cdw &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) {
 		return TRUE;
 	}
 	return FALSE;
@@ -60,26 +60,26 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 		rusage = RADEON_USAGE_WRITE;
 	}
 
-	if (ctx->rings.gfx.cs->cdw != ctx->initial_gfx_cs_size &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs,
+	if (ctx->gfx.cs->cdw != ctx->initial_gfx_cs_size &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
 					     resource->cs_buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
-			ctx->rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+			ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
 		} else {
-			ctx->rings.gfx.flush(ctx, 0, NULL);
+			ctx->gfx.flush(ctx, 0, NULL);
 			busy = true;
 		}
 	}
-	if (ctx->rings.dma.cs &&
-	    ctx->rings.dma.cs->cdw &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs,
+	if (ctx->dma.cs &&
+	    ctx->dma.cs->cdw &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
 					     resource->cs_buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
-			ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+			ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
 		} else {
-			ctx->rings.dma.flush(ctx, 0, NULL);
+			ctx->dma.flush(ctx, 0, NULL);
 			busy = true;
 		}
 	}
@@ -90,9 +90,9 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 		} else {
 			/* We will be wait for the GPU. Wait for any offloaded
 			 * CS flush to complete to avoid busy-waiting in the winsys. */
-			ctx->ws->cs_sync_flush(ctx->rings.gfx.cs);
-			if (ctx->rings.dma.cs)
-				ctx->ws->cs_sync_flush(ctx->rings.dma.cs);
+			ctx->ws->cs_sync_flush(ctx->gfx.cs);
+			if (ctx->dma.cs)
+				ctx->ws->cs_sync_flush(ctx->dma.cs);
 		}
 	}
 
@@ -240,7 +240,7 @@ static bool r600_can_dma_copy_buffer(struct r600_common_context *rctx,
 	bool dword_aligned = !(dstx % 4) && !(srcx % 4) && !(size % 4);
 
 	return rctx->screen->has_cp_dma ||
-	       (dword_aligned && (rctx->rings.dma.cs ||
+	       (dword_aligned && (rctx->dma.cs ||
 				  rctx->screen->has_streamout));
 
 }
diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h
index b5a1dafb273..ad067ce4e76 100644
--- a/src/gallium/drivers/radeon/r600_cs.h
+++ b/src/gallium/drivers/radeon/r600_cs.h
@@ -50,21 +50,6 @@ static inline unsigned radeon_add_to_buffer_list(struct r600_common_context *rct
 						 enum radeon_bo_priority priority)
 {
 	assert(usage);
-
-	/* Make sure that all previous rings are flushed so that everything
-	 * looks serialized from the driver point of view.
-	 */
-	if (!ring->flushing) {
-		if (ring == &rctx->rings.gfx) {
-			if (rctx->rings.dma.cs) {
-				/* flush dma ring */
-				rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
-			}
-		} else {
-			/* flush gfx ring */
-			rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
-		}
-	}
 	return rctx->ws->cs_add_buffer(ring->cs, rbo->cs_buf, usage,
 				      rbo->domains, priority) * 4;
 }
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 0ad36849645..3599692a857 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -31,6 +31,7 @@
 #include "util/u_memory.h"
 #include "util/u_format_s3tc.h"
 #include "util/u_upload_mgr.h"
+#include "os/os_time.h"
 #include "vl/vl_decoder.h"
 #include "vl/vl_video_buffer.h"
 #include "radeon/radeon_video.h"
@@ -40,6 +41,12 @@
 #define HAVE_LLVM 0
 #endif
 
+struct r600_multi_fence {
+	struct pipe_reference reference;
+	struct pipe_fence_handle *gfx;
+	struct pipe_fence_handle *sdma;
+};
+
 /*
  * pipe_context
  */
@@ -110,10 +117,14 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw)
 {
+	/* Flush the GFX IB if it's not empty. */
+	if (ctx->gfx.cs->cdw > ctx->initial_gfx_cs_size)
+		ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+
 	/* Flush if there's not enough space. */
-	if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) {
-		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
-		assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw);
+	if ((num_dw + ctx->dma.cs->cdw) > ctx->dma.cs->max_dw) {
+		ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		assert((num_dw + ctx->dma.cs->cdw) <= ctx->dma.cs->max_dw);
 	}
 }
 
@@ -123,17 +134,6 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
 void r600_preflush_suspend_features(struct r600_common_context *ctx)
 {
-	/* Disable render condition. */
-	ctx->saved_render_cond = NULL;
-	ctx->saved_render_cond_cond = FALSE;
-	ctx->saved_render_cond_mode = 0;
-	if (ctx->current_render_cond) {
-		ctx->saved_render_cond = ctx->current_render_cond;
-		ctx->saved_render_cond_cond = ctx->current_render_cond_cond;
-		ctx->saved_render_cond_mode = ctx->current_render_cond_mode;
-		ctx->b.render_condition(&ctx->b, NULL, FALSE, 0);
-	}
-
 	/* suspend queries */
 	ctx->queries_suspended_for_flush = false;
 	if (ctx->num_cs_dw_nontimer_queries_suspend) {
@@ -161,44 +161,52 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
 		r600_resume_nontimer_queries(ctx);
 		r600_resume_timer_queries(ctx);
 	}
-
-	/* Re-enable render condition. */
-	if (ctx->saved_render_cond) {
-		ctx->b.render_condition(&ctx->b, ctx->saved_render_cond,
-					  ctx->saved_render_cond_cond,
-					  ctx->saved_render_cond_mode);
-	}
 }
 
 static void r600_flush_from_st(struct pipe_context *ctx,
 			       struct pipe_fence_handle **fence,
 			       unsigned flags)
 {
+	struct pipe_screen *screen = ctx->screen;
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 	unsigned rflags = 0;
+	struct pipe_fence_handle *gfx_fence = NULL;
+	struct pipe_fence_handle *sdma_fence = NULL;
 
 	if (flags & PIPE_FLUSH_END_OF_FRAME)
 		rflags |= RADEON_FLUSH_END_OF_FRAME;
 
-	if (rctx->rings.dma.cs) {
-		rctx->rings.dma.flush(rctx, rflags, NULL);
+	if (rctx->dma.cs) {
+		rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
+	}
+	rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
+
+	/* Both engines can signal out of order, so we need to keep both fences. */
+	if (gfx_fence || sdma_fence) {
+		struct r600_multi_fence *multi_fence =
+			CALLOC_STRUCT(r600_multi_fence);
+		if (!multi_fence)
+			return;
+
+		multi_fence->reference.count = 1;
+		multi_fence->gfx = gfx_fence;
+		multi_fence->sdma = sdma_fence;
+
+		screen->fence_reference(screen, fence, NULL);
+		*fence = (struct pipe_fence_handle*)multi_fence;
 	}
-	rctx->rings.gfx.flush(rctx, rflags, fence);
 }
 
 static void r600_flush_dma_ring(void *ctx, unsigned flags,
 				struct pipe_fence_handle **fence)
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
-	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->dma.cs;
 
-	if (!cs->cdw) {
-		return;
-	}
-
-	rctx->rings.dma.flushing = true;
-	rctx->ws->cs_flush(cs, flags, fence, 0);
-	rctx->rings.dma.flushing = false;
+	if (cs->cdw)
+		rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence, 0);
+	if (fence)
+		rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 }
 
 static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
@@ -270,10 +278,10 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 		return false;
 
 	if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
-		rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
-							 r600_flush_dma_ring,
-							 rctx, NULL);
-		rctx->rings.dma.flush = r600_flush_dma_ring;
+		rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
+						   r600_flush_dma_ring,
+						   rctx, NULL);
+		rctx->dma.flush = r600_flush_dma_ring;
 	}
 
 	return true;
@@ -281,10 +289,10 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 
 void r600_common_context_cleanup(struct r600_common_context *rctx)
 {
-	if (rctx->rings.gfx.cs)
-		rctx->ws->cs_destroy(rctx->rings.gfx.cs);
-	if (rctx->rings.dma.cs)
-		rctx->ws->cs_destroy(rctx->rings.dma.cs);
+	if (rctx->gfx.cs)
+		rctx->ws->cs_destroy(rctx->gfx.cs);
+	if (rctx->dma.cs)
+		rctx->ws->cs_destroy(rctx->dma.cs);
 	if (rctx->ctx)
 		rctx->ws->ctx_destroy(rctx->ctx);
 
@@ -297,6 +305,7 @@ void r600_common_context_cleanup(struct r600_common_context *rctx)
 	if (rctx->allocator_so_filled_size) {
 		u_suballocator_destroy(rctx->allocator_so_filled_size);
 	}
+	rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
 }
 
 void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r)
@@ -754,12 +763,19 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 }
 
 static void r600_fence_reference(struct pipe_screen *screen,
-				 struct pipe_fence_handle **ptr,
-				 struct pipe_fence_handle *fence)
+				 struct pipe_fence_handle **dst,
+				 struct pipe_fence_handle *src)
 {
-	struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
-
-	rws->fence_reference(ptr, fence);
+	struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
+	struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst;
+	struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src;
+
+	if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
+		ws->fence_reference(&(*rdst)->gfx, NULL);
+		ws->fence_reference(&(*rdst)->sdma, NULL);
+		FREE(*rdst);
+	}
+        *rdst = rsrc;
 }
 
 static boolean r600_fence_finish(struct pipe_screen *screen,
@@ -767,8 +783,24 @@ static boolean r600_fence_finish(struct pipe_screen *screen,
 				 uint64_t timeout)
 {
 	struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
+	struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
+	int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
+
+	if (rfence->sdma) {
+		if (!rws->fence_wait(rws, rfence->sdma, timeout))
+			return false;
+
+		/* Recompute the timeout after waiting. */
+		if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+			int64_t time = os_time_get_nano();
+			timeout = abs_timeout > time ? abs_timeout - time : 0;
+		}
+	}
+
+	if (!rfence->gfx)
+		return true;
 
-	return rws->fence_wait(rws, fence, timeout);
+	return rws->fence_wait(rws, rfence->gfx, timeout);
 }
 
 static bool r600_interpret_tiling(struct r600_common_screen *rscreen,
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index c300c0b3332..ebe633b9125 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -365,16 +365,10 @@ struct r600_streamout {
 
 struct r600_ring {
 	struct radeon_winsys_cs		*cs;
-	bool				flushing;
 	void (*flush)(void *ctx, unsigned flags,
 		      struct pipe_fence_handle **fence);
 };
 
-struct r600_rings {
-	struct r600_ring		gfx;
-	struct r600_ring		dma;
-};
-
 struct r600_common_context {
 	struct pipe_context b; /* base class */
 
@@ -383,7 +377,9 @@ struct r600_common_context {
 	struct radeon_winsys_ctx	*ctx;
 	enum radeon_family		family;
 	enum chip_class			chip_class;
-	struct r600_rings		rings;
+	struct r600_ring		gfx;
+	struct r600_ring		dma;
+	struct pipe_fence_handle	*last_sdma_fence;
 	unsigned			initial_gfx_cs_size;
 	unsigned			gpu_reset_counter;
 
@@ -421,14 +417,11 @@ struct r600_common_context {
 	unsigned			num_draw_calls;
 
 	/* Render condition. */
-	struct pipe_query		*current_render_cond;
-	unsigned			current_render_cond_mode;
-	boolean				current_render_cond_cond;
-	boolean				predicate_drawing;
-	/* For context flushing. */
-	struct pipe_query		*saved_render_cond;
-	boolean				saved_render_cond_cond;
-	unsigned			saved_render_cond_mode;
+	struct r600_atom		render_cond_atom;
+	struct pipe_query		*render_cond;
+	unsigned			render_cond_mode;
+	boolean				render_cond_invert;
+	bool				render_cond_force_off; /* for u_blitter */
 
 	/* MSAA sample locations.
 	 * The first index is the sample index.
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 9a5402583f4..8c2b601a96c 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -172,7 +172,7 @@ static unsigned event_type_for_stream(struct r600_query *query)
 
 static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query)
 {
-	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
 	uint64_t va;
 
 	r600_update_occlusion_query_state(ctx, query->type, 1);
@@ -225,7 +225,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 	default:
 		assert(0);
 	}
-	r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE,
+	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
 			RADEON_PRIO_QUERY);
 
 	if (r600_is_timer_query(query->type))
@@ -236,7 +236,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 
 static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query)
 {
-	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
 	uint64_t va;
 
 	/* The queries which need begin already called this in begin_query. */
@@ -287,7 +287,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 	default:
 		assert(0);
 	}
-	r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE,
+	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
 			RADEON_PRIO_QUERY);
 
 	query->buffer.results_end += query->result_size;
@@ -303,53 +303,60 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 	r600_update_prims_generated_query_state(ctx, query->type, -1);
 }
 
-static void r600_emit_query_predication(struct r600_common_context *ctx, struct r600_query *query,
-					int operation, bool flag_wait)
+static void r600_emit_query_predication(struct r600_common_context *ctx,
+					struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
-	uint32_t op = PRED_OP(operation);
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	struct r600_query *query = (struct r600_query*)ctx->render_cond;
+	struct r600_query_buffer *qbuf;
+	uint32_t op;
+	bool flag_wait;
+
+	if (!query)
+		return;
+
+	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+
+	switch (query->type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+		op = PRED_OP(PREDICATION_OP_ZPASS);
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
+		break;
+	default:
+		assert(0);
+		return;
+	}
 
 	/* if true then invert, see GL_ARB_conditional_render_inverted */
-	if (ctx->current_render_cond_cond)
+	if (ctx->render_cond_invert)
 		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */
 	else
 		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */
 
-	if (operation == PREDICATION_OP_CLEAR) {
-		ctx->need_gfx_cs_space(&ctx->b, 3, FALSE);
-
-		radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
-		radeon_emit(cs, 0);
-		radeon_emit(cs, PRED_OP(PREDICATION_OP_CLEAR));
-	} else {
-		struct r600_query_buffer *qbuf;
-		unsigned count;
-		/* Find how many results there are. */
-		count = 0;
-		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
-			count += qbuf->results_end / query->result_size;
-		}
-	
-		ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE);
-	
-		op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
-	
-		/* emit predicate packets for all data blocks */
-		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
-			unsigned results_base = 0;
-			uint64_t va = qbuf->buf->gpu_address;
-	
-			while (results_base < qbuf->results_end) {
-				radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
-				radeon_emit(cs, va + results_base);
-				radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
-				r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ,
-						RADEON_PRIO_QUERY);
-				results_base += query->result_size;
+	op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
 	
-				/* set CONTINUE bit for all packets except the first */
-				op |= PREDICATION_CONTINUE;
-			}
+	/* emit predicate packets for all data blocks */
+	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+		unsigned results_base = 0;
+		uint64_t va = qbuf->buf->gpu_address;
+
+		while (results_base < qbuf->results_end) {
+			radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+			radeon_emit(cs, va + results_base);
+			radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
+			r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ,
+					RADEON_PRIO_QUERY);
+			results_base += query->result_size;
+
+			/* set CONTINUE bit for all packets except the first */
+			op |= PREDICATION_CONTINUE;
 		}
 	}
 }
@@ -532,7 +539,7 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
 		return;
 	case PIPE_QUERY_GPU_FINISHED:
-		rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, &rquery->fence);
+		ctx->flush(ctx, &rquery->fence, 0);
 		return;
 	case R600_QUERY_DRAW_CALLS:
 		rquery->end_result = rctx->num_draw_calls;
@@ -820,42 +827,20 @@ static void r600_render_condition(struct pipe_context *ctx,
 				  uint mode)
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
-	struct r600_query *rquery = (struct r600_query *)query;
-	bool wait_flag = false;
-
-	rctx->current_render_cond = query;
-	rctx->current_render_cond_cond = condition;
-	rctx->current_render_cond_mode = mode;
-
-	if (query == NULL) {
-		if (rctx->predicate_drawing) {
-			rctx->predicate_drawing = false;
-			r600_emit_query_predication(rctx, NULL, PREDICATION_OP_CLEAR, false);
-		}
-		return;
-	}
+	struct r600_query *rquery = (struct r600_query*)query;
+	struct r600_query_buffer *qbuf;
+	struct r600_atom *atom = &rctx->render_cond_atom;
 
-	if (mode == PIPE_RENDER_COND_WAIT ||
-	    mode == PIPE_RENDER_COND_BY_REGION_WAIT) {
-		wait_flag = true;
-	}
+	rctx->render_cond = query;
+	rctx->render_cond_invert = condition;
+	rctx->render_cond_mode = mode;
 
-	rctx->predicate_drawing = true;
+	/* Compute the size of SET_PREDICATION packets. */
+	atom->num_dw = 0;
+	for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
+		atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
 
-	switch (rquery->type) {
-	case PIPE_QUERY_OCCLUSION_COUNTER:
-	case PIPE_QUERY_OCCLUSION_PREDICATE:
-		r600_emit_query_predication(rctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
-		break;
-	case PIPE_QUERY_PRIMITIVES_EMITTED:
-	case PIPE_QUERY_PRIMITIVES_GENERATED:
-	case PIPE_QUERY_SO_STATISTICS:
-	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-		r600_emit_query_predication(rctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag);
-		break;
-	default:
-		assert(0);
-	}
+	rctx->set_atom_dirty(rctx, atom, query != NULL);
 }
 
 static void r600_suspend_queries(struct r600_common_context *ctx,
@@ -939,7 +924,7 @@ void r600_resume_timer_queries(struct r600_common_context *ctx)
 /* Get backends mask */
 void r600_query_init_backend_mask(struct r600_common_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
 	struct r600_resource *buffer;
 	uint32_t *results;
 	unsigned num_backends = ctx->screen->info.r600_num_backends;
@@ -990,7 +975,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 		radeon_emit(cs, buffer->gpu_address);
 		radeon_emit(cs, buffer->gpu_address >> 32);
 
-		r600_emit_reloc(ctx, &ctx->rings.gfx, buffer,
+		r600_emit_reloc(ctx, &ctx->gfx, buffer,
                                 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 
 		/* analyze results */
@@ -1024,6 +1009,7 @@ void r600_query_init(struct r600_common_context *rctx)
 	rctx->b.begin_query = r600_begin_query;
 	rctx->b.end_query = r600_end_query;
 	rctx->b.get_query_result = r600_get_query_result;
+	rctx->render_cond_atom.emit = r600_emit_query_predication;
 
 	if (((struct r600_common_screen*)rctx->b.screen)->info.r600_num_backends > 0)
 	    rctx->b.render_condition = r600_render_condition;
diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c
index 33403b572af..e977ed9fa10 100644
--- a/src/gallium/drivers/radeon/r600_streamout.c
+++ b/src/gallium/drivers/radeon/r600_streamout.c
@@ -152,7 +152,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx,
 
 static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
 	unsigned reg_strmout_cntl;
 
 	/* The register is at different places on different ASICs. */
@@ -184,7 +184,7 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
 
 static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
 	struct r600_so_target **t = rctx->streamout.targets;
 	unsigned *stride_in_dw = rctx->streamout.stride_in_dw;
 	unsigned i, update_flags = 0;
@@ -216,7 +216,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 			radeon_emit(cs, stride_in_dw[i]);		/* VTX_STRIDE (in DW) */
 			radeon_emit(cs, va >> 8);			/* BUFFER_BASE */
 
-			r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer),
+			r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer),
 					RADEON_USAGE_WRITE, RADEON_PRIO_RINGS_STREAMOUT);
 
 			/* R7xx requires this packet after updating BUFFER_BASE.
@@ -226,7 +226,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 				radeon_emit(cs, i);
 				radeon_emit(cs, va >> 8);
 
-				r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer),
+				r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer),
 						RADEON_USAGE_WRITE, RADEON_PRIO_RINGS_STREAMOUT);
 			}
 		}
@@ -244,7 +244,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 			radeon_emit(cs, va); /* src address lo */
 			radeon_emit(cs, va >> 32); /* src address hi */
 
-			r600_emit_reloc(rctx,  &rctx->rings.gfx, t[i]->buf_filled_size,
+			r600_emit_reloc(rctx,  &rctx->gfx, t[i]->buf_filled_size,
 					RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE);
 		} else {
 			/* Start from the beginning. */
@@ -267,7 +267,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 
 void r600_emit_streamout_end(struct r600_common_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
 	struct r600_so_target **t = rctx->streamout.targets;
 	unsigned i;
 	uint64_t va;
@@ -288,7 +288,7 @@ void r600_emit_streamout_end(struct r600_common_context *rctx)
 		radeon_emit(cs, 0); /* unused */
 		radeon_emit(cs, 0); /* unused */
 
-		r600_emit_reloc(rctx,  &rctx->rings.gfx, t[i]->buf_filled_size,
+		r600_emit_reloc(rctx,  &rctx->gfx, t[i]->buf_filled_size,
 				RADEON_USAGE_WRITE, RADEON_PRIO_SO_FILLED_SIZE);
 
 		/* Zero the buffer size. The counters (primitives generated,
@@ -336,8 +336,8 @@ static void r600_emit_streamout_enable(struct r600_common_context *rctx,
 			S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) |
 			S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx));
 	}
-	radeon_set_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val);
-	radeon_set_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val);
+	radeon_set_context_reg(rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val);
+	radeon_set_context_reg(rctx->gfx.cs, strmout_config_reg, strmout_config_val);
 }
 
 static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable)
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index edfdfe33187..3126cce8c22 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -1324,7 +1324,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 {
 	int i;
 
-	if (rctx->current_render_cond)
+	if (rctx->render_cond)
 		return;
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 33b01361aa5..0c643e5cd59 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -947,6 +947,12 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	dec->msg->body.decode.width_in_samples = dec->base.width;
 	dec->msg->body.decode.height_in_samples = dec->base.height;
 
+	if ((picture->profile == PIPE_VIDEO_PROFILE_VC1_SIMPLE) ||
+	    (picture->profile == PIPE_VIDEO_PROFILE_VC1_MAIN)) {
+		dec->msg->body.decode.width_in_samples = align(dec->msg->body.decode.width_in_samples, 16) / 16;
+		dec->msg->body.decode.height_in_samples = align(dec->msg->body.decode.height_in_samples, 16) / 16;
+	}
+
 	dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size;
 	dec->msg->body.decode.bsd_size = bs_size;
 	dec->msg->body.decode.db_pitch = dec->base.width;
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 32bfc32073b..f56c6cf6cb4 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -244,8 +244,7 @@ int rvid_get_video_param(struct pipe_screen *screen,
 				return codec != PIPE_VIDEO_FORMAT_MPEG4;
 			return true;
 		case PIPE_VIDEO_FORMAT_VC1:
-			/* FIXME: VC-1 simple/main profile is broken */
-			return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED;
+			return true;
 		case PIPE_VIDEO_FORMAT_HEVC:
 			/* Carrizo only supports HEVC Main */
 			return rscreen->family >= CHIP_CARRIZO &&
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index e53af1dd6b5..2de237b4716 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -50,7 +50,7 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
 				    uint64_t src_offset,
 				    uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -61,9 +61,9 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
 	ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE;
 	r600_need_dma_space(&ctx->b, ncopy * 7);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
 			      RADEON_PRIO_SDMA_BUFFER);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
 			      RADEON_PRIO_SDMA_BUFFER);
 
 	for (i = 0; i < ncopy; i++) {
@@ -112,7 +112,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
 			       unsigned pitch,
 			       unsigned bpe)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	struct si_screen *sscreen = ctx->screen;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -171,9 +171,9 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
 	ncopy = (copy_height + cheight - 1) / cheight;
 	r600_need_dma_space(&ctx->b, ncopy * 12);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
 			      RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
 			      RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 
 	copy_height = size * 4 / pitch;
@@ -224,7 +224,7 @@ void cik_sdma_copy(struct pipe_context *ctx,
 	unsigned copy_height, y_align;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (sctx->b.rings.dma.cs == NULL) {
+	if (sctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index fce014a1e6b..13d8e6f2a5f 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -29,20 +29,23 @@ enum si_blitter_op /* bitmask */
 {
 	SI_SAVE_TEXTURES      = 1,
 	SI_SAVE_FRAMEBUFFER   = 2,
-	SI_DISABLE_RENDER_COND = 4,
+	SI_SAVE_FRAGMENT_STATE = 4,
+	SI_DISABLE_RENDER_COND = 8,
 
-	SI_CLEAR         = 0,
+	SI_CLEAR         = SI_SAVE_FRAGMENT_STATE,
 
-	SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER,
+	SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
 
 	SI_COPY          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
-			   SI_DISABLE_RENDER_COND,
+			   SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
 
-	SI_BLIT          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES,
+	SI_BLIT          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
+			   SI_SAVE_FRAGMENT_STATE,
 
-	SI_DECOMPRESS    = SI_SAVE_FRAMEBUFFER | SI_DISABLE_RENDER_COND,
+	SI_DECOMPRESS    = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE |
+			   SI_DISABLE_RENDER_COND,
 
-	SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER
+	SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE
 };
 
 static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
@@ -51,22 +54,25 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 
 	r600_suspend_nontimer_queries(&sctx->b);
 
-	util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
-	util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
-	util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
-	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
-	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
-	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
+	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
+	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
 	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
 	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
-	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
-	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
-	util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
-	util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
-	util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
-	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
 	util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets,
 				     (struct pipe_stream_output_target**)sctx->b.streamout.targets);
+	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
+
+	if (op & SI_SAVE_FRAGMENT_STATE) {
+		util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
+		util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
+		util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
+		util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
+		util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
+		util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
+		util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
+	}
 
 	if (op & SI_SAVE_FRAMEBUFFER)
 		util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state);
@@ -80,17 +86,15 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 			sctx->samplers[PIPE_SHADER_FRAGMENT].views.views);
 	}
 
-	if ((op & SI_DISABLE_RENDER_COND) && sctx->b.current_render_cond) {
-		util_blitter_save_render_condition(sctx->blitter,
-                                                   sctx->b.current_render_cond,
-                                                   sctx->b.current_render_cond_cond,
-                                                   sctx->b.current_render_cond_mode);
-	}
+	if (op & SI_DISABLE_RENDER_COND)
+		sctx->b.render_cond_force_off = true;
 }
 
 static void si_blitter_end(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+
+	sctx->b.render_cond_force_off = false;
 	r600_resume_nontimer_queries(&sctx->b);
 }
 
@@ -731,9 +735,69 @@ static void si_flush_resource(struct pipe_context *ctx,
 	}
 }
 
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 unsigned offset, unsigned size,
+				 const void *clear_value_ptr,
+				 int clear_value_size)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	uint32_t dword_value;
+	unsigned i;
+
+	assert(offset % clear_value_size == 0);
+	assert(size % clear_value_size == 0);
+
+	if (clear_value_size > 4) {
+		const uint32_t *u32 = clear_value_ptr;
+		bool clear_dword_duplicated = true;
+
+		/* See if we can lower large fills to dword fills. */
+		for (i = 1; i < clear_value_size / 4; i++)
+			if (u32[0] != u32[i]) {
+				clear_dword_duplicated = false;
+				break;
+			}
+
+		if (!clear_dword_duplicated) {
+			/* Use transform feedback for 64-bit, 96-bit, and
+			 * 128-bit fills.
+			 */
+			union pipe_color_union clear_value;
+
+			memcpy(&clear_value, clear_value_ptr, clear_value_size);
+			si_blitter_begin(ctx, SI_DISABLE_RENDER_COND);
+			util_blitter_clear_buffer(sctx->blitter, dst, offset,
+						  size, clear_value_size / 4,
+						  &clear_value);
+			si_blitter_end(ctx);
+			return;
+		}
+	}
+
+	/* Expand the clear value to a dword. */
+	switch (clear_value_size) {
+	case 1:
+		dword_value = *(uint8_t*)clear_value_ptr;
+		dword_value |= (dword_value << 8) |
+			       (dword_value << 16) |
+			       (dword_value << 24);
+		break;
+	case 2:
+		dword_value = *(uint16_t*)clear_value_ptr;
+		dword_value |= dword_value << 16;
+		break;
+	default:
+		dword_value = *(uint32_t*)clear_value_ptr;
+	}
+
+	sctx->b.clear_buffer(ctx, dst, offset, size, dword_value, false);
+}
+
 void si_init_blit_functions(struct si_context *sctx)
 {
 	sctx->b.b.clear = si_clear;
+	sctx->b.b.clear_buffer = si_pipe_clear_buffer;
 	sctx->b.b.clear_render_target = si_clear_render_target;
 	sctx->b.b.clear_depth_stencil = si_clear_depth_stencil;
 	sctx->b.b.resource_copy_region = si_resource_copy_region;
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 697e60a50d9..2d551dd0e6b 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -227,7 +227,7 @@ static void si_launch_grid(
 		uint32_t pc, const void *input)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_compute *program = sctx->cs_shader_state.program;
 	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 	struct r600_resource *input_buffer = program->input_buffer;
@@ -253,10 +253,10 @@ static void si_launch_grid(
 	radeon_emit(cs, 0x80000000);
 	radeon_emit(cs, 0x80000000);
 
-	sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
 			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_KCACHE |
+			 SI_CONTEXT_INV_SMEM_L1 |
 			 SI_CONTEXT_FLUSH_WITH_INV_L2 |
 			 SI_CONTEXT_FLAG_COMPUTE;
 	si_emit_cache_flush(sctx, NULL);
@@ -274,7 +274,7 @@ static void si_launch_grid(
 	kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */;
 
 	kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf,
-			sctx->b.rings.gfx.cs, PIPE_TRANSFER_WRITE);
+			sctx->b.gfx.cs, PIPE_TRANSFER_WRITE);
 	for (i = 0; i < 3; i++) {
 		kernel_args[i] = grid_layout[i];
 		kernel_args[i + 3] = grid_layout[i] * block_layout[i];
@@ -294,7 +294,7 @@ static void si_launch_grid(
 			    shader->scratch_bytes_per_wave *
 			    num_waves_for_scratch);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					  shader->scratch_bo,
 					  RADEON_USAGE_READWRITE,
 					  RADEON_PRIO_SCRATCH_BUFFER);
@@ -310,7 +310,7 @@ static void si_launch_grid(
 	kernel_args_va = input_buffer->gpu_address;
 	kernel_args_va += kernel_args_offset;
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, input_buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
 				  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
 
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0, kernel_args_va);
@@ -338,7 +338,7 @@ static void si_launch_grid(
 		if (!buffer) {
 			continue;
 		}
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, buffer,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
 					  RADEON_USAGE_READWRITE,
 					  RADEON_PRIO_COMPUTE_GLOBAL);
 	}
@@ -361,7 +361,7 @@ static void si_launch_grid(
 #if HAVE_LLVM >= 0x0306
 	shader_va += pc;
 #endif
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, shader->bo,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
 				  RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
@@ -449,10 +449,10 @@ static void si_launch_grid(
 	si_pm4_free_state(sctx, pm4, ~0);
 
 	sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-			 SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+			 SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
 			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_KCACHE |
+			 SI_CONTEXT_INV_SMEM_L1 |
 			 SI_CONTEXT_FLAG_COMPUTE;
 	si_emit_cache_flush(sctx, NULL);
 }
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index d4bd7b28cf3..0bf85a04db7 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -46,8 +46,9 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
 				       uint64_t dst_va, uint64_t src_va,
 				       unsigned size, unsigned flags)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
 	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
 	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
 			   S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
@@ -63,14 +64,14 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
 		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
 		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	} else {
 		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
 		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
 		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
 		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	}
 }
 
@@ -79,8 +80,9 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
 					uint64_t dst_va, unsigned size,
 					uint32_t clear_value, unsigned flags)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
 	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
 	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0;
 
@@ -94,26 +96,74 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
 		radeon_emit(cs, 0);
 		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	} else {
 		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
 		radeon_emit(cs, clear_value);		/* DATA [31:0] */
 		radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
 		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	}
 }
 
+static unsigned get_flush_flags(struct si_context *sctx, bool is_framebuffer)
+{
+	if (is_framebuffer)
+		return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+
+	return SI_CONTEXT_INV_SMEM_L1 |
+	       SI_CONTEXT_INV_VMEM_L1 |
+	       (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+}
+
+static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
+{
+	return is_framebuffer || sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+}
+
+static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
+			      struct pipe_resource *src, unsigned byte_count,
+			      unsigned remaining_size, unsigned *flags)
+{
+	si_need_cs_space(sctx);
+
+	/* This must be done after need_cs_space. */
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+				  (struct r600_resource*)dst,
+				  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+	if (src)
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  (struct r600_resource*)src,
+					  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
+
+	/* Flush the caches for the first copy only.
+	 * Also wait for the previous CP DMA operations.
+	 */
+	if (sctx->b.flags) {
+		si_emit_cache_flush(sctx, NULL);
+		*flags |= SI_CP_DMA_RAW_WAIT;
+	}
+
+	/* Do the synchronization after the last dma, so that all data
+	 * is written to memory.
+	 */
+	if (byte_count == remaining_size)
+		*flags |= R600_CP_DMA_SYNC;
+}
+
+/* Alignment for optimal performance. */
+#define CP_DMA_ALIGNMENT	32
 /* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
+#define CP_DMA_MAX_BYTE_COUNT	((1 << 21) - CP_DMA_ALIGNMENT)
 
 static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 			    unsigned offset, unsigned size, unsigned value,
 			    bool is_framebuffer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	unsigned flush_flags, tc_l2_flag;
+	unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+	unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
 
 	if (!size)
 		return;
@@ -126,52 +176,27 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 
 	/* Fallback for unaligned clears. */
 	if (offset % 4 != 0 || size % 4 != 0) {
-		uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
-						       sctx->b.rings.gfx.cs,
-						       PIPE_TRANSFER_WRITE);
-		size /= 4;
-		for (unsigned i = 0; i < size; i++)
-			*map++ = value;
+		uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
+						      sctx->b.gfx.cs,
+						      PIPE_TRANSFER_WRITE);
+		map += offset;
+		for (unsigned i = 0; i < size; i++) {
+			unsigned byte_within_dword = (offset + i) % 4;
+			*map++ = (value >> (byte_within_dword * 8)) & 0xff;
+		}
 		return;
 	}
 
 	uint64_t va = r600_resource(dst)->gpu_address + offset;
 
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
-	}
-
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
+	/* Flush the caches. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
 
 	while (size) {
 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
 		unsigned dma_flags = tc_l2_flag;
 
-		si_need_cs_space(sctx);
-
-		/* This must be done after need_cs_space. */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
-				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
-				      RADEON_PRIO_CP_DMA);
-
-		/* Flush the caches for the first copy only.
-		 * Also wait for the previous CP DMA operations. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(sctx, NULL);
-			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
-		}
-
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count)
-			dma_flags |= R600_CP_DMA_SYNC;
+		si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, &dma_flags);
 
 		/* Emit the clear packet. */
 		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
@@ -188,12 +213,53 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 		r600_resource(dst)->TC_L2_dirty = true;
 }
 
+/**
+ * Realign the CP DMA engine. This must be done after a copy with an unaligned
+ * size.
+ *
+ * \param size  Remaining size to the CP DMA alignment.
+ */
+static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size)
+{
+	uint64_t va;
+	unsigned dma_flags = 0;
+	unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
+
+	assert(size < CP_DMA_ALIGNMENT);
+
+	/* Use the scratch buffer as the dummy buffer. The 3D engine should be
+	 * idle at this point.
+	 */
+	if (!sctx->scratch_buffer ||
+	    sctx->scratch_buffer->b.b.width0 < scratch_size) {
+		r600_resource_reference(&sctx->scratch_buffer, NULL);
+		sctx->scratch_buffer =
+			si_resource_create_custom(&sctx->screen->b.b,
+						  PIPE_USAGE_DEFAULT,
+						  scratch_size);
+		if (!sctx->scratch_buffer)
+			return;
+		sctx->emit_scratch_reloc = true;
+	}
+
+	si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
+			  &sctx->scratch_buffer->b.b, size, size, &dma_flags);
+
+	va = sctx->scratch_buffer->gpu_address;
+	si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size,
+				   dma_flags);
+}
+
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
 		    bool is_framebuffer)
 {
-	unsigned flush_flags, tc_l2_flag;
+	uint64_t main_dst_offset, main_src_offset;
+	unsigned skipped_size = 0;
+	unsigned realign_size = 0;
+	unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+	unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
 
 	if (!size)
 		return;
@@ -207,50 +273,63 @@ void si_copy_buffer(struct si_context *sctx,
 	dst_offset += r600_resource(dst)->gpu_address;
 	src_offset += r600_resource(src)->gpu_address;
 
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	/* If the size is not aligned, we must add a dummy copy at the end
+	 * just to align the internal counter. Otherwise, the DMA engine
+	 * would slow down by an order of magnitude for following copies.
+	 */
+	if (size % CP_DMA_ALIGNMENT)
+		realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
+
+	/* If the copy begins unaligned, we must start copying from the next
+	 * aligned block and the skipped part should be copied after everything
+	 * else has been copied. Only the src alignment matters, not dst.
+	 */
+	if (src_offset % CP_DMA_ALIGNMENT) {
+		skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
+		/* The main part will be skipped if the size is too small. */
+		skipped_size = MIN2(skipped_size, size);
+		size -= skipped_size;
 	}
 
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
+	/* Flush the caches. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+
+	/* This is the main part doing the copying. Src is always aligned. */
+	main_dst_offset = dst_offset + skipped_size;
+	main_src_offset = src_offset + skipped_size;
 
 	while (size) {
-		unsigned sync_flags = tc_l2_flag;
+		unsigned dma_flags = tc_l2_flag;
 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
 
-		si_need_cs_space(sctx);
+		si_cp_dma_prepare(sctx, dst, src, byte_count,
+				  size + skipped_size + realign_size,
+				  &dma_flags);
 
-		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(sctx, NULL);
-			sync_flags |= SI_CP_DMA_RAW_WAIT;
-		}
+		si_emit_cp_dma_copy_buffer(sctx, main_dst_offset, main_src_offset,
+					   byte_count, dma_flags);
 
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count) {
-			sync_flags |= R600_CP_DMA_SYNC;
-		}
+		size -= byte_count;
+		main_src_offset += byte_count;
+		main_dst_offset += byte_count;
+	}
 
-		/* This must be done after r600_need_cs_space. */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
-				      RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
-				      RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+	/* Copy the part we skipped because src wasn't aligned. */
+	if (skipped_size) {
+		unsigned dma_flags = tc_l2_flag;
 
-		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
+		si_cp_dma_prepare(sctx, dst, src, skipped_size,
+				  skipped_size + realign_size,
+				  &dma_flags);
 
-		size -= byte_count;
-		src_offset += byte_count;
-		dst_offset += byte_count;
+		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset,
+					   skipped_size, dma_flags);
 	}
 
+	/* Finally, realign the engine if the size wasn't aligned. */
+	if (realign_size)
+		si_cp_dma_realign_engine(sctx, realign_size);
+
 	/* Flush the caches again in case the 3D engine has been prefetching
 	 * the resource. */
 	sctx->b.flags |= flush_flags;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index a8ff6f27319..3fa3a9bbd6e 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -117,7 +117,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
 
 	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
 			      RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 
 	desc->list_dirty = false;
@@ -152,14 +152,14 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 		if (!rview->resource)
 			continue;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      rview->resource, RADEON_USAGE_READ,
 				      r600_get_sampler_view_priority(rview->resource));
 	}
 
 	if (!views->desc.buffer)
 		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
 }
 
@@ -177,12 +177,12 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
 			(struct si_sampler_view*)view;
 
 		if (rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				rview->resource, RADEON_USAGE_READ,
 				r600_get_sampler_view_priority(rview->resource));
 
 		if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				rview->dcc_buffer, RADEON_USAGE_READ,
 				RADEON_PRIO_DCC);
 
@@ -264,7 +264,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 {
 	if (!states->desc.buffer)
 		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, states->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
 }
 
@@ -334,14 +334,14 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 	while (mask) {
 		int i = u_bit_scan64(&mask);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource*)buffers->buffers[i],
 				      buffers->shader_usage, buffers->priority);
 	}
 
 	if (!buffers->desc.buffer)
 		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
 			      RADEON_PRIO_DESCRIPTORS);
 }
@@ -362,14 +362,14 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
 		if (!sctx->vertex_buffer[vb].buffer)
 			continue;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 	}
 
 	if (!desc->buffer)
 		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_DESCRIPTORS);
 }
@@ -396,7 +396,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	if (!desc->buffer)
 		return false;
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_DESCRIPTORS);
 
@@ -440,7 +440,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 		desc[3] = sctx->vertex_elements->rsrc_word3[i];
 
 		if (!bound[ve->vertex_buffer_index]) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					      (struct r600_resource*)vb->buffer,
 					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 			bound[ve->vertex_buffer_index] = true;
@@ -525,7 +525,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 
 		buffers->buffers[slot] = buffer;
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource*)buffer,
 				      buffers->shader_usage, buffers->priority);
 		buffers->desc.enabled_mask |= 1llu << slot;
@@ -620,7 +620,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			  S_008F0C_ADD_TID_ENABLE(add_tid);
 
 		pipe_resource_reference(&buffers->buffers[slot], buffer);
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource*)buffer,
 				      buffers->shader_usage, buffers->priority);
 		buffers->desc.enabled_mask |= 1llu << slot;
@@ -670,8 +670,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
 		 * used as an input immediately.
 		 */
-		sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
-				 SI_CONTEXT_INV_TC_L1 |
+		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+				 SI_CONTEXT_INV_VMEM_L1 |
 				 SI_CONTEXT_VS_PARTIAL_FLUSH;
 	}
 
@@ -710,7 +710,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			/* Set the resource. */
 			pipe_resource_reference(&buffers->buffers[bufidx],
 						buffer);
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					      (struct r600_resource*)buffer,
 					      buffers->shader_usage, buffers->priority);
 			buffers->desc.enabled_mask |= 1llu << bufidx;
@@ -809,7 +809,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 							    old_va, buf);
 				buffers->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
 
@@ -838,7 +838,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 							    old_va, buf);
 				buffers->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
 			}
@@ -863,7 +863,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 							    old_va, buf);
 				views->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 						      rbuffer, RADEON_USAGE_READ,
 						      RADEON_PRIO_SAMPLER_BUFFER);
 			}
@@ -948,7 +948,7 @@ static void si_emit_shader_pointer(struct si_context *sctx,
 				   struct si_descriptors *desc,
 				   unsigned sh_base, bool keep_dirty)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint64_t va;
 
 	if (!desc->pointer_dirty || !desc->buffer)
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 581e89f42d8..240d96190a9 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -49,7 +49,7 @@ static void si_dma_copy_buffer(struct si_context *ctx,
 				uint64_t src_offset,
 				uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	unsigned i, ncopy, csize, max_csize, sub_cmd, shift;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -78,9 +78,9 @@ static void si_dma_copy_buffer(struct si_context *ctx,
 
 	r600_need_dma_space(&ctx->b, ncopy * 5);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
 			      RADEON_PRIO_SDMA_BUFFER);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
 			      RADEON_PRIO_SDMA_BUFFER);
 
 	for (i = 0; i < ncopy; i++) {
@@ -111,7 +111,7 @@ static void si_dma_copy_tile(struct si_context *ctx,
 			     unsigned pitch,
 			     unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	struct si_screen *sscreen = ctx->screen;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -177,9 +177,9 @@ static void si_dma_copy_tile(struct si_context *ctx,
 	ncopy = (size / SI_DMA_COPY_MAX_SIZE_DW) + !!(size % SI_DMA_COPY_MAX_SIZE_DW);
 	r600_need_dma_space(&ctx->b, ncopy * 9);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
 			      RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
 			      RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 
 	for (i = 0; i < ncopy; i++) {
@@ -221,7 +221,7 @@ void si_dma_copy(struct pipe_context *ctx,
 	unsigned src_x, src_y;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (sctx->b.rings.dma.cs == NULL) {
+	if (sctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 7c147e2e44c..baa02293c41 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -29,17 +29,22 @@
 /* initialize */
 void si_need_cs_space(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_winsys_cs *dma = ctx->b.dma.cs;
+
+	/* Flush the DMA IB if it's not empty. */
+	if (dma && dma->cdw)
+		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 
 	/* There are two memory usage counters in the winsys for all buffers
 	 * that have been added (cs_add_buffer) and two counters in the pipe
 	 * driver for those that haven't been added yet.
 	 */
-	if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs,
+	if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs,
 						       ctx->b.vram, ctx->b.gtt))) {
 		ctx->b.gtt = 0;
 		ctx->b.vram = 0;
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 		return;
 	}
 	ctx->b.gtt = 0;
@@ -49,32 +54,36 @@ void si_need_cs_space(struct si_context *ctx)
 	 * and just flush if there is not enough space left.
 	 */
 	if (unlikely(cs->cdw > cs->max_dw - 2048))
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 }
 
 void si_context_gfx_flush(void *context, unsigned flags,
 			  struct pipe_fence_handle **fence)
 {
 	struct si_context *ctx = context;
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
 	struct radeon_winsys *ws = ctx->b.ws;
 
+	if (ctx->gfx_flush_in_progress)
+		return;
+
+	ctx->gfx_flush_in_progress = true;
+
 	if (cs->cdw == ctx->b.initial_gfx_cs_size &&
 	    (!fence || ctx->last_gfx_fence)) {
 		if (fence)
 			ws->fence_reference(fence, ctx->last_gfx_fence);
 		if (!(flags & RADEON_FLUSH_ASYNC))
 			ws->cs_sync_flush(cs);
+		ctx->gfx_flush_in_progress = false;
 		return;
 	}
 
-	ctx->b.rings.gfx.flushing = true;
-
 	r600_preflush_suspend_features(&ctx->b);
 
 	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
-			SI_CONTEXT_INV_TC_L1 |
-			SI_CONTEXT_INV_TC_L2 |
+			SI_CONTEXT_INV_VMEM_L1 |
+			SI_CONTEXT_INV_GLOBAL_L2 |
 			/* this is probably not needed anymore */
 			SI_CONTEXT_PS_PARTIAL_FLUSH;
 	si_emit_cache_flush(ctx, NULL);
@@ -111,7 +120,6 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	/* Flush the CS. */
 	ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
 		     ctx->screen->b.cs_count++);
-	ctx->b.rings.gfx.flushing = false;
 
 	if (fence)
 		ws->fence_reference(fence, ctx->last_gfx_fence);
@@ -121,6 +129,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
 		si_check_vm_faults(ctx);
 
 	si_begin_new_cs(ctx);
+	ctx->gfx_flush_in_progress = false;
 }
 
 void si_begin_new_cs(struct si_context *ctx)
@@ -144,9 +153,9 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	/* Flush read caches at the beginning of CS. */
 	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
-			SI_CONTEXT_INV_TC_L1 |
-			SI_CONTEXT_INV_TC_L2 |
-			SI_CONTEXT_INV_KCACHE |
+			SI_CONTEXT_INV_VMEM_L1 |
+			SI_CONTEXT_INV_GLOBAL_L2 |
+			SI_CONTEXT_INV_SMEM_L1 |
 			SI_CONTEXT_INV_ICACHE;
 
 	/* set all valid group as dirty so they get reemited on
@@ -156,6 +165,8 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	/* The CS initialization should be emitted before everything else. */
 	si_pm4_emit(ctx, ctx->init_config);
+	if (ctx->init_config_gs_rings)
+		si_pm4_emit(ctx, ctx->init_config_gs_rings);
 
 	ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
 	ctx->framebuffer.dirty_zsbuf = true;
@@ -173,6 +184,7 @@ void si_begin_new_cs(struct si_context *ctx)
 	si_mark_atom_dirty(ctx, &ctx->spi_map);
 	si_mark_atom_dirty(ctx, &ctx->spi_ps_input);
 	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
+	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 	si_all_descriptors_begin_new_cs(ctx);
 
 	ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
@@ -182,7 +194,7 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	r600_postflush_resume_features(&ctx->b);
 
-	ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
+	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw;
 
 	/* Invalidate various draw states so that they are emitted before
 	 * the first draw call. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 60baad3d13c..9a0fe808ee3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -50,6 +50,8 @@ static void si_destroy_context(struct pipe_context *context)
 	sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL);
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
+	if (sctx->init_config_gs_rings)
+		si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
 	for (i = 0; i < Elements(sctx->vgt_shader_config); i++)
 		si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
 
@@ -139,10 +141,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		sctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
-	sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
-					     sctx, sscreen->b.trace_bo ?
-						sscreen->b.trace_bo->cs_buf : NULL);
-	sctx->b.rings.gfx.flush = si_context_gfx_flush;
+	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
+				       sctx, sscreen->b.trace_bo ?
+					       sscreen->b.trace_bo->cs_buf : NULL);
+	sctx->b.gfx.flush = si_context_gfx_flush;
 
 	/* Border colors. */
 	sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS *
@@ -337,6 +339,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_CLEAR_TEXTURE:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 42cd8803c36..05d52fe19dc 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -46,15 +46,12 @@
 
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(R600_CONTEXT_PRIVATE_FLAG << 0)
-/* Cache used by scalar memory (SMEM) instructions. They also use TC
- * as a second level cache, which isn't flushed by this.
- * Other names: constant cache, data cache, DCACHE */
-#define SI_CONTEXT_INV_KCACHE		(R600_CONTEXT_PRIVATE_FLAG << 1)
-/* Caches used by vector memory (VMEM) instructions.
- * L1 can optionally be bypassed (GLC=1) and can only be used by shaders.
- * L2 is used by shaders and can be used by other blocks (CP, sDMA). */
-#define SI_CONTEXT_INV_TC_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
-#define SI_CONTEXT_INV_TC_L2		(R600_CONTEXT_PRIVATE_FLAG << 3)
+/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
+#define SI_CONTEXT_INV_SMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 1)
+/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
+#define SI_CONTEXT_INV_VMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
+/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
+#define SI_CONTEXT_INV_GLOBAL_L2	(R600_CONTEXT_PRIVATE_FLAG << 3)
 /* Framebuffer caches. */
 #define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4)
 #define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
@@ -176,6 +173,7 @@ struct si_context {
 	struct pipe_fence_handle	*last_gfx_fence;
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	LLVMTargetMachineRef		tm;
+	bool				gfx_flush_in_progress;
 
 	/* Atoms (direct states). */
 	union si_state_atoms		atoms;
@@ -204,6 +202,7 @@ struct si_context {
 
 	/* Precomputed states. */
 	struct si_pm4_state		*init_config;
+	struct si_pm4_state		*init_config_gs_rings;
 	bool				init_config_has_vgt_flush;
 	struct si_pm4_state		*vgt_shader_config[4];
 
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index f16933c5f98..c4ef2e78c50 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -127,10 +127,10 @@ void si_pm4_free_state(struct si_context *sctx,
 
 void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	for (int i = 0; i < state->nbo; ++i) {
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, state->bo[i],
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i],
 				      state->bo_usage[i], state->bo_priority[i]);
 	}
 
@@ -139,7 +139,7 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 	} else {
 		struct r600_resource *ib = state->indirect_buffer;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, ib,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib,
 					  RADEON_USAGE_READ,
                                           RADEON_PRIO_IB2);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a119cbdc16c..354d0646b99 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -164,49 +164,6 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 }
 
 /**
- * Given a semantic name and index of a parameter and a mask of used parameters
- * (inputs or outputs), return the index of the parameter in the list of all
- * used parameters.
- *
- * For example, assume this list of parameters:
- *   POSITION, PSIZE, GENERIC0, GENERIC2
- * which has the mask:
- *   11000000000101
- * Then:
- *   querying POSITION returns 0,
- *   querying PSIZE returns 1,
- *   querying GENERIC0 returns 2,
- *   querying GENERIC2 returns 3.
- *
- * Which can be used as an offset to a parameter buffer in units of vec4s.
- */
-static int get_param_index(unsigned semantic_name, unsigned index,
-			   uint64_t mask)
-{
-	unsigned unique_index = si_shader_io_get_unique_index(semantic_name, index);
-	int i, param_index = 0;
-
-	/* If not present... */
-	if (!((1llu << unique_index) & mask))
-		return -1;
-
-	for (i = 0; mask; i++) {
-		uint64_t bit = 1llu << i;
-
-		if (bit & mask) {
-			if (i == unique_index)
-				return param_index;
-
-			mask &= ~bit;
-			param_index++;
-		}
-	}
-
-	assert(!"unreachable");
-	return -1;
-}
-
-/**
  * Get the value of a shader input parameter and extract a bitfield.
  */
 static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
@@ -775,6 +732,7 @@ static LLVMValueRef fetch_input_gs(
 	struct tgsi_shader_info *info = &shader->selector->info;
 	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
+	unsigned param;
 
 	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
 		return get_primitive_id(bld_base, swizzle);
@@ -805,12 +763,10 @@ static LLVMValueRef fetch_input_gs(
 						   vtx_offset_param),
 				      4);
 
+	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
 	args[0] = si_shader_ctx->esgs_ring;
 	args[1] = vtx_offset;
-	args[2] = lp_build_const_int32(gallivm,
-				       (get_param_index(semantic_name, semantic_index,
-							shader->selector->inputs_read) * 4 +
-					swizzle) * 256);
+	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
 	args[3] = uint->zero;
 	args[4] = uint->one;  /* OFFEN */
 	args[5] = uint->zero; /* IDXEN */
@@ -2016,9 +1972,6 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 					    si_shader_ctx->param_es2gs_offset);
-	uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ?
-					   es->key.tes.es_enabled_outputs :
-					   es->key.vs.es_enabled_outputs;
 	unsigned chan;
 	int i;
 
@@ -2031,11 +1984,8 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
 			continue;
 
-		param_index = get_param_index(info->output_semantic_name[i],
-					      info->output_semantic_index[i],
-					      enabled_outputs);
-		if (param_index < 0)
-			continue;
+		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
+							    info->output_semantic_index[i]);
 
 		for (chan = 0; chan < 4; chan++) {
 			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
@@ -4023,10 +3973,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 			fprintf(f, !i ? "%u" : ", %u",
 				key->vs.instance_divisors[i]);
 		fprintf(f, "}\n");
-
-		if (key->vs.as_es)
-			fprintf(f, "  es_enabled_outputs = 0x%"PRIx64"\n",
-				key->vs.es_enabled_outputs);
 		fprintf(f, "  as_es = %u\n", key->vs.as_es);
 		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
 		fprintf(f, "  export_prim_id = %u\n", key->vs.export_prim_id);
@@ -4037,9 +3983,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 		break;
 
 	case PIPE_SHADER_TESS_EVAL:
-		if (key->tes.as_es)
-			fprintf(f, "  es_enabled_outputs = 0x%"PRIx64"\n",
-				key->tes.es_enabled_outputs);
 		fprintf(f, "  as_es = %u\n", key->tes.as_es);
 		fprintf(f, "  export_prim_id = %u\n", key->tes.export_prim_id);
 		break;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index fd5500c1ab3..3400a03d7bb 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -26,14 +26,15 @@
  *      Christian König <[email protected]>
  */
 
-/* How linking tessellation shader inputs and outputs works.
+/* How linking shader inputs and outputs between vertex, tessellation, and
+ * geometry shaders works.
  *
  * Inputs and outputs between shaders are stored in a buffer. This buffer
  * lives in LDS (typical case for tessellation), but it can also live
- * in memory. Each input or output has a fixed location within a vertex.
+ * in memory (ESGS). Each input or output has a fixed location within a vertex.
  * The highest used input or output determines the stride between vertices.
  *
- * Since tessellation is only enabled in the OpenGL core profile,
+ * Since GS and tessellation are only possible in the OpenGL core profile,
  * only these semantics are valid for per-vertex data:
  *
  *   Name             Location
@@ -57,13 +58,11 @@
  * That's how independent shaders agree on input and output locations.
  * The si_shader_io_get_unique_index function assigns the locations.
  *
- * Other required information for calculating the input and output addresses
- * like the vertex stride, the patch stride, and the offsets where per-vertex
- * and per-patch data start, is passed to the shader via user data SGPRs.
- * The offsets and strides are calculated at draw time and aren't available
- * at compile time.
- *
- * The same approach should be used for linking ES->GS in the future.
+ * For tessellation, other required information for calculating the input and
+ * output addresses like the vertex stride, the patch stride, and the offsets
+ * where per-vertex and per-patch data start, is passed to the shader via
+ * user data SGPRs. The offsets and strides are calculated at draw time and
+ * aren't available at compile time.
  */
 
 #ifndef SI_SHADER_H
@@ -202,13 +201,16 @@ struct si_shader_selector {
 	bool		forces_persample_interp_for_persp;
 	bool		forces_persample_interp_for_linear;
 
+	unsigned	esgs_itemsize;
+	unsigned	gs_input_verts_per_prim;
 	unsigned	gs_output_prim;
 	unsigned	gs_max_out_vertices;
 	unsigned	gs_num_invocations;
-	unsigned	gsvs_itemsize;
+	unsigned	max_gs_stream; /* count - 1 */
+	unsigned	gsvs_vertex_size;
+	unsigned	max_gsvs_emit_size;
 
 	/* masks of "get_unique_index" bits */
-	uint64_t	inputs_read;
 	uint64_t	outputs_written;
 	uint32_t	patch_outputs_written;
 	uint32_t	ps_colors_written;
@@ -241,7 +243,6 @@ union si_shader_key {
 		/* Mask of "get_unique_index" bits - which outputs are read
 		 * by the next stage (needed by ES).
 		 * This describes how outputs are laid out in memory. */
-		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
 		unsigned	as_ls:1; /* local shader */
 		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
@@ -253,7 +254,6 @@ union si_shader_key {
 		/* Mask of "get_unique_index" bits - which outputs are read
 		 * by the next stage (needed by ES).
 		 * This describes how outputs are laid out in memory. */
-		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
 		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 	} tes; /* tessellation evaluation shader */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 18b64056bc7..93847d5ec2f 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -248,7 +248,7 @@ static unsigned si_pack_float_12p4(float x)
  */
 static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_state_blend *blend = sctx->queued.named.blend;
 	uint32_t mask = 0, i;
 
@@ -265,7 +265,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at
 	 *
 	 * Reproducible with Unigine Heaven 4.0 and drirc missing.
 	 */
-	if (blend->dual_src_blend &&
+	if (blend && blend->dual_src_blend &&
 	    sctx->ps_shader.cso &&
 	    (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3)
 		mask = 0;
@@ -454,7 +454,7 @@ static void si_set_blend_color(struct pipe_context *ctx,
 
 static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
 	radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4);
@@ -486,7 +486,7 @@ static void si_set_clip_state(struct pipe_context *ctx,
 
 static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4);
 	radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4);
@@ -496,7 +496,7 @@ static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom)
 
 static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct tgsi_shader_info *info = si_get_vs_info(sctx);
 	unsigned window_space =
 	   info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
@@ -541,7 +541,7 @@ static void si_set_scissor_states(struct pipe_context *ctx,
 
 static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct pipe_scissor_state *states = sctx->scissors.states;
 	unsigned mask = sctx->scissors.dirty_mask;
 
@@ -593,7 +593,7 @@ static void si_set_viewport_states(struct pipe_context *ctx,
 
 static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct pipe_viewport_state *states = sctx->viewports.states;
 	unsigned mask = sctx->viewports.dirty_mask;
 
@@ -830,7 +830,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state)
  */
 static void si_emit_stencil_ref(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
 	struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
 
@@ -989,7 +989,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
 
 static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	unsigned db_shader_control;
 
@@ -2125,8 +2125,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	 * Flush all CB and DB caches here because all buffers can be used
 	 * for write by both TC (with shader image stores) and CB/DB.
 	 */
-	sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
 			 SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
 
 	/* Take the maximum of the old and new count. If the new count is lower,
@@ -2233,7 +2233,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 
 static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
 	unsigned i, nr_cbufs = state->nr_cbufs;
 	struct r600_texture *tex = NULL;
@@ -2252,20 +2252,20 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 		}
 
 		tex = (struct r600_texture *)cb->base.texture;
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      &tex->resource, RADEON_USAGE_READWRITE,
 				      tex->surface.nsamples > 1 ?
 					      RADEON_PRIO_COLOR_BUFFER_MSAA :
 					      RADEON_PRIO_COLOR_BUFFER);
 
 		if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				tex->cmask_buffer, RADEON_USAGE_READWRITE,
 				RADEON_PRIO_CMASK);
 		}
 
 		if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				tex->dcc_buffer, RADEON_USAGE_READWRITE,
 				RADEON_PRIO_DCC);
 		}
@@ -2305,14 +2305,14 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 		struct r600_surface *zb = (struct r600_surface*)state->zsbuf;
 		struct r600_texture *rtex = (struct r600_texture*)zb->base.texture;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      &rtex->resource, RADEON_USAGE_READWRITE,
 				      zb->base.texture->nr_samples > 1 ?
 					      RADEON_PRIO_DEPTH_BUFFER_MSAA :
 					      RADEON_PRIO_DEPTH_BUFFER);
 
 		if (zb->db_htile_data_base) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					      rtex->htile_buffer, RADEON_USAGE_READWRITE,
 					      RADEON_PRIO_HTILE);
 		}
@@ -2354,7 +2354,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 static void si_emit_msaa_sample_locs(struct si_context *sctx,
 				     struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned nr_samples = sctx->framebuffer.nr_samples;
 
 	cayman_emit_msaa_sample_locs(cs, nr_samples > 1 ? nr_samples :
@@ -2363,7 +2363,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx,
 
 static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples,
 				sctx->ps_iter_samples,
@@ -2846,7 +2846,7 @@ static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
 
 static void si_emit_sample_mask(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned mask = sctx->sample_mask.sample_mask;
 
 	radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
@@ -3044,8 +3044,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
-	sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
 			 SI_CONTEXT_FLUSH_AND_INV_CB;
 }
 
@@ -3069,6 +3069,7 @@ static void si_init_config(struct si_context *sctx);
 
 void si_init_state_functions(struct si_context *sctx)
 {
+	si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond);
 	si_init_external_atom(sctx, &sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin);
 	si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable);
 
@@ -3444,6 +3445,9 @@ static void si_init_config(struct si_context *sctx)
 		si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32);
 	}
 
+	if (sctx->b.family == CHIP_STONEY)
+		si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, 0);
+
 	si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
 	if (sctx->b.chip_class >= CIK)
 		si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 8b9a311cd3f..f5ca661f8d7 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -110,6 +110,7 @@ union si_state_atoms {
 	struct {
 		/* The order matters. */
 		struct r600_atom *cache_flush;
+		struct r600_atom *render_cond;
 		struct r600_atom *streamout_begin;
 		struct r600_atom *streamout_enable; /* must be after streamout_begin */
 		struct r600_atom *framebuffer;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cf0891a2ab7..753abc8c103 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 				       const struct pipe_draw_info *info,
 				       unsigned *num_patches)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader_ctx_state *ls = &sctx->vs_shader;
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES. */
@@ -353,7 +353,7 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx,
 
 static void si_emit_scratch_reloc(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	if (!sctx->emit_scratch_reloc)
 		return;
@@ -362,7 +362,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
 			       sctx->spi_tmpring_size);
 
 	if (sctx->scratch_buffer) {
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
 				      RADEON_PRIO_SCRATCH_BUFFER);
 
@@ -373,7 +373,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
 /* rast_prim is the primitive type after GS. */
 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned rast_prim = sctx->current_rast_prim;
 	struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
 
@@ -401,7 +401,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 static void si_emit_draw_registers(struct si_context *sctx,
 				   const struct pipe_draw_info *info)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
 	unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
@@ -455,8 +455,9 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				 const struct pipe_draw_info *info,
 				 const struct pipe_index_buffer *ib)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
+	bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
 
 	if (info->count_from_stream_output) {
 		struct r600_so_target *t =
@@ -476,7 +477,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
 		radeon_emit(cs, 0); /* unused */
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      t->buf_filled_size, RADEON_USAGE_READ,
 				      RADEON_PRIO_SO_FILLED_SIZE);
 	}
@@ -530,7 +531,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	} else {
 		si_invalidate_draw_sh_constants(sctx);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource *)info->indirect,
 				      RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 	}
@@ -540,7 +541,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 					  ib->index_size;
 		uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource *)ib->buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
 
@@ -563,7 +564,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 			radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
 			radeon_emit(cs, index_max_size);
 
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, render_cond_bit));
 			radeon_emit(cs, info->indirect_offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
@@ -571,7 +572,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		} else {
 			index_va += info->start * ib->index_size;
 
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
 			radeon_emit(cs, index_max_size);
 			radeon_emit(cs, index_va);
 			radeon_emit(cs, (index_va >> 32UL) & 0xFF);
@@ -590,13 +591,13 @@ static void si_emit_draw_packets(struct si_context *sctx,
 			radeon_emit(cs, indirect_va);
 			radeon_emit(cs, indirect_va >> 32);
 
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, render_cond_bit));
 			radeon_emit(cs, info->indirect_offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
 		} else {
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
 			radeon_emit(cs, info->count);
 			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
 				    S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
@@ -604,12 +605,10 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	}
 }
 
-#define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE)
-
 void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 {
 	struct r600_common_context *sctx = &si_ctx->b;
-	struct radeon_winsys_cs *cs = sctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->gfx.cs;
 	uint32_t cp_coher_cntl = 0;
 	uint32_t compute =
 		PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE));
@@ -624,12 +623,12 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 
 	if (sctx->flags & SI_CONTEXT_INV_ICACHE)
 		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_KCACHE)
+	if (sctx->flags & SI_CONTEXT_INV_SMEM_L1)
 		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
 
-	if (sctx->flags & SI_CONTEXT_INV_TC_L1)
+	if (sctx->flags & SI_CONTEXT_INV_VMEM_L1)
 		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_TC_L2) {
+	if (sctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
 		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
 
 		/* TODO: this might not be needed. */
@@ -843,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	/* VI reads index buffers through TC L2. */
 	if (info->indexed && sctx->b.chip_class <= CIK &&
 	    r600_resource(ib.buffer)->TC_L2_dirty) {
-		sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
 		r600_resource(ib.buffer)->TC_L2_dirty = false;
 	}
 
@@ -909,10 +908,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 void si_trace_emit(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	sctx->trace_id++;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, sctx->trace_buf,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
 	radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4a3a04caa52..7f6511cf01b 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -33,6 +33,7 @@
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_ureg.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "util/u_simple_shaders.h"
 
 static void si_set_tesseval_regs(struct si_shader *shader,
@@ -194,6 +195,8 @@ static void si_shader_es(struct si_shader *shader)
 	}
 	assert(num_sgprs <= 104);
 
+	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+		       shader->selector->esgs_itemsize / 4);
 	si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
 	si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
 	si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
@@ -209,32 +212,17 @@ static void si_shader_es(struct si_shader *shader)
 		si_set_tesseval_regs(shader, pm4);
 }
 
-static unsigned si_gs_get_max_stream(struct si_shader *shader)
-{
-	struct pipe_stream_output_info *so = &shader->selector->so;
-	unsigned max_stream = 0, i;
-
-	if (so->num_outputs == 0)
-		return 0;
-
-	for (i = 0; i < so->num_outputs; i++) {
-		if (so->output[i].stream > max_stream)
-			max_stream = so->output[i].stream;
-	}
-	return max_stream;
-}
-
 static void si_shader_gs(struct si_shader *shader)
 {
-	unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16;
+	unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
 	unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
-	unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2;
+	unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
 	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
 	unsigned cut_mode;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
 	uint64_t va;
-	unsigned max_stream = si_gs_get_max_stream(shader);
+	unsigned max_stream = shader->selector->max_gs_stream;
 
 	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
 	assert(gsvs_itemsize < (1 << 15));
@@ -265,8 +253,6 @@ static void si_shader_gs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
 	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
 
-	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-		       util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
 	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
 
 	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
@@ -529,10 +515,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 
 		if (sctx->tes_shader.cso)
 			key->vs.as_ls = 1;
-		else if (sctx->gs_shader.cso) {
+		else if (sctx->gs_shader.cso)
 			key->vs.as_es = 1;
-			key->vs.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
-		}
 
 		if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
 		    sctx->ps_shader.cso->info.uses_primid)
@@ -543,10 +527,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 		break;
 	case PIPE_SHADER_TESS_EVAL:
-		if (sctx->gs_shader.cso) {
+		if (sctx->gs_shader.cso)
 			key->tes.as_es = 1;
-			key->tes.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
-		} else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+		else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 			key->tes.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
@@ -713,25 +696,22 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 			sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
 		sel->gs_num_invocations =
 			sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
-		sel->gsvs_itemsize = sel->info.num_outputs * 16 *
-				     sel->gs_max_out_vertices;
+		sel->gsvs_vertex_size = sel->info.num_outputs * 16;
+		sel->max_gsvs_emit_size = sel->gsvs_vertex_size *
+					  sel->gs_max_out_vertices;
 
-		for (i = 0; i < sel->info.num_inputs; i++) {
-			unsigned name = sel->info.input_semantic_name[i];
-			unsigned index = sel->info.input_semantic_index[i];
+		sel->max_gs_stream = 0;
+		for (i = 0; i < sel->so.num_outputs; i++)
+			sel->max_gs_stream = MAX2(sel->max_gs_stream,
+						  sel->so.output[i].stream);
 
-			switch (name) {
-			case TGSI_SEMANTIC_PRIMID:
-				break;
-			default:
-				sel->inputs_read |=
-					1llu << si_shader_io_get_unique_index(name, index);
-			}
-		}
+		sel->gs_input_verts_per_prim =
+			u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
 		break;
 
 	case PIPE_SHADER_VERTEX:
 	case PIPE_SHADER_TESS_CTRL:
+	case PIPE_SHADER_TESS_EVAL:
 		for (i = 0; i < sel->info.num_outputs; i++) {
 			unsigned name = sel->info.output_semantic_name[i];
 			unsigned index = sel->info.output_semantic_index[i];
@@ -748,6 +728,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 					1llu << si_shader_io_get_unique_index(name, index);
 			}
 		}
+		sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
 		break;
 	case PIPE_SHADER_FRAGMENT:
 		for (i = 0; i < sel->info.num_outputs; i++) {
@@ -937,7 +918,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 
 static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	struct si_shader *vs = si_get_vs_state(sctx);
 	struct tgsi_shader_info *psinfo;
@@ -1009,7 +990,7 @@ bcolor:
 
 static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	unsigned input_ena;
 
@@ -1077,6 +1058,7 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
 	if (sctx->init_config_has_vgt_flush)
 		return;
 
+	/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
 	si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
 	si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 	si_pm4_cmd_end(sctx->init_config, false);
@@ -1084,70 +1066,127 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
 }
 
 /* Initialize state related to ESGS / GSVS ring buffers */
-static void si_init_gs_rings(struct si_context *sctx)
+static bool si_update_gs_ring_buffers(struct si_context *sctx)
 {
-	unsigned esgs_ring_size = 128 * 1024;
-	unsigned gsvs_ring_size = 60 * 1024 * 1024;
+	struct si_shader_selector *es =
+		sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
+	struct si_shader_selector *gs = sctx->gs_shader.cso;
+	struct si_pm4_state *pm4;
 
-	assert(!sctx->esgs_ring && !sctx->gsvs_ring);
+	/* Chip constants. */
+	unsigned num_se = sctx->screen->b.info.max_se;
+	unsigned wave_size = 64;
+	unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
+	unsigned gs_vertex_reuse = 16 * num_se; /* GS_VERTEX_REUSE register (per SE) */
+	unsigned alignment = 256 * num_se;
+	/* The maximum size is 63.999 MB per SE. */
+	unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
+
+	/* Calculate the minimum size. */
+	unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse *
+					    wave_size, alignment);
+
+	/* These are recommended sizes, not minimum sizes. */
+	unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
+				  es->esgs_itemsize * gs->gs_input_verts_per_prim;
+	unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
+				  gs->max_gsvs_emit_size * (gs->max_gs_stream + 1);
+
+	min_esgs_ring_size = align(min_esgs_ring_size, alignment);
+	esgs_ring_size = align(esgs_ring_size, alignment);
+	gsvs_ring_size = align(gsvs_ring_size, alignment);
+
+	esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+	gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
+
+	/* Some rings don't have to be allocated if shaders don't use them.
+	 * (e.g. no varyings between ES and GS or GS and VS)
+	 */
+	bool update_esgs = esgs_ring_size &&
+			   (!sctx->esgs_ring ||
+			    sctx->esgs_ring->width0 < esgs_ring_size);
+	bool update_gsvs = gsvs_ring_size &&
+			   (!sctx->gsvs_ring ||
+			    sctx->gsvs_ring->width0 < gsvs_ring_size);
 
-	sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-				       PIPE_USAGE_DEFAULT, esgs_ring_size);
-	if (!sctx->esgs_ring)
-		return;
+	if (!update_esgs && !update_gsvs)
+		return true;
 
-	sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-					     PIPE_USAGE_DEFAULT, gsvs_ring_size);
-	if (!sctx->gsvs_ring) {
+	if (update_esgs) {
 		pipe_resource_reference(&sctx->esgs_ring, NULL);
-		return;
+		sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+						     PIPE_USAGE_DEFAULT,
+						     esgs_ring_size);
+		if (!sctx->esgs_ring)
+			return false;
 	}
 
-	si_init_config_add_vgt_flush(sctx);
+	if (update_gsvs) {
+		pipe_resource_reference(&sctx->gsvs_ring, NULL);
+		sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+						     PIPE_USAGE_DEFAULT,
+						     gsvs_ring_size);
+		if (!sctx->gsvs_ring)
+			return false;
+	}
+
+	/* Create the "init_config_gs_rings" state. */
+	pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (!pm4)
+		return false;
 
-	/* Append these registers to the init config state. */
 	if (sctx->b.chip_class >= CIK) {
-		if (sctx->b.chip_class >= VI) {
-			/* The maximum sizes are 63.999 MB on VI, because
-			 * the register fields only have 18 bits. */
-			assert(esgs_ring_size / 256 < (1 << 18));
-			assert(gsvs_ring_size / 256 < (1 << 18));
-		}
-		si_pm4_set_reg(sctx->init_config, R_030900_VGT_ESGS_RING_SIZE,
-			       esgs_ring_size / 256);
-		si_pm4_set_reg(sctx->init_config, R_030904_VGT_GSVS_RING_SIZE,
-			       gsvs_ring_size / 256);
+		if (sctx->esgs_ring)
+			si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
+				       sctx->esgs_ring->width0 / 256);
+		if (sctx->gsvs_ring)
+			si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
+				       sctx->gsvs_ring->width0 / 256);
 	} else {
-		si_pm4_set_reg(sctx->init_config, R_0088C8_VGT_ESGS_RING_SIZE,
-			       esgs_ring_size / 256);
-		si_pm4_set_reg(sctx->init_config, R_0088CC_VGT_GSVS_RING_SIZE,
-			       gsvs_ring_size / 256);
+		if (sctx->esgs_ring)
+			si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
+				       sctx->esgs_ring->width0 / 256);
+		if (sctx->gsvs_ring)
+			si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
+				       sctx->gsvs_ring->width0 / 256);
 	}
 
-	/* Flush the context to re-emit the init_config state.
-	 * This is done only once in a lifetime of a context.
-	 */
-	si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+	/* Set the state. */
+	if (sctx->init_config_gs_rings)
+		si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
+	sctx->init_config_gs_rings = pm4;
+
+	if (!sctx->init_config_has_vgt_flush) {
+		si_init_config_add_vgt_flush(sctx);
+		si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+	}
+
+	/* Flush the context to re-emit both init_config states. */
 	sctx->b.initial_gfx_cs_size = 0; /* force flush */
 	si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
 
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
-			   sctx->esgs_ring, 0, esgs_ring_size,
-			   true, true, 4, 64, 0);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
-			   sctx->esgs_ring, 0, esgs_ring_size,
-			   false, false, 0, 0, 0);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
-			   sctx->gsvs_ring, 0, gsvs_ring_size,
-			   false, false, 0, 0, 0);
+	/* Set ring bindings. */
+	if (sctx->esgs_ring) {
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
+				   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+				   true, true, 4, 64, 0);
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
+				   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+				   false, false, 0, 0, 0);
+	}
+	if (sctx->gsvs_ring)
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
+				   sctx->gsvs_ring, 0, sctx->gsvs_ring->width0,
+				   false, false, 0, 0, 0);
+	return true;
 }
 
-static void si_update_gs_rings(struct si_context *sctx)
+static void si_update_gsvs_ring_bindings(struct si_context *sctx)
 {
-	unsigned gsvs_itemsize = sctx->gs_shader.cso->gsvs_itemsize;
+	unsigned gsvs_itemsize = sctx->gs_shader.cso->max_gsvs_emit_size;
 	uint64_t offset;
 
-	if (gsvs_itemsize == sctx->last_gsvs_itemsize)
+	if (!sctx->gsvs_ring || gsvs_itemsize == sctx->last_gsvs_itemsize)
 		return;
 
 	sctx->last_gsvs_itemsize = gsvs_itemsize;
@@ -1508,13 +1547,10 @@ bool si_update_shaders(struct si_context *sctx)
 		si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4);
 		si_update_so(sctx, sctx->gs_shader.cso);
 
-		if (!sctx->gsvs_ring) {
-			si_init_gs_rings(sctx);
-			if (!sctx->gsvs_ring)
-				return false;
-		}
+		if (!si_update_gs_ring_buffers(sctx))
+			return false;
 
-		si_update_gs_rings(sctx);
+		si_update_gsvs_ring_bindings(sctx);
 	} else {
 		si_pm4_bind_state(sctx, gs, NULL);
 		si_pm4_bind_state(sctx, es, NULL);
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 4bb24572b90..0c48340beef 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -3608,6 +3608,9 @@
 #define   S_00B854_WAVES_PER_SH(x)                                    (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */
 #define   G_00B854_WAVES_PER_SH(x)                                    (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */
 #define   C_00B854_WAVES_PER_SH                                       0xFFFFFFC0 /* mask is 0x3FF on CIK */
+#define   S_00B854_WAVES_PER_SH_CIK(x)                                (((x) & 0x3FF) << 0)
+#define   G_00B854_WAVES_PER_SH_CIK(x)                                (((x) >> 0) & 0x3FF)
+#define   C_00B854_WAVES_PER_SH_CIK                                   0xFFFFFC00
 #define   S_00B854_TG_PER_CU(x)                                       (((x) & 0x0F) << 12)
 #define   G_00B854_TG_PER_CU(x)                                       (((x) >> 12) & 0x0F)
 #define   C_00B854_TG_PER_CU                                          0xFFFF0FFF
@@ -5211,6 +5214,296 @@
 #define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
 #define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
 #define     V_028714_SPI_SHADER_32_ABGR                             0x09
+/* Stoney */
+#define R_028754_SX_PS_DOWNCONVERT                                      0x028754
+#define   S_028754_MRT0(x)                                            (((x) & 0x0F) << 0)
+#define   G_028754_MRT0(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028754_MRT0                                               0xFFFFFFF0
+#define     V_028754_SX_RT_EXPORT_NO_CONVERSION				0
+#define     V_028754_SX_RT_EXPORT_32_R					1
+#define     V_028754_SX_RT_EXPORT_32_A					2
+#define     V_028754_SX_RT_EXPORT_10_11_11				3
+#define     V_028754_SX_RT_EXPORT_2_10_10_10				4
+#define     V_028754_SX_RT_EXPORT_8_8_8_8				5
+#define     V_028754_SX_RT_EXPORT_5_6_5					6
+#define     V_028754_SX_RT_EXPORT_1_5_5_5				7
+#define     V_028754_SX_RT_EXPORT_4_4_4_4				8
+#define     V_028754_SX_RT_EXPORT_16_16_GR				9
+#define     V_028754_SX_RT_EXPORT_16_16_AR				10
+#define   S_028754_MRT1(x)                                            (((x) & 0x0F) << 4)
+#define   G_028754_MRT1(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028754_MRT1                                               0xFFFFFF0F
+#define   S_028754_MRT2(x)                                            (((x) & 0x0F) << 8)
+#define   G_028754_MRT2(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028754_MRT2                                               0xFFFFF0FF
+#define   S_028754_MRT3(x)                                            (((x) & 0x0F) << 12)
+#define   G_028754_MRT3(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028754_MRT3                                               0xFFFF0FFF
+#define   S_028754_MRT4(x)                                            (((x) & 0x0F) << 16)
+#define   G_028754_MRT4(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028754_MRT4                                               0xFFF0FFFF
+#define   S_028754_MRT5(x)                                            (((x) & 0x0F) << 20)
+#define   G_028754_MRT5(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028754_MRT5                                               0xFF0FFFFF
+#define   S_028754_MRT6(x)                                            (((x) & 0x0F) << 24)
+#define   G_028754_MRT6(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028754_MRT6                                               0xF0FFFFFF
+#define   S_028754_MRT7(x)                                            (((x) & 0x0F) << 28)
+#define   G_028754_MRT7(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028754_MRT7                                               0x0FFFFFFF
+#define R_028758_SX_BLEND_OPT_EPSILON                                   0x028758
+#define   S_028758_MRT0_EPSILON(x)                                    (((x) & 0x0F) << 0)
+#define   G_028758_MRT0_EPSILON(x)                                    (((x) >> 0) & 0x0F)
+#define   C_028758_MRT0_EPSILON                                       0xFFFFFFF0
+#define      V_028758_EXACT						0
+#define      V_028758_11BIT_FORMAT					1
+#define      V_028758_10BIT_FORMAT					3
+#define      V_028758_8BIT_FORMAT					7
+#define      V_028758_6BIT_FORMAT					11
+#define      V_028758_5BIT_FORMAT					13
+#define      V_028758_4BIT_FORMAT					15
+#define   S_028758_MRT1_EPSILON(x)                                    (((x) & 0x0F) << 4)
+#define   G_028758_MRT1_EPSILON(x)                                    (((x) >> 4) & 0x0F)
+#define   C_028758_MRT1_EPSILON                                       0xFFFFFF0F
+#define   S_028758_MRT2_EPSILON(x)                                    (((x) & 0x0F) << 8)
+#define   G_028758_MRT2_EPSILON(x)                                    (((x) >> 8) & 0x0F)
+#define   C_028758_MRT2_EPSILON                                       0xFFFFF0FF
+#define   S_028758_MRT3_EPSILON(x)                                    (((x) & 0x0F) << 12)
+#define   G_028758_MRT3_EPSILON(x)                                    (((x) >> 12) & 0x0F)
+#define   C_028758_MRT3_EPSILON                                       0xFFFF0FFF
+#define   S_028758_MRT4_EPSILON(x)                                    (((x) & 0x0F) << 16)
+#define   G_028758_MRT4_EPSILON(x)                                    (((x) >> 16) & 0x0F)
+#define   C_028758_MRT4_EPSILON                                       0xFFF0FFFF
+#define   S_028758_MRT5_EPSILON(x)                                    (((x) & 0x0F) << 20)
+#define   G_028758_MRT5_EPSILON(x)                                    (((x) >> 20) & 0x0F)
+#define   C_028758_MRT5_EPSILON                                       0xFF0FFFFF
+#define   S_028758_MRT6_EPSILON(x)                                    (((x) & 0x0F) << 24)
+#define   G_028758_MRT6_EPSILON(x)                                    (((x) >> 24) & 0x0F)
+#define   C_028758_MRT6_EPSILON                                       0xF0FFFFFF
+#define   S_028758_MRT7_EPSILON(x)                                    (((x) & 0x0F) << 28)
+#define   G_028758_MRT7_EPSILON(x)                                    (((x) >> 28) & 0x0F)
+#define   C_028758_MRT7_EPSILON                                       0x0FFFFFFF
+#define R_02875C_SX_BLEND_OPT_CONTROL                                   0x02875C
+#define   S_02875C_MRT0_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 0)
+#define   G_02875C_MRT0_COLOR_OPT_DISABLE(x)                          (((x) >> 0) & 0x1)
+#define   C_02875C_MRT0_COLOR_OPT_DISABLE                             0xFFFFFFFE
+#define   S_02875C_MRT0_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 1)
+#define   G_02875C_MRT0_ALPHA_OPT_DISABLE(x)                          (((x) >> 1) & 0x1)
+#define   C_02875C_MRT0_ALPHA_OPT_DISABLE                             0xFFFFFFFD
+#define   S_02875C_MRT1_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 4)
+#define   G_02875C_MRT1_COLOR_OPT_DISABLE(x)                          (((x) >> 4) & 0x1)
+#define   C_02875C_MRT1_COLOR_OPT_DISABLE                             0xFFFFFFEF
+#define   S_02875C_MRT1_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 5)
+#define   G_02875C_MRT1_ALPHA_OPT_DISABLE(x)                          (((x) >> 5) & 0x1)
+#define   C_02875C_MRT1_ALPHA_OPT_DISABLE                             0xFFFFFFDF
+#define   S_02875C_MRT2_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 8)
+#define   G_02875C_MRT2_COLOR_OPT_DISABLE(x)                          (((x) >> 8) & 0x1)
+#define   C_02875C_MRT2_COLOR_OPT_DISABLE                             0xFFFFFEFF
+#define   S_02875C_MRT2_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 9)
+#define   G_02875C_MRT2_ALPHA_OPT_DISABLE(x)                          (((x) >> 9) & 0x1)
+#define   C_02875C_MRT2_ALPHA_OPT_DISABLE                             0xFFFFFDFF
+#define   S_02875C_MRT3_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 12)
+#define   G_02875C_MRT3_COLOR_OPT_DISABLE(x)                          (((x) >> 12) & 0x1)
+#define   C_02875C_MRT3_COLOR_OPT_DISABLE                             0xFFFFEFFF
+#define   S_02875C_MRT3_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 13)
+#define   G_02875C_MRT3_ALPHA_OPT_DISABLE(x)                          (((x) >> 13) & 0x1)
+#define   C_02875C_MRT3_ALPHA_OPT_DISABLE                             0xFFFFDFFF
+#define   S_02875C_MRT4_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 16)
+#define   G_02875C_MRT4_COLOR_OPT_DISABLE(x)                          (((x) >> 16) & 0x1)
+#define   C_02875C_MRT4_COLOR_OPT_DISABLE                             0xFFFEFFFF
+#define   S_02875C_MRT4_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 17)
+#define   G_02875C_MRT4_ALPHA_OPT_DISABLE(x)                          (((x) >> 17) & 0x1)
+#define   C_02875C_MRT4_ALPHA_OPT_DISABLE                             0xFFFDFFFF
+#define   S_02875C_MRT5_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 20)
+#define   G_02875C_MRT5_COLOR_OPT_DISABLE(x)                          (((x) >> 20) & 0x1)
+#define   C_02875C_MRT5_COLOR_OPT_DISABLE                             0xFFEFFFFF
+#define   S_02875C_MRT5_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 21)
+#define   G_02875C_MRT5_ALPHA_OPT_DISABLE(x)                          (((x) >> 21) & 0x1)
+#define   C_02875C_MRT5_ALPHA_OPT_DISABLE                             0xFFDFFFFF
+#define   S_02875C_MRT6_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 24)
+#define   G_02875C_MRT6_COLOR_OPT_DISABLE(x)                          (((x) >> 24) & 0x1)
+#define   C_02875C_MRT6_COLOR_OPT_DISABLE                             0xFEFFFFFF
+#define   S_02875C_MRT6_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 25)
+#define   G_02875C_MRT6_ALPHA_OPT_DISABLE(x)                          (((x) >> 25) & 0x1)
+#define   C_02875C_MRT6_ALPHA_OPT_DISABLE                             0xFDFFFFFF
+#define   S_02875C_MRT7_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 28)
+#define   G_02875C_MRT7_COLOR_OPT_DISABLE(x)                          (((x) >> 28) & 0x1)
+#define   C_02875C_MRT7_COLOR_OPT_DISABLE                             0xEFFFFFFF
+#define   S_02875C_MRT7_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 29)
+#define   G_02875C_MRT7_ALPHA_OPT_DISABLE(x)                          (((x) >> 29) & 0x1)
+#define   C_02875C_MRT7_ALPHA_OPT_DISABLE                             0xDFFFFFFF
+#define   S_02875C_PIXEN_ZERO_OPT_DISABLE(x)                          (((x) & 0x1) << 31)
+#define   G_02875C_PIXEN_ZERO_OPT_DISABLE(x)                          (((x) >> 31) & 0x1)
+#define   C_02875C_PIXEN_ZERO_OPT_DISABLE                             0x7FFFFFFF
+#define R_028760_SX_MRT0_BLEND_OPT                                      0x028760
+#define   S_028760_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028760_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028760_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define     V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL			0
+#define     V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE			1
+#define     V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0			2
+#define     V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1			3
+#define     V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0			4
+#define     V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1			5
+#define     V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0			6
+#define     V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE		7
+#define   S_028760_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028760_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028760_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028760_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028760_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028760_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define     V_028760_OPT_COMB_NONE					0
+#define     V_028760_OPT_COMB_ADD					1
+#define     V_028760_OPT_COMB_SUBTRACT					2
+#define     V_028760_OPT_COMB_MIN					3
+#define     V_028760_OPT_COMB_MAX					4
+#define     V_028760_OPT_COMB_REVSUBTRACT				5
+#define     V_028760_OPT_COMB_BLEND_DISABLED				6
+#define     V_028760_OPT_COMB_SAFE_ADD					7
+#define   S_028760_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028760_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028760_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028760_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028760_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028760_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028760_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028760_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028760_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028764_SX_MRT1_BLEND_OPT                                      0x028764
+#define   S_028764_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028764_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028764_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028764_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028764_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028764_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028764_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028764_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028764_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028764_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028764_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028764_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028764_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028764_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028764_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028764_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028764_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028764_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028768_SX_MRT2_BLEND_OPT                                      0x028768
+#define   S_028768_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028768_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028768_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028768_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028768_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028768_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028768_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028768_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028768_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028768_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028768_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028768_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028768_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028768_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028768_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028768_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028768_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028768_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_02876C_SX_MRT3_BLEND_OPT                                      0x02876C
+#define   S_02876C_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_02876C_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_02876C_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_02876C_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_02876C_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_02876C_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_02876C_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_02876C_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_02876C_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_02876C_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_02876C_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_02876C_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_02876C_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_02876C_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_02876C_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_02876C_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_02876C_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_02876C_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028770_SX_MRT4_BLEND_OPT                                      0x028770
+#define   S_028770_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028770_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028770_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028770_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028770_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028770_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028770_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028770_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028770_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028770_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028770_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028770_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028770_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028770_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028770_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028770_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028770_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028770_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028774_SX_MRT5_BLEND_OPT                                      0x028774
+#define   S_028774_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028774_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028774_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028774_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028774_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028774_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028774_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028774_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028774_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028774_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028774_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028774_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028774_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028774_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028774_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028774_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028774_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028774_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028778_SX_MRT6_BLEND_OPT                                      0x028778
+#define   S_028778_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028778_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028778_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028778_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028778_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028778_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028778_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028778_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028778_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028778_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028778_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028778_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028778_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028778_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028778_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028778_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028778_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028778_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_02877C_SX_MRT7_BLEND_OPT                                      0x02877C
+#define   S_02877C_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_02877C_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_02877C_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_02877C_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_02877C_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_02877C_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_02877C_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_02877C_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_02877C_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_02877C_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_02877C_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_02877C_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_02877C_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_02877C_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_02877C_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_02877C_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_02877C_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_02877C_ALPHA_COMB_FCN                                     0xF8FFFFFF
+/*        */
 #define R_028780_CB_BLEND0_CONTROL                                      0x028780
 #define   S_028780_COLOR_SRCBLEND(x)                                  (((x) & 0x1F) << 0)
 #define   G_028780_COLOR_SRCBLEND(x)                                  (((x) >> 0) & 0x1F)
@@ -5473,6 +5766,7 @@
 #define     V_028808_CB_ELIMINATE_FAST_CLEAR                        0x02
 #define     V_028808_CB_RESOLVE                                     0x03
 #define     V_028808_CB_FMASK_DECOMPRESS                            0x05
+#define     V_028808_CB_DCC_DECOMPRESS                              0x06
 #define   S_028808_ROP3(x)                                            (((x) & 0xFF) << 16)
 #define   G_028808_ROP3(x)                                            (((x) >> 16) & 0xFF)
 #define   C_028808_ROP3                                               0xFF00FFFF
@@ -5551,6 +5845,11 @@
 #define     V_02880C_EXPORT_GREATER_THAN_Z                          2
 #define     V_02880C_EXPORT_RESERVED                                3
 /*     */
+/* Stoney */
+#define   S_02880C_DUAL_QUAD_DISABLE(x)                               (((x) & 0x1) << 15)
+#define   G_02880C_DUAL_QUAD_DISABLE(x)                               (((x) >> 15) & 0x1)
+#define   C_02880C_DUAL_QUAD_DISABLE                                  0xFFFF7FFF
+/*        */
 #define R_028810_PA_CL_CLIP_CNTL                                        0x028810
 #define   S_028810_UCP_ENA_0(x)                                       (((x) & 0x1) << 0)
 #define   G_028810_UCP_ENA_0(x)                                       (((x) >> 0) & 0x1)
@@ -6132,6 +6431,9 @@
 #define     V_028A40_GS_SCENARIO_G                                  0x03
 #define     V_028A40_GS_SCENARIO_C                                  0x04
 #define     V_028A40_SPRITE_EN                                      0x05
+#define   S_028A40_RESERVED_0(x)                                      (((x) & 0x1) << 3)
+#define   G_028A40_RESERVED_0(x)                                      (((x) >> 3) & 0x1)
+#define   C_028A40_RESERVED_0                                         0xFFFFFFF7
 #define   S_028A40_CUT_MODE(x)                                        (((x) & 0x03) << 4)
 #define   G_028A40_CUT_MODE(x)                                        (((x) >> 4) & 0x03)
 #define   C_028A40_CUT_MODE                                           0xFFFFFFCF
@@ -6139,12 +6441,19 @@
 #define     V_028A40_GS_CUT_512                                     0x01
 #define     V_028A40_GS_CUT_256                                     0x02
 #define     V_028A40_GS_CUT_128                                     0x03
+#define   S_028A40_RESERVED_1(x)                                      (((x) & 0x1F) << 6)
+#define   G_028A40_RESERVED_1(x)                                      (((x) >> 6) & 0x1F)
+#define   C_028A40_RESERVED_1                                         0xFFFFF83F
 #define   S_028A40_GS_C_PACK_EN(x)                                    (((x) & 0x1) << 11)
 #define   G_028A40_GS_C_PACK_EN(x)                                    (((x) >> 11) & 0x1)
 #define   C_028A40_GS_C_PACK_EN                                       0xFFFFF7FF
+#define   S_028A40_RESERVED_2(x)                                      (((x) & 0x1) << 12)
+#define   G_028A40_RESERVED_2(x)                                      (((x) >> 12) & 0x1)
+#define   C_028A40_RESERVED_2                                         0xFFFFEFFF
 #define   S_028A40_ES_PASSTHRU(x)                                     (((x) & 0x1) << 13)
 #define   G_028A40_ES_PASSTHRU(x)                                     (((x) >> 13) & 0x1)
 #define   C_028A40_ES_PASSTHRU                                        0xFFFFDFFF
+/* SI-CIK */
 #define   S_028A40_COMPUTE_MODE(x)                                    (((x) & 0x1) << 14)
 #define   G_028A40_COMPUTE_MODE(x)                                    (((x) >> 14) & 0x1)
 #define   C_028A40_COMPUTE_MODE                                       0xFFFFBFFF
@@ -6154,6 +6463,7 @@
 #define   S_028A40_ELEMENT_INFO_EN(x)                                 (((x) & 0x1) << 16)
 #define   G_028A40_ELEMENT_INFO_EN(x)                                 (((x) >> 16) & 0x1)
 #define   C_028A40_ELEMENT_INFO_EN                                    0xFFFEFFFF
+/*        */
 #define   S_028A40_PARTIAL_THD_AT_EOI(x)                              (((x) & 0x1) << 17)
 #define   G_028A40_PARTIAL_THD_AT_EOI(x)                              (((x) >> 17) & 0x1)
 #define   C_028A40_PARTIAL_THD_AT_EOI                                 0xFFFDFFFF
@@ -6339,6 +6649,9 @@
 #define   C_028A7C_RDREQ_POLICY                                       0xFFFFFF3F
 #define     V_028A7C_VGT_POLICY_LRU                                 0x00
 #define     V_028A7C_VGT_POLICY_STREAM                              0x01
+#define   S_028A7C_RDREQ_POLICY_VI(x)                                 (((x) & 0x1) << 6)
+#define   G_028A7C_RDREQ_POLICY_VI(x)                                 (((x) >> 6) & 0x1)
+#define   C_028A7C_RDREQ_POLICY_VI                                    0xFFFFFFBF
 #define   S_028A7C_ATC(x)                                             (((x) & 0x1) << 8)
 #define   G_028A7C_ATC(x)                                             (((x) >> 8) & 0x1)
 #define   C_028A7C_ATC                                                0xFFFFFEFF
@@ -6715,6 +7028,9 @@
 #define     V_028B6C_VGT_POLICY_BYPASS                              0x02
 /*     */
 /* VI */
+#define   S_028B6C_RDREQ_POLICY_VI(x)                                 (((x) & 0x1) << 15)
+#define   G_028B6C_RDREQ_POLICY_VI(x)                                 (((x) >> 15) & 0x1)
+#define   C_028B6C_RDREQ_POLICY_VI                                    0xFFFF7FFF
 #define   S_028B6C_DISTRIBUTION_MODE(x)                               (((x) & 0x03) << 17)
 #define   G_028B6C_DISTRIBUTION_MODE(x)                               (((x) >> 17) & 0x03)
 #define   C_028B6C_DISTRIBUTION_MODE                                  0xFFF9FFFF
@@ -7317,6 +7633,12 @@
 #define   S_028C3C_AA_MASK_X1Y1(x)                                    (((x) & 0xFFFF) << 16)
 #define   G_028C3C_AA_MASK_X1Y1(x)                                    (((x) >> 16) & 0xFFFF)
 #define   C_028C3C_AA_MASK_X1Y1                                       0x0000FFFF
+/* Stoney */
+#define R_028C40_PA_SC_SHADER_CONTROL                                   0x028C40
+#define   S_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x)                    (((x) & 0x03) << 0)
+#define   G_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x)                    (((x) >> 0) & 0x03)
+#define   C_028C40_REALIGN_DQUADS_AFTER_N_WAVES                       0xFFFFFFFC
+/*        */
 #define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL                            0x028C58
 #define   S_028C58_VTX_REUSE_DEPTH(x)                                 (((x) & 0xFF) << 0)
 #define   G_028C58_VTX_REUSE_DEPTH(x)                                 (((x) >> 0) & 0xFF)
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index c0fc82b2f2c..bb4cef29ec9 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -250,6 +250,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c
index caf4b17de16..acb2e95e747 100644
--- a/src/gallium/drivers/svga/svga_draw_arrays.c
+++ b/src/gallium/drivers/svga/svga_draw_arrays.c
@@ -204,7 +204,8 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
                        unsigned prim, unsigned start, unsigned count,
                        unsigned start_instance, unsigned instance_count)
 {
-   unsigned gen_prim, gen_size, gen_nr, gen_type;
+   unsigned gen_prim, gen_size, gen_nr;
+   enum indices_mode gen_type;
    u_generate_func gen_func;
    enum pipe_error ret = PIPE_OK;
    unsigned api_pv = hwtnl->api_pv;
diff --git a/src/gallium/drivers/svga/svga_draw_elements.c b/src/gallium/drivers/svga/svga_draw_elements.c
index 9df8f6e9beb..0213409ef29 100644
--- a/src/gallium/drivers/svga/svga_draw_elements.c
+++ b/src/gallium/drivers/svga/svga_draw_elements.c
@@ -133,7 +133,8 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
                                unsigned prim, unsigned start, unsigned count,
                                unsigned start_instance, unsigned instance_count)
 {
-   unsigned gen_prim, gen_size, gen_nr, gen_type;
+   unsigned gen_prim, gen_size, gen_nr;
+   enum indices_mode gen_type;
    u_translate_func gen_func;
    enum pipe_error ret = PIPE_OK;
 
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 5aa7b0d86eb..a80bc9b9119 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -383,6 +383,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
    }
 
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index e70ee689c59..9b7ab16103f 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -2672,6 +2672,7 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit)
    }
    else if (emit->unit == PIPE_SHADER_FRAGMENT) {
       if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS ||
+          emit->key.fs.white_fragments ||
           emit->key.fs.write_color0_to_n_cbufs > 1) {
          /* Allocate a temp to hold the output color */
          emit->fs.color_tmp_index = total_temps;
@@ -6369,8 +6370,11 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit,
    emit_src_register(emit, &tmp_src_x);
    end_emit_instruction(emit);
 
-   /* If we don't need to broadcast the color below, emit final color here */
-   if (emit->key.fs.write_color0_to_n_cbufs <= 1) {
+   /* If we don't need to broadcast the color below or set fragments to
+    * white, emit final color here.
+    */
+   if (emit->key.fs.write_color0_to_n_cbufs <= 1 &&
+       !emit->key.fs.white_fragments) {
       /* MOV output.color, tempcolor */
       emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst,
                            &color_src, FALSE);     /* XXX saturate? */
@@ -6381,9 +6385,27 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit,
 
 
 /**
+ * When we need to emit white for all fragments (for emulating XOR logicop
+ * mode), this function copies white into the temporary color output register.
+ */
+static void
+emit_set_color_white(struct svga_shader_emitter_v10 *emit,
+                     unsigned fs_color_tmp_index)
+{
+   struct tgsi_full_dst_register color_dst =
+      make_dst_temp_reg(fs_color_tmp_index);
+   struct tgsi_full_src_register white =
+      make_immediate_reg_float(emit, 1.0f);
+
+   emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &white, FALSE);
+}
+
+
+/**
  * Emit instructions for writing a single color output to multiple
  * color buffers.
- * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS
+ * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS (or
+ * when key.fs.white_fragments is true).
  * property is set and the number of render targets is greater than one.
  * \param fs_color_tmp_index  index of the temp register that holds the
  *                            color to broadcast.
@@ -6398,7 +6420,6 @@ emit_broadcast_color_instructions(struct svga_shader_emitter_v10 *emit,
       make_src_temp_reg(fs_color_tmp_index);
 
    assert(emit->unit == PIPE_SHADER_FRAGMENT);
-   assert(n > 1);
 
    for (i = 0; i < n; i++) {
       unsigned output_reg = emit->fs.color_out_index[i];
@@ -6440,7 +6461,11 @@ emit_post_helpers(struct svga_shader_emitter_v10 *emit)
       if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS) {
          emit_alpha_test_instructions(emit, fs_color_tmp_index);
       }
-      if (emit->key.fs.write_color0_to_n_cbufs > 1) {
+      if (emit->key.fs.white_fragments) {
+         emit_set_color_white(emit, fs_color_tmp_index);
+      }
+      if (emit->key.fs.write_color0_to_n_cbufs > 1 ||
+          emit->key.fs.white_fragments) {
          emit_broadcast_color_instructions(emit, fs_color_tmp_index);
       }
    }
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index f7b41f5816d..21e3bde2ee2 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -37,14 +37,17 @@
 static bool dump_stats = false;
 
 static void
+vc4_bo_cache_free_all(struct vc4_bo_cache *cache);
+
+static void
 vc4_bo_dump_stats(struct vc4_screen *screen)
 {
         struct vc4_bo_cache *cache = &screen->bo_cache;
 
         fprintf(stderr, "  BOs allocated:   %d\n", screen->bo_count);
-        fprintf(stderr, "  BOs size:        %dkb\n", screen->bo_size / 102);
+        fprintf(stderr, "  BOs size:        %dkb\n", screen->bo_size / 1024);
         fprintf(stderr, "  BOs cached:      %d\n", cache->bo_count);
-        fprintf(stderr, "  BOs cached size: %dkb\n", cache->bo_size / 102);
+        fprintf(stderr, "  BOs cached size: %dkb\n", cache->bo_size / 1024);
 
         if (!list_empty(&cache->time_list)) {
                 struct vc4_bo *first = LIST_ENTRY(struct vc4_bo,
@@ -136,6 +139,8 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
         bo->name = name;
         bo->private = true;
 
+        bool cleared_and_retried = false;
+retry:
         if (!using_vc4_simulator) {
                 struct drm_vc4_create_bo create;
                 memset(&create, 0, sizeof(create));
@@ -157,8 +162,15 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
                 assert(create.size >= size);
         }
         if (ret != 0) {
-                fprintf(stderr, "create ioctl failure\n");
-                abort();
+                if (!list_empty(&screen->bo_cache.time_list) &&
+                    !cleared_and_retried) {
+                        cleared_and_retried = true;
+                        vc4_bo_cache_free_all(&screen->bo_cache);
+                        goto retry;
+                }
+
+                free(bo);
+                return NULL;
         }
 
         screen->bo_count++;
@@ -248,6 +260,18 @@ free_stale_bos(struct vc4_screen *screen, time_t time)
         }
 }
 
+static void
+vc4_bo_cache_free_all(struct vc4_bo_cache *cache)
+{
+        pipe_mutex_lock(cache->lock);
+        list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
+                                 time_list) {
+                vc4_bo_remove_from_cache(cache, bo);
+                vc4_bo_free(bo);
+        }
+        pipe_mutex_unlock(cache->lock);
+}
+
 void
 vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
 {
@@ -428,7 +452,7 @@ vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size)
         screen->bo_count++;
         screen->bo_size += bo->size;
         if (dump_stats) {
-                fprintf(stderr, "Allocated shader %dkb:\n", size / 1024);
+                fprintf(stderr, "Allocated shader %dkb:\n", bo->size / 1024);
                 vc4_bo_dump_stats(screen);
         }
 
@@ -600,11 +624,7 @@ vc4_bufmgr_destroy(struct pipe_screen *pscreen)
         struct vc4_screen *screen = vc4_screen(pscreen);
         struct vc4_bo_cache *cache = &screen->bo_cache;
 
-        list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
-                                 time_list) {
-                vc4_bo_remove_from_cache(cache, bo);
-                vc4_bo_free(bo);
-        }
+        vc4_bo_cache_free_all(cache);
 
         if (dump_stats) {
                 fprintf(stderr, "BO stats after screen destroy:\n");
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 476d2b5b0b1..a719f276b2e 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -184,6 +184,21 @@ dump_VC4_PACKET_GL_INDEXED_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offs
 }
 
 static void
+dump_VC4_PACKET_GL_ARRAY_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        uint8_t *b = cl + offset;
+        uint32_t *count = cl + offset + 1;
+        uint32_t *start = cl + offset + 5;
+
+        fprintf(stderr, "0x%08x 0x%08x:      0x%02x %s\n",
+                offset, hw_offset, b[0], u_prim_name(b[0] & 0x7));
+        fprintf(stderr, "0x%08x 0x%08x:      %d verts\n",
+                offset + 1, hw_offset + 1, *count);
+        fprintf(stderr, "0x%08x 0x%08x:      0x%08x start\n",
+                offset + 5, hw_offset + 5, *start);
+}
+
+static void
 dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset)
 {
         uint32_t *bits = cl + offset;
@@ -380,7 +395,7 @@ static const struct packet_info {
         PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL),
 
         PACKET_DUMP(VC4_PACKET_GL_INDEXED_PRIMITIVE),
-        PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE),
+        PACKET_DUMP(VC4_PACKET_GL_ARRAY_PRIMITIVE),
 
         PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE),
         PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE),
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 122bda0bac6..bb723845531 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -35,11 +35,12 @@
 
 static bool miptree_debug = false;
 
-static void
+static bool
 vc4_resource_bo_alloc(struct vc4_resource *rsc)
 {
         struct pipe_resource *prsc = &rsc->base.b;
         struct pipe_screen *pscreen = prsc->screen;
+        struct vc4_bo *bo;
 
         if (miptree_debug) {
                 fprintf(stderr, "alloc %p: size %d + offset %d -> %d\n",
@@ -51,12 +52,18 @@ vc4_resource_bo_alloc(struct vc4_resource *rsc)
                         rsc->cube_map_stride * (prsc->array_size - 1));
         }
 
-        vc4_bo_unreference(&rsc->bo);
-        rsc->bo = vc4_bo_alloc(vc4_screen(pscreen),
-                               rsc->slices[0].offset +
-                               rsc->slices[0].size +
-                               rsc->cube_map_stride * (prsc->array_size - 1),
-                               "resource");
+        bo = vc4_bo_alloc(vc4_screen(pscreen),
+                          rsc->slices[0].offset +
+                          rsc->slices[0].size +
+                          rsc->cube_map_stride * (prsc->array_size - 1),
+                          "resource");
+        if (bo) {
+                vc4_bo_unreference(&rsc->bo);
+                rsc->bo = bo;
+                return true;
+        } else {
+                return false;
+        }
 }
 
 static void
@@ -101,21 +108,27 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
         char *buf;
 
         if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
-                vc4_resource_bo_alloc(rsc);
+                if (vc4_resource_bo_alloc(rsc)) {
 
-                /* If it might be bound as one of our vertex buffers, make
-                 * sure we re-emit vertex buffer state.
-                 */
-                if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
-                        vc4->dirty |= VC4_DIRTY_VTXBUF;
+                        /* If it might be bound as one of our vertex buffers,
+                         * make sure we re-emit vertex buffer state.
+                         */
+                        if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
+                                vc4->dirty |= VC4_DIRTY_VTXBUF;
+                } else {
+                        /* If we failed to reallocate, flush everything so
+                         * that we don't violate any syncing requirements.
+                         */
+                        vc4_flush(pctx);
+                }
         } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
                 if (vc4_cl_references_bo(pctx, rsc->bo)) {
                         if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
                             prsc->last_level == 0 &&
                             prsc->width0 == box->width &&
                             prsc->height0 == box->height &&
-                            prsc->depth0 == box->depth) {
-                                vc4_resource_bo_alloc(rsc);
+                            prsc->depth0 == box->depth &&
+                            vc4_resource_bo_alloc(rsc)) {
                                 if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
                                         vc4->dirty |= VC4_DIRTY_VTXBUF;
                         } else {
@@ -389,8 +402,7 @@ vc4_resource_create(struct pipe_screen *pscreen,
                 rsc->vc4_format = get_resource_texture_format(prsc);
 
         vc4_setup_slices(rsc);
-        vc4_resource_bo_alloc(rsc);
-        if (!rsc->bo)
+        if (!vc4_resource_bo_alloc(rsc))
                 goto fail;
 
         return prsc;
@@ -668,7 +680,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx,
         uint16_t *dst = data;
 
         struct pipe_transfer *src_transfer = NULL;
-        uint32_t *src;
+        const uint32_t *src;
         if (ib->user_buffer) {
                 src = ib->user_buffer;
         } else {
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index bb867611804..88ee48c0e8f 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -184,6 +184,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+	case PIPE_CAP_CLEAR_TEXTURE:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 78aa344ab1d..a234ce53b20 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -420,6 +420,23 @@ vc4_set_framebuffer_state(struct pipe_context *pctx,
         cso->width = framebuffer->width;
         cso->height = framebuffer->height;
 
+        /* If we're binding to uninitialized buffers, no need to load their
+         * contents before drawing..
+         */
+        if (cso->cbufs[0]) {
+                struct vc4_resource *rsc =
+                        vc4_resource(cso->cbufs[0]->texture);
+                if (!rsc->writes)
+                        vc4->cleared |= PIPE_CLEAR_COLOR0;
+        }
+
+        if (cso->zsbuf) {
+                struct vc4_resource *rsc =
+                        vc4_resource(cso->zsbuf->texture);
+                if (!rsc->writes)
+                        vc4->cleared |= PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL;
+        }
+
         /* Nonzero texture mipmap levels are laid out as if they were in
          * power-of-two-sized spaces.  The renderbuffer config infers its
          * stride from the width parameter, so we need to configure our
@@ -583,6 +600,10 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
                 tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level;
 
                 prsc = vc4_resource_create(pctx->screen, &tmpl);
+                if (!prsc) {
+                        free(so);
+                        return NULL;
+                }
                 rsc = vc4_resource(prsc);
                 clone = vc4_resource(prsc);
                 clone->shadow_parent = &shadow_parent->base.b;
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index cca379d47ab..26a4f7736e3 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -218,6 +218,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 6f9fe767404..27f358f8fb9 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -45,6 +45,7 @@ struct pipe_blit_info;
 struct pipe_box;
 struct pipe_clip_state;
 struct pipe_constant_buffer;
+struct pipe_debug_callback;
 struct pipe_depth_stencil_alpha_state;
 struct pipe_draw_info;
 struct pipe_fence_handle;
@@ -239,6 +240,13 @@ struct pipe_context {
                           const float default_inner_level[2]);
 
    /**
+    * Sets the debug callback. If the pointer is null, then no callback is
+    * set, otherwise a copy of the data should be made.
+    */
+   void (*set_debug_callback)(struct pipe_context *,
+                              const struct pipe_debug_callback *);
+
+   /**
     * Bind an array of shader buffers that will be used by a shader.
     * Any buffers that were previously bound to the specified range
     * will be unbound.
@@ -372,6 +380,16 @@ struct pipe_context {
                                unsigned width, unsigned height);
 
    /**
+    * Clear the texture with the specified texel. Not guaranteed to be a
+    * renderable format. Data provided in the resource's format.
+    */
+   void (*clear_texture)(struct pipe_context *pipe,
+                         struct pipe_resource *res,
+                         unsigned level,
+                         const struct pipe_box *box,
+                         const void *data);
+
+   /**
     * Clear a buffer. Runs a memset over the specified region with the element
     * value passed in through clear_value of size clear_value_size.
     */
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index b15c8809c1d..7240154727e 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -634,6 +634,7 @@ enum pipe_cap
    PIPE_CAP_FORCE_PERSAMPLE_INTERP,
    PIPE_CAP_SHAREABLE_SHADERS,
    PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS,
+   PIPE_CAP_CLEAR_TEXTURE,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
@@ -868,6 +869,18 @@ struct pipe_driver_query_group_info
    unsigned num_queries;
 };
 
+enum pipe_debug_type
+{
+   PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1,
+   PIPE_DEBUG_TYPE_ERROR,
+   PIPE_DEBUG_TYPE_SHADER_INFO,
+   PIPE_DEBUG_TYPE_PERF_INFO,
+   PIPE_DEBUG_TYPE_INFO,
+   PIPE_DEBUG_TYPE_FALLBACK,
+   PIPE_DEBUG_TYPE_CONFORMANCE,
+};
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index e0ab9013dd5..a3137aec8db 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -185,7 +185,8 @@ struct tgsi_declaration_interp
 #define TGSI_SEMANTIC_TESSOUTER  32 /**< outer tessellation levels */
 #define TGSI_SEMANTIC_TESSINNER  33 /**< inner tessellation levels */
 #define TGSI_SEMANTIC_VERTICESIN 34 /**< number of input vertices */
-#define TGSI_SEMANTIC_COUNT      35 /**< number of semantic values */
+#define TGSI_SEMANTIC_HELPER_INVOCATION 35 /**< current invocation is helper */
+#define TGSI_SEMANTIC_COUNT      36 /**< number of semantic values */
 
 struct tgsi_declaration_semantic
 {
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 4bf8d46c686..6bdf03a8b2b 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -684,6 +684,31 @@ struct pipe_compute_state
    unsigned req_input_mem; /**< Required size of the INPUT resource. */
 };
 
+/**
+ * Structure that contains a callback for debug messages from the driver back
+ * to the state tracker.
+ */
+struct pipe_debug_callback
+{
+   /**
+    * Callback for the driver to report debug/performance/etc information back
+    * to the state tracker.
+    *
+    * \param data       user-supplied data pointer
+    * \param id         message type identifier, if pointed value is 0, then a
+    *                   new id is assigned
+    * \param type       PIPE_DEBUG_TYPE_*
+    * \param format     printf-style format string
+    * \param args       args for format string
+    */
+   void (*debug_message)(void *data,
+                         unsigned *id,
+                         enum pipe_debug_type type,
+                         const char *fmt,
+                         va_list args);
+   void *data;
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/state_trackers/clover/api/context.cpp b/src/gallium/state_trackers/clover/api/context.cpp
index 021eea36f6e..c0cd2d32b95 100644
--- a/src/gallium/state_trackers/clover/api/context.cpp
+++ b/src/gallium/state_trackers/clover/api/context.cpp
@@ -45,8 +45,13 @@ clCreateContext(const cl_context_properties *d_props, cl_uint num_devs,
          throw error(CL_INVALID_PROPERTY);
    }
 
+   const auto notify = (!pfn_notify ? context::notify_action() :
+                        [=](const char *s) {
+                           pfn_notify(s, NULL, 0, user_data);
+                        });
+
    ret_error(r_errcode, CL_SUCCESS);
-   return desc(new context(props, devs));
+   return desc(new context(props, devs, notify));
 
 } catch (error &e) {
    ret_error(r_errcode, e);
diff --git a/src/gallium/state_trackers/clover/core/context.cpp b/src/gallium/state_trackers/clover/core/context.cpp
index bf4df39dc2a..c3e20829384 100644
--- a/src/gallium/state_trackers/clover/core/context.cpp
+++ b/src/gallium/state_trackers/clover/core/context.cpp
@@ -25,8 +25,9 @@
 using namespace clover;
 
 context::context(const property_list &props,
-                 const ref_vector<device> &devs) :
-   props(props), devs(devs) {
+                 const ref_vector<device> &devs,
+                 const notify_action &notify) :
+   notify(notify), props(props), devs(devs) {
 }
 
 bool
diff --git a/src/gallium/state_trackers/clover/core/context.hpp b/src/gallium/state_trackers/clover/core/context.hpp
index 0ec4ff4a231..7b22ccae78f 100644
--- a/src/gallium/state_trackers/clover/core/context.hpp
+++ b/src/gallium/state_trackers/clover/core/context.hpp
@@ -36,7 +36,10 @@ namespace clover {
       typedef clover::property_list<cl_context_properties> property_list;
 
    public:
-      context(const property_list &props, const ref_vector<device> &devs);
+      typedef std::function<void (const char *)> notify_action;
+
+      context(const property_list &props, const ref_vector<device> &devs,
+              const notify_action &notify);
 
       context(const context &ctx) = delete;
       context &
@@ -53,6 +56,8 @@ namespace clover {
       device_range
       devices() const;
 
+      const notify_action notify;
+
    private:
       property_list props;
       const std::vector<intrusive_ref<device>> devs;
diff --git a/src/gallium/state_trackers/clover/core/queue.cpp b/src/gallium/state_trackers/clover/core/queue.cpp
index 4aaf67de241..24d71f186e0 100644
--- a/src/gallium/state_trackers/clover/core/queue.cpp
+++ b/src/gallium/state_trackers/clover/core/queue.cpp
@@ -24,15 +24,36 @@
 #include "core/event.hpp"
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
+#include "pipe/p_state.h"
 
 using namespace clover;
 
+namespace {
+   void
+   debug_notify_callback(void *data,
+                         unsigned *id,
+                         enum pipe_debug_type type,
+                         const char *fmt,
+                         va_list args) {
+      const command_queue *queue = (const command_queue *)data;
+      char buffer[1024];
+      vsnprintf(buffer, sizeof(buffer), fmt, args);
+      queue->context().notify(buffer);
+   }
+}
+
 command_queue::command_queue(clover::context &ctx, clover::device &dev,
                              cl_command_queue_properties props) :
    context(ctx), device(dev), props(props) {
    pipe = dev.pipe->context_create(dev.pipe, NULL, PIPE_CONTEXT_COMPUTE_ONLY);
    if (!pipe)
       throw error(CL_INVALID_DEVICE);
+
+   if (ctx.notify) {
+      struct pipe_debug_callback cb = { &debug_notify_callback, this };
+      if (pipe->set_debug_callback)
+         pipe->set_debug_callback(pipe, &cb);
+   }
 }
 
 command_queue::~command_queue() {
diff --git a/src/gallium/state_trackers/omx/entrypoint.c b/src/gallium/state_trackers/omx/entrypoint.c
index a7656660e8c..7df90b16a84 100644
--- a/src/gallium/state_trackers/omx/entrypoint.c
+++ b/src/gallium/state_trackers/omx/entrypoint.c
@@ -38,6 +38,7 @@
 
 #include "os/os_thread.h"
 #include "util/u_memory.h"
+#include "loader/loader.h"
 
 #include "entrypoint.h"
 #include "vid_dec.h"
@@ -47,6 +48,8 @@ pipe_static_mutex(omx_lock);
 static Display *omx_display = NULL;
 static struct vl_screen *omx_screen = NULL;
 static unsigned omx_usecount = 0;
+static const char *omx_render_node = NULL;
+static int drm_fd;
 
 int omx_component_library_Setup(stLoaderComponentType **stComponents)
 {
@@ -73,18 +76,30 @@ struct vl_screen *omx_get_screen(void)
    pipe_mutex_lock(omx_lock);
 
    if (!omx_display) {
-      omx_display = XOpenDisplay(NULL);
-      if (!omx_display) {
-         pipe_mutex_unlock(omx_lock);
-         return NULL;
+      omx_render_node = debug_get_option("OMX_RENDER_NODE", NULL);
+      if (!omx_render_node) {
+         omx_display = XOpenDisplay(NULL);
+         if (!omx_display)
+            goto error;
       }
    }
 
    if (!omx_screen) {
-      omx_screen = vl_screen_create(omx_display, 0);
-      if (!omx_screen) {
-         pipe_mutex_unlock(omx_lock);
-         return NULL;
+      if (omx_render_node) {
+         drm_fd = loader_open_device(omx_render_node);
+         if (drm_fd < 0)
+            goto error;
+         omx_screen = vl_drm_screen_create(drm_fd);
+         if (!omx_screen) {
+            close(drm_fd);
+            goto error;
+         }
+      } else {
+         omx_screen = vl_screen_create(omx_display, 0);
+         if (!omx_screen) {
+            XCloseDisplay(omx_display);
+            goto error;
+         }
       }
    }
 
@@ -92,14 +107,24 @@ struct vl_screen *omx_get_screen(void)
 
    pipe_mutex_unlock(omx_lock);
    return omx_screen;
+
+error:
+   pipe_mutex_unlock(omx_lock);
+   return NULL;
 }
 
 void omx_put_screen(void)
 {
    pipe_mutex_lock(omx_lock);
    if ((--omx_usecount) == 0) {
-      vl_screen_destroy(omx_screen);
-      XCloseDisplay(omx_display);
+      if (!omx_render_node) {
+         vl_screen_destroy(omx_screen);
+         if (omx_display)
+            XCloseDisplay(omx_display);
+      } else {
+         close(drm_fd);
+         vl_drm_screen_destroy(omx_screen);
+      }
       omx_screen = NULL;
       omx_display = NULL;
    }
diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c
index 71a65037757..769305e2999 100644
--- a/src/gallium/state_trackers/va/buffer.c
+++ b/src/gallium/state_trackers/va/buffer.c
@@ -152,11 +152,11 @@ vlVaUnmapBuffer(VADriverContextP ctx, VABufferID buf_id)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
    if (buf->derived_surface.resource) {
-     if (!buf->derived_surface.transfer)
-        return VA_STATUS_ERROR_INVALID_BUFFER;
+      if (!buf->derived_surface.transfer)
+         return VA_STATUS_ERROR_INVALID_BUFFER;
 
-     pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer);
-     buf->derived_surface.transfer = NULL;
+      pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer);
+      buf->derived_surface.transfer = NULL;
    }
 
    return VA_STATUS_SUCCESS;
@@ -175,10 +175,10 @@ vlVaDestroyBuffer(VADriverContextP ctx, VABufferID buf_id)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
    if (buf->derived_surface.resource) {
-     if (buf->export_refcount > 0)
-       return VA_STATUS_ERROR_INVALID_BUFFER;
+      if (buf->export_refcount > 0)
+         return VA_STATUS_ERROR_INVALID_BUFFER;
 
-     pipe_resource_reference(&buf->derived_surface.resource, NULL);
+      pipe_resource_reference(&buf->derived_surface.resource, NULL);
    }
 
    FREE(buf->data);
@@ -280,15 +280,14 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
 
          buf_info->handle = (intptr_t)whandle.handle;
          break;
+      }
       default:
          return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
       }
-   }
-
-   buf_info->type = buf->type;
-   buf_info->mem_type = mem_type;
-   buf_info->mem_size = buf->num_elements * buf->size;
 
+      buf_info->type = buf->type;
+      buf_info->mem_type = mem_type;
+      buf_info->mem_size = buf->num_elements * buf->size;
    }
 
    buf->export_refcount++;
diff --git a/src/gallium/state_trackers/va/config.c b/src/gallium/state_trackers/va/config.c
index 0f47aacdbd6..a545a18c1e0 100644
--- a/src/gallium/state_trackers/va/config.c
+++ b/src/gallium/state_trackers/va/config.c
@@ -71,8 +71,8 @@ vlVaQueryConfigEntrypoints(VADriverContextP ctx, VAProfile profile,
    *num_entrypoints = 0;
 
    if (profile == VAProfileNone) {
-       entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc;
-       return VA_STATUS_SUCCESS;
+      entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc;
+      return VA_STATUS_SUCCESS;
    }
 
    p = ProfileToPipe(profile);
@@ -104,7 +104,7 @@ vlVaGetConfigAttributes(VADriverContextP ctx, VAProfile profile, VAEntrypoint en
          value = VA_RT_FORMAT_YUV420;
          break;
       case VAConfigAttribRateControl:
-	 value = VA_RC_NONE;
+         value = VA_RC_NONE;
          break;
       default:
          value = VA_ATTRIB_NOT_SUPPORTED;
@@ -127,8 +127,8 @@ vlVaCreateConfig(VADriverContextP ctx, VAProfile profile, VAEntrypoint entrypoin
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    if (profile == VAProfileNone && entrypoint == VAEntrypointVideoProc) {
-       *config_id = PIPE_VIDEO_PROFILE_UNKNOWN;
-       return VA_STATUS_SUCCESS;
+      *config_id = PIPE_VIDEO_PROFILE_UNKNOWN;
+      return VA_STATUS_SUCCESS;
    }
 
    p = ProfileToPipe(profile);
@@ -167,7 +167,7 @@ vlVaQueryConfigAttributes(VADriverContextP ctx, VAConfigID config_id, VAProfile
 
    if (config_id == PIPE_VIDEO_PROFILE_UNKNOWN) {
       *entrypoint = VAEntrypointVideoProc;
-       *num_attribs = 0;
+      *num_attribs = 0;
       return VA_STATUS_SUCCESS;
    }
 
diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c
index ec9e0488d85..98c4104da48 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -28,8 +28,6 @@
 
 #include "pipe/p_screen.h"
 #include "pipe/p_video_codec.h"
-#include "pipe-loader/pipe_loader.h"
-#include "state_tracker/drm_driver.h"
 #include "util/u_memory.h"
 #include "util/u_handle_table.h"
 #include "util/u_video.h"
@@ -133,31 +131,16 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
          return VA_STATUS_ERROR_INVALID_PARAMETER;
       }
 
-#if GALLIUM_STATIC_TARGETS
       drm_fd = drm_info->fd;
-#else
-      drm_fd = dup(drm_info->fd);
-#endif
 
       if (drm_fd < 0) {
          FREE(drv);
          return VA_STATUS_ERROR_INVALID_PARAMETER;
       }
 
-      drv->vscreen = CALLOC_STRUCT(vl_screen);
+      drv->vscreen = vl_drm_screen_create(drm_fd);
       if (!drv->vscreen)
          goto error_screen;
-
-#if GALLIUM_STATIC_TARGETS
-      drv->vscreen->pscreen = dd_create_screen(drm_fd);
-#else
-      if (pipe_loader_drm_probe_fd(&drv->dev, drm_fd))
-         drv->vscreen->pscreen = pipe_loader_create_screen(drv->dev, PIPE_SEARCH_DIR);
-#endif
-
-      if (!drv->vscreen->pscreen)
-         goto error_pipe;
-
       }
       break;
    default:
@@ -202,7 +185,7 @@ error_pipe:
    if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11)
       vl_screen_destroy(drv->vscreen);
    else
-      FREE(drv->vscreen);
+      vl_drm_screen_destroy(drv->vscreen);
 
 error_screen:
    FREE(drv);
@@ -342,7 +325,7 @@ vlVaTerminate(VADriverContextP ctx)
    if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11)
       vl_screen_destroy(drv->vscreen);
    else
-      FREE(drv->vscreen);
+      vl_drm_screen_destroy(drv->vscreen);
    handle_table_destroy(drv->htab);
    FREE(drv);
 
diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c
index c6d0c5abf65..ae07da857e1 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -447,8 +447,8 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image,
       tmp_buf = drv->pipe->create_video_buffer(drv->pipe, &surf->templat);
 
       if (!tmp_buf) {
-          surf->templat.buffer_format = old_surf_format;
-          return VA_STATUS_ERROR_ALLOCATION_FAILED;
+         surf->templat.buffer_format = old_surf_format;
+         return VA_STATUS_ERROR_ALLOCATION_FAILED;
       }
 
       surf->buffer->destroy(surf->buffer);
diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index e850689005d..5e7841a0521 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -59,13 +59,14 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende
       return VA_STATUS_ERROR_INVALID_SURFACE;
 
    context->target = surf->buffer;
-
    if (!context->decoder) {
       /* VPP */
       if ((context->target->buffer_format != PIPE_FORMAT_B8G8R8A8_UNORM  &&
-           context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM) ||
+           context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM  &&
+           context->target->buffer_format != PIPE_FORMAT_B8G8R8X8_UNORM  &&
+           context->target->buffer_format != PIPE_FORMAT_R8G8B8X8_UNORM) ||
            context->target->interlaced)
-          return VA_STATUS_ERROR_UNIMPLEMENTED;
+         return VA_STATUS_ERROR_UNIMPLEMENTED;
       return VA_STATUS_SUCCESS;
    }
 
@@ -693,8 +694,10 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
           bufHasStartcode(buf, 0x0000010b, 32))
          break;
 
+      if (context->decoder->profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED) {
          buffers[num_buffers] = (void *const)&start_code_vc1;
          sizes[num_buffers++] = sizeof(start_code_vc1);
+      }
       break;
    case PIPE_VIDEO_FORMAT_MPEG4:
       if (bufHasStartcode(buf, 0x000001, 24))
@@ -717,60 +720,60 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
 static VAStatus
 handleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
 {
-    struct u_rect src_rect;
-    struct u_rect dst_rect;
-    struct u_rect *dirty_area;
-    vlVaSurface *src_surface;
-    VAProcPipelineParameterBuffer *pipeline_param;
-    struct pipe_surface **surfaces;
-    struct pipe_screen *screen;
-    struct pipe_surface *psurf;
-
-    if (!drv || !context)
-       return VA_STATUS_ERROR_INVALID_CONTEXT;
+   struct u_rect src_rect;
+   struct u_rect dst_rect;
+   struct u_rect *dirty_area;
+   vlVaSurface *src_surface;
+   VAProcPipelineParameterBuffer *pipeline_param;
+   struct pipe_surface **surfaces;
+   struct pipe_screen *screen;
+   struct pipe_surface *psurf;
+
+   if (!drv || !context)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-    if (!buf || !buf->data)
-       return VA_STATUS_ERROR_INVALID_BUFFER;
+   if (!buf || !buf->data)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
 
-    if (!context->target)
-        return VA_STATUS_ERROR_INVALID_SURFACE;
+   if (!context->target)
+      return VA_STATUS_ERROR_INVALID_SURFACE;
 
-    pipeline_param = (VAProcPipelineParameterBuffer *)buf->data;
+   pipeline_param = (VAProcPipelineParameterBuffer *)buf->data;
 
-    src_surface = handle_table_get(drv->htab, pipeline_param->surface);
-    if (!src_surface || !src_surface->buffer)
-       return VA_STATUS_ERROR_INVALID_SURFACE;
+   src_surface = handle_table_get(drv->htab, pipeline_param->surface);
+   if (!src_surface || !src_surface->buffer)
+      return VA_STATUS_ERROR_INVALID_SURFACE;
 
-    surfaces = context->target->get_surfaces(context->target);
+   surfaces = context->target->get_surfaces(context->target);
 
-    if (!surfaces || !surfaces[0])
-        return VA_STATUS_ERROR_INVALID_SURFACE;
+   if (!surfaces || !surfaces[0])
+      return VA_STATUS_ERROR_INVALID_SURFACE;
 
-    screen = drv->pipe->screen;
+   screen = drv->pipe->screen;
 
-    psurf = surfaces[0];
+   psurf = surfaces[0];
 
-    src_rect.x0 = pipeline_param->surface_region->x;
-    src_rect.y0 = pipeline_param->surface_region->y;
-    src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width;
-    src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height;
+   src_rect.x0 = pipeline_param->surface_region->x;
+   src_rect.y0 = pipeline_param->surface_region->y;
+   src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width;
+   src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height;
 
-    dst_rect.x0 = pipeline_param->output_region->x;
-    dst_rect.y0 = pipeline_param->output_region->y;
-    dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width;
-    dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height;
+   dst_rect.x0 = pipeline_param->output_region->x;
+   dst_rect.y0 = pipeline_param->output_region->y;
+   dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width;
+   dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height;
 
-    dirty_area = vl_screen_get_dirty_area(drv->vscreen);
+   dirty_area = vl_screen_get_dirty_area(drv->vscreen);
 
-    vl_compositor_clear_layers(&drv->cstate);
-    vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
-    vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect);
-    vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true);
+   vl_compositor_clear_layers(&drv->cstate);
+   vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
+   vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect);
+   vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true);
 
-    screen->fence_reference(screen, &src_surface->fence, NULL);
-    drv->pipe->flush(drv->pipe, &src_surface->fence, 0);
+   screen->fence_reference(screen, &src_surface->fence, NULL);
+   drv->pipe->flush(drv->pipe, &src_surface->fence, 0);
 
-    return VA_STATUS_SUCCESS;
+   return VA_STATUS_SUCCESS;
 }
 
 VAStatus
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index 8f406e09990..589d6860b6a 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -45,6 +45,11 @@
 
 #include <va/va_drmcommon.h>
 
+static const enum pipe_format vpp_surface_formats[] = {
+   PIPE_FORMAT_B8G8R8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM,
+   PIPE_FORMAT_B8G8R8X8_UNORM, PIPE_FORMAT_R8G8B8X8_UNORM
+};
+
 VAStatus
 vlVaCreateSurfaces(VADriverContextP ctx, int width, int height, int format,
                    int num_surfaces, VASurfaceID *surfaces)
@@ -311,101 +316,100 @@ VAStatus
 vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config,
                            VASurfaceAttrib *attrib_list, unsigned int *num_attribs)
 {
-    vlVaDriver *drv;
-    VASurfaceAttrib *attribs;
-    struct pipe_screen *pscreen;
-    int i;
-
-    if (config == VA_INVALID_ID)
-        return VA_STATUS_ERROR_INVALID_CONFIG;
-
-    if (!attrib_list && !num_attribs)
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-    if (!attrib_list) {
-        *num_attribs = VASurfaceAttribCount;
-        return VA_STATUS_SUCCESS;
-    }
-
-    if (!ctx)
-       return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    drv = VL_VA_DRIVER(ctx);
-
-    if (!drv)
-        return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    pscreen = VL_VA_PSCREEN(ctx);
-
-    if (!pscreen)
-       return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    attribs = CALLOC(VASurfaceAttribCount, sizeof(VASurfaceAttrib));
-
-    if (!attribs)
-        return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
-    i = 0;
-
-    if (config == PIPE_VIDEO_PROFILE_UNKNOWN) {
-       /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN
-          only for VAEntrypointVideoProc. */
-       attribs[i].type = VASurfaceAttribPixelFormat;
-       attribs[i].value.type = VAGenericValueTypeInteger;
-       attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
-       attribs[i].value.value.i = VA_FOURCC_BGRA;
-       i++;
-
-       attribs[i].type = VASurfaceAttribPixelFormat;
-       attribs[i].value.type = VAGenericValueTypeInteger;
-       attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
-       attribs[i].value.value.i = VA_FOURCC_RGBA;
-       i++;
-    } else {
-       /* Assume VAEntrypointVLD for now. */
-       attribs[i].type = VASurfaceAttribPixelFormat;
-       attribs[i].value.type = VAGenericValueTypeInteger;
-       attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
-       attribs[i].value.value.i = VA_FOURCC_NV12;
-       i++;
-    }
-
-    attribs[i].type = VASurfaceAttribMemoryType;
-    attribs[i].value.type = VAGenericValueTypeInteger;
-    attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
-    attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA |
-        VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
-    i++;
-
-    attribs[i].type = VASurfaceAttribExternalBufferDescriptor;
-    attribs[i].value.type = VAGenericValueTypePointer;
-    attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE;
-    attribs[i].value.value.p = NULL; /* ignore */
-    i++;
-
-    attribs[i].type = VASurfaceAttribMaxWidth;
-    attribs[i].value.type = VAGenericValueTypeInteger;
-    attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
-    attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
-    i++;
-
-    attribs[i].type = VASurfaceAttribMaxHeight;
-    attribs[i].value.type = VAGenericValueTypeInteger;
-    attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
-    attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
-    i++;
-
-    if (i > *num_attribs) {
-        *num_attribs = i;
-        FREE(attribs);
-        return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
-    }
-
-    *num_attribs = i;
-    memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib));
-    FREE(attribs);
-
-    return VA_STATUS_SUCCESS;
+   vlVaDriver *drv;
+   VASurfaceAttrib *attribs;
+   struct pipe_screen *pscreen;
+   int i, j;
+
+   STATIC_ASSERT(ARRAY_SIZE(vpp_surface_formats) <= VL_VA_MAX_IMAGE_FORMATS);
+
+   if (config == VA_INVALID_ID)
+      return VA_STATUS_ERROR_INVALID_CONFIG;
+
+   if (!attrib_list && !num_attribs)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   if (!attrib_list) {
+      *num_attribs = VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount;
+      return VA_STATUS_SUCCESS;
+   }
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   drv = VL_VA_DRIVER(ctx);
+
+   if (!drv)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   pscreen = VL_VA_PSCREEN(ctx);
+
+   if (!pscreen)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   attribs = CALLOC(VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount,
+                    sizeof(VASurfaceAttrib));
+
+   if (!attribs)
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+   i = 0;
+
+   /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN
+    * only for VAEntrypointVideoProc. */
+   if (config == PIPE_VIDEO_PROFILE_UNKNOWN) {
+      for (j = 0; j < ARRAY_SIZE(vpp_surface_formats); ++j) {
+         attribs[i].type = VASurfaceAttribPixelFormat;
+         attribs[i].value.type = VAGenericValueTypeInteger;
+         attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+         attribs[i].value.value.i = PipeFormatToVaFourcc(vpp_surface_formats[j]);
+         i++;
+      }
+   } else {
+      /* Assume VAEntrypointVLD for now. */
+      attribs[i].type = VASurfaceAttribPixelFormat;
+      attribs[i].value.type = VAGenericValueTypeInteger;
+      attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+      attribs[i].value.value.i = VA_FOURCC_NV12;
+      i++;
+   }
+
+   attribs[i].type = VASurfaceAttribMemoryType;
+   attribs[i].value.type = VAGenericValueTypeInteger;
+   attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+   attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA |
+         VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
+   i++;
+
+   attribs[i].type = VASurfaceAttribExternalBufferDescriptor;
+   attribs[i].value.type = VAGenericValueTypePointer;
+   attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE;
+   attribs[i].value.value.p = NULL; /* ignore */
+   i++;
+
+   attribs[i].type = VASurfaceAttribMaxWidth;
+   attribs[i].value.type = VAGenericValueTypeInteger;
+   attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
+   attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
+   i++;
+
+   attribs[i].type = VASurfaceAttribMaxHeight;
+   attribs[i].value.type = VAGenericValueTypeInteger;
+   attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
+   attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
+   i++;
+
+   if (i > *num_attribs) {
+      *num_attribs = i;
+      FREE(attribs);
+      return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
+   }
+
+   *num_attribs = i;
+   memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib));
+   FREE(attribs);
+
+   return VA_STATUS_SUCCESS;
 }
 
 static VAStatus
@@ -414,75 +418,77 @@ suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface,
                             int index, VASurfaceID *surfaces,
                             struct pipe_video_buffer *templat)
 {
-    vlVaDriver *drv;
-    struct pipe_screen *pscreen;
-    struct pipe_resource *resource;
-    struct pipe_resource res_templ;
-    struct winsys_handle whandle;
-    struct pipe_resource *resources[VL_NUM_COMPONENTS];
-
-    if (!ctx)
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-    pscreen = VL_VA_PSCREEN(ctx);
-    drv = VL_VA_DRIVER(ctx);
-
-    if (!memory_attibute || !memory_attibute->buffers ||
-        index > memory_attibute->num_buffers)
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-    if (surface->templat.width != memory_attibute->width ||
-        surface->templat.height != memory_attibute->height ||
-        memory_attibute->num_planes < 1)
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-    switch (memory_attibute->pixel_format) {
-    case VA_FOURCC_RGBA:
-    case VA_FOURCC_RGBX:
-    case VA_FOURCC_BGRA:
-    case VA_FOURCC_BGRX:
-        if (memory_attibute->num_planes != 1)
-            return VA_STATUS_ERROR_INVALID_PARAMETER;
-        break;
-    default:
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-    }
-
-    memset(&res_templ, 0, sizeof(res_templ));
-    res_templ.target = PIPE_TEXTURE_2D;
-    res_templ.last_level = 0;
-    res_templ.depth0 = 1;
-    res_templ.array_size = 1;
-    res_templ.width0 = memory_attibute->width;
-    res_templ.height0 = memory_attibute->height;
-    res_templ.format = surface->templat.buffer_format;
-    res_templ.bind = PIPE_BIND_SAMPLER_VIEW;
-    res_templ.usage = PIPE_USAGE_DEFAULT;
-
-    memset(&whandle, 0, sizeof(struct winsys_handle));
-    whandle.type = DRM_API_HANDLE_TYPE_FD;
-    whandle.handle = memory_attibute->buffers[index];
-    whandle.stride = memory_attibute->pitches[index];
-
-    resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle);
-
-    if (!resource)
-       return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
-    memset(resources, 0, sizeof resources);
-    resources[0] = resource;
-
-    surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources);
-    if (!surface->buffer)
-        return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
-    util_dynarray_init(&surface->subpics);
-    surfaces[index] = handle_table_add(drv->htab, surface);
-
-    if (!surfaces[index])
+   vlVaDriver *drv;
+   struct pipe_screen *pscreen;
+   struct pipe_resource *resource;
+   struct pipe_resource res_templ;
+   struct winsys_handle whandle;
+   struct pipe_resource *resources[VL_NUM_COMPONENTS];
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   pscreen = VL_VA_PSCREEN(ctx);
+   drv = VL_VA_DRIVER(ctx);
+
+   if (!memory_attibute || !memory_attibute->buffers ||
+       index > memory_attibute->num_buffers)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   if (surface->templat.width != memory_attibute->width ||
+       surface->templat.height != memory_attibute->height ||
+       memory_attibute->num_planes < 1)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   switch (memory_attibute->pixel_format) {
+   case VA_FOURCC_RGBA:
+   case VA_FOURCC_RGBX:
+   case VA_FOURCC_BGRA:
+   case VA_FOURCC_BGRX:
+      if (memory_attibute->num_planes != 1)
+         return VA_STATUS_ERROR_INVALID_PARAMETER;
+      break;
+   default:
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+   }
+
+   memset(&res_templ, 0, sizeof(res_templ));
+   res_templ.target = PIPE_TEXTURE_2D;
+   res_templ.last_level = 0;
+   res_templ.depth0 = 1;
+   res_templ.array_size = 1;
+   res_templ.width0 = memory_attibute->width;
+   res_templ.height0 = memory_attibute->height;
+   res_templ.format = surface->templat.buffer_format;
+   res_templ.bind = PIPE_BIND_SAMPLER_VIEW;
+   res_templ.usage = PIPE_USAGE_DEFAULT;
+
+   memset(&whandle, 0, sizeof(struct winsys_handle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.handle = memory_attibute->buffers[index];
+   whandle.stride = memory_attibute->pitches[index];
+
+   resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle);
+
+   if (!resource)
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+   memset(resources, 0, sizeof resources);
+   resources[0] = resource;
+
+   surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources);
+   if (!surface->buffer)
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
 
-    return VA_STATUS_SUCCESS;
+   util_dynarray_init(&surface->subpics);
+   surfaces[index] = handle_table_add(drv->htab, surface);
+
+   if (!surfaces[index]) {
+      surface->buffer->destroy(surface->buffer);
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+   }
+
+   return VA_STATUS_SUCCESS;
 }
 
 VAStatus
@@ -491,143 +497,147 @@ vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format,
                     VASurfaceID *surfaces, unsigned int num_surfaces,
                     VASurfaceAttrib *attrib_list, unsigned int num_attribs)
 {
-    vlVaDriver *drv;
-    VASurfaceAttribExternalBuffers *memory_attibute;
-    struct pipe_video_buffer templat;
-    struct pipe_screen *pscreen;
-    int i;
-    int memory_type;
-    int expected_fourcc;
-    VAStatus vaStatus;
-
-    if (!ctx)
-       return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    if (!(width && height))
-       return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
-
-    drv = VL_VA_DRIVER(ctx);
-
-    if (!drv)
-        return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    pscreen = VL_VA_PSCREEN(ctx);
-
-    if (!pscreen)
-        return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    /* Default. */
-    memory_attibute = NULL;
-    memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA;
-    expected_fourcc = 0;
-
-    for (i = 0; i < num_attribs && attrib_list; i++) {
-        if ((attrib_list[i].type == VASurfaceAttribPixelFormat) &&
-            (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
-            if (attrib_list[i].value.type != VAGenericValueTypeInteger)
-                return VA_STATUS_ERROR_INVALID_PARAMETER;
-            expected_fourcc = attrib_list[i].value.value.i;
-        }
-
-        if ((attrib_list[i].type == VASurfaceAttribMemoryType) &&
-            (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
-
-            if (attrib_list[i].value.type != VAGenericValueTypeInteger)
-                return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-            switch (attrib_list[i].value.value.i) {
-                case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
-                case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
-                   memory_type = attrib_list[i].value.value.i;
-                   break;
-                default:
-                   return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
-            }
-        }
-
-        if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) &&
-            (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) {
-            if (attrib_list[i].value.type != VAGenericValueTypePointer)
-                return VA_STATUS_ERROR_INVALID_PARAMETER;
-            memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p;
-        }
-    }
-
-    if (VA_RT_FORMAT_YUV420 != format &&
-        VA_RT_FORMAT_YUV422 != format &&
-        VA_RT_FORMAT_YUV444 != format &&
-        VA_RT_FORMAT_RGB32  != format) {
-        return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT;
-    }
-
-    switch (memory_type) {
-        case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
-            break;
-        case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
-            if (!memory_attibute)
-               return VA_STATUS_ERROR_INVALID_PARAMETER;
+   vlVaDriver *drv;
+   VASurfaceAttribExternalBuffers *memory_attibute;
+   struct pipe_video_buffer templat;
+   struct pipe_screen *pscreen;
+   int i;
+   int memory_type;
+   int expected_fourcc;
+   VAStatus vaStatus;
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   if (!(width && height))
+      return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
+
+   drv = VL_VA_DRIVER(ctx);
+
+   if (!drv)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   pscreen = VL_VA_PSCREEN(ctx);
+
+   if (!pscreen)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   /* Default. */
+   memory_attibute = NULL;
+   memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA;
+   expected_fourcc = 0;
+
+   for (i = 0; i < num_attribs && attrib_list; i++) {
+      if ((attrib_list[i].type == VASurfaceAttribPixelFormat) &&
+          (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
+         if (attrib_list[i].value.type != VAGenericValueTypeInteger)
+            return VA_STATUS_ERROR_INVALID_PARAMETER;
+         expected_fourcc = attrib_list[i].value.value.i;
+      }
+
+      if ((attrib_list[i].type == VASurfaceAttribMemoryType) &&
+          (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
 
-            expected_fourcc = memory_attibute->pixel_format;
+         if (attrib_list[i].value.type != VAGenericValueTypeInteger)
+            return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+         switch (attrib_list[i].value.value.i) {
+         case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+         case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+            memory_type = attrib_list[i].value.value.i;
             break;
-        default:
-            assert(0);
-    }
-
-    memset(&templat, 0, sizeof(templat));
-
-    if (expected_fourcc) {
-       templat.buffer_format = VaFourccToPipeFormat(expected_fourcc);
-       templat.interlaced = 0;
-    } else {
-        templat.buffer_format = pscreen->get_video_param
-        (
-           pscreen,
-           PIPE_VIDEO_PROFILE_UNKNOWN,
-           PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
-           PIPE_VIDEO_CAP_PREFERED_FORMAT
-        );
-        templat.interlaced = pscreen->get_video_param
-        (
-           pscreen,
-           PIPE_VIDEO_PROFILE_UNKNOWN,
-           PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
-           PIPE_VIDEO_CAP_PREFERS_INTERLACED
-        );
-    }
-
-    templat.chroma_format = ChromaToPipe(format);
-
-    templat.width = width;
-    templat.height = height;
-
-    memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID));
-
-    for (i = 0; i < num_surfaces; i++) {
-        vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface));
-        if (!surf)
+         default:
+            return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
+         }
+      }
+
+      if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) &&
+          (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) {
+         if (attrib_list[i].value.type != VAGenericValueTypePointer)
+            return VA_STATUS_ERROR_INVALID_PARAMETER;
+         memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p;
+      }
+   }
+
+   if (VA_RT_FORMAT_YUV420 != format &&
+       VA_RT_FORMAT_YUV422 != format &&
+       VA_RT_FORMAT_YUV444 != format &&
+       VA_RT_FORMAT_RGB32  != format) {
+      return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT;
+   }
+
+   switch (memory_type) {
+   case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+      break;
+   case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+      if (!memory_attibute)
+         return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+      expected_fourcc = memory_attibute->pixel_format;
+      break;
+   default:
+      assert(0);
+   }
+
+   memset(&templat, 0, sizeof(templat));
+
+   if (expected_fourcc) {
+      templat.buffer_format = VaFourccToPipeFormat(expected_fourcc);
+      templat.interlaced = 0;
+   } else {
+      templat.buffer_format = pscreen->get_video_param
+            (
+               pscreen,
+               PIPE_VIDEO_PROFILE_UNKNOWN,
+               PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+               PIPE_VIDEO_CAP_PREFERED_FORMAT
+               );
+      templat.interlaced = pscreen->get_video_param
+            (
+               pscreen,
+               PIPE_VIDEO_PROFILE_UNKNOWN,
+               PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+               PIPE_VIDEO_CAP_PREFERS_INTERLACED
+               );
+   }
+
+   templat.chroma_format = ChromaToPipe(format);
+
+   templat.width = width;
+   templat.height = height;
+
+   memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID));
+
+   for (i = 0; i < num_surfaces; i++) {
+      vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface));
+      if (!surf)
+         goto no_res;
+
+      surf->templat = templat;
+
+      switch (memory_type) {
+      case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+         surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat);
+         if (!surf->buffer) {
+            FREE(surf);
+            goto no_res;
+         }
+         util_dynarray_init(&surf->subpics);
+         surfaces[i] = handle_table_add(drv->htab, surf);
+         break;
+      case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+         vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat);
+         if (vaStatus != VA_STATUS_SUCCESS) {
+            FREE(surf);
             goto no_res;
+         }
+         break;
+      default:
+         assert(0);
+      }
+   }
 
-        surf->templat = templat;
-
-        switch (memory_type) {
-            case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
-                surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat);
-                if (!surf->buffer)
-                    goto no_res;
-                util_dynarray_init(&surf->subpics);
-                surfaces[i] = handle_table_add(drv->htab, surf);
-                break;
-            case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
-                vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat);
-                if (vaStatus != VA_STATUS_SUCCESS)
-                  goto no_res;
-                break;
-            default:
-                assert(0);
-        }
-    }
-
-    return VA_STATUS_SUCCESS;
+   return VA_STATUS_SUCCESS;
 
 no_res:
    if (i)
@@ -707,7 +717,7 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context,
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    if (!pipeline_cap)
-   return VA_STATUS_ERROR_INVALID_PARAMETER;
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
 
    if (num_filters && !filters)
       return VA_STATUS_ERROR_INVALID_PARAMETER;
diff --git a/src/gallium/state_trackers/wgl/stw_context.c b/src/gallium/state_trackers/wgl/stw_context.c
index 3e99cc44db7..5978ca63677 100644
--- a/src/gallium/state_trackers/wgl/stw_context.c
+++ b/src/gallium/state_trackers/wgl/stw_context.c
@@ -59,11 +59,9 @@ stw_current_context(void)
    return (struct stw_context *) ((st) ? st->st_manager_private : NULL);
 }
 
+
 BOOL APIENTRY
-DrvCopyContext(
-   DHGLRC dhrcSource,
-   DHGLRC dhrcDest,
-   UINT fuMask )
+DrvCopyContext(DHGLRC dhrcSource, DHGLRC dhrcDest, UINT fuMask)
 {
    struct stw_context *src;
    struct stw_context *dst;
@@ -72,12 +70,12 @@ DrvCopyContext(
    if (!stw_dev)
       return FALSE;
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   
+   stw_lock_contexts(stw_dev);
+
    src = stw_lookup_context_locked( dhrcSource );
    dst = stw_lookup_context_locked( dhrcDest );
 
-   if (src && dst) { 
+   if (src && dst) {
       /* FIXME */
       assert(0);
       (void) src;
@@ -85,15 +83,14 @@ DrvCopyContext(
       (void) fuMask;
    }
 
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   
+   stw_unlock_contexts(stw_dev);
+
    return ret;
 }
 
+
 BOOL APIENTRY
-DrvShareLists(
-   DHGLRC dhglrc1,
-   DHGLRC dhglrc2 )
+DrvShareLists(DHGLRC dhglrc1, DHGLRC dhglrc2)
 {
    struct stw_context *ctx1;
    struct stw_context *ctx2;
@@ -102,30 +99,29 @@ DrvShareLists(
    if (!stw_dev)
       return FALSE;
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   
+   stw_lock_contexts(stw_dev);
+
    ctx1 = stw_lookup_context_locked( dhglrc1 );
    ctx2 = stw_lookup_context_locked( dhglrc2 );
 
    if (ctx1 && ctx2 && ctx2->st->share)
       ret = ctx2->st->share(ctx2->st, ctx1->st);
 
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   
+   stw_unlock_contexts(stw_dev);
+
    return ret;
 }
 
+
 DHGLRC APIENTRY
-DrvCreateContext(
-   HDC hdc )
+DrvCreateContext(HDC hdc)
 {
    return DrvCreateLayerContext( hdc, 0 );
 }
 
+
 DHGLRC APIENTRY
-DrvCreateLayerContext(
-   HDC hdc,
-   INT iLayerPlane )
+DrvCreateLayerContext(HDC hdc, INT iLayerPlane)
 {
    return stw_create_context_attribs(hdc, iLayerPlane, 0, 1, 0, 0,
                                      WGL_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB,
@@ -160,29 +156,26 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext,
    if (iLayerPlane != 0)
       return 0;
 
-   iPixelFormat = GetPixelFormat(hdc);
-   if(!iPixelFormat)
-      return 0;
-
    /*
     * GDI only knows about displayable pixel formats, so determine the pixel
     * format from the framebuffer.
     *
-    * TODO: Remove the GetPixelFormat() above, and stop relying on GDI.
+    * This also allows to use a OpenGL DLL / ICD without installing.
     */
    fb = stw_framebuffer_from_hdc( hdc );
    if (fb) {
-      assert(iPixelFormat == fb->iDisplayablePixelFormat);
       iPixelFormat = fb->iPixelFormat;
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
+   } else {
+      return 0;
    }
 
    pfi = stw_pixelformat_get_info( iPixelFormat );
 
    if (hShareContext != 0) {
-      pipe_mutex_lock( stw_dev->ctx_mutex );
+      stw_lock_contexts(stw_dev);
       shareCtx = stw_lookup_context_locked( hShareContext );
-      pipe_mutex_unlock( stw_dev->ctx_mutex );
+      stw_unlock_contexts(stw_dev);
    }
 
    ctx = CALLOC_STRUCT( stw_context );
@@ -257,7 +250,7 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext,
       ctx->hud = hud_create(ctx->st->pipe, ctx->st->cso_context);
    }
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
+   stw_lock_contexts(stw_dev);
    if (handle) {
       /* We're replacing the context data for this handle. See the
        * wglCreateContextAttribsARB() function.
@@ -283,7 +276,8 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext,
 
    ctx->dhglrc = handle;
 
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   stw_unlock_contexts(stw_dev);
+
    if (!ctx->dhglrc)
       goto no_hglrc;
 
@@ -300,24 +294,24 @@ no_ctx:
    return 0;
 }
 
+
 BOOL APIENTRY
-DrvDeleteContext(
-   DHGLRC dhglrc )
+DrvDeleteContext(DHGLRC dhglrc)
 {
    struct stw_context *ctx ;
    BOOL ret = FALSE;
-   
+
    if (!stw_dev)
       return FALSE;
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
+   stw_lock_contexts(stw_dev);
    ctx = stw_lookup_context_locked(dhglrc);
    handle_table_remove(stw_dev->ctx_table, dhglrc);
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   stw_unlock_contexts(stw_dev);
 
    if (ctx) {
       struct stw_context *curctx = stw_current_context();
-      
+
       /* Unbind current if deleting current context. */
       if (curctx == ctx)
          stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL);
@@ -335,22 +329,22 @@ DrvDeleteContext(
    return ret;
 }
 
+
 BOOL APIENTRY
-DrvReleaseContext(
-   DHGLRC dhglrc )
+DrvReleaseContext(DHGLRC dhglrc)
 {
    struct stw_context *ctx;
 
    if (!stw_dev)
       return FALSE;
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
+   stw_lock_contexts(stw_dev);
    ctx = stw_lookup_context_locked( dhglrc );
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   stw_unlock_contexts(stw_dev);
 
    if (!ctx)
       return FALSE;
-   
+
    /* The expectation is that ctx is the same context which is
     * current for this thread.  We should check that and return False
     * if not the case.
@@ -371,28 +365,28 @@ stw_get_current_context( void )
    struct stw_context *ctx;
 
    ctx = stw_current_context();
-   if(!ctx)
+   if (!ctx)
       return 0;
-   
+
    return ctx->dhglrc;
 }
 
+
 HDC
 stw_get_current_dc( void )
 {
    struct stw_context *ctx;
 
    ctx = stw_current_context();
-   if(!ctx)
+   if (!ctx)
       return NULL;
-   
+
    return ctx->hdc;
 }
 
+
 BOOL
-stw_make_current(
-   HDC hdc,
-   DHGLRC dhglrc )
+stw_make_current(HDC hdc, DHGLRC dhglrc)
 {
    struct stw_context *curctx = NULL;
    struct stw_context *ctx = NULL;
@@ -415,9 +409,9 @@ stw_make_current(
    }
 
    if (dhglrc) {
-      pipe_mutex_lock( stw_dev->ctx_mutex );
+      stw_lock_contexts(stw_dev);
       ctx = stw_lookup_context_locked( dhglrc );
-      pipe_mutex_unlock( stw_dev->ctx_mutex );
+      stw_unlock_contexts(stw_dev);
       if (!ctx) {
          goto fail;
       }
@@ -428,8 +422,9 @@ stw_make_current(
       }
       else {
          /* Applications should call SetPixelFormat before creating a context,
-          * but not all do, and the opengl32 runtime seems to use a default pixel
-          * format in some cases, so we must create a framebuffer for those here
+          * but not all do, and the opengl32 runtime seems to use a default
+          * pixel format in some cases, so we must create a framebuffer for
+          * those here.
           */
          int iPixelFormat = GetPixelFormat(hdc);
          if (iPixelFormat)
@@ -437,7 +432,7 @@ stw_make_current(
          if (!fb)
             goto fail;
       }
-   
+
       if (fb->iPixelFormat != ctx->iPixelFormat) {
          SetLastError(ERROR_INVALID_PIXEL_FORMAT);
          goto fail;
@@ -446,21 +441,26 @@ stw_make_current(
       /* Bind the new framebuffer */
       ctx->hdc = hdc;
 
+      /* Note: when we call this function we will wind up in the
+       * stw_st_framebuffer_validate_locked() function which will incur
+       * a recursive fb->mutex lock.
+       */
       ret = stw_dev->stapi->make_current(stw_dev->stapi, ctx->st,
                                          fb->stfb, fb->stfb);
       stw_framebuffer_reference(&ctx->current_framebuffer, fb);
    } else {
       ret = stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL);
    }
-   
+
 fail:
 
    if (fb) {
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
    }
 
    /* On failure, make the thread's current rendering context not current
-    * before returning */
+    * before returning.
+    */
    if (!ret) {
       stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL);
       ctx = NULL;
@@ -476,18 +476,6 @@ fail:
    return ret;
 }
 
-/**
- * Flush the current context if it is bound to the framebuffer.
- */
-void
-stw_flush_current_locked( struct stw_framebuffer *fb )
-{
-   struct stw_context *ctx = stw_current_context();
-
-   if (ctx && ctx->current_framebuffer == fb) {
-      ctx->st->flush(ctx->st, ST_FLUSH_FRONT, NULL);
-   }
-}
 
 /**
  * Notify the current context that the framebuffer has become invalid.
@@ -498,6 +486,7 @@ stw_notify_current_locked( struct stw_framebuffer *fb )
    p_atomic_inc(&fb->stfb->stamp);
 }
 
+
 /**
  * Although WGL allows different dispatch entrypoints per context
  */
@@ -844,15 +833,13 @@ static const GLCLTPROCTABLE cpt =
    }
 };
 
+
 PGLCLTPROCTABLE APIENTRY
-DrvSetContext(
-   HDC hdc,
-   DHGLRC dhglrc,
-   PFN_SETPROCTABLE pfnSetProcTable )
+DrvSetContext(HDC hdc, DHGLRC dhglrc, PFN_SETPROCTABLE pfnSetProcTable)
 {
    PGLCLTPROCTABLE r = (PGLCLTPROCTABLE)&cpt;
 
-   if (!stw_make_current( hdc, dhglrc ))
+   if (!stw_make_current(hdc, dhglrc))
       r = NULL;
 
    return r;
diff --git a/src/gallium/state_trackers/wgl/stw_context.h b/src/gallium/state_trackers/wgl/stw_context.h
index c66c166de2e..6bfa7150d6f 100644
--- a/src/gallium/state_trackers/wgl/stw_context.h
+++ b/src/gallium/state_trackers/wgl/stw_context.h
@@ -60,7 +60,6 @@ HDC stw_get_current_dc( void );
 
 BOOL stw_make_current( HDC hdc, DHGLRC dhglrc );
 
-void stw_flush_current_locked( struct stw_framebuffer *fb );
 void stw_notify_current_locked( struct stw_framebuffer *fb );
 
 #endif /* STW_CONTEXT_H */
diff --git a/src/gallium/state_trackers/wgl/stw_device.c b/src/gallium/state_trackers/wgl/stw_device.c
index 25b6341ecad..287b937afe5 100644
--- a/src/gallium/state_trackers/wgl/stw_device.c
+++ b/src/gallium/state_trackers/wgl/stw_device.c
@@ -106,8 +106,8 @@ stw_init(const struct stw_winsys *stw_winsys)
          screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
    stw_dev->max_2d_length = 1 << (stw_dev->max_2d_levels - 1);
 
-   pipe_mutex_init( stw_dev->ctx_mutex );
-   pipe_mutex_init( stw_dev->fb_mutex );
+   InitializeCriticalSection(&stw_dev->ctx_mutex);
+   InitializeCriticalSection(&stw_dev->fb_mutex);
 
    stw_dev->ctx_table = handle_table_create();
    if (!stw_dev->ctx_table) {
@@ -156,9 +156,9 @@ stw_cleanup(void)
     * Abort cleanup if there are still active contexts. In some situations
     * this DLL may be unloaded before the DLL that is using GL contexts is.
     */
-   pipe_mutex_lock( stw_dev->ctx_mutex );
+   stw_lock_contexts(stw_dev);
    dhglrc = handle_table_get_first_handle(stw_dev->ctx_table);
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   stw_unlock_contexts(stw_dev);
    if (dhglrc) {
       debug_printf("%s: contexts still active -- cleanup aborted\n", __FUNCTION__);
       stw_dev = NULL;
@@ -169,8 +169,8 @@ stw_cleanup(void)
 
    stw_framebuffer_cleanup();
    
-   pipe_mutex_destroy( stw_dev->fb_mutex );
-   pipe_mutex_destroy( stw_dev->ctx_mutex );
+   DeleteCriticalSection(&stw_dev->fb_mutex);
+   DeleteCriticalSection(&stw_dev->ctx_mutex);
    
    FREE(stw_dev->smapi);
    stw_dev->stapi->destroy(stw_dev->stapi);
diff --git a/src/gallium/state_trackers/wgl/stw_device.h b/src/gallium/state_trackers/wgl/stw_device.h
index e35a4b94036..3f0dffe408b 100644
--- a/src/gallium/state_trackers/wgl/stw_device.h
+++ b/src/gallium/state_trackers/wgl/stw_device.h
@@ -30,7 +30,6 @@
 
 
 #include "pipe/p_compiler.h"
-#include "os/os_thread.h"
 #include "util/u_handle_table.h"
 #include "stw_icd.h"
 #include "stw_pixelformat.h"
@@ -65,10 +64,10 @@ struct stw_device
 
    GLCALLBACKTABLE callbacks;
 
-   pipe_mutex ctx_mutex;
+   CRITICAL_SECTION ctx_mutex;
    struct handle_table *ctx_table;
    
-   pipe_mutex fb_mutex;
+   CRITICAL_SECTION fb_mutex;
    struct stw_framebuffer *fb_head;
    
 #ifdef DEBUG
@@ -89,4 +88,32 @@ stw_lookup_context_locked( DHGLRC dhglrc )
 }
 
 
+static inline void
+stw_lock_contexts(struct stw_device *stw_dev)
+{
+   EnterCriticalSection(&stw_dev->ctx_mutex);
+}
+
+
+static inline void
+stw_unlock_contexts(struct stw_device *stw_dev)
+{
+   LeaveCriticalSection(&stw_dev->ctx_mutex);
+}
+
+
+static inline void
+stw_lock_framebuffers(struct stw_device *stw_dev)
+{
+   EnterCriticalSection(&stw_dev->fb_mutex);
+}
+
+
+static inline void
+stw_unlock_framebuffers(struct stw_device *stw_dev)
+{
+   LeaveCriticalSection(&stw_dev->fb_mutex);
+}
+
+
 #endif /* STW_DEVICE_H_ */
diff --git a/src/gallium/state_trackers/wgl/stw_ext_context.c b/src/gallium/state_trackers/wgl/stw_ext_context.c
index 6af20627398..4c58316a0e1 100644
--- a/src/gallium/state_trackers/wgl/stw_ext_context.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_context.c
@@ -35,6 +35,8 @@
 #include "stw_device.h"
 #include "stw_ext_context.h"
 
+#include "util/u_debug.h"
+
 
 wglCreateContext_t wglCreateContext_func = 0;
 wglDeleteContext_t wglDeleteContext_func = 0;
diff --git a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c
index 0bd60c064d7..c99fa3e513d 100644
--- a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c
@@ -35,6 +35,8 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 
+#include "util/u_debug.h"
+
 #include "stw_device.h"
 #include "stw_pixelformat.h"
 #include "stw_framebuffer.h"
@@ -220,7 +222,7 @@ wglCreatePbufferARB(HDC hCurrentDC,
    fb->bPbuffer = TRUE;
    iDisplayablePixelFormat = fb->iDisplayablePixelFormat;
 
-   stw_framebuffer_release(fb);
+   stw_framebuffer_unlock(fb);
 
    /*
     * We need to set a displayable pixel format on the hidden window DC
diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.c b/src/gallium/state_trackers/wgl/stw_framebuffer.c
index 7b34fcbb5ed..b49bc22e21f 100644
--- a/src/gallium/state_trackers/wgl/stw_framebuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c
@@ -44,27 +44,31 @@
 /**
  * Search the framebuffer with the matching HWND while holding the
  * stw_dev::fb_mutex global lock.
+ * If a stw_framebuffer is found, lock it and return the pointer.
+ * Else, return NULL.
  */
 static inline struct stw_framebuffer *
-stw_framebuffer_from_hwnd_locked(
-   HWND hwnd )
+stw_framebuffer_from_hwnd_locked(HWND hwnd)
 {
    struct stw_framebuffer *fb;
 
    for (fb = stw_dev->fb_head; fb != NULL; fb = fb->next)
       if (fb->hWnd == hwnd) {
-         pipe_mutex_lock(fb->mutex);
-         break;
+         stw_framebuffer_lock(fb);
+         assert(fb->mutex.RecursionCount == 1);
+         return fb;
       }
 
-   return fb;
+   return NULL;
 }
 
 
 /**
- * Destroy this framebuffer. Both stw_dev::fb_mutex and stw_framebuffer::mutex
- * must be held, by this order.  If there are still references to the
- * framebuffer, nothing will happen.
+ * Decrement the reference count on the given stw_framebuffer object.
+ * If the reference count hits zero, destroy the object.
+ *
+ * Note: Both stw_dev::fb_mutex and stw_framebuffer::mutex must already
+ * be locked.
  */
 static void
 stw_framebuffer_destroy_locked(struct stw_framebuffer *fb)
@@ -74,10 +78,11 @@ stw_framebuffer_destroy_locked(struct stw_framebuffer *fb)
    /* check the reference count */
    fb->refcnt--;
    if (fb->refcnt) {
-      pipe_mutex_unlock( fb->mutex );
+      stw_framebuffer_unlock(fb);
       return;
    }
 
+   /* remove this stw_framebuffer from the device's linked list */
    link = &stw_dev->fb_head;
    while (*link != fb)
       link = &(*link)->next;
@@ -91,22 +96,18 @@ stw_framebuffer_destroy_locked(struct stw_framebuffer *fb)
 
    stw_st_destroy_framebuffer_locked(fb->stfb);
 
-   pipe_mutex_unlock( fb->mutex );
+   stw_framebuffer_unlock(fb);
 
-   pipe_mutex_destroy( fb->mutex );
+   DeleteCriticalSection(&fb->mutex);
 
    FREE( fb );
 }
 
 
-void
-stw_framebuffer_release(struct stw_framebuffer *fb)
-{
-   assert(fb);
-   pipe_mutex_unlock( fb->mutex );
-}
-
-
+/**
+ * Query the size of the given framebuffer's on-screen window and update
+ * the stw_framebuffer's width/height.
+ */
 static void
 stw_framebuffer_get_size(struct stw_framebuffer *fb)
 {
@@ -118,7 +119,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb)
    /*
     * Sanity checking.
     */
-
    assert(fb->hWnd);
    assert(fb->width && fb->height);
    assert(fb->client_rect.right  == fb->client_rect.left + fb->width);
@@ -127,7 +127,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb)
    /*
     * Get the client area size.
     */
-
    if (!GetClientRect(fb->hWnd, &client_rect)) {
       return;
    }
@@ -145,7 +144,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb)
        * preserve the current window size, until the window is restored or
        * maximized again.
        */
-
       return;
    }
 
@@ -217,22 +215,27 @@ stw_call_window_proc(int nCode, WPARAM wParam, LPARAM lParam)
              * of the client area via GetClientRect.
              */
             stw_framebuffer_get_size(fb);
-            stw_framebuffer_release(fb);
+            stw_framebuffer_unlock(fb);
          }
       }
    }
    else if (pParams->message == WM_DESTROY) {
-      pipe_mutex_lock( stw_dev->fb_mutex );
+      stw_lock_framebuffers(stw_dev);
       fb = stw_framebuffer_from_hwnd_locked( pParams->hwnd );
       if (fb)
          stw_framebuffer_destroy_locked(fb);
-      pipe_mutex_unlock( stw_dev->fb_mutex );
+      stw_unlock_framebuffers(stw_dev);
    }
 
    return CallNextHookEx(tls_data->hCallWndProcHook, nCode, wParam, lParam);
 }
 
 
+/**
+ * Create a new stw_framebuffer object which corresponds to the given
+ * HDC/window.  If successful, we return the new stw_framebuffer object
+ * with its mutex locked.
+ */
 struct stw_framebuffer *
 stw_framebuffer_create(HDC hdc, int iPixelFormat)
 {
@@ -283,18 +286,18 @@ stw_framebuffer_create(HDC hdc, int iPixelFormat)
 
    stw_framebuffer_get_size(fb);
 
-   pipe_mutex_init( fb->mutex );
+   InitializeCriticalSection(&fb->mutex);
 
    /* This is the only case where we lock the stw_framebuffer::mutex before
     * stw_dev::fb_mutex, since no other thread can know about this framebuffer
     * and we must prevent any other thread from destroying it before we return.
     */
-   pipe_mutex_lock( fb->mutex );
+   stw_framebuffer_lock(fb);
 
-   pipe_mutex_lock( stw_dev->fb_mutex );
+   stw_lock_framebuffers(stw_dev);
    fb->next = stw_dev->fb_head;
    stw_dev->fb_head = fb;
-   pipe_mutex_unlock( stw_dev->fb_mutex );
+   stw_unlock_framebuffers(stw_dev);
 
    return fb;
 }
@@ -315,12 +318,12 @@ stw_framebuffer_reference(struct stw_framebuffer **ptr,
    if (fb)
       fb->refcnt++;
    if (old_fb) {
-      pipe_mutex_lock(stw_dev->fb_mutex);
+      stw_lock_framebuffers(stw_dev);
 
-      pipe_mutex_lock(old_fb->mutex);
+      stw_framebuffer_lock(old_fb);
       stw_framebuffer_destroy_locked(old_fb);
 
-      pipe_mutex_unlock(stw_dev->fb_mutex);
+      stw_unlock_framebuffers(stw_dev);
    }
 
    *ptr = fb;
@@ -347,6 +350,9 @@ stw_framebuffer_update(struct stw_framebuffer *fb)
 }
 
 
+/**
+ * Try to free all stw_framebuffer objects associated with the device.
+ */
 void
 stw_framebuffer_cleanup(void)
 {
@@ -356,29 +362,29 @@ stw_framebuffer_cleanup(void)
    if (!stw_dev)
       return;
 
-   pipe_mutex_lock( stw_dev->fb_mutex );
+   stw_lock_framebuffers(stw_dev);
 
    fb = stw_dev->fb_head;
    while (fb) {
       next = fb->next;
 
-      pipe_mutex_lock(fb->mutex);
+      stw_framebuffer_lock(fb);
       stw_framebuffer_destroy_locked(fb);
 
       fb = next;
    }
    stw_dev->fb_head = NULL;
 
-   pipe_mutex_unlock( stw_dev->fb_mutex );
+   stw_unlock_framebuffers(stw_dev);
 }
 
 
 /**
  * Given an hdc, return the corresponding stw_framebuffer.
+ * The returned stw_framebuffer will have its mutex locked.
  */
 static inline struct stw_framebuffer *
-stw_framebuffer_from_hdc_locked(
-   HDC hdc )
+stw_framebuffer_from_hdc_locked(HDC hdc)
 {
    HWND hwnd;
 
@@ -392,7 +398,8 @@ stw_framebuffer_from_hdc_locked(
 
 
 /**
- * Given an hdc, return the corresponding stw_framebuffer.
+ * Given an HDC, return the corresponding stw_framebuffer.
+ * The returned stw_framebuffer will have its mutex locked.
  */
 struct stw_framebuffer *
 stw_framebuffer_from_hdc(HDC hdc)
@@ -402,25 +409,26 @@ stw_framebuffer_from_hdc(HDC hdc)
    if (!stw_dev)
       return NULL;
 
-   pipe_mutex_lock( stw_dev->fb_mutex );
+   stw_lock_framebuffers(stw_dev);
    fb = stw_framebuffer_from_hdc_locked(hdc);
-   pipe_mutex_unlock( stw_dev->fb_mutex );
+   stw_unlock_framebuffers(stw_dev);
 
    return fb;
 }
 
 
 /**
- * Given an hdc, return the corresponding stw_framebuffer.
+ * Given an HWND, return the corresponding stw_framebuffer.
+ * The returned stw_framebuffer will have its mutex locked.
  */
 struct stw_framebuffer *
 stw_framebuffer_from_hwnd(HWND hwnd)
 {
    struct stw_framebuffer *fb;
 
-   pipe_mutex_lock( stw_dev->fb_mutex );
+   stw_lock_framebuffers(stw_dev);
    fb = stw_framebuffer_from_hwnd_locked(hwnd);
-   pipe_mutex_unlock( stw_dev->fb_mutex );
+   stw_unlock_framebuffers(stw_dev);
 
    return fb;
 }
@@ -444,12 +452,12 @@ DrvSetPixelFormat(HDC hdc, LONG iPixelFormat)
    fb = stw_framebuffer_from_hdc_locked(hdc);
    if (fb) {
       /*
-       * SetPixelFormat must be called only once.  However ignore 
+       * SetPixelFormat must be called only once.  However ignore
        * pbuffers, for which the framebuffer object is created first.
        */
       boolean bPbuffer = fb->bPbuffer;
 
-      stw_framebuffer_release( fb );
+      stw_framebuffer_unlock( fb );
 
       return bPbuffer;
    }
@@ -459,14 +467,16 @@ DrvSetPixelFormat(HDC hdc, LONG iPixelFormat)
       return FALSE;
    }
 
-   stw_framebuffer_release( fb );
+   stw_framebuffer_unlock( fb );
 
    /* Some applications mistakenly use the undocumented wglSetPixelFormat
     * function instead of SetPixelFormat, so we call SetPixelFormat here to
     * avoid opengl32.dll's wglCreateContext to fail */
    if (GetPixelFormat(hdc) == 0) {
       BOOL bRet = SetPixelFormat(hdc, iPixelFormat, NULL);
-      assert(bRet);
+      if (!bRet) {
+	  debug_printf("SetPixelFormat failed\n");
+      }
    }
 
    return TRUE;
@@ -482,7 +492,7 @@ stw_pixelformat_get(HDC hdc)
    fb = stw_framebuffer_from_hdc(hdc);
    if (fb) {
       iPixelFormat = fb->iPixelFormat;
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
    }
 
    return iPixelFormat;
@@ -539,7 +549,7 @@ DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data)
    stw_framebuffer_update(fb);
    stw_notify_current_locked(fb);
 
-   stw_framebuffer_release(fb);
+   stw_framebuffer_unlock(fb);
 
    return TRUE;
 }
@@ -548,7 +558,8 @@ DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data)
 /**
  * Queue a composition.
  *
- * It will drop the lock on success.
+ * The stw_framebuffer object must have its mutex locked.  The mutex will
+ * be unlocked here before returning.
  */
 BOOL
 stw_framebuffer_present_locked(HDC hdc,
@@ -567,7 +578,7 @@ stw_framebuffer_present_locked(HDC hdc,
       data.pPrivateData = (void *)res;
 
       stw_notify_current_locked(fb);
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
 
       return stw_dev->callbacks.wglCbPresentBuffers(hdc, &data);
    }
@@ -578,7 +589,7 @@ stw_framebuffer_present_locked(HDC hdc,
 
       stw_framebuffer_update(fb);
       stw_notify_current_locked(fb);
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
 
       return TRUE;
    }
@@ -599,19 +610,26 @@ DrvSwapBuffers(HDC hdc)
       return FALSE;
 
    if (!(fb->pfi->pfd.dwFlags & PFD_DOUBLEBUFFER)) {
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
       return TRUE;
    }
 
-   /* Display the HUD */
    ctx = stw_current_context();
-   if (ctx && ctx->hud) {
-      struct pipe_resource *back =
-         stw_get_framebuffer_resource(fb->stfb, ST_ATTACHMENT_BACK_LEFT);
-      hud_draw(ctx->hud, back);
-   }
+   if (ctx) {
+      if (ctx->hud) {
+         /* Display the HUD */
+         struct pipe_resource *back =
+            stw_get_framebuffer_resource(fb->stfb, ST_ATTACHMENT_BACK_LEFT);
+         if (back) {
+            hud_draw(ctx->hud, back);
+         }
+      }
 
-   stw_flush_current_locked(fb);
+      if (ctx->current_framebuffer == fb) {
+         /* flush current context */
+         ctx->st->flush(ctx->st, ST_FLUSH_END_OF_FRAME, NULL);
+      }
+   }
 
    return stw_st_swap_framebuffer_locked(hdc, fb->stfb);
 }
diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.h b/src/gallium/state_trackers/wgl/stw_framebuffer.h
index 28962c8cbd5..109c79dd002 100644
--- a/src/gallium/state_trackers/wgl/stw_framebuffer.h
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.h
@@ -30,7 +30,8 @@
 
 #include <windows.h>
 
-#include "os/os_thread.h"
+#include "util/u_debug.h"
+
 
 struct pipe_resource;
 struct st_framebuffer_iface;
@@ -45,11 +46,11 @@ struct stw_framebuffer
     * This mutex has two purposes:
     * - protect the access to the mutable data members below
     * - prevent the framebuffer from being deleted while being accessed.
-    * 
-    * It is OK to lock this mutex while holding the stw_device::fb_mutex lock, 
-    * but the opposite must never happen.
+    *
+    * Note: if both this mutex and the stw_device::fb_mutex need to be locked,
+    * the stw_device::fb_mutex needs to be locked first.
     */
-   pipe_mutex mutex;
+   CRITICAL_SECTION mutex;
    
    /*
     * Immutable members.
@@ -112,38 +113,33 @@ struct stw_framebuffer
 /**
  * Create a new framebuffer object which will correspond to the given HDC.
  * 
- * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release
+ * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock
  * must be called when done 
  */
 struct stw_framebuffer *
-stw_framebuffer_create(
-   HDC hdc,
-   int iPixelFormat );
+stw_framebuffer_create(HDC hdc, int iPixelFormat);
 
 void
-stw_framebuffer_reference(
-   struct stw_framebuffer **ptr,
-   struct stw_framebuffer *fb);
+stw_framebuffer_reference(struct stw_framebuffer **ptr,
+                          struct stw_framebuffer *fb);
 
 /**
  * Search a framebuffer with a matching HWND.
  * 
- * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release
+ * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock
  * must be called when done 
  */
 struct stw_framebuffer *
-stw_framebuffer_from_hwnd(
-   HWND hwnd );
+stw_framebuffer_from_hwnd(HWND hwnd);
 
 /**
  * Search a framebuffer with a matching HDC.
  * 
- * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release
+ * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock
  * must be called when done 
  */
 struct stw_framebuffer *
-stw_framebuffer_from_hdc(
-   HDC hdc );
+stw_framebuffer_from_hdc(HDC hdc);
 
 BOOL
 stw_framebuffer_present_locked(HDC hdc,
@@ -151,17 +147,29 @@ stw_framebuffer_present_locked(HDC hdc,
                                struct pipe_resource *res);
 
 void
-stw_framebuffer_update(
-   struct stw_framebuffer *fb);
+stw_framebuffer_update(struct stw_framebuffer *fb);
+
+
+static inline void
+stw_framebuffer_lock(struct stw_framebuffer *fb)
+{
+   assert(fb);
+   EnterCriticalSection(&fb->mutex);
+}
+
 
 /**
  * Release stw_framebuffer::mutex lock. This framebuffer must not be accessed
  * after calling this function, as it may have been deleted by another thread
  * in the meanwhile.
  */
-void
-stw_framebuffer_release(
-   struct stw_framebuffer *fb);
+static inline void
+stw_framebuffer_unlock(struct stw_framebuffer *fb)
+{
+   assert(fb);
+   LeaveCriticalSection(&fb->mutex);
+}
+
 
 /**
  * Cleanup any existing framebuffers when exiting application.
diff --git a/src/gallium/state_trackers/wgl/stw_getprocaddress.c b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
index 33949b6606f..28d10d2e312 100644
--- a/src/gallium/state_trackers/wgl/stw_getprocaddress.c
+++ b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
@@ -37,6 +37,8 @@
 #include "stw_icd.h"
 #include "stw_nopfuncs.h"
 
+#include "util/u_debug.h"
+
 struct stw_extension_entry
 {
    const char *name;
diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c
index db6cf8ee30f..ef6158d3645 100644
--- a/src/gallium/state_trackers/wgl/stw_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c
@@ -74,10 +74,11 @@ stw_pf_color[] = {
    /* no-alpha */
    { PIPE_FORMAT_B8G8R8X8_UNORM,    { 8,  8,  8,  0}, {16,  8,  0,  0} },
    { PIPE_FORMAT_X8R8G8B8_UNORM,    { 8,  8,  8,  0}, { 8, 16, 24,  0} },
-   { PIPE_FORMAT_B5G6R5_UNORM,      { 5,  6,  5,  0}, {11,  5,  0,  0} },
    /* alpha */
    { PIPE_FORMAT_B8G8R8A8_UNORM,    { 8,  8,  8,  8}, {16,  8,  0, 24} },
    { PIPE_FORMAT_A8R8G8B8_UNORM,    { 8,  8,  8,  8}, { 8, 16, 24,  0} },
+   /* shallow bit depths */
+   { PIPE_FORMAT_B5G6R5_UNORM,      { 5,  6,  5,  0}, {11,  5,  0,  0} },
 #if 0
    { PIPE_FORMAT_R10G10B10A2_UNORM, {10, 10, 10,  2}, { 0, 10, 20, 30} },
 #endif
@@ -214,14 +215,15 @@ stw_pixelformat_add(
 
 
 /**
- * Add the depth/stencil/accum/ms variants for a particular color format.
+ * Add the depth/stencil/accum/ms variants for a list of color formats.
  */
 static unsigned
-add_color_format_variants(const struct stw_pf_color_info *color,
+add_color_format_variants(const struct stw_pf_color_info *color_formats,
+                          unsigned num_color_formats,
                           boolean extended)
 {
    struct pipe_screen *screen = stw_dev->screen;
-   unsigned ms, db, ds, acc;
+   unsigned cfmt, ms, db, ds, acc;
    unsigned bind_flags = PIPE_BIND_RENDER_TARGET;
    unsigned num_added = 0;
    int force_samples = 0;
@@ -245,27 +247,31 @@ add_color_format_variants(const struct stw_pf_color_info *color,
       if (force_samples && samples != force_samples)
          continue;
 
-      if (!screen->is_format_supported(screen, color->format,
-                                       PIPE_TEXTURE_2D, samples, bind_flags)) {
-         continue;
-      }
+      for (cfmt = 0; cfmt < num_color_formats; cfmt++) {
+         if (!screen->is_format_supported(screen, color_formats[cfmt].format,
+                                          PIPE_TEXTURE_2D, samples,
+                                          bind_flags)) {
+            continue;
+         }
 
-      for (db = 0; db < Elements(stw_pf_doublebuffer); db++) {
-         unsigned doublebuffer = stw_pf_doublebuffer[db];
+         for (db = 0; db < Elements(stw_pf_doublebuffer); db++) {
+            unsigned doublebuffer = stw_pf_doublebuffer[db];
 
-         for (ds = 0; ds < Elements(stw_pf_depth_stencil); ds++) {
-            const struct stw_pf_depth_info *depth = &stw_pf_depth_stencil[ds];
+            for (ds = 0; ds < Elements(stw_pf_depth_stencil); ds++) {
+               const struct stw_pf_depth_info *depth = &stw_pf_depth_stencil[ds];
 
-            if (!screen->is_format_supported(screen, depth->format,
-                                             PIPE_TEXTURE_2D, samples,
-                                             PIPE_BIND_DEPTH_STENCIL)) {
-               continue;
-            }
+               if (!screen->is_format_supported(screen, depth->format,
+                                                PIPE_TEXTURE_2D, samples,
+                                                PIPE_BIND_DEPTH_STENCIL)) {
+                  continue;
+               }
 
-            for (acc = 0; acc < 2; acc++) {
-               stw_pixelformat_add(stw_dev, extended, color, depth,
-                                   acc * 16, doublebuffer, samples);
-               num_added++;
+               for (acc = 0; acc < 2; acc++) {
+                  stw_pixelformat_add(stw_dev, extended, &color_formats[cfmt],
+                                      depth,
+                                      acc * 16, doublebuffer, samples);
+                  num_added++;
+               }
             }
          }
       }
@@ -278,22 +284,19 @@ add_color_format_variants(const struct stw_pf_color_info *color,
 void
 stw_pixelformat_init( void )
 {
-   unsigned i;
-   unsigned num_formats = 0;
+   unsigned num_formats;
 
    assert( !stw_dev->pixelformat_count );
    assert( !stw_dev->pixelformat_extended_count );
 
    /* normal, displayable formats */
-   for (i = 0; i < Elements(stw_pf_color); i++) {
-      num_formats += add_color_format_variants(&stw_pf_color[i], FALSE);
-   }
+   num_formats = add_color_format_variants(stw_pf_color,
+                                           Elements(stw_pf_color), FALSE);
    assert(num_formats > 0);
 
    /* extended, pbuffer-only formats */
-   for (i = 0; i < Elements(stw_pf_color_extended); i++) {
-      add_color_format_variants(&stw_pf_color_extended[i], TRUE);
-   }
+   add_color_format_variants(stw_pf_color_extended,
+                             Elements(stw_pf_color_extended), TRUE);
 
    assert( stw_dev->pixelformat_count <= stw_dev->pixelformat_extended_count );
    assert( stw_dev->pixelformat_extended_count <= STW_MAX_PIXELFORMATS );
diff --git a/src/gallium/state_trackers/wgl/stw_st.c b/src/gallium/state_trackers/wgl/stw_st.c
index b41171a9195..78586db1969 100644
--- a/src/gallium/state_trackers/wgl/stw_st.c
+++ b/src/gallium/state_trackers/wgl/stw_st.c
@@ -52,6 +52,28 @@ stw_st_framebuffer(struct st_framebuffer_iface *stfb)
    return (struct stw_st_framebuffer *) stfb;
 }
 
+
+/**
+ * Is the given mutex held by the calling thread?
+ */
+static bool
+own_mutex(const CRITICAL_SECTION *cs)
+{
+   // We can't compare OwningThread with our thread handle/id (see
+   // http://stackoverflow.com/a/12675635 ) but we can compare with the
+   // OwningThread member of a critical section we know we own.
+   CRITICAL_SECTION dummy;
+   InitializeCriticalSection(&dummy);
+   EnterCriticalSection(&dummy);
+   if (0)
+      _debug_printf("%p %p\n", cs->OwningThread, dummy.OwningThread);
+   bool ret = cs->OwningThread == dummy.OwningThread;
+   LeaveCriticalSection(&dummy);
+   DeleteCriticalSection(&dummy);
+   return ret;
+}
+
+
 /**
  * Remove outdated textures and create the requested ones.
  */
@@ -136,7 +158,7 @@ stw_st_framebuffer_validate(struct st_context_iface *stctx,
    for (i = 0; i < count; i++)
       statt_mask |= 1 << statts[i];
 
-   pipe_mutex_lock(stwfb->fb->mutex);
+   stw_framebuffer_lock(stwfb->fb);
 
    if (stwfb->fb->must_resize || (statt_mask & ~stwfb->texture_mask)) {
       stw_st_framebuffer_validate_locked(&stwfb->base,
@@ -149,7 +171,7 @@ stw_st_framebuffer_validate(struct st_context_iface *stctx,
       pipe_resource_reference(&out[i], stwfb->textures[statts[i]]);
    }
 
-   stw_framebuffer_release(stwfb->fb);
+   stw_framebuffer_unlock(stwfb->fb);
 
    return TRUE;
 }
@@ -165,10 +187,17 @@ stw_st_framebuffer_present_locked(HDC hdc,
    struct stw_st_framebuffer *stwfb = stw_st_framebuffer(stfb);
    struct pipe_resource *resource;
 
+   assert(own_mutex(&stwfb->fb->mutex));
+
    resource = stwfb->textures[statt];
    if (resource) {
       stw_framebuffer_present_locked(hdc, stwfb->fb, resource);
    }
+   else {
+      stw_framebuffer_unlock(stwfb->fb);
+   }
+
+   assert(!own_mutex(&stwfb->fb->mutex));
 
    return TRUE;
 }
@@ -182,7 +211,7 @@ stw_st_framebuffer_flush_front(struct st_context_iface *stctx,
    boolean ret;
    HDC hDC;
 
-   pipe_mutex_lock(stwfb->fb->mutex);
+   stw_framebuffer_lock(stwfb->fb);
 
    /* We must not cache HDCs anywhere, as they can be invalidated by the
     * application, or screen resolution changes. */
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 2878c8f5744..7f395b704c7 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -76,6 +76,9 @@ struct radeon_bomgr {
     bool va;
     uint64_t va_offset;
     struct list_head va_holes;
+
+    /* BO size alignment */
+    unsigned size_align;
 };
 
 static inline struct radeon_bomgr *radeon_bomgr(struct pb_manager *mgr)
@@ -188,8 +191,10 @@ static uint64_t radeon_bomgr_find_va(struct radeon_bomgr *mgr, uint64_t size, ui
     struct radeon_bo_va_hole *hole, *n;
     uint64_t offset = 0, waste = 0;
 
-    alignment = MAX2(alignment, 4096);
-    size = align(size, 4096);
+    /* All VM address space holes will implicitly start aligned to the
+     * size alignment, so we don't need to sanitize the alignment here
+     */
+    size = align(size, mgr->size_align);
 
     pipe_mutex_lock(mgr->bo_va_mutex);
     /* first look for a hole */
@@ -246,7 +251,7 @@ static void radeon_bomgr_free_va(struct radeon_bomgr *mgr, uint64_t va, uint64_t
 {
     struct radeon_bo_va_hole *hole;
 
-    size = align(size, 4096);
+    size = align(size, mgr->size_align);
 
     pipe_mutex_lock(mgr->bo_va_mutex);
     if ((va + size) == mgr->va_offset) {
@@ -357,9 +362,9 @@ static void radeon_bo_destroy(struct pb_buffer *_buf)
     pipe_mutex_destroy(bo->map_mutex);
 
     if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-        bo->rws->allocated_vram -= align(bo->base.size, 4096);
+        bo->rws->allocated_vram -= align(bo->base.size, mgr->size_align);
     else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-        bo->rws->allocated_gtt -= align(bo->base.size, 4096);
+        bo->rws->allocated_gtt -= align(bo->base.size, mgr->size_align);
     FREE(bo);
 }
 
@@ -644,9 +649,9 @@ static struct pb_buffer *radeon_bomgr_create_bo(struct pb_manager *_mgr,
     }
 
     if (rdesc->initial_domains & RADEON_DOMAIN_VRAM)
-        rws->allocated_vram += align(size, 4096);
+        rws->allocated_vram += align(size, mgr->size_align);
     else if (rdesc->initial_domains & RADEON_DOMAIN_GTT)
-        rws->allocated_gtt += align(size, 4096);
+        rws->allocated_gtt += align(size, mgr->size_align);
 
     return &bo->base;
 }
@@ -720,6 +725,9 @@ struct pb_manager *radeon_bomgr_create(struct radeon_drm_winsys *rws)
     mgr->va_offset = rws->va_start;
     list_inithead(&mgr->va_holes);
 
+    /* TTM aligns the BO size to the CPU page size */
+    mgr->size_align = sysconf(_SC_PAGESIZE);
+
     return &mgr->base;
 }
 
@@ -882,7 +890,7 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
      * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
      * like constant/uniform buffers, can benefit from better and more reuse.
      */
-    size = align(size, 4096);
+    size = align(size, mgr->size_align);
 
     /* Only set one usage bit each for domains and flags, or the cache manager
      * might consider different sets of domains / flags compatible
@@ -993,7 +1001,7 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
         pipe_mutex_unlock(mgr->bo_handles_mutex);
     }
 
-    ws->allocated_gtt += align(bo->base.size, 4096);
+    ws->allocated_gtt += align(bo->base.size, mgr->size_align);
 
     return (struct pb_buffer*)bo;
 }
@@ -1130,9 +1138,9 @@ done:
     bo->initial_domain = radeon_bo_get_initial_domain((void*)bo);
 
     if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-        ws->allocated_vram += align(bo->base.size, 4096);
+        ws->allocated_vram += align(bo->base.size, mgr->size_align);
     else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-        ws->allocated_gtt += align(bo->base.size, 4096);
+        ws->allocated_gtt += align(bo->base.size, mgr->size_align);
 
     return (struct pb_buffer*)bo;
 
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
index d77ebd6ca15..b5d4435e5e6 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
@@ -309,7 +309,7 @@ virgl_drm_winsys_resource_cache_create(struct virgl_winsys *qws,
    while (curr != &qdws->delayed) {
       curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
 
-      if (!res && (ret = virgl_is_res_compat(qdws, curr_res, size, bind, format) > 0))
+      if (!res && ((ret = virgl_is_res_compat(qdws, curr_res, size, bind, format)) > 0))
          res = curr_res;
       else if (os_time_timeout(curr_res->start, curr_res->end, now)) {
          LIST_DEL(&curr_res->head);
diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
index b19c4561493..9c9ec044591 100644
--- a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
+++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
@@ -343,7 +343,7 @@ virgl_vtest_winsys_resource_cache_create(struct virgl_winsys *vws,
    while (curr != &vtws->delayed) {
       curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
 
-      if (!res && (ret = virgl_is_res_compat(vtws, curr_res, size, bind, format) > 0))
+      if (!res && ((ret = virgl_is_res_compat(vtws, curr_res, size, bind, format)) > 0))
          res = curr_res;
       else if (os_time_timeout(curr_res->start, curr_res->end, now)) {
          LIST_DEL(&curr_res->head);
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index 05e76049764..957fd6b90ba 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -38,7 +38,7 @@ NIR_FILES = \
 	nir/nir_intrinsics.h \
 	nir/nir_instr_set.c \
 	nir/nir_instr_set.h \
-	nir/nir_live_variables.c \
+	nir/nir_liveness.c \
 	nir/nir_lower_alu_to_scalar.c \
 	nir/nir_lower_atomics.c \
 	nir/nir_lower_clip.c \
@@ -68,7 +68,6 @@ NIR_FILES = \
 	nir/nir_opt_dead_cf.c \
 	nir/nir_opt_gcm.c \
 	nir/nir_opt_global_to_local.c \
-	nir/nir_opt_peephole_ffma.c \
 	nir/nir_opt_peephole_select.c \
 	nir/nir_opt_remove_phis.c \
 	nir/nir_opt_undef.c \
@@ -180,6 +179,7 @@ LIBGLSL_FILES = \
 	lower_vec_index_to_cond_assign.cpp \
 	lower_vec_index_to_swizzle.cpp \
 	lower_vector.cpp \
+	lower_vector_derefs.cpp \
 	lower_vector_insert.cpp \
 	lower_vertex_id.cpp \
 	lower_output_reads.cpp \
diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index e803e6d7675..1b75234d578 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -448,6 +448,7 @@ struct ast_type_qualifier {
 	 unsigned patch:1;
 	 unsigned uniform:1;
 	 unsigned buffer:1;
+	 unsigned shared_storage:1;
 	 unsigned smooth:1;
 	 unsigned flat:1;
 	 unsigned noperspective:1;
diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp
index 74d403fdb65..ca7a9a10c36 100644
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -319,10 +319,9 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
     * expression.
     */
    if (array->type->is_array()
-       || array->type->is_matrix()) {
+       || array->type->is_matrix()
+       || array->type->is_vector()) {
       return new(mem_ctx) ir_dereference_array(array, idx);
-   } else if (array->type->is_vector()) {
-      return new(mem_ctx) ir_expression(ir_binop_vector_extract, array, idx);
    } else if (array->type->is_error()) {
       return array;
    } else {
diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp
index e4e4a3fe148..466ece67424 100644
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -256,18 +256,10 @@ verify_parameter_modes(_mesa_glsl_parse_state *state,
 			     actual->variable_referenced()->name);
 	    return false;
 	 } else if (!actual->is_lvalue()) {
-            /* Even though ir_binop_vector_extract is not an l-value, let it
-             * slop through.  generate_call will handle it correctly.
-             */
-            ir_expression *const expr = ((ir_rvalue *) actual)->as_expression();
-            if (expr == NULL
-                || expr->operation != ir_binop_vector_extract
-                || !expr->operands[0]->is_lvalue()) {
-               _mesa_glsl_error(&loc, state,
-                                "function parameter '%s %s' is not an lvalue",
-                                mode, formal->name);
-               return false;
-            }
+            _mesa_glsl_error(&loc, state,
+                             "function parameter '%s %s' is not an lvalue",
+                             mode, formal->name);
+            return false;
 	 }
       }
 
@@ -376,12 +368,8 @@ fix_parameter(void *mem_ctx, ir_rvalue *actual, const glsl_type *formal_type,
 
    ir_rvalue *lhs = actual;
    if (expr != NULL && expr->operation == ir_binop_vector_extract) {
-      rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert,
-                                       expr->operands[0]->type,
-                                       expr->operands[0]->clone(mem_ctx, NULL),
-                                       rhs,
-                                       expr->operands[1]->clone(mem_ctx, NULL));
-      lhs = expr->operands[0]->clone(mem_ctx, NULL);
+      lhs = new(mem_ctx) ir_dereference_array(expr->operands[0]->clone(mem_ctx, NULL),
+                                              expr->operands[1]->clone(mem_ctx, NULL));
    }
 
    ir_assignment *const assignment_2 = new(mem_ctx) ir_assignment(lhs, rhs);
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 0a79fb14633..65db2618895 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -538,18 +538,20 @@ bit_logic_result_type(const struct glsl_type *type_a,
 }
 
 static const struct glsl_type *
-modulus_result_type(const struct glsl_type *type_a,
-                    const struct glsl_type *type_b,
+modulus_result_type(ir_rvalue * &value_a, ir_rvalue * &value_b,
                     struct _mesa_glsl_parse_state *state, YYLTYPE *loc)
 {
+   const glsl_type *type_a = value_a->type;
+   const glsl_type *type_b = value_b->type;
+
    if (!state->check_version(130, 300, loc, "operator '%%' is reserved")) {
       return glsl_type::error_type;
    }
 
-   /* From GLSL 1.50 spec, page 56:
+   /* Section 5.9 (Expressions) of the GLSL 4.00 specification says:
+    *
     *    "The operator modulus (%) operates on signed or unsigned integers or
-    *    integer vectors. The operand types must both be signed or both be
-    *    unsigned."
+    *    integer vectors."
     */
    if (!type_a->is_integer()) {
       _mesa_glsl_error(loc, state, "LHS of operator %% must be an integer");
@@ -559,11 +561,28 @@ modulus_result_type(const struct glsl_type *type_a,
       _mesa_glsl_error(loc, state, "RHS of operator %% must be an integer");
       return glsl_type::error_type;
    }
-   if (type_a->base_type != type_b->base_type) {
+
+   /*    "If the fundamental types in the operands do not match, then the
+    *    conversions from section 4.1.10 "Implicit Conversions" are applied
+    *    to create matching types."
+    *
+    * Note that GLSL 4.00 (and GL_ARB_gpu_shader5) introduced implicit
+    * int -> uint conversion rules.  Prior to that, there were no implicit
+    * conversions.  So it's harmless to apply them universally - no implicit
+    * conversions will exist.  If the types don't match, we'll receive false,
+    * and raise an error, satisfying the GLSL 1.50 spec, page 56:
+    *
+    *    "The operand types must both be signed or unsigned."
+    */
+   if (!apply_implicit_conversion(type_a, value_b, state) &&
+       !apply_implicit_conversion(type_b, value_a, state)) {
       _mesa_glsl_error(loc, state,
-                       "operands of %% must have the same base type");
+                       "could not implicitly convert operands to "
+                       "modulus (%%) operator");
       return glsl_type::error_type;
    }
+   type_a = value_a->type;
+   type_b = value_b->type;
 
    /*    "The operands cannot be vectors of differing size. If one operand is
     *    a scalar and the other vector, then the scalar is applied component-
@@ -850,43 +869,6 @@ do_assignment(exec_list *instructions, struct _mesa_glsl_parse_state *state,
 {
    void *ctx = state;
    bool error_emitted = (lhs->type->is_error() || rhs->type->is_error());
-   ir_rvalue *extract_channel = NULL;
-
-   /* If the assignment LHS comes back as an ir_binop_vector_extract
-    * expression, move it to the RHS as an ir_triop_vector_insert.
-    */
-   if (lhs->ir_type == ir_type_expression) {
-      ir_expression *const lhs_expr = lhs->as_expression();
-
-      if (unlikely(lhs_expr->operation == ir_binop_vector_extract)) {
-         ir_rvalue *new_rhs =
-            validate_assignment(state, lhs_loc, lhs,
-                                rhs, is_initializer);
-
-         if (new_rhs == NULL) {
-            return lhs;
-         } else {
-            /* This converts:
-             * - LHS: (expression float vector_extract <vec> <channel>)
-             * - RHS: <scalar>
-             * into:
-             * - LHS: <vec>
-             * - RHS: (expression vec2 vector_insert <vec> <channel> <scalar>)
-             *
-             * The LHS type is now a vector instead of a scalar.  Since GLSL
-             * allows assignments to be used as rvalues, we need to re-extract
-             * the channel from assignment_temp when returning the rvalue.
-             */
-            extract_channel = lhs_expr->operands[1];
-            rhs = new(ctx) ir_expression(ir_triop_vector_insert,
-                                         lhs_expr->operands[0]->type,
-                                         lhs_expr->operands[0],
-                                         new_rhs,
-                                         extract_channel);
-            lhs = lhs_expr->operands[0]->clone(ctx, NULL);
-         }
-      }
-   }
 
    ir_variable *lhs_var = lhs->variable_referenced();
    if (lhs_var)
@@ -984,12 +966,6 @@ do_assignment(exec_list *instructions, struct _mesa_glsl_parse_state *state,
       }
       ir_rvalue *rvalue = new(ctx) ir_dereference_variable(var);
 
-      if (extract_channel) {
-         rvalue = new(ctx) ir_expression(ir_binop_vector_extract,
-                                         rvalue,
-                                         extract_channel->clone(ctx, NULL));
-      }
-
       *out_rvalue = rvalue;
    } else {
       if (!error_emitted)
@@ -1355,7 +1331,7 @@ ast_expression::do_hir(exec_list *instructions,
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
-      type = modulus_result_type(op[0]->type, op[1]->type, state, & loc);
+      type = modulus_result_type(op[0], op[1], state, &loc);
 
       assert(operations[this->oper] == ir_binop_mod);
 
@@ -1602,7 +1578,7 @@ ast_expression::do_hir(exec_list *instructions,
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
-      type = modulus_result_type(op[0]->type, op[1]->type, state, & loc);
+      type = modulus_result_type(op[0], op[1], state, &loc);
 
       assert(operations[this->oper] == ir_binop_mod);
 
@@ -2160,6 +2136,41 @@ process_array_type(YYLTYPE *loc, const glsl_type *base,
    return array_type;
 }
 
+static bool
+precision_qualifier_allowed(const glsl_type *type)
+{
+   /* Precision qualifiers apply to floating point, integer and opaque
+    * types.
+    *
+    * Section 4.5.2 (Precision Qualifiers) of the GLSL 1.30 spec says:
+    *    "Any floating point or any integer declaration can have the type
+    *    preceded by one of these precision qualifiers [...] Literal
+    *    constants do not have precision qualifiers. Neither do Boolean
+    *    variables.
+    *
+    * Section 4.5 (Precision and Precision Qualifiers) of the GLSL 1.30
+    * spec also says:
+    *
+    *     "Precision qualifiers are added for code portability with OpenGL
+    *     ES, not for functionality. They have the same syntax as in OpenGL
+    *     ES."
+    *
+    * Section 8 (Built-In Functions) of the GLSL ES 1.00 spec says:
+    *
+    *     "uniform lowp sampler2D sampler;
+    *     highp vec2 coord;
+    *     ...
+    *     lowp vec4 col = texture2D (sampler, coord);
+    *                                            // texture2D returns lowp"
+    *
+    * From this, we infer that GLSL 1.30 (and later) should allow precision
+    * qualifiers on sampler types just like float and integer types.
+    */
+   return (type->is_float()
+       || type->is_integer()
+       || type->contains_opaque())
+       && !type->without_array()->is_record();
+}
 
 const glsl_type *
 ast_type_specifier::glsl_type(const char **name,
@@ -2176,27 +2187,268 @@ ast_type_specifier::glsl_type(const char **name,
    return type;
 }
 
-const glsl_type *
-ast_fully_specified_type::glsl_type(const char **name,
-                                    struct _mesa_glsl_parse_state *state) const
+/**
+ * From the OpenGL ES 3.0 spec, 4.5.4 Default Precision Qualifiers:
+ *
+ * "The precision statement
+ *
+ *    precision precision-qualifier type;
+ *
+ *  can be used to establish a default precision qualifier. The type field can
+ *  be either int or float or any of the sampler types, (...) If type is float,
+ *  the directive applies to non-precision-qualified floating point type
+ *  (scalar, vector, and matrix) declarations. If type is int, the directive
+ *  applies to all non-precision-qualified integer type (scalar, vector, signed,
+ *  and unsigned) declarations."
+ *
+ * We use the symbol table to keep the values of the default precisions for
+ * each 'type' in each scope and we use the 'type' string from the precision
+ * statement as key in the symbol table. When we want to retrieve the default
+ * precision associated with a given glsl_type we need to know the type string
+ * associated with it. This is what this function returns.
+ */
+static const char *
+get_type_name_for_precision_qualifier(const glsl_type *type)
 {
-   const struct glsl_type *type = this->specifier->glsl_type(name, state);
-
-   if (type == NULL)
-      return NULL;
+   switch (type->base_type) {
+   case GLSL_TYPE_FLOAT:
+      return "float";
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+      return "int";
+   case GLSL_TYPE_ATOMIC_UINT:
+      return "atomic_uint";
+   case GLSL_TYPE_IMAGE:
+   /* fallthrough */
+   case GLSL_TYPE_SAMPLER: {
+      const unsigned type_idx =
+         type->sampler_array + 2 * type->sampler_shadow;
+      const unsigned offset = type->base_type == GLSL_TYPE_SAMPLER ? 0 : 4;
+      assert(type_idx < 4);
+      switch (type->sampler_type) {
+      case GLSL_TYPE_FLOAT:
+         switch (type->sampler_dimensionality) {
+         case GLSL_SAMPLER_DIM_1D: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "sampler1D", "sampler1DArray",
+              "sampler1DShadow", "sampler1DArrayShadow"
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_2D: {
+            static const char *const names[8] = {
+              "sampler2D", "sampler2DArray",
+              "sampler2DShadow", "sampler2DArrayShadow",
+              "image2D", "image2DArray", NULL, NULL
+            };
+            return names[offset + type_idx];
+         }
+         case GLSL_SAMPLER_DIM_3D: {
+            static const char *const names[8] = {
+              "sampler3D", NULL, NULL, NULL,
+              "image3D", NULL, NULL, NULL
+            };
+            return names[offset + type_idx];
+         }
+         case GLSL_SAMPLER_DIM_CUBE: {
+            static const char *const names[8] = {
+              "samplerCube", "samplerCubeArray",
+              "samplerCubeShadow", "samplerCubeArrayShadow",
+              "imageCube", NULL, NULL, NULL
+            };
+            return names[offset + type_idx];
+         }
+         case GLSL_SAMPLER_DIM_MS: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "sampler2DMS", "sampler2DMSArray", NULL, NULL
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_RECT: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "samplerRect", NULL, "samplerRectShadow", NULL
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_BUF: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "samplerBuffer", NULL, NULL, NULL
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_EXTERNAL: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "samplerExternalOES", NULL, NULL, NULL
+            };
+            return names[type_idx];
+         }
+         default:
+            unreachable("Unsupported sampler/image dimensionality");
+         } /* sampler/image float dimensionality */
+         break;
+      case GLSL_TYPE_INT:
+         switch (type->sampler_dimensionality) {
+         case GLSL_SAMPLER_DIM_1D: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "isampler1D", "isampler1DArray", NULL, NULL
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_2D: {
+            static const char *const names[8] = {
+              "isampler2D", "isampler2DArray", NULL, NULL,
+              "iimage2D", "iimage2DArray", NULL, NULL
+            };
+            return names[offset + type_idx];
+         }
+         case GLSL_SAMPLER_DIM_3D: {
+            static const char *const names[8] = {
+              "isampler3D", NULL, NULL, NULL,
+              "iimage3D", NULL, NULL, NULL
+            };
+            return names[offset + type_idx];
+         }
+         case GLSL_SAMPLER_DIM_CUBE: {
+            static const char *const names[8] = {
+              "isamplerCube", "isamplerCubeArray", NULL, NULL,
+              "iimageCube", NULL, NULL, NULL
+            };
+            return names[offset + type_idx];
+         }
+         case GLSL_SAMPLER_DIM_MS: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "isampler2DMS", "isampler2DMSArray", NULL, NULL
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_RECT: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "isamplerRect", NULL, "isamplerRectShadow", NULL
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_BUF: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "isamplerBuffer", NULL, NULL, NULL
+            };
+            return names[type_idx];
+         }
+         default:
+            unreachable("Unsupported isampler/iimage dimensionality");
+         } /* sampler/image int dimensionality */
+         break;
+      case GLSL_TYPE_UINT:
+         switch (type->sampler_dimensionality) {
+         case GLSL_SAMPLER_DIM_1D: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "usampler1D", "usampler1DArray", NULL, NULL
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_2D: {
+            static const char *const names[8] = {
+              "usampler2D", "usampler2DArray", NULL, NULL,
+              "uimage2D", "uimage2DArray", NULL, NULL
+            };
+            return names[offset + type_idx];
+         }
+         case GLSL_SAMPLER_DIM_3D: {
+            static const char *const names[8] = {
+              "usampler3D", NULL, NULL, NULL,
+              "uimage3D", NULL, NULL, NULL
+            };
+            return names[offset + type_idx];
+         }
+         case GLSL_SAMPLER_DIM_CUBE: {
+            static const char *const names[8] = {
+              "usamplerCube", "usamplerCubeArray", NULL, NULL,
+              "uimageCube", NULL, NULL, NULL
+            };
+            return names[offset + type_idx];
+         }
+         case GLSL_SAMPLER_DIM_MS: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "usampler2DMS", "usampler2DMSArray", NULL, NULL
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_RECT: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "usamplerRect", NULL, "usamplerRectShadow", NULL
+            };
+            return names[type_idx];
+         }
+         case GLSL_SAMPLER_DIM_BUF: {
+            assert(type->base_type == GLSL_TYPE_SAMPLER);
+            static const char *const names[4] = {
+              "usamplerBuffer", NULL, NULL, NULL
+            };
+            return names[type_idx];
+         }
+         default:
+            unreachable("Unsupported usampler/uimage dimensionality");
+         } /* sampler/image uint dimensionality */
+         break;
+      default:
+         unreachable("Unsupported sampler/image type");
+      } /* sampler/image type */
+      break;
+   } /* GLSL_TYPE_SAMPLER/GLSL_TYPE_IMAGE */
+   break;
+   default:
+      unreachable("Unsupported type");
+   } /* base type */
+}
 
-   if (type->base_type == GLSL_TYPE_FLOAT
-       && state->es_shader
-       && state->stage == MESA_SHADER_FRAGMENT
-       && this->qualifier.precision == ast_precision_none
-       && state->symbols->get_variable("#default precision") == NULL) {
-      YYLTYPE loc = this->get_location();
-      _mesa_glsl_error(&loc, state,
-                       "no precision specified this scope for type `%s'",
-                       type->name);
+static unsigned
+select_gles_precision(unsigned qual_precision,
+                      const glsl_type *type,
+                      struct _mesa_glsl_parse_state *state, YYLTYPE *loc)
+{
+   /* Precision qualifiers do not have any meaning in Desktop GLSL.
+    * In GLES we take the precision from the type qualifier if present,
+    * otherwise, if the type of the variable allows precision qualifiers at
+    * all, we look for the default precision qualifier for that type in the
+    * current scope.
+    */
+   assert(state->es_shader);
+
+   unsigned precision = GLSL_PRECISION_NONE;
+   if (qual_precision) {
+      precision = qual_precision;
+   } else if (precision_qualifier_allowed(type)) {
+      const char *type_name =
+         get_type_name_for_precision_qualifier(type->without_array());
+      assert(type_name != NULL);
+
+      precision =
+         state->symbols->get_default_precision_qualifier(type_name);
+      if (precision == ast_precision_none) {
+         _mesa_glsl_error(loc, state,
+                          "No precision specified in this scope for type `%s'",
+                          type->name);
+      }
    }
+   return precision;
+}
 
-   return type;
+const glsl_type *
+ast_fully_specified_type::glsl_type(const char **name,
+                                    struct _mesa_glsl_parse_state *state) const
+{
+   return this->specifier->glsl_type(name, state);
 }
 
 /**
@@ -2734,6 +2986,12 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
    if (qual->flags.q.sample)
       var->data.sample = 1;
 
+   /* Precision qualifiers do not hold any meaning in Desktop GLSL */
+   if (state->es_shader) {
+      var->data.precision =
+         select_gles_precision(qual->precision, var->type, state, loc);
+   }
+
    if (state->stage == MESA_SHADER_GEOMETRY &&
        qual->flags.q.out && qual->flags.q.stream) {
       var->data.stream = qual->stream;
@@ -2791,6 +3049,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
       var->data.mode = ir_var_uniform;
    else if (qual->flags.q.buffer)
       var->data.mode = ir_var_shader_storage;
+   else if (qual->flags.q.shared_storage)
+      var->data.mode = ir_var_shader_shared;
 
    if (!is_parameter && is_varying_var(var, state->stage)) {
       /* User-defined ins/outs are not permitted in compute shaders. */
@@ -3090,6 +3350,12 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
                        "members");
    }
 
+   if (qual->flags.q.shared_storage && state->stage != MESA_SHADER_COMPUTE) {
+      _mesa_glsl_error(loc, state,
+                       "the shared storage qualifiers can only be used with "
+                       "compute shaders");
+   }
+
    if (qual->flags.q.row_major || qual->flags.q.column_major) {
       validate_matrix_layout_for_type(state, loc, var->type, var);
    }
@@ -3642,42 +3908,6 @@ validate_identifier(const char *identifier, YYLTYPE loc,
    }
 }
 
-static bool
-precision_qualifier_allowed(const glsl_type *type)
-{
-   /* Precision qualifiers apply to floating point, integer and opaque
-    * types.
-    *
-    * Section 4.5.2 (Precision Qualifiers) of the GLSL 1.30 spec says:
-    *    "Any floating point or any integer declaration can have the type
-    *    preceded by one of these precision qualifiers [...] Literal
-    *    constants do not have precision qualifiers. Neither do Boolean
-    *    variables.
-    *
-    * Section 4.5 (Precision and Precision Qualifiers) of the GLSL 1.30
-    * spec also says:
-    *
-    *     "Precision qualifiers are added for code portability with OpenGL
-    *     ES, not for functionality. They have the same syntax as in OpenGL
-    *     ES."
-    *
-    * Section 8 (Built-In Functions) of the GLSL ES 1.00 spec says:
-    *
-    *     "uniform lowp sampler2D sampler;
-    *     highp vec2 coord;
-    *     ...
-    *     lowp vec4 col = texture2D (sampler, coord);
-    *                                            // texture2D returns lowp"
-    *
-    * From this, we infer that GLSL 1.30 (and later) should allow precision
-    * qualifiers on sampler types just like float and integer types.
-    */
-   return type->is_float()
-       || type->is_integer()
-       || type->is_record()
-       || type->contains_opaque();
-}
-
 ir_rvalue *
 ast_declarator_list::hir(exec_list *instructions,
                          struct _mesa_glsl_parse_state *state)
@@ -5750,20 +5980,10 @@ ast_type_specifier::hir(exec_list *instructions,
          return NULL;
       }
 
-      if (type->base_type == GLSL_TYPE_FLOAT
-          && state->es_shader
-          && state->stage == MESA_SHADER_FRAGMENT) {
+      if (state->es_shader) {
          /* Section 4.5.3 (Default Precision Qualifiers) of the GLSL ES 1.00
           * spec says:
           *
-          *     "The fragment language has no default precision qualifier for
-          *     floating point types."
-          *
-          * As a result, we have to track whether or not default precision has
-          * been specified for float in GLSL ES fragment shaders.
-          *
-          * Earlier in that same section, the spec says:
-          *
           *     "Non-precision qualified declarations will use the precision
           *     qualifier specified in the most recent precision statement
           *     that is still in scope. The precision statement has the same
@@ -5776,16 +5996,13 @@ ast_type_specifier::hir(exec_list *instructions,
           *     overriding earlier statements within that scope."
           *
           * Default precision specifications follow the same scope rules as
-          * variables.  So, we can track the state of the default float
-          * precision in the symbol table, and the rules will just work.  This
+          * variables.  So, we can track the state of the default precision
+          * qualifiers in the symbol table, and the rules will just work.  This
           * is a slight abuse of the symbol table, but it has the semantics
           * that we want.
           */
-         ir_variable *const junk =
-            new(state) ir_variable(type, "#default precision",
-                                   ir_var_auto);
-
-         state->symbols->add_variable(junk);
+         state->symbols->add_default_precision_qualifier(this->type_name,
+                                                         this->default_precision);
       }
 
       /* FINISHME: Translate precision statements into IR. */
@@ -5964,9 +6181,21 @@ ast_process_structure_or_interface_block(exec_list *instructions,
          fields[i].centroid = qual->flags.q.centroid ? 1 : 0;
          fields[i].sample = qual->flags.q.sample ? 1 : 0;
          fields[i].patch = qual->flags.q.patch ? 1 : 0;
+         fields[i].precision = qual->precision;
 
-         /* Only save explicitly defined streams in block's field */
-         fields[i].stream = qual->flags.q.explicit_stream ? qual->stream : -1;
+         /* From Section 4.4.2.3 (Geometry Outputs) of the GLSL 4.50 spec:
+          *
+          *   "A block member may be declared with a stream identifier, but
+          *   the specified stream must match the stream associated with the
+          *   containing block."
+          */
+         if (qual->flags.q.explicit_stream &&
+             qual->stream != layout->stream) {
+            _mesa_glsl_error(&loc, state, "stream layout qualifier on "
+                             "interface block member `%s' does not match "
+                             "the interface block (%d vs %d)",
+                             fields[i].name, qual->stream, layout->stream);
+         }
 
          if (qual->flags.q.row_major || qual->flags.q.column_major) {
             if (!qual->flags.q.uniform && !qual->flags.q.buffer) {
@@ -6268,18 +6497,6 @@ ast_interface_block::hir(exec_list *instructions,
 
    state->struct_specifier_depth--;
 
-   for (unsigned i = 0; i < num_variables; i++) {
-      if (fields[i].stream != -1 &&
-          (unsigned) fields[i].stream != this->layout.stream) {
-         _mesa_glsl_error(&loc, state,
-                          "stream layout qualifier on "
-                          "interface block member `%s' does not match "
-                          "the interface block (%d vs %d)",
-                          fields[i].name, fields[i].stream,
-                          this->layout.stream);
-      }
-   }
-
    if (!redeclaring_per_vertex) {
       validate_identifier(this->block_name, loc, state);
 
@@ -6646,6 +6863,13 @@ ast_interface_block::hir(exec_list *instructions,
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
             var->data.read_only = true;
 
+         /* Precision qualifiers do not have any meaning in Desktop GLSL */
+         if (state->es_shader) {
+            var->data.precision =
+               select_gles_precision(fields[i].precision, fields[i].type,
+                                     state, &loc);
+         }
+
          if (fields[i].matrix_layout == GLSL_MATRIX_LAYOUT_INHERITED) {
             var->data.matrix_layout = matrix_layout == GLSL_MATRIX_LAYOUT_INHERITED
                ? GLSL_MATRIX_LAYOUT_COLUMN_MAJOR : matrix_layout;
diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp
index 08a4504296b..79134c19893 100644
--- a/src/glsl/ast_type.cpp
+++ b/src/glsl/ast_type.cpp
@@ -85,7 +85,8 @@ ast_type_qualifier::has_storage() const
           || this->flags.q.in
           || this->flags.q.out
           || this->flags.q.uniform
-          || this->flags.q.buffer;
+          || this->flags.q.buffer
+          || this->flags.q.shared_storage;
 }
 
 bool
diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index 509a57b8813..13494446b59 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -459,9 +459,15 @@ fp64(const _mesa_glsl_parse_state *state)
 }
 
 static bool
+compute_shader(const _mesa_glsl_parse_state *state)
+{
+   return state->stage == MESA_SHADER_COMPUTE;
+}
+
+static bool
 barrier_supported(const _mesa_glsl_parse_state *state)
 {
-   return state->stage == MESA_SHADER_COMPUTE ||
+   return compute_shader(state) ||
           state->stage == MESA_SHADER_TESS_CTRL;
 }
 
@@ -785,8 +791,8 @@ private:
 
    ir_function_signature *_memory_barrier_intrinsic(
       builtin_available_predicate avail);
-   ir_function_signature *_memory_barrier(
-      builtin_available_predicate avail);
+   ir_function_signature *_memory_barrier(const char *intrinsic_name,
+                                          builtin_available_predicate avail);
 
    ir_function_signature *_shader_clock_intrinsic(builtin_available_predicate avail,
                                                   const glsl_type *type);
@@ -963,6 +969,21 @@ builtin_builder::create_intrinsics()
    add_function("__intrinsic_memory_barrier",
                 _memory_barrier_intrinsic(shader_image_load_store),
                 NULL);
+   add_function("__intrinsic_group_memory_barrier",
+                _memory_barrier_intrinsic(compute_shader),
+                NULL);
+   add_function("__intrinsic_memory_barrier_atomic_counter",
+                _memory_barrier_intrinsic(compute_shader),
+                NULL);
+   add_function("__intrinsic_memory_barrier_buffer",
+                _memory_barrier_intrinsic(compute_shader),
+                NULL);
+   add_function("__intrinsic_memory_barrier_image",
+                _memory_barrier_intrinsic(compute_shader),
+                NULL);
+   add_function("__intrinsic_memory_barrier_shared",
+                _memory_barrier_intrinsic(compute_shader),
+                NULL);
 
    add_function("__intrinsic_shader_clock",
                 _shader_clock_intrinsic(shader_clock,
@@ -2754,7 +2775,28 @@ builtin_builder::create_builtins()
    add_image_functions(true);
 
    add_function("memoryBarrier",
-                _memory_barrier(shader_image_load_store),
+                _memory_barrier("__intrinsic_memory_barrier",
+                                shader_image_load_store),
+                NULL);
+   add_function("groupMemoryBarrier",
+                _memory_barrier("__intrinsic_group_memory_barrier",
+                                compute_shader),
+                NULL);
+   add_function("memoryBarrierAtomicCounter",
+                _memory_barrier("__intrinsic_memory_barrier_atomic_counter",
+                                compute_shader),
+                NULL);
+   add_function("memoryBarrierBuffer",
+                _memory_barrier("__intrinsic_memory_barrier_buffer",
+                                compute_shader),
+                NULL);
+   add_function("memoryBarrierImage",
+                _memory_barrier("__intrinsic_memory_barrier_image",
+                                compute_shader),
+                NULL);
+   add_function("memoryBarrierShared",
+                _memory_barrier("__intrinsic_memory_barrier_shared",
+                                compute_shader),
                 NULL);
 
    add_function("clock2x32ARB",
@@ -5264,10 +5306,11 @@ builtin_builder::_memory_barrier_intrinsic(builtin_available_predicate avail)
 }
 
 ir_function_signature *
-builtin_builder::_memory_barrier(builtin_available_predicate avail)
+builtin_builder::_memory_barrier(const char *intrinsic_name,
+                                 builtin_available_predicate avail)
 {
    MAKE_SIG(glsl_type::void_type, avail, 0);
-   body.emit(call(shader->symbols->get_function("__intrinsic_memory_barrier"),
+   body.emit(call(shader->symbols->get_function(intrinsic_name),
                   NULL, sig->parameters));
    return sig;
 }
diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index c30fb9226e5..b06c1bc5c12 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -1059,6 +1059,9 @@ builtin_variable_generator::generate_fs_special_vars()
       var = add_input(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex");
       var->data.interpolation = INTERP_QUALIFIER_FLAT;
    }
+
+   if (state->is_version(450, 310)/* || state->ARB_ES3_1_compatibility_enable*/)
+      add_system_value(SYSTEM_VALUE_HELPER_INVOCATION, bool_t, "gl_HelperInvocation");
 }
 
 
diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index 4acccf74065..6aa7abec00e 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2387,6 +2387,7 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	   }
 	} else {
 	   add_builtin_define(parser, "GL_ARB_draw_buffers", 1);
+           add_builtin_define(parser, "GL_ARB_enhanced_layouts", 1);
            add_builtin_define(parser, "GL_ARB_separate_shader_objects", 1);
 	   add_builtin_define(parser, "GL_ARB_texture_rectangle", 1);
            add_builtin_define(parser, "GL_AMD_shader_trinary_minmax", 1);
diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll
index 21428177c97..e59f93e10ef 100644
--- a/src/glsl/glsl_lexer.ll
+++ b/src/glsl/glsl_lexer.ll
@@ -414,6 +414,8 @@ writeonly      KEYWORD_WITH_ALT(420, 300, 420, 310, yyextra->ARB_shader_image_lo
 
 atomic_uint     KEYWORD_WITH_ALT(420, 300, 420, 310, yyextra->ARB_shader_atomic_counters_enable, ATOMIC_UINT);
 
+shared          KEYWORD_WITH_ALT(430, 310, 430, 310, yyextra->ARB_compute_shader_enable, SHARED);
+
 struct		return STRUCT;
 void		return VOID_TOK;
 
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 4636435f191..adf6a05acce 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -165,6 +165,7 @@ static bool match_layout_qualifier(const char *s1, const char *s2,
 %token IMAGE1DSHADOW IMAGE2DSHADOW IMAGE1DARRAYSHADOW IMAGE2DARRAYSHADOW
 %token COHERENT VOLATILE RESTRICT READONLY WRITEONLY
 %token ATOMIC_UINT
+%token SHARED
 %token STRUCT VOID_TOK WHILE
 %token <identifier> IDENTIFIER TYPE_IDENTIFIER NEW_IDENTIFIER
 %type <identifier> any_identifier
@@ -312,6 +313,18 @@ translation_unit:
    {
       delete state->symbols;
       state->symbols = new(ralloc_parent(state)) glsl_symbol_table;
+      if (state->es_shader) {
+         if (state->stage == MESA_SHADER_FRAGMENT) {
+            state->symbols->add_default_precision_qualifier("int", ast_precision_medium);
+         } else {
+            state->symbols->add_default_precision_qualifier("float", ast_precision_high);
+            state->symbols->add_default_precision_qualifier("int", ast_precision_high);
+         }
+         state->symbols->add_default_precision_qualifier("sampler2D", ast_precision_low);
+         state->symbols->add_default_precision_qualifier("samplerExternalOES", ast_precision_low);
+         state->symbols->add_default_precision_qualifier("samplerCube", ast_precision_low);
+         state->symbols->add_default_precision_qualifier("atomic_uint", ast_precision_high);
+      }
       _mesa_glsl_initialize_types(state);
    }
    ;
@@ -1639,6 +1652,11 @@ interface_block_layout_qualifier:
       memset(& $$, 0, sizeof($$));
       $$.flags.q.packed = 1;
    }
+   | SHARED
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.shared = 1;
+   }
    ;
 
 subroutine_qualifier:
@@ -1929,6 +1947,11 @@ storage_qualifier:
       memset(& $$, 0, sizeof($$));
       $$.flags.q.buffer = 1;
    }
+   | SHARED
+   {
+      memset(& $$, 0, sizeof($$));
+      $$.flags.q.shared_storage = 1;
+   }
    ;
 
 memory_qualifier:
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index f856a200e09..02584c62a4d 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -596,6 +596,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(ARB_derivative_control,           true,  false,     ARB_derivative_control),
    EXT(ARB_draw_buffers,                 true,  false,     dummy_true),
    EXT(ARB_draw_instanced,               true,  false,     ARB_draw_instanced),
+   EXT(ARB_enhanced_layouts,             true,  false,     ARB_enhanced_layouts),
    EXT(ARB_explicit_attrib_location,     true,  false,     ARB_explicit_attrib_location),
    EXT(ARB_explicit_uniform_location,    true,  false,     ARB_explicit_uniform_location),
    EXT(ARB_fragment_coord_conventions,   true,  false,     ARB_fragment_coord_conventions),
@@ -635,7 +636,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
     */
    EXT(OES_EGL_image_external,         false, true,      OES_EGL_image_external),
    EXT(OES_standard_derivatives,       false, true,      OES_standard_derivatives),
-   EXT(OES_texture_3D,                 false, true,      EXT_texture3D),
+   EXT(OES_texture_3D,                 false, true,      dummy_true),
    EXT(OES_texture_storage_multisample_2d_array, false, true, ARB_texture_multisample),
 
    /* All other extensions go here, sorted alphabetically.
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index b54c5359149..1d8c1b8799f 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -209,6 +209,11 @@ struct _mesa_glsl_parse_state {
       return ARB_shader_atomic_counters_enable || is_version(420, 310);
    }
 
+   bool has_enhanced_layouts() const
+   {
+      return ARB_enhanced_layouts_enable || is_version(440, 0);
+   }
+
    bool has_explicit_attrib_stream() const
    {
       return ARB_gpu_shader5_enable || is_version(400, 0);
@@ -499,6 +504,8 @@ struct _mesa_glsl_parse_state {
    bool ARB_draw_buffers_warn;
    bool ARB_draw_instanced_enable;
    bool ARB_draw_instanced_warn;
+   bool ARB_enhanced_layouts_enable;
+   bool ARB_enhanced_layouts_warn;
    bool ARB_explicit_attrib_location_enable;
    bool ARB_explicit_attrib_location_warn;
    bool ARB_explicit_uniform_location_enable;
diff --git a/src/glsl/glsl_symbol_table.cpp b/src/glsl/glsl_symbol_table.cpp
index 536f0a3a8c2..6c682acf560 100644
--- a/src/glsl/glsl_symbol_table.cpp
+++ b/src/glsl/glsl_symbol_table.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "glsl_symbol_table.h"
+#include "ast.h"
 
 class symbol_table_entry {
 public:
@@ -201,6 +202,20 @@ bool glsl_symbol_table::add_function(ir_function *f)
    return _mesa_symbol_table_add_symbol(table, -1, f->name, entry) == 0;
 }
 
+bool glsl_symbol_table::add_default_precision_qualifier(const char *type_name,
+                                                        int precision)
+{
+   char *name = ralloc_asprintf(mem_ctx, "#default_precision_%s", type_name);
+
+   ast_type_specifier *default_specifier = new(mem_ctx) ast_type_specifier(name);
+   default_specifier->default_precision = precision;
+
+   symbol_table_entry *entry =
+      new(mem_ctx) symbol_table_entry(default_specifier);
+
+   return _mesa_symbol_table_add_symbol(table, -1, name, entry) == 0;
+}
+
 void glsl_symbol_table::add_global_function(ir_function *f)
 {
    symbol_table_entry *entry = new(mem_ctx) symbol_table_entry(f);
@@ -234,6 +249,15 @@ ir_function *glsl_symbol_table::get_function(const char *name)
    return entry != NULL ? entry->f : NULL;
 }
 
+int glsl_symbol_table::get_default_precision_qualifier(const char *type_name)
+{
+   char *name = ralloc_asprintf(mem_ctx, "#default_precision_%s", type_name);
+   symbol_table_entry *entry = get_entry(name);
+   if (!entry)
+      return ast_precision_none;
+   return entry->a->default_precision;
+}
+
 symbol_table_entry *glsl_symbol_table::get_entry(const char *name)
 {
    return (symbol_table_entry *)
diff --git a/src/glsl/glsl_symbol_table.h b/src/glsl/glsl_symbol_table.h
index e32b88b8699..5d654e5e6a7 100644
--- a/src/glsl/glsl_symbol_table.h
+++ b/src/glsl/glsl_symbol_table.h
@@ -72,6 +72,7 @@ struct glsl_symbol_table {
    bool add_function(ir_function *f);
    bool add_interface(const char *name, const glsl_type *i,
                       enum ir_variable_mode mode);
+   bool add_default_precision_qualifier(const char *type_name, int precision);
    /*@}*/
 
    /**
@@ -88,6 +89,7 @@ struct glsl_symbol_table {
    ir_function *get_function(const char *name);
    const glsl_type *get_interface(const char *name,
                                   enum ir_variable_mode mode);
+   int get_default_precision_qualifier(const char *type_name);
    /*@}*/
 
    /**
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index 9c9f22d018b..d59dee1e369 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -322,18 +322,19 @@ protected:
  * Variable storage classes
  */
 enum ir_variable_mode {
-   ir_var_auto = 0,     /**< Function local variables and globals. */
-   ir_var_uniform,      /**< Variable declared as a uniform. */
-   ir_var_shader_storage,   /**< Variable declared as an ssbo. */
+   ir_var_auto = 0,             /**< Function local variables and globals. */
+   ir_var_uniform,              /**< Variable declared as a uniform. */
+   ir_var_shader_storage,       /**< Variable declared as an ssbo. */
+   ir_var_shader_shared,        /**< Variable declared as shared. */
    ir_var_shader_in,
    ir_var_shader_out,
    ir_var_function_in,
    ir_var_function_out,
    ir_var_function_inout,
-   ir_var_const_in,	/**< "in" param that must be a constant expression */
-   ir_var_system_value, /**< Ex: front-face, instance-id, etc. */
-   ir_var_temporary,	/**< Temporary variable generated during compilation. */
-   ir_var_mode_count	/**< Number of variable modes */
+   ir_var_const_in,             /**< "in" param that must be a constant expression */
+   ir_var_system_value,         /**< Ex: front-face, instance-id, etc. */
+   ir_var_temporary,            /**< Temporary variable generated during compilation. */
+   ir_var_mode_count            /**< Number of variable modes */
 };
 
 /**
@@ -770,6 +771,19 @@ public:
       unsigned index:1;
 
       /**
+       * Precision qualifier.
+       *
+       * In desktop GLSL we do not care about precision qualifiers at all, in
+       * fact, the spec says that precision qualifiers are ignored.
+       *
+       * To make things easy, we make it so that this field is always
+       * GLSL_PRECISION_NONE on desktop shaders. This way all the variables
+       * have the same precision value and the checks we add in the compiler
+       * for this field will never break a desktop shader compile.
+       */
+      unsigned precision:2;
+
+      /**
        * \brief Layout qualifier for gl_FragDepth.
        *
        * This is not equal to \c ir_depth_layout_none if and only if this
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index ce5c4929bfb..2fee81c09c2 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -124,11 +124,12 @@ bool lower_const_arrays_to_uniforms(exec_list *instructions);
 bool lower_clip_distance(gl_shader *shader);
 void lower_output_reads(unsigned stage, exec_list *instructions);
 bool lower_packing_builtins(exec_list *instructions, int op_mask);
-void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions);
+void lower_ubo_reference(struct gl_shader *shader);
 void lower_packed_varyings(void *mem_ctx,
                            unsigned locations_used, ir_variable_mode mode,
                            unsigned gs_input_vertices, gl_shader *shader);
 bool lower_vector_insert(exec_list *instructions, bool lower_nonconstant_index);
+bool lower_vector_derefs(gl_shader *shader);
 void lower_named_interface_blocks(void *mem_ctx, gl_shader *shader);
 bool optimize_redundant_jumps(exec_list *instructions);
 bool optimize_split_arrays(exec_list *instructions, bool linked);
diff --git a/src/glsl/ir_print_visitor.cpp b/src/glsl/ir_print_visitor.cpp
index b9196900ea6..42b03fdea52 100644
--- a/src/glsl/ir_print_visitor.cpp
+++ b/src/glsl/ir_print_visitor.cpp
@@ -173,8 +173,8 @@ void ir_print_visitor::visit(ir_variable *ir)
    const char *const samp = (ir->data.sample) ? "sample " : "";
    const char *const patc = (ir->data.patch) ? "patch " : "";
    const char *const inv = (ir->data.invariant) ? "invariant " : "";
-   const char *const mode[] = { "", "uniform ", "shader_storage",
-                                "shader_in ", "shader_out ",
+   const char *const mode[] = { "", "uniform ", "shader_storage ",
+                                "shader_shared ", "shader_in ", "shader_out ",
                                 "in ", "out ", "inout ",
 			        "const_in ", "sys ", "temporary " };
    STATIC_ASSERT(ARRAY_SIZE(mode) == ir_var_mode_count);
diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp
index 935571ae1d6..e63b5c318e3 100644
--- a/src/glsl/ir_validate.cpp
+++ b/src/glsl/ir_validate.cpp
@@ -110,9 +110,10 @@ ir_validate::visit(ir_dereference_variable *ir)
 ir_visitor_status
 ir_validate::visit_enter(class ir_dereference_array *ir)
 {
-   if (!ir->array->type->is_array() && !ir->array->type->is_matrix()) {
-      printf("ir_dereference_array @ %p does not specify an array or a "
-             "matrix\n",
+   if (!ir->array->type->is_array() && !ir->array->type->is_matrix() &&
+      !ir->array->type->is_vector()) {
+      printf("ir_dereference_array @ %p does not specify an array, a vector "
+             "or a matrix\n",
              (void *) ir);
       ir->print();
       printf("\n");
diff --git a/src/glsl/link_atomics.cpp b/src/glsl/link_atomics.cpp
index cdcc06d53e2..3aa52dbd18a 100644
--- a/src/glsl/link_atomics.cpp
+++ b/src/glsl/link_atomics.cpp
@@ -240,6 +240,8 @@ link_assign_atomic_counter_resources(struct gl_context *ctx,
          storage->offset = var->data.atomic.offset;
          storage->array_stride = (var->type->is_array() ?
                                   var->type->without_array()->atomic_size() : 0);
+         if (!var->type->is_matrix())
+            storage->matrix_stride = 0;
       }
 
       /* Assign stage-specific fields. */
diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp
index 35b9f9c6017..cdc1d3ac7be 100644
--- a/src/glsl/link_uniform_initializers.cpp
+++ b/src/glsl/link_uniform_initializers.cpp
@@ -179,7 +179,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding)
 
       /* This is a field of a UBO.  val is the binding index. */
       for (int i = 0; i < MESA_SHADER_STAGES; i++) {
-         int stage_index = prog->UniformBlockStageIndex[i][block_index];
+         int stage_index = prog->InterfaceBlockStageIndex[i][block_index];
 
          if (stage_index != -1) {
             struct gl_shader *sh = prog->_LinkedShaders[i];
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index c35d87acea6..db00f8febc6 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -1174,10 +1174,10 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       struct gl_shader *sh = prog->_LinkedShaders[i];
 
-      prog->UniformBlockStageIndex[i] = ralloc_array(prog, int,
-						     max_num_uniform_blocks);
+      prog->InterfaceBlockStageIndex[i] = ralloc_array(prog, int,
+                                                       max_num_uniform_blocks);
       for (unsigned int j = 0; j < max_num_uniform_blocks; j++)
-	 prog->UniformBlockStageIndex[i][j] = -1;
+	 prog->InterfaceBlockStageIndex[i][j] = -1;
 
       if (sh == NULL)
 	 continue;
@@ -1194,7 +1194,7 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
 	    return false;
 	 }
 
-	 prog->UniformBlockStageIndex[i][index] = j;
+	 prog->InterfaceBlockStageIndex[i][index] = j;
       }
    }
 
@@ -2836,9 +2836,9 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
       }
 
       for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
-	 if (prog->UniformBlockStageIndex[j][i] != -1) {
+	 if (prog->InterfaceBlockStageIndex[j][i] != -1) {
             struct gl_shader *sh = prog->_LinkedShaders[j];
-            int stage_index = prog->UniformBlockStageIndex[j][i];
+            int stage_index = prog->InterfaceBlockStageIndex[j][i];
             if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) {
                shader_blocks[j]++;
                total_shader_storage_blocks++;
@@ -2955,7 +2955,7 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
          total_image_units += sh->NumImages;
 
          for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
-            int stage_index = prog->UniformBlockStageIndex[i][j];
+            int stage_index = prog->InterfaceBlockStageIndex[i][j];
             if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage)
                total_shader_storage_blocks++;
          }
@@ -3734,7 +3734,7 @@ build_program_resource_list(struct gl_shader_program *shProg)
       int block_index = shProg->UniformStorage[i].block_index;
       if (block_index != -1) {
          for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
-             if (shProg->UniformBlockStageIndex[j][block_index] != -1)
+             if (shProg->InterfaceBlockStageIndex[j][block_index] != -1)
                 stageref |= (1 << j);
          }
       }
@@ -3776,7 +3776,8 @@ build_program_resource_list(struct gl_shader_program *shProg)
          continue;
 
       for (int j = MESA_SHADER_VERTEX; j < MESA_SHADER_STAGES; j++) {
-         if (!shProg->UniformStorage[i].opaque[j].active)
+         if (!shProg->UniformStorage[i].opaque[j].active ||
+             !shProg->UniformStorage[i].type->is_subroutine())
             continue;
 
          type = _mesa_shader_stage_to_subroutine_uniform((gl_shader_stage)j);
@@ -3799,11 +3800,6 @@ build_program_resource_list(struct gl_shader_program *shProg)
             return;
       }
    }
-
-   /* TODO - following extensions will require more resource types:
-    *
-    *    GL_ARB_shader_storage_buffer_object
-    */
 }
 
 /**
@@ -4449,6 +4445,16 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
 
    /* FINISHME: Assign fragment shader output locations. */
 
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      if (prog->_LinkedShaders[i] == NULL)
+	 continue;
+
+      if (ctx->Const.ShaderCompilerOptions[i].LowerBufferInterfaceBlocks)
+         lower_ubo_reference(prog->_LinkedShaders[i]);
+
+      lower_vector_derefs(prog->_LinkedShaders[i]);
+   }
+
 done:
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       free(shader_list[i]);
diff --git a/src/glsl/lower_packed_varyings.cpp b/src/glsl/lower_packed_varyings.cpp
index 5d66ca931cf..037c27d88ab 100644
--- a/src/glsl/lower_packed_varyings.cpp
+++ b/src/glsl/lower_packed_varyings.cpp
@@ -621,6 +621,7 @@ lower_packed_varyings_visitor::get_packed_varying_deref(
       packed_var->data.patch = unpacked_var->data.patch;
       packed_var->data.interpolation = unpacked_var->data.interpolation;
       packed_var->data.location = location;
+      packed_var->data.precision = unpacked_var->data.precision;
       unpacked_var->insert_before(packed_var);
       this->packed_varyings[slot] = packed_var;
    } else {
diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp
index 57a242b4074..b74aa3d0630 100644
--- a/src/glsl/lower_ubo_reference.cpp
+++ b/src/glsl/lower_ubo_reference.cpp
@@ -390,7 +390,19 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var,
       case ir_type_dereference_array: {
          ir_dereference_array *deref_array = (ir_dereference_array *) deref;
          unsigned array_stride;
-         if (deref_array->array->type->is_matrix() && *row_major) {
+         if (deref_array->array->type->is_vector()) {
+            /* We get this when storing or loading a component out of a vector
+             * with a non-constant index. This happens for v[i] = f where v is
+             * a vector (or m[i][j] = f where m is a matrix). If we don't
+             * lower that here, it gets turned into v = vector_insert(v, i,
+             * f), which loads the entire vector, modifies one component and
+             * then write the entire thing back.  That breaks if another
+             * thread or SIMD channel is modifying the same vector.
+             */
+            array_stride = 4;
+            if (deref_array->array->type->is_double())
+               array_stride *= 2;
+         } else if (deref_array->array->type->is_matrix() && *row_major) {
             /* When loading a vector out of a row major matrix, the
              * step between the columns (vectors) is the size of a
              * float, while the step between the rows (elements of a
@@ -1270,7 +1282,7 @@ lower_ubo_reference_visitor::visit_enter(ir_call *ir)
 } /* unnamed namespace */
 
 void
-lower_ubo_reference(struct gl_shader *shader, exec_list *instructions)
+lower_ubo_reference(struct gl_shader *shader)
 {
    lower_ubo_reference_visitor v(shader);
 
@@ -1281,6 +1293,6 @@ lower_ubo_reference(struct gl_shader *shader, exec_list *instructions)
     */
    do {
       v.progress = false;
-      visit_list_elements(&v, instructions);
+      visit_list_elements(&v, shader->ir);
    } while (v.progress);
 }
diff --git a/src/glsl/lower_vector_derefs.cpp b/src/glsl/lower_vector_derefs.cpp
new file mode 100644
index 00000000000..4a5d6f0da4c
--- /dev/null
+++ b/src/glsl/lower_vector_derefs.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include "ir.h"
+#include "ir_builder.h"
+#include "ir_rvalue_visitor.h"
+#include "ir_optimization.h"
+
+using namespace ir_builder;
+
+namespace {
+
+class vector_deref_visitor : public ir_rvalue_enter_visitor {
+public:
+   vector_deref_visitor()
+      : progress(false)
+   {
+   }
+
+   virtual ~vector_deref_visitor()
+   {
+   }
+
+   virtual void handle_rvalue(ir_rvalue **rv);
+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+
+   bool progress;
+};
+
+} /* anonymous namespace */
+
+ir_visitor_status
+vector_deref_visitor::visit_enter(ir_assignment *ir)
+{
+   if (!ir->lhs || ir->lhs->ir_type != ir_type_dereference_array)
+      return ir_rvalue_enter_visitor::visit_enter(ir);
+
+   ir_dereference_array *const deref = (ir_dereference_array *) ir->lhs;
+   if (!deref->array->type->is_vector())
+      return ir_rvalue_enter_visitor::visit_enter(ir);
+
+   ir_dereference *const new_lhs = (ir_dereference *) deref->array;
+   ir->set_lhs(new_lhs);
+
+   ir_constant *old_index_constant = deref->array_index->constant_expression_value();
+   void *mem_ctx = ralloc_parent(ir);
+   if (!old_index_constant) {
+      ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert,
+                                           new_lhs->type,
+                                           new_lhs->clone(mem_ctx, NULL),
+                                           ir->rhs,
+                                           deref->array_index);
+      ir->write_mask = (1 << new_lhs->type->vector_elements) - 1;
+   } else {
+      ir->write_mask = 1 << old_index_constant->get_int_component(0);
+   }
+
+   return ir_rvalue_enter_visitor::visit_enter(ir);
+}
+
+void
+vector_deref_visitor::handle_rvalue(ir_rvalue **rv)
+{
+   if (*rv == NULL || (*rv)->ir_type != ir_type_dereference_array)
+      return;
+
+   ir_dereference_array *const deref = (ir_dereference_array *) *rv;
+   if (!deref->array->type->is_vector())
+      return;
+
+   void *mem_ctx = ralloc_parent(deref);
+   *rv = new(mem_ctx) ir_expression(ir_binop_vector_extract,
+                                    deref->array,
+                                    deref->array_index);
+}
+
+bool
+lower_vector_derefs(gl_shader *shader)
+{
+   vector_deref_visitor v;
+
+   visit_list_elements(&v, shader->ir);
+
+   return v.progress;
+}
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index ba14bbbeb6a..d8df3544f10 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -27,6 +27,7 @@
 
 #include "glsl_to_nir.h"
 #include "nir_control_flow.h"
+#include "nir_builder.h"
 #include "ir_visitor.h"
 #include "ir_hierarchical_visitor.h"
 #include "ir.h"
@@ -73,14 +74,14 @@ public:
 private:
    void create_overload(ir_function_signature *ir, nir_function *function);
    void add_instr(nir_instr *instr, unsigned num_components);
-   nir_src evaluate_rvalue(ir_rvalue *ir);
+   nir_ssa_def *evaluate_rvalue(ir_rvalue *ir);
 
-   nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_src *srcs);
-   nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_src src1);
-   nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_src src1,
-                       nir_src src2);
-   nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_src src1,
-                       nir_src src2, nir_src src3);
+   nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def **srcs);
+   nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def *src1);
+   nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def *src1,
+                       nir_ssa_def *src2);
+   nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def *src1,
+                       nir_ssa_def *src2, nir_ssa_def *src3);
 
    bool supports_ints;
 
@@ -88,8 +89,8 @@ private:
 
    nir_shader *shader;
    nir_function_impl *impl;
-   exec_list *cf_node_list;
-   nir_instr *result; /* result of the expression tree last visited */
+   nir_builder b;
+   nir_ssa_def *result; /* result of the expression tree last visited */
 
    nir_deref_var *evaluate_deref(nir_instr *mem_ctx, ir_instruction *ir);
 
@@ -162,6 +163,8 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
    shader->info.num_images = sh->NumImages;
    shader->info.inputs_read = sh->Program->InputsRead;
    shader->info.outputs_written = sh->Program->OutputsWritten;
+   shader->info.patch_inputs_read = sh->Program->PatchInputsRead;
+   shader->info.patch_outputs_written = sh->Program->PatchOutputsWritten;
    shader->info.system_values_read = sh->Program->SystemValuesRead;
    shader->info.uses_texture_gather = sh->Program->UsesGather;
    shader->info.uses_clip_distance_out =
@@ -537,7 +540,8 @@ nir_visitor::visit(ir_function_signature *ir)
 
       this->is_global = false;
 
-      this->cf_node_list = &impl->body;
+      nir_builder_init(&b, impl);
+      b.cursor = nir_after_cf_list(&impl->body);
       visit_exec_list(&ir->body, this);
 
       this->is_global = true;
@@ -549,34 +553,31 @@ nir_visitor::visit(ir_function_signature *ir)
 void
 nir_visitor::visit(ir_loop *ir)
 {
-   exec_list *old_list = this->cf_node_list;
-
    nir_loop *loop = nir_loop_create(this->shader);
-   nir_cf_node_insert_end(old_list, &loop->cf_node);
-   this->cf_node_list = &loop->body;
-   visit_exec_list(&ir->body_instructions, this);
+   nir_builder_cf_insert(&b, &loop->cf_node);
 
-   this->cf_node_list = old_list;
+   b.cursor = nir_after_cf_list(&loop->body);
+   visit_exec_list(&ir->body_instructions, this);
+   b.cursor = nir_after_cf_node(&loop->cf_node);
 }
 
 void
 nir_visitor::visit(ir_if *ir)
 {
-   nir_src condition = evaluate_rvalue(ir->condition);
-
-   exec_list *old_list = this->cf_node_list;
+   nir_src condition =
+      nir_src_for_ssa(evaluate_rvalue(ir->condition));
 
    nir_if *if_stmt = nir_if_create(this->shader);
    if_stmt->condition = condition;
-   nir_cf_node_insert_end(old_list, &if_stmt->cf_node);
+   nir_builder_cf_insert(&b, &if_stmt->cf_node);
 
-   this->cf_node_list = &if_stmt->then_list;
+   b.cursor = nir_after_cf_list(&if_stmt->then_list);
    visit_exec_list(&ir->then_instructions, this);
 
-   this->cf_node_list = &if_stmt->else_list;
+   b.cursor = nir_after_cf_list(&if_stmt->else_list);
    visit_exec_list(&ir->else_instructions, this);
 
-   this->cf_node_list = old_list;
+   b.cursor = nir_after_cf_node(&if_stmt->cf_node);
 }
 
 void
@@ -593,11 +594,13 @@ nir_visitor::visit(ir_discard *ir)
    if (ir->condition) {
       discard = nir_intrinsic_instr_create(this->shader,
                                            nir_intrinsic_discard_if);
-      discard->src[0] = evaluate_rvalue(ir->condition);
+      discard->src[0] =
+         nir_src_for_ssa(evaluate_rvalue(ir->condition));
    } else {
       discard = nir_intrinsic_instr_create(this->shader, nir_intrinsic_discard);
    }
-   nir_instr_insert_after_cf_list(this->cf_node_list, &discard->instr);
+
+   nir_builder_instr_insert(&b, &discard->instr);
 }
 
 void
@@ -606,7 +609,7 @@ nir_visitor::visit(ir_emit_vertex *ir)
    nir_intrinsic_instr *instr =
       nir_intrinsic_instr_create(this->shader, nir_intrinsic_emit_vertex);
    instr->const_index[0] = ir->stream_id();
-   nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(&b, &instr->instr);
 }
 
 void
@@ -615,7 +618,7 @@ nir_visitor::visit(ir_end_primitive *ir)
    nir_intrinsic_instr *instr =
       nir_intrinsic_instr_create(this->shader, nir_intrinsic_end_primitive);
    instr->const_index[0] = ir->stream_id();
-   nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(&b, &instr->instr);
 }
 
 void
@@ -634,7 +637,7 @@ nir_visitor::visit(ir_loop_jump *ir)
    }
 
    nir_jump_instr *instr = nir_jump_instr_create(this->shader, type);
-   nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(&b, &instr->instr);
 }
 
 void
@@ -649,7 +652,7 @@ nir_visitor::visit(ir_return *ir)
    }
 
    nir_jump_instr *instr = nir_jump_instr_create(this->shader, nir_jump_return);
-   nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(&b, &instr->instr);
 }
 
 void
@@ -723,6 +726,16 @@ nir_visitor::visit(ir_call *ir)
          op = nir_intrinsic_ssbo_atomic_comp_swap;
       } else if (strcmp(ir->callee_name(), "__intrinsic_shader_clock") == 0) {
          op = nir_intrinsic_shader_clock;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_group_memory_barrier") == 0) {
+         op = nir_intrinsic_group_memory_barrier;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_atomic_counter") == 0) {
+         op = nir_intrinsic_memory_barrier_atomic_counter;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_buffer") == 0) {
+         op = nir_intrinsic_memory_barrier_buffer;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_image") == 0) {
+         op = nir_intrinsic_memory_barrier_image;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_shared") == 0) {
+         op = nir_intrinsic_memory_barrier_shared;
       } else {
          unreachable("not reached");
       }
@@ -738,7 +751,7 @@ nir_visitor::visit(ir_call *ir)
             (ir_dereference *) ir->actual_parameters.get_head();
          instr->variables[0] = evaluate_deref(&instr->instr, param);
          nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
-         nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+         nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
       case nir_intrinsic_image_load:
@@ -755,8 +768,7 @@ nir_visitor::visit(ir_call *ir)
       case nir_intrinsic_image_size: {
          nir_ssa_undef_instr *instr_undef =
             nir_ssa_undef_instr_create(shader, 1);
-         nir_instr_insert_after_cf_list(this->cf_node_list,
-                                        &instr_undef->instr);
+         nir_builder_instr_insert(&b, &instr_undef->instr);
 
          /* Set the image variable dereference. */
          exec_node *param = ir->actual_parameters.get_head();
@@ -777,35 +789,33 @@ nir_visitor::visit(ir_call *ir)
 
          if (op == nir_intrinsic_image_size ||
              op == nir_intrinsic_image_samples) {
-            nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+            nir_builder_instr_insert(&b, &instr->instr);
             break;
          }
 
          /* Set the address argument, extending the coordinate vector to four
           * components.
           */
-         const nir_src src_addr = evaluate_rvalue((ir_dereference *)param);
-         nir_alu_instr *instr_addr = nir_alu_instr_create(shader, nir_op_vec4);
-         nir_ssa_dest_init(&instr_addr->instr, &instr_addr->dest.dest, 4, NULL);
+         nir_ssa_def *src_addr =
+            evaluate_rvalue((ir_dereference *)param);
+         nir_ssa_def *srcs[4];
 
          for (int i = 0; i < 4; i++) {
-            if (i < type->coordinate_components()) {
-               instr_addr->src[i].src = src_addr;
-               instr_addr->src[i].swizzle[0] = i;
-            } else {
-               instr_addr->src[i].src = nir_src_for_ssa(&instr_undef->def);
-            }
+            if (i < type->coordinate_components())
+               srcs[i] = nir_channel(&b, src_addr, i);
+            else
+               srcs[i] = &instr_undef->def;
          }
 
-         nir_instr_insert_after_cf_list(cf_node_list, &instr_addr->instr);
-         instr->src[0] = nir_src_for_ssa(&instr_addr->dest.dest.ssa);
+         instr->src[0] = nir_src_for_ssa(nir_vec(&b, srcs, 4));
          param = param->get_next();
 
          /* Set the sample argument, which is undefined for single-sample
           * images.
           */
          if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_MS) {
-            instr->src[1] = evaluate_rvalue((ir_dereference *)param);
+            instr->src[1] =
+               nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param));
             param = param->get_next();
          } else {
             instr->src[1] = nir_src_for_ssa(&instr_undef->def);
@@ -813,23 +823,30 @@ nir_visitor::visit(ir_call *ir)
 
          /* Set the intrinsic parameters. */
          if (!param->is_tail_sentinel()) {
-            instr->src[2] = evaluate_rvalue((ir_dereference *)param);
+            instr->src[2] =
+               nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param));
             param = param->get_next();
          }
 
          if (!param->is_tail_sentinel()) {
-            instr->src[3] = evaluate_rvalue((ir_dereference *)param);
+            instr->src[3] =
+               nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param));
             param = param->get_next();
          }
-         nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+         nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
       case nir_intrinsic_memory_barrier:
-         nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+      case nir_intrinsic_group_memory_barrier:
+      case nir_intrinsic_memory_barrier_atomic_counter:
+      case nir_intrinsic_memory_barrier_buffer:
+      case nir_intrinsic_memory_barrier_image:
+      case nir_intrinsic_memory_barrier_shared:
+         nir_builder_instr_insert(&b, &instr->instr);
          break;
       case nir_intrinsic_shader_clock:
          nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
-         nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+         nir_builder_instr_insert(&b, &instr->instr);
          break;
       case nir_intrinsic_store_ssbo: {
          exec_node *param = ir->actual_parameters.get_head();
@@ -851,7 +868,7 @@ nir_visitor::visit(ir_call *ir)
             op = nir_intrinsic_store_ssbo_indirect;
             ralloc_free(instr);
             instr = nir_intrinsic_instr_create(shader, op);
-            instr->src[2] = evaluate_rvalue(offset);
+            instr->src[2] = nir_src_for_ssa(evaluate_rvalue(offset));
             instr->const_index[0] = 0;
          } else {
             instr->const_index[0] = const_offset->value.u[0];
@@ -859,11 +876,11 @@ nir_visitor::visit(ir_call *ir)
 
          instr->const_index[1] = write_mask->value.u[0];
 
-         instr->src[0] = evaluate_rvalue(val);
+         instr->src[0] = nir_src_for_ssa(evaluate_rvalue(val));
          instr->num_components = val->type->vector_elements;
 
-         instr->src[1] = evaluate_rvalue(block);
-         nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+         instr->src[1] = nir_src_for_ssa(evaluate_rvalue(block));
+         nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
       case nir_intrinsic_load_ssbo: {
@@ -879,14 +896,14 @@ nir_visitor::visit(ir_call *ir)
             op = nir_intrinsic_load_ssbo_indirect;
             ralloc_free(instr);
             instr = nir_intrinsic_instr_create(shader, op);
-            instr->src[1] = evaluate_rvalue(offset);
+            instr->src[1] = nir_src_for_ssa(evaluate_rvalue(offset));
             instr->const_index[0] = 0;
             dest = &instr->dest;
          } else {
             instr->const_index[0] = const_offset->value.u[0];
          }
 
-         instr->src[0] = evaluate_rvalue(block);
+         instr->src[0] = nir_src_for_ssa(evaluate_rvalue(block));
 
          const glsl_type *type = ir->return_deref->var->type;
          instr->num_components = type->vector_elements;
@@ -898,7 +915,7 @@ nir_visitor::visit(ir_call *ir)
          /* Insert the created nir instruction now since in the case of boolean
           * result we will need to emit another instruction after it
           */
-         nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+         nir_builder_instr_insert(&b, &instr->instr);
 
          /*
           * In SSBO/UBO's, a true boolean value is any non-zero value, but we
@@ -906,26 +923,19 @@ nir_visitor::visit(ir_call *ir)
           * comparison.
           */
          if (type->base_type == GLSL_TYPE_BOOL) {
-            nir_load_const_instr *const_zero =
-               nir_load_const_instr_create(shader, 1);
-            const_zero->value.u[0] = 0;
-            nir_instr_insert_after_cf_list(this->cf_node_list,
-                                           &const_zero->instr);
-
             nir_alu_instr *load_ssbo_compare =
                nir_alu_instr_create(shader, nir_op_ine);
             load_ssbo_compare->src[0].src.is_ssa = true;
             load_ssbo_compare->src[0].src.ssa = &instr->dest.ssa;
-            load_ssbo_compare->src[1].src.is_ssa = true;
-            load_ssbo_compare->src[1].src.ssa = &const_zero->def;
+            load_ssbo_compare->src[1].src =
+               nir_src_for_ssa(nir_imm_int(&b, 0));
             for (unsigned i = 0; i < type->vector_elements; i++)
                load_ssbo_compare->src[1].swizzle[i] = 0;
             nir_ssa_dest_init(&load_ssbo_compare->instr,
                               &load_ssbo_compare->dest.dest,
                               type->vector_elements, NULL);
             load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1;
-            nir_instr_insert_after_cf_list(this->cf_node_list,
-                                           &load_ssbo_compare->instr);
+            nir_builder_instr_insert(&b, &load_ssbo_compare->instr);
             dest = &load_ssbo_compare->dest.dest;
          }
          break;
@@ -946,31 +956,31 @@ nir_visitor::visit(ir_call *ir)
          /* Block index */
          exec_node *param = ir->actual_parameters.get_head();
          ir_instruction *inst = (ir_instruction *) param;
-         instr->src[0] = evaluate_rvalue(inst->as_rvalue());
+         instr->src[0] = nir_src_for_ssa(evaluate_rvalue(inst->as_rvalue()));
 
          /* Offset */
          param = param->get_next();
          inst = (ir_instruction *) param;
-         instr->src[1] = evaluate_rvalue(inst->as_rvalue());
+         instr->src[1] = nir_src_for_ssa(evaluate_rvalue(inst->as_rvalue()));
 
          /* data1 parameter (this is always present) */
          param = param->get_next();
          inst = (ir_instruction *) param;
-         instr->src[2] = evaluate_rvalue(inst->as_rvalue());
+         instr->src[2] = nir_src_for_ssa(evaluate_rvalue(inst->as_rvalue()));
 
          /* data2 parameter (only with atomic_comp_swap) */
          if (param_count == 4) {
             assert(op == nir_intrinsic_ssbo_atomic_comp_swap);
             param = param->get_next();
             inst = (ir_instruction *) param;
-            instr->src[3] = evaluate_rvalue(inst->as_rvalue());
+            instr->src[3] = nir_src_for_ssa(evaluate_rvalue(inst->as_rvalue()));
          }
 
          /* Atomic result */
          assert(ir->return_deref);
          nir_ssa_dest_init(&instr->instr, &instr->dest,
                            ir->return_deref->type->vector_elements, NULL);
-         nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+         nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
       default:
@@ -986,8 +996,7 @@ nir_visitor::visit(ir_call *ir)
             evaluate_deref(&store_instr->instr, ir->return_deref);
          store_instr->src[0] = nir_src_for_ssa(&dest->ssa);
 
-         nir_instr_insert_after_cf_list(this->cf_node_list,
-                                        &store_instr->instr);
+         nir_builder_instr_insert(&b, &store_instr->instr);
       }
 
       return;
@@ -1007,7 +1016,7 @@ nir_visitor::visit(ir_call *ir)
    }
 
    instr->return_deref = evaluate_deref(&instr->instr, ir->return_deref);
-   nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(&b, &instr->instr);
 }
 
 void
@@ -1026,11 +1035,12 @@ nir_visitor::visit(ir_assignment *ir)
 
       if (ir->condition) {
          nir_if *if_stmt = nir_if_create(this->shader);
-         if_stmt->condition = evaluate_rvalue(ir->condition);
-         nir_cf_node_insert_end(this->cf_node_list, &if_stmt->cf_node);
+         if_stmt->condition = nir_src_for_ssa(evaluate_rvalue(ir->condition));
+         nir_builder_cf_insert(&b, &if_stmt->cf_node);
          nir_instr_insert_after_cf_list(&if_stmt->then_list, &copy->instr);
+         b.cursor = nir_after_cf_node(&if_stmt->cf_node);
       } else {
-         nir_instr_insert_after_cf_list(this->cf_node_list, &copy->instr);
+         nir_builder_instr_insert(&b, &copy->instr);
       }
       return;
    }
@@ -1039,7 +1049,7 @@ nir_visitor::visit(ir_assignment *ir)
 
    ir->lhs->accept(this);
    nir_deref_var *lhs_deref = this->deref_head;
-   nir_src src = evaluate_rvalue(ir->rhs);
+   nir_ssa_def *src = evaluate_rvalue(ir->rhs);
 
    if (ir->write_mask != (1 << num_components) - 1 && ir->write_mask != 0) {
       /*
@@ -1055,42 +1065,25 @@ nir_visitor::visit(ir_assignment *ir)
       nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
       load->variables[0] = lhs_deref;
       ralloc_steal(load, load->variables[0]);
-      nir_instr_insert_after_cf_list(this->cf_node_list, &load->instr);
-
-      nir_op vec_op;
-      switch (ir->lhs->type->vector_elements) {
-         case 1: vec_op = nir_op_imov; break;
-         case 2: vec_op = nir_op_vec2; break;
-         case 3: vec_op = nir_op_vec3; break;
-         case 4: vec_op = nir_op_vec4; break;
-         default: unreachable("Invalid number of components"); break;
-      }
-      nir_alu_instr *vec = nir_alu_instr_create(this->shader, vec_op);
-      nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, NULL);
-      vec->dest.write_mask = (1 << num_components) - 1;
+      nir_builder_instr_insert(&b, &load->instr);
+
+      nir_ssa_def *srcs[4];
 
       unsigned component = 0;
       for (unsigned i = 0; i < ir->lhs->type->vector_elements; i++) {
          if (ir->write_mask & (1 << i)) {
-            vec->src[i].src = src;
-
             /* GLSL IR will give us the input to the write-masked assignment
              * in a single packed vector.  So, for example, if the
              * writemask is xzw, then we have to swizzle x -> x, y -> z,
              * and z -> w and get the y component from the load.
              */
-            vec->src[i].swizzle[0] = component++;
+            srcs[i] = nir_channel(&b, src, component++);
          } else {
-            vec->src[i].src.is_ssa = true;
-            vec->src[i].src.ssa = &load->dest.ssa;
-            vec->src[i].swizzle[0] = i;
+            srcs[i] = nir_channel(&b, &load->dest.ssa, i);
          }
       }
 
-      nir_instr_insert_after_cf_list(this->cf_node_list, &vec->instr);
-
-      src.is_ssa = true;
-      src.ssa = &vec->dest.dest.ssa;
+      src = nir_vec(&b, srcs, ir->lhs->type->vector_elements);
    }
 
    nir_intrinsic_instr *store =
@@ -1098,15 +1091,16 @@ nir_visitor::visit(ir_assignment *ir)
    store->num_components = ir->lhs->type->vector_elements;
    nir_deref *store_deref = nir_copy_deref(store, &lhs_deref->deref);
    store->variables[0] = nir_deref_as_var(store_deref);
-   store->src[0] = src;
+   store->src[0] = nir_src_for_ssa(src);
 
    if (ir->condition) {
       nir_if *if_stmt = nir_if_create(this->shader);
-      if_stmt->condition = evaluate_rvalue(ir->condition);
-      nir_cf_node_insert_end(this->cf_node_list, &if_stmt->cf_node);
+      if_stmt->condition = nir_src_for_ssa(evaluate_rvalue(ir->condition));
+      nir_builder_cf_insert(&b, &if_stmt->cf_node);
       nir_instr_insert_after_cf_list(&if_stmt->then_list, &store->instr);
+      b.cursor = nir_after_cf_node(&if_stmt->cf_node);
    } else {
-      nir_instr_insert_after_cf_list(this->cf_node_list, &store->instr);
+      nir_builder_instr_insert(&b, &store->instr);
    }
 }
 
@@ -1154,11 +1148,15 @@ nir_visitor::add_instr(nir_instr *instr, unsigned num_components)
    if (dest)
       nir_ssa_dest_init(instr, dest, num_components, NULL);
 
-   nir_instr_insert_after_cf_list(this->cf_node_list, instr);
-   this->result = instr;
+   nir_builder_instr_insert(&b, instr);
+
+   if (dest) {
+      assert(dest->is_ssa);
+      this->result = &dest->ssa;
+   }
 }
 
-nir_src
+nir_ssa_def *
 nir_visitor::evaluate_rvalue(ir_rvalue* ir)
 {
    ir->accept(this);
@@ -1176,46 +1174,7 @@ nir_visitor::evaluate_rvalue(ir_rvalue* ir)
       add_instr(&load_instr->instr, ir->type->vector_elements);
    }
 
-   nir_dest *dest = get_instr_dest(this->result);
-   assert(dest->is_ssa);
-
-   return nir_src_for_ssa(&dest->ssa);
-}
-
-nir_alu_instr *
-nir_visitor::emit(nir_op op, unsigned dest_size, nir_src *srcs)
-{
-   nir_alu_instr *instr = nir_alu_instr_create(this->shader, op);
-   for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++)
-      instr->src[i].src = srcs[i];
-   instr->dest.write_mask = (1 << dest_size) - 1;
-   add_instr(&instr->instr, dest_size);
-   return instr;
-}
-
-nir_alu_instr *
-nir_visitor::emit(nir_op op, unsigned dest_size, nir_src src1)
-{
-   assert(nir_op_infos[op].num_inputs == 1);
-   return emit(op, dest_size, &src1);
-}
-
-nir_alu_instr *
-nir_visitor::emit(nir_op op, unsigned dest_size, nir_src src1,
-                  nir_src src2)
-{
-   assert(nir_op_infos[op].num_inputs == 2);
-   nir_src srcs[] = { src1, src2 };
-   return emit(op, dest_size, srcs);
-}
-
-nir_alu_instr *
-nir_visitor::emit(nir_op op, unsigned dest_size, nir_src src1,
-                  nir_src src2, nir_src src3)
-{
-   assert(nir_op_infos[op].num_inputs == 3);
-   nir_src srcs[] = { src1, src2, src3 };
-   return emit(op, dest_size, srcs);
+   return this->result;
 }
 
 void
@@ -1236,9 +1195,9 @@ nir_visitor::visit(ir_expression *ir)
       nir_intrinsic_instr *load = nir_intrinsic_instr_create(this->shader, op);
       load->num_components = ir->type->vector_elements;
       load->const_index[0] = const_index ? const_index->value.u[0] : 0; /* base offset */
-      load->src[0] = evaluate_rvalue(ir->operands[0]);
+      load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0]));
       if (!const_index)
-         load->src[1] = evaluate_rvalue(ir->operands[1]);
+         load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
       add_instr(&load->instr, ir->type->vector_elements);
 
       /*
@@ -1246,22 +1205,8 @@ nir_visitor::visit(ir_expression *ir)
        * a true boolean to be ~0. Fix this up with a != 0 comparison.
        */
 
-      if (ir->type->base_type == GLSL_TYPE_BOOL) {
-         nir_load_const_instr *const_zero = nir_load_const_instr_create(shader, 1);
-         const_zero->value.u[0] = 0;
-         nir_instr_insert_after_cf_list(this->cf_node_list, &const_zero->instr);
-
-         nir_alu_instr *compare = nir_alu_instr_create(shader, nir_op_ine);
-         compare->src[0].src.is_ssa = true;
-         compare->src[0].src.ssa = &load->dest.ssa;
-         compare->src[1].src.is_ssa = true;
-         compare->src[1].src.ssa = &const_zero->def;
-         for (unsigned i = 0; i < ir->type->vector_elements; i++)
-            compare->src[1].swizzle[i] = 0;
-         compare->dest.write_mask = (1 << ir->type->vector_elements) - 1;
-
-         add_instr(&compare->instr, ir->type->vector_elements);
-      }
+      if (ir->type->base_type == GLSL_TYPE_BOOL)
+         this->result = nir_ine(&b, &load->dest.ssa, nir_imm_int(&b, 0));
 
       return;
    }
@@ -1316,24 +1261,17 @@ nir_visitor::visit(ir_expression *ir)
 
       if (intrin->intrinsic == nir_intrinsic_interp_var_at_offset ||
           intrin->intrinsic == nir_intrinsic_interp_var_at_sample)
-         intrin->src[0] = evaluate_rvalue(ir->operands[1]);
+         intrin->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
 
       add_instr(&intrin->instr, deref->type->vector_elements);
 
       if (swizzle) {
-         nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov);
-         mov->dest.write_mask = (1 << swizzle->type->vector_elements) - 1;
-         mov->src[0].src.is_ssa = true;
-         mov->src[0].src.ssa = &intrin->dest.ssa;
-
-         mov->src[0].swizzle[0] = swizzle->mask.x;
-         mov->src[0].swizzle[1] = swizzle->mask.y;
-         mov->src[0].swizzle[2] = swizzle->mask.z;
-         mov->src[0].swizzle[3] = swizzle->mask.w;
-         for (unsigned i = deref->type->vector_elements; i < 4; i++)
-            mov->src[0].swizzle[i] = 0;
-
-         add_instr(&mov->instr, swizzle->type->vector_elements);
+         unsigned swiz[4] = {
+            swizzle->mask.x, swizzle->mask.y, swizzle->mask.z, swizzle->mask.w
+         };
+
+         result = nir_swizzle(&b, result, swiz,
+                              swizzle->type->vector_elements, false);
       }
 
       return;
@@ -1343,7 +1281,7 @@ nir_visitor::visit(ir_expression *ir)
       break;
    }
 
-   nir_src srcs[4];
+   nir_ssa_def *srcs[4];
    for (unsigned i = 0; i < ir->get_num_operands(); i++)
       srcs[i] = evaluate_rvalue(ir->operands[i]);
 
@@ -1360,53 +1298,48 @@ nir_visitor::visit(ir_expression *ir)
    else
       out_type = GLSL_TYPE_FLOAT;
 
-   unsigned dest_size = ir->type->vector_elements;
-
-   nir_alu_instr *instr;
-   nir_op op;
-
    switch (ir->operation) {
-   case ir_unop_bit_not: emit(nir_op_inot, dest_size, srcs); break;
+   case ir_unop_bit_not: result = nir_inot(&b, srcs[0]); break;
    case ir_unop_logic_not:
-      emit(supports_ints ? nir_op_inot : nir_op_fnot, dest_size, srcs);
+      result = supports_ints ? nir_inot(&b, srcs[0]) : nir_fnot(&b, srcs[0]);
       break;
    case ir_unop_neg:
-      instr = emit(types[0] == GLSL_TYPE_FLOAT ? nir_op_fneg : nir_op_ineg,
-                   dest_size, srcs);
+      result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fneg(&b, srcs[0])
+                                             : nir_ineg(&b, srcs[0]);
       break;
    case ir_unop_abs:
-      instr = emit(types[0] == GLSL_TYPE_FLOAT ? nir_op_fabs : nir_op_iabs,
-                   dest_size, srcs);
+      result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fabs(&b, srcs[0])
+                                             : nir_iabs(&b, srcs[0]);
       break;
    case ir_unop_saturate:
       assert(types[0] == GLSL_TYPE_FLOAT);
-      instr = emit(nir_op_fsat, dest_size, srcs);
+      result = nir_fsat(&b, srcs[0]);
       break;
    case ir_unop_sign:
-      emit(types[0] == GLSL_TYPE_FLOAT ? nir_op_fsign : nir_op_isign,
-           dest_size, srcs);
+      result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fsign(&b, srcs[0])
+                                             : nir_isign(&b, srcs[0]);
       break;
-   case ir_unop_rcp:  emit(nir_op_frcp, dest_size, srcs);  break;
-   case ir_unop_rsq:  emit(nir_op_frsq, dest_size, srcs);  break;
-   case ir_unop_sqrt: emit(nir_op_fsqrt, dest_size, srcs); break;
+   case ir_unop_rcp:  result = nir_frcp(&b, srcs[0]);  break;
+   case ir_unop_rsq:  result = nir_frsq(&b, srcs[0]);  break;
+   case ir_unop_sqrt: result = nir_fsqrt(&b, srcs[0]); break;
    case ir_unop_exp:  unreachable("ir_unop_exp should have been lowered");
    case ir_unop_log:  unreachable("ir_unop_log should have been lowered");
-   case ir_unop_exp2: emit(nir_op_fexp2, dest_size, srcs); break;
-   case ir_unop_log2: emit(nir_op_flog2, dest_size, srcs); break;
+   case ir_unop_exp2: result = nir_fexp2(&b, srcs[0]); break;
+   case ir_unop_log2: result = nir_flog2(&b, srcs[0]); break;
    case ir_unop_i2f:
-      emit(supports_ints ? nir_op_i2f : nir_op_fmov, dest_size, srcs);
+      result = supports_ints ? nir_i2f(&b, srcs[0]) : nir_fmov(&b, srcs[0]);
       break;
    case ir_unop_u2f:
-      emit(supports_ints ? nir_op_u2f : nir_op_fmov, dest_size, srcs);
+      result = supports_ints ? nir_u2f(&b, srcs[0]) : nir_fmov(&b, srcs[0]);
       break;
    case ir_unop_b2f:
-      emit(supports_ints ? nir_op_b2f : nir_op_fmov, dest_size, srcs);
+      result = supports_ints ? nir_b2f(&b, srcs[0]) : nir_fmov(&b, srcs[0]);
       break;
-   case ir_unop_f2i:  emit(nir_op_f2i, dest_size, srcs);   break;
-   case ir_unop_f2u:  emit(nir_op_f2u, dest_size, srcs);   break;
-   case ir_unop_f2b:  emit(nir_op_f2b, dest_size, srcs);   break;
-   case ir_unop_i2b:  emit(nir_op_i2b, dest_size, srcs);   break;
-   case ir_unop_b2i:  emit(nir_op_b2i, dest_size, srcs);   break;
+   case ir_unop_f2i:  result = nir_f2i(&b, srcs[0]);   break;
+   case ir_unop_f2u:  result = nir_f2u(&b, srcs[0]);   break;
+   case ir_unop_f2b:  result = nir_f2b(&b, srcs[0]);   break;
+   case ir_unop_i2b:  result = nir_i2b(&b, srcs[0]);   break;
+   case ir_unop_b2i:  result = nir_b2i(&b, srcs[0]);   break;
    case ir_unop_i2u:
    case ir_unop_u2i:
    case ir_unop_bitcast_i2f:
@@ -1415,132 +1348,132 @@ nir_visitor::visit(ir_expression *ir)
    case ir_unop_bitcast_f2u:
    case ir_unop_subroutine_to_int:
       /* no-op */
-      emit(nir_op_imov, dest_size, srcs);
+      result = nir_imov(&b, srcs[0]);
       break;
    case ir_unop_any:
       switch (ir->operands[0]->type->vector_elements) {
       case 2:
-         emit(supports_ints ? nir_op_bany2 : nir_op_fany2,
-              dest_size, srcs);
+         result = supports_ints ? nir_bany2(&b, srcs[0])
+                                : nir_fany2(&b, srcs[0]);
          break;
       case 3:
-         emit(supports_ints ? nir_op_bany3 : nir_op_fany3,
-              dest_size, srcs);
+         result = supports_ints ? nir_bany3(&b, srcs[0])
+                                : nir_fany3(&b, srcs[0]);
          break;
       case 4:
-         emit(supports_ints ? nir_op_bany4 : nir_op_fany4,
-              dest_size, srcs);
+         result = supports_ints ? nir_bany4(&b, srcs[0])
+                                : nir_fany4(&b, srcs[0]);
          break;
       default:
          unreachable("not reached");
       }
       break;
-   case ir_unop_trunc: emit(nir_op_ftrunc, dest_size, srcs); break;
-   case ir_unop_ceil:  emit(nir_op_fceil,  dest_size, srcs); break;
-   case ir_unop_floor: emit(nir_op_ffloor, dest_size, srcs); break;
-   case ir_unop_fract: emit(nir_op_ffract, dest_size, srcs); break;
-   case ir_unop_round_even: emit(nir_op_fround_even, dest_size, srcs); break;
-   case ir_unop_sin:   emit(nir_op_fsin,   dest_size, srcs); break;
-   case ir_unop_cos:   emit(nir_op_fcos,   dest_size, srcs); break;
-   case ir_unop_dFdx:        emit(nir_op_fddx,        dest_size, srcs); break;
-   case ir_unop_dFdy:        emit(nir_op_fddy,        dest_size, srcs); break;
-   case ir_unop_dFdx_fine:   emit(nir_op_fddx_fine,   dest_size, srcs); break;
-   case ir_unop_dFdy_fine:   emit(nir_op_fddy_fine,   dest_size, srcs); break;
-   case ir_unop_dFdx_coarse: emit(nir_op_fddx_coarse, dest_size, srcs); break;
-   case ir_unop_dFdy_coarse: emit(nir_op_fddy_coarse, dest_size, srcs); break;
+   case ir_unop_trunc: result = nir_ftrunc(&b, srcs[0]); break;
+   case ir_unop_ceil:  result = nir_fceil(&b, srcs[0]); break;
+   case ir_unop_floor: result = nir_ffloor(&b, srcs[0]); break;
+   case ir_unop_fract: result = nir_ffract(&b, srcs[0]); break;
+   case ir_unop_round_even: result = nir_fround_even(&b, srcs[0]); break;
+   case ir_unop_sin:   result = nir_fsin(&b, srcs[0]); break;
+   case ir_unop_cos:   result = nir_fcos(&b, srcs[0]); break;
+   case ir_unop_dFdx:        result = nir_fddx(&b, srcs[0]); break;
+   case ir_unop_dFdy:        result = nir_fddy(&b, srcs[0]); break;
+   case ir_unop_dFdx_fine:   result = nir_fddx_fine(&b, srcs[0]); break;
+   case ir_unop_dFdy_fine:   result = nir_fddy_fine(&b, srcs[0]); break;
+   case ir_unop_dFdx_coarse: result = nir_fddx_coarse(&b, srcs[0]); break;
+   case ir_unop_dFdy_coarse: result = nir_fddy_coarse(&b, srcs[0]); break;
    case ir_unop_pack_snorm_2x16:
-      emit(nir_op_pack_snorm_2x16, dest_size, srcs);
+      result = nir_pack_snorm_2x16(&b, srcs[0]);
       break;
    case ir_unop_pack_snorm_4x8:
-      emit(nir_op_pack_snorm_4x8, dest_size, srcs);
+      result = nir_pack_snorm_4x8(&b, srcs[0]);
       break;
    case ir_unop_pack_unorm_2x16:
-      emit(nir_op_pack_unorm_2x16, dest_size, srcs);
+      result = nir_pack_unorm_2x16(&b, srcs[0]);
       break;
    case ir_unop_pack_unorm_4x8:
-      emit(nir_op_pack_unorm_4x8, dest_size, srcs);
+      result = nir_pack_unorm_4x8(&b, srcs[0]);
       break;
    case ir_unop_pack_half_2x16:
-      emit(nir_op_pack_half_2x16, dest_size, srcs);
+      result = nir_pack_half_2x16(&b, srcs[0]);
       break;
    case ir_unop_unpack_snorm_2x16:
-      emit(nir_op_unpack_snorm_2x16, dest_size, srcs);
+      result = nir_unpack_snorm_2x16(&b, srcs[0]);
       break;
    case ir_unop_unpack_snorm_4x8:
-      emit(nir_op_unpack_snorm_4x8, dest_size, srcs);
+      result = nir_unpack_snorm_4x8(&b, srcs[0]);
       break;
    case ir_unop_unpack_unorm_2x16:
-      emit(nir_op_unpack_unorm_2x16, dest_size, srcs);
+      result = nir_unpack_unorm_2x16(&b, srcs[0]);
       break;
    case ir_unop_unpack_unorm_4x8:
-      emit(nir_op_unpack_unorm_4x8, dest_size, srcs);
+      result = nir_unpack_unorm_4x8(&b, srcs[0]);
       break;
    case ir_unop_unpack_half_2x16:
-      emit(nir_op_unpack_half_2x16, dest_size, srcs);
+      result = nir_unpack_half_2x16(&b, srcs[0]);
       break;
    case ir_unop_unpack_half_2x16_split_x:
-      emit(nir_op_unpack_half_2x16_split_x, dest_size, srcs);
+      result = nir_unpack_half_2x16_split_x(&b, srcs[0]);
       break;
    case ir_unop_unpack_half_2x16_split_y:
-      emit(nir_op_unpack_half_2x16_split_y, dest_size, srcs);
+      result = nir_unpack_half_2x16_split_y(&b, srcs[0]);
       break;
    case ir_unop_bitfield_reverse:
-      emit(nir_op_bitfield_reverse, dest_size, srcs);
+      result = nir_bitfield_reverse(&b, srcs[0]);
       break;
    case ir_unop_bit_count:
-      emit(nir_op_bit_count, dest_size, srcs);
+      result = nir_bit_count(&b, srcs[0]);
       break;
    case ir_unop_find_msb:
       switch (types[0]) {
       case GLSL_TYPE_UINT:
-         emit(nir_op_ufind_msb, dest_size, srcs);
+         result = nir_ufind_msb(&b, srcs[0]);
          break;
       case GLSL_TYPE_INT:
-         emit(nir_op_ifind_msb, dest_size, srcs);
+         result = nir_ifind_msb(&b, srcs[0]);
          break;
       default:
          unreachable("Invalid type for findMSB()");
       }
       break;
    case ir_unop_find_lsb:
-      emit(nir_op_find_lsb,  dest_size, srcs);
+      result = nir_find_lsb(&b, srcs[0]);
       break;
 
    case ir_unop_noise:
       switch (ir->type->vector_elements) {
       case 1:
          switch (ir->operands[0]->type->vector_elements) {
-            case 1: emit(nir_op_fnoise1_1, dest_size, srcs); break;
-            case 2: emit(nir_op_fnoise1_2, dest_size, srcs); break;
-            case 3: emit(nir_op_fnoise1_3, dest_size, srcs); break;
-            case 4: emit(nir_op_fnoise1_4, dest_size, srcs); break;
+            case 1: result = nir_fnoise1_1(&b, srcs[0]); break;
+            case 2: result = nir_fnoise1_2(&b, srcs[0]); break;
+            case 3: result = nir_fnoise1_3(&b, srcs[0]); break;
+            case 4: result = nir_fnoise1_4(&b, srcs[0]); break;
             default: unreachable("not reached");
          }
          break;
       case 2:
          switch (ir->operands[0]->type->vector_elements) {
-            case 1: emit(nir_op_fnoise2_1, dest_size, srcs); break;
-            case 2: emit(nir_op_fnoise2_2, dest_size, srcs); break;
-            case 3: emit(nir_op_fnoise2_3, dest_size, srcs); break;
-            case 4: emit(nir_op_fnoise2_4, dest_size, srcs); break;
+            case 1: result = nir_fnoise2_1(&b, srcs[0]); break;
+            case 2: result = nir_fnoise2_2(&b, srcs[0]); break;
+            case 3: result = nir_fnoise2_3(&b, srcs[0]); break;
+            case 4: result = nir_fnoise2_4(&b, srcs[0]); break;
             default: unreachable("not reached");
          }
          break;
       case 3:
          switch (ir->operands[0]->type->vector_elements) {
-            case 1: emit(nir_op_fnoise3_1, dest_size, srcs); break;
-            case 2: emit(nir_op_fnoise3_2, dest_size, srcs); break;
-            case 3: emit(nir_op_fnoise3_3, dest_size, srcs); break;
-            case 4: emit(nir_op_fnoise3_4, dest_size, srcs); break;
+            case 1: result = nir_fnoise3_1(&b, srcs[0]); break;
+            case 2: result = nir_fnoise3_2(&b, srcs[0]); break;
+            case 3: result = nir_fnoise3_3(&b, srcs[0]); break;
+            case 4: result = nir_fnoise3_4(&b, srcs[0]); break;
             default: unreachable("not reached");
          }
          break;
       case 4:
          switch (ir->operands[0]->type->vector_elements) {
-            case 1: emit(nir_op_fnoise4_1, dest_size, srcs); break;
-            case 2: emit(nir_op_fnoise4_2, dest_size, srcs); break;
-            case 3: emit(nir_op_fnoise4_3, dest_size, srcs); break;
-            case 4: emit(nir_op_fnoise4_4, dest_size, srcs); break;
+            case 1: result = nir_fnoise4_1(&b, srcs[0]); break;
+            case 2: result = nir_fnoise4_2(&b, srcs[0]); break;
+            case 3: result = nir_fnoise4_3(&b, srcs[0]); break;
+            case 4: result = nir_fnoise4_4(&b, srcs[0]); break;
             default: unreachable("not reached");
          }
          break;
@@ -1553,240 +1486,173 @@ nir_visitor::visit(ir_expression *ir)
          this->shader,
          nir_intrinsic_get_buffer_size);
       load->num_components = ir->type->vector_elements;
-      load->src[0] = evaluate_rvalue(ir->operands[0]);
+      load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0]));
       add_instr(&load->instr, ir->type->vector_elements);
       return;
    }
 
    case ir_binop_add:
+      result = (out_type == GLSL_TYPE_FLOAT) ? nir_fadd(&b, srcs[0], srcs[1])
+                                             : nir_iadd(&b, srcs[0], srcs[1]);
+      break;
    case ir_binop_sub:
+      result = (out_type == GLSL_TYPE_FLOAT) ? nir_fsub(&b, srcs[0], srcs[1])
+                                             : nir_isub(&b, srcs[0], srcs[1]);
+      break;
    case ir_binop_mul:
+      result = (out_type == GLSL_TYPE_FLOAT) ? nir_fmul(&b, srcs[0], srcs[1])
+                                             : nir_imul(&b, srcs[0], srcs[1]);
+      break;
    case ir_binop_div:
+      if (out_type == GLSL_TYPE_FLOAT)
+         result = nir_fdiv(&b, srcs[0], srcs[1]);
+      else if (out_type == GLSL_TYPE_INT)
+         result = nir_idiv(&b, srcs[0], srcs[1]);
+      else
+         result = nir_udiv(&b, srcs[0], srcs[1]);
+      break;
    case ir_binop_mod:
+      result = (out_type == GLSL_TYPE_FLOAT) ? nir_fmod(&b, srcs[0], srcs[1])
+                                             : nir_umod(&b, srcs[0], srcs[1]);
+      break;
    case ir_binop_min:
+      if (out_type == GLSL_TYPE_FLOAT)
+         result = nir_fmin(&b, srcs[0], srcs[1]);
+      else if (out_type == GLSL_TYPE_INT)
+         result = nir_imin(&b, srcs[0], srcs[1]);
+      else
+         result = nir_umin(&b, srcs[0], srcs[1]);
+      break;
    case ir_binop_max:
-   case ir_binop_pow:
-   case ir_binop_bit_and:
-   case ir_binop_bit_or:
-   case ir_binop_bit_xor:
+      if (out_type == GLSL_TYPE_FLOAT)
+         result = nir_fmax(&b, srcs[0], srcs[1]);
+      else if (out_type == GLSL_TYPE_INT)
+         result = nir_imax(&b, srcs[0], srcs[1]);
+      else
+         result = nir_umax(&b, srcs[0], srcs[1]);
+      break;
+   case ir_binop_pow: result = nir_fpow(&b, srcs[0], srcs[1]); break;
+   case ir_binop_bit_and: result = nir_iand(&b, srcs[0], srcs[1]); break;
+   case ir_binop_bit_or: result = nir_ior(&b, srcs[0], srcs[1]); break;
+   case ir_binop_bit_xor: result = nir_ixor(&b, srcs[0], srcs[1]); break;
    case ir_binop_logic_and:
+      result = supports_ints ? nir_iand(&b, srcs[0], srcs[1])
+                             : nir_fand(&b, srcs[0], srcs[1]);
+      break;
    case ir_binop_logic_or:
-   case ir_binop_logic_xor:
-   case ir_binop_lshift:
+      result = supports_ints ? nir_ior(&b, srcs[0], srcs[1])
+                             : nir_for(&b, srcs[0], srcs[1]);
+      break;
+   case ir_binop_logic_xor: result = nir_ixor(&b, srcs[0], srcs[1]); break;
+      result = supports_ints ? nir_ior(&b, srcs[0], srcs[1])
+                             : nir_for(&b, srcs[0], srcs[1]);
+      break;
+   case ir_binop_lshift: result = nir_ishl(&b, srcs[0], srcs[1]); break;
    case ir_binop_rshift:
-      switch (ir->operation) {
-      case ir_binop_add:
-         if (out_type == GLSL_TYPE_FLOAT)
-            op = nir_op_fadd;
-         else
-            op = nir_op_iadd;
-         break;
-      case ir_binop_sub:
-         if (out_type == GLSL_TYPE_FLOAT)
-            op = nir_op_fsub;
-         else
-            op = nir_op_isub;
-         break;
-      case ir_binop_mul:
-         if (out_type == GLSL_TYPE_FLOAT)
-            op = nir_op_fmul;
-         else
-            op = nir_op_imul;
-         break;
-      case ir_binop_div:
-         if (out_type == GLSL_TYPE_FLOAT)
-            op = nir_op_fdiv;
-         else if (out_type == GLSL_TYPE_INT)
-            op = nir_op_idiv;
-         else
-            op = nir_op_udiv;
-         break;
-      case ir_binop_mod:
-         if (out_type == GLSL_TYPE_FLOAT)
-            op = nir_op_fmod;
-         else
-            op = nir_op_umod;
-         break;
-      case ir_binop_min:
-         if (out_type == GLSL_TYPE_FLOAT)
-            op = nir_op_fmin;
-         else if (out_type == GLSL_TYPE_INT)
-            op = nir_op_imin;
-         else
-            op = nir_op_umin;
-         break;
-      case ir_binop_max:
-         if (out_type == GLSL_TYPE_FLOAT)
-            op = nir_op_fmax;
-         else if (out_type == GLSL_TYPE_INT)
-            op = nir_op_imax;
-         else
-            op = nir_op_umax;
-         break;
-      case ir_binop_bit_and:
-         op = nir_op_iand;
-         break;
-      case ir_binop_bit_or:
-         op = nir_op_ior;
-         break;
-      case ir_binop_bit_xor:
-         op = nir_op_ixor;
-         break;
-      case ir_binop_logic_and:
-         if (supports_ints)
-            op = nir_op_iand;
-         else
-            op = nir_op_fand;
-         break;
-      case ir_binop_logic_or:
-         if (supports_ints)
-            op = nir_op_ior;
-         else
-            op = nir_op_for;
-         break;
-      case ir_binop_logic_xor:
-         if (supports_ints)
-            op = nir_op_ixor;
-         else
-            op = nir_op_fxor;
-         break;
-      case ir_binop_lshift:
-         op = nir_op_ishl;
-         break;
-      case ir_binop_rshift:
-         if (out_type == GLSL_TYPE_INT)
-            op = nir_op_ishr;
-         else
-            op = nir_op_ushr;
-         break;
-      case ir_binop_pow:
-         op = nir_op_fpow;
-         break;
-
-      default:
-         unreachable("not reached");
-      }
-
-      instr = emit(op, dest_size, srcs);
-
-      if (ir->operands[0]->type->vector_elements != 1 &&
-          ir->operands[1]->type->vector_elements == 1) {
-         for (unsigned i = 0; i < ir->operands[0]->type->vector_elements;
-              i++) {
-            instr->src[1].swizzle[i] = 0;
-         }
-      }
-
-      if (ir->operands[1]->type->vector_elements != 1 &&
-          ir->operands[0]->type->vector_elements == 1) {
-         for (unsigned i = 0; i < ir->operands[1]->type->vector_elements;
-              i++) {
-            instr->src[0].swizzle[i] = 0;
-         }
-      }
-
+      result = (out_type == GLSL_TYPE_INT) ? nir_ishr(&b, srcs[0], srcs[1])
+                                           : nir_ushr(&b, srcs[0], srcs[1]);
       break;
    case ir_binop_imul_high:
-      emit(out_type == GLSL_TYPE_UINT ? nir_op_umul_high : nir_op_imul_high,
-           dest_size, srcs);
+      result = (out_type == GLSL_TYPE_INT) ? nir_imul_high(&b, srcs[0], srcs[1])
+                                           : nir_umul_high(&b, srcs[0], srcs[1]);
       break;
-   case ir_binop_carry:  emit(nir_op_uadd_carry, dest_size, srcs);  break;
-   case ir_binop_borrow: emit(nir_op_usub_borrow, dest_size, srcs); break;
+   case ir_binop_carry:  result = nir_uadd_carry(&b, srcs[0], srcs[1]);  break;
+   case ir_binop_borrow: result = nir_usub_borrow(&b, srcs[0], srcs[1]); break;
    case ir_binop_less:
       if (supports_ints) {
          if (types[0] == GLSL_TYPE_FLOAT)
-            emit(nir_op_flt, dest_size, srcs);
+            result = nir_flt(&b, srcs[0], srcs[1]);
          else if (types[0] == GLSL_TYPE_INT)
-            emit(nir_op_ilt, dest_size, srcs);
+            result = nir_ilt(&b, srcs[0], srcs[1]);
          else
-            emit(nir_op_ult, dest_size, srcs);
+            result = nir_ult(&b, srcs[0], srcs[1]);
       } else {
-         emit(nir_op_slt, dest_size, srcs);
+         result = nir_slt(&b, srcs[0], srcs[1]);
       }
       break;
    case ir_binop_greater:
       if (supports_ints) {
          if (types[0] == GLSL_TYPE_FLOAT)
-            emit(nir_op_flt, dest_size, srcs[1], srcs[0]);
+            result = nir_flt(&b, srcs[1], srcs[0]);
          else if (types[0] == GLSL_TYPE_INT)
-            emit(nir_op_ilt, dest_size, srcs[1], srcs[0]);
+            result = nir_ilt(&b, srcs[1], srcs[0]);
          else
-            emit(nir_op_ult, dest_size, srcs[1], srcs[0]);
+            result = nir_ult(&b, srcs[1], srcs[0]);
       } else {
-         emit(nir_op_slt, dest_size, srcs[1], srcs[0]);
+         result = nir_slt(&b, srcs[1], srcs[0]);
       }
       break;
    case ir_binop_lequal:
       if (supports_ints) {
          if (types[0] == GLSL_TYPE_FLOAT)
-            emit(nir_op_fge, dest_size, srcs[1], srcs[0]);
+            result = nir_fge(&b, srcs[1], srcs[0]);
          else if (types[0] == GLSL_TYPE_INT)
-            emit(nir_op_ige, dest_size, srcs[1], srcs[0]);
+            result = nir_ige(&b, srcs[1], srcs[0]);
          else
-            emit(nir_op_uge, dest_size, srcs[1], srcs[0]);
+            result = nir_uge(&b, srcs[1], srcs[0]);
       } else {
-         emit(nir_op_slt, dest_size, srcs[1], srcs[0]);
+         result = nir_slt(&b, srcs[1], srcs[0]);
       }
       break;
    case ir_binop_gequal:
       if (supports_ints) {
          if (types[0] == GLSL_TYPE_FLOAT)
-            emit(nir_op_fge, dest_size, srcs);
+            result = nir_fge(&b, srcs[0], srcs[1]);
          else if (types[0] == GLSL_TYPE_INT)
-            emit(nir_op_ige, dest_size, srcs);
+            result = nir_ige(&b, srcs[0], srcs[1]);
          else
-            emit(nir_op_uge, dest_size, srcs);
+            result = nir_uge(&b, srcs[0], srcs[1]);
       } else {
-         emit(nir_op_slt, dest_size, srcs);
+         result = nir_slt(&b, srcs[0], srcs[1]);
       }
       break;
    case ir_binop_equal:
       if (supports_ints) {
          if (types[0] == GLSL_TYPE_FLOAT)
-            emit(nir_op_feq, dest_size, srcs);
+            result = nir_feq(&b, srcs[0], srcs[1]);
          else
-            emit(nir_op_ieq, dest_size, srcs);
+            result = nir_ieq(&b, srcs[0], srcs[1]);
       } else {
-         emit(nir_op_seq, dest_size, srcs);
+         result = nir_seq(&b, srcs[0], srcs[1]);
       }
       break;
    case ir_binop_nequal:
       if (supports_ints) {
          if (types[0] == GLSL_TYPE_FLOAT)
-            emit(nir_op_fne, dest_size, srcs);
+            result = nir_fne(&b, srcs[0], srcs[1]);
          else
-            emit(nir_op_ine, dest_size, srcs);
+            result = nir_ine(&b, srcs[0], srcs[1]);
       } else {
-         emit(nir_op_sne, dest_size, srcs);
+         result = nir_sne(&b, srcs[0], srcs[1]);
       }
       break;
    case ir_binop_all_equal:
       if (supports_ints) {
          if (types[0] == GLSL_TYPE_FLOAT) {
             switch (ir->operands[0]->type->vector_elements) {
-               case 1: emit(nir_op_feq, dest_size, srcs); break;
-               case 2: emit(nir_op_ball_fequal2, dest_size, srcs); break;
-               case 3: emit(nir_op_ball_fequal3, dest_size, srcs); break;
-               case 4: emit(nir_op_ball_fequal4, dest_size, srcs); break;
+               case 1: result = nir_feq(&b, srcs[0], srcs[1]); break;
+               case 2: result = nir_ball_fequal2(&b, srcs[0], srcs[1]); break;
+               case 3: result = nir_ball_fequal3(&b, srcs[0], srcs[1]); break;
+               case 4: result = nir_ball_fequal4(&b, srcs[0], srcs[1]); break;
                default:
                   unreachable("not reached");
             }
          } else {
             switch (ir->operands[0]->type->vector_elements) {
-               case 1: emit(nir_op_ieq, dest_size, srcs); break;
-               case 2: emit(nir_op_ball_iequal2, dest_size, srcs); break;
-               case 3: emit(nir_op_ball_iequal3, dest_size, srcs); break;
-               case 4: emit(nir_op_ball_iequal4, dest_size, srcs); break;
+               case 1: result = nir_ieq(&b, srcs[0], srcs[1]); break;
+               case 2: result = nir_ball_iequal2(&b, srcs[0], srcs[1]); break;
+               case 3: result = nir_ball_iequal3(&b, srcs[0], srcs[1]); break;
+               case 4: result = nir_ball_iequal4(&b, srcs[0], srcs[1]); break;
                default:
                   unreachable("not reached");
             }
          }
       } else {
          switch (ir->operands[0]->type->vector_elements) {
-            case 1: emit(nir_op_seq, dest_size, srcs); break;
-            case 2: emit(nir_op_fall_equal2, dest_size, srcs); break;
-            case 3: emit(nir_op_fall_equal3, dest_size, srcs); break;
-            case 4: emit(nir_op_fall_equal4, dest_size, srcs); break;
+            case 1: result = nir_seq(&b, srcs[0], srcs[1]); break;
+            case 2: result = nir_fall_equal2(&b, srcs[0], srcs[1]); break;
+            case 3: result = nir_fall_equal3(&b, srcs[0], srcs[1]); break;
+            case 4: result = nir_fall_equal4(&b, srcs[0], srcs[1]); break;
             default:
                unreachable("not reached");
          }
@@ -1796,29 +1662,29 @@ nir_visitor::visit(ir_expression *ir)
       if (supports_ints) {
          if (types[0] == GLSL_TYPE_FLOAT) {
             switch (ir->operands[0]->type->vector_elements) {
-               case 1: emit(nir_op_fne, dest_size, srcs); break;
-               case 2: emit(nir_op_bany_fnequal2, dest_size, srcs); break;
-               case 3: emit(nir_op_bany_fnequal3, dest_size, srcs); break;
-               case 4: emit(nir_op_bany_fnequal4, dest_size, srcs); break;
+               case 1: result = nir_fne(&b, srcs[0], srcs[1]); break;
+               case 2: result = nir_bany_fnequal2(&b, srcs[0], srcs[1]); break;
+               case 3: result = nir_bany_fnequal3(&b, srcs[0], srcs[1]); break;
+               case 4: result = nir_bany_fnequal4(&b, srcs[0], srcs[1]); break;
                default:
                   unreachable("not reached");
             }
          } else {
             switch (ir->operands[0]->type->vector_elements) {
-               case 1: emit(nir_op_ine, dest_size, srcs); break;
-               case 2: emit(nir_op_bany_inequal2, dest_size, srcs); break;
-               case 3: emit(nir_op_bany_inequal3, dest_size, srcs); break;
-               case 4: emit(nir_op_bany_inequal4, dest_size, srcs); break;
+               case 1: result = nir_ine(&b, srcs[0], srcs[1]); break;
+               case 2: result = nir_bany_inequal2(&b, srcs[0], srcs[1]); break;
+               case 3: result = nir_bany_inequal3(&b, srcs[0], srcs[1]); break;
+               case 4: result = nir_bany_inequal4(&b, srcs[0], srcs[1]); break;
                default:
                   unreachable("not reached");
             }
          }
       } else {
          switch (ir->operands[0]->type->vector_elements) {
-            case 1: emit(nir_op_sne, dest_size, srcs); break;
-            case 2: emit(nir_op_fany_nequal2, dest_size, srcs); break;
-            case 3: emit(nir_op_fany_nequal3, dest_size, srcs); break;
-            case 4: emit(nir_op_fany_nequal4, dest_size, srcs); break;
+            case 1: result = nir_sne(&b, srcs[0], srcs[1]); break;
+            case 2: result = nir_fany_nequal2(&b, srcs[0], srcs[1]); break;
+            case 3: result = nir_fany_nequal3(&b, srcs[0], srcs[1]); break;
+            case 4: result = nir_fany_nequal4(&b, srcs[0], srcs[1]); break;
             default:
                unreachable("not reached");
          }
@@ -1826,64 +1692,44 @@ nir_visitor::visit(ir_expression *ir)
       break;
    case ir_binop_dot:
       switch (ir->operands[0]->type->vector_elements) {
-         case 2: emit(nir_op_fdot2, dest_size, srcs); break;
-         case 3: emit(nir_op_fdot3, dest_size, srcs); break;
-         case 4: emit(nir_op_fdot4, dest_size, srcs); break;
+         case 2: result = nir_fdot2(&b, srcs[0], srcs[1]); break;
+         case 3: result = nir_fdot3(&b, srcs[0], srcs[1]); break;
+         case 4: result = nir_fdot4(&b, srcs[0], srcs[1]); break;
          default:
             unreachable("not reached");
       }
       break;
 
    case ir_binop_pack_half_2x16_split:
-         emit(nir_op_pack_half_2x16_split, dest_size, srcs);
+         result = nir_pack_half_2x16_split(&b, srcs[0], srcs[1]);
          break;
-   case ir_binop_bfm:   emit(nir_op_bfm, dest_size, srcs);   break;
-   case ir_binop_ldexp: emit(nir_op_ldexp, dest_size, srcs); break;
-   case ir_triop_fma:   emit(nir_op_ffma, dest_size, srcs);  break;
+   case ir_binop_bfm:   result = nir_bfm(&b, srcs[0], srcs[1]);   break;
+   case ir_binop_ldexp: result = nir_ldexp(&b, srcs[0], srcs[1]); break;
+   case ir_triop_fma:
+      result = nir_ffma(&b, srcs[0], srcs[1], srcs[2]);
+      break;
    case ir_triop_lrp:
-      instr = emit(nir_op_flrp, dest_size, srcs);
-      if (ir->operands[0]->type->vector_elements != 1 &&
-          ir->operands[2]->type->vector_elements == 1) {
-         for (unsigned i = 0; i < ir->operands[0]->type->vector_elements;
-              i++) {
-            instr->src[2].swizzle[i] = 0;
-         }
-      }
+      result = nir_flrp(&b, srcs[0], srcs[1], srcs[2]);
       break;
    case ir_triop_csel:
       if (supports_ints)
-         emit(nir_op_bcsel, dest_size, srcs);
+         result = nir_bcsel(&b, srcs[0], srcs[1], srcs[2]);
       else
-         emit(nir_op_fcsel, dest_size, srcs);
+         result = nir_fcsel(&b, srcs[0], srcs[1], srcs[2]);
       break;
    case ir_triop_bfi:
-      instr = emit(nir_op_bfi, dest_size, srcs);
-      for (unsigned i = 0; i < ir->operands[1]->type->vector_elements; i++) {
-         instr->src[0].swizzle[i] = 0;
-      }
+      result = nir_bfi(&b, srcs[0], srcs[1], srcs[2]);
       break;
    case ir_triop_bitfield_extract:
-      instr = emit(out_type == GLSL_TYPE_INT ? nir_op_ibitfield_extract :
-                   nir_op_ubitfield_extract, dest_size, srcs);
-      for (unsigned i = 0; i < ir->operands[0]->type->vector_elements; i++) {
-         instr->src[1].swizzle[i] = 0;
-         instr->src[2].swizzle[i] = 0;
-      }
+      result = (out_type == GLSL_TYPE_INT) ?
+         nir_ibitfield_extract(&b, srcs[0], srcs[1], srcs[2]) :
+         nir_ubitfield_extract(&b, srcs[0], srcs[1], srcs[2]);
       break;
    case ir_quadop_bitfield_insert:
-      instr = emit(nir_op_bitfield_insert, dest_size, srcs);
-      for (unsigned i = 0; i < ir->operands[0]->type->vector_elements; i++) {
-         instr->src[2].swizzle[i] = 0;
-         instr->src[3].swizzle[i] = 0;
-      }
+      result = nir_bitfield_insert(&b, srcs[0], srcs[1], srcs[2], srcs[3]);
       break;
    case ir_quadop_vector:
-      switch (ir->type->vector_elements) {
-         case 2: emit(nir_op_vec2, dest_size, srcs); break;
-         case 3: emit(nir_op_vec3, dest_size, srcs); break;
-         case 4: emit(nir_op_vec4, dest_size, srcs); break;
-         default: unreachable("not reached");
-      }
+      result = nir_vec(&b, srcs, ir->type->vector_elements);
       break;
 
    default:
@@ -1894,13 +1740,9 @@ nir_visitor::visit(ir_expression *ir)
 void
 nir_visitor::visit(ir_swizzle *ir)
 {
-   nir_alu_instr *instr = emit(supports_ints ? nir_op_imov : nir_op_fmov,
-                               ir->type->vector_elements,
-                               evaluate_rvalue(ir->val));
-
    unsigned swizzle[4] = { ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w };
-   for (unsigned i = 0; i < ir->type->vector_elements; i++)
-      instr->src[0].swizzle[i] = swizzle[i];
+   result = nir_swizzle(&b, evaluate_rvalue(ir->val), swizzle,
+                        ir->type->vector_elements, !supports_ints);
 }
 
 void
@@ -2006,19 +1848,22 @@ nir_visitor::visit(ir_texture *ir)
 
    if (ir->coordinate != NULL) {
       instr->coord_components = ir->coordinate->type->vector_elements;
-      instr->src[src_number].src = evaluate_rvalue(ir->coordinate);
+      instr->src[src_number].src =
+         nir_src_for_ssa(evaluate_rvalue(ir->coordinate));
       instr->src[src_number].src_type = nir_tex_src_coord;
       src_number++;
    }
 
    if (ir->projector != NULL) {
-      instr->src[src_number].src = evaluate_rvalue(ir->projector);
+      instr->src[src_number].src =
+         nir_src_for_ssa(evaluate_rvalue(ir->projector));
       instr->src[src_number].src_type = nir_tex_src_projector;
       src_number++;
    }
 
    if (ir->shadow_comparitor != NULL) {
-      instr->src[src_number].src = evaluate_rvalue(ir->shadow_comparitor);
+      instr->src[src_number].src =
+         nir_src_for_ssa(evaluate_rvalue(ir->shadow_comparitor));
       instr->src[src_number].src_type = nir_tex_src_comparitor;
       src_number++;
    }
@@ -2032,7 +1877,8 @@ nir_visitor::visit(ir_texture *ir)
          for (unsigned i = 0; i < const_offset->type->vector_elements; i++)
             instr->const_offset[i] = const_offset->value.i[i];
       } else {
-         instr->src[src_number].src = evaluate_rvalue(ir->offset);
+         instr->src[src_number].src =
+            nir_src_for_ssa(evaluate_rvalue(ir->offset));
          instr->src[src_number].src_type = nir_tex_src_offset;
          src_number++;
       }
@@ -2040,7 +1886,8 @@ nir_visitor::visit(ir_texture *ir)
 
    switch (ir->op) {
    case ir_txb:
-      instr->src[src_number].src = evaluate_rvalue(ir->lod_info.bias);
+      instr->src[src_number].src =
+         nir_src_for_ssa(evaluate_rvalue(ir->lod_info.bias));
       instr->src[src_number].src_type = nir_tex_src_bias;
       src_number++;
       break;
@@ -2049,23 +1896,27 @@ nir_visitor::visit(ir_texture *ir)
    case ir_txf:
    case ir_txs:
       if (ir->lod_info.lod != NULL) {
-         instr->src[src_number].src = evaluate_rvalue(ir->lod_info.lod);
+         instr->src[src_number].src =
+            nir_src_for_ssa(evaluate_rvalue(ir->lod_info.lod));
          instr->src[src_number].src_type = nir_tex_src_lod;
          src_number++;
       }
       break;
 
    case ir_txd:
-      instr->src[src_number].src = evaluate_rvalue(ir->lod_info.grad.dPdx);
+      instr->src[src_number].src =
+         nir_src_for_ssa(evaluate_rvalue(ir->lod_info.grad.dPdx));
       instr->src[src_number].src_type = nir_tex_src_ddx;
       src_number++;
-      instr->src[src_number].src = evaluate_rvalue(ir->lod_info.grad.dPdy);
+      instr->src[src_number].src =
+         nir_src_for_ssa(evaluate_rvalue(ir->lod_info.grad.dPdy));
       instr->src[src_number].src_type = nir_tex_src_ddy;
       src_number++;
       break;
 
    case ir_txf_ms:
-      instr->src[src_number].src = evaluate_rvalue(ir->lod_info.sample_index);
+      instr->src[src_number].src =
+         nir_src_for_ssa(evaluate_rvalue(ir->lod_info.sample_index));
       instr->src[src_number].src_type = nir_tex_src_ms_index;
       src_number++;
       break;
@@ -2140,7 +1991,8 @@ nir_visitor::visit(ir_dereference_array *ir)
       deref->base_offset = const_index->value.u[0];
    } else {
       deref->deref_array_type = nir_deref_array_type_indirect;
-      deref->indirect = evaluate_rvalue(ir->array_index);
+      deref->indirect =
+         nir_src_for_ssa(evaluate_rvalue(ir->array_index));
    }
 
    ir->array->accept(this);
@@ -2155,5 +2007,5 @@ nir_visitor::visit(ir_barrier *ir)
 {
    nir_intrinsic_instr *instr =
       nir_intrinsic_instr_create(this->shader, nir_intrinsic_barrier);
-   nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(&b, &instr->instr);
 }
diff --git a/src/glsl/nir/glsl_types.cpp b/src/glsl/nir/glsl_types.cpp
index 80ab3591462..3e9d38f7707 100644
--- a/src/glsl/nir/glsl_types.cpp
+++ b/src/glsl/nir/glsl_types.cpp
@@ -163,6 +163,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].sample = fields[i].sample;
       this->fields.structure[i].matrix_layout = fields[i].matrix_layout;
       this->fields.structure[i].patch = fields[i].patch;
+      this->fields.structure[i].precision = fields[i].precision;
    }
 
    mtx_unlock(&glsl_type::mutex);
@@ -900,6 +901,9 @@ glsl_type::record_compare(const glsl_type *b) const
       if (this->fields.structure[i].image_restrict
           != b->fields.structure[i].image_restrict)
          return false;
+      if (this->fields.structure[i].precision
+          != b->fields.structure[i].precision)
+         return false;
    }
 
    return true;
diff --git a/src/glsl/nir/glsl_types.h b/src/glsl/nir/glsl_types.h
index a85a9e6a6a8..14c2aa49f85 100644
--- a/src/glsl/nir/glsl_types.h
+++ b/src/glsl/nir/glsl_types.h
@@ -103,6 +103,13 @@ enum glsl_matrix_layout {
    GLSL_MATRIX_LAYOUT_ROW_MAJOR
 };
 
+enum {
+   GLSL_PRECISION_NONE = 0,
+   GLSL_PRECISION_HIGH,
+   GLSL_PRECISION_MEDIUM,
+   GLSL_PRECISION_LOW
+};
+
 #ifdef __cplusplus
 #include "GL/gl.h"
 #include "util/ralloc.h"
@@ -330,7 +337,6 @@ struct glsl_type {
     */
    unsigned count_attribute_slots() const;
 
-
    /**
     * Alignment in bytes of the start of this type in a std140 uniform
     * block.
@@ -850,10 +856,9 @@ struct glsl_struct_field {
    unsigned patch:1;
 
    /**
-    * For interface blocks, it has a value if this variable uses multiple vertex
-    * streams (as in ir_variable::stream). -1 otherwise.
+    * Precision qualifier
     */
-   int stream;
+   unsigned precision;
 
    /**
     * Image qualifiers, applicable to buffer variables defined in shader
@@ -868,8 +873,7 @@ struct glsl_struct_field {
 #ifdef __cplusplus
    glsl_struct_field(const struct glsl_type *_type, const char *_name)
       : type(_type), name(_name), location(-1), interpolation(0), centroid(0),
-        sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0),
-        stream(-1)
+        sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0)
    {
       /* empty */
    }
diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index 5f03095d673..bb7a5fa5835 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -302,9 +302,9 @@ nir_function_impl_create(nir_function_overload *overload)
 }
 
 nir_block *
-nir_block_create(void *mem_ctx)
+nir_block_create(nir_shader *shader)
 {
-   nir_block *block = ralloc(mem_ctx, nir_block);
+   nir_block *block = ralloc(shader, nir_block);
 
    cf_init(&block->cf_node, nir_cf_node_block);
 
@@ -330,19 +330,19 @@ src_init(nir_src *src)
 }
 
 nir_if *
-nir_if_create(void *mem_ctx)
+nir_if_create(nir_shader *shader)
 {
-   nir_if *if_stmt = ralloc(mem_ctx, nir_if);
+   nir_if *if_stmt = ralloc(shader, nir_if);
 
    cf_init(&if_stmt->cf_node, nir_cf_node_if);
    src_init(&if_stmt->condition);
 
-   nir_block *then = nir_block_create(mem_ctx);
+   nir_block *then = nir_block_create(shader);
    exec_list_make_empty(&if_stmt->then_list);
    exec_list_push_tail(&if_stmt->then_list, &then->cf_node.node);
    then->cf_node.parent = &if_stmt->cf_node;
 
-   nir_block *else_stmt = nir_block_create(mem_ctx);
+   nir_block *else_stmt = nir_block_create(shader);
    exec_list_make_empty(&if_stmt->else_list);
    exec_list_push_tail(&if_stmt->else_list, &else_stmt->cf_node.node);
    else_stmt->cf_node.parent = &if_stmt->cf_node;
@@ -351,13 +351,13 @@ nir_if_create(void *mem_ctx)
 }
 
 nir_loop *
-nir_loop_create(void *mem_ctx)
+nir_loop_create(nir_shader *shader)
 {
-   nir_loop *loop = ralloc(mem_ctx, nir_loop);
+   nir_loop *loop = ralloc(shader, nir_loop);
 
    cf_init(&loop->cf_node, nir_cf_node_loop);
 
-   nir_block *body = nir_block_create(mem_ctx);
+   nir_block *body = nir_block_create(shader);
    exec_list_make_empty(&loop->body);
    exec_list_push_tail(&loop->body, &body->cf_node.node);
    body->cf_node.parent = &loop->cf_node;
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 9b278d6a767..1215e58dfb8 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -399,10 +399,10 @@ typedef struct {
     */
    bool is_packed;
 
-   /** set of nir_instr's where this register is used (read from) */
+   /** set of nir_src's where this register is used (read from) */
    struct list_head uses;
 
-   /** set of nir_instr's where this register is defined (written to) */
+   /** set of nir_dest's where this register is defined (written to) */
    struct list_head defs;
 
    /** set of nir_if's where this register is used as a condition */
@@ -798,7 +798,7 @@ NIR_DEFINE_CAST(nir_deref_as_var, nir_deref, nir_deref_var, deref)
 NIR_DEFINE_CAST(nir_deref_as_array, nir_deref, nir_deref_array, deref)
 NIR_DEFINE_CAST(nir_deref_as_struct, nir_deref, nir_deref_struct, deref)
 
-/** Returns the tail of a deref chain */
+/* Returns the last deref in the chain. */
 static inline nir_deref *
 nir_deref_tail(nir_deref *deref)
 {
@@ -1332,7 +1332,7 @@ typedef enum {
    nir_metadata_none = 0x0,
    nir_metadata_block_index = 0x1,
    nir_metadata_dominance = 0x2,
-   nir_metadata_live_variables = 0x4,
+   nir_metadata_live_ssa_defs = 0x4,
 } nir_metadata;
 
 typedef struct {
@@ -1504,6 +1504,11 @@ typedef struct nir_shader_info {
    /* Which system values are actually read */
    uint64_t system_values_read;
 
+   /* Which patch inputs are actually read */
+   uint32_t patch_inputs_read;
+   /* Which patch outputs are actually written */
+   uint32_t patch_outputs_written;
+
    /* Whether or not this shader ever uses textureGather() */
    bool uses_texture_gather;
 
@@ -1644,9 +1649,9 @@ nir_function_overload *nir_function_overload_create(nir_function *func);
 
 nir_function_impl *nir_function_impl_create(nir_function_overload *func);
 
-nir_block *nir_block_create(void *mem_ctx);
-nir_if *nir_if_create(void *mem_ctx);
-nir_loop *nir_loop_create(void *mem_ctx);
+nir_block *nir_block_create(nir_shader *shader);
+nir_if *nir_if_create(nir_shader *shader);
+nir_loop *nir_loop_create(nir_shader *shader);
 
 nir_function_impl *nir_cf_node_get_function(nir_cf_node *node);
 
@@ -1957,6 +1962,9 @@ void nir_assign_var_locations(struct exec_list *var_list,
 void nir_lower_io(nir_shader *shader,
                   nir_variable_mode mode,
                   int (*type_size)(const struct glsl_type *));
+nir_src *nir_get_io_indirect_src(nir_intrinsic_instr *instr);
+nir_src *nir_get_io_vertex_index_src(nir_intrinsic_instr *instr);
+
 void nir_lower_vars_to_ssa(nir_shader *shader);
 
 bool nir_remove_dead_variables(nir_shader *shader);
@@ -2024,7 +2032,7 @@ bool nir_lower_gs_intrinsics(nir_shader *shader);
 
 bool nir_normalize_cubemap_coords(nir_shader *shader);
 
-void nir_live_variables_impl(nir_function_impl *impl);
+void nir_live_ssa_defs_impl(nir_function_impl *impl);
 bool nir_ssa_defs_interfere(nir_ssa_def *a, nir_ssa_def *b);
 
 void nir_convert_to_ssa_impl(nir_function_impl *impl);
@@ -2042,12 +2050,10 @@ bool nir_opt_constant_folding(nir_shader *shader);
 
 bool nir_opt_global_to_local(nir_shader *shader);
 
-bool nir_copy_prop_impl(nir_function_impl *impl);
 bool nir_copy_prop(nir_shader *shader);
 
 bool nir_opt_cse(nir_shader *shader);
 
-bool nir_opt_dce_impl(nir_function_impl *impl);
 bool nir_opt_dce(nir_shader *shader);
 
 bool nir_opt_dead_cf(nir_shader *shader);
@@ -2055,7 +2061,6 @@ bool nir_opt_dead_cf(nir_shader *shader);
 void nir_opt_gcm(nir_shader *shader);
 
 bool nir_opt_peephole_select(nir_shader *shader);
-bool nir_opt_peephole_ffma(nir_shader *shader);
 
 bool nir_opt_remove_phis(nir_shader *shader);
 
diff --git a/src/glsl/nir/nir_control_flow.c b/src/glsl/nir/nir_control_flow.c
index 7f51c4faf49..96395a41615 100644
--- a/src/glsl/nir/nir_control_flow.c
+++ b/src/glsl/nir/nir_control_flow.c
@@ -452,6 +452,9 @@ split_block_cursor(nir_cursor cursor,
          before = split_block_before_instr(nir_instr_next(cursor.instr));
       }
       break;
+
+   default:
+      unreachable("not reached");
    }
 
    if (_before)
diff --git a/src/glsl/nir/nir_from_ssa.c b/src/glsl/nir/nir_from_ssa.c
index eaf883dbaa0..f2797f72c8e 100644
--- a/src/glsl/nir/nir_from_ssa.c
+++ b/src/glsl/nir/nir_from_ssa.c
@@ -777,7 +777,7 @@ nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only)
    nir_metadata_preserve(impl, nir_metadata_block_index |
                                nir_metadata_dominance);
 
-   nir_metadata_require(impl, nir_metadata_live_variables |
+   nir_metadata_require(impl, nir_metadata_live_ssa_defs |
                               nir_metadata_dominance);
 
    nir_foreach_block(impl, coalesce_phi_nodes_block, &state);
diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h
index 9fd91de157f..0a134aff211 100644
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -91,6 +91,17 @@ BARRIER(memory_barrier)
  */
 INTRINSIC(shader_clock, 0, ARR(), true, 1, 0, 0, NIR_INTRINSIC_CAN_ELIMINATE)
 
+/*
+ * Memory barrier with semantics analogous to the compute shader
+ * groupMemoryBarrier(), memoryBarrierAtomicCounter(), memoryBarrierBuffer(),
+ * memoryBarrierImage() and memoryBarrierShared() GLSL intrinsics.
+ */
+BARRIER(group_memory_barrier)
+BARRIER(memory_barrier_atomic_counter)
+BARRIER(memory_barrier_buffer)
+BARRIER(memory_barrier_image)
+BARRIER(memory_barrier_shared)
+
 /** A conditional discard, with a single boolean source. */
 INTRINSIC(discard_if, 1, ARR(1), false, 0, 0, 0, 0)
 
@@ -264,6 +275,8 @@ LOAD(ubo, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 LOAD(input, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 LOAD(per_vertex_input, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 LOAD(ssbo, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE)
+LOAD(output, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE)
+LOAD(per_vertex_output, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE)
 LOAD(push_constant, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
 /*
@@ -282,6 +295,7 @@ LOAD(push_constant, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDE
              false, 0, 0, 1 + extra_indices, flags)
 
 STORE(output, 0, 0, 0, 0)
+STORE(per_vertex_output, 1, 1, 0, 0)
 STORE(ssbo, 1, 1, 1, 0)
 
 LAST_INTRINSIC(store_ssbo_indirect)
diff --git a/src/glsl/nir/nir_live_variables.c b/src/glsl/nir/nir_liveness.c
index 1c96dcf36c5..05f79d7bc61 100644
--- a/src/glsl/nir/nir_live_variables.c
+++ b/src/glsl/nir/nir_liveness.c
@@ -42,7 +42,7 @@
  * block but not in the live-in of the block containing the phi node.
  */
 
-struct live_variables_state {
+struct live_ssa_defs_state {
    unsigned num_ssa_defs;
    unsigned bitset_words;
 
@@ -52,7 +52,7 @@ struct live_variables_state {
 static bool
 index_ssa_def(nir_ssa_def *def, void *void_state)
 {
-   struct live_variables_state *state = void_state;
+   struct live_ssa_defs_state *state = void_state;
 
    if (def->parent_instr->type == nir_instr_type_ssa_undef)
       def->live_index = 0;
@@ -77,7 +77,7 @@ index_ssa_definitions_block(nir_block *block, void *state)
 static bool
 init_liveness_block(nir_block *block, void *void_state)
 {
-   struct live_variables_state *state = void_state;
+   struct live_ssa_defs_state *state = void_state;
 
    block->live_in = reralloc(block, block->live_in, BITSET_WORD,
                              state->bitset_words);
@@ -129,7 +129,7 @@ set_ssa_def_dead(nir_ssa_def *def, void *void_live)
  */
 static bool
 propagate_across_edge(nir_block *pred, nir_block *succ,
-                      struct live_variables_state *state)
+                      struct live_ssa_defs_state *state)
 {
    NIR_VLA(BITSET_WORD, live, state->bitset_words);
    memcpy(live, succ->live_in, state->bitset_words * sizeof *live);
@@ -165,9 +165,9 @@ propagate_across_edge(nir_block *pred, nir_block *succ,
 }
 
 void
-nir_live_variables_impl(nir_function_impl *impl)
+nir_live_ssa_defs_impl(nir_function_impl *impl)
 {
-   struct live_variables_state state;
+   struct live_ssa_defs_state state;
 
    /* We start at 1 because we reserve the index value of 0 for ssa_undef
     * instructions.  Those are never live, so their liveness information
diff --git a/src/glsl/nir/nir_lower_global_vars_to_local.c b/src/glsl/nir/nir_lower_global_vars_to_local.c
index fab236611a5..d549ee79bb4 100644
--- a/src/glsl/nir/nir_lower_global_vars_to_local.c
+++ b/src/glsl/nir/nir_lower_global_vars_to_local.c
@@ -100,6 +100,9 @@ nir_lower_global_vars_to_local(nir_shader *shader)
          exec_node_remove(&var->node);
          var->data.mode = nir_var_local;
          exec_list_push_tail(&impl->locals, &var->node);
+         nir_metadata_preserve(impl, nir_metadata_block_index |
+                                     nir_metadata_dominance |
+                                     nir_metadata_live_ssa_defs);
          progress = true;
       }
    }
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index 688b48f4675..00a31458310 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -68,10 +68,22 @@ nir_assign_var_locations(struct exec_list *var_list, unsigned *size,
  * by a vertex number (such as geometry shader inputs).
  */
 static bool
-stage_uses_per_vertex_inputs(struct lower_io_state *state)
+is_per_vertex_input(struct lower_io_state *state, nir_variable *var)
 {
    gl_shader_stage stage = state->builder.shader->stage;
-   return stage == MESA_SHADER_GEOMETRY;
+
+   return var->data.mode == nir_var_shader_in && !var->data.patch &&
+          (stage == MESA_SHADER_TESS_CTRL ||
+           stage == MESA_SHADER_TESS_EVAL ||
+           stage == MESA_SHADER_GEOMETRY);
+}
+
+static bool
+is_per_vertex_output(struct lower_io_state *state, nir_variable *var)
+{
+   gl_shader_stage stage = state->builder.shader->stage;
+   return var->data.mode == nir_var_shader_out && !var->data.patch &&
+          stage == MESA_SHADER_TESS_CTRL;
 }
 
 static unsigned
@@ -149,6 +161,15 @@ load_op(struct lower_io_state *state,
                              nir_intrinsic_load_input;
       }
       break;
+   case nir_var_shader_out:
+      if (per_vertex) {
+         op = has_indirect ? nir_intrinsic_load_per_vertex_output_indirect :
+                             nir_intrinsic_load_per_vertex_output;
+      } else {
+         op = has_indirect ? nir_intrinsic_load_output_indirect :
+                             nir_intrinsic_load_output;
+      }
+      break;
    case nir_var_uniform:
       op = has_indirect ? nir_intrinsic_load_uniform_indirect :
                           nir_intrinsic_load_uniform;
@@ -179,13 +200,16 @@ nir_lower_io_block(nir_block *block, void *void_state)
       if (state->mode != -1 && state->mode != mode)
          continue;
 
+      if (mode != nir_var_shader_in &&
+          mode != nir_var_shader_out &&
+          mode != nir_var_uniform)
+         continue;
+
       switch (intrin->intrinsic) {
       case nir_intrinsic_load_var: {
-         if (mode != nir_var_shader_in && mode != nir_var_uniform)
-            continue;
-
-         bool per_vertex = stage_uses_per_vertex_inputs(state) &&
-                           mode == nir_var_shader_in;
+         bool per_vertex =
+            is_per_vertex_input(state, intrin->variables[0]->var) ||
+            is_per_vertex_output(state, intrin->variables[0]->var);
 
          nir_ssa_def *indirect;
          nir_ssa_def *vertex_index;
@@ -229,20 +253,26 @@ nir_lower_io_block(nir_block *block, void *void_state)
       }
 
       case nir_intrinsic_store_var: {
-         if (intrin->variables[0]->var->data.mode != nir_var_shader_out)
-            continue;
+         assert(mode == nir_var_shader_out);
 
          nir_ssa_def *indirect;
+         nir_ssa_def *vertex_index;
+
+         bool per_vertex =
+            is_per_vertex_output(state, intrin->variables[0]->var);
 
          unsigned offset = get_io_offset(intrin->variables[0], &intrin->instr,
-                                         NULL, &indirect, state);
+                                         per_vertex ? &vertex_index : NULL,
+                                         &indirect, state);
          offset += intrin->variables[0]->var->data.driver_location;
 
          nir_intrinsic_op store_op;
-         if (indirect) {
-            store_op = nir_intrinsic_store_output_indirect;
+         if (per_vertex) {
+            store_op = indirect ? nir_intrinsic_store_per_vertex_output_indirect
+                                : nir_intrinsic_store_per_vertex_output;
          } else {
-            store_op = nir_intrinsic_store_output;
+            store_op = indirect ? nir_intrinsic_store_output_indirect
+                                : nir_intrinsic_store_output;
          }
 
          nir_intrinsic_instr *store = nir_intrinsic_instr_create(state->mem_ctx,
@@ -252,8 +282,11 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
          nir_src_copy(&store->src[0], &intrin->src[0], store);
 
+         if (per_vertex)
+            store->src[1] = nir_src_for_ssa(vertex_index);
+
          if (indirect)
-            store->src[1] = nir_src_for_ssa(indirect);
+            store->src[per_vertex ? 2 : 1] = nir_src_for_ssa(indirect);
 
          nir_instr_insert_before(&intrin->instr, &store->instr);
          nir_instr_remove(&intrin->instr);
@@ -295,3 +328,45 @@ nir_lower_io(nir_shader *shader, nir_variable_mode mode,
          nir_lower_io_impl(overload->impl, mode, type_size);
    }
 }
+
+/**
+ * Return the indirect source for a load/store indirect intrinsic.
+ */
+nir_src *
+nir_get_io_indirect_src(nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_input_indirect:
+   case nir_intrinsic_load_output_indirect:
+   case nir_intrinsic_load_uniform_indirect:
+      return &instr->src[0];
+   case nir_intrinsic_load_per_vertex_input_indirect:
+   case nir_intrinsic_load_per_vertex_output_indirect:
+   case nir_intrinsic_store_output_indirect:
+      return &instr->src[1];
+   case nir_intrinsic_store_per_vertex_output_indirect:
+      return &instr->src[2];
+   default:
+      return NULL;
+   }
+}
+
+/**
+ * Return the vertex index source for a load/store per_vertex intrinsic.
+ */
+nir_src *
+nir_get_io_vertex_index_src(nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_per_vertex_input:
+   case nir_intrinsic_load_per_vertex_output:
+   case nir_intrinsic_load_per_vertex_input_indirect:
+   case nir_intrinsic_load_per_vertex_output_indirect:
+      return &instr->src[0];
+   case nir_intrinsic_store_per_vertex_output:
+   case nir_intrinsic_store_per_vertex_output_indirect:
+      return &instr->src[1];
+   default:
+      return NULL;
+   }
+}
diff --git a/src/glsl/nir/nir_lower_outputs_to_temporaries.c b/src/glsl/nir/nir_lower_outputs_to_temporaries.c
index 80f43951b5c..9441f4762b6 100644
--- a/src/glsl/nir/nir_lower_outputs_to_temporaries.c
+++ b/src/glsl/nir/nir_lower_outputs_to_temporaries.c
@@ -78,6 +78,9 @@ nir_lower_outputs_to_temporaries(nir_shader *shader)
 {
    struct lower_outputs_state state;
 
+   if (shader->stage == MESA_SHADER_TESS_CTRL)
+      return;
+
    state.shader = shader;
    exec_list_move_nodes_to(&shader->outputs, &state.old_outputs);
 
diff --git a/src/glsl/nir/nir_lower_vars_to_ssa.c b/src/glsl/nir/nir_lower_vars_to_ssa.c
index 59715072c15..e670dbdc7e7 100644
--- a/src/glsl/nir/nir_lower_vars_to_ssa.c
+++ b/src/glsl/nir/nir_lower_vars_to_ssa.c
@@ -455,7 +455,8 @@ lower_copies_to_load_store(struct deref_node *node,
          struct deref_node *arg_node =
             get_deref_node(copy->variables[i], state);
 
-         if (arg_node == NULL)
+         /* Only bother removing copy entries for other nodes */
+         if (arg_node == NULL || arg_node == node)
             continue;
 
          struct set_entry *arg_entry = _mesa_set_search(arg_node->copies, copy);
@@ -466,6 +467,8 @@ lower_copies_to_load_store(struct deref_node *node,
       nir_instr_remove(&copy->instr);
    }
 
+   node->copies = NULL;
+
    return true;
 }
 
@@ -876,10 +879,6 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
    state.add_to_direct_deref_nodes = true;
    nir_foreach_block(impl, register_variable_uses_block, &state);
 
-   struct set *outputs = _mesa_set_create(state.dead_ctx,
-                                          _mesa_hash_pointer,
-                                          _mesa_key_pointer_equal);
-
    bool progress = false;
 
    nir_metadata_require(impl, nir_metadata_block_index);
@@ -913,9 +912,6 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
          def_stack_push(node, &load->def, &state);
       }
 
-      if (deref->var->data.mode == nir_var_shader_out)
-         _mesa_set_add(outputs, node);
-
       foreach_deref_node_match(deref, lower_copies_to_load_store, &state);
    }
 
diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c
index c08b721dae4..736a66c8639 100644
--- a/src/glsl/nir/nir_lower_vec_to_movs.c
+++ b/src/glsl/nir/nir_lower_vec_to_movs.c
@@ -288,6 +288,11 @@ nir_lower_vec_to_movs_impl(nir_function_impl *impl)
 
    nir_foreach_block(impl, lower_vec_to_movs_block, &state);
 
+   if (state.progress) {
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+   }
+
    return state.progress;
 }
 
diff --git a/src/glsl/nir/nir_metadata.c b/src/glsl/nir/nir_metadata.c
index a03e12456a1..6de981f430f 100644
--- a/src/glsl/nir/nir_metadata.c
+++ b/src/glsl/nir/nir_metadata.c
@@ -39,8 +39,8 @@ nir_metadata_require(nir_function_impl *impl, nir_metadata required)
       nir_index_blocks(impl);
    if (NEEDS_UPDATE(nir_metadata_dominance))
       nir_calc_dominance_impl(impl);
-   if (NEEDS_UPDATE(nir_metadata_live_variables))
-      nir_live_variables_impl(impl);
+   if (NEEDS_UPDATE(nir_metadata_live_ssa_defs))
+      nir_live_ssa_defs_impl(impl);
 
 #undef NEEDS_UPDATE
 
diff --git a/src/glsl/nir/nir_opt_copy_propagate.c b/src/glsl/nir/nir_opt_copy_propagate.c
index 71367d001bb..7d8bdd7f2ca 100644
--- a/src/glsl/nir/nir_opt_copy_propagate.c
+++ b/src/glsl/nir/nir_opt_copy_propagate.c
@@ -256,12 +256,18 @@ copy_prop_block(nir_block *block, void *_state)
    return true;
 }
 
-bool
+static bool
 nir_copy_prop_impl(nir_function_impl *impl)
 {
    bool progress = false;
 
    nir_foreach_block(impl, copy_prop_block, &progress);
+
+   if (progress) {
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+   }
+
    return progress;
 }
 
diff --git a/src/glsl/nir/nir_opt_dce.c b/src/glsl/nir/nir_opt_dce.c
index e0ebdc61c2f..603252825c3 100644
--- a/src/glsl/nir/nir_opt_dce.c
+++ b/src/glsl/nir/nir_opt_dce.c
@@ -145,7 +145,7 @@ delete_block_cb(nir_block *block, void *_state)
    return true;
 }
 
-bool
+static bool
 nir_opt_dce_impl(nir_function_impl *impl)
 {
    struct exec_list *worklist = ralloc(NULL, struct exec_list);
diff --git a/src/glsl/nir/nir_opt_dead_cf.c b/src/glsl/nir/nir_opt_dead_cf.c
index 0d4819b5158..356e926ffe3 100644
--- a/src/glsl/nir/nir_opt_dead_cf.c
+++ b/src/glsl/nir/nir_opt_dead_cf.c
@@ -204,7 +204,7 @@ loop_is_dead(nir_loop *loop)
       return false;
 
    nir_function_impl *impl = nir_cf_node_get_function(&loop->cf_node);
-   nir_metadata_require(impl, nir_metadata_live_variables |
+   nir_metadata_require(impl, nir_metadata_live_ssa_defs |
                               nir_metadata_dominance);
 
    for (nir_block *cur = after->imm_dom; cur != before; cur = cur->imm_dom) {
diff --git a/src/glsl/nir/nir_opt_remove_phis.c b/src/glsl/nir/nir_opt_remove_phis.c
index 5bdf7ef4da7..66d37544115 100644
--- a/src/glsl/nir/nir_opt_remove_phis.c
+++ b/src/glsl/nir/nir_opt_remove_phis.c
@@ -108,6 +108,11 @@ remove_phis_impl(nir_function_impl *impl)
 
    nir_foreach_block(impl, remove_phis_block, &progress);
 
+   if (progress) {
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+   }
+
    return progress;
 }
 
diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c
index 30220c5e48d..f7f5fdf3181 100644
--- a/src/glsl/nir/nir_print.c
+++ b/src/glsl/nir/nir_print.c
@@ -448,8 +448,12 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
    case nir_intrinsic_load_per_vertex_input_indirect:
       var_list = &state->shader->inputs;
       break;
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_output_indirect:
    case nir_intrinsic_store_output:
    case nir_intrinsic_store_output_indirect:
+   case nir_intrinsic_store_per_vertex_output:
+   case nir_intrinsic_store_per_vertex_output_indirect:
       var_list = &state->shader->outputs;
       break;
    default:
diff --git a/src/glsl/nir/nir_remove_dead_variables.c b/src/glsl/nir/nir_remove_dead_variables.c
index d6783e78803..8f0833c7e24 100644
--- a/src/glsl/nir/nir_remove_dead_variables.c
+++ b/src/glsl/nir/nir_remove_dead_variables.c
@@ -126,8 +126,14 @@ nir_remove_dead_variables(nir_shader *shader)
    progress = remove_dead_vars(&shader->globals, live) || progress;
 
    nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         progress = remove_dead_vars(&overload->impl->locals, live) || progress;
+      if (overload->impl) {
+         if (remove_dead_vars(&overload->impl->locals, live)) {
+            nir_metadata_preserve(overload->impl, nir_metadata_block_index |
+                                                  nir_metadata_dominance |
+                                                  nir_metadata_live_ssa_defs);
+            progress = true;
+         }
+      }
    }
 
    _mesa_set_destroy(live, NULL);
diff --git a/src/glsl/nir/nir_split_var_copies.c b/src/glsl/nir/nir_split_var_copies.c
index f583178caf4..bfbef72c1ab 100644
--- a/src/glsl/nir/nir_split_var_copies.c
+++ b/src/glsl/nir/nir_split_var_copies.c
@@ -263,6 +263,11 @@ split_var_copies_impl(nir_function_impl *impl)
 
    ralloc_free(state.dead_ctx);
 
+   if (state.progress) {
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+   }
+
    return state.progress;
 }
 
diff --git a/src/glsl/nir/nir_validate.c b/src/glsl/nir/nir_validate.c
index a42e830fd72..ed374b921fa 100644
--- a/src/glsl/nir/nir_validate.c
+++ b/src/glsl/nir/nir_validate.c
@@ -401,15 +401,18 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
    case nir_intrinsic_load_var: {
       const struct glsl_type *type =
          nir_deref_tail(&instr->variables[0]->deref)->type;
-      assert(glsl_type_is_vector_or_scalar(type));
+      assert(glsl_type_is_vector_or_scalar(type) ||
+             (instr->variables[0]->var->data.mode == nir_var_uniform &&
+              glsl_get_base_type(type) == GLSL_TYPE_SUBROUTINE));
       assert(instr->num_components == glsl_get_vector_elements(type));
-      assert(instr->variables[0]->var->data.mode != nir_var_shader_out);
       break;
    }
    case nir_intrinsic_store_var: {
       const struct glsl_type *type =
          nir_deref_tail(&instr->variables[0]->deref)->type;
-      assert(glsl_type_is_vector_or_scalar(type));
+      assert(glsl_type_is_vector_or_scalar(type) ||
+             (instr->variables[0]->var->data.mode == nir_var_uniform &&
+              glsl_get_base_type(type) == GLSL_TYPE_SUBROUTINE));
       assert(instr->num_components == glsl_get_vector_elements(type));
       assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
              instr->variables[0]->var->data.mode != nir_var_uniform &&
@@ -422,7 +425,6 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
       assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
              instr->variables[0]->var->data.mode != nir_var_uniform &&
              instr->variables[0]->var->data.mode != nir_var_shader_storage);
-      assert(instr->variables[1]->var->data.mode != nir_var_shader_out);
       break;
    default:
       break;
diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h
index d1cf7ca04cc..dd0e0bad806 100644
--- a/src/glsl/nir/shader_enums.h
+++ b/src/glsl/nir/shader_enums.h
@@ -396,6 +396,7 @@ typedef enum
    SYSTEM_VALUE_SAMPLE_ID,
    SYSTEM_VALUE_SAMPLE_POS,
    SYSTEM_VALUE_SAMPLE_MASK_IN,
+   SYSTEM_VALUE_HELPER_INVOCATION,
    /*@}*/
 
    /**
diff --git a/src/glsl/nir/spirv_to_nir.c b/src/glsl/nir/spirv_to_nir.c
index 45964e65b7e..740479f4d20 100644
--- a/src/glsl/nir/spirv_to_nir.c
+++ b/src/glsl/nir/spirv_to_nir.c
@@ -533,7 +533,6 @@ vtn_handle_type(struct vtn_builder *b, SpvOp opcode,
          fields[i].centroid = 0;
          fields[i].sample = 0;
          fields[i].matrix_layout = 2;
-         fields[i].stream = -1;
       }
 
       struct member_decoration_ctx ctx = {
diff --git a/src/glsl/opt_dead_code_local.cpp b/src/glsl/opt_dead_code_local.cpp
index 4770fcff2ea..ee9f22c0373 100644
--- a/src/glsl/opt_dead_code_local.cpp
+++ b/src/glsl/opt_dead_code_local.cpp
@@ -197,6 +197,11 @@ process_assignment(void *ctx, ir_assignment *ir, exec_list *assignments)
 	    if (entry->lhs != var)
 	       continue;
 
+            /* Skip if the assignment we're trying to eliminate isn't a plain
+             * variable deref. */
+            if (entry->ir->lhs->ir_type != ir_type_dereference_variable)
+               continue;
+
 	    int remove = entry->unused & ir->write_mask;
 	    if (debug) {
 	       printf("%s 0x%01x - 0x%01x = 0x%01x\n",
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index 3a95360eda6..7d59c787aed 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -126,8 +126,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
    shProg->NumShaderStorageBlocks = 0;
 
    for (i = 0; i < MESA_SHADER_STAGES; i++) {
-      ralloc_free(shProg->UniformBlockStageIndex[i]);
-      shProg->UniformBlockStageIndex[i] = NULL;
+      ralloc_free(shProg->InterfaceBlockStageIndex[i]);
+      shProg->InterfaceBlockStageIndex[i] = NULL;
    }
 
    ralloc_free(shProg->AtomicBuffers);
@@ -173,7 +173,6 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api)
    ctx->Extensions.OES_standard_derivatives = true;
 
    ctx->Extensions.EXT_shader_integer_mix = true;
-   ctx->Extensions.EXT_texture3D = true;
    ctx->Extensions.EXT_texture_array = true;
 
    ctx->Extensions.NV_texture_rectangle = true;
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index bf20e4801cc..9a777a24c61 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -905,4 +905,13 @@
 
 </category>
 
+<category name="GL_EXT_buffer_storage" number="239">
+    <function name="BufferStorageEXT" alias="BufferStorage" es2="3.1">
+        <param name="target" type="GLenum"/>
+        <param name="size" type="GLsizeiptr"/>
+        <param name="data" type="const GLvoid *"/>
+        <param name="flags" type="GLbitfield"/>
+    </function>
+</category>
+
 </OpenGLAPI>
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index de0e330b7d1..778b92d9892 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -77,6 +77,7 @@ MAIN_FILES = \
 	main/execmem.c \
 	main/extensions.c \
 	main/extensions.h \
+	main/extensions_table.h \
 	main/fbobject.c \
 	main/fbobject.h \
 	main/feedback.c \
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index e27489d6195..0ffcd9c2c3f 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -449,6 +449,16 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
    save->API = ctx->API;
    ctx->API = API_OPENGL_COMPAT;
 
+   /* Mesa's extension helper functions use the current context's API to look up
+    * the version required by an extension as a step in determining whether or
+    * not it has been advertised. Since meta aims to only be restricted by the
+    * driver capability (and not by whether or not an extension has been
+    * advertised), set the helper functions' Version variable to a value that
+    * will make the checks on the context API and version unconditionally pass.
+    */
+   save->ExtensionsVersion = ctx->Extensions.Version;
+   ctx->Extensions.Version = ~0;
+
    /* Pausing transform feedback needs to be done early, or else we won't be
     * able to change other state.
     */
@@ -1222,6 +1232,7 @@ _mesa_meta_end(struct gl_context *ctx)
    ctx->Meta->SaveStackDepth--;
 
    ctx->API = save->API;
+   ctx->Extensions.Version = save->ExtensionsVersion;
 }
 
 
diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h
index 23fa209905d..d4bf0b65524 100644
--- a/src/mesa/drivers/common/meta.h
+++ b/src/mesa/drivers/common/meta.h
@@ -72,6 +72,7 @@ struct save_state
 
    /* Always saved/restored with meta. */
    gl_api API;
+   uint8_t ExtensionsVersion;
 
    /** MESA_META_CLEAR (and others?) */
    struct gl_query_object *CurrentOcclusionObject;
@@ -285,9 +286,11 @@ enum blit_msaa_shader {
    BLIT_2X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE,
    BLIT_4X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE,
    BLIT_8X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE,
+   BLIT_16X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE,
    BLIT_2X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE,
    BLIT_4X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE,
    BLIT_8X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE,
+   BLIT_16X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE,
    BLIT_MSAA_SHADER_COUNT,
 };
 
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index 5972a5af0c9..4a2444af0f9 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -72,20 +72,25 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
    char *sample_map_expr = rzalloc_size(mem_ctx, 1);
    char *texel_fetch_macro = rzalloc_size(mem_ctx, 1);
    const char *sampler_array_suffix = "";
-   float y_scale;
+   float x_scale, y_scale;
    enum blit_msaa_shader shader_index;
 
    assert(src_rb);
    samples = MAX2(src_rb->NumSamples, 1);
-   y_scale = samples * 0.5;
+
+   if (samples == 16)
+      x_scale = 4.0;
+   else
+      x_scale = 2.0;
+   y_scale = samples / x_scale;
 
    /* We expect only power of 2 samples in source multisample buffer. */
    assert(samples > 0 && _mesa_is_pow_two(samples));
    while (samples >> (shader_offset + 1)) {
       shader_offset++;
    }
-   /* Update the assert if we plan to support more than 8X MSAA. */
-   assert(shader_offset > 0 && shader_offset < 4);
+   /* Update the assert if we plan to support more than 16X MSAA. */
+   assert(shader_offset > 0 && shader_offset <= 4);
 
    assert(target == GL_TEXTURE_2D_MULTISAMPLE ||
           target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY);
@@ -129,6 +134,10 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
       sample_number =  "sample_map[int(2 * fract(coord.x) + 8 * fract(coord.y))]";
       sample_map = ctx->Const.SampleMap8x;
       break;
+   case 16:
+      sample_number =  "sample_map[int(4 * fract(coord.x) + 16 * fract(coord.y))]";
+      sample_map = ctx->Const.SampleMap16x;
+      break;
    default:
       sample_number = NULL;
       sample_map = NULL;
@@ -184,9 +193,9 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
                                "{\n"
                                "%s"
                                "   vec2 interp;\n"
-                               "   const vec2 scale = vec2(2.0f, %ff);\n"
-                               "   const vec2 scale_inv = vec2(0.5f, %ff);\n"
-                               "   const vec2 s_0_offset = vec2(0.25f, %ff);\n"
+                               "   const vec2 scale = vec2(%ff, %ff);\n"
+                               "   const vec2 scale_inv = vec2(%ff, %ff);\n"
+                               "   const vec2 s_0_offset = vec2(%ff, %ff);\n"
                                "   vec2 s_0_coord, s_1_coord, s_2_coord, s_3_coord;\n"
                                "   vec4 s_0_color, s_1_color, s_2_color, s_3_color;\n"
                                "   vec4 x_0_color, x_1_color;\n"
@@ -219,9 +228,9 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
                                "}\n",
                                sampler_array_suffix,
                                sample_map_expr,
-                               y_scale,
-                               1.0f / y_scale,
-                               1.0f / samples,
+                               x_scale, y_scale,
+                               1.0f / x_scale, 1.0f / y_scale,
+                               0.5f / x_scale, 0.5f / y_scale,
                                texel_fetch_macro);
 
    _mesa_meta_compile_and_link_program(ctx, vs_source, fs_source, name,
@@ -348,17 +357,17 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
        shader_index == BLIT_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_DEPTH_COPY ||
        shader_index == BLIT_MSAA_SHADER_2D_MULTISAMPLE_DEPTH_COPY) {
       char *sample_index;
-      const char *arb_sample_shading_extension_string;
+      const char *tex_coords = "texCoords";
 
       if (dst_is_msaa) {
-         arb_sample_shading_extension_string = "#extension GL_ARB_sample_shading : enable";
          sample_index = "gl_SampleID";
          name = "depth MSAA copy";
+
+         if (ctx->Extensions.ARB_gpu_shader5 && samples >= 16) {
+            /* See comment below for the color copy */
+            tex_coords = "interpolateAtOffset(texCoords, vec2(0.0))";
+         }
       } else {
-         /* Don't need that extension, since we're drawing to a single-sampled
-          * destination.
-          */
-         arb_sample_shading_extension_string = "";
          /* From the GL 4.3 spec:
           *
           *     "If there is a multisample buffer (the value of SAMPLE_BUFFERS
@@ -388,34 +397,59 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
       fs_source = ralloc_asprintf(mem_ctx,
                                   "#version 130\n"
                                   "#extension GL_ARB_texture_multisample : enable\n"
-                                  "%s\n"
+                                  "#extension GL_ARB_sample_shading : enable\n"
+                                  "#extension GL_ARB_gpu_shader5 : enable\n"
                                   "uniform sampler2DMS%s texSampler;\n"
                                   "in %s texCoords;\n"
                                   "out vec4 out_color;\n"
                                   "\n"
                                   "void main()\n"
                                   "{\n"
-                                  "   gl_FragDepth = texelFetch(texSampler, i%s(texCoords), %s).r;\n"
+                                  "   gl_FragDepth = texelFetch(texSampler, i%s(%s), %s).r;\n"
                                   "}\n",
-                                  arb_sample_shading_extension_string,
                                   sampler_array_suffix,
                                   texcoord_type,
                                   texcoord_type,
+                                  tex_coords,
                                   sample_index);
    } else {
       /* You can create 2D_MULTISAMPLE textures with 0 sample count (meaning 1
        * sample).  Yes, this is ridiculous.
        */
       char *sample_resolve;
-      const char *arb_sample_shading_extension_string;
       const char *merge_function;
       name = ralloc_asprintf(mem_ctx, "%svec4 MSAA %s",
                              vec4_prefix,
                              dst_is_msaa ? "copy" : "resolve");
 
       if (dst_is_msaa) {
-         arb_sample_shading_extension_string = "#extension GL_ARB_sample_shading : enable";
-         sample_resolve = ralloc_asprintf(mem_ctx, "   out_color = texelFetch(texSampler, i%s(texCoords), gl_SampleID);", texcoord_type);
+         const char *tex_coords;
+
+         if (ctx->Extensions.ARB_gpu_shader5 && samples >= 16) {
+            /* If interpolateAtOffset is available then it will be used to
+             * force the interpolation to the center. This is required at
+             * least on Intel hardware because it is possible to have a sample
+             * position on the 0 x or y axis which means it will lie exactly
+             * on the pixel boundary. If we let the hardware interpolate the
+             * coordinates at one of these positions then it is possible for
+             * it to jump to a neighboring texel when converting to ints due
+             * to rounding errors. This is only done for >= 16x MSAA because
+             * it probably has some overhead. It is more likely that some
+             * hardware will use one of these problematic positions at 16x
+             * MSAA because in that case in D3D they are defined to be at
+             * these positions.
+             */
+            tex_coords = "interpolateAtOffset(texCoords, vec2(0.0))";
+         } else {
+            tex_coords = "texCoords";
+         }
+
+         sample_resolve =
+            ralloc_asprintf(mem_ctx,
+                            "   out_color = texelFetch(texSampler, "
+                            "i%s(%s), gl_SampleID);",
+                            texcoord_type, tex_coords);
+
          merge_function = "";
       } else {
          int i;
@@ -430,8 +464,6 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
                "vec4 merge(vec4 a, vec4 b) { return (a + b); }\n";
          }
 
-         arb_sample_shading_extension_string = "";
-
          /* We're assuming power of two samples for this resolution procedure.
           *
           * To avoid losing any floating point precision if the samples all
@@ -487,7 +519,8 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
       fs_source = ralloc_asprintf(mem_ctx,
                                   "#version 130\n"
                                   "#extension GL_ARB_texture_multisample : enable\n"
-                                  "%s\n"
+                                  "#extension GL_ARB_sample_shading : enable\n"
+                                  "#extension GL_ARB_gpu_shader5 : enable\n"
                                   "#define gvec4 %svec4\n"
                                   "uniform %ssampler2DMS%s texSampler;\n"
                                   "in %s texCoords;\n"
@@ -498,7 +531,6 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
                                   "{\n"
                                   "%s\n" /* sample_resolve */
                                   "}\n",
-                                  arb_sample_shading_extension_string,
                                   vec4_prefix,
                                   vec4_prefix,
                                   sampler_array_suffix,
diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c
index 4800278a467..a9da0a21ba3 100644
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -128,6 +128,8 @@ _mesa_meta_glsl_generate_mipmap_cleanup(struct gen_mipmap_state *mipmap)
    mipmap->VAO = 0;
    _mesa_DeleteBuffers(1, &mipmap->VBO);
    mipmap->VBO = 0;
+   _mesa_DeleteSamplers(1, &mipmap->Sampler);
+   mipmap->Sampler = 0;
 
    _mesa_meta_blit_shader_table_cleanup(&mipmap->shaders);
 }
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index ed2654ef329..595903dd572 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -14,6 +14,7 @@ i965_compiler_FILES = \
 	brw_eu_emit.c \
 	brw_eu.h \
 	brw_eu_util.c \
+	brw_eu_validate.c \
 	brw_fs_builder.h \
 	brw_fs_channel_expressions.cpp \
 	brw_fs_cmod_propagation.cpp \
@@ -46,6 +47,7 @@ i965_compiler_FILES = \
 	brw_nir.h \
 	brw_nir.c \
 	brw_nir_analyze_boolean_resolves.c \
+	brw_nir_opt_peephole_ffma.c \
 	brw_nir_uniforms.cpp \
 	brw_packed_float.c \
 	brw_predicated_break.cpp \
diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c
index 508f1f019ae..d8226e0ca05 100644
--- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
+++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
@@ -88,7 +88,6 @@ reserve_hw_bt_space(struct brw_context *brw, unsigned bytes)
 void
 brw_upload_binding_table(struct brw_context *brw,
                          uint32_t packet_name,
-                         GLbitfield brw_new_binding_table,
                          const struct brw_stage_prog_data *prog_data,
                          struct brw_stage_state *stage_state)
 {
@@ -127,7 +126,7 @@ brw_upload_binding_table(struct brw_context *brw,
       }
    }
 
-   brw->ctx.NewDriverState |= brw_new_binding_table;
+   brw->ctx.NewDriverState |= BRW_NEW_BINDING_TABLE_POINTERS;
 
    if (brw->gen >= 7) {
       if (brw->use_resource_streamer) {
@@ -159,7 +158,7 @@ brw_vs_upload_binding_table(struct brw_context *brw)
    const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
    brw_upload_binding_table(brw,
                             _3DSTATE_BINDING_TABLE_POINTERS_VS,
-                            BRW_NEW_VS_BINDING_TABLE, prog_data,
+                            prog_data,
                             &brw->vs.base);
 }
 
@@ -183,7 +182,7 @@ brw_upload_wm_binding_table(struct brw_context *brw)
    const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
    brw_upload_binding_table(brw,
                             _3DSTATE_BINDING_TABLE_POINTERS_PS,
-                            BRW_NEW_PS_BINDING_TABLE, prog_data,
+                            prog_data,
                             &brw->wm.base);
 }
 
@@ -209,7 +208,7 @@ brw_gs_upload_binding_table(struct brw_context *brw)
    const struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
    brw_upload_binding_table(brw,
                             _3DSTATE_BINDING_TABLE_POINTERS_GS,
-                            BRW_NEW_GS_BINDING_TABLE, prog_data,
+                            prog_data,
                             &brw->gs.base);
 }
 
@@ -406,10 +405,8 @@ const struct brw_tracked_state brw_binding_table_pointers = {
    .dirty = {
       .mesa = 0,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_GS_BINDING_TABLE |
-             BRW_NEW_PS_BINDING_TABLE |
-             BRW_NEW_STATE_BASE_ADDRESS |
-             BRW_NEW_VS_BINDING_TABLE,
+             BRW_NEW_BINDING_TABLE_POINTERS |
+             BRW_NEW_STATE_BASE_ADDRESS,
    },
    .emit = gen4_upload_binding_table_pointers,
 };
@@ -442,10 +439,8 @@ const struct brw_tracked_state gen6_binding_table_pointers = {
    .dirty = {
       .mesa = 0,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_GS_BINDING_TABLE |
-             BRW_NEW_PS_BINDING_TABLE |
-             BRW_NEW_STATE_BASE_ADDRESS |
-             BRW_NEW_VS_BINDING_TABLE,
+             BRW_NEW_BINDING_TABLE_POINTERS |
+             BRW_NEW_STATE_BASE_ADDRESS,
    },
    .emit = gen6_upload_binding_table_pointers,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index e5133ef5a3d..cd78af0dce4 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -146,6 +146,13 @@ struct brw_sampler_prog_key_data {
    uint32_t compressed_multisample_layout_mask;
 
    /**
+    * Whether this sampler is using 16x multisampling. If so fetching from
+    * this sampler will be handled with a different instruction, ld2dms_w
+    * instead of ld2dms.
+    */
+   uint32_t msaa_16;
+
+   /**
     * For Sandybridge, which shader w/a we need for gather quirks.
     */
    enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS];
@@ -454,6 +461,8 @@ struct brw_vue_map {
    int num_slots;
 };
 
+void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map);
+
 /**
  * Convert a VUE slot number into a byte offset within the VUE.
  */
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 3b125448e14..ac6045dbba9 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -84,6 +84,12 @@ brw_query_samples_for_format(struct gl_context *ctx, GLenum target,
 
    switch (brw->gen) {
    case 9:
+      samples[0] = 16;
+      samples[1] = 8;
+      samples[2] = 4;
+      samples[3] = 2;
+      return 4;
+
    case 8:
       samples[0] = 8;
       samples[1] = 4;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index c83f47bdff7..4b2db61c758 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -184,9 +184,7 @@ enum brw_state_id {
    BRW_STATE_CONTEXT,
    BRW_STATE_PSP,
    BRW_STATE_SURFACES,
-   BRW_STATE_VS_BINDING_TABLE,
-   BRW_STATE_GS_BINDING_TABLE,
-   BRW_STATE_PS_BINDING_TABLE,
+   BRW_STATE_BINDING_TABLE_POINTERS,
    BRW_STATE_INDICES,
    BRW_STATE_VERTICES,
    BRW_STATE_BATCH,
@@ -261,9 +259,7 @@ enum brw_state_id {
 #define BRW_NEW_CONTEXT                 (1ull << BRW_STATE_CONTEXT)
 #define BRW_NEW_PSP                     (1ull << BRW_STATE_PSP)
 #define BRW_NEW_SURFACES                (1ull << BRW_STATE_SURFACES)
-#define BRW_NEW_VS_BINDING_TABLE        (1ull << BRW_STATE_VS_BINDING_TABLE)
-#define BRW_NEW_GS_BINDING_TABLE        (1ull << BRW_STATE_GS_BINDING_TABLE)
-#define BRW_NEW_PS_BINDING_TABLE        (1ull << BRW_STATE_PS_BINDING_TABLE)
+#define BRW_NEW_BINDING_TABLE_POINTERS  (1ull << BRW_STATE_BINDING_TABLE_POINTERS)
 #define BRW_NEW_INDICES                 (1ull << BRW_STATE_INDICES)
 #define BRW_NEW_VERTICES                (1ull << BRW_STATE_VERTICES)
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 754da9fc3da..3ad90da8b2f 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -79,7 +79,9 @@
 #define _3DPRIM_LINESTRIP_BF      0x13
 #define _3DPRIM_LINESTRIP_CONT_BF 0x14
 #define _3DPRIM_TRIFAN_NOSTIPPLE  0x16
-#endif
+#define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); })
+
+#endif /* bdw_pack.h */
 
 /* We use this offset to be able to pass native primitive types in struct
  * _mesa_prim::mode.  Native primitive types are BRW_PRIM_OFFSET +
@@ -840,43 +842,62 @@ enum PACKED brw_horizontal_stride {
 
 enum opcode {
    /* These are the actual hardware opcodes. */
+   BRW_OPCODE_ILLEGAL = 0,
    BRW_OPCODE_MOV =	1,
    BRW_OPCODE_SEL =	2,
+   BRW_OPCODE_MOVI =	3,   /**< G45+ */
    BRW_OPCODE_NOT =	4,
    BRW_OPCODE_AND =	5,
    BRW_OPCODE_OR =	6,
    BRW_OPCODE_XOR =	7,
    BRW_OPCODE_SHR =	8,
    BRW_OPCODE_SHL =	9,
+   // BRW_OPCODE_DIM =	10,  /**< Gen7.5 only */ /* Reused */
+   // BRW_OPCODE_SMOV =	10,  /**< Gen8+       */ /* Reused */
+   /* Reserved - 11 */
    BRW_OPCODE_ASR =	12,
+   /* Reserved - 13-15 */
    BRW_OPCODE_CMP =	16,
    BRW_OPCODE_CMPN =	17,
    BRW_OPCODE_CSEL =	18,  /**< Gen8+ */
    BRW_OPCODE_F32TO16 = 19,  /**< Gen7 only */
    BRW_OPCODE_F16TO32 = 20,  /**< Gen7 only */
+   /* Reserved - 21-22 */
    BRW_OPCODE_BFREV =	23,  /**< Gen7+ */
    BRW_OPCODE_BFE =	24,  /**< Gen7+ */
    BRW_OPCODE_BFI1 =	25,  /**< Gen7+ */
    BRW_OPCODE_BFI2 =	26,  /**< Gen7+ */
+   /* Reserved - 27-31 */
    BRW_OPCODE_JMPI =	32,
+   // BRW_OPCODE_BRD =	33,  /**< Gen7+ */
    BRW_OPCODE_IF =	34,
-   BRW_OPCODE_IFF =	35,  /**< Pre-Gen6 */
+   BRW_OPCODE_IFF =	35,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_BRC =	35,  /**< Gen7+       */ /* Reused */
    BRW_OPCODE_ELSE =	36,
    BRW_OPCODE_ENDIF =	37,
-   BRW_OPCODE_DO =	38,
+   BRW_OPCODE_DO =	38,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_CASE =	38,  /**< Gen6 only   */ /* Reused */
    BRW_OPCODE_WHILE =	39,
    BRW_OPCODE_BREAK =	40,
    BRW_OPCODE_CONTINUE = 41,
    BRW_OPCODE_HALT =	42,
-   BRW_OPCODE_MSAVE =	44,  /**< Pre-Gen6 */
-   BRW_OPCODE_MRESTORE = 45, /**< Pre-Gen6 */
-   BRW_OPCODE_PUSH =	46,  /**< Pre-Gen6 */
-   BRW_OPCODE_GOTO =	46,  /**< Gen8+    */
-   BRW_OPCODE_POP =	47,  /**< Pre-Gen6 */
+   // BRW_OPCODE_CALLA =	43,  /**< Gen7.5+     */
+   // BRW_OPCODE_MSAVE =	44,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_CALL =	44,  /**< Gen6+       */ /* Reused */
+   // BRW_OPCODE_MREST =	45,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_RET =	45,  /**< Gen6+       */ /* Reused */
+   // BRW_OPCODE_PUSH =	46,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_FORK =	46,  /**< Gen6 only   */ /* Reused */
+   // BRW_OPCODE_GOTO =	46,  /**< Gen8+       */ /* Reused */
+   // BRW_OPCODE_POP =	47,  /**< Pre-Gen6    */
    BRW_OPCODE_WAIT =	48,
    BRW_OPCODE_SEND =	49,
    BRW_OPCODE_SENDC =	50,
+   BRW_OPCODE_SENDS =	51,  /**< Gen9+ */
+   BRW_OPCODE_SENDSC =	52,  /**< Gen9+ */
+   /* Reserved 53-55 */
    BRW_OPCODE_MATH =	56,  /**< Gen6+ */
+   /* Reserved 57-63 */
    BRW_OPCODE_ADD =	64,
    BRW_OPCODE_MUL =	65,
    BRW_OPCODE_AVG =	66,
@@ -895,16 +916,21 @@ enum opcode {
    BRW_OPCODE_SUBB =	79,  /**< Gen7+ */
    BRW_OPCODE_SAD2 =	80,
    BRW_OPCODE_SADA2 =	81,
+   /* Reserved 82-83 */
    BRW_OPCODE_DP4 =	84,
    BRW_OPCODE_DPH =	85,
    BRW_OPCODE_DP3 =	86,
    BRW_OPCODE_DP2 =	87,
+   /* Reserved 88 */
    BRW_OPCODE_LINE =	89,
    BRW_OPCODE_PLN =	90,  /**< G45+ */
    BRW_OPCODE_MAD =	91,  /**< Gen6+ */
    BRW_OPCODE_LRP =	92,  /**< Gen6+ */
+   // BRW_OPCODE_MADM =	93,  /**< Gen8+ */
+   /* Reserved 94-124 */
    BRW_OPCODE_NENOP =	125, /**< G45 only */
    BRW_OPCODE_NOP =	126,
+   /* Reserved 127 */
 
    /* These are compiler backend opcodes that get translated into other
     * instructions.
@@ -966,6 +992,8 @@ enum opcode {
    FS_OPCODE_TXB_LOGICAL,
    SHADER_OPCODE_TXF_CMS,
    SHADER_OPCODE_TXF_CMS_LOGICAL,
+   SHADER_OPCODE_TXF_CMS_W,
+   SHADER_OPCODE_TXF_CMS_W_LOGICAL,
    SHADER_OPCODE_TXF_UMS,
    SHADER_OPCODE_TXF_UMS_LOGICAL,
    SHADER_OPCODE_TXF_MCS,
@@ -1029,13 +1057,10 @@ enum opcode {
    SHADER_OPCODE_GEN7_SCRATCH_READ,
 
    /**
-    * Gen8+ SIMD8 URB Read message.
-    *
-    * Source 0: The header register, containing URB handles (g1).
-    *
-    * Currently only supports constant offsets, in inst->offset.
+    * Gen8+ SIMD8 URB Read messages.
     */
    SHADER_OPCODE_URB_READ_SIMD8,
+   SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT,
 
    SHADER_OPCODE_URB_WRITE_SIMD8,
    SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT,
@@ -1373,10 +1398,23 @@ enum PACKED brw_predicate {
    BRW_PREDICATE_ALIGN16_ALL4H       =  7,
 };
 
-#define BRW_ARCHITECTURE_REGISTER_FILE    0
-#define BRW_GENERAL_REGISTER_FILE         1
-#define BRW_MESSAGE_REGISTER_FILE         2
-#define BRW_IMMEDIATE_VALUE               3
+enum PACKED brw_reg_file {
+   BRW_ARCHITECTURE_REGISTER_FILE = 0,
+   BRW_GENERAL_REGISTER_FILE      = 1,
+   BRW_MESSAGE_REGISTER_FILE      = 2,
+   BRW_IMMEDIATE_VALUE            = 3,
+
+   ARF = BRW_ARCHITECTURE_REGISTER_FILE,
+   FIXED_GRF = BRW_GENERAL_REGISTER_FILE,
+   MRF = BRW_MESSAGE_REGISTER_FILE,
+   IMM = BRW_IMMEDIATE_VALUE,
+
+   /* These are not hardware values */
+   VGRF,
+   ATTR,
+   UNIFORM, /* prog_data->params[reg] */
+   BAD_FILE,
+};
 
 #define BRW_HW_REG_TYPE_UD  0
 #define BRW_HW_REG_TYPE_D   1
@@ -1541,6 +1579,7 @@ enum brw_message_target {
 #define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO   17
 #define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C 18
 #define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W     28
 #define GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS       29
 #define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS       30
 #define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS       31
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 6372fb5c55f..541c7958d5e 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -337,6 +337,15 @@ static const struct brw_device_info brw_device_info_skl_gt3 = {
 
 static const struct brw_device_info brw_device_info_skl_gt4 = {
    GEN9_FEATURES, .gt = 4,
+   /* From the "L3 Allocation and Programming" documentation:
+    *
+    * "URB is limited to 1008KB due to programming restrictions.  This is not a
+    * restriction of the L3 implementation, but of the FF and other clients.
+    * Therefore, in a GT4 implementation it is possible for the programmed
+    * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
+    * only 1008KB of this will be used."
+    */
+   .urb.size = 1008 / 3,
 };
 
 static const struct brw_device_info brw_device_info_bxt = {
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index df747107188..650bdeea344 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -34,6 +34,7 @@
 
 const struct opcode_desc opcode_descs[128] = {
    [BRW_OPCODE_MOV]      = { .name = "mov",     .nsrc = 1, .ndst = 1 },
+   [BRW_OPCODE_MOVI]     = { .name = "movi",    .nsrc = 2, .ndst = 1 },
    [BRW_OPCODE_FRC]      = { .name = "frc",     .nsrc = 1, .ndst = 1 },
    [BRW_OPCODE_RNDU]     = { .name = "rndu",    .nsrc = 1, .ndst = 1 },
    [BRW_OPCODE_RNDD]     = { .name = "rndd",    .nsrc = 1, .ndst = 1 },
@@ -83,23 +84,26 @@ const struct opcode_desc opcode_descs[128] = {
 
    [BRW_OPCODE_SEND]     = { .name = "send",    .nsrc = 1, .ndst = 1 },
    [BRW_OPCODE_SENDC]    = { .name = "sendc",   .nsrc = 1, .ndst = 1 },
+   [BRW_OPCODE_SENDS]    = { .name = "sends",   .nsrc = 2, .ndst = 1 },
+   [BRW_OPCODE_SENDSC]   = { .name = "sendsc",  .nsrc = 2, .ndst = 1 },
+   [BRW_OPCODE_ILLEGAL]  = { .name = "illegal", .nsrc = 0, .ndst = 0 },
    [BRW_OPCODE_NOP]      = { .name = "nop",     .nsrc = 0, .ndst = 0 },
    [BRW_OPCODE_NENOP]    = { .name = "nenop",   .nsrc = 0, .ndst = 0 },
    [BRW_OPCODE_JMPI]     = { .name = "jmpi",    .nsrc = 0, .ndst = 0 },
-   [BRW_OPCODE_IF]       = { .name = "if",      .nsrc = 2, .ndst = 0 },
-   [BRW_OPCODE_IFF]      = { .name = "iff",     .nsrc = 2, .ndst = 1 },
-   [BRW_OPCODE_WHILE]    = { .name = "while",   .nsrc = 2, .ndst = 0 },
-   [BRW_OPCODE_ELSE]     = { .name = "else",    .nsrc = 2, .ndst = 0 },
-   [BRW_OPCODE_BREAK]    = { .name = "break",   .nsrc = 2, .ndst = 0 },
-   [BRW_OPCODE_CONTINUE] = { .name = "cont",    .nsrc = 1, .ndst = 0 },
-   [BRW_OPCODE_HALT]     = { .name = "halt",    .nsrc = 1, .ndst = 0 },
-   [BRW_OPCODE_MSAVE]    = { .name = "msave",   .nsrc = 1, .ndst = 1 },
-   [BRW_OPCODE_PUSH]     = { .name = "push",    .nsrc = 1, .ndst = 1 },
-   [BRW_OPCODE_MRESTORE] = { .name = "mrest",   .nsrc = 1, .ndst = 1 },
-   [BRW_OPCODE_POP]      = { .name = "pop",     .nsrc = 2, .ndst = 0 },
+   [BRW_OPCODE_IF]       = { .name = "if",      .nsrc = 0, .ndst = 0 },
+   [BRW_OPCODE_IFF]      = { .name = "iff",     .nsrc = 0, .ndst = 0 },
+   [BRW_OPCODE_WHILE]    = { .name = "while",   .nsrc = 0, .ndst = 0 },
+   [BRW_OPCODE_ELSE]     = { .name = "else",    .nsrc = 0, .ndst = 0 },
+   [BRW_OPCODE_BREAK]    = { .name = "break",   .nsrc = 0, .ndst = 0 },
+   [BRW_OPCODE_CONTINUE] = { .name = "cont",    .nsrc = 0, .ndst = 0 },
+   [BRW_OPCODE_HALT]     = { .name = "halt",    .nsrc = 0, .ndst = 0 },
+   // [BRW_OPCODE_MSAVE]    = { .name = "msave",   .nsrc = 1, .ndst = 1 },
+   // [BRW_OPCODE_PUSH]     = { .name = "push",    .nsrc = 1, .ndst = 1 },
+   // [BRW_OPCODE_MREST]    = { .name = "mrest",   .nsrc = 1, .ndst = 1 },
+   // [BRW_OPCODE_POP]      = { .name = "pop",     .nsrc = 2, .ndst = 0 },
    [BRW_OPCODE_WAIT]     = { .name = "wait",    .nsrc = 1, .ndst = 0 },
    [BRW_OPCODE_DO]       = { .name = "do",      .nsrc = 0, .ndst = 0 },
-   [BRW_OPCODE_ENDIF]    = { .name = "endif",   .nsrc = 2, .ndst = 0 },
+   [BRW_OPCODE_ENDIF]    = { .name = "endif",   .nsrc = 0, .ndst = 0 },
 };
 
 static bool
@@ -137,8 +141,8 @@ has_branch_ctrl(const struct brw_device_info *devinfo, enum opcode opcode)
       return false;
 
    return opcode == BRW_OPCODE_IF ||
-          opcode == BRW_OPCODE_ELSE ||
-          opcode == BRW_OPCODE_GOTO;
+          opcode == BRW_OPCODE_ELSE;
+          /* opcode == BRW_OPCODE_GOTO; */
 }
 
 static bool
@@ -622,6 +626,7 @@ static const char *const gen5_sampler_msg_type[] = {
    [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO]   = "gather4_po",
    [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C] = "gather4_po_c",
    [HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c",
+   [GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W]     = "ld2dms_w",
    [GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS]       = "ld_mcs",
    [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS]       = "ld2dms",
    [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS]       = "ld2dss",
@@ -720,7 +725,7 @@ reg(FILE *file, unsigned _reg_file, unsigned _reg_nr)
 
    /* Clear the Compr4 instruction compression bit. */
    if (_reg_file == BRW_MESSAGE_REGISTER_FILE)
-      _reg_nr &= ~(1 << 7);
+      _reg_nr &= ~BRW_MRF_COMPR4;
 
    if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) {
       switch (_reg_nr & 0xf0) {
@@ -1644,7 +1649,7 @@ brw_disassemble_inst(FILE *file, const struct brw_device_info *devinfo,
          if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_COMPRESSED &&
              opcode_descs[opcode].ndst > 0 &&
              brw_inst_dst_reg_file(devinfo, inst) == BRW_MESSAGE_REGISTER_FILE &&
-             brw_inst_dst_da_reg_nr(devinfo, inst) & (1 << 7)) {
+             brw_inst_dst_da_reg_nr(devinfo, inst) & BRW_MRF_COMPR4) {
             format(file, " compr4");
          } else {
             err |= control(file, "compression control", compr_ctrl,
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 61683c81b79..a2eaf8fb1e0 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -111,9 +111,16 @@ brw_set_prim(struct brw_context *brw, const struct _mesa_prim *prim)
 static void
 gen6_set_prim(struct brw_context *brw, const struct _mesa_prim *prim)
 {
+   const struct gl_context *ctx = &brw->ctx;
+   uint32_t hw_prim;
+
    DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode));
 
-   const uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode);
+   if (prim->mode == GL_PATCHES)
+      hw_prim = _3DPRIM_PATCHLIST(ctx->TessCtrlProgram.patch_vertices);
+   else
+      hw_prim = get_hw_prim_for_gl_prim(prim->mode);
+
    if (hw_prim != brw->primitive) {
       brw->primitive = hw_prim;
       brw->ctx.NewDriverState |= BRW_NEW_PRIMITIVE;
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 0ac1ad9378b..829e39330f2 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -522,6 +522,10 @@ bool brw_try_compact_instruction(const struct brw_device_info *devinfo,
 void brw_debug_compact_uncompact(const struct brw_device_info *devinfo,
                                  brw_inst *orig, brw_inst *uncompacted);
 
+/* brw_eu_validate.c */
+bool brw_validate_instructions(const struct brw_codegen *p, int start_offset,
+                               struct annotation_info *annotation);
+
 static inline int
 next_offset(const struct brw_device_info *devinfo, void *store, int offset)
 {
@@ -533,6 +537,12 @@ next_offset(const struct brw_device_info *devinfo, void *store, int offset)
       return offset + 16;
 }
 
+static inline bool
+is_3src(enum opcode opcode)
+{
+   return opcode_descs[opcode].nsrc == 3;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c
index 07ace6bfbcb..bca8a84154f 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_compact.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c
@@ -954,13 +954,6 @@ is_compactable_immediate(unsigned imm)
    return imm == 0 || imm == 0xfffff000;
 }
 
-/* Returns whether an opcode takes three sources. */
-static bool
-is_3src(uint32_t op)
-{
-   return opcode_descs[op].nsrc == 3;
-}
-
 /**
  * Tries to compact instruction src into dst.
  *
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index a6fbb542919..da1ddfddb50 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -92,7 +92,7 @@ gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
  */
 unsigned
 brw_reg_type_to_hw_type(const struct brw_device_info *devinfo,
-                        enum brw_reg_type type, unsigned file)
+                        enum brw_reg_type type, enum brw_reg_file file)
 {
    if (file == BRW_IMMEDIATE_VALUE) {
       static const int imm_hw_types[] = {
@@ -147,7 +147,7 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
    const struct brw_device_info *devinfo = p->devinfo;
 
    if (dest.file == BRW_MESSAGE_REGISTER_FILE)
-      assert((dest.nr & ~(1 << 7)) < BRW_MAX_MRF(devinfo->gen));
+      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
    else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
       assert(dest.nr < 128);
 
@@ -169,10 +169,10 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
          brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
       } else {
          brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
-         brw_inst_set_da16_writemask(devinfo, inst, dest.dw1.bits.writemask);
+         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
          if (dest.file == BRW_GENERAL_REGISTER_FILE ||
              dest.file == BRW_MESSAGE_REGISTER_FILE) {
-            assert(dest.dw1.bits.writemask != 0);
+            assert(dest.writemask != 0);
          }
 	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
 	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
@@ -187,13 +187,13 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
        */
       if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
          brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
-                                       dest.dw1.bits.indirect_offset);
+                                       dest.indirect_offset);
 	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
 	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
          brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
       } else {
          brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
-                                        dest.dw1.bits.indirect_offset);
+                                        dest.indirect_offset);
 	 /* even ignored in da16, still need to set as '01' */
          brw_inst_set_dst_hstride(devinfo, inst, 1);
       }
@@ -243,7 +243,7 @@ validate_reg(const struct brw_device_info *devinfo,
     */
    if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
        reg.nr == BRW_ARF_ACCUMULATOR)
-      assert(reg.dw1.bits.swizzle == BRW_SWIZZLE_XYZW);
+      assert(reg.swizzle == BRW_SWIZZLE_XYZW);
 
    assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
    hstride = hstride_for_reg[reg.hstride];
@@ -311,7 +311,7 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
    const struct brw_device_info *devinfo = p->devinfo;
 
    if (reg.file == BRW_MESSAGE_REGISTER_FILE)
-      assert((reg.nr & ~(1 << 7)) < BRW_MAX_MRF(devinfo->gen));
+      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
    else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
       assert(reg.nr < 128);
 
@@ -338,7 +338,7 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
    brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
 
    if (reg.file == BRW_IMMEDIATE_VALUE) {
-      brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud);
+      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
 
       /* The Bspec's section titled "Non-present Operands" claims that if src0
        * is an immediate that src1's type must be the same as that of src0.
@@ -408,9 +408,9 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
          brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
 
          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
-            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset);
+            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
 	 } else {
-            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset);
+            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
 	 }
       }
 
@@ -427,13 +427,13 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
 	 }
       } else {
          brw_inst_set_src0_da16_swiz_x(devinfo, inst,
-            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
          brw_inst_set_src0_da16_swiz_y(devinfo, inst,
-            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
          brw_inst_set_src0_da16_swiz_z(devinfo, inst,
-            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
          brw_inst_set_src0_da16_swiz_w(devinfo, inst,
-            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
 
 	 /* This is an oddity of the fact we're using the same
 	  * descriptions for registers in align_16 as align_1:
@@ -479,7 +479,7 @@ brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
    assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
 
    if (reg.file == BRW_IMMEDIATE_VALUE) {
-      brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud);
+      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
    } else {
       /* This is a hardware restriction, which may or may not be lifted
        * in the future:
@@ -507,13 +507,13 @@ brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
 	 }
       } else {
          brw_inst_set_src1_da16_swiz_x(devinfo, inst,
-            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X));
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
          brw_inst_set_src1_da16_swiz_y(devinfo, inst,
-            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y));
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
          brw_inst_set_src1_da16_swiz_z(devinfo, inst,
-            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z));
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
          brw_inst_set_src1_da16_swiz_w(devinfo, inst,
-            BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W));
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
 
 	 /* This is an oddity of the fact we're using the same
 	  * descriptions for registers in align_16 as align_1:
@@ -848,8 +848,8 @@ static int
 get_3src_subreg_nr(struct brw_reg reg)
 {
    if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
-      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
-      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
+      assert(brw_is_single_value_swizzle(reg.swizzle));
+      return reg.subnr / 4 + BRW_GET_SWZ(reg.swizzle, 0);
    } else {
       return reg.subnr / 4;
    }
@@ -879,12 +879,12 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
    }
    brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
    brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
-   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.dw1.bits.writemask);
+   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);
 
    assert(src0.file == BRW_GENERAL_REGISTER_FILE);
    assert(src0.address_mode == BRW_ADDRESS_DIRECT);
    assert(src0.nr < 128);
-   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.dw1.bits.swizzle);
+   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
    brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
    brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
    brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
@@ -895,7 +895,7 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
    assert(src1.file == BRW_GENERAL_REGISTER_FILE);
    assert(src1.address_mode == BRW_ADDRESS_DIRECT);
    assert(src1.nr < 128);
-   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.dw1.bits.swizzle);
+   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
    brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
    brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
    brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
@@ -906,7 +906,7 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
    assert(src2.file == BRW_GENERAL_REGISTER_FILE);
    assert(src2.address_mode == BRW_ADDRESS_DIRECT);
    assert(src2.nr < 128);
-   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.dw1.bits.swizzle);
+   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
    brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
    brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
    brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
@@ -2426,7 +2426,7 @@ void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
 
    if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
       const int sampler_state_size = 16; /* 16 bytes */
-      uint32_t sampler = sampler_index.dw1.ud;
+      uint32_t sampler = sampler_index.ud;
 
       if (sampler >= 16) {
          assert(devinfo->is_haswell || devinfo->gen >= 8);
@@ -2581,7 +2581,7 @@ brw_send_indirect_surface_message(struct brw_codegen *p,
        */
       insn = brw_AND(p, addr,
                      suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
-                               BRW_GET_SWZ(surface.dw1.bits.swizzle, 0)),
+                               BRW_GET_SWZ(surface.swizzle, 0)),
                      brw_imm_ud(0xff));
 
       brw_pop_insn_state(p);
@@ -3336,7 +3336,7 @@ brw_broadcast(struct brw_codegen *p,
        * We will typically not get here if the optimizer is doing its job, but
        * asserting would be mean.
        */
-      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0;
+      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
       brw_MOV(p, dst,
               (align1 ? stride(suboffset(src, i), 0, 1, 0) :
                stride(suboffset(src, 4 * i), 0, 4, 1)));
diff --git a/src/mesa/drivers/dri/i965/brw_eu_validate.c b/src/mesa/drivers/dri/i965/brw_eu_validate.c
new file mode 100644
index 00000000000..2de2ea1babc
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_eu_validate.c
@@ -0,0 +1,407 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_eu_validate.c
+ *
+ * This file implements a pass that validates shader assembly.
+ */
+
+#include "brw_eu.h"
+
+/* We're going to do lots of string concatenation, so this should help. */
+struct string {
+   char *str;
+   size_t len;
+};
+
+static void
+cat(struct string *dest, const struct string src)
+{
+   dest->str = realloc(dest->str, dest->len + src.len + 1);
+   memcpy(dest->str + dest->len, src.str, src.len);
+   dest->str[dest->len + src.len] = '\0';
+   dest->len = dest->len + src.len;
+}
+#define CAT(dest, src) cat(&dest, (struct string){src, strlen(src)})
+
+#define error(str) "\tERROR: " str "\n"
+
+#define ERROR_IF(cond, msg)          \
+   do {                              \
+      if (cond) {                    \
+         CAT(error_msg, error(msg)); \
+         valid = false;              \
+      }                              \
+   } while(0)
+
+static bool
+src0_is_null(const struct brw_device_info *devinfo, const brw_inst *inst)
+{
+   return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
+          brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
+}
+
+static bool
+src1_is_null(const struct brw_device_info *devinfo, const brw_inst *inst)
+{
+   return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
+          brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
+}
+
+enum gen {
+   GEN4  = (1 << 0),
+   GEN45 = (1 << 1),
+   GEN5  = (1 << 2),
+   GEN6  = (1 << 3),
+   GEN7  = (1 << 4),
+   GEN75 = (1 << 5),
+   GEN8  = (1 << 6),
+   GEN9  = (1 << 7),
+   GEN_ALL = ~0
+};
+
+#define GEN_GE(gen) (~((gen) - 1) | gen)
+#define GEN_LE(gen) (((gen) - 1) | gen)
+
+struct inst_info {
+   enum gen gen;
+};
+
+static const struct inst_info inst_info[128] = {
+   [BRW_OPCODE_ILLEGAL] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MOV] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SEL] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MOVI] = {
+      .gen = GEN_GE(GEN45),
+   },
+   [BRW_OPCODE_NOT] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_AND] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_OR] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_XOR] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SHR] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SHL] = {
+      .gen = GEN_ALL,
+   },
+   /* BRW_OPCODE_DIM / BRW_OPCODE_SMOV */
+   /* Reserved - 11 */
+   [BRW_OPCODE_ASR] = {
+      .gen = GEN_ALL,
+   },
+   /* Reserved - 13-15 */
+   [BRW_OPCODE_CMP] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_CMPN] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_CSEL] = {
+      .gen = GEN_GE(GEN8),
+   },
+   [BRW_OPCODE_F32TO16] = {
+      .gen = GEN7 | GEN75,
+   },
+   [BRW_OPCODE_F16TO32] = {
+      .gen = GEN7 | GEN75,
+   },
+   /* Reserved - 21-22 */
+   [BRW_OPCODE_BFREV] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFE] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFI1] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFI2] = {
+      .gen = GEN_GE(GEN7),
+   },
+   /* Reserved - 27-31 */
+   [BRW_OPCODE_JMPI] = {
+      .gen = GEN_ALL,
+   },
+   /* BRW_OPCODE_BRD */
+   [BRW_OPCODE_IF] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_IFF] = { /* also BRW_OPCODE_BRC */
+      .gen = GEN_LE(GEN5),
+   },
+   [BRW_OPCODE_ELSE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_ENDIF] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DO] = { /* also BRW_OPCODE_CASE */
+      .gen = GEN_LE(GEN5),
+   },
+   [BRW_OPCODE_WHILE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_BREAK] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_CONTINUE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_HALT] = {
+      .gen = GEN_ALL,
+   },
+   /* BRW_OPCODE_CALLA */
+   /* BRW_OPCODE_MSAVE / BRW_OPCODE_CALL */
+   /* BRW_OPCODE_MREST / BRW_OPCODE_RET */
+   /* BRW_OPCODE_PUSH / BRW_OPCODE_FORK / BRW_OPCODE_GOTO */
+   /* BRW_OPCODE_POP */
+   [BRW_OPCODE_WAIT] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SEND] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SENDC] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SENDS] = {
+      .gen = GEN_GE(GEN9),
+   },
+   [BRW_OPCODE_SENDSC] = {
+      .gen = GEN_GE(GEN9),
+   },
+   /* Reserved 53-55 */
+   [BRW_OPCODE_MATH] = {
+      .gen = GEN_GE(GEN6),
+   },
+   /* Reserved 57-63 */
+   [BRW_OPCODE_ADD] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MUL] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_AVG] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_FRC] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDU] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDD] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDZ] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MAC] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_MACH] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_LZD] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_FBH] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_FBL] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_CBIT] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_ADDC] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_SUBB] = {
+      .gen = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_SAD2] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_SADA2] = {
+      .gen = GEN_ALL,
+   },
+   /* Reserved 82-83 */
+   [BRW_OPCODE_DP4] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DPH] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DP3] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_DP2] = {
+      .gen = GEN_ALL,
+   },
+   /* Reserved 88 */
+   [BRW_OPCODE_LINE] = {
+      .gen = GEN_ALL,
+   },
+   [BRW_OPCODE_PLN] = {
+      .gen = GEN_GE(GEN45),
+   },
+   [BRW_OPCODE_MAD] = {
+      .gen = GEN_GE(GEN6),
+   },
+   [BRW_OPCODE_LRP] = {
+      .gen = GEN_GE(GEN6),
+   },
+   /* Reserved 93-124 */
+   /* BRW_OPCODE_NENOP */
+   [BRW_OPCODE_NOP] = {
+      .gen = GEN_ALL,
+   },
+};
+
+static unsigned
+num_sources_from_inst(const struct brw_device_info *devinfo,
+                      const brw_inst *inst)
+{
+   unsigned math_function;
+
+   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MATH) {
+      math_function = brw_inst_math_function(devinfo, inst);
+   } else if (devinfo->gen < 6 &&
+              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND) {
+      if (brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH) {
+         math_function = brw_inst_math_msg_function(devinfo, inst);
+      } else {
+         /* Send instructions are allowed to have null sources since they use
+          * the base_mrf field to specify which message register source.
+          */
+         return 0;
+      }
+   } else {
+      return opcode_descs[brw_inst_opcode(devinfo, inst)].nsrc;
+   }
+
+   switch (math_function) {
+   case BRW_MATH_FUNCTION_INV:
+   case BRW_MATH_FUNCTION_LOG:
+   case BRW_MATH_FUNCTION_EXP:
+   case BRW_MATH_FUNCTION_SQRT:
+   case BRW_MATH_FUNCTION_RSQ:
+   case BRW_MATH_FUNCTION_SIN:
+   case BRW_MATH_FUNCTION_COS:
+   case BRW_MATH_FUNCTION_SINCOS:
+   case GEN8_MATH_FUNCTION_INVM:
+   case GEN8_MATH_FUNCTION_RSQRTM:
+      return 1;
+   case BRW_MATH_FUNCTION_FDIV:
+   case BRW_MATH_FUNCTION_POW:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
+   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
+      return 2;
+   default:
+      unreachable("not reached");
+   }
+}
+
+static enum gen
+gen_from_devinfo(const struct brw_device_info *devinfo)
+{
+   switch (devinfo->gen) {
+   case 4: return devinfo->is_g4x ? GEN45 : GEN4;
+   case 5: return GEN5;
+   case 6: return GEN6;
+   case 7: return devinfo->is_haswell ? GEN75 : GEN7;
+   case 8: return GEN8;
+   case 9: return GEN9;
+   default:
+      unreachable("not reached");
+   }
+}
+
+static bool
+is_unsupported_inst(const struct brw_device_info *devinfo,
+                    const brw_inst *inst)
+{
+   enum gen gen = gen_from_devinfo(devinfo);
+   return (inst_info[brw_inst_opcode(devinfo, inst)].gen & gen) == 0;
+}
+
+bool
+brw_validate_instructions(const struct brw_codegen *p, int start_offset,
+                          struct annotation_info *annotation)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+   const void *store = p->store + start_offset / 16;
+   bool valid = true;
+
+   for (int src_offset = 0; src_offset < p->next_insn_offset - start_offset;
+        src_offset += sizeof(brw_inst)) {
+      struct string error_msg = { .str = NULL, .len = 0 };
+      const brw_inst *inst = store + src_offset;
+
+      switch (num_sources_from_inst(devinfo, inst)) {
+      case 3:
+         /* Nothing to test. 3-src instructions can only have GRF sources, and
+          * there's no bit to control the file.
+          */
+         break;
+      case 2:
+         ERROR_IF(src1_is_null(devinfo, inst), "src1 is null");
+         /* fallthrough */
+      case 1:
+         ERROR_IF(src0_is_null(devinfo, inst), "src0 is null");
+         break;
+      case 0:
+      default:
+         break;
+      }
+
+      ERROR_IF(is_unsupported_inst(devinfo, inst),
+               "Instruction not supported on this Gen");
+
+      if (error_msg.str && annotation) {
+         annotation_insert_error(annotation, src_offset, error_msg.str);
+      }
+      free(error_msg.str);
+   }
+
+   return valid;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c b/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c
index 50bda619f55..830fc6e41df 100644
--- a/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c
@@ -436,7 +436,7 @@ gen6_sol_program(struct brw_ff_gs_compile *c, struct brw_ff_gs_prog_key *key,
             vertex_slot.nr += slot / 2;
             vertex_slot.subnr = (slot % 2) * 16;
             /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */
-            vertex_slot.dw1.bits.swizzle = varying == VARYING_SLOT_PSIZ
+            vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ
                ? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding];
             brw_set_default_access_mode(p, BRW_ALIGN_16);
             brw_MOV(p, stride(c->reg.header, 4, 4, 1),
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index e218a85a363..3bec7285ef5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -43,6 +43,7 @@
 #include "brw_wm.h"
 #include "brw_fs.h"
 #include "brw_cs.h"
+#include "brw_vec4_gs_visitor.h"
 #include "brw_cfg.h"
 #include "brw_dead_control_flow.h"
 #include "main/uniforms.h"
@@ -75,8 +76,9 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 
    /* This will be the case for almost all instructions. */
    switch (dst.file) {
-   case GRF:
-   case HW_REG:
+   case VGRF:
+   case ARF:
+   case FIXED_GRF:
    case MRF:
    case ATTR:
       this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
@@ -203,7 +205,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 
    int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
-   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
+   fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type);
    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
    inst->regs_written = regs_written;
 
@@ -232,7 +234,7 @@ fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
    const fs_builder ubld = bld.annotate("send dependency resolve")
                               .half(0);
 
-   ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
+   ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
 }
 
 bool
@@ -283,14 +285,15 @@ fs_inst::is_send_from_grf() const
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
       return true;
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
-      return src[1].file == GRF;
+      return src[1].file == VGRF;
    case FS_OPCODE_FB_WRITE:
-      return src[0].file == GRF;
+      return src[0].file == VGRF;
    default:
       if (is_tex())
-         return src[0].file == GRF;
+         return src[0].file == VGRF;
 
       return false;
    }
@@ -303,10 +306,10 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
       return false;
 
    fs_reg reg = this->src[0];
-   if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
+   if (reg.file != VGRF || reg.reg_offset != 0 || reg.stride == 0)
       return false;
 
-   if (grf_alloc.sizes[reg.reg] != this->regs_written)
+   if (grf_alloc.sizes[reg.nr] != this->regs_written)
       return false;
 
    for (int i = 0; i < this->sources; i++) {
@@ -378,7 +381,7 @@ fs_reg::fs_reg(float f)
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_F;
    this->stride = 0;
-   this->fixed_hw_reg.dw1.f = f;
+   this->f = f;
 }
 
 /** Immediate value constructor. */
@@ -388,7 +391,7 @@ fs_reg::fs_reg(int32_t i)
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_D;
    this->stride = 0;
-   this->fixed_hw_reg.dw1.d = i;
+   this->d = i;
 }
 
 /** Immediate value constructor. */
@@ -398,7 +401,7 @@ fs_reg::fs_reg(uint32_t u)
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_UD;
    this->stride = 0;
-   this->fixed_hw_reg.dw1.ud = u;
+   this->ud = u;
 }
 
 /** Vector float immediate value constructor. */
@@ -407,7 +410,7 @@ fs_reg::fs_reg(uint8_t vf[4])
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_VF;
-   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
+   memcpy(&this->ud, vf, sizeof(unsigned));
 }
 
 /** Vector float immediate value constructor. */
@@ -416,42 +419,38 @@ fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_VF;
-   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
-                               (vf1 <<  8) |
-                               (vf2 << 16) |
-                               (vf3 << 24);
+   this->ud = (vf0 <<  0) | (vf1 <<  8) | (vf2 << 16) | (vf3 << 24);
 }
 
-/** Fixed brw_reg. */
-fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
+fs_reg::fs_reg(struct brw_reg reg) :
+   backend_reg(reg)
 {
-   init();
-   this->file = HW_REG;
-   this->fixed_hw_reg = fixed_hw_reg;
-   this->type = fixed_hw_reg.type;
+   this->reg_offset = 0;
+   this->subreg_offset = 0;
+   this->reladdr = NULL;
+   this->stride = 1;
+   if (this->file == IMM &&
+       (this->type != BRW_REGISTER_TYPE_V &&
+        this->type != BRW_REGISTER_TYPE_UV &&
+        this->type != BRW_REGISTER_TYPE_VF)) {
+      this->stride = 0;
+   }
 }
 
 bool
 fs_reg::equals(const fs_reg &r) const
 {
-   return (file == r.file &&
-           reg == r.reg &&
+   return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 &&
            reg_offset == r.reg_offset &&
            subreg_offset == r.subreg_offset &&
-           type == r.type &&
-           negate == r.negate &&
-           abs == r.abs &&
            !reladdr && !r.reladdr &&
-           ((file != HW_REG && file != IMM) ||
-            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
-                   sizeof(fixed_hw_reg)) == 0) &&
            stride == r.stride);
 }
 
 fs_reg &
 fs_reg::set_smear(unsigned subreg)
 {
-   assert(file != HW_REG && file != IMM);
+   assert(file != ARF && file != FIXED_GRF && file != IMM);
    subreg_offset = subreg * type_sz(type);
    stride = 0;
    return *this;
@@ -466,9 +465,9 @@ fs_reg::is_contiguous() const
 unsigned
 fs_reg::component_size(unsigned width) const
 {
-   const unsigned stride = (file != HW_REG ? this->stride :
-                            fixed_hw_reg.hstride == 0 ? 0 :
-                            1 << (fixed_hw_reg.hstride - 1));
+   const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
+                            hstride == 0 ? 0 :
+                            1 << (hstride - 1));
    return MAX2(width * stride, 1) * type_sz(type);
 }
 
@@ -514,6 +513,19 @@ type_size_scalar(const struct glsl_type *type)
 }
 
 /**
+ * Returns the number of scalar components needed to store type, assuming
+ * that vectors are padded out to vec4.
+ *
+ * This has the packing rules of type_size_vec4(), but counts components
+ * similar to type_size_scalar().
+ */
+extern "C" int
+type_size_vec4_times_4(const struct glsl_type *type)
+{
+   return 4 * type_size_vec4(type);
+}
+
+/**
  * Create a MOV to read the timestamp register.
  *
  * The caller is responsible for emitting the MOV.  The return value is
@@ -529,7 +541,7 @@ fs_visitor::get_timestamp(const fs_builder &bld)
                                           0),
                              BRW_REGISTER_TYPE_UD));
 
-   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 
    /* We want to read the 3 fields we care about even if it's not enabled in
     * the dispatch.
@@ -584,7 +596,7 @@ fs_visitor::emit_shader_time_end()
 
    fs_reg start = shader_start_time;
    start.negate = true;
-   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   fs_reg diff = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    diff.set_smear(0);
 
    const fs_builder cbld = ibld.group(1, 0);
@@ -706,7 +718,7 @@ fs_inst::components_read(unsigned i) const
       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
       /* First/second FB write color. */
       if (i < 2)
-         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud;
+         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
       else
          return 1;
 
@@ -717,6 +729,7 @@ fs_inst::components_read(unsigned i) const
    case SHADER_OPCODE_TXS_LOGICAL:
    case FS_OPCODE_TXB_LOGICAL:
    case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
    case SHADER_OPCODE_TXF_UMS_LOGICAL:
    case SHADER_OPCODE_TXF_MCS_LOGICAL:
    case SHADER_OPCODE_LOD_LOGICAL:
@@ -725,13 +738,16 @@ fs_inst::components_read(unsigned i) const
       assert(src[8].file == IMM && src[9].file == IMM);
       /* Texture coordinates. */
       if (i == 0)
-         return src[8].fixed_hw_reg.dw1.ud;
+         return src[8].ud;
       /* Texture derivatives. */
       else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
-         return src[9].fixed_hw_reg.dw1.ud;
+         return src[9].ud;
       /* Texture offset. */
       else if (i == 7)
          return 2;
+      /* MCS */
+      else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
+         return 2;
       else
          return 1;
 
@@ -740,7 +756,7 @@ fs_inst::components_read(unsigned i) const
       assert(src[3].file == IMM);
       /* Surface coordinates. */
       if (i == 0)
-         return src[3].fixed_hw_reg.dw1.ud;
+         return src[3].ud;
       /* Surface operation source (ignored for reads). */
       else if (i == 1)
          return 0;
@@ -753,10 +769,10 @@ fs_inst::components_read(unsigned i) const
              src[4].file == IMM);
       /* Surface coordinates. */
       if (i == 0)
-         return src[3].fixed_hw_reg.dw1.ud;
+         return src[3].ud;
       /* Surface operation source. */
       else if (i == 1)
-         return src[4].fixed_hw_reg.dw1.ud;
+         return src[4].ud;
       else
          return 1;
 
@@ -764,10 +780,10 @@ fs_inst::components_read(unsigned i) const
    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
       assert(src[3].file == IMM &&
              src[4].file == IMM);
-      const unsigned op = src[4].fixed_hw_reg.dw1.ud;
+      const unsigned op = src[4].ud;
       /* Surface coordinates. */
       if (i == 0)
-         return src[3].fixed_hw_reg.dw1.ud;
+         return src[3].ud;
       /* Surface operation source. */
       else if (i == 1 && op == BRW_AOP_CMPWR)
          return 2;
@@ -793,6 +809,7 @@ fs_inst::regs_read(int arg) const
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
    case SHADER_OPCODE_UNTYPED_ATOMIC:
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
@@ -825,7 +842,7 @@ fs_inst::regs_read(int arg) const
       return 1;
 
    default:
-      if (is_tex() && arg == 0 && src[0].file == GRF)
+      if (is_tex() && arg == 0 && src[0].file == VGRF)
          return mlen;
       break;
    }
@@ -836,9 +853,10 @@ fs_inst::regs_read(int arg) const
    case UNIFORM:
    case IMM:
       return 1;
-   case GRF:
+   case ARF:
+   case FIXED_GRF:
+   case VGRF:
    case ATTR:
-   case HW_REG:
       return DIV_ROUND_UP(components_read(arg) *
                           src[arg].component_size(exec_size),
                           REG_SIZE);
@@ -896,6 +914,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case SHADER_OPCODE_TXD:
    case SHADER_OPCODE_TXF:
    case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
    case SHADER_OPCODE_TXF_MCS:
    case SHADER_OPCODE_TG4:
    case SHADER_OPCODE_TG4_OFFSET:
@@ -938,26 +957,24 @@ fs_reg
 fs_visitor::vgrf(const glsl_type *const type)
 {
    int reg_width = dispatch_width / 8;
-   return fs_reg(GRF, alloc.allocate(type_size_scalar(type) * reg_width),
+   return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width),
                  brw_type_for_base_type(type));
 }
 
-/** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int reg)
+fs_reg::fs_reg(enum brw_reg_file file, int nr)
 {
    init();
    this->file = file;
-   this->reg = reg;
+   this->nr = nr;
    this->type = BRW_REGISTER_TYPE_F;
    this->stride = (file == UNIFORM ? 0 : 1);
 }
 
-/** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
+fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
 {
    init();
    this->file = file;
-   this->reg = reg;
+   this->nr = nr;
    this->type = type;
    this->stride = (file == UNIFORM ? 0 : 1);
 }
@@ -1285,9 +1302,9 @@ fs_visitor::emit_sampleid_setup()
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
 
    if (key->compute_sample_id) {
-      fs_reg t1(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
+      fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
       t1.set_smear(0);
-      fs_reg t2(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+      fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
 
       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
        * 8x multisampling, subspan 0 will represent sample N (where N
@@ -1308,9 +1325,15 @@ fs_visitor::emit_sampleid_setup()
        * are sample 1 of subspan 0; the third group is sample 0 of
        * subspan 1, and finally sample 1 of subspan 1.
        */
+
+      /* SKL+ has an extra bit for the Starting Sample Pair Index to
+       * accomodate 16x MSAA.
+       */
+      unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0;
+
       abld.exec_all().group(1, 0)
           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
-               fs_reg(0xc0));
+               fs_reg(sspi_mask));
       abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5));
 
       /* This works for both SIMD8 and SIMD16 */
@@ -1362,6 +1385,57 @@ fs_visitor::emit_discard_jump()
 }
 
 void
+fs_visitor::emit_gs_thread_end()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_prog_data =
+      (struct brw_gs_prog_data *) prog_data;
+
+   if (gs_compile->control_data_header_size_bits > 0) {
+      emit_gs_control_data_bits(this->final_gs_vertex_count);
+   }
+
+   const fs_builder abld = bld.annotate("thread end");
+   fs_inst *inst;
+
+   if (gs_prog_data->static_vertex_count != -1) {
+      foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
+         if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
+             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
+             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
+             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
+            prev->eot = true;
+
+            /* Delete now dead instructions. */
+            foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
+               if (dead == prev)
+                  break;
+               dead->remove();
+            }
+            return;
+         } else if (prev->is_control_flow() || prev->has_side_effects()) {
+            break;
+         }
+      }
+      fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
+      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
+      inst->mlen = 1;
+   } else {
+      fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+      fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
+      sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+      sources[1] = this->final_gs_vertex_count;
+      abld.LOAD_PAYLOAD(payload, sources, 2, 2);
+      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+      inst->mlen = 2;
+   }
+   inst->eot = true;
+   inst->offset = 0;
+}
+
+void
 fs_visitor::assign_curb_setup()
 {
    if (dispatch_width == 8) {
@@ -1384,7 +1458,7 @@ fs_visitor::assign_curb_setup()
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       for (unsigned int i = 0; i < inst->sources; i++) {
 	 if (inst->src[i].file == UNIFORM) {
-            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
+            int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset;
             int constant_nr;
             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
                constant_nr = push_constant_loc[uniform_nr];
@@ -1400,10 +1474,11 @@ fs_visitor::assign_curb_setup()
 	    struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
 						  constant_nr / 8,
 						  constant_nr % 8);
+            brw_reg.abs = inst->src[i].abs;
+            brw_reg.negate = inst->src[i].negate;
 
             assert(inst->src[i].stride == 0);
-	    inst->src[i].file = HW_REG;
-	    inst->src[i].fixed_hw_reg = byte_offset(
+            inst->src[i] = byte_offset(
                retype(brw_reg, inst->src[i].type),
                inst->src[i].subreg_offset);
 	 }
@@ -1518,13 +1593,13 @@ fs_visitor::assign_urb_setup()
     */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       if (inst->opcode == FS_OPCODE_LINTERP) {
-	 assert(inst->src[1].file == HW_REG);
-	 inst->src[1].fixed_hw_reg.nr += urb_start;
+	 assert(inst->src[1].file == FIXED_GRF);
+         inst->src[1].nr += urb_start;
       }
 
       if (inst->opcode == FS_OPCODE_CINTERP) {
-	 assert(inst->src[0].file == HW_REG);
-	 inst->src[0].fixed_hw_reg.nr += urb_start;
+	 assert(inst->src[0].file == FIXED_GRF);
+         inst->src[0].nr += urb_start;
       }
    }
 
@@ -1533,6 +1608,30 @@ fs_visitor::assign_urb_setup()
 }
 
 void
+fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
+{
+   for (int i = 0; i < inst->sources; i++) {
+      if (inst->src[i].file == ATTR) {
+         int grf = payload.num_regs +
+                   prog_data->curb_read_length +
+                   inst->src[i].nr +
+                   inst->src[i].reg_offset;
+
+         unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size;
+         struct brw_reg reg =
+            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
+                               inst->src[i].subreg_offset),
+                   inst->exec_size * inst->src[i].stride,
+                   width, inst->src[i].stride);
+         reg.abs = inst->src[i].abs;
+         reg.negate = inst->src[i].negate;
+
+         inst->src[i] = reg;
+      }
+   }
+}
+
+void
 fs_visitor::assign_vs_urb_setup()
 {
    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
@@ -1549,24 +1648,44 @@ fs_visitor::assign_vs_urb_setup()
 
    /* Rewrite all ATTR file references to the hw grf that they land in. */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == ATTR) {
-            int grf = payload.num_regs +
-                      prog_data->curb_read_length +
-                      inst->src[i].reg +
-                      inst->src[i].reg_offset;
-
-            inst->src[i].file = HW_REG;
-            inst->src[i].fixed_hw_reg =
-               stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
-                                  inst->src[i].subreg_offset),
-                      inst->exec_size * inst->src[i].stride,
-                      inst->exec_size, inst->src[i].stride);
-         }
+      convert_attr_sources_to_hw_regs(inst);
+   }
+}
+
+void
+fs_visitor::assign_gs_urb_setup()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
+
+   first_non_payload_grf +=
+      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
+
+   const unsigned first_icp_handle = payload.num_regs -
+      (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0);
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      /* Lower URB_READ_SIMD8 opcodes into real messages. */
+      if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
+         assert(inst->src[0].file == IMM);
+         inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
+                                            inst->src[0].ud,
+                                            0), BRW_REGISTER_TYPE_UD);
+         /* for now, assume constant - we can do per-slot offsets later */
+         assert(inst->src[1].file == IMM);
+         inst->offset = inst->src[1].ud;
+         inst->src[1] = fs_reg();
+         inst->mlen = 1;
+         inst->base_mrf = -1;
       }
+
+      /* Rewrite all ATTR file references to GRFs. */
+      convert_attr_sources_to_hw_regs(inst);
    }
 }
 
+
 /**
  * Split large virtual GRFs into separate components if we can.
  *
@@ -1609,30 +1728,30 @@ fs_visitor::split_virtual_grfs()
 
    /* Mark all used registers as fully splittable */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF) {
-         int reg = vgrf_to_reg[inst->dst.reg];
-         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
+      if (inst->dst.file == VGRF) {
+         int reg = vgrf_to_reg[inst->dst.nr];
+         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
             split_points[reg + j] = true;
       }
 
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF) {
-            int reg = vgrf_to_reg[inst->src[i].reg];
-            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
+         if (inst->src[i].file == VGRF) {
+            int reg = vgrf_to_reg[inst->src[i].nr];
+            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
                split_points[reg + j] = true;
          }
       }
    }
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF) {
-         int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
+      if (inst->dst.file == VGRF) {
+         int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
          for (int j = 1; j < inst->regs_written; j++)
             split_points[reg + j] = false;
       }
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF) {
-            int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
+         if (inst->src[i].file == VGRF) {
+            int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
             for (int j = 1; j < inst->regs_read(i); j++)
                split_points[reg + j] = false;
          }
@@ -1678,16 +1797,16 @@ fs_visitor::split_virtual_grfs()
    assert(reg == reg_count);
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF) {
-         reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
-         inst->dst.reg = new_virtual_grf[reg];
+      if (inst->dst.file == VGRF) {
+         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
+         inst->dst.nr = new_virtual_grf[reg];
          inst->dst.reg_offset = new_reg_offset[reg];
          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
       }
       for (int i = 0; i < inst->sources; i++) {
-	 if (inst->src[i].file == GRF) {
-            reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
-            inst->src[i].reg = new_virtual_grf[reg];
+	 if (inst->src[i].file == VGRF) {
+            reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
+            inst->src[i].nr = new_virtual_grf[reg];
             inst->src[i].reg_offset = new_reg_offset[reg];
             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
          }
@@ -1714,12 +1833,12 @@ fs_visitor::compact_virtual_grfs()
 
    /* Mark which virtual GRFs are used. */
    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF)
-         remap_table[inst->dst.reg] = 0;
+      if (inst->dst.file == VGRF)
+         remap_table[inst->dst.nr] = 0;
 
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF)
-            remap_table[inst->src[i].reg] = 0;
+         if (inst->src[i].file == VGRF)
+            remap_table[inst->src[i].nr] = 0;
       }
    }
 
@@ -1743,12 +1862,12 @@ fs_visitor::compact_virtual_grfs()
 
    /* Patch all the instructions to use the newly renumbered registers */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF)
-         inst->dst.reg = remap_table[inst->dst.reg];
+      if (inst->dst.file == VGRF)
+         inst->dst.nr = remap_table[inst->dst.nr];
 
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF)
-            inst->src[i].reg = remap_table[inst->src[i].reg];
+         if (inst->src[i].file == VGRF)
+            inst->src[i].nr = remap_table[inst->src[i].nr];
       }
    }
 
@@ -1757,9 +1876,9 @@ fs_visitor::compact_virtual_grfs()
     * think some random VGRF is delta_xy.
     */
    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
-      if (delta_xy[i].file == GRF) {
-         if (remap_table[delta_xy[i].reg] != -1) {
-            delta_xy[i].reg = remap_table[delta_xy[i].reg];
+      if (delta_xy[i].file == VGRF) {
+         if (remap_table[delta_xy[i].nr] != -1) {
+            delta_xy[i].nr = remap_table[delta_xy[i].nr];
          } else {
             delta_xy[i].file = BAD_FILE;
          }
@@ -1811,7 +1930,7 @@ fs_visitor::assign_constant_locations()
             continue;
 
          if (inst->src[i].reladdr) {
-            int uniform = inst->src[i].reg;
+            int uniform = inst->src[i].nr;
 
             /* If this array isn't already present in the pull constant buffer,
              * add it.
@@ -1823,7 +1942,7 @@ fs_visitor::assign_constant_locations()
             }
          } else {
             /* Mark the the one accessed uniform as live */
-            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
+            int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
             if (constant_nr >= 0 && constant_nr < (int) uniforms)
                is_live[constant_nr] = true;
          }
@@ -1899,7 +2018,7 @@ fs_visitor::demote_pull_constants()
 	    continue;
 
          int pull_index;
-         unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
+         unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
          if (location >= uniforms) /* Out of bounds access */
             pull_index = -1;
          else
@@ -1910,7 +2029,7 @@ fs_visitor::demote_pull_constants()
 
          /* Set up the annotation tracking for new generated instructions. */
          const fs_builder ibld(this, block, inst);
-         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
+         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
          fs_reg dst = vgrf(glsl_type::float_type);
 
          assert(inst->src[i].stride == 0);
@@ -1918,7 +2037,7 @@ fs_visitor::demote_pull_constants()
          /* Generate a pull load into dst. */
          if (inst->src[i].reladdr) {
             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
-                                       surf_index,
+                                       fs_reg(index),
                                        *inst->src[i].reladdr,
                                        pull_index);
             inst->src[i].reladdr = NULL;
@@ -1927,13 +2046,14 @@ fs_visitor::demote_pull_constants()
             const fs_builder ubld = ibld.exec_all().group(8, 0);
             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                      dst, surf_index, offset);
+                      dst, fs_reg(index), offset);
             inst->src[i].set_smear(pull_index & 3);
          }
+         brw_mark_surface_used(prog_data, index);
 
          /* Rewrite the instruction to use the temporary VGRF. */
-         inst->src[i].file = GRF;
-         inst->src[i].reg = dst.reg;
+         inst->src[i].file = VGRF;
+         inst->src[i].nr = dst.nr;
          inst->src[i].reg_offset = 0;
       }
    }
@@ -1955,8 +2075,7 @@ fs_visitor::opt_algebraic()
             if (inst->dst.type != inst->src[0].type)
                assert(!"unimplemented: saturate mixed types");
 
-            if (brw_saturate_immediate(inst->dst.type,
-                                       &inst->src[0].fixed_hw_reg)) {
+            if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
                inst->saturate = false;
                progress = true;
             }
@@ -1996,7 +2115,7 @@ fs_visitor::opt_algebraic()
          if (inst->src[0].file == IMM) {
             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
             inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
+            inst->src[0].f *= inst->src[1].f;
             inst->src[1] = reg_undef;
             progress = true;
             break;
@@ -2017,7 +2136,7 @@ fs_visitor::opt_algebraic()
          if (inst->src[0].file == IMM) {
             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
             inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
+            inst->src[0].f += inst->src[1].f;
             inst->src[1] = reg_undef;
             progress = true;
             break;
@@ -2066,7 +2185,7 @@ fs_visitor::opt_algebraic()
             case BRW_CONDITIONAL_L:
                switch (inst->src[1].type) {
                case BRW_REGISTER_TYPE_F:
-                  if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
+                  if (inst->src[1].f >= 1.0f) {
                      inst->opcode = BRW_OPCODE_MOV;
                      inst->src[1] = reg_undef;
                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
@@ -2081,7 +2200,7 @@ fs_visitor::opt_algebraic()
             case BRW_CONDITIONAL_G:
                switch (inst->src[1].type) {
                case BRW_REGISTER_TYPE_F:
-                  if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
+                  if (inst->src[1].f <= 0.0f) {
                      inst->opcode = BRW_OPCODE_MOV;
                      inst->src[1] = reg_undef;
                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
@@ -2118,7 +2237,7 @@ fs_visitor::opt_algebraic()
             progress = true;
          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
             inst->opcode = BRW_OPCODE_ADD;
-            inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
+            inst->src[1].f *= inst->src[2].f;
             inst->src[2] = reg_undef;
             progress = true;
          }
@@ -2143,7 +2262,7 @@ fs_visitor::opt_algebraic()
          } else if (inst->src[1].file == IMM) {
             inst->opcode = BRW_OPCODE_MOV;
             inst->src[0] = component(inst->src[0],
-                                     inst->src[1].fixed_hw_reg.dw1.ud);
+                                     inst->src[1].ud);
             inst->sources = 1;
             inst->force_writemask_all = true;
             progress = true;
@@ -2344,31 +2463,31 @@ fs_visitor::opt_register_renaming()
 
       /* Rewrite instruction sources. */
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF &&
-             remap[inst->src[i].reg] != -1 &&
-             remap[inst->src[i].reg] != inst->src[i].reg) {
-            inst->src[i].reg = remap[inst->src[i].reg];
+         if (inst->src[i].file == VGRF &&
+             remap[inst->src[i].nr] != -1 &&
+             remap[inst->src[i].nr] != inst->src[i].nr) {
+            inst->src[i].nr = remap[inst->src[i].nr];
             progress = true;
          }
       }
 
-      const int dst = inst->dst.reg;
+      const int dst = inst->dst.nr;
 
       if (depth == 0 &&
-          inst->dst.file == GRF &&
-          alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
+          inst->dst.file == VGRF &&
+          alloc.sizes[inst->dst.nr] == inst->exec_size / 8 &&
           !inst->is_partial_write()) {
          if (remap[dst] == -1) {
             remap[dst] = dst;
          } else {
             remap[dst] = alloc.allocate(inst->exec_size / 8);
-            inst->dst.reg = remap[dst];
+            inst->dst.nr = remap[dst];
             progress = true;
          }
-      } else if (inst->dst.file == GRF &&
+      } else if (inst->dst.file == VGRF &&
                  remap[dst] != -1 &&
                  remap[dst] != dst) {
-         inst->dst.reg = remap[dst];
+         inst->dst.nr = remap[dst];
          progress = true;
       }
    }
@@ -2377,8 +2496,8 @@ fs_visitor::opt_register_renaming()
       invalidate_live_intervals();
 
       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
-         if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
-            delta_xy[i].reg = remap[delta_xy[i].reg];
+         if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) {
+            delta_xy[i].nr = remap[delta_xy[i].nr];
          }
       }
    }
@@ -2445,7 +2564,7 @@ fs_visitor::compute_to_mrf()
 
       if (inst->opcode != BRW_OPCODE_MOV ||
 	  inst->is_partial_write() ||
-	  inst->dst.file != MRF || inst->src[0].file != GRF ||
+	  inst->dst.file != MRF || inst->src[0].file != VGRF ||
 	  inst->dst.type != inst->src[0].type ||
 	  inst->src[0].abs || inst->src[0].negate ||
           !inst->src[0].is_contiguous() ||
@@ -2455,9 +2574,9 @@ fs_visitor::compute_to_mrf()
       /* Work out which hardware MRF registers are written by this
        * instruction.
        */
-      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
+      int mrf_low = inst->dst.nr & ~BRW_MRF_COMPR4;
       int mrf_high;
-      if (inst->dst.reg & BRW_MRF_COMPR4) {
+      if (inst->dst.nr & BRW_MRF_COMPR4) {
 	 mrf_high = mrf_low + 4;
       } else if (inst->exec_size == 16) {
 	 mrf_high = mrf_low + 1;
@@ -2468,15 +2587,15 @@ fs_visitor::compute_to_mrf()
       /* Can't compute-to-MRF this GRF if someone else was going to
        * read it later.
        */
-      if (this->virtual_grf_end[inst->src[0].reg] > ip)
+      if (this->virtual_grf_end[inst->src[0].nr] > ip)
 	 continue;
 
       /* Found a move of a GRF to a MRF.  Let's see if we can go
        * rewrite the thing that made this GRF to write into the MRF.
        */
       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
-	 if (scan_inst->dst.file == GRF &&
-	     scan_inst->dst.reg == inst->src[0].reg) {
+	 if (scan_inst->dst.file == VGRF &&
+            scan_inst->dst.nr == inst->src[0].nr) {
 	    /* Found the last thing to write our reg we want to turn
 	     * into a compute-to-MRF.
 	     */
@@ -2511,7 +2630,7 @@ fs_visitor::compute_to_mrf()
 	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
 	       /* Found the creator of our MRF's source value. */
 	       scan_inst->dst.file = MRF;
-	       scan_inst->dst.reg = inst->dst.reg;
+               scan_inst->dst.nr = inst->dst.nr;
 	       scan_inst->saturate |= inst->saturate;
 	       inst->remove(block);
 	       progress = true;
@@ -2531,8 +2650,8 @@ fs_visitor::compute_to_mrf()
 	  */
 	 bool interfered = false;
 	 for (int i = 0; i < scan_inst->sources; i++) {
-	    if (scan_inst->src[i].file == GRF &&
-		scan_inst->src[i].reg == inst->src[0].reg &&
+	    if (scan_inst->src[i].file == VGRF &&
+                scan_inst->src[i].nr == inst->src[0].nr &&
 		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
 	       interfered = true;
 	    }
@@ -2544,10 +2663,10 @@ fs_visitor::compute_to_mrf()
 	    /* If somebody else writes our MRF here, we can't
 	     * compute-to-MRF before that.
 	     */
-	    int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
+            int scan_mrf_low = scan_inst->dst.nr & ~BRW_MRF_COMPR4;
 	    int scan_mrf_high;
 
-	    if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
+            if (scan_inst->dst.nr & BRW_MRF_COMPR4) {
 	       scan_mrf_high = scan_mrf_low + 4;
 	    } else if (scan_inst->exec_size == 16) {
 	       scan_mrf_high = scan_mrf_low + 1;
@@ -2690,8 +2809,8 @@ fs_visitor::emit_repclear_shader()
 
    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
    if (uniforms == 1) {
-      assert(mov->src[0].file == HW_REG);
-      mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
+      assert(mov->src[0].file == FIXED_GRF);
+      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
    }
 }
 
@@ -2718,7 +2837,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       if (inst->opcode == BRW_OPCODE_MOV &&
 	  inst->dst.file == MRF) {
-	 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
+         fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
 	 if (prev_inst && inst->equals(prev_inst)) {
 	    inst->remove(block);
 	    progress = true;
@@ -2728,7 +2847,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       /* Clear out the last-write records for MRFs that were overwritten. */
       if (inst->dst.file == MRF) {
-	 last_mrf_move[inst->dst.reg] = NULL;
+         last_mrf_move[inst->dst.nr] = NULL;
       }
 
       if (inst->mlen > 0 && inst->base_mrf != -1) {
@@ -2741,10 +2860,10 @@ fs_visitor::remove_duplicate_mrf_writes()
       }
 
       /* Clear out any MRF move records whose sources got overwritten. */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
 	 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
 	    if (last_mrf_move[i] &&
-		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
+                last_mrf_move[i]->src[0].nr == inst->dst.nr) {
 	       last_mrf_move[i] = NULL;
 	    }
 	 }
@@ -2752,9 +2871,9 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       if (inst->opcode == BRW_OPCODE_MOV &&
 	  inst->dst.file == MRF &&
-	  inst->src[0].file == GRF &&
+	  inst->src[0].file == VGRF &&
 	  !inst->is_partial_write()) {
-	 last_mrf_move[inst->dst.reg] = inst;
+         last_mrf_move[inst->dst.nr] = inst;
       }
    }
 
@@ -2770,11 +2889,8 @@ clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
    /* Clear the flag for registers that actually got read (as expected). */
    for (int i = 0; i < inst->sources; i++) {
       int grf;
-      if (inst->src[i].file == GRF) {
-         grf = inst->src[i].reg;
-      } else if (inst->src[i].file == HW_REG &&
-                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
-         grf = inst->src[i].fixed_hw_reg.nr;
+      if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
+         grf = inst->src[i].nr;
       } else {
          continue;
       }
@@ -2809,7 +2925,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                         fs_inst *inst)
 {
    int write_len = inst->regs_written;
-   int first_write_grf = inst->dst.reg;
+   int first_write_grf = inst->dst.nr;
    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
    assert(write_len < (int)sizeof(needs_dep) - 1);
 
@@ -2840,9 +2956,9 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
        * instruction but a MOV that might have left us an outstanding
        * dependency has more latency than a MOV.
        */
-      if (scan_inst->dst.file == GRF) {
+      if (scan_inst->dst.file == VGRF) {
          for (int i = 0; i < scan_inst->regs_written; i++) {
-            int reg = scan_inst->dst.reg + i;
+            int reg = scan_inst->dst.nr + i;
 
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
@@ -2880,7 +2996,7 @@ void
 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
 {
    int write_len = inst->regs_written;
-   int first_write_grf = inst->dst.reg;
+   int first_write_grf = inst->dst.nr;
    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
    assert(write_len < (int)sizeof(needs_dep) - 1);
 
@@ -2906,13 +3022,13 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       /* We insert our reads as late as possible since they're reading the
        * result of a SEND, which has massive latency.
        */
-      if (scan_inst->dst.file == GRF &&
-          scan_inst->dst.reg >= first_write_grf &&
-          scan_inst->dst.reg < first_write_grf + write_len &&
-          needs_dep[scan_inst->dst.reg - first_write_grf]) {
+      if (scan_inst->dst.file == VGRF &&
+          scan_inst->dst.nr >= first_write_grf &&
+          scan_inst->dst.nr < first_write_grf + write_len &&
+          needs_dep[scan_inst->dst.nr - first_write_grf]) {
          DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
-                         scan_inst->dst.reg);
-         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
+                         scan_inst->dst.nr);
+         needs_dep[scan_inst->dst.nr - first_write_grf] = false;
       }
 
       /* Continue the loop only if we haven't resolved all the dependencies */
@@ -2939,7 +3055,7 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
     */
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->mlen != 0 && inst->dst.file == GRF) {
+      if (inst->mlen != 0 && inst->dst.file == VGRF) {
          insert_gen4_pre_send_dependency_workarounds(block, inst);
          insert_gen4_post_send_dependency_workarounds(block, inst);
          progress = true;
@@ -2980,18 +3096,18 @@ fs_visitor::lower_uniform_pull_constant_loads()
          fs_reg const_offset_reg = inst->src[1];
          assert(const_offset_reg.file == IMM &&
                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
-         const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
+         const_offset_reg.ud /= 4;
 
          fs_reg payload, offset;
          if (devinfo->gen >= 9) {
             /* We have to use a message header on Skylake to get SIMD4x2
              * mode.  Reserve space for the register.
             */
-            offset = payload = fs_reg(GRF, alloc.allocate(2));
+            offset = payload = fs_reg(VGRF, alloc.allocate(2));
             offset.reg_offset++;
             inst->mlen = 2;
          } else {
-            offset = payload = fs_reg(GRF, alloc.allocate(1));
+            offset = payload = fs_reg(VGRF, alloc.allocate(1));
             inst->mlen = 1;
          }
 
@@ -3038,13 +3154,13 @@ fs_visitor::lower_load_payload()
       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
          continue;
 
-      assert(inst->dst.file == MRF || inst->dst.file == GRF);
+      assert(inst->dst.file == MRF || inst->dst.file == VGRF);
       assert(inst->saturate == false);
       fs_reg dst = inst->dst;
 
       /* Get rid of COMPR4.  We'll add it back in if we need it */
       if (dst.file == MRF)
-         dst.reg = dst.reg & ~BRW_MRF_COMPR4;
+         dst.nr = dst.nr & ~BRW_MRF_COMPR4;
 
       const fs_builder ibld(this, block, inst);
       const fs_builder hbld = ibld.exec_all().group(8, 0);
@@ -3058,7 +3174,7 @@ fs_visitor::lower_load_payload()
          dst = offset(dst, hbld, 1);
       }
 
-      if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
+      if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
           inst->exec_size > 8) {
          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
           * a straightforward copy.  Instead, the result of the
@@ -3082,18 +3198,18 @@ fs_visitor::lower_load_payload()
             if (inst->src[i].file != BAD_FILE) {
                if (devinfo->has_compr4) {
                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
-                  compr4_dst.reg |= BRW_MRF_COMPR4;
+                  compr4_dst.nr |= BRW_MRF_COMPR4;
                   ibld.MOV(compr4_dst, inst->src[i]);
                } else {
                   /* Platform doesn't have COMPR4.  We have to fake it */
                   fs_reg mov_dst = retype(dst, inst->src[i].type);
                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
-                  mov_dst.reg += 4;
+                  mov_dst.nr += 4;
                   ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
                }
             }
 
-            dst.reg++;
+            dst.nr++;
          }
 
          /* The loop above only ever incremented us through the first set
@@ -3101,7 +3217,7 @@ fs_visitor::lower_load_payload()
           * actually wrote to the first 8 registers, so we need to take
           * that into account now.
           */
-         dst.reg += 4;
+         dst.nr += 4;
 
          /* The COMPR4 code took care of the first 4 sources.  We'll let
           * the regular path handle any remaining sources.  Yes, we are
@@ -3149,7 +3265,7 @@ fs_visitor::lower_integer_multiplication()
             continue;
 
          if (inst->src[1].file == IMM &&
-             inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+             inst->src[1].ud < (1 << 16)) {
             /* The MUL instruction isn't commutative. On Gen <= 6, only the low
              * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
              * src1 are used.
@@ -3158,7 +3274,7 @@ fs_visitor::lower_integer_multiplication()
              * single MUL instruction with that value in the proper location.
              */
             if (devinfo->gen < 7) {
-               fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+               fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
                           inst->dst.type);
                ibld.MOV(imm, inst->src[1]);
                ibld.MUL(inst->dst, imm, inst->src[0]);
@@ -3213,11 +3329,11 @@ fs_visitor::lower_integer_multiplication()
 
             fs_reg orig_dst = inst->dst;
             if (orig_dst.is_null() || orig_dst.file == MRF) {
-               inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+               inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                                   inst->dst.type);
             }
             fs_reg low = inst->dst;
-            fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
+            fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
                         inst->dst.type);
 
             if (devinfo->gen >= 7) {
@@ -3225,8 +3341,8 @@ fs_visitor::lower_integer_multiplication()
                fs_reg src1_1_w = inst->src[1];
 
                if (inst->src[1].file == IMM) {
-                  src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
-                  src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
+                  src1_0_w.ud &= 0xffff;
+                  src1_1_w.ud >>= 16;
                } else {
                   src1_0_w.type = BRW_REGISTER_TYPE_UW;
                   if (src1_0_w.stride != 0) {
@@ -3381,7 +3497,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
    fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
    const unsigned components =
-      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud;
+      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
 
    /* We can potentially have a message length of up to 15, so we have to set
     * base_mrf to either 0 or 1 in order to fit in m0..m15.
@@ -3411,7 +3527,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    }
 
    if (payload.aa_dest_stencil_reg) {
-      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1));
+      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
       bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
          .MOV(sources[length],
               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
@@ -3419,7 +3535,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    }
 
    if (prog_data->uses_omask) {
-      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1),
+      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
                                BRW_REGISTER_TYPE_UD);
 
       /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
@@ -3485,9 +3601,9 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    fs_inst *load;
    if (devinfo->gen >= 7) {
       /* Send from the GRF */
-      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
+      fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
       load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
-      payload.reg = bld.shader->alloc.allocate(load->regs_written);
+      payload.nr = bld.shader->alloc.allocate(load->regs_written);
       load->dst = payload;
 
       inst->src[0] = payload;
@@ -3502,7 +3618,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
        * will do this for us if we just give it a COMPR4 destination.
        */
       if (devinfo->gen < 6 && bld.dispatch_width() == 16)
-         load->dst.reg |= BRW_MRF_COMPR4;
+         load->dst.nr |= BRW_MRF_COMPR4;
 
       inst->resize_sources(0);
       inst->base_mrf = 1;
@@ -3612,8 +3728,8 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
    inst->src[0] = reg_undef;
    inst->src[1] = sampler;
    inst->resize_sources(2);
-   inst->base_mrf = msg_begin.reg;
-   inst->mlen = msg_end.reg - msg_begin.reg;
+   inst->base_mrf = msg_begin.nr;
+   inst->mlen = msg_end.nr - msg_begin.nr;
    inst->header_size = 1;
 }
 
@@ -3637,7 +3753,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
        * go headerless.
        */
       header_size = 1;
-      message.reg--;
+      message.nr--;
    }
 
    for (unsigned i = 0; i < coord_components; i++) {
@@ -3707,8 +3823,8 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
    inst->src[0] = reg_undef;
    inst->src[1] = sampler;
    inst->resize_sources(2);
-   inst->base_mrf = message.reg;
-   inst->mlen = msg_end.reg - message.reg;
+   inst->base_mrf = message.nr;
+   inst->mlen = msg_end.nr - message.nr;
    inst->header_size = header_size;
 
    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
@@ -3721,7 +3837,7 @@ is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
    if (devinfo->gen < 8 && !devinfo->is_haswell)
       return false;
 
-   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
+   return sampler.file != IMM || sampler.ud >= 16;
 }
 
 static void
@@ -3844,17 +3960,31 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
       coordinate_done = true;
       break;
    case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
    case SHADER_OPCODE_TXF_UMS:
    case SHADER_OPCODE_TXF_MCS:
-      if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
+      if (op == SHADER_OPCODE_TXF_UMS ||
+          op == SHADER_OPCODE_TXF_CMS ||
+          op == SHADER_OPCODE_TXF_CMS_W) {
          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
          length++;
       }
 
-      if (op == SHADER_OPCODE_TXF_CMS) {
+      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
          /* Data from the multisample control surface. */
          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
          length++;
+
+         /* On Gen9+ we'll use ld2dms_w instead which has two registers for
+          * the MCS data.
+          */
+         if (op == SHADER_OPCODE_TXF_CMS_W) {
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
+                    mcs.file == IMM ?
+                    mcs :
+                    offset(mcs, bld, 1));
+            length++;
+         }
       }
 
       /* There is no offsetting for this message; just copy in the integer
@@ -3912,7 +4042,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
    else
       mlen = length * reg_width;
 
-   const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen),
+   const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
                                      BRW_REGISTER_TYPE_F);
    bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
 
@@ -3942,8 +4072,8 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    const fs_reg &sampler = inst->src[6];
    const fs_reg &offset_value = inst->src[7];
    assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
-   const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
-   const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;
+   const unsigned coord_components = inst->src[8].ud;
+   const unsigned grad_components = inst->src[9].ud;
 
    if (devinfo->gen >= 7) {
       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
@@ -4068,6 +4198,10 @@ fs_visitor::lower_logical_sends()
          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
          break;
 
+      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
+         break;
+
       case SHADER_OPCODE_TXF_UMS_LOGICAL:
          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
          break;
@@ -4260,6 +4394,21 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
       else
          return inst->exec_size;
 
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
+      /* This opcode can take up to 6 arguments which means that in some
+       * circumstances it can end up with a message that is too long in SIMD16
+       * mode.
+       */
+      const unsigned coord_components = inst->src[8].ud;
+      /* First three arguments are the sample index and the two arguments for
+       * the MCS data.
+       */
+      if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
+         return 8;
+      else
+         return inst->exec_size;
+   }
+
    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
@@ -4473,51 +4622,48 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    }
 
    switch (inst->dst.file) {
-   case GRF:
-      fprintf(file, "vgrf%d", inst->dst.reg);
-      if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
+   case VGRF:
+      fprintf(file, "vgrf%d", inst->dst.nr);
+      if (alloc.sizes[inst->dst.nr] != inst->regs_written ||
           inst->dst.subreg_offset)
          fprintf(file, "+%d.%d",
                  inst->dst.reg_offset, inst->dst.subreg_offset);
       break;
+   case FIXED_GRF:
+      fprintf(file, "g%d", inst->dst.nr);
+      break;
    case MRF:
-      fprintf(file, "m%d", inst->dst.reg);
+      fprintf(file, "m%d", inst->dst.nr);
       break;
    case BAD_FILE:
       fprintf(file, "(null)");
       break;
    case UNIFORM:
-      fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
+      fprintf(file, "***u%d***", inst->dst.nr + inst->dst.reg_offset);
       break;
    case ATTR:
-      fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
+      fprintf(file, "***attr%d***", inst->dst.nr + inst->dst.reg_offset);
       break;
-   case HW_REG:
-      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
-         switch (inst->dst.fixed_hw_reg.nr) {
-         case BRW_ARF_NULL:
-            fprintf(file, "null");
-            break;
-         case BRW_ARF_ADDRESS:
-            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
-            break;
-         case BRW_ARF_ACCUMULATOR:
-            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
-            break;
-         case BRW_ARF_FLAG:
-            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
-                             inst->dst.fixed_hw_reg.subnr);
-            break;
-         default:
-            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
-                               inst->dst.fixed_hw_reg.subnr);
-            break;
-         }
-      } else {
-         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
+   case ARF:
+      switch (inst->dst.nr) {
+      case BRW_ARF_NULL:
+         fprintf(file, "null");
+         break;
+      case BRW_ARF_ADDRESS:
+         fprintf(file, "a0.%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_ACCUMULATOR:
+         fprintf(file, "acc%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_FLAG:
+         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
+      default:
+         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
       }
-      if (inst->dst.fixed_hw_reg.subnr)
-         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
+      if (inst->dst.subnr)
+         fprintf(file, "+%d", inst->dst.subnr);
       break;
    case IMM:
       unreachable("not reached");
@@ -4530,21 +4676,24 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       if (inst->src[i].abs)
          fprintf(file, "|");
       switch (inst->src[i].file) {
-      case GRF:
-         fprintf(file, "vgrf%d", inst->src[i].reg);
-         if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
+      case VGRF:
+         fprintf(file, "vgrf%d", inst->src[i].nr);
+         if (alloc.sizes[inst->src[i].nr] != (unsigned)inst->regs_read(i) ||
              inst->src[i].subreg_offset)
             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                     inst->src[i].subreg_offset);
          break;
+      case FIXED_GRF:
+         fprintf(file, "g%d", inst->src[i].nr);
+         break;
       case MRF:
-         fprintf(file, "***m%d***", inst->src[i].reg);
+         fprintf(file, "***m%d***", inst->src[i].nr);
          break;
       case ATTR:
-         fprintf(file, "attr%d+%d", inst->src[i].reg, inst->src[i].reg_offset);
+         fprintf(file, "attr%d+%d", inst->src[i].nr, inst->src[i].reg_offset);
          break;
       case UNIFORM:
-         fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
+         fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
          if (inst->src[i].reladdr) {
             fprintf(file, "+reladdr");
          } else if (inst->src[i].subreg_offset) {
@@ -4558,60 +4707,48 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       case IMM:
          switch (inst->src[i].type) {
          case BRW_REGISTER_TYPE_F:
-            fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
+            fprintf(file, "%ff", inst->src[i].f);
             break;
          case BRW_REGISTER_TYPE_W:
          case BRW_REGISTER_TYPE_D:
-            fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
+            fprintf(file, "%dd", inst->src[i].d);
             break;
          case BRW_REGISTER_TYPE_UW:
          case BRW_REGISTER_TYPE_UD:
-            fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
+            fprintf(file, "%uu", inst->src[i].ud);
             break;
          case BRW_REGISTER_TYPE_VF:
             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
+                    brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
             break;
          default:
             fprintf(file, "???");
             break;
          }
          break;
-      case HW_REG:
-         if (inst->src[i].fixed_hw_reg.negate)
-            fprintf(file, "-");
-         if (inst->src[i].fixed_hw_reg.abs)
-            fprintf(file, "|");
-         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
-            switch (inst->src[i].fixed_hw_reg.nr) {
-            case BRW_ARF_NULL:
-               fprintf(file, "null");
-               break;
-            case BRW_ARF_ADDRESS:
-               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
-               break;
-            case BRW_ARF_ACCUMULATOR:
-               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
-               break;
-            case BRW_ARF_FLAG:
-               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
-                                inst->src[i].fixed_hw_reg.subnr);
-               break;
-            default:
-               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
-                                  inst->src[i].fixed_hw_reg.subnr);
-               break;
-            }
-         } else {
-            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
+      case ARF:
+         switch (inst->src[i].nr) {
+         case BRW_ARF_NULL:
+            fprintf(file, "null");
+            break;
+         case BRW_ARF_ADDRESS:
+            fprintf(file, "a0.%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_ACCUMULATOR:
+            fprintf(file, "acc%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_FLAG:
+            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
+         default:
+            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
          }
-         if (inst->src[i].fixed_hw_reg.subnr)
-            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
-         if (inst->src[i].fixed_hw_reg.abs)
-            fprintf(file, "|");
+         if (inst->src[i].subnr)
+            fprintf(file, "+%d", inst->src[i].subnr);
          break;
       }
       if (inst->src[i].abs)
@@ -4627,6 +4764,9 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
 
    fprintf(file, " ");
 
+   if (inst->force_writemask_all)
+      fprintf(file, "NoMask ");
+
    if (dispatch_width == 16 && inst->exec_size == 8) {
       if (inst->force_sechalf)
          fprintf(file, "2ndhalf ");
@@ -4779,6 +4919,45 @@ fs_visitor::setup_vs_payload()
  *
  */
 void
+fs_visitor::setup_gs_payload()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_prog_data =
+      (struct brw_gs_prog_data *) prog_data;
+   struct brw_vue_prog_data *vue_prog_data =
+      (struct brw_vue_prog_data *) prog_data;
+
+   /* R0: thread header, R1: output URB handles */
+   payload.num_regs = 2;
+
+   if (gs_prog_data->include_primitive_id) {
+      /* R2: Primitive ID 0..7 */
+      payload.num_regs++;
+   }
+
+   /* Use a maximum of 32 registers for push-model inputs. */
+   const unsigned max_push_components = 32;
+
+   /* If pushing our inputs would take too many registers, reduce the URB read
+    * length (which is in HWords, or 8 registers), and resort to pulling.
+    *
+    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
+    * have to multiply by VerticesIn to obtain the total storage requirement.
+    */
+   if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
+       max_push_components) {
+      gs_prog_data->base.include_vue_handles = true;
+
+      /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
+      payload.num_regs += nir->info.gs.vertices_in;
+
+      vue_prog_data->urb_read_length =
+         ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
+   }
+}
+
+void
 fs_visitor::setup_cs_payload()
 {
    assert(devinfo->gen >= 7);
@@ -4925,7 +5104,7 @@ fs_visitor::fixup_3src_null_dest()
 {
    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
       if (inst->is_3src() && inst->dst.is_null()) {
-         inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+         inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                             inst->dst.type);
       }
    }
@@ -5035,6 +5214,55 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
 }
 
 bool
+fs_visitor::run_gs()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   setup_gs_payload();
+
+   this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
+
+   if (gs_compile->control_data_header_size_bits > 0) {
+      /* Create a VGRF to store accumulated control data bits. */
+      this->control_data_bits = vgrf(glsl_type::uint_type);
+
+      /* If we're outputting more than 32 control data bits, then EmitVertex()
+       * will set control_data_bits to 0 after emitting the first vertex.
+       * Otherwise, we need to initialize it to 0 here.
+       */
+      if (gs_compile->control_data_header_size_bits <= 32) {
+         const fs_builder abld = bld.annotate("initialize control data bits");
+         abld.MOV(this->control_data_bits, fs_reg(0u));
+      }
+   }
+
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   emit_nir_code();
+
+   emit_gs_thread_end();
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   if (failed)
+      return false;
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_gs_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers();
+
+   return !failed;
+}
+
+bool
 fs_visitor::run_fs(bool do_rep_send)
 {
    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 8058b344b7a..f40e58b8ca0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -70,9 +70,10 @@ offset(fs_reg reg, const brw::fs_builder& bld, unsigned delta)
    switch (reg.file) {
    case BAD_FILE:
       break;
-   case GRF:
+   case ARF:
+   case FIXED_GRF:
    case MRF:
-   case HW_REG:
+   case VGRF:
    case ATTR:
       return byte_offset(reg,
                          delta * reg.component_size(bld.dispatch_width()));
@@ -105,7 +106,8 @@ public:
               void *mem_ctx,
               struct brw_gs_compile *gs_compile,
               struct brw_gs_prog_data *prog_data,
-              const nir_shader *shader);
+              const nir_shader *shader,
+              int shader_time_index);
    void init();
    ~fs_visitor();
 
@@ -131,18 +133,22 @@ public:
 
    bool run_fs(bool do_rep_send);
    bool run_vs(gl_clip_plane *clip_planes);
+   bool run_gs();
    bool run_cs();
    void optimize();
    void allocate_registers();
    void setup_payload_gen4();
    void setup_payload_gen6();
    void setup_vs_payload();
+   void setup_gs_payload();
    void setup_cs_payload();
    void fixup_3src_null_dest();
    void assign_curb_setup();
    void calculate_urb_setup();
    void assign_urb_setup();
+   void convert_attr_sources_to_hw_regs(fs_inst *inst);
    void assign_vs_urb_setup();
+   void assign_gs_urb_setup();
    bool assign_regs(bool allow_spilling);
    void assign_regs_trivial();
    void calculate_payload_ranges(int payload_node_count,
@@ -258,6 +264,14 @@ public:
                             nir_load_const_instr *instr);
    void nir_emit_undef(const brw::fs_builder &bld,
                        nir_ssa_undef_instr *instr);
+   void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_gs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_fs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_cs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
    void nir_emit_intrinsic(const brw::fs_builder &bld,
                            nir_intrinsic_instr *instr);
    void nir_emit_ssbo_atomic(const brw::fs_builder &bld,
@@ -280,7 +294,16 @@ public:
                                  fs_reg color1, fs_reg color2,
                                  fs_reg src0_alpha, unsigned components);
    void emit_fb_writes();
-   void emit_urb_writes();
+   void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
+   void set_gs_stream_control_data_bits(const fs_reg &vertex_count,
+                                        unsigned stream_id);
+   void emit_gs_control_data_bits(const fs_reg &vertex_count);
+   void emit_gs_end_primitive(const nir_src &vertex_count_nir_src);
+   void emit_gs_vertex(const nir_src &vertex_count_nir_src,
+                       unsigned stream_id);
+   void emit_gs_thread_end();
+   void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
+                           unsigned offset, unsigned num_components);
    void emit_cs_terminate();
    fs_reg *emit_cs_local_invocation_id_setup();
    fs_reg *emit_cs_work_group_id_setup();
@@ -388,6 +411,8 @@ public:
    fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
    fs_reg shader_start_time;
    fs_reg userplane[MAX_CLIP_PLANES];
+   fs_reg final_gs_vertex_count;
+   fs_reg control_data_bits;
 
    unsigned grf_used;
    bool spilled_any_registers;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index f121f3463d3..22b2f22073f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -179,7 +179,7 @@ namespace brw {
          assert(dispatch_width() <= 32);
 
          if (n > 0)
-            return dst_reg(GRF, shader->alloc.allocate(
+            return dst_reg(VGRF, shader->alloc.allocate(
                               DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                            REG_SIZE)),
                            type);
@@ -224,12 +224,13 @@ namespace brw {
       src_reg
       sample_mask_reg() const
       {
-         const bool uses_kill =
-            (shader->stage == MESA_SHADER_FRAGMENT &&
-             ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill);
-         return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) :
-                 uses_kill ? brw_flag_reg(0, 1) :
-                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
+         if (shader->stage != MESA_SHADER_FRAGMENT) {
+            return src_reg(0xffff);
+         } else if (((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill) {
+            return brw_flag_reg(0, 1);
+         } else {
+            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
+         }
       }
 
       /**
@@ -595,7 +596,7 @@ namespace brw {
       src_reg
       fix_3src_operand(const src_reg &src) const
       {
-         if (src.file == GRF || src.file == UNIFORM || src.stride > 1) {
+         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
             return src;
          } else {
             dst_reg expanded = vgrf(src.type);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
index 883e8d2a49f..8fdc959f992 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp
@@ -62,7 +62,7 @@ opt_cmod_propagation_local(bblock_t *block)
            inst->opcode != BRW_OPCODE_MOV) ||
           inst->predicate != BRW_PREDICATE_NONE ||
           !inst->dst.is_null() ||
-          inst->src[0].file != GRF ||
+          inst->src[0].file != VGRF ||
           inst->src[0].abs)
          continue;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
index c182232285e..0c115f50748 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
@@ -121,7 +121,7 @@ struct imm {
     * constant value.
     */
    uint8_t subreg_offset;
-   uint16_t reg;
+   uint16_t nr;
 
    /** The number of coissuable instructions using this immediate. */
    uint16_t uses_by_coissue;
@@ -219,7 +219,7 @@ fs_visitor::opt_combine_constants()
              inst->src[i].type != BRW_REGISTER_TYPE_F)
             continue;
 
-         float val = fabsf(inst->src[i].fixed_hw_reg.dw1.f);
+         float val = fabsf(inst->src[i].f);
          struct imm *imm = find_imm(&table, val);
 
          if (imm) {
@@ -268,7 +268,7 @@ fs_visitor::opt_combine_constants()
 
 
    /* Insert MOVs to load the constant values into GRFs. */
-   fs_reg reg(GRF, alloc.allocate(dispatch_width / 8));
+   fs_reg reg(VGRF, alloc.allocate(dispatch_width / 8));
    reg.stride = 0;
    for (int i = 0; i < table.len; i++) {
       struct imm *imm = &table.imm[i];
@@ -280,12 +280,12 @@ fs_visitor::opt_combine_constants()
       const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
 
       ibld.MOV(reg, fs_reg(imm->val));
-      imm->reg = reg.reg;
+      imm->nr = reg.nr;
       imm->subreg_offset = reg.subreg_offset;
 
       reg.subreg_offset += sizeof(float);
       if ((unsigned)reg.subreg_offset == dispatch_width * sizeof(float)) {
-         reg.reg = alloc.allocate(dispatch_width / 8);
+         reg.nr = alloc.allocate(dispatch_width / 8);
          reg.subreg_offset = 0;
       }
    }
@@ -295,13 +295,12 @@ fs_visitor::opt_combine_constants()
    for (int i = 0; i < table.len; i++) {
       foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
          fs_reg *reg = link->reg;
-         reg->file = GRF;
-         reg->reg = table.imm[i].reg;
+         reg->file = VGRF;
+         reg->nr = table.imm[i].nr;
          reg->subreg_offset = table.imm[i].subreg_offset;
          reg->stride = 0;
-         reg->negate = signbit(reg->fixed_hw_reg.dw1.f) !=
-                               signbit(table.imm[i].val);
-         assert(fabsf(reg->fixed_hw_reg.dw1.f) == table.imm[i].val);
+         reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
+         assert(fabsf(reg->f) == table.imm[i].val);
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 26204827156..426ea57d8f9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -154,7 +154,7 @@ fs_copy_prop_dataflow::setup_initial_values()
    /* Initialize the COPY and KILL sets. */
    foreach_block (block, cfg) {
       foreach_inst_in_block(fs_inst, inst, block) {
-         if (inst->dst.file != GRF)
+         if (inst->dst.file != VGRF)
             continue;
 
          /* Mark ACP entries which are killed by this instruction. */
@@ -278,20 +278,20 @@ is_logic_op(enum opcode opcode)
 bool
 fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
 {
-   if (inst->src[arg].file != GRF)
+   if (inst->src[arg].file != VGRF)
       return false;
 
    if (entry->src.file == IMM)
       return false;
-   assert(entry->src.file == GRF || entry->src.file == UNIFORM ||
+   assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
           entry->src.file == ATTR);
 
    if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
        inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
       return false;
 
-   assert(entry->dst.file == GRF);
-   if (inst->src[arg].reg != entry->dst.reg)
+   assert(entry->dst.file == VGRF);
+   if (inst->src[arg].nr != entry->dst.nr)
       return false;
 
    /* Bail if inst is reading a range that isn't contained in the range
@@ -369,8 +369,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
       switch(inst->opcode) {
       case BRW_OPCODE_SEL:
          if (inst->src[1].file != IMM ||
-             inst->src[1].fixed_hw_reg.dw1.f < 0.0 ||
-             inst->src[1].fixed_hw_reg.dw1.f > 1.0) {
+             inst->src[1].f < 0.0 ||
+             inst->src[1].f > 1.0) {
             return false;
          }
          break;
@@ -380,19 +380,20 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    }
 
    inst->src[arg].file = entry->src.file;
-   inst->src[arg].reg = entry->src.reg;
+   inst->src[arg].nr = entry->src.nr;
    inst->src[arg].stride *= entry->src.stride;
    inst->saturate = inst->saturate || entry->saturate;
 
    switch (entry->src.file) {
    case UNIFORM:
    case BAD_FILE:
-   case HW_REG:
+   case ARF:
+   case FIXED_GRF:
       inst->src[arg].reg_offset = entry->src.reg_offset;
       inst->src[arg].subreg_offset = entry->src.subreg_offset;
       break;
    case ATTR:
-   case GRF:
+   case VGRF:
       {
          /* In this case, we'll just leave the width alone.  The source
           * register could have different widths depending on how it is
@@ -456,11 +457,11 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
       return false;
 
    for (int i = inst->sources - 1; i >= 0; i--) {
-      if (inst->src[i].file != GRF)
+      if (inst->src[i].file != VGRF)
          continue;
 
-      assert(entry->dst.file == GRF);
-      if (inst->src[i].reg != entry->dst.reg)
+      assert(entry->dst.file == VGRF);
+      if (inst->src[i].nr != entry->dst.nr)
          continue;
 
       /* Bail if inst is reading a range that isn't contained in the range
@@ -477,14 +478,14 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
 
       if (inst->src[i].abs) {
          if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
-             !brw_abs_immediate(val.type, &val.fixed_hw_reg)) {
+             !brw_abs_immediate(val.type, &val)) {
             continue;
          }
       }
 
       if (inst->src[i].negate) {
          if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
-             !brw_negate_immediate(val.type, &val.fixed_hw_reg)) {
+             !brw_negate_immediate(val.type, &val)) {
             continue;
          }
       }
@@ -605,10 +606,10 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
           * anyway.
           */
          assert(i == 0);
-         if (inst->src[0].fixed_hw_reg.dw1.f != 0.0f) {
+         if (inst->src[0].f != 0.0f) {
             inst->opcode = BRW_OPCODE_MOV;
             inst->src[0] = val;
-            inst->src[0].fixed_hw_reg.dw1.f = 1.0f / inst->src[0].fixed_hw_reg.dw1.f;
+            inst->src[0].f = 1.0f / inst->src[0].f;
             progress = true;
          }
          break;
@@ -652,9 +653,9 @@ static bool
 can_propagate_from(fs_inst *inst)
 {
    return (inst->opcode == BRW_OPCODE_MOV &&
-           inst->dst.file == GRF &&
-           ((inst->src[0].file == GRF &&
-             (inst->src[0].reg != inst->dst.reg ||
+           inst->dst.file == VGRF &&
+           ((inst->src[0].file == VGRF &&
+             (inst->src[0].nr != inst->dst.nr ||
               inst->src[0].reg_offset != inst->dst.reg_offset)) ||
             inst->src[0].file == ATTR ||
             inst->src[0].file == UNIFORM ||
@@ -675,10 +676,10 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
    foreach_inst_in_block(fs_inst, inst, block) {
       /* Try propagating into this instruction. */
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file != GRF)
+         if (inst->src[i].file != VGRF)
             continue;
 
-         foreach_in_list(acp_entry, entry, &acp[inst->src[i].reg % ACP_HASH_SIZE]) {
+         foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) {
             if (try_constant_propagate(inst, entry))
                progress = true;
 
@@ -688,8 +689,8 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
       }
 
       /* kill the destination from the ACP */
-      if (inst->dst.file == GRF) {
-	 foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.reg % ACP_HASH_SIZE]) {
+      if (inst->dst.file == VGRF) {
+         foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
 	    if (inst->overwrites_reg(entry->dst)) {
 	       entry->remove();
 	    }
@@ -716,14 +717,14 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
          entry->regs_written = inst->regs_written;
          entry->opcode = inst->opcode;
          entry->saturate = inst->saturate;
-	 acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
+         acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
       } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
-                 inst->dst.file == GRF) {
+                 inst->dst.file == VGRF) {
          int offset = 0;
          for (int i = 0; i < inst->sources; i++) {
             int effective_width = i < inst->header_size ? 8 : inst->exec_size;
             int regs_written = effective_width / 8;
-            if (inst->src[i].file == GRF) {
+            if (inst->src[i].file == VGRF) {
                acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
                entry->dst = inst->dst;
                entry->dst.reg_offset = offset;
@@ -731,7 +732,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
                entry->regs_written = regs_written;
                entry->opcode = inst->opcode;
                if (!entry->dst.equals(inst->src[i])) {
-                  acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
+                  acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
                } else {
                   ralloc_free(entry);
                }
@@ -774,7 +775,7 @@ fs_visitor::opt_copy_propagate()
       for (int i = 0; i < dataflow.num_acp; i++) {
          if (BITSET_TEST(dataflow.bd[block->num].livein, i)) {
             struct acp_entry *entry = dataflow.acp[i];
-            in_acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
+            in_acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
          }
       }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 3a28c8d591d..8c67caff6e0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -110,20 +110,20 @@ operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
               (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
    } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) {
       bool xs0_negate = xs[0].negate;
-      bool xs1_negate = xs[1].file == IMM ? xs[1].fixed_hw_reg.dw1.f < 0.0f
+      bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f
                                           : xs[1].negate;
       bool ys0_negate = ys[0].negate;
-      bool ys1_negate = ys[1].file == IMM ? ys[1].fixed_hw_reg.dw1.f < 0.0f
+      bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f
                                           : ys[1].negate;
-      float xs1_imm = xs[1].fixed_hw_reg.dw1.f;
-      float ys1_imm = ys[1].fixed_hw_reg.dw1.f;
+      float xs1_imm = xs[1].f;
+      float ys1_imm = ys[1].f;
 
       xs[0].negate = false;
       xs[1].negate = false;
       ys[0].negate = false;
       ys[1].negate = false;
-      xs[1].fixed_hw_reg.dw1.f = fabsf(xs[1].fixed_hw_reg.dw1.f);
-      ys[1].fixed_hw_reg.dw1.f = fabsf(ys[1].fixed_hw_reg.dw1.f);
+      xs[1].f = fabsf(xs[1].f);
+      ys[1].f = fabsf(ys[1].f);
 
       bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
                  (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
@@ -132,8 +132,8 @@ operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
       xs[1].negate = xs[1].file == IMM ? false : xs1_negate;
       ys[0].negate = ys0_negate;
       ys[1].negate = ys[1].file == IMM ? false : ys1_negate;
-      xs[1].fixed_hw_reg.dw1.f = xs1_imm;
-      ys[1].fixed_hw_reg.dw1.f = ys1_imm;
+      xs[1].f = xs1_imm;
+      ys[1].f = ys1_imm;
 
       *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate);
       return ret;
@@ -196,7 +196,7 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
          header_size = 0;
       }
 
-      assert(src.file == GRF);
+      assert(src.file == VGRF);
       payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
       for (int i = 0; i < header_size; i++) {
          payload[i] = src;
@@ -226,7 +226,8 @@ fs_visitor::opt_cse_local(bblock_t *block)
    foreach_inst_in_block(fs_inst, inst, block) {
       /* Skip some cases. */
       if (is_expression(this, inst) && !inst->is_partial_write() &&
-          (inst->dst.file != HW_REG || inst->dst.is_null()))
+          ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
+           inst->dst.is_null()))
       {
          bool found = false;
          bool negate = false;
@@ -262,7 +263,7 @@ fs_visitor::opt_cse_local(bblock_t *block)
                                        .at(block, entry->generator->next);
                int written = entry->generator->regs_written;
 
-               entry->tmp = fs_reg(GRF, alloc.allocate(written),
+               entry->tmp = fs_reg(VGRF, alloc.allocate(written),
                                    entry->generator->dst.type);
 
                create_copy_instr(ibld, entry->generator, entry->tmp, false);
@@ -320,7 +321,7 @@ fs_visitor::opt_cse_local(bblock_t *block)
             /* Kill any AEB entries using registers that don't get reused any
              * more -- a sure sign they'll fail operands_match().
              */
-            if (src_reg->file == GRF && virtual_grf_end[src_reg->reg] < ip) {
+            if (src_reg->file == VGRF && virtual_grf_end[src_reg->nr] < ip) {
                entry->remove();
                ralloc_free(entry);
                break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
index 4b5548a9dc5..a50cf6f24f1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp
@@ -52,7 +52,7 @@ fs_visitor::dead_code_eliminate()
              sizeof(BITSET_WORD));
 
       foreach_inst_in_block_reverse(fs_inst, inst, block) {
-         if (inst->dst.file == GRF && !inst->has_side_effects()) {
+         if (inst->dst.file == VGRF && !inst->has_side_effects()) {
             bool result_live = false;
 
             if (inst->regs_written == 1) {
@@ -96,7 +96,7 @@ fs_visitor::dead_code_eliminate()
             continue;
          }
 
-         if (inst->dst.file == GRF) {
+         if (inst->dst.file == VGRF) {
             if (!inst->is_partial_write()) {
                int var = live_intervals->var_from_reg(inst->dst);
                for (int i = 0; i < inst->regs_written; i++) {
@@ -105,12 +105,12 @@ fs_visitor::dead_code_eliminate()
             }
          }
 
-         if (inst->writes_flag()) {
+         if (inst->writes_flag() && !inst->predicate) {
             BITSET_CLEAR(flag_live, inst->flag_subreg);
          }
 
          for (int i = 0; i < inst->sources; i++) {
-            if (inst->src[i].file == GRF) {
+            if (inst->src[i].file == VGRF) {
                int var = live_intervals->var_from_reg(inst->src[i]);
 
                for (int j = 0; j < inst->regs_read(i); j++) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index e207a77fdc1..139cda3ca59 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -33,22 +33,25 @@
 #include "brw_fs.h"
 #include "brw_cfg.h"
 
-static uint32_t brw_file_from_reg(fs_reg *reg)
+static enum brw_reg_file
+brw_file_from_reg(fs_reg *reg)
 {
    switch (reg->file) {
-   case GRF:
+   case ARF:
+      return BRW_ARCHITECTURE_REGISTER_FILE;
+   case FIXED_GRF:
+   case VGRF:
       return BRW_GENERAL_REGISTER_FILE;
    case MRF:
       return BRW_MESSAGE_REGISTER_FILE;
    case IMM:
       return BRW_IMMEDIATE_VALUE;
    case BAD_FILE:
-   case HW_REG:
    case ATTR:
    case UNIFORM:
       unreachable("not reached");
    }
-   return 0;
+   return BRW_ARCHITECTURE_REGISTER_FILE;
 }
 
 static struct brw_reg
@@ -58,13 +61,13 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen)
 
    switch (reg->file) {
    case MRF:
-      assert((reg->reg & ~(1 << 7)) < BRW_MAX_MRF(gen));
+      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen));
       /* Fallthrough */
-   case GRF:
+   case VGRF:
       if (reg->stride == 0) {
-         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
+         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
       } else if (inst->exec_size < 8) {
-         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
+         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->nr, 0);
          brw_reg = stride(brw_reg, inst->exec_size * reg->stride,
                           inst->exec_size, reg->stride);
       } else {
@@ -77,12 +80,14 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen)
           * So, for registers with width > 8, we have to use a width of 8
           * and trust the compression state to sort out the exec size.
           */
-         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
+         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->nr, 0);
          brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
       }
 
       brw_reg = retype(brw_reg, reg->type);
       brw_reg = byte_offset(brw_reg, reg->subreg_offset);
+      brw_reg.abs = reg->abs;
+      brw_reg.negate = reg->negate;
       break;
    case IMM:
       assert(reg->stride == ((reg->type == BRW_REGISTER_TYPE_V ||
@@ -91,30 +96,33 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen)
 
       switch (reg->type) {
       case BRW_REGISTER_TYPE_F:
-	 brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
+	 brw_reg = brw_imm_f(reg->f);
 	 break;
       case BRW_REGISTER_TYPE_D:
-	 brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d);
+	 brw_reg = brw_imm_d(reg->d);
 	 break;
       case BRW_REGISTER_TYPE_UD:
-	 brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud);
+	 brw_reg = brw_imm_ud(reg->ud);
 	 break;
       case BRW_REGISTER_TYPE_W:
-	 brw_reg = brw_imm_w(reg->fixed_hw_reg.dw1.d);
+	 brw_reg = brw_imm_w(reg->d);
 	 break;
       case BRW_REGISTER_TYPE_UW:
-	 brw_reg = brw_imm_uw(reg->fixed_hw_reg.dw1.ud);
+	 brw_reg = brw_imm_uw(reg->ud);
 	 break;
       case BRW_REGISTER_TYPE_VF:
-         brw_reg = brw_imm_vf(reg->fixed_hw_reg.dw1.ud);
+         brw_reg = brw_imm_vf(reg->ud);
+         break;
+      case BRW_REGISTER_TYPE_V:
+         brw_reg = brw_imm_v(reg->ud);
          break;
       default:
 	 unreachable("not reached");
       }
       break;
-   case HW_REG:
-      assert(reg->type == reg->fixed_hw_reg.type);
-      brw_reg = reg->fixed_hw_reg;
+   case ARF:
+   case FIXED_GRF:
+      brw_reg = *static_cast<struct brw_reg *>(reg);
       break;
    case BAD_FILE:
       /* Probably unused. */
@@ -124,10 +132,6 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen)
    case UNIFORM:
       unreachable("not reached");
    }
-   if (reg->abs)
-      brw_reg = brw_abs(brw_reg);
-   if (reg->negate)
-      brw_reg = negate(brw_reg);
 
    return brw_reg;
 }
@@ -383,6 +387,9 @@ fs_generator::generate_urb_read(fs_inst *inst,
    brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
    brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
 
+   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
+      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);
+
    brw_inst_set_mlen(p->devinfo, send, inst->mlen);
    brw_inst_set_rlen(p->devinfo, send, inst->regs_written);
    brw_inst_set_header_present(p->devinfo, send, true);
@@ -658,7 +665,7 @@ fs_generator::generate_get_buffer_size(fs_inst *inst,
               retype(dst, BRW_REGISTER_TYPE_UW),
               inst->base_mrf,
               src,
-              surf_index.dw1.ud,
+              surf_index.ud,
               0,
               GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
               rlen, /* response length */
@@ -667,7 +674,7 @@ fs_generator::generate_get_buffer_size(fs_inst *inst,
               simd_mode,
               BRW_SAMPLER_RETURN_FORMAT_SINT32);
 
-   brw_mark_surface_used(prog_data, surf_index.dw1.ud);
+   brw_mark_surface_used(prog_data, surf_index.ud);
 }
 
 void
@@ -741,6 +748,10 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
       case SHADER_OPCODE_TXF:
 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 	 break;
+      case SHADER_OPCODE_TXF_CMS_W:
+         assert(devinfo->gen >= 9);
+         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
+         break;
       case SHADER_OPCODE_TXF_CMS:
          if (devinfo->gen >= 7)
             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
@@ -905,7 +916,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
          : prog_data->binding_table.texture_start;
 
    if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
-      uint32_t sampler = sampler_index.dw1.ud;
+      uint32_t sampler = sampler_index.ud;
 
       brw_SAMPLE(p,
                  retype(dst, BRW_REGISTER_TYPE_UW),
@@ -1172,16 +1183,14 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
 
    assert(index.file == BRW_IMMEDIATE_VALUE &&
 	  index.type == BRW_REGISTER_TYPE_UD);
-   uint32_t surf_index = index.dw1.ud;
+   uint32_t surf_index = index.ud;
 
    assert(offset.file == BRW_IMMEDIATE_VALUE &&
 	  offset.type == BRW_REGISTER_TYPE_UD);
-   uint32_t read_offset = offset.dw1.ud;
+   uint32_t read_offset = offset.ud;
 
    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
 			read_offset, surf_index);
-
-   brw_mark_surface_used(prog_data, surf_index);
 }
 
 void
@@ -1223,7 +1232,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
 
    if (index.file == BRW_IMMEDIATE_VALUE) {
 
-      uint32_t surf_index = index.dw1.ud;
+      uint32_t surf_index = index.ud;
 
       brw_push_insn_state(p);
       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
@@ -1242,9 +1251,6 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                               header_present,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
-
-      brw_mark_surface_used(prog_data, surf_index);
-
    } else {
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
@@ -1274,11 +1280,6 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                               0);
 
       brw_pop_insn_state(p);
-
-      /* visitor knows more than we do about the surface limit required,
-       * so has already done marking.
-       */
-
    }
 }
 
@@ -1294,7 +1295,7 @@ fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
 
    assert(index.file == BRW_IMMEDIATE_VALUE &&
 	  index.type == BRW_REGISTER_TYPE_UD);
-   uint32_t surf_index = index.dw1.ud;
+   uint32_t surf_index = index.ud;
 
    uint32_t simd_mode, rlen, msg_type;
    if (dispatch_width == 16) {
@@ -1345,8 +1346,6 @@ fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                            inst->header_size != 0,
                            simd_mode,
                            return_format);
-
-   brw_mark_surface_used(prog_data, surf_index);
 }
 
 void
@@ -1376,7 +1375,7 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
 
    if (index.file == BRW_IMMEDIATE_VALUE) {
 
-      uint32_t surf_index = index.dw1.ud;
+      uint32_t surf_index = index.ud;
 
       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
@@ -1391,8 +1390,6 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                               simd_mode,
                               0);
 
-      brw_mark_surface_used(prog_data, surf_index);
-
    } else {
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
@@ -1423,10 +1420,6 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                               false /* header */,
                               simd_mode,
                               0);
-
-      /* visitor knows more than we do about the surface limit required,
-       * so has already done marking.
-       */
    }
 }
 
@@ -2050,6 +2043,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
       case SHADER_OPCODE_TXD:
       case SHADER_OPCODE_TXF:
       case SHADER_OPCODE_TXF_CMS:
+      case SHADER_OPCODE_TXF_CMS_W:
       case SHADER_OPCODE_TXF_UMS:
       case SHADER_OPCODE_TXF_MCS:
       case SHADER_OPCODE_TXL:
@@ -2067,7 +2061,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
       case FS_OPCODE_DDY_COARSE:
       case FS_OPCODE_DDY_FINE:
          assert(src[1].file == BRW_IMMEDIATE_VALUE);
-         generate_ddy(inst->opcode, dst, src[0], src[1].dw1.ud);
+         generate_ddy(inst->opcode, dst, src[0], src[1].ud);
 	 break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
@@ -2086,6 +2080,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 	 break;
 
       case SHADER_OPCODE_URB_READ_SIMD8:
+      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
          generate_urb_read(inst, dst, src[0]);
          break;
 
@@ -2135,37 +2130,37 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 
       case SHADER_OPCODE_UNTYPED_ATOMIC:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud,
+         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
                             inst->mlen, !inst->dst.is_null());
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_read(p, dst, src[0], src[1],
-                                  inst->mlen, src[2].dw1.ud);
+                                  inst->mlen, src[2].ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_write(p, src[0], src[1],
-                                   inst->mlen, src[2].dw1.ud);
+                                   inst->mlen, src[2].ud);
          break;
 
       case SHADER_OPCODE_TYPED_ATOMIC:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_typed_atomic(p, dst, src[0], src[1],
-                          src[2].dw1.ud, inst->mlen, !inst->dst.is_null());
+                          src[2].ud, inst->mlen, !inst->dst.is_null());
          break;
 
       case SHADER_OPCODE_TYPED_SURFACE_READ:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_typed_surface_read(p, dst, src[0], src[1],
-                                inst->mlen, src[2].dw1.ud);
+                                inst->mlen, src[2].ud);
          break;
 
       case SHADER_OPCODE_TYPED_SURFACE_WRITE:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].dw1.ud);
+         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
@@ -2267,6 +2262,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
    brw_set_uip_jip(p);
    annotation_finalize(&annotation, p->next_insn_offset);
 
+#ifndef NDEBUG
+   bool validated = brw_validate_instructions(p, start_offset, &annotation);
+#else
+   if (unlikely(debug_flag))
+      brw_validate_instructions(p, start_offset, &annotation);
+#endif
+
    int before_size = p->next_insn_offset - start_offset;
    brw_compact_instructions(p, start_offset, annotation.ann_count,
                             annotation.ann);
@@ -2282,8 +2284,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 
       dump_assembly(p->store, annotation.ann_count, annotation.ann,
                     p->devinfo);
-      ralloc_free(annotation.ann);
+      ralloc_free(annotation.mem_ctx);
    }
+   assert(validated);
 
    compiler->shader_debug_log(log_data,
                               "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
index ce066a9778e..80fb8c28f81 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
@@ -117,7 +117,7 @@ fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
    /* The def[] bitset marks when an initialization in a block completely
     * screens off previous updates of that variable (VGRF channel).
     */
-   if (inst->dst.file == GRF && !inst->is_partial_write()) {
+   if (inst->dst.file == VGRF && !inst->is_partial_write()) {
       if (!BITSET_TEST(bd->use, var))
          BITSET_SET(bd->def, var);
    }
@@ -149,7 +149,7 @@ fs_live_variables::setup_def_use()
 	 for (unsigned int i = 0; i < inst->sources; i++) {
             fs_reg reg = inst->src[i];
 
-            if (reg.file != GRF)
+            if (reg.file != VGRF)
                continue;
 
             for (int j = 0; j < inst->regs_read(i); j++) {
@@ -172,7 +172,7 @@ fs_live_variables::setup_def_use()
          }
 
          /* Set def[] for this instruction */
-         if (inst->dst.file == GRF) {
+         if (inst->dst.file == VGRF) {
             fs_reg reg = inst->dst;
             for (int j = 0; j < inst->regs_written; j++) {
                setup_one_write(bd, inst, ip, reg);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
index c7457069ede..96cadea96aa 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h
@@ -68,7 +68,7 @@ public:
    bool vars_interfere(int a, int b);
    int var_from_reg(const fs_reg &reg) const
    {
-      return var_from_vgrf[reg.reg] + reg.reg_offset;
+      return var_from_vgrf[reg.nr] + reg.reg_offset;
    }
 
    /** Map from virtual GRF number to index in block_data arrays. */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 486741bea31..a47b6ce50cc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -28,6 +28,7 @@
 #include "program/prog_to_nir.h"
 #include "brw_fs.h"
 #include "brw_fs_surface_builder.h"
+#include "brw_vec4_gs_visitor.h"
 #include "brw_nir.h"
 #include "brw_fs_surface_builder.h"
 #include "brw_vec4_gs_visitor.h"
@@ -102,7 +103,8 @@ fs_visitor::nir_setup_outputs()
 
       switch (stage) {
       case MESA_SHADER_VERTEX:
-         for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) {
+      case MESA_SHADER_GEOMETRY:
+         for (int i = 0; i < type_size_vec4(var->type); i++) {
             int output = var->data.location + i;
             this->outputs[output] = offset(reg, bld, 4 * i);
             this->output_components[output] = vector_elements;
@@ -260,6 +262,10 @@ void
 fs_visitor::nir_emit_system_values()
 {
    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
+   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
+      nir_system_values[i] = fs_reg();
+   }
+
    nir_foreach_overload(nir, overload) {
       assert(strcmp(overload->function->name, "main") == 0);
       assert(overload->impl);
@@ -270,7 +276,11 @@ fs_visitor::nir_emit_system_values()
 void
 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 {
-   nir_locals = reralloc(mem_ctx, nir_locals, fs_reg, impl->reg_alloc);
+   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
+   for (unsigned i = 0; i < impl->reg_alloc; i++) {
+      nir_locals[i] = fs_reg();
+   }
+
    foreach_list_typed(nir_register, reg, node, &impl->registers) {
       unsigned array_elems =
          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
@@ -358,7 +368,22 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       break;
 
    case nir_instr_type_intrinsic:
-      nir_emit_intrinsic(abld, nir_instr_as_intrinsic(instr));
+      switch (stage) {
+      case MESA_SHADER_VERTEX:
+         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_GEOMETRY:
+         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_FRAGMENT:
+         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_COMPUTE:
+         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      default:
+         unreachable("unsupported shader stage");
+      }
       break;
 
    case nir_instr_type_tex:
@@ -1060,18 +1085,17 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
    fs_reg image(UNIFORM, deref->var->data.driver_location,
                 BRW_REGISTER_TYPE_UD);
 
-   if (deref->deref.child) {
-      const nir_deref_array *deref_array =
-         nir_deref_as_array(deref->deref.child);
-      assert(deref->deref.child->deref_type == nir_deref_type_array &&
-             deref_array->deref.child == NULL);
-      const unsigned size = glsl_get_length(deref->var->type);
+   for (const nir_deref *tail = &deref->deref; tail->child;
+        tail = tail->child) {
+      const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
+      assert(tail->child->deref_type == nir_deref_type_array);
+      const unsigned size = glsl_get_length(tail->type);
+      const unsigned element_size = type_size_scalar(deref_array->deref.type);
       const unsigned base = MIN2(deref_array->base_offset, size - 1);
-
-      image = offset(image, bld, base * BRW_IMAGE_PARAM_SIZE);
+      image = offset(image, bld, base * element_size);
 
       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-         fs_reg *tmp = new(mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+         fs_reg tmp = vgrf(glsl_type::int_type);
 
          if (devinfo->gen == 7 && !devinfo->is_haswell) {
             /* IVB hangs when trying to access an invalid surface index with
@@ -1082,15 +1106,18 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
              * of the possible outcomes of the hang.  Clamp the index to
              * prevent access outside of the array bounds.
              */
-            bld.emit_minmax(*tmp, retype(get_nir_src(deref_array->indirect),
-                                         BRW_REGISTER_TYPE_UD),
+            bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
+                                        BRW_REGISTER_TYPE_UD),
                             fs_reg(size - base - 1), BRW_CONDITIONAL_L);
          } else {
-            bld.MOV(*tmp, get_nir_src(deref_array->indirect));
+            bld.MOV(tmp, get_nir_src(deref_array->indirect));
          }
 
-         bld.MUL(*tmp, *tmp, fs_reg(BRW_IMAGE_PARAM_SIZE));
-         image.reladdr = tmp;
+         bld.MUL(tmp, tmp, fs_reg(element_size));
+         if (image.reladdr)
+            bld.ADD(*image.reladdr, *image.reladdr, tmp);
+         else
+            image.reladdr = new(mem_ctx) fs_reg(tmp);
       }
    }
 
@@ -1108,7 +1135,7 @@ fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
       new_inst->dst = offset(new_inst->dst, bld, i);
       for (unsigned j = 0; j < new_inst->sources; j++)
-         if (new_inst->src[j].file == GRF)
+         if (new_inst->src[j].file == VGRF)
             new_inst->src[j] = offset(new_inst->src[j], bld, i);
 
       bld.emit(new_inst);
@@ -1194,16 +1221,498 @@ emit_pixel_interpolater_send(const fs_builder &bld,
    return inst;
 }
 
+/**
+ * Computes 1 << x, given a D/UD register containing some value x.
+ */
+static fs_reg
+intexp2(const fs_builder &bld, const fs_reg &x)
+{
+   assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
+
+   fs_reg result = bld.vgrf(x.type, 1);
+   fs_reg one = bld.vgrf(x.type, 1);
+
+   bld.MOV(one, retype(fs_reg(1), one.type));
+   bld.SHL(result, one, x);
+   return result;
+}
+
 void
-fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
+fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_prog_data =
+      (struct brw_gs_prog_data *) prog_data;
+
+   /* We can only do EndPrimitive() functionality when the control data
+    * consists of cut bits.  Fortunately, the only time it isn't is when the
+    * output type is points, in which case EndPrimitive() is a no-op.
+    */
+   if (gs_prog_data->control_data_format !=
+       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
+      return;
+   }
+
+   /* Cut bits use one bit per vertex. */
+   assert(gs_compile->control_data_bits_per_vertex == 1);
+
+   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
+   vertex_count.type = BRW_REGISTER_TYPE_UD;
+
+   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
+    * vertex n, 0 otherwise.  So all we need to do here is mark bit
+    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
+    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
+    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
+    *
+    * Note that if EndPrimitive() is called before emitting any vertices, this
+    * will cause us to set bit 31 of the control_data_bits register to 1.
+    * That's fine because:
+    *
+    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
+    *   output, so the hardware will ignore cut bit 31.
+    *
+    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
+    *   last vertex, so setting cut bit 31 has no effect (since the primitive
+    *   is automatically ended when the GS terminates).
+    *
+    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
+    *   control_data_bits register to 0 when the first vertex is emitted.
+    */
+
+   const fs_builder abld = bld.annotate("end primitive");
+
+   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
+   fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
+   fs_reg mask = intexp2(abld, prev_count);
+   /* Note: we're relying on the fact that the GEN SHL instruction only pays
+    * attention to the lower 5 bits of its second source argument, so on this
+    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
+    * ((vertex_count - 1) % 32).
+    */
+   abld.OR(this->control_data_bits, this->control_data_bits, mask);
+}
+
+void
+fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
 {
+   assert(stage == MESA_SHADER_GEOMETRY);
+   assert(gs_compile->control_data_bits_per_vertex != 0);
+
+   struct brw_gs_prog_data *gs_prog_data =
+      (struct brw_gs_prog_data *) prog_data;
+
+   const fs_builder abld = bld.annotate("emit control data bits");
+   const fs_builder fwa_bld = bld.exec_all();
+
+   /* We use a single UD register to accumulate control data bits (32 bits
+    * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
+    * at a time.
+    *
+    * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
+    * We have select a 128-bit group via the Global and Per-Slot Offsets, then
+    * use the Channel Mask phase to enable/disable which DWord within that
+    * group to write.  (Remember, different SIMD8 channels may have emitted
+    * different numbers of vertices, so we may need per-slot offsets.)
+    *
+    * Channel masking presents an annoying problem: we may have to replicate
+    * the data up to 4 times:
+    *
+    * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
+    *
+    * To avoid penalizing shaders that emit a small number of vertices, we
+    * can avoid these sometimes: if the size of the control data header is
+    * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
+    * land in the same 128-bit group, so we can skip per-slot offsets.
+    *
+    * Similarly, if the control data header is <= 32 bits, there is only one
+    * DWord, so we can skip channel masks.
+    */
+   enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
+
+   fs_reg channel_mask, per_slot_offset;
+
+   if (gs_compile->control_data_header_size_bits > 32) {
+      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+      channel_mask = vgrf(glsl_type::uint_type);
+   }
+
+   if (gs_compile->control_data_header_size_bits > 128) {
+      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
+      per_slot_offset = vgrf(glsl_type::uint_type);
+   }
+
+   /* Figure out which DWord we're trying to write to using the formula:
+    *
+    *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
+    *
+    * Since bits_per_vertex is a power of two, and is known at compile
+    * time, this can be optimized to:
+    *
+    *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
+    */
+   if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
+      fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
+      unsigned log2_bits_per_vertex =
+         _mesa_fls(gs_compile->control_data_bits_per_vertex);
+      abld.SHR(dword_index, prev_count, fs_reg(6u - log2_bits_per_vertex));
+
+      if (per_slot_offset.file != BAD_FILE) {
+         /* Set the per-slot offset to dword_index / 4, so that we'll write to
+          * the appropriate OWord within the control data header.
+          */
+         abld.SHR(per_slot_offset, dword_index, fs_reg(2u));
+      }
+
+      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
+       * write to the appropriate DWORD within the OWORD.
+       */
+      fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      fwa_bld.AND(channel, dword_index, fs_reg(3u));
+      channel_mask = intexp2(fwa_bld, channel);
+      /* Then the channel masks need to be in bits 23:16. */
+      fwa_bld.SHL(channel_mask, channel_mask, fs_reg(16u));
+   }
+
+   /* Store the control data bits in the message payload and send it. */
+   int mlen = 2;
+   if (channel_mask.file != BAD_FILE)
+      mlen += 4; /* channel masks, plus 3 extra copies of the data */
+   if (per_slot_offset.file != BAD_FILE)
+      mlen++;
+
+   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
+   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
+   int i = 0;
+   sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+   if (per_slot_offset.file != BAD_FILE)
+      sources[i++] = per_slot_offset;
+   if (channel_mask.file != BAD_FILE)
+      sources[i++] = channel_mask;
+   while (i < mlen) {
+      sources[i++] = this->control_data_bits;
+   }
+
+   abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
+   fs_inst *inst = abld.emit(opcode, reg_undef, payload);
+   inst->mlen = mlen;
+   /* We need to increment Global Offset by 256-bits to make room for
+    * Broadwell's extra "Vertex Count" payload at the beginning of the
+    * URB entry.  Since this is an OWord message, Global Offset is counted
+    * in 128-bit units, so we must set it to 2.
+    */
+   if (gs_prog_data->static_vertex_count == -1)
+      inst->offset = 2;
+}
+
+void
+fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
+                                            unsigned stream_id)
+{
+   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
+
+   /* Note: we are calling this *before* increasing vertex_count, so
+    * this->vertex_count == vertex_count - 1 in the formula above.
+    */
+
+   /* Stream mode uses 2 bits per vertex */
+   assert(gs_compile->control_data_bits_per_vertex == 2);
+
+   /* Must be a valid stream */
+   assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
+
+   /* Control data bits are initialized to 0 so we don't have to set any
+    * bits when sending vertices to stream 0.
+    */
+   if (stream_id == 0)
+      return;
+
+   const fs_builder abld = bld.annotate("set stream control data bits", NULL);
+
+   /* reg::sid = stream_id */
+   fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.MOV(sid, fs_reg(stream_id));
+
+   /* reg:shift_count = 2 * (vertex_count - 1) */
+   fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.SHL(shift_count, vertex_count, fs_reg(1u));
+
+   /* Note: we're relying on the fact that the GEN SHL instruction only pays
+    * attention to the lower 5 bits of its second source argument, so on this
+    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
+    * stream_id << ((2 * (vertex_count - 1)) % 32).
+    */
+   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.SHL(mask, sid, shift_count);
+   abld.OR(this->control_data_bits, this->control_data_bits, mask);
+}
+
+void
+fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
+                           unsigned stream_id)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_prog_data =
+      (struct brw_gs_prog_data *) prog_data;
+
+   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
+   vertex_count.type = BRW_REGISTER_TYPE_UD;
+
+   /* Haswell and later hardware ignores the "Render Stream Select" bits
+    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
+    * and instead sends all primitives down the pipeline for rasterization.
+    * If the SOL stage is enabled, "Render Stream Select" is honored and
+    * primitives bound to non-zero streams are discarded after stream output.
+    *
+    * Since the only purpose of primives sent to non-zero streams is to
+    * be recorded by transform feedback, we can simply discard all geometry
+    * bound to these streams when transform feedback is disabled.
+    */
+   if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
+      return;
+
+   /* If we're outputting 32 control data bits or less, then we can wait
+    * until the shader is over to output them all.  Otherwise we need to
+    * output them as we go.  Now is the time to do it, since we're about to
+    * output the vertex_count'th vertex, so it's guaranteed that the
+    * control data bits associated with the (vertex_count - 1)th vertex are
+    * correct.
+    */
+   if (gs_compile->control_data_header_size_bits > 32) {
+      const fs_builder abld =
+         bld.annotate("emit vertex: emit control data bits");
+
+      /* Only emit control data bits if we've finished accumulating a batch
+       * of 32 bits.  This is the case when:
+       *
+       *     (vertex_count * bits_per_vertex) % 32 == 0
+       *
+       * (in other words, when the last 5 bits of vertex_count *
+       * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
+       * integer n (which is always the case, since bits_per_vertex is
+       * always 1 or 2), this is equivalent to requiring that the last 5-n
+       * bits of vertex_count are 0:
+       *
+       *     vertex_count & (2^(5-n) - 1) == 0
+       *
+       * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
+       * equivalent to:
+       *
+       *     vertex_count & (32 / bits_per_vertex - 1) == 0
+       *
+       * TODO: If vertex_count is an immediate, we could do some of this math
+       *       at compile time...
+       */
+      fs_inst *inst =
+         abld.AND(bld.null_reg_d(), vertex_count,
+                  fs_reg(32u / gs_compile->control_data_bits_per_vertex - 1u));
+      inst->conditional_mod = BRW_CONDITIONAL_Z;
+
+      abld.IF(BRW_PREDICATE_NORMAL);
+      /* If vertex_count is 0, then no control data bits have been
+       * accumulated yet, so we can skip emitting them.
+       */
+      abld.CMP(bld.null_reg_d(), vertex_count, fs_reg(0u),
+               BRW_CONDITIONAL_NEQ);
+      abld.IF(BRW_PREDICATE_NORMAL);
+      emit_gs_control_data_bits(vertex_count);
+      abld.emit(BRW_OPCODE_ENDIF);
+
+      /* Reset control_data_bits to 0 so we can start accumulating a new
+       * batch.
+       *
+       * Note: in the case where vertex_count == 0, this neutralizes the
+       * effect of any call to EndPrimitive() that the shader may have
+       * made before outputting its first vertex.
+       */
+      inst = abld.MOV(this->control_data_bits, fs_reg(0u));
+      inst->force_writemask_all = true;
+      abld.emit(BRW_OPCODE_ENDIF);
+   }
+
+   emit_urb_writes(vertex_count);
+
+   /* In stream mode we have to set control data bits for all vertices
+    * unless we have disabled control data bits completely (which we do
+    * do for GL_POINTS outputs that don't use streams).
+    */
+   if (gs_compile->control_data_header_size_bits > 0 &&
+       gs_prog_data->control_data_format ==
+          GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
+      set_gs_stream_control_data_bits(vertex_count, stream_id);
+   }
+}
+
+void
+fs_visitor::emit_gs_input_load(const fs_reg &dst,
+                               const nir_src &vertex_src,
+                               unsigned input_offset,
+                               unsigned num_components)
+{
+   const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data;
+   const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0];
+
+   const unsigned array_stride = vue_prog_data->urb_read_length * 8;
+
+   const bool pushed = 4 * input_offset < array_stride;
+
+   if (input_offset == 0) {
+      /* This is the VUE header, containing VARYING_SLOT_LAYER [.y],
+       * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].
+       * Only gl_PointSize is available as a GS input, so they must
+       * be asking for that input.
+       */
+      if (pushed) {
+         bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type));
+      } else {
+         fs_reg tmp = bld.vgrf(dst.type, 4);
+         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
+                                  fs_reg(vertex), fs_reg(0));
+         inst->regs_written = 4;
+         bld.MOV(dst, offset(tmp, bld, 3));
+      }
+   } else {
+      if (pushed) {
+         int index = vertex * array_stride + 4 * input_offset;
+         for (unsigned i = 0; i < num_components; i++) {
+            bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type));
+         }
+      } else {
+         fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
+                                  fs_reg(vertex), fs_reg(input_offset));
+         inst->regs_written = num_components;
+      }
+   }
+}
+
+void
+fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_VERTEX);
+
    fs_reg dest;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
       dest = get_nir_dest(instr->dest);
 
-   bool has_indirect = false;
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id()");
+
+   case nir_intrinsic_load_vertex_id_zero_base:
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_instance_id: {
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      fs_reg val = nir_system_values[sv];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      bld.MOV(dest, val);
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
+fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
 
    switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      assert(stage == MESA_SHADER_GEOMETRY);
+      assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
+              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
+      break;
+
+   case nir_intrinsic_load_input_indirect:
+   case nir_intrinsic_load_input:
+      unreachable("load_input intrinsics are invalid for the GS stage");
+
+   case nir_intrinsic_load_per_vertex_input_indirect:
+      assert(!"Not allowed");
+   case nir_intrinsic_load_per_vertex_input:
+      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
+                         instr->num_components);
+      break;
+
+   case nir_intrinsic_emit_vertex_with_counter:
+      emit_gs_vertex(instr->src[0], instr->const_index[0]);
+      break;
+
+   case nir_intrinsic_end_primitive_with_counter:
+      emit_gs_end_primitive(instr->src[0]);
+      break;
+
+   case nir_intrinsic_set_vertex_count:
+      bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
+      break;
+
+   case nir_intrinsic_load_invocation_id: {
+      fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      bld.MOV(dest, val);
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
+fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm_prog_data =
+      (struct brw_wm_prog_data *) prog_data;
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_front_face:
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+              *emit_frontfacing_interpolation());
+      break;
+
+   case nir_intrinsic_load_sample_pos: {
+      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
+      assert(sample_pos.file != BAD_FILE);
+      dest.type = sample_pos.type;
+      bld.MOV(dest, sample_pos);
+      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
+      break;
+   }
+
+   case nir_intrinsic_load_sample_mask_in:
+   case nir_intrinsic_load_sample_id: {
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      fs_reg val = nir_system_values[sv];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      bld.MOV(dest, val);
+      break;
+   }
+
    case nir_intrinsic_discard:
    case nir_intrinsic_discard_if: {
       /* We track our discarded pixels in f0.1.  By predicating on it, we can
@@ -1229,6 +1738,248 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_interp_var_at_centroid:
+   case nir_intrinsic_interp_var_at_sample:
+   case nir_intrinsic_interp_var_at_offset: {
+      /* Handle ARB_gpu_shader5 interpolation intrinsics
+       *
+       * It's worth a quick word of explanation as to why we handle the full
+       * variable-based interpolation intrinsic rather than a lowered version
+       * with like we do for other inputs.  We have to do that because the way
+       * we set up inputs doesn't allow us to use the already setup inputs for
+       * interpolation.  At the beginning of the shader, we go through all of
+       * the input variables and do the initial interpolation and put it in
+       * the nir_inputs array based on its location as determined in
+       * nir_lower_io.  If the input isn't used, dead code cleans up and
+       * everything works fine.  However, when we get to the ARB_gpu_shader5
+       * interpolation intrinsics, we need to reinterpolate the input
+       * differently.  If we used an intrinsic that just had an index it would
+       * only give us the offset into the nir_inputs array.  However, this is
+       * useless because that value is post-interpolation and we need
+       * pre-interpolation.  In order to get the actual location of the bits
+       * we get from the vertex fetching hardware, we need the variable.
+       */
+      wm_prog_data->pulls_bary = true;
+
+      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
+      const glsl_interp_qualifier interpolation =
+         (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation;
+
+      switch (instr->intrinsic) {
+      case nir_intrinsic_interp_var_at_centroid:
+         emit_pixel_interpolater_send(bld,
+                                      FS_OPCODE_INTERPOLATE_AT_CENTROID,
+                                      dst_xy,
+                                      fs_reg(), /* src */
+                                      fs_reg(0u),
+                                      interpolation);
+         break;
+
+      case nir_intrinsic_interp_var_at_sample: {
+         nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
+
+         if (const_sample) {
+            unsigned msg_data = const_sample->i[0] << 4;
+
+            emit_pixel_interpolater_send(bld,
+                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                         dst_xy,
+                                         fs_reg(), /* src */
+                                         fs_reg(msg_data),
+                                         interpolation);
+         } else {
+            const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
+                                             BRW_REGISTER_TYPE_UD);
+
+            if (nir_src_is_dynamically_uniform(instr->src[0])) {
+               const fs_reg sample_id = bld.emit_uniformize(sample_src);
+               const fs_reg msg_data = vgrf(glsl_type::uint_type);
+               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
+               emit_pixel_interpolater_send(bld,
+                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                            dst_xy,
+                                            fs_reg(), /* src */
+                                            msg_data,
+                                            interpolation);
+            } else {
+               /* Make a loop that sends a message to the pixel interpolater
+                * for the sample number in each live channel. If there are
+                * multiple channels with the same sample number then these
+                * will be handled simultaneously with a single interation of
+                * the loop.
+                */
+               bld.emit(BRW_OPCODE_DO);
+
+               /* Get the next live sample number into sample_id_reg */
+               const fs_reg sample_id = bld.emit_uniformize(sample_src);
+
+               /* Set the flag register so that we can perform the send
+                * message on all channels that have the same sample number
+                */
+               bld.CMP(bld.null_reg_ud(),
+                       sample_src, sample_id,
+                       BRW_CONDITIONAL_EQ);
+               const fs_reg msg_data = vgrf(glsl_type::uint_type);
+               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
+               fs_inst *inst =
+                  emit_pixel_interpolater_send(bld,
+                                               FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                               dst_xy,
+                                               fs_reg(), /* src */
+                                               msg_data,
+                                               interpolation);
+               set_predicate(BRW_PREDICATE_NORMAL, inst);
+
+               /* Continue the loop if there are any live channels left */
+               set_predicate_inv(BRW_PREDICATE_NORMAL,
+                                 true, /* inverse */
+                                 bld.emit(BRW_OPCODE_WHILE));
+            }
+         }
+
+         break;
+      }
+
+      case nir_intrinsic_interp_var_at_offset: {
+         nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+
+         if (const_offset) {
+            unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
+            unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
+
+            emit_pixel_interpolater_send(bld,
+                                         FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
+                                         dst_xy,
+                                         fs_reg(), /* src */
+                                         fs_reg(off_x | (off_y << 4)),
+                                         interpolation);
+         } else {
+            fs_reg src = vgrf(glsl_type::ivec2_type);
+            fs_reg offset_src = retype(get_nir_src(instr->src[0]),
+                                       BRW_REGISTER_TYPE_F);
+            for (int i = 0; i < 2; i++) {
+               fs_reg temp = vgrf(glsl_type::float_type);
+               bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f));
+               fs_reg itemp = vgrf(glsl_type::int_type);
+               bld.MOV(itemp, temp);  /* float to int */
+
+               /* Clamp the upper end of the range to +7/16.
+                * ARB_gpu_shader5 requires that we support a maximum offset
+                * of +0.5, which isn't representable in a S0.4 value -- if
+                * we didn't clamp it, we'd end up with -8/16, which is the
+                * opposite of what the shader author wanted.
+                *
+                * This is legal due to ARB_gpu_shader5's quantization
+                * rules:
+                *
+                * "Not all values of <offset> may be supported; x and y
+                * offsets may be rounded to fixed-point values with the
+                * number of fraction bits given by the
+                * implementation-dependent constant
+                * FRAGMENT_INTERPOLATION_OFFSET_BITS"
+                */
+               set_condmod(BRW_CONDITIONAL_L,
+                           bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
+            }
+
+            const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
+            emit_pixel_interpolater_send(bld,
+                                         opcode,
+                                         dst_xy,
+                                         src,
+                                         fs_reg(0u),
+                                         interpolation);
+         }
+         break;
+      }
+
+      default:
+         unreachable("Invalid intrinsic");
+      }
+
+      for (unsigned j = 0; j < instr->num_components; j++) {
+         fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
+         src.type = dest.type;
+
+         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
+         dest = offset(dest, bld, 1);
+      }
+      break;
+   }
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
+fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_COMPUTE);
+   struct brw_cs_prog_data *cs_prog_data =
+      (struct brw_cs_prog_data *) prog_data;
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_barrier:
+      emit_barrier();
+      cs_prog_data->uses_barrier = true;
+      break;
+
+   case nir_intrinsic_load_local_invocation_id:
+   case nir_intrinsic_load_work_group_id: {
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      fs_reg val = nir_system_values[sv];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      for (unsigned i = 0; i < 3; i++)
+         bld.MOV(offset(dest, bld, i), offset(val, bld, i));
+      break;
+   }
+
+   case nir_intrinsic_load_num_work_groups: {
+      const unsigned surface =
+         cs_prog_data->binding_table.work_groups_start;
+
+      cs_prog_data->uses_num_work_groups = true;
+
+      fs_reg surf_index = fs_reg(surface);
+      brw_mark_surface_used(prog_data, surface);
+
+      /* Read the 3 GLuint components of gl_NumWorkGroups */
+      for (unsigned i = 0; i < 3; i++) {
+         fs_reg read_result =
+            emit_untyped_read(bld, surf_index,
+                              fs_reg(i << 2),
+                              1 /* dims */, 1 /* size */,
+                              BRW_PREDICATE_NONE);
+         read_result.type = dest.type;
+         bld.MOV(dest, read_result);
+         dest = offset(dest, bld, 1);
+      }
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
+fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
+{
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   bool has_indirect = false;
+
+   switch (instr->intrinsic) {
    case nir_intrinsic_atomic_counter_inc:
    case nir_intrinsic_atomic_counter_dec:
    case nir_intrinsic_atomic_counter_read: {
@@ -1324,6 +2075,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_memory_barrier_atomic_counter:
+   case nir_intrinsic_memory_barrier_buffer:
+   case nir_intrinsic_memory_barrier_image:
    case nir_intrinsic_memory_barrier: {
       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
       bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
@@ -1331,6 +2085,29 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_group_memory_barrier:
+   case nir_intrinsic_memory_barrier_shared:
+      /* We treat these workgroup-level barriers as no-ops.  This should be
+       * safe at present and as long as:
+       *
+       *  - Memory access instructions are not subsequently reordered by the
+       *    compiler back-end.
+       *
+       *  - All threads from a given compute shader workgroup fit within a
+       *    single subslice and therefore talk to the same HDC shared unit
+       *    what supposedly guarantees ordering and coherency between threads
+       *    from the same workgroup.  This may change in the future when we
+       *    start splitting workgroups across multiple subslices.
+       *
+       *  - The context is not in fault-and-stream mode, which could cause
+       *    memory transactions (including to SLM) prior to the barrier to be
+       *    replayed after the barrier if a pagefault occurs.  This shouldn't
+       *    be a problem up to and including SKL because fault-and-stream is
+       *    not usable due to hardware issues, but that's likely to change in
+       *    the future.
+       */
+      break;
+
    case nir_intrinsic_shader_clock: {
       /* We cannot do anything if there is an event, so ignore it for now */
       fs_reg shader_clock = get_timestamp(bld);
@@ -1390,44 +2167,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), fs_reg(1));
       break;
 
-   case nir_intrinsic_load_front_face:
-      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
-              *emit_frontfacing_interpolation());
-      break;
-
-   case nir_intrinsic_load_vertex_id:
-      unreachable("should be lowered by lower_vertex_id()");
-
-   case nir_intrinsic_load_primitive_id:
-      assert(stage == MESA_SHADER_GEOMETRY);
-      assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
-      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
-              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
-      break;
-
-   case nir_intrinsic_load_vertex_id_zero_base:
-   case nir_intrinsic_load_base_vertex:
-   case nir_intrinsic_load_instance_id:
-   case nir_intrinsic_load_invocation_id:
-   case nir_intrinsic_load_sample_mask_in:
-   case nir_intrinsic_load_sample_id: {
-      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
-      fs_reg val = nir_system_values[sv];
-      assert(val.file != BAD_FILE);
-      dest.type = val.type;
-      bld.MOV(dest, val);
-      break;
-   }
-
-   case nir_intrinsic_load_sample_pos: {
-      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
-      assert(sample_pos.file != BAD_FILE);
-      dest.type = sample_pos.type;
-      bld.MOV(dest, sample_pos);
-      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
-      break;
-   }
-
    case nir_intrinsic_load_uniform_indirect:
       has_indirect = true;
       /* fallthrough */
@@ -1454,8 +2193,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg surf_index;
 
       if (const_index) {
-         surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
-                             const_index->u[0]);
+         const unsigned index = stage_prog_data->binding_table.ubo_start +
+                                const_index->u[0];
+         surf_index = fs_reg(index);
+         brw_mark_surface_used(prog_data, index);
       } else {
          /* The block index is not a constant. Evaluate the index expression
           * per-channel and add the base UBO index; we have to select a value
@@ -1579,177 +2320,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   /* Handle ARB_gpu_shader5 interpolation intrinsics
-    *
-    * It's worth a quick word of explanation as to why we handle the full
-    * variable-based interpolation intrinsic rather than a lowered version
-    * with like we do for other inputs.  We have to do that because the way
-    * we set up inputs doesn't allow us to use the already setup inputs for
-    * interpolation.  At the beginning of the shader, we go through all of
-    * the input variables and do the initial interpolation and put it in
-    * the nir_inputs array based on its location as determined in
-    * nir_lower_io.  If the input isn't used, dead code cleans up and
-    * everything works fine.  However, when we get to the ARB_gpu_shader5
-    * interpolation intrinsics, we need to reinterpolate the input
-    * differently.  If we used an intrinsic that just had an index it would
-    * only give us the offset into the nir_inputs array.  However, this is
-    * useless because that value is post-interpolation and we need
-    * pre-interpolation.  In order to get the actual location of the bits
-    * we get from the vertex fetching hardware, we need the variable.
-    */
-   case nir_intrinsic_interp_var_at_centroid:
-   case nir_intrinsic_interp_var_at_sample:
-   case nir_intrinsic_interp_var_at_offset: {
-      assert(stage == MESA_SHADER_FRAGMENT);
-
-      ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true;
-
-      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
-      const glsl_interp_qualifier interpolation =
-         (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation;
-
-      switch (instr->intrinsic) {
-      case nir_intrinsic_interp_var_at_centroid:
-         emit_pixel_interpolater_send(bld,
-                                      FS_OPCODE_INTERPOLATE_AT_CENTROID,
-                                      dst_xy,
-                                      fs_reg(), /* src */
-                                      fs_reg(0u),
-                                      interpolation);
-         break;
-
-      case nir_intrinsic_interp_var_at_sample: {
-         nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
-
-         if (const_sample) {
-            unsigned msg_data = const_sample->i[0] << 4;
-
-            emit_pixel_interpolater_send(bld,
-                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                         dst_xy,
-                                         fs_reg(), /* src */
-                                         fs_reg(msg_data),
-                                         interpolation);
-         } else {
-            const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
-                                             BRW_REGISTER_TYPE_UD);
-
-            if (nir_src_is_dynamically_uniform(instr->src[0])) {
-               const fs_reg sample_id = bld.emit_uniformize(sample_src);
-               const fs_reg msg_data = vgrf(glsl_type::uint_type);
-               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
-               emit_pixel_interpolater_send(bld,
-                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                            dst_xy,
-                                            fs_reg(), /* src */
-                                            msg_data,
-                                            interpolation);
-            } else {
-               /* Make a loop that sends a message to the pixel interpolater
-                * for the sample number in each live channel. If there are
-                * multiple channels with the same sample number then these
-                * will be handled simultaneously with a single interation of
-                * the loop.
-                */
-               bld.emit(BRW_OPCODE_DO);
-
-               /* Get the next live sample number into sample_id_reg */
-               const fs_reg sample_id = bld.emit_uniformize(sample_src);
-
-               /* Set the flag register so that we can perform the send
-                * message on all channels that have the same sample number
-                */
-               bld.CMP(bld.null_reg_ud(),
-                       sample_src, sample_id,
-                       BRW_CONDITIONAL_EQ);
-               const fs_reg msg_data = vgrf(glsl_type::uint_type);
-               bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
-               fs_inst *inst =
-                  emit_pixel_interpolater_send(bld,
-                                               FS_OPCODE_INTERPOLATE_AT_SAMPLE,
-                                               dst_xy,
-                                               fs_reg(), /* src */
-                                               msg_data,
-                                               interpolation);
-               set_predicate(BRW_PREDICATE_NORMAL, inst);
-
-               /* Continue the loop if there are any live channels left */
-               set_predicate_inv(BRW_PREDICATE_NORMAL,
-                                 true, /* inverse */
-                                 bld.emit(BRW_OPCODE_WHILE));
-            }
-         }
-
-         break;
-      }
-
-      case nir_intrinsic_interp_var_at_offset: {
-         nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
-
-         if (const_offset) {
-            unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
-            unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
-
-            emit_pixel_interpolater_send(bld,
-                                         FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
-                                         dst_xy,
-                                         fs_reg(), /* src */
-                                         fs_reg(off_x | (off_y << 4)),
-                                         interpolation);
-         } else {
-            fs_reg src = vgrf(glsl_type::ivec2_type);
-            fs_reg offset_src = retype(get_nir_src(instr->src[0]),
-                                       BRW_REGISTER_TYPE_F);
-            for (int i = 0; i < 2; i++) {
-               fs_reg temp = vgrf(glsl_type::float_type);
-               bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f));
-               fs_reg itemp = vgrf(glsl_type::int_type);
-               bld.MOV(itemp, temp);  /* float to int */
-
-               /* Clamp the upper end of the range to +7/16.
-                * ARB_gpu_shader5 requires that we support a maximum offset
-                * of +0.5, which isn't representable in a S0.4 value -- if
-                * we didn't clamp it, we'd end up with -8/16, which is the
-                * opposite of what the shader author wanted.
-                *
-                * This is legal due to ARB_gpu_shader5's quantization
-                * rules:
-                *
-                * "Not all values of <offset> may be supported; x and y
-                * offsets may be rounded to fixed-point values with the
-                * number of fraction bits given by the
-                * implementation-dependent constant
-                * FRAGMENT_INTERPOLATION_OFFSET_BITS"
-                */
-               set_condmod(BRW_CONDITIONAL_L,
-                           bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
-            }
-
-            const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
-            emit_pixel_interpolater_send(bld,
-                                         opcode,
-                                         dst_xy,
-                                         src,
-                                         fs_reg(0u),
-                                         interpolation);
-         }
-         break;
-      }
-
-      default:
-         unreachable("Invalid intrinsic");
-      }
-
-      for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
-         src.type = dest.type;
-
-         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
-         dest = offset(dest, bld, 1);
-      }
-      break;
-   }
-
    case nir_intrinsic_store_ssbo_indirect:
       has_indirect = true;
       /* fallthrough */
@@ -1831,23 +2401,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_barrier:
-      emit_barrier();
-      if (stage == MESA_SHADER_COMPUTE)
-         ((struct brw_cs_prog_data *) prog_data)->uses_barrier = true;
-      break;
-
-   case nir_intrinsic_load_local_invocation_id:
-   case nir_intrinsic_load_work_group_id: {
-      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
-      fs_reg val = nir_system_values[sv];
-      assert(val.file != BAD_FILE);
-      dest.type = val.type;
-      for (unsigned i = 0; i < 3; i++)
-         bld.MOV(offset(dest, bld, i), offset(val, bld, i));
-      break;
-   }
-
    case nir_intrinsic_ssbo_atomic_add:
       nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
       break;
@@ -1888,44 +2441,30 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg source = fs_reg(0);
 
       int mlen = 1 * reg_width;
-      fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
+
+      /* A resinfo's sampler message is used to get the buffer size.
+       * The SIMD8's writeback message consists of four registers and
+       * SIMD16's writeback message consists of 8 destination registers
+       * (two per each component), although we are only interested on the
+       * first component, where resinfo returns the buffer size for
+       * SURFTYPE_BUFFER.
+       */
+      int regs_written = 4 * mlen;
+      fs_reg src_payload = fs_reg(VGRF, alloc.allocate(mlen),
                                   BRW_REGISTER_TYPE_UD);
       bld.LOAD_PAYLOAD(src_payload, &source, 1, 0);
-
-      fs_reg surf_index = fs_reg(prog_data->binding_table.ssbo_start + ssbo_index);
-      fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, dest,
-                               src_payload, surf_index);
+      fs_reg buffer_size = fs_reg(VGRF, alloc.allocate(regs_written),
+                                  BRW_REGISTER_TYPE_UD);
+      const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
+      fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, buffer_size,
+                               src_payload, fs_reg(index));
       inst->header_size = 0;
       inst->mlen = mlen;
+      inst->regs_written = regs_written;
       bld.emit(inst);
-      break;
-   }
-
-   case nir_intrinsic_load_num_work_groups: {
-      assert(devinfo->gen >= 7);
-      assert(stage == MESA_SHADER_COMPUTE);
-
-      struct brw_cs_prog_data *cs_prog_data =
-         (struct brw_cs_prog_data *) prog_data;
-      const unsigned surface =
-         cs_prog_data->binding_table.work_groups_start;
+      bld.MOV(retype(dest, buffer_size.type), buffer_size);
 
-      cs_prog_data->uses_num_work_groups = true;
-
-      fs_reg surf_index = fs_reg(surface);
-      brw_mark_surface_used(prog_data, surface);
-
-      /* Read the 3 GLuint components of gl_NumWorkGroups */
-      for (unsigned i = 0; i < 3; i++) {
-         fs_reg read_result =
-            emit_untyped_read(bld, surf_index,
-                              fs_reg(i << 2),
-                              1 /* dims */, 1 /* size */,
-                              BRW_PREDICATE_NONE);
-         read_result.type = dest.type;
-         bld.MOV(dest, read_result);
-         dest = offset(dest, bld, 1);
-      }
+      brw_mark_surface_used(prog_data, index);
       break;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 9251d9552a5..1b61f9fe01c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -35,8 +35,8 @@ using namespace brw;
 static void
 assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
 {
-   if (reg->file == GRF) {
-      reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset;
+   if (reg->file == VGRF) {
+      reg->nr = reg_hw_locations[reg->nr] + reg->reg_offset;
       reg->reg_offset = 0;
    }
 }
@@ -366,14 +366,13 @@ void fs_visitor::calculate_payload_ranges(int payload_node_count,
       else
          use_ip = ip;
 
-      /* Note that UNIFORM args have been turned into FIXED_HW_REG by
+      /* Note that UNIFORM args have been turned into FIXED_GRF by
        * assign_curbe_setup(), and interpolation uses fixed hardware regs from
        * the start (see interp_reg()).
        */
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == HW_REG &&
-             inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
-            int node_nr = inst->src[i].fixed_hw_reg.nr;
+         if (inst->src[i].file == FIXED_GRF) {
+            int node_nr = inst->src[i].nr;
             if (node_nr >= payload_node_count)
                continue;
 
@@ -489,10 +488,10 @@ get_used_mrfs(fs_visitor *v, bool *mrf_used)
 
    foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
       if (inst->dst.file == MRF) {
-         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
+         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
          mrf_used[reg] = true;
          if (reg_width == 2) {
-            if (inst->dst.reg & BRW_MRF_COMPR4) {
+            if (inst->dst.nr & BRW_MRF_COMPR4) {
                mrf_used[reg + 4] = true;
             } else {
                mrf_used[reg + 1] = true;
@@ -584,8 +583,8 @@ fs_visitor::assign_regs(bool allow_spilling)
        * that register and set it to the appropriate class.
        */
       if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 &&
-          this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF &&
-          this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
+          this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == VGRF &&
+          this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].nr == i) {
          c = compiler->fs_reg_sets[rsi].aligned_pairs_class;
       }
 
@@ -616,7 +615,7 @@ fs_visitor::assign_regs(bool allow_spilling)
           * highest register that works.
           */
          if (inst->eot) {
-            int size = alloc.sizes[inst->src[0].reg];
+            int size = alloc.sizes[inst->src[0].nr];
             int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
 
             /* If something happened to spill, we want to push the EOT send
@@ -625,7 +624,7 @@ fs_visitor::assign_regs(bool allow_spilling)
              */
             reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf;
 
-            ra_set_node_reg(g, inst->src[0].reg, reg);
+            ra_set_node_reg(g, inst->src[0].nr, reg);
             break;
          }
       }
@@ -644,12 +643,12 @@ fs_visitor::assign_regs(bool allow_spilling)
        * destination interfere.
        */
       foreach_block_and_inst(block, fs_inst, inst, cfg) {
-         if (inst->dst.file != GRF)
+         if (inst->dst.file != VGRF)
             continue;
 
          for (int i = 0; i < inst->sources; ++i) {
-            if (inst->src[i].file == GRF) {
-               ra_add_node_interference(g, inst->dst.reg, inst->src[i].reg);
+            if (inst->src[i].file == VGRF) {
+               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
             }
          }
       }
@@ -786,8 +785,8 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
     */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       for (unsigned int i = 0; i < inst->sources; i++) {
-	 if (inst->src[i].file == GRF) {
-	    spill_costs[inst->src[i].reg] += loop_scale;
+	 if (inst->src[i].file == VGRF) {
+            spill_costs[inst->src[i].nr] += loop_scale;
 
             /* Register spilling logic assumes full-width registers; smeared
              * registers have a width of 1 so if we try to spill them we'll
@@ -797,16 +796,16 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
              * register pressure anyhow.
              */
             if (!inst->src[i].is_contiguous()) {
-               no_spill[inst->src[i].reg] = true;
+               no_spill[inst->src[i].nr] = true;
             }
 	 }
       }
 
-      if (inst->dst.file == GRF) {
-	 spill_costs[inst->dst.reg] += inst->regs_written * loop_scale;
+      if (inst->dst.file == VGRF) {
+         spill_costs[inst->dst.nr] += inst->regs_written * loop_scale;
 
          if (!inst->dst.is_contiguous()) {
-            no_spill[inst->dst.reg] = true;
+            no_spill[inst->dst.nr] = true;
          }
       }
 
@@ -821,14 +820,14 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
 	 break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
-	 if (inst->src[0].file == GRF)
-	    no_spill[inst->src[0].reg] = true;
+	 if (inst->src[0].file == VGRF)
+            no_spill[inst->src[0].nr] = true;
 	 break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
       case SHADER_OPCODE_GEN7_SCRATCH_READ:
-	 if (inst->dst.file == GRF)
-	    no_spill[inst->dst.reg] = true;
+	 if (inst->dst.file == VGRF)
+            no_spill[inst->dst.nr] = true;
 	 break;
 
       default:
@@ -883,14 +882,14 @@ fs_visitor::spill_reg(int spill_reg)
     */
    foreach_block_and_inst (block, fs_inst, inst, cfg) {
       for (unsigned int i = 0; i < inst->sources; i++) {
-	 if (inst->src[i].file == GRF &&
-	     inst->src[i].reg == spill_reg) {
+	 if (inst->src[i].file == VGRF &&
+             inst->src[i].nr == spill_reg) {
             int regs_read = inst->regs_read(i);
             int subset_spill_offset = (spill_offset +
                                        REG_SIZE * inst->src[i].reg_offset);
-            fs_reg unspill_dst(GRF, alloc.allocate(regs_read));
+            fs_reg unspill_dst(VGRF, alloc.allocate(regs_read));
 
-            inst->src[i].reg = unspill_dst.reg;
+            inst->src[i].nr = unspill_dst.nr;
             inst->src[i].reg_offset = 0;
 
             emit_unspill(block, inst, unspill_dst, subset_spill_offset,
@@ -898,13 +897,13 @@ fs_visitor::spill_reg(int spill_reg)
 	 }
       }
 
-      if (inst->dst.file == GRF &&
-	  inst->dst.reg == spill_reg) {
+      if (inst->dst.file == VGRF &&
+          inst->dst.nr == spill_reg) {
          int subset_spill_offset = (spill_offset +
                                     REG_SIZE * inst->dst.reg_offset);
-         fs_reg spill_src(GRF, alloc.allocate(inst->regs_written));
+         fs_reg spill_src(VGRF, alloc.allocate(inst->regs_written));
 
-         inst->dst.reg = spill_src.reg;
+         inst->dst.nr = spill_src.nr;
          inst->dst.reg_offset = 0;
 
          /* If we're immediately spilling the register, we should not use
diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 34f8715eeb9..4578ad597c2 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -70,17 +70,17 @@ is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
         inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) ||
        inst->is_partial_write() ||
        inst->saturate ||
-       inst->src[0].file != GRF ||
+       inst->src[0].file != VGRF ||
        inst->src[0].negate ||
        inst->src[0].abs ||
        !inst->src[0].is_contiguous() ||
-       inst->dst.file != GRF ||
+       inst->dst.file != VGRF ||
        inst->dst.type != inst->src[0].type) {
       return false;
    }
 
-   if (v->alloc.sizes[inst->src[0].reg] >
-       v->alloc.sizes[inst->dst.reg])
+   if (v->alloc.sizes[inst->src[0].nr] >
+       v->alloc.sizes[inst->dst.nr])
       return false;
 
    if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
@@ -170,19 +170,19 @@ fs_visitor::register_coalesce()
          continue;
       }
 
-      if (src_reg != inst->src[0].reg) {
-         src_reg = inst->src[0].reg;
+      if (src_reg != inst->src[0].nr) {
+         src_reg = inst->src[0].nr;
 
-         src_size = alloc.sizes[inst->src[0].reg];
+         src_size = alloc.sizes[inst->src[0].nr];
          assert(src_size <= MAX_VGRF_SIZE);
 
          channels_remaining = src_size;
          memset(mov, 0, sizeof(mov));
 
-         dst_reg = inst->dst.reg;
+         dst_reg = inst->dst.nr;
       }
 
-      if (dst_reg != inst->dst.reg)
+      if (dst_reg != inst->dst.nr)
          continue;
 
       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
@@ -250,17 +250,17 @@ fs_visitor::register_coalesce()
       }
 
       foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
-         if (scan_inst->dst.file == GRF &&
-             scan_inst->dst.reg == src_reg) {
-            scan_inst->dst.reg = dst_reg;
+         if (scan_inst->dst.file == VGRF &&
+             scan_inst->dst.nr == src_reg) {
+            scan_inst->dst.nr = dst_reg;
             scan_inst->dst.reg_offset =
                dst_reg_offset[scan_inst->dst.reg_offset];
          }
 
          for (int j = 0; j < scan_inst->sources; j++) {
-            if (scan_inst->src[j].file == GRF &&
-                scan_inst->src[j].reg == src_reg) {
-               scan_inst->src[j].reg = dst_reg;
+            if (scan_inst->src[j].file == VGRF &&
+                scan_inst->src[j].nr == src_reg) {
+               scan_inst->src[j].nr = dst_reg;
                scan_inst->src[j].reg_offset =
                   dst_reg_offset[scan_inst->src[j].reg_offset];
             }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
index 862e3245d43..52570943996 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp
@@ -53,9 +53,9 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
 
       if (inst->opcode != BRW_OPCODE_MOV ||
           !inst->saturate ||
-          inst->dst.file != GRF ||
+          inst->dst.file != VGRF ||
           inst->dst.type != inst->src[0].type ||
-          inst->src[0].file != GRF ||
+          inst->src[0].file != VGRF ||
           inst->src[0].abs ||
           inst->src[0].negate)
          continue;
@@ -90,8 +90,8 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
             break;
          }
          for (int i = 0; i < scan_inst->sources; i++) {
-            if (scan_inst->src[i].file == GRF &&
-                scan_inst->src[i].reg == inst->src[0].reg &&
+            if (scan_inst->src[i].file == VGRF &&
+                scan_inst->src[i].nr == inst->src[0].nr &&
                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
                if (scan_inst->opcode != BRW_OPCODE_MOV ||
                    !scan_inst->saturate ||
diff --git a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
index 814c551f1be..90edd023b30 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp
@@ -42,15 +42,15 @@ void
 fs_visitor::validate()
 {
    foreach_block_and_inst (block, fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
          fsv_assert(inst->dst.reg_offset + inst->regs_written <=
-                    alloc.sizes[inst->dst.reg]);
+                    alloc.sizes[inst->dst.nr]);
       }
 
       for (unsigned i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
             fsv_assert(inst->src[i].reg_offset + inst->regs_read(i) <=
-                       (int)alloc.sizes[inst->src[i].reg]);
+                       (int)alloc.sizes[inst->src[i].nr]);
          }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 5c57944ca39..a7bd9cea7af 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -143,7 +143,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
     * tracking to get the scaling factor.
     */
    if (devinfo->gen < 6 && is_rect) {
-      fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
+      fs_reg dst = fs_reg(VGRF, alloc.allocate(coord_components));
       fs_reg src = coordinate;
       coordinate = dst;
 
@@ -208,8 +208,8 @@ fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
    fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
                             ARRAY_SIZE(srcs));
 
-   /* We only care about one reg of response, but the sampler always writes
-    * 4/8.
+   /* We only care about one or two regs of response, but the sampler always
+    * writes 4/8.
     */
    inst->regs_written = 4 * dispatch_width / 8;
 
@@ -295,7 +295,10 @@ fs_visitor::emit_texture(ir_texture_opcode op,
       opcode = SHADER_OPCODE_TXF_LOGICAL;
       break;
    case ir_txf_ms:
-      opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
+      if ((key_tex->msaa_16 & (1 << sampler)))
+         opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
+      else
+         opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
       break;
    case ir_txs:
    case ir_query_levels:
@@ -319,7 +322,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
       inst->shadow_compare = true;
 
    if (offset_value.file == IMM)
-      inst->offset = offset_value.fixed_hw_reg.dw1.ud;
+      inst->offset = offset_value.ud;
 
    if (op == ir_tg4) {
       inst->offset |=
@@ -578,7 +581,7 @@ fs_visitor::emit_interpolation_setup_gen6()
        * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
        * compute our pixel centers.
        */
-      fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
+      fs_reg int_pixel_xy(VGRF, alloc.allocate(dispatch_width / 8),
                           BRW_REGISTER_TYPE_UW);
 
       const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0);
@@ -873,14 +876,14 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
 
       abld.MUL(output, outputs[clip_vertex], u);
       for (int j = 1; j < 4; j++) {
-         u.reg = userplane[i].reg + j;
+         u.nr = userplane[i].nr + j;
          abld.MAD(output, output, offset(outputs[clip_vertex], bld, j), u);
       }
    }
 }
 
 void
-fs_visitor::emit_urb_writes()
+fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
 {
    int slot, urb_offset, length;
    int starting_urb_offset = 0;
@@ -905,7 +908,7 @@ fs_visitor::emit_urb_writes()
     *    "The write data payload can be between 1 and 8 message phases long."
     */
    if (vue_map->slots_valid == 0) {
-      fs_reg payload = fs_reg(GRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
+      fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
       bld.exec_all().MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
                                                 BRW_REGISTER_TYPE_UD)));
 
@@ -916,9 +919,13 @@ fs_visitor::emit_urb_writes()
       return;
    }
 
+   opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
+   int header_size = 1;
+   fs_reg per_slot_offsets;
+
    if (stage == MESA_SHADER_GEOMETRY) {
       const struct brw_gs_prog_data *gs_prog_data =
-         (const struct brw_gs_prog_data *) prog_data;
+         (const struct brw_gs_prog_data *) this->prog_data;
 
       /* We need to increment the Global Offset to skip over the control data
        * header and the extra "Vertex Count" field (1 HWord) at the beginning
@@ -927,6 +934,27 @@ fs_visitor::emit_urb_writes()
       starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
       if (gs_prog_data->static_vertex_count == -1)
          starting_urb_offset += 2;
+
+      /* We also need to use per-slot offsets.  The per-slot offset is the
+       * Vertex Count.  SIMD8 mode processes 8 different primitives at a
+       * time; each may output a different number of vertices.
+       */
+      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT;
+      header_size++;
+
+      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
+      const int output_vertex_size_owords =
+         gs_prog_data->output_vertex_size_hwords * 2;
+
+      fs_reg offset;
+      if (gs_vertex_count.file == IMM) {
+         per_slot_offsets = fs_reg(output_vertex_size_owords *
+                                   gs_vertex_count.ud);
+      } else {
+         per_slot_offsets = vgrf(glsl_type::int_type);
+         bld.MUL(per_slot_offsets, gs_vertex_count,
+                 fs_reg(output_vertex_size_owords));
+      }
    }
 
    length = 0;
@@ -947,7 +975,7 @@ fs_visitor::emit_urb_writes()
             break;
          }
 
-         fs_reg zero(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+         fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
          bld.MOV(zero, fs_reg(0u));
 
          sources[length++] = zero;
@@ -999,7 +1027,7 @@ fs_visitor::emit_urb_writes()
              * temp register and use that for the payload.
              */
             for (int i = 0; i < 4; i++) {
-               fs_reg reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
+               fs_reg reg = fs_reg(VGRF, alloc.allocate(1), outputs[varying].type);
                fs_reg src = offset(this->outputs[varying], bld, i);
                set_saturate(true, bld.MOV(reg, src));
                sources[length++] = reg;
@@ -1023,19 +1051,25 @@ fs_visitor::emit_urb_writes()
       if (length == 8 || last)
          flush = true;
       if (flush) {
-         fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
-         fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
+         fs_reg *payload_sources =
+            ralloc_array(mem_ctx, fs_reg, length + header_size);
+         fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
                                  BRW_REGISTER_TYPE_F);
          payload_sources[0] =
             fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 
-         memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
-         abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1);
+         if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT)
+            payload_sources[1] = per_slot_offsets;
 
-         fs_inst *inst =
-            abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+         memcpy(&payload_sources[header_size], sources,
+                length * sizeof sources[0]);
+
+         abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
+                           header_size);
+
+         fs_inst *inst = abld.emit(opcode, reg_undef, payload);
          inst->eot = last && stage == MESA_SHADER_VERTEX;
-         inst->mlen = length + 1;
+         inst->mlen = length + header_size;
          inst->offset = urb_offset;
          urb_offset = starting_urb_offset + slot + 1;
          length = 0;
@@ -1057,7 +1091,7 @@ fs_visitor::emit_cs_terminate()
     * make sure it uses the appropriate register range.
     */
    struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
-   fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    bld.group(8, 0).exec_all().MOV(payload, g0);
 
    /* Send a message to the thread spawner to terminate the thread. */
@@ -1074,7 +1108,7 @@ fs_visitor::emit_barrier()
    /* We are getting the barrier ID from the compute shader header */
    assert(stage == MESA_SHADER_COMPUTE);
 
-   fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 
    const fs_builder pbld = bld.exec_all().group(8, 0);
 
@@ -1112,13 +1146,14 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                        void *mem_ctx,
                        struct brw_gs_compile *c,
                        struct brw_gs_prog_data *prog_data,
-                       const nir_shader *shader)
+                       const nir_shader *shader,
+                       int shader_time_index)
    : backend_shader(compiler, log_data, mem_ctx, shader,
                     &prog_data->base.base),
      key(&c->key), gs_compile(c),
      prog_data(&prog_data->base.base), prog(NULL),
      dispatch_width(8),
-     shader_time_index(ST_GS),
+     shader_time_index(shader_time_index),
      bld(fs_builder(this, dispatch_width).at_end())
 {
    init();
@@ -1155,7 +1190,6 @@ fs_visitor::init()
    this->nir_ssa_values = NULL;
 
    memset(&this->payload, 0, sizeof(this->payload));
-   memset(this->outputs, 0, sizeof(this->outputs));
    memset(this->output_components, 0, sizeof(this->output_components));
    this->source_depth_to_render_target = false;
    this->runtime_check_aads_emit = false;
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index 4417555f18e..7e977e9e727 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -41,9 +41,9 @@ public:
    explicit fs_reg(uint32_t u);
    explicit fs_reg(uint8_t vf[4]);
    explicit fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3);
-   fs_reg(struct brw_reg fixed_hw_reg);
-   fs_reg(enum register_file file, int reg);
-   fs_reg(enum register_file file, int reg, enum brw_reg_type type);
+   fs_reg(struct brw_reg reg);
+   fs_reg(enum brw_reg_file file, int nr);
+   fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type);
 
    bool equals(const fs_reg &r) const;
    bool is_contiguous() const;
@@ -72,7 +72,7 @@ public:
 static inline fs_reg
 negate(fs_reg reg)
 {
-   assert(reg.file != HW_REG && reg.file != IMM);
+   assert(reg.file != IMM);
    reg.negate = !reg.negate;
    return reg;
 }
@@ -80,7 +80,7 @@ negate(fs_reg reg)
 static inline fs_reg
 retype(fs_reg reg, enum brw_reg_type type)
 {
-   reg.fixed_hw_reg.type = reg.type = type;
+   reg.type = type;
    return reg;
 }
 
@@ -90,15 +90,16 @@ byte_offset(fs_reg reg, unsigned delta)
    switch (reg.file) {
    case BAD_FILE:
       break;
-   case GRF:
+   case VGRF:
    case ATTR:
       reg.reg_offset += delta / 32;
       break;
    case MRF:
-      reg.reg += delta / 32;
+      reg.nr += delta / 32;
       break;
+   case ARF:
+   case FIXED_GRF:
    case IMM:
-   case HW_REG:
    case UNIFORM:
       assert(delta == 0);
    }
@@ -117,11 +118,12 @@ horiz_offset(fs_reg reg, unsigned delta)
        * horizontal offset should be a harmless no-op.
        */
       break;
-   case GRF:
+   case VGRF:
    case MRF:
    case ATTR:
       return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
-   case HW_REG:
+   case ARF:
+   case FIXED_GRF:
       assert(delta == 0);
    }
    return reg;
@@ -159,12 +161,13 @@ half(fs_reg reg, unsigned idx)
    case IMM:
       return reg;
 
-   case GRF:
+   case VGRF:
    case MRF:
       return horiz_offset(reg, 8 * idx);
 
+   case ARF:
+   case FIXED_GRF:
    case ATTR:
-   case HW_REG:
       unreachable("Cannot take half of this register type");
    }
    return reg;
diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
index 29642c6d2a4..110e64b979e 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
@@ -39,7 +39,7 @@ public:
 
    void init();
 
-   src_reg(register_file file, int reg, const glsl_type *type);
+   src_reg(enum brw_reg_file file, int nr, const glsl_type *type);
    src_reg();
    src_reg(float f);
    src_reg(uint32_t u);
@@ -55,22 +55,21 @@ public:
 
    explicit src_reg(const dst_reg &reg);
 
-   unsigned swizzle; /**< BRW_SWIZZLE_XYZW macros from brw_reg.h. */
-
    src_reg *reladdr;
 };
 
 static inline src_reg
 retype(src_reg reg, enum brw_reg_type type)
 {
-   reg.fixed_hw_reg.type = reg.type = type;
+   reg.type = type;
    return reg;
 }
 
 static inline src_reg
 offset(src_reg reg, unsigned delta)
 {
-   assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM));
+   assert(delta == 0 ||
+          (reg.file != ARF && reg.file != FIXED_GRF && reg.file != IMM));
    reg.reg_offset += delta;
    return reg;
 }
@@ -82,7 +81,6 @@ offset(src_reg reg, unsigned delta)
 static inline src_reg
 swizzle(src_reg reg, unsigned swizzle)
 {
-   assert(reg.file != HW_REG);
    reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle);
    return reg;
 }
@@ -90,7 +88,7 @@ swizzle(src_reg reg, unsigned swizzle)
 static inline src_reg
 negate(src_reg reg)
 {
-   assert(reg.file != HW_REG && reg.file != IMM);
+   assert(reg.file != IMM);
    reg.negate = !reg.negate;
    return reg;
 }
@@ -110,10 +108,10 @@ public:
    void init();
 
    dst_reg();
-   dst_reg(register_file file, int reg);
-   dst_reg(register_file file, int reg, const glsl_type *type,
+   dst_reg(enum brw_reg_file file, int nr);
+   dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
            unsigned writemask);
-   dst_reg(register_file file, int reg, brw_reg_type type,
+   dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
            unsigned writemask);
    dst_reg(struct brw_reg reg);
    dst_reg(class vec4_visitor *v, const struct glsl_type *type);
@@ -122,22 +120,21 @@ public:
 
    bool equals(const dst_reg &r) const;
 
-   unsigned writemask; /**< Bitfield of WRITEMASK_[XYZW] */
-
    src_reg *reladdr;
 };
 
 static inline dst_reg
 retype(dst_reg reg, enum brw_reg_type type)
 {
-   reg.fixed_hw_reg.type = reg.type = type;
+   reg.type = type;
    return reg;
 }
 
 static inline dst_reg
 offset(dst_reg reg, unsigned delta)
 {
-   assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM));
+   assert(delta == 0 ||
+          (reg.file != ARF && reg.file != FIXED_GRF && reg.file != IMM));
    reg.reg_offset += delta;
    return reg;
 }
@@ -145,7 +142,7 @@ offset(dst_reg reg, unsigned delta)
 static inline dst_reg
 writemask(dst_reg reg, unsigned mask)
 {
-   assert(reg.file != HW_REG && reg.file != IMM);
+   assert(reg.file != IMM);
    assert((reg.writemask & mask) != 0);
    reg.writemask &= mask;
    return reg;
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index fc9bee43d80..29911732761 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -157,8 +157,6 @@ process_glsl_ir(gl_shader_stage stage,
                  _mesa_shader_stage_to_abbrev(shader->Stage));
    }
 
-   lower_ubo_reference(shader, shader->ir);
-
    bool progress;
    do {
       progress = false;
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index fbde3f04204..12e7c32e424 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -314,8 +314,7 @@ get_fast_clear_rect(struct gl_framebuffer *fb,
 }
 
 static void
-get_buffer_rect(struct brw_context *brw, struct gl_framebuffer *fb,
-                struct intel_renderbuffer *irb, struct rect *rect)
+get_buffer_rect(const struct gl_framebuffer *fb, struct rect *rect)
 {
    rect->x0 = fb->_Xmin;
    rect->x1 = fb->_Xmax;
@@ -526,16 +525,18 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
 
       case REP_CLEAR:
          rep_clear_buffers |= 1 << index;
-         get_buffer_rect(brw, fb, irb, &clear_rect);
+         get_buffer_rect(fb, &clear_rect);
          break;
 
       case PLAIN_CLEAR:
          plain_clear_buffers |= 1 << index;
-         get_buffer_rect(brw, fb, irb, &clear_rect);
+         get_buffer_rect(fb, &clear_rect);
          continue;
       }
    }
 
+   assert((fast_clear_buffers & rep_clear_buffers) == 0);
+
    if (!(fast_clear_buffers | rep_clear_buffers)) {
       if (plain_clear_buffers)
          /* If we only have plain clears, skip the meta save/restore. */
diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
index cbbb919c6ee..4e9aa949506 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
@@ -163,6 +163,13 @@ static const char *fs_tmpl =
    "      txl_coords.x = ((X & int(0xfff8)) >> 2) | (X & int(0x1));\n"
    "      txl_coords.y = ((Y & int(0xfffc)) >> 1) | (Y & int(0x1));\n"
    "      sample_index = (X & 0x4) | (Y & 0x2) | ((X & 0x2) >> 1);\n"
+   "      break;\n"
+   "   case 16:\n"
+   "      txl_coords.x = ((X & int(0xfff8)) >> 2) | (X & int(0x1));\n"
+   "      txl_coords.y = ((Y & int(0xfff8)) >> 2) | (Y & int(0x1));\n"
+   "      sample_index = (((Y & 0x4) << 1) | (X & 0x4) | (Y & 0x2) |\n"
+   "                      ((X & 0x2) >> 1));\n"
+   "      break;\n"
    "   }\n"
    "}\n"
    "\n"
@@ -313,11 +320,16 @@ adjust_msaa(struct blit_dims *dims, int num_samples)
       dims->dst_x0 *= 2;
       dims->dst_x1 *= 2;
    } else if (num_samples) {
-      const int x_num_samples = num_samples / 2;
-      dims->dst_x0 = ROUND_DOWN_TO(dims->dst_x0 * x_num_samples, num_samples);
-      dims->dst_y0 = ROUND_DOWN_TO(dims->dst_y0 * 2, 4);
-      dims->dst_x1 = ALIGN(dims->dst_x1 * x_num_samples, num_samples);
-      dims->dst_y1 = ALIGN(dims->dst_y1 * 2, 4);
+      const int y_num_samples = num_samples >= 16 ? 4 : 2;
+      const int x_num_samples = num_samples / y_num_samples;
+      dims->dst_x0 = ROUND_DOWN_TO(dims->dst_x0 * x_num_samples,
+                                   x_num_samples * 2);
+      dims->dst_y0 = ROUND_DOWN_TO(dims->dst_y0 * y_num_samples,
+                                   y_num_samples * 2);
+      dims->dst_x1 = ALIGN(dims->dst_x1 * x_num_samples,
+                           x_num_samples * 2);
+      dims->dst_y1 = ALIGN(dims->dst_y1 * y_num_samples,
+                           y_num_samples * 2);
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_multisample_state.h b/src/mesa/drivers/dri/i965/brw_multisample_state.h
index 26633e72983..42a7fd35121 100644
--- a/src/mesa/drivers/dri/i965/brw_multisample_state.h
+++ b/src/mesa/drivers/dri/i965/brw_multisample_state.h
@@ -81,3 +81,29 @@ brw_multisample_positions_4x = 0xae2ae662;
  */
 static const uint32_t
 brw_multisample_positions_8x[] = { 0xdbb39d79, 0x3ff55117 };
+
+/**
+ * Sample positions:
+ *
+ *    0 1 2 3 4 5 6 7 8 9 a b c d e f
+ * 0   15
+ * 1                  9
+ * 2         10
+ * 3                        7
+ * 4                               13
+ * 5                1
+ * 6        4
+ * 7                          3
+ * 8 12
+ * 9                    0
+ * a            2
+ * b                            6
+ * c     11
+ * d                      5
+ * e              8
+ * f                             14
+ */
+static const uint32_t
+brw_multisample_positions_16x[] = {
+   0xc75a7599, 0xb3dbad36, 0x2c42816e, 0x10eff408
+};
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 8c1a34ee17a..58754adc887 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -56,7 +56,8 @@ remap_vs_attrs(nir_block *block, void *closure)
 }
 
 static void
-brw_nir_lower_inputs(nir_shader *nir, bool is_scalar)
+brw_nir_lower_inputs(const struct brw_device_info *devinfo,
+                     nir_shader *nir, bool is_scalar)
 {
    switch (nir->stage) {
    case MESA_SHADER_VERTEX:
@@ -90,11 +91,43 @@ brw_nir_lower_inputs(nir_shader *nir, bool is_scalar)
          }
       }
       break;
-   case MESA_SHADER_GEOMETRY:
-      foreach_list_typed(nir_variable, var, node, &nir->inputs) {
-         var->data.driver_location = var->data.location;
+   case MESA_SHADER_GEOMETRY: {
+      if (!is_scalar) {
+         foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+            var->data.driver_location = var->data.location;
+         }
+      } else {
+         /* The GLSL linker will have already matched up GS inputs and
+          * the outputs of prior stages.  The driver does extend VS outputs
+          * in some cases, but only for legacy OpenGL or Gen4-5 hardware,
+          * neither of which offer geometry shader support.  So we can
+          * safely ignore that.
+          *
+          * For SSO pipelines, we use a fixed VUE map layout based on variable
+          * locations, so we can rely on rendezvous-by-location to make this
+          * work.
+          *
+          * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not
+          * written by previous stages and shows up via payload magic.
+          */
+         struct brw_vue_map input_vue_map;
+         GLbitfield64 inputs_read =
+            nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID;
+         brw_compute_vue_map(devinfo, &input_vue_map, inputs_read,
+                             nir->info.separate_shader);
+
+         /* Start with the slot for the variable's base. */
+         foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+            assert(input_vue_map.varying_to_slot[var->data.location] != -1);
+            var->data.driver_location =
+               input_vue_map.varying_to_slot[var->data.location];
+         }
+
+         /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
+         nir_lower_io(nir, nir_var_shader_in, type_size_vec4);
       }
       break;
+   }
    case MESA_SHADER_FRAGMENT:
       assert(is_scalar);
       nir_assign_var_locations(&nir->inputs, &nir->num_inputs,
@@ -117,7 +150,8 @@ brw_nir_lower_outputs(nir_shader *nir, bool is_scalar)
    case MESA_SHADER_GEOMETRY:
       if (is_scalar) {
          nir_assign_var_locations(&nir->outputs, &nir->num_outputs,
-                                  type_size_scalar);
+                                  type_size_vec4_times_4);
+         nir_lower_io(nir, nir_var_shader_out, type_size_vec4_times_4);
       } else {
          nir_foreach_variable(var, &nir->outputs)
             var->data.driver_location = var->data.location;
@@ -187,6 +221,7 @@ brw_create_nir(struct brw_context *brw,
                bool is_scalar)
 {
    struct gl_context *ctx = &brw->ctx;
+   const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
    const nir_shader_compiler_options *options =
       ctx->Const.ShaderCompilerOptions[stage].NirOptions;
    nir_shader *nir;
@@ -267,7 +302,7 @@ brw_postprocess_nir(nir_shader *nir,
    bool debug_enabled =
       (INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->stage));
 
-   brw_nir_lower_inputs(nir, is_scalar);
+   brw_nir_lower_inputs(devinfo, nir, is_scalar);
    brw_nir_lower_outputs(nir, is_scalar);
    nir_assign_var_locations(&nir->uniforms,
                             &nir->num_uniforms,
@@ -285,7 +320,7 @@ brw_postprocess_nir(nir_shader *nir,
 
    if (devinfo->gen >= 6) {
       /* Try and fuse multiply-adds */
-      nir_opt_peephole_ffma(nir);
+      brw_nir_opt_peephole_ffma(nir);
       nir_validate_shader(nir);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index a6d6768795a..d259777e1c9 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -103,6 +103,8 @@ void brw_nir_setup_glsl_uniforms(nir_shader *shader,
 void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog,
                                 struct brw_stage_prog_data *stage_prog_data);
 
+bool brw_nir_opt_peephole_ffma(nir_shader *shader);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/glsl/nir/nir_opt_peephole_ffma.c b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
index 4f0f0dae04e..5603129bde7 100644
--- a/src/glsl/nir/nir_opt_peephole_ffma.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
@@ -25,7 +25,7 @@
  *
  */
 
-#include "nir.h"
+#include "brw_nir.h"
 
 /*
  * Implements a small peephole optimization that looks for a multiply that
@@ -133,8 +133,30 @@ get_mul_for_src(nir_alu_src *src, int num_components,
    return alu;
 }
 
+/**
+ * Given a list of (at least two) nir_alu_src's, tells if any of them is a
+ * constant value and is used only once.
+ */
 static bool
-nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
+any_alu_src_is_a_constant(nir_alu_src srcs[])
+{
+   for (unsigned i = 0; i < 2; i++) {
+      if (srcs[i].src.ssa->parent_instr->type == nir_instr_type_load_const) {
+         nir_load_const_instr *load_const =
+            nir_instr_as_load_const (srcs[i].src.ssa->parent_instr);
+
+         if (list_is_singular(&load_const->def.uses) &&
+             list_empty(&load_const->def.if_uses)) {
+            return true;
+         }
+      }
+   }
+
+   return false;
+}
+
+static bool
+brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
 {
    struct peephole_ffma_state *state = void_state;
 
@@ -183,6 +205,15 @@ nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       mul_src[0] = mul->src[0].src.ssa;
       mul_src[1] = mul->src[1].src.ssa;
 
+      /* If any of the operands of the fmul and any of the fadd is a constant,
+       * we bypass because it will be more efficient as the constants will be
+       * propagated as operands, potentially saving two load_const instructions.
+       */
+      if (any_alu_src_is_a_constant(mul->src) &&
+          any_alu_src_is_a_constant(add->src)) {
+         continue;
+      }
+
       if (abs) {
          for (unsigned i = 0; i < 2; i++) {
             nir_alu_instr *abs = nir_alu_instr_create(state->mem_ctx,
@@ -237,7 +268,7 @@ nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
 }
 
 static bool
-nir_opt_peephole_ffma_impl(nir_function_impl *impl)
+brw_nir_opt_peephole_ffma_impl(nir_function_impl *impl)
 {
    struct peephole_ffma_state state;
 
@@ -245,7 +276,7 @@ nir_opt_peephole_ffma_impl(nir_function_impl *impl)
    state.impl = impl;
    state.progress = false;
 
-   nir_foreach_block(impl, nir_opt_peephole_ffma_block, &state);
+   nir_foreach_block(impl, brw_nir_opt_peephole_ffma_block, &state);
 
    if (state.progress)
       nir_metadata_preserve(impl, nir_metadata_block_index |
@@ -255,13 +286,13 @@ nir_opt_peephole_ffma_impl(nir_function_impl *impl)
 }
 
 bool
-nir_opt_peephole_ffma(nir_shader *shader)
+brw_nir_opt_peephole_ffma(nir_shader *shader)
 {
    bool progress = false;
 
    nir_foreach_overload(shader, overload) {
       if (overload->impl)
-         progress |= nir_opt_peephole_ffma_impl(overload->impl);
+         progress |= brw_nir_opt_peephole_ffma_impl(overload->impl);
    }
 
    return progress;
diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
index d3326e9fb86..87b383919df 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
+++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
@@ -98,6 +98,8 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var,
       if (storage->type->is_image()) {
          brw_setup_image_uniform_values(stage, stage_prog_data,
                                         uniform_index, storage);
+         uniform_index +=
+            BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1);
       } else {
          gl_constant_value *components = storage->storage;
          unsigned vector_count = (MAX2(storage->array_elements, 1) *
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index 083c46a3726..3da83b43b5d 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -219,7 +219,7 @@ enum PACKED brw_reg_type {
 };
 
 unsigned brw_reg_type_to_hw_type(const struct brw_device_info *devinfo,
-                                 enum brw_reg_type type, unsigned file);
+                                 enum brw_reg_type type, enum brw_reg_file file);
 const char *brw_reg_type_letters(unsigned brw_reg_type);
 
 #define REG_SIZE (8*4)
@@ -232,29 +232,29 @@ const char *brw_reg_type_letters(unsigned brw_reg_type);
  */
 struct brw_reg {
    enum brw_reg_type type:4;
-   unsigned file:2;
-   unsigned nr:8;
-   unsigned subnr:5;              /* :1 in align16 */
+   enum brw_reg_file file:3;      /* :2 hardware format */
    unsigned negate:1;             /* source only */
    unsigned abs:1;                /* source only */
-   unsigned vstride:4;            /* source only */
-   unsigned width:3;              /* src only, align1 only */
-   unsigned hstride:2;            /* align1 only */
    unsigned address_mode:1;       /* relative addressing, hopefully! */
    unsigned pad0:1;
+   unsigned subnr:5;              /* :1 in align16 */
+   unsigned nr:16;
 
    union {
       struct {
          unsigned swizzle:8;      /* src only, align16 only */
          unsigned writemask:4;    /* dest only, align16 only */
          int  indirect_offset:10; /* relative addressing offset */
-         unsigned pad1:10;        /* two dwords total */
-      } bits;
+         unsigned vstride:4;      /* source only */
+         unsigned width:3;        /* src only, align1 only */
+         unsigned hstride:2;      /* align1 only */
+         unsigned pad1:1;
+      };
 
       float f;
       int   d;
       unsigned ud;
-   } dw1;
+   };
 };
 
 
@@ -329,7 +329,7 @@ type_is_signed(unsigned type)
  * \param writemask WRITEMASK_X/Y/Z/W bitfield
  */
 static inline struct brw_reg
-brw_reg(unsigned file,
+brw_reg(enum brw_reg_file file,
         unsigned nr,
         unsigned subnr,
         unsigned negate,
@@ -353,15 +353,12 @@ brw_reg(unsigned file,
 
    reg.type = type;
    reg.file = file;
-   reg.nr = nr;
-   reg.subnr = subnr * type_sz(type);
    reg.negate = negate;
    reg.abs = abs;
-   reg.vstride = vstride;
-   reg.width = width;
-   reg.hstride = hstride;
    reg.address_mode = BRW_ADDRESS_DIRECT;
    reg.pad0 = 0;
+   reg.subnr = subnr * type_sz(type);
+   reg.nr = nr;
 
    /* Could do better: If the reg is r5.3<0;1,0>, we probably want to
     * set swizzle and writemask to W, as the lower bits of subnr will
@@ -369,16 +366,19 @@ brw_reg(unsigned file,
     * keep track of as you'd want it adjusted by suboffset(), etc.
     * Perhaps fix up when converting to align16?
     */
-   reg.dw1.bits.swizzle = swizzle;
-   reg.dw1.bits.writemask = writemask;
-   reg.dw1.bits.indirect_offset = 0;
-   reg.dw1.bits.pad1 = 0;
+   reg.swizzle = swizzle;
+   reg.writemask = writemask;
+   reg.indirect_offset = 0;
+   reg.vstride = vstride;
+   reg.width = width;
+   reg.hstride = hstride;
+   reg.pad1 = 0;
    return reg;
 }
 
 /** Construct float[16] register */
 static inline struct brw_reg
-brw_vec16_reg(unsigned file, unsigned nr, unsigned subnr)
+brw_vec16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
 {
    return brw_reg(file,
                   nr,
@@ -395,7 +395,7 @@ brw_vec16_reg(unsigned file, unsigned nr, unsigned subnr)
 
 /** Construct float[8] register */
 static inline struct brw_reg
-brw_vec8_reg(unsigned file, unsigned nr, unsigned subnr)
+brw_vec8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
 {
    return brw_reg(file,
                   nr,
@@ -412,7 +412,7 @@ brw_vec8_reg(unsigned file, unsigned nr, unsigned subnr)
 
 /** Construct float[4] register */
 static inline struct brw_reg
-brw_vec4_reg(unsigned file, unsigned nr, unsigned subnr)
+brw_vec4_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
 {
    return brw_reg(file,
                   nr,
@@ -429,7 +429,7 @@ brw_vec4_reg(unsigned file, unsigned nr, unsigned subnr)
 
 /** Construct float[2] register */
 static inline struct brw_reg
-brw_vec2_reg(unsigned file, unsigned nr, unsigned subnr)
+brw_vec2_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
 {
    return brw_reg(file,
                   nr,
@@ -446,7 +446,7 @@ brw_vec2_reg(unsigned file, unsigned nr, unsigned subnr)
 
 /** Construct float[1] register */
 static inline struct brw_reg
-brw_vec1_reg(unsigned file, unsigned nr, unsigned subnr)
+brw_vec1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
 {
    return brw_reg(file,
                   nr,
@@ -462,7 +462,8 @@ brw_vec1_reg(unsigned file, unsigned nr, unsigned subnr)
 }
 
 static inline struct brw_reg
-brw_vecn_reg(unsigned width, unsigned file, unsigned nr, unsigned subnr)
+brw_vecn_reg(unsigned width, enum brw_reg_file file,
+             unsigned nr, unsigned subnr)
 {
    switch (width) {
    case 1:
@@ -529,21 +530,21 @@ byte_offset(struct brw_reg reg, unsigned bytes)
 
 /** Construct unsigned word[16] register */
 static inline struct brw_reg
-brw_uw16_reg(unsigned file, unsigned nr, unsigned subnr)
+brw_uw16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
 {
    return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
 }
 
 /** Construct unsigned word[8] register */
 static inline struct brw_reg
-brw_uw8_reg(unsigned file, unsigned nr, unsigned subnr)
+brw_uw8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
 {
    return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
 }
 
 /** Construct unsigned word[1] register */
 static inline struct brw_reg
-brw_uw1_reg(unsigned file, unsigned nr, unsigned subnr)
+brw_uw1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
 {
    return suboffset(retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr);
 }
@@ -569,7 +570,7 @@ static inline struct brw_reg
 brw_imm_f(float f)
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F);
-   imm.dw1.f = f;
+   imm.f = f;
    return imm;
 }
 
@@ -578,7 +579,7 @@ static inline struct brw_reg
 brw_imm_d(int d)
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D);
-   imm.dw1.d = d;
+   imm.d = d;
    return imm;
 }
 
@@ -587,7 +588,7 @@ static inline struct brw_reg
 brw_imm_ud(unsigned ud)
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD);
-   imm.dw1.ud = ud;
+   imm.ud = ud;
    return imm;
 }
 
@@ -596,7 +597,7 @@ static inline struct brw_reg
 brw_imm_uw(uint16_t uw)
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW);
-   imm.dw1.ud = uw | (uw << 16);
+   imm.ud = uw | (uw << 16);
    return imm;
 }
 
@@ -605,7 +606,7 @@ static inline struct brw_reg
 brw_imm_w(int16_t w)
 {
    struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W);
-   imm.dw1.d = w | (w << 16);
+   imm.d = w | (w << 16);
    return imm;
 }
 
@@ -621,7 +622,7 @@ brw_imm_v(unsigned v)
    imm.vstride = BRW_VERTICAL_STRIDE_0;
    imm.width = BRW_WIDTH_8;
    imm.hstride = BRW_HORIZONTAL_STRIDE_1;
-   imm.dw1.ud = v;
+   imm.ud = v;
    return imm;
 }
 
@@ -633,7 +634,7 @@ brw_imm_vf(unsigned v)
    imm.vstride = BRW_VERTICAL_STRIDE_0;
    imm.width = BRW_WIDTH_4;
    imm.hstride = BRW_HORIZONTAL_STRIDE_1;
-   imm.dw1.ud = v;
+   imm.ud = v;
    return imm;
 }
 
@@ -923,8 +924,8 @@ brw_swizzle(struct brw_reg reg, unsigned x, unsigned y, unsigned z, unsigned w)
 {
    assert(reg.file != BRW_IMMEDIATE_VALUE);
 
-   reg.dw1.bits.swizzle = brw_compose_swizzle(BRW_SWIZZLE4(x, y, z, w),
-                                              reg.dw1.bits.swizzle);
+   reg.swizzle = brw_compose_swizzle(BRW_SWIZZLE4(x, y, z, w),
+                                              reg.swizzle);
    return reg;
 }
 
@@ -939,7 +940,7 @@ static inline struct brw_reg
 brw_writemask(struct brw_reg reg, unsigned mask)
 {
    assert(reg.file != BRW_IMMEDIATE_VALUE);
-   reg.dw1.bits.writemask &= mask;
+   reg.writemask &= mask;
    return reg;
 }
 
@@ -947,7 +948,7 @@ static inline struct brw_reg
 brw_set_writemask(struct brw_reg reg, unsigned mask)
 {
    assert(reg.file != BRW_IMMEDIATE_VALUE);
-   reg.dw1.bits.writemask = mask;
+   reg.writemask = mask;
    return reg;
 }
 
@@ -980,7 +981,7 @@ brw_vec4_indirect(unsigned subnr, int offset)
    struct brw_reg reg =  brw_vec4_grf(0, 0);
    reg.subnr = subnr;
    reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
-   reg.dw1.bits.indirect_offset = offset;
+   reg.indirect_offset = offset;
    return reg;
 }
 
@@ -990,7 +991,18 @@ brw_vec1_indirect(unsigned subnr, int offset)
    struct brw_reg reg =  brw_vec1_grf(0, 0);
    reg.subnr = subnr;
    reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
-   reg.dw1.bits.indirect_offset = offset;
+   reg.indirect_offset = offset;
+   return reg;
+}
+
+static inline struct brw_reg
+brw_VxH_indirect(unsigned subnr, int offset)
+{
+   struct brw_reg reg = brw_vec1_grf(0, 0);
+   reg.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+   reg.subnr = subnr;
+   reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+   reg.indirect_offset = offset;
    return reg;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 88c45f74333..776f75d3e58 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -583,15 +583,14 @@ fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
       if (is_src_duplicate(inst, i))
          continue;
 
-      if (inst->src[i].file == GRF) {
-         reads_remaining[inst->src[i].reg]++;
-      } else if (inst->src[i].file == HW_REG &&
-               inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
-         if (inst->src[i].fixed_hw_reg.nr >= hw_reg_count)
+      if (inst->src[i].file == VGRF) {
+         reads_remaining[inst->src[i].nr]++;
+      } else if (inst->src[i].file == FIXED_GRF) {
+         if (inst->src[i].nr >= hw_reg_count)
             continue;
 
          for (int j = 0; j < inst->regs_read(i); j++)
-            hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + j]++;
+            hw_reads_remaining[inst->src[i].nr + j]++;
       }
    }
 }
@@ -660,21 +659,20 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
    if (!reads_remaining)
       return;
 
-   if (inst->dst.file == GRF) {
-      written[inst->dst.reg] = true;
+   if (inst->dst.file == VGRF) {
+      written[inst->dst.nr] = true;
    }
 
    for (int i = 0; i < inst->sources; i++) {
       if (is_src_duplicate(inst, i))
           continue;
 
-      if (inst->src[i].file == GRF) {
-         reads_remaining[inst->src[i].reg]--;
-      } else if (inst->src[i].file == HW_REG &&
-                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE &&
-                 inst->src[i].fixed_hw_reg.nr < hw_reg_count) {
+      if (inst->src[i].file == VGRF) {
+         reads_remaining[inst->src[i].nr]--;
+      } else if (inst->src[i].file == FIXED_GRF &&
+                 inst->src[i].nr < hw_reg_count) {
          for (int off = 0; off < inst->regs_read(i); off++)
-            hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + off]--;
+            hw_reads_remaining[inst->src[i].nr + off]--;
       }
    }
 }
@@ -685,26 +683,25 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
    fs_inst *inst = (fs_inst *)be;
    int benefit = 0;
 
-   if (inst->dst.file == GRF) {
-      if (!BITSET_TEST(livein[block_idx], inst->dst.reg) &&
-          !written[inst->dst.reg])
-         benefit -= v->alloc.sizes[inst->dst.reg];
+   if (inst->dst.file == VGRF) {
+      if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
+          !written[inst->dst.nr])
+         benefit -= v->alloc.sizes[inst->dst.nr];
    }
 
    for (int i = 0; i < inst->sources; i++) {
       if (is_src_duplicate(inst, i))
          continue;
 
-      if (inst->src[i].file == GRF &&
-          !BITSET_TEST(liveout[block_idx], inst->src[i].reg) &&
-          reads_remaining[inst->src[i].reg] == 1)
-         benefit += v->alloc.sizes[inst->src[i].reg];
+      if (inst->src[i].file == VGRF &&
+          !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
+          reads_remaining[inst->src[i].nr] == 1)
+         benefit += v->alloc.sizes[inst->src[i].nr];
 
-      if (inst->src[i].file == HW_REG &&
-          inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE &&
-          inst->src[i].fixed_hw_reg.nr < hw_reg_count) {
+      if (inst->src[i].file == FIXED_GRF &&
+          inst->src[i].nr < hw_reg_count) {
          for (int off = 0; off < inst->regs_read(i); off++) {
-            int reg = inst->src[i].fixed_hw_reg.nr + off;
+            int reg = inst->src[i].nr + off;
             if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
                 hw_reads_remaining[reg] == 1) {
                benefit++;
@@ -927,7 +924,6 @@ fs_instruction_scheduler::calculate_deps()
     * granular level.
     */
    schedule_node *last_fixed_grf_write = NULL;
-   int reg_width = v->dispatch_width / 8;
 
    /* The last instruction always needs to still be the last
     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
@@ -951,24 +947,19 @@ fs_instruction_scheduler::calculate_deps()
 
       /* read-after-write deps. */
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
             if (post_reg_alloc) {
                for (int r = 0; r < inst->regs_read(i); r++)
-                  add_dep(last_grf_write[inst->src[i].reg + r], n);
+                  add_dep(last_grf_write[inst->src[i].nr + r], n);
             } else {
                for (int r = 0; r < inst->regs_read(i); r++) {
-                  add_dep(last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], n);
+                  add_dep(last_grf_write[inst->src[i].nr * 16 + inst->src[i].reg_offset + r], n);
                }
             }
-         } else if (inst->src[i].file == HW_REG &&
-                    (inst->src[i].fixed_hw_reg.file ==
-                     BRW_GENERAL_REGISTER_FILE)) {
+         } else if (inst->src[i].file == FIXED_GRF) {
             if (post_reg_alloc) {
-               int size = reg_width;
-               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
-                  size = 1;
-               for (int r = 0; r < size; r++)
-                  add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
+               for (int r = 0; r < inst->regs_read(i); r++)
+                  add_dep(last_grf_write[inst->src[i].nr + r], n);
             } else {
                add_dep(last_fixed_grf_write, n);
             }
@@ -976,9 +967,7 @@ fs_instruction_scheduler::calculate_deps()
             add_dep(last_accumulator_write, n);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM &&
-                    (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
+                    inst->src[i].file != UNIFORM) {
             assert(inst->src[i].file != MRF);
             add_barrier_deps(n);
          }
@@ -1003,36 +992,35 @@ fs_instruction_scheduler::calculate_deps()
       }
 
       /* write-after-write deps. */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
          if (post_reg_alloc) {
             for (int r = 0; r < inst->regs_written; r++) {
-               add_dep(last_grf_write[inst->dst.reg + r], n);
-               last_grf_write[inst->dst.reg + r] = n;
+               add_dep(last_grf_write[inst->dst.nr + r], n);
+               last_grf_write[inst->dst.nr + r] = n;
             }
          } else {
             for (int r = 0; r < inst->regs_written; r++) {
-               add_dep(last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r], n);
-               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
+               add_dep(last_grf_write[inst->dst.nr * 16 + inst->dst.reg_offset + r], n);
+               last_grf_write[inst->dst.nr * 16 + inst->dst.reg_offset + r] = n;
             }
          }
       } else if (inst->dst.file == MRF) {
-         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
+         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
 
          add_dep(last_mrf_write[reg], n);
          last_mrf_write[reg] = n;
          if (is_compressed(inst)) {
-            if (inst->dst.reg & BRW_MRF_COMPR4)
+            if (inst->dst.nr & BRW_MRF_COMPR4)
                reg += 4;
             else
                reg++;
             add_dep(last_mrf_write[reg], n);
             last_mrf_write[reg] = n;
          }
-      } else if (inst->dst.file == HW_REG &&
-                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+      } else if (inst->dst.file == FIXED_GRF) {
          if (post_reg_alloc) {
-            for (int r = 0; r < reg_width; r++)
-               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
+            for (int r = 0; r < inst->regs_written; r++)
+               last_grf_write[inst->dst.nr + r] = n;
          } else {
             last_fixed_grf_write = n;
          }
@@ -1080,24 +1068,19 @@ fs_instruction_scheduler::calculate_deps()
 
       /* write-after-read deps. */
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
             if (post_reg_alloc) {
                for (int r = 0; r < inst->regs_read(i); r++)
-                  add_dep(n, last_grf_write[inst->src[i].reg + r], 0);
+                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
             } else {
                for (int r = 0; r < inst->regs_read(i); r++) {
-                  add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], 0);
+                  add_dep(n, last_grf_write[inst->src[i].nr * 16 + inst->src[i].reg_offset + r], 0);
                }
             }
-         } else if (inst->src[i].file == HW_REG &&
-                    (inst->src[i].fixed_hw_reg.file ==
-                     BRW_GENERAL_REGISTER_FILE)) {
+         } else if (inst->src[i].file == FIXED_GRF) {
             if (post_reg_alloc) {
-               int size = reg_width;
-               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
-                  size = 1;
-               for (int r = 0; r < size; r++)
-                  add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r], 0);
+               for (int r = 0; r < inst->regs_read(i); r++)
+                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
             } else {
                add_dep(n, last_fixed_grf_write, 0);
             }
@@ -1105,9 +1088,7 @@ fs_instruction_scheduler::calculate_deps()
             add_dep(n, last_accumulator_write, 0);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM &&
-                    (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
+                    inst->src[i].file != UNIFORM) {
             assert(inst->src[i].file != MRF);
             add_barrier_deps(n);
          }
@@ -1134,33 +1115,32 @@ fs_instruction_scheduler::calculate_deps()
       /* Update the things this instruction wrote, so earlier reads
        * can mark this as WAR dependency.
        */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
          if (post_reg_alloc) {
             for (int r = 0; r < inst->regs_written; r++)
-               last_grf_write[inst->dst.reg + r] = n;
+               last_grf_write[inst->dst.nr + r] = n;
          } else {
             for (int r = 0; r < inst->regs_written; r++) {
-               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
+               last_grf_write[inst->dst.nr * 16 + inst->dst.reg_offset + r] = n;
             }
          }
       } else if (inst->dst.file == MRF) {
-         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
+         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
 
          last_mrf_write[reg] = n;
 
          if (is_compressed(inst)) {
-            if (inst->dst.reg & BRW_MRF_COMPR4)
+            if (inst->dst.nr & BRW_MRF_COMPR4)
                reg += 4;
             else
                reg++;
 
             last_mrf_write[reg] = n;
          }
-      } else if (inst->dst.file == HW_REG &&
-                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+      } else if (inst->dst.file == FIXED_GRF) {
          if (post_reg_alloc) {
-            for (int r = 0; r < reg_width; r++)
-               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
+            for (int r = 0; r < inst->regs_written; r++)
+               last_grf_write[inst->dst.nr + r] = n;
          } else {
             last_fixed_grf_write = n;
          }
@@ -1222,21 +1202,17 @@ vec4_instruction_scheduler::calculate_deps()
 
       /* read-after-write deps. */
       for (int i = 0; i < 3; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
             for (unsigned j = 0; j < inst->regs_read(i); ++j)
-               add_dep(last_grf_write[inst->src[i].reg + j], n);
-         } else if (inst->src[i].file == HW_REG &&
-                    (inst->src[i].fixed_hw_reg.file ==
-                     BRW_GENERAL_REGISTER_FILE)) {
+               add_dep(last_grf_write[inst->src[i].nr + j], n);
+         } else if (inst->src[i].file == FIXED_GRF) {
             add_dep(last_fixed_grf_write, n);
          } else if (inst->src[i].is_accumulator()) {
             assert(last_accumulator_write);
             add_dep(last_accumulator_write, n);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM &&
-                    (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
+                    inst->src[i].file != UNIFORM) {
             /* No reads from MRF, and ATTR is already translated away */
             assert(inst->src[i].file != MRF &&
                    inst->src[i].file != ATTR);
@@ -1265,16 +1241,15 @@ vec4_instruction_scheduler::calculate_deps()
       }
 
       /* write-after-write deps. */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
          for (unsigned j = 0; j < inst->regs_written; ++j) {
-            add_dep(last_grf_write[inst->dst.reg + j], n);
-            last_grf_write[inst->dst.reg + j] = n;
+            add_dep(last_grf_write[inst->dst.nr + j], n);
+            last_grf_write[inst->dst.nr + j] = n;
          }
       } else if (inst->dst.file == MRF) {
-         add_dep(last_mrf_write[inst->dst.reg], n);
-         last_mrf_write[inst->dst.reg] = n;
-     } else if (inst->dst.file == HW_REG &&
-                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+         add_dep(last_mrf_write[inst->dst.nr], n);
+         last_mrf_write[inst->dst.nr] = n;
+     } else if (inst->dst.file == FIXED_GRF) {
          last_fixed_grf_write = n;
       } else if (inst->dst.is_accumulator()) {
          add_dep(last_accumulator_write, n);
@@ -1320,20 +1295,16 @@ vec4_instruction_scheduler::calculate_deps()
 
       /* write-after-read deps. */
       for (int i = 0; i < 3; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
             for (unsigned j = 0; j < inst->regs_read(i); ++j)
-               add_dep(n, last_grf_write[inst->src[i].reg + j]);
-         } else if (inst->src[i].file == HW_REG &&
-                    (inst->src[i].fixed_hw_reg.file ==
-                     BRW_GENERAL_REGISTER_FILE)) {
+               add_dep(n, last_grf_write[inst->src[i].nr + j]);
+         } else if (inst->src[i].file == FIXED_GRF) {
             add_dep(n, last_fixed_grf_write);
          } else if (inst->src[i].is_accumulator()) {
             add_dep(n, last_accumulator_write);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM &&
-                    (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
+                    inst->src[i].file != UNIFORM) {
             assert(inst->src[i].file != MRF &&
                    inst->src[i].file != ATTR);
             add_barrier_deps(n);
@@ -1361,13 +1332,12 @@ vec4_instruction_scheduler::calculate_deps()
       /* Update the things this instruction wrote, so earlier reads
        * can mark this as WAR dependency.
        */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
          for (unsigned j = 0; j < inst->regs_written; ++j)
-            last_grf_write[inst->dst.reg + j] = n;
+            last_grf_write[inst->dst.nr + j] = n;
       } else if (inst->dst.file == MRF) {
-         last_mrf_write[inst->dst.reg] = n;
-      } else if (inst->dst.file == HW_REG &&
-                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+         last_mrf_write[inst->dst.nr] = n;
+      } else if (inst->dst.file == FIXED_GRF) {
          last_fixed_grf_write = n;
       } else if (inst->dst.is_accumulator()) {
          last_accumulator_write = n;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 063cb84a958..1f3ae7ab5e6 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -150,6 +150,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
          compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
 
       compiler->glsl_compiler_options[i].NirOptions = nir_options;
+
+      compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
    }
 
    return compiler;
@@ -291,7 +293,7 @@ const char *
 brw_instruction_name(enum opcode op)
 {
    switch (op) {
-   case BRW_OPCODE_MOV ... BRW_OPCODE_NOP:
+   case BRW_OPCODE_ILLEGAL ... BRW_OPCODE_NOP:
       assert(opcode_descs[op].name);
       return opcode_descs[op].name;
    case FS_OPCODE_FB_WRITE:
@@ -354,6 +356,10 @@ brw_instruction_name(enum opcode op)
       return "txf_cms";
    case SHADER_OPCODE_TXF_CMS_LOGICAL:
       return "txf_cms_logical";
+   case SHADER_OPCODE_TXF_CMS_W:
+      return "txf_cms_w";
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+      return "txf_cms_w_logical";
    case SHADER_OPCODE_TXF_UMS:
       return "txf_ums";
    case SHADER_OPCODE_TXF_UMS_LOGICAL:
@@ -426,6 +432,8 @@ brw_instruction_name(enum opcode op)
       return "gen8_urb_write_simd8_masked_per_slot";
    case SHADER_OPCODE_URB_READ_SIMD8:
       return "urb_read_simd8";
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+      return "urb_read_simd8_per_slot";
 
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
       return "find_live_channel";
@@ -561,7 +569,7 @@ brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg)
       unsigned ud;
       int d;
       float f;
-   } imm = { reg->dw1.ud }, sat_imm = { 0 };
+   } imm = { reg->ud }, sat_imm = { 0 };
 
    switch (type) {
    case BRW_REGISTER_TYPE_UD:
@@ -592,7 +600,7 @@ brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg)
    }
 
    if (imm.ud != sat_imm.ud) {
-      reg->dw1.ud = sat_imm.ud;
+      reg->ud = sat_imm.ud;
       return true;
    }
    return false;
@@ -604,17 +612,17 @@ brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg)
    switch (type) {
    case BRW_REGISTER_TYPE_D:
    case BRW_REGISTER_TYPE_UD:
-      reg->dw1.d = -reg->dw1.d;
+      reg->d = -reg->d;
       return true;
    case BRW_REGISTER_TYPE_W:
    case BRW_REGISTER_TYPE_UW:
-      reg->dw1.d = -(int16_t)reg->dw1.ud;
+      reg->d = -(int16_t)reg->ud;
       return true;
    case BRW_REGISTER_TYPE_F:
-      reg->dw1.f = -reg->dw1.f;
+      reg->f = -reg->f;
       return true;
    case BRW_REGISTER_TYPE_VF:
-      reg->dw1.ud ^= 0x80808080;
+      reg->ud ^= 0x80808080;
       return true;
    case BRW_REGISTER_TYPE_UB:
    case BRW_REGISTER_TYPE_B:
@@ -638,16 +646,16 @@ brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg)
 {
    switch (type) {
    case BRW_REGISTER_TYPE_D:
-      reg->dw1.d = abs(reg->dw1.d);
+      reg->d = abs(reg->d);
       return true;
    case BRW_REGISTER_TYPE_W:
-      reg->dw1.d = abs((int16_t)reg->dw1.ud);
+      reg->d = abs((int16_t)reg->ud);
       return true;
    case BRW_REGISTER_TYPE_F:
-      reg->dw1.f = fabsf(reg->dw1.f);
+      reg->f = fabsf(reg->f);
       return true;
    case BRW_REGISTER_TYPE_VF:
-      reg->dw1.ud &= ~0x80808080;
+      reg->ud &= ~0x80808080;
       return true;
    case BRW_REGISTER_TYPE_UB:
    case BRW_REGISTER_TYPE_B:
@@ -697,7 +705,7 @@ backend_reg::is_zero() const
    if (file != IMM)
       return false;
 
-   return fixed_hw_reg.dw1.d == 0;
+   return d == 0;
 }
 
 bool
@@ -707,8 +715,8 @@ backend_reg::is_one() const
       return false;
 
    return type == BRW_REGISTER_TYPE_F
-          ? fixed_hw_reg.dw1.f == 1.0
-          : fixed_hw_reg.dw1.d == 1;
+          ? f == 1.0
+          : d == 1;
 }
 
 bool
@@ -719,9 +727,9 @@ backend_reg::is_negative_one() const
 
    switch (type) {
    case BRW_REGISTER_TYPE_F:
-      return fixed_hw_reg.dw1.f == -1.0;
+      return f == -1.0;
    case BRW_REGISTER_TYPE_D:
-      return fixed_hw_reg.dw1.d == -1;
+      return d == -1;
    default:
       return false;
    }
@@ -730,25 +738,21 @@ backend_reg::is_negative_one() const
 bool
 backend_reg::is_null() const
 {
-   return file == HW_REG &&
-          fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
-          fixed_hw_reg.nr == BRW_ARF_NULL;
+   return file == ARF && nr == BRW_ARF_NULL;
 }
 
 
 bool
 backend_reg::is_accumulator() const
 {
-   return file == HW_REG &&
-          fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
-          fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
+   return file == ARF && nr == BRW_ARF_ACCUMULATOR;
 }
 
 bool
 backend_reg::in_range(const backend_reg &r, unsigned n) const
 {
    return (file == r.file &&
-           reg == r.reg &&
+           nr == r.nr &&
            reg_offset >= r.reg_offset &&
            reg_offset < r.reg_offset + n);
 }
@@ -779,7 +783,7 @@ backend_instruction::is_commutative() const
 bool
 backend_instruction::is_3src() const
 {
-   return opcode < ARRAY_SIZE(opcode_descs) && opcode_descs[opcode].nsrc == 3;
+   return ::is_3src(opcode);
 }
 
 bool
@@ -790,6 +794,7 @@ backend_instruction::is_tex() const
            opcode == SHADER_OPCODE_TXD ||
            opcode == SHADER_OPCODE_TXF ||
            opcode == SHADER_OPCODE_TXF_CMS ||
+           opcode == SHADER_OPCODE_TXF_CMS_W ||
            opcode == SHADER_OPCODE_TXF_UMS ||
            opcode == SHADER_OPCODE_TXF_MCS ||
            opcode == SHADER_OPCODE_TXL ||
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index f4647cca4f9..c4a37187ce2 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -38,38 +38,18 @@
 #define MAX_SAMPLER_MESSAGE_SIZE 11
 #define MAX_VGRF_SIZE 16
 
-enum PACKED register_file {
-   BAD_FILE,
-   GRF,
-   MRF,
-   IMM,
-   HW_REG, /* a struct brw_reg */
-   ATTR,
-   UNIFORM, /* prog_data->params[reg] */
-};
-
-struct backend_reg
-{
 #ifdef __cplusplus
+struct backend_reg : public brw_reg
+{
+   backend_reg() {}
+   backend_reg(struct brw_reg reg) : brw_reg(reg) {}
+
    bool is_zero() const;
    bool is_one() const;
    bool is_negative_one() const;
    bool is_null() const;
    bool is_accumulator() const;
    bool in_range(const backend_reg &r, unsigned n) const;
-#endif
-
-   enum register_file file; /**< Register file: GRF, MRF, IMM. */
-   enum brw_reg_type type;  /**< Register type: BRW_REGISTER_TYPE_* */
-
-   /**
-    * Register number.
-    *
-    * For GRF, it's a virtual register number until register allocation.
-    *
-    * For MRF, it's the hardware register.
-    */
-   uint16_t reg;
 
    /**
     * Offset within the virtual register.
@@ -81,12 +61,8 @@ struct backend_reg
     * For uniforms, this is in units of 1 float.
     */
    uint16_t reg_offset;
-
-   struct brw_reg fixed_hw_reg;
-
-   bool negate;
-   bool abs;
 };
+#endif
 
 struct cfg_t;
 struct bblock_t;
@@ -274,6 +250,7 @@ bool brw_cs_precompile(struct gl_context *ctx,
 
 int type_size_scalar(const struct glsl_type *type);
 int type_size_vec4(const struct glsl_type *type);
+int type_size_vec4_times_4(const struct glsl_type *type);
 
 bool is_scalar_shader_stage(const struct brw_compiler *compiler, int stage);
 
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 2aa1248fea6..94734bae621 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -172,7 +172,6 @@ brw_state_dirty(struct brw_context *brw, GLuint mesa_flags, uint64_t brw_flags)
 /* brw_binding_tables.c */
 void brw_upload_binding_table(struct brw_context *brw,
                               uint32_t packet_name,
-                              GLbitfield brw_new_binding_table,
                               const struct brw_stage_prog_data *prog_data,
                               struct brw_stage_state *stage_state);
 
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 0344b8a7fb0..6f8daf6d4d2 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -589,9 +589,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_CONTEXT),
    DEFINE_BIT(BRW_NEW_PSP),
    DEFINE_BIT(BRW_NEW_SURFACES),
-   DEFINE_BIT(BRW_NEW_VS_BINDING_TABLE),
-   DEFINE_BIT(BRW_NEW_GS_BINDING_TABLE),
-   DEFINE_BIT(BRW_NEW_PS_BINDING_TABLE),
+   DEFINE_BIT(BRW_NEW_BINDING_TABLE_POINTERS),
    DEFINE_BIT(BRW_NEW_INDICES),
    DEFINE_BIT(BRW_NEW_VERTICES),
    DEFINE_BIT(BRW_NEW_BATCH),
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 01eb1580953..a086b43e11a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -51,12 +51,12 @@ src_reg::init()
    this->file = BAD_FILE;
 }
 
-src_reg::src_reg(register_file file, int reg, const glsl_type *type)
+src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type)
 {
    init();
 
    this->file = file;
-   this->reg = reg;
+   this->nr = nr;
    if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
       this->swizzle = brw_swizzle_for_size(type->vector_elements);
    else
@@ -77,7 +77,7 @@ src_reg::src_reg(float f)
 
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_F;
-   this->fixed_hw_reg.dw1.f = f;
+   this->f = f;
 }
 
 src_reg::src_reg(uint32_t u)
@@ -86,7 +86,7 @@ src_reg::src_reg(uint32_t u)
 
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_UD;
-   this->fixed_hw_reg.dw1.ud = u;
+   this->ud = u;
 }
 
 src_reg::src_reg(int32_t i)
@@ -95,7 +95,7 @@ src_reg::src_reg(int32_t i)
 
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_D;
-   this->fixed_hw_reg.dw1.d = i;
+   this->d = i;
 }
 
 src_reg::src_reg(uint8_t vf[4])
@@ -104,7 +104,7 @@ src_reg::src_reg(uint8_t vf[4])
 
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_VF;
-   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
+   memcpy(&this->ud, vf, sizeof(unsigned));
 }
 
 src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
@@ -113,31 +113,21 @@ src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_VF;
-   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
-                               (vf1 <<  8) |
-                               (vf2 << 16) |
-                               (vf3 << 24);
+   this->ud = (vf0 <<  0) | (vf1 <<  8) | (vf2 << 16) | (vf3 << 24);
 }
 
-src_reg::src_reg(struct brw_reg reg)
+src_reg::src_reg(struct brw_reg reg) :
+   backend_reg(reg)
 {
-   init();
-
-   this->file = HW_REG;
-   this->fixed_hw_reg = reg;
-   this->type = reg.type;
+   this->reg_offset = 0;
+   this->reladdr = NULL;
 }
 
-src_reg::src_reg(const dst_reg &reg)
+src_reg::src_reg(const dst_reg &reg) :
+   backend_reg(static_cast<struct brw_reg>(reg))
 {
-   init();
-
-   this->file = reg.file;
-   this->reg = reg.reg;
    this->reg_offset = reg.reg_offset;
-   this->type = reg.type;
    this->reladdr = reg.reladdr;
-   this->fixed_hw_reg = reg.fixed_hw_reg;
    this->swizzle = brw_swizzle_for_mask(reg.writemask);
 }
 
@@ -154,73 +144,58 @@ dst_reg::dst_reg()
    init();
 }
 
-dst_reg::dst_reg(register_file file, int reg)
+dst_reg::dst_reg(enum brw_reg_file file, int nr)
 {
    init();
 
    this->file = file;
-   this->reg = reg;
+   this->nr = nr;
 }
 
-dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
+dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
                  unsigned writemask)
 {
    init();
 
    this->file = file;
-   this->reg = reg;
+   this->nr = nr;
    this->type = brw_type_for_base_type(type);
    this->writemask = writemask;
 }
 
-dst_reg::dst_reg(register_file file, int reg, brw_reg_type type,
+dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
                  unsigned writemask)
 {
    init();
 
    this->file = file;
-   this->reg = reg;
+   this->nr = nr;
    this->type = type;
    this->writemask = writemask;
 }
 
-dst_reg::dst_reg(struct brw_reg reg)
+dst_reg::dst_reg(struct brw_reg reg) :
+   backend_reg(reg)
 {
-   init();
-
-   this->file = HW_REG;
-   this->fixed_hw_reg = reg;
-   this->type = reg.type;
+   this->reg_offset = 0;
+   this->reladdr = NULL;
 }
 
-dst_reg::dst_reg(const src_reg &reg)
+dst_reg::dst_reg(const src_reg &reg) :
+   backend_reg(static_cast<struct brw_reg>(reg))
 {
-   init();
-
-   this->file = reg.file;
-   this->reg = reg.reg;
    this->reg_offset = reg.reg_offset;
-   this->type = reg.type;
    this->writemask = brw_mask_for_swizzle(reg.swizzle);
    this->reladdr = reg.reladdr;
-   this->fixed_hw_reg = reg.fixed_hw_reg;
 }
 
 bool
 dst_reg::equals(const dst_reg &r) const
 {
-   return (file == r.file &&
-           reg == r.reg &&
+   return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 &&
            reg_offset == r.reg_offset &&
-           type == r.type &&
-           negate == r.negate &&
-           abs == r.abs &&
-           writemask == r.writemask &&
            (reladdr == r.reladdr ||
-            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
-           ((file != HW_REG && file != IMM) ||
-            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
-                   sizeof(fixed_hw_reg)) == 0));
+            (reladdr && r.reladdr && reladdr->equals(*r.reladdr))));
 }
 
 bool
@@ -339,6 +314,7 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
    case SHADER_OPCODE_TXD:
    case SHADER_OPCODE_TXF:
    case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
    case SHADER_OPCODE_TXF_MCS:
    case SHADER_OPCODE_TXS:
    case SHADER_OPCODE_TG4:
@@ -354,16 +330,9 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
 bool
 src_reg::equals(const src_reg &r) const
 {
-   return (file == r.file &&
-	   reg == r.reg &&
+   return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 &&
 	   reg_offset == r.reg_offset &&
-	   type == r.type &&
-	   negate == r.negate &&
-	   abs == r.abs &&
-	   swizzle == r.swizzle &&
-	   !reladdr && !r.reladdr &&
-	   memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
-		  sizeof(fixed_hw_reg)) == 0);
+	   !reladdr && !r.reladdr);
 }
 
 bool
@@ -372,7 +341,7 @@ vec4_visitor::opt_vector_float()
    bool progress = false;
 
    int last_reg = -1, last_reg_offset = -1;
-   enum register_file last_reg_file = BAD_FILE;
+   enum brw_reg_file last_reg_file = BAD_FILE;
 
    int remaining_channels = 0;
    uint8_t imm[4];
@@ -380,10 +349,10 @@ vec4_visitor::opt_vector_float()
    vec4_instruction *imm_inst[4];
 
    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
-      if (last_reg != inst->dst.reg ||
+      if (last_reg != inst->dst.nr ||
           last_reg_offset != inst->dst.reg_offset ||
           last_reg_file != inst->dst.file) {
-         last_reg = inst->dst.reg;
+         last_reg = inst->dst.nr;
          last_reg_offset = inst->dst.reg_offset;
          last_reg_file = inst->dst.file;
          remaining_channels = WRITEMASK_XYZW;
@@ -396,7 +365,7 @@ vec4_visitor::opt_vector_float()
           inst->src[0].file != IMM)
          continue;
 
-      int vf = brw_float_to_vf(inst->src[0].fixed_hw_reg.dw1.f);
+      int vf = brw_float_to_vf(inst->src[0].f);
       if (vf == -1)
          continue;
 
@@ -451,7 +420,9 @@ vec4_visitor::opt_reduce_swizzle()
    bool progress = false;
 
    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
-      if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG ||
+      if (inst->dst.file == BAD_FILE ||
+          inst->dst.file == ARF ||
+          inst->dst.file == FIXED_GRF ||
           inst->is_send_from_grf())
          continue;
 
@@ -479,7 +450,7 @@ vec4_visitor::opt_reduce_swizzle()
 
       /* Update sources' swizzles. */
       for (int i = 0; i < 3; i++) {
-         if (inst->src[i].file != GRF &&
+         if (inst->src[i].file != VGRF &&
              inst->src[i].file != ATTR &&
              inst->src[i].file != UNIFORM)
             continue;
@@ -505,7 +476,7 @@ vec4_visitor::split_uniform_registers()
    /* Prior to this, uniforms have been in an array sized according to
     * the number of vector uniforms present, sparsely filled (so an
     * aggregate results in reg indices being skipped over).  Now we're
-    * going to cut those aggregates up so each .reg index is one
+    * going to cut those aggregates up so each .nr index is one
     * vector.  The goal is to make elimination of unused uniform
     * components easier later.
     */
@@ -516,7 +487,7 @@ vec4_visitor::split_uniform_registers()
 
 	 assert(!inst->src[i].reladdr);
 
-	 inst->src[i].reg += inst->src[i].reg_offset;
+         inst->src[i].nr += inst->src[i].reg_offset;
 	 inst->src[i].reg_offset = 0;
       }
    }
@@ -565,7 +536,7 @@ vec4_visitor::pack_uniform_registers()
          if (inst->src[i].file != UNIFORM)
             continue;
 
-         int reg = inst->src[i].reg;
+         int reg = inst->src[i].nr;
          for (int c = 0; c < 4; c++) {
             if (!(readmask & (1 << c)))
                continue;
@@ -620,12 +591,12 @@ vec4_visitor::pack_uniform_registers()
    /* Now, update the instructions for our repacked uniforms. */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       for (int i = 0 ; i < 3; i++) {
-	 int src = inst->src[i].reg;
+         int src = inst->src[i].nr;
 
 	 if (inst->src[i].file != UNIFORM)
 	    continue;
 
-	 inst->src[i].reg = new_loc[src];
+         inst->src[i].nr = new_loc[src];
          inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
                                               new_chan[src], new_chan[src]);
       }
@@ -659,8 +630,7 @@ vec4_visitor::opt_algebraic()
             if (inst->dst.type != inst->src[0].type)
                assert(!"unimplemented: saturate mixed types");
 
-            if (brw_saturate_immediate(inst->dst.type,
-                                       &inst->src[0].fixed_hw_reg)) {
+            if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
                inst->saturate = false;
                progress = true;
             }
@@ -821,10 +791,10 @@ vec4_visitor::move_push_constants_to_pull_constants()
    foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
       for (int i = 0 ; i < 3; i++) {
 	 if (inst->src[i].file != UNIFORM ||
-	     pull_constant_loc[inst->src[i].reg] == -1)
+             pull_constant_loc[inst->src[i].nr] == -1)
 	    continue;
 
-	 int uniform = inst->src[i].reg;
+         int uniform = inst->src[i].nr;
 
 	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
 
@@ -832,7 +802,7 @@ vec4_visitor::move_push_constants_to_pull_constants()
 				 pull_constant_loc[uniform]);
 
 	 inst->src[i].file = temp.file;
-	 inst->src[i].reg = temp.reg;
+         inst->src[i].nr = temp.nr;
 	 inst->src[i].reg_offset = temp.reg_offset;
 	 inst->src[i].reladdr = NULL;
       }
@@ -924,10 +894,10 @@ vec4_visitor::opt_set_dependency_control()
           * on, don't do dependency control across the read.
           */
          for (int i = 0; i < 3; i++) {
-            int reg = inst->src[i].reg + inst->src[i].reg_offset;
-            if (inst->src[i].file == GRF) {
+            int reg = inst->src[i].nr + inst->src[i].reg_offset;
+            if (inst->src[i].file == VGRF) {
                last_grf_write[reg] = NULL;
-            } else if (inst->src[i].file == HW_REG) {
+            } else if (inst->src[i].file == FIXED_GRF) {
                memset(last_grf_write, 0, sizeof(last_grf_write));
                break;
             }
@@ -943,8 +913,8 @@ vec4_visitor::opt_set_dependency_control()
          /* Now, see if we can do dependency control for this instruction
           * against a previous one writing to its destination.
           */
-         int reg = inst->dst.reg + inst->dst.reg_offset;
-         if (inst->dst.file == GRF) {
+         int reg = inst->dst.nr + inst->dst.reg_offset;
+         if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
             if (last_grf_write[reg] &&
                 !(inst->dst.writemask & grf_channels_written[reg])) {
                last_grf_write[reg]->no_dd_clear = true;
@@ -966,11 +936,6 @@ vec4_visitor::opt_set_dependency_control()
 
             last_mrf_write[reg] = inst;
             mrf_channels_written[reg] |= inst->dst.writemask;
-         } else if (inst->dst.reg == HW_REG) {
-            if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE)
-               memset(last_grf_write, 0, sizeof(last_grf_write));
-            if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE)
-               memset(last_mrf_write, 0, sizeof(last_mrf_write));
          }
       }
    }
@@ -998,11 +963,8 @@ vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo,
    if (mlen > 0)
       return false;
 
-   /* We can't use swizzles on the accumulator and that's really the only
-    * HW_REG we would care to reswizzle so just disallow them all.
-    */
    for (int i = 0; i < 3; i++) {
-      if (src[i].file == HW_REG)
+      if (src[i].is_accumulator())
          return false;
    }
 
@@ -1058,16 +1020,16 @@ vec4_visitor::opt_register_coalesce()
       next_ip++;
 
       if (inst->opcode != BRW_OPCODE_MOV ||
-          (inst->dst.file != GRF && inst->dst.file != MRF) ||
+          (inst->dst.file != VGRF && inst->dst.file != MRF) ||
 	  inst->predicate ||
-	  inst->src[0].file != GRF ||
+	  inst->src[0].file != VGRF ||
 	  inst->dst.type != inst->src[0].type ||
 	  inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
 	 continue;
 
       /* Remove no-op MOVs */
       if (inst->dst.file == inst->src[0].file &&
-          inst->dst.reg == inst->src[0].reg &&
+          inst->dst.nr == inst->src[0].nr &&
           inst->dst.reg_offset == inst->src[0].reg_offset) {
          bool is_nop_mov = true;
 
@@ -1123,7 +1085,7 @@ vec4_visitor::opt_register_coalesce()
 
                if (devinfo->gen == 6) {
                   /* gen6 math instructions must have the destination be
-                   * GRF, so no compute-to-MRF for them.
+                   * VGRF, so no compute-to-MRF for them.
                    */
                   if (scan_inst->is_math()) {
                      break;
@@ -1188,8 +1150,8 @@ vec4_visitor::opt_register_coalesce()
           * in the register instead.
           */
          if (to_mrf && scan_inst->mlen > 0) {
-            if (inst->dst.reg >= scan_inst->base_mrf &&
-                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
+            if (inst->dst.nr >= scan_inst->base_mrf &&
+                inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
                break;
             }
          } else {
@@ -1211,13 +1173,13 @@ vec4_visitor::opt_register_coalesce()
 	  */
          vec4_instruction *scan_inst = _scan_inst;
 	 while (scan_inst != inst) {
-	    if (scan_inst->dst.file == GRF &&
-		scan_inst->dst.reg == inst->src[0].reg &&
+	    if (scan_inst->dst.file == VGRF &&
+                scan_inst->dst.nr == inst->src[0].nr &&
 		scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
                scan_inst->reswizzle(inst->dst.writemask,
                                     inst->src[0].swizzle);
 	       scan_inst->dst.file = inst->dst.file;
-	       scan_inst->dst.reg = inst->dst.reg;
+               scan_inst->dst.nr = inst->dst.nr;
 	       scan_inst->dst.reg_offset = inst->dst.reg_offset;
                if (inst->saturate &&
                    inst->dst.type != scan_inst->dst.type) {
@@ -1314,12 +1276,12 @@ vec4_visitor::split_virtual_grfs()
     * to split.
     */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      if (inst->dst.file == GRF && inst->regs_written > 1)
-         split_grf[inst->dst.reg] = false;
+      if (inst->dst.file == VGRF && inst->regs_written > 1)
+         split_grf[inst->dst.nr] = false;
 
       for (int i = 0; i < 3; i++) {
-         if (inst->src[i].file == GRF && inst->regs_read(i) > 1)
-            split_grf[inst->src[i].reg] = false;
+         if (inst->src[i].file == VGRF && inst->regs_read(i) > 1)
+            split_grf[inst->src[i].nr] = false;
       }
    }
 
@@ -1340,16 +1302,16 @@ vec4_visitor::split_virtual_grfs()
    }
 
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
+      if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
           inst->dst.reg_offset != 0) {
-         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
+         inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
                           inst->dst.reg_offset - 1);
          inst->dst.reg_offset = 0;
       }
       for (int i = 0; i < 3; i++) {
-         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
+         if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
              inst->src[i].reg_offset != 0) {
-            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
+            inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
                                 inst->src[i].reg_offset - 1);
             inst->src[i].reg_offset = 0;
          }
@@ -1391,38 +1353,35 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    fprintf(file, " ");
 
    switch (inst->dst.file) {
-   case GRF:
-      fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
+   case VGRF:
+      fprintf(file, "vgrf%d.%d", inst->dst.nr, inst->dst.reg_offset);
+      break;
+   case FIXED_GRF:
+      fprintf(file, "g%d", inst->dst.nr);
       break;
    case MRF:
-      fprintf(file, "m%d", inst->dst.reg);
+      fprintf(file, "m%d", inst->dst.nr);
       break;
-   case HW_REG:
-      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
-         switch (inst->dst.fixed_hw_reg.nr) {
-         case BRW_ARF_NULL:
-            fprintf(file, "null");
-            break;
-         case BRW_ARF_ADDRESS:
-            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
-            break;
-         case BRW_ARF_ACCUMULATOR:
-            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
-            break;
-         case BRW_ARF_FLAG:
-            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
-                             inst->dst.fixed_hw_reg.subnr);
-            break;
-         default:
-            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
-                               inst->dst.fixed_hw_reg.subnr);
-            break;
-         }
-      } else {
-         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
+   case ARF:
+      switch (inst->dst.nr) {
+      case BRW_ARF_NULL:
+         fprintf(file, "null");
+         break;
+      case BRW_ARF_ADDRESS:
+         fprintf(file, "a0.%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_ACCUMULATOR:
+         fprintf(file, "acc%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_FLAG:
+         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
+      default:
+         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
       }
-      if (inst->dst.fixed_hw_reg.subnr)
-         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
+      if (inst->dst.subnr)
+         fprintf(file, "+%d", inst->dst.subnr);
       break;
    case BAD_FILE:
       fprintf(file, "(null)");
@@ -1454,70 +1413,61 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       if (inst->src[i].abs)
          fprintf(file, "|");
       switch (inst->src[i].file) {
-      case GRF:
-         fprintf(file, "vgrf%d", inst->src[i].reg);
+      case VGRF:
+         fprintf(file, "vgrf%d", inst->src[i].nr);
+         break;
+      case FIXED_GRF:
+         fprintf(file, "g%d", inst->src[i].nr);
          break;
       case ATTR:
-         fprintf(file, "attr%d", inst->src[i].reg);
+         fprintf(file, "attr%d", inst->src[i].nr);
          break;
       case UNIFORM:
-         fprintf(file, "u%d", inst->src[i].reg);
+         fprintf(file, "u%d", inst->src[i].nr);
          break;
       case IMM:
          switch (inst->src[i].type) {
          case BRW_REGISTER_TYPE_F:
-            fprintf(file, "%fF", inst->src[i].fixed_hw_reg.dw1.f);
+            fprintf(file, "%fF", inst->src[i].f);
             break;
          case BRW_REGISTER_TYPE_D:
-            fprintf(file, "%dD", inst->src[i].fixed_hw_reg.dw1.d);
+            fprintf(file, "%dD", inst->src[i].d);
             break;
          case BRW_REGISTER_TYPE_UD:
-            fprintf(file, "%uU", inst->src[i].fixed_hw_reg.dw1.ud);
+            fprintf(file, "%uU", inst->src[i].ud);
             break;
          case BRW_REGISTER_TYPE_VF:
             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
+                    brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
             break;
          default:
             fprintf(file, "???");
             break;
          }
          break;
-      case HW_REG:
-         if (inst->src[i].fixed_hw_reg.negate)
-            fprintf(file, "-");
-         if (inst->src[i].fixed_hw_reg.abs)
-            fprintf(file, "|");
-         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
-            switch (inst->src[i].fixed_hw_reg.nr) {
-            case BRW_ARF_NULL:
-               fprintf(file, "null");
-               break;
-            case BRW_ARF_ADDRESS:
-               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
-               break;
-            case BRW_ARF_ACCUMULATOR:
-               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
-               break;
-            case BRW_ARF_FLAG:
-               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
-                                inst->src[i].fixed_hw_reg.subnr);
-               break;
-            default:
-               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
-                                  inst->src[i].fixed_hw_reg.subnr);
-               break;
-            }
-         } else {
-            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
+      case ARF:
+         switch (inst->src[i].nr) {
+         case BRW_ARF_NULL:
+            fprintf(file, "null");
+            break;
+         case BRW_ARF_ADDRESS:
+            fprintf(file, "a0.%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_ACCUMULATOR:
+            fprintf(file, "acc%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_FLAG:
+            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
+         default:
+            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
          }
-         if (inst->src[i].fixed_hw_reg.subnr)
-            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
-         if (inst->src[i].fixed_hw_reg.abs)
-            fprintf(file, "|");
+         if (inst->src[i].subnr)
+            fprintf(file, "+%d", inst->src[i].subnr);
          break;
       case BAD_FILE:
          fprintf(file, "(null)");
@@ -1528,8 +1478,8 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
 
       /* Don't print .0; and only VGRFs have reg_offsets and sizes */
       if (inst->src[i].reg_offset != 0 &&
-          inst->src[i].file == GRF &&
-          alloc.sizes[inst->src[i].reg] != 1)
+          inst->src[i].file == VGRF &&
+          alloc.sizes[inst->src[i].nr] != 1)
          fprintf(file, ".%d", inst->src[i].reg_offset);
 
       if (inst->src[i].file != IMM) {
@@ -1551,6 +1501,9 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          fprintf(file, ", ");
    }
 
+   if (inst->force_writemask_all)
+      fprintf(file, " NoMask");
+
    fprintf(file, "\n");
 }
 
@@ -1584,7 +1537,7 @@ vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       /* We have to support ATTR as a destination for GL_FIXED fixup. */
       if (inst->dst.file == ATTR) {
-	 int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];
+         int grf = attribute_map[inst->dst.nr + inst->dst.reg_offset];
 
          /* All attributes used in the shader need to have been assigned a
           * hardware register by the caller
@@ -1593,17 +1546,16 @@ vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
 
 	 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
 	 reg.type = inst->dst.type;
-	 reg.dw1.bits.writemask = inst->dst.writemask;
+	 reg.writemask = inst->dst.writemask;
 
-	 inst->dst.file = HW_REG;
-	 inst->dst.fixed_hw_reg = reg;
+         inst->dst = reg;
       }
 
       for (int i = 0; i < 3; i++) {
 	 if (inst->src[i].file != ATTR)
 	    continue;
 
-	 int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];
+         int grf = attribute_map[inst->src[i].nr + inst->src[i].reg_offset];
 
          /* All attributes used in the shader need to have been assigned a
           * hardware register by the caller
@@ -1611,15 +1563,14 @@ vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
          assert(grf != 0);
 
 	 struct brw_reg reg = attribute_to_hw_reg(grf, interleaved);
-	 reg.dw1.bits.swizzle = inst->src[i].swizzle;
+	 reg.swizzle = inst->src[i].swizzle;
          reg.type = inst->src[i].type;
 	 if (inst->src[i].abs)
 	    reg = brw_abs(reg);
 	 if (inst->src[i].negate)
 	    reg = negate(reg);
 
-	 inst->src[i].file = HW_REG;
-	 inst->src[i].fixed_hw_reg = reg;
+         inst->src[i] = reg;
       }
    }
 }
@@ -1803,26 +1754,26 @@ vec4_visitor::convert_to_hw_regs()
          struct src_reg &src = inst->src[i];
          struct brw_reg reg;
          switch (src.file) {
-         case GRF:
-            reg = brw_vec8_grf(src.reg + src.reg_offset, 0);
+         case VGRF:
+            reg = brw_vec8_grf(src.nr + src.reg_offset, 0);
             reg.type = src.type;
-            reg.dw1.bits.swizzle = src.swizzle;
+            reg.swizzle = src.swizzle;
             reg.abs = src.abs;
             reg.negate = src.negate;
             break;
 
          case IMM:
             reg = brw_imm_reg(src.type);
-            reg.dw1.ud = src.fixed_hw_reg.dw1.ud;
+            reg.ud = src.ud;
             break;
 
          case UNIFORM:
             reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
-                                      (src.reg + src.reg_offset) / 2,
-                                      ((src.reg + src.reg_offset) % 2) * 4),
+                                      (src.nr + src.reg_offset) / 2,
+                                      ((src.nr + src.reg_offset) % 2) * 4),
                          0, 4, 1);
             reg.type = src.type;
-            reg.dw1.bits.swizzle = src.swizzle;
+            reg.swizzle = src.swizzle;
             reg.abs = src.abs;
             reg.negate = src.negate;
 
@@ -1830,8 +1781,8 @@ vec4_visitor::convert_to_hw_regs()
             assert(!src.reladdr);
             break;
 
-         case HW_REG:
-            assert(src.type == src.fixed_hw_reg.type);
+         case ARF:
+         case FIXED_GRF:
             continue;
 
          case BAD_FILE:
@@ -1843,29 +1794,29 @@ vec4_visitor::convert_to_hw_regs()
          case ATTR:
             unreachable("not reached");
          }
-         src.fixed_hw_reg = reg;
+         src = reg;
       }
 
       dst_reg &dst = inst->dst;
       struct brw_reg reg;
 
       switch (inst->dst.file) {
-      case GRF:
-         reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
+      case VGRF:
+         reg = brw_vec8_grf(dst.nr + dst.reg_offset, 0);
          reg.type = dst.type;
-         reg.dw1.bits.writemask = dst.writemask;
+         reg.writemask = dst.writemask;
          break;
 
       case MRF:
-         assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) < BRW_MAX_MRF(devinfo->gen));
-         reg = brw_message_reg(dst.reg + dst.reg_offset);
+         assert(((dst.nr + dst.reg_offset) & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
+         reg = brw_message_reg(dst.nr + dst.reg_offset);
          reg.type = dst.type;
-         reg.dw1.bits.writemask = dst.writemask;
+         reg.writemask = dst.writemask;
          break;
 
-      case HW_REG:
-         assert(dst.type == dst.fixed_hw_reg.type);
-         reg = dst.fixed_hw_reg;
+      case ARF:
+      case FIXED_GRF:
+         reg = dst;
          break;
 
       case BAD_FILE:
@@ -1878,7 +1829,7 @@ vec4_visitor::convert_to_hw_regs()
          unreachable("not reached");
       }
 
-      dst.fixed_hw_reg = reg;
+      dst = reg;
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_builder.h b/src/mesa/drivers/dri/i965/brw_vec4_builder.h
index a90cadb77db..a76a4ce4639 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_builder.h
@@ -144,7 +144,7 @@ namespace brw {
          assert(dispatch_width() <= 32);
 
          if (n > 0)
-            return retype(dst_reg(GRF, shader->alloc.allocate(
+            return retype(dst_reg(VGRF, shader->alloc.allocate(
                                      n * DIV_ROUND_UP(type_sz(type), 4))),
                            type);
          else
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
index 329f24269ce..7aa8f5d9b8f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
@@ -48,7 +48,7 @@ opt_cmod_propagation_local(bblock_t *block)
            inst->opcode != BRW_OPCODE_MOV) ||
           inst->predicate != BRW_PREDICATE_NONE ||
           !inst->dst.is_null() ||
-          inst->src[0].file != GRF ||
+          inst->src[0].file != VGRF ||
           inst->src[0].abs)
          continue;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
index db99ecba35a..3b76e36a803 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp
@@ -47,7 +47,7 @@ is_direct_copy(vec4_instruction *inst)
 {
    return (inst->opcode == BRW_OPCODE_MOV &&
 	   !inst->predicate &&
-	   inst->dst.file == GRF &&
+	   inst->dst.file == VGRF &&
 	   !inst->dst.reladdr &&
 	   !inst->src[0].reladdr &&
 	   (inst->dst.type == inst->src[0].type ||
@@ -70,8 +70,8 @@ is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
    const src_reg *src = values[ch];
 
    /* consider GRF only */
-   assert(inst->dst.file == GRF);
-   if (!src || src->file != GRF)
+   assert(inst->dst.file == VGRF);
+   if (!src || src->file != VGRF)
       return false;
 
    return (src->in_range(inst->dst, inst->regs_written) &&
@@ -134,21 +134,20 @@ try_constant_propagate(const struct brw_device_info *devinfo,
 
    if (inst->src[arg].abs) {
       if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
-          !brw_abs_immediate(value.type, &value.fixed_hw_reg)) {
+          !brw_abs_immediate(value.type, &value)) {
          return false;
       }
    }
 
    if (inst->src[arg].negate) {
       if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
-          !brw_negate_immediate(value.type, &value.fixed_hw_reg)) {
+          !brw_negate_immediate(value.type, &value)) {
          return false;
       }
    }
 
    if (value.type == BRW_REGISTER_TYPE_VF)
-      value.fixed_hw_reg.dw1.ud = swizzle_vf_imm(value.fixed_hw_reg.dw1.ud,
-                                                 inst->src[arg].swizzle);
+      value.ud = swizzle_vf_imm(value.ud, inst->src[arg].swizzle);
 
    switch (inst->opcode) {
    case BRW_OPCODE_MOV:
@@ -272,7 +271,7 @@ try_copy_propagate(const struct brw_device_info *devinfo,
    for (int i = 1; i < 4; i++) {
       /* This is equals() except we don't care about the swizzle. */
       if (value.file != entry->value[i]->file ||
-	  value.reg != entry->value[i]->reg ||
+          value.nr != entry->value[i]->nr ||
 	  value.reg_offset != entry->value[i]->reg_offset ||
 	  value.type != entry->value[i]->type ||
 	  value.negate != entry->value[i]->negate ||
@@ -293,7 +292,7 @@ try_copy_propagate(const struct brw_device_info *devinfo,
 
    /* Check that we can propagate that value */
    if (value.file != UNIFORM &&
-       value.file != GRF &&
+       value.file != VGRF &&
        value.file != ATTR)
       return false;
 
@@ -359,8 +358,8 @@ try_copy_propagate(const struct brw_device_info *devinfo,
              inst->src[0].type != BRW_REGISTER_TYPE_F ||
              inst->src[1].file != IMM ||
              inst->src[1].type != BRW_REGISTER_TYPE_F ||
-             inst->src[1].fixed_hw_reg.dw1.f < 0.0 ||
-             inst->src[1].fixed_hw_reg.dw1.f > 1.0) {
+             inst->src[1].f < 0.0 ||
+             inst->src[1].f > 1.0) {
             return false;
          }
          if (!inst->saturate)
@@ -417,14 +416,14 @@ vec4_visitor::opt_copy_propagation(bool do_constant_prop)
       }
 
       /* For each source arg, see if each component comes from a copy
-       * from the same type file (IMM, GRF, UNIFORM), and try
+       * from the same type file (IMM, VGRF, UNIFORM), and try
        * optimizing out access to the copy result
        */
       for (int i = 2; i >= 0; i--) {
 	 /* Copied values end up in GRFs, and we don't track reladdr
 	  * accesses.
 	  */
-	 if (inst->src[i].file != GRF ||
+	 if (inst->src[i].file != VGRF ||
 	     inst->src[i].reladdr)
 	    continue;
 
@@ -432,7 +431,7 @@ vec4_visitor::opt_copy_propagation(bool do_constant_prop)
          if (inst->regs_read(i) != 1)
             continue;
 
-	 int reg = (alloc.offsets[inst->src[i].reg] +
+         int reg = (alloc.offsets[inst->src[i].nr] +
 		    inst->src[i].reg_offset);
 
 	 /* Find the regs that each swizzle component came from.
@@ -473,9 +472,9 @@ vec4_visitor::opt_copy_propagation(bool do_constant_prop)
       }
 
       /* Track available source registers. */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
 	 const int reg =
-	    alloc.offsets[inst->dst.reg] + inst->dst.reg_offset;
+            alloc.offsets[inst->dst.nr] + inst->dst.reg_offset;
 
 	 /* Update our destination's current channel values.  For a direct copy,
 	  * the value is the newly propagated source.  Otherwise, we don't know
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 5a277f74c44..85cbf24092e 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -143,7 +143,8 @@ vec4_visitor::opt_cse_local(bblock_t *block)
    foreach_inst_in_block (vec4_instruction, inst, block) {
       /* Skip some cases. */
       if (is_expression(inst) && !inst->predicate && inst->mlen == 0 &&
-          (inst->dst.file != HW_REG || inst->dst.is_null()))
+          ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
+           inst->dst.is_null()))
       {
          bool found = false;
 
@@ -174,7 +175,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
              */
             bool no_existing_temp = entry->tmp.file == BAD_FILE;
             if (no_existing_temp && !entry->generator->dst.is_null()) {
-               entry->tmp = retype(src_reg(GRF, alloc.allocate(
+               entry->tmp = retype(src_reg(VGRF, alloc.allocate(
                                               entry->generator->regs_written),
                                            NULL), inst->dst.type);
 
@@ -233,7 +234,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
              * overwrote.
              */
             if (inst->dst.file == entry->generator->src[i].file &&
-                inst->dst.reg == entry->generator->src[i].reg) {
+                inst->dst.nr == entry->generator->src[i].nr) {
                entry->remove();
                ralloc_free(entry);
                break;
@@ -242,7 +243,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
             /* Kill any AEB entries using registers that don't get reused any
              * more -- a sure sign they'll fail operands_match().
              */
-            if (src->file == GRF) {
+            if (src->file == VGRF) {
                if (var_range_end(var_from_reg(alloc, *src), 4) < ip) {
                   entry->remove();
                   ralloc_free(entry);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
index 284e0a8d0a5..58aed810fcd 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
@@ -78,11 +78,11 @@ vec4_visitor::dead_code_eliminate()
              sizeof(BITSET_WORD));
 
       foreach_inst_in_block_reverse(vec4_instruction, inst, block) {
-         if ((inst->dst.file == GRF && !inst->has_side_effects()) ||
+         if ((inst->dst.file == VGRF && !inst->has_side_effects()) ||
              (inst->dst.is_null() && inst->writes_flag())){
             bool result_live[4] = { false };
 
-            if (inst->dst.file == GRF) {
+            if (inst->dst.file == VGRF) {
                for (unsigned i = 0; i < inst->regs_written; i++) {
                   for (int c = 0; c < 4; c++)
                      result_live[c] |= BITSET_TEST(
@@ -134,7 +134,7 @@ vec4_visitor::dead_code_eliminate()
             }
          }
 
-         if (inst->dst.file == GRF && !inst->predicate) {
+         if (inst->dst.file == VGRF && !inst->predicate) {
             for (unsigned i = 0; i < inst->regs_written; i++) {
                for (int c = 0; c < 4; c++) {
                   if (inst->dst.writemask & (1 << c)) {
@@ -145,13 +145,13 @@ vec4_visitor::dead_code_eliminate()
             }
          }
 
-         if (inst->writes_flag()) {
+         if (inst->writes_flag() && !inst->predicate) {
             for (unsigned c = 0; c < 4; c++)
                BITSET_CLEAR(flag_live, c);
          }
 
          for (int i = 0; i < 3; i++) {
-            if (inst->src[i].file == GRF) {
+            if (inst->src[i].file == VGRF) {
                for (unsigned j = 0; j < inst->regs_read(i); j++) {
                   for (int c = 0; c < 4; c++) {
                      BITSET_SET(live, var_from_reg(alloc,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 8bc21df5ffc..20107ac2054 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -46,7 +46,7 @@ check_gen6_math_src_arg(struct brw_reg src)
    /* Source swizzles are ignored. */
    assert(!src.abs);
    assert(!src.negate);
-   assert(src.dw1.bits.swizzle == BRW_SWIZZLE_XYZW);
+   assert(src.swizzle == BRW_SWIZZLE_XYZW);
 }
 
 static void
@@ -57,7 +57,7 @@ generate_math_gen6(struct brw_codegen *p,
                    struct brw_reg src1)
 {
    /* Can't do writemask because math can't be align16. */
-   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
+   assert(dst.writemask == WRITEMASK_XYZW);
    /* Source swizzles are ignored. */
    check_gen6_math_src_arg(src0);
    if (src1.file == BRW_GENERAL_REGISTER_FILE)
@@ -135,6 +135,10 @@ generate_tex(struct brw_codegen *p,
       case SHADER_OPCODE_TXF:
 	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
 	 break;
+      case SHADER_OPCODE_TXF_CMS_W:
+         assert(devinfo->gen >= 9);
+         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
+         break;
       case SHADER_OPCODE_TXF_CMS:
          if (devinfo->gen >= 7)
             msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
@@ -260,7 +264,7 @@ generate_tex(struct brw_codegen *p,
          : prog_data->base.binding_table.texture_start;
 
    if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
-      uint32_t sampler = sampler_index.dw1.ud;
+      uint32_t sampler = sampler_index.ud;
 
       brw_SAMPLE(p,
                  dst,
@@ -352,7 +356,7 @@ generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
 
    /* We pass the temporary passed in src0 as the writeback register */
    brw_urb_WRITE(p,
-                 inst->src[0].fixed_hw_reg, /* dest */
+                 inst->src[0], /* dest */
                  inst->base_mrf, /* starting mrf reg nr */
                  src,
                  BRW_URB_WRITE_ALLOCATE_COMPLETE,
@@ -365,8 +369,8 @@ generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
    brw_push_insn_state(p);
    brw_set_default_access_mode(p, BRW_ALIGN_1);
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-   brw_MOV(p, get_element_ud(inst->dst.fixed_hw_reg, 0),
-           get_element_ud(inst->src[0].fixed_hw_reg, 0));
+   brw_MOV(p, get_element_ud(inst->dst, 0),
+           get_element_ud(inst->src[0], 0));
    brw_pop_insn_state(p);
 }
 
@@ -415,10 +419,10 @@ generate_gs_set_write_offset(struct brw_codegen *p,
    assert(p->devinfo->gen >= 7 &&
           src1.file == BRW_IMMEDIATE_VALUE &&
           src1.type == BRW_REGISTER_TYPE_UD &&
-          src1.dw1.ud <= USHRT_MAX);
+          src1.ud <= USHRT_MAX);
    if (src0.file == BRW_IMMEDIATE_VALUE) {
       brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
-              brw_imm_ud(src0.dw1.ud * src1.dw1.ud));
+              brw_imm_ud(src0.ud * src1.ud));
    } else {
       brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
               retype(src1, BRW_REGISTER_TYPE_UW));
@@ -736,7 +740,7 @@ generate_oword_dual_block_offsets(struct brw_codegen *p,
    brw_MOV(p, m1_0, index_0);
 
    if (index.file == BRW_IMMEDIATE_VALUE) {
-      index_4.dw1.ud += second_vertex_offset;
+      index_4.ud += second_vertex_offset;
       brw_MOV(p, m1_4, index_4);
    } else {
       brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
@@ -891,7 +895,7 @@ generate_pull_constant_load(struct brw_codegen *p,
    const struct brw_device_info *devinfo = p->devinfo;
    assert(index.file == BRW_IMMEDIATE_VALUE &&
 	  index.type == BRW_REGISTER_TYPE_UD);
-   uint32_t surf_index = index.dw1.ud;
+   uint32_t surf_index = index.ud;
 
    struct brw_reg header = brw_vec8_grf(0, 0);
 
@@ -925,8 +929,6 @@ generate_pull_constant_load(struct brw_codegen *p,
 			   2, /* mlen */
                            true, /* header_present */
 			   1 /* rlen */);
-
-   brw_mark_surface_used(&prog_data->base, surf_index);
 }
 
 static void
@@ -945,7 +947,7 @@ generate_get_buffer_size(struct brw_codegen *p,
               dst,
               inst->base_mrf,
               src,
-              surf_index.dw1.ud,
+              surf_index.ud,
               0,
               GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
               1, /* response length */
@@ -954,7 +956,7 @@ generate_get_buffer_size(struct brw_codegen *p,
               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
               BRW_SAMPLER_RETURN_FORMAT_SINT32);
 
-   brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
+   brw_mark_surface_used(&prog_data->base, surf_index.ud);
 }
 
 static void
@@ -973,7 +975,7 @@ generate_pull_constant_load_gen7(struct brw_codegen *p,
       brw_set_dest(p, insn, dst);
       brw_set_src0(p, insn, offset);
       brw_set_sampler_message(p, insn,
-                              surf_index.dw1.ud,
+                              surf_index.ud,
                               0, /* LD message ignores sampler unit */
                               GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                               1, /* rlen */
@@ -982,7 +984,7 @@ generate_pull_constant_load_gen7(struct brw_codegen *p,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
 
-      brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
+      brw_mark_surface_used(&prog_data->base, surf_index.ud);
 
    } else {
 
@@ -1013,10 +1015,6 @@ generate_pull_constant_load_gen7(struct brw_codegen *p,
                               inst->header_size != 0,
                               BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                               0);
-
-      /* visitor knows more than we do about the surface limit required,
-       * so has already done marking.
-       */
    }
 }
 
@@ -1061,9 +1059,9 @@ generate_code(struct brw_codegen *p,
          annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
 
       for (unsigned int i = 0; i < 3; i++) {
-	 src[i] = inst->src[i].fixed_hw_reg;
+         src[i] = inst->src[i];
       }
-      dst = inst->dst.fixed_hw_reg;
+      dst = inst->dst;
 
       brw_set_default_predicate_control(p, inst->predicate);
       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
@@ -1243,7 +1241,7 @@ generate_code(struct brw_codegen *p,
          break;
 
       case BRW_OPCODE_IF:
-         if (inst->src[0].file != BAD_FILE) {
+         if (!inst->src[0].is_null()) {
             /* The instruction has an embedded compare (only allowed on gen6) */
             assert(devinfo->gen == 6);
             gen6_IF(p, inst->conditional_mod, src[0], src[1]);
@@ -1313,6 +1311,7 @@ generate_code(struct brw_codegen *p,
       case SHADER_OPCODE_TXD:
       case SHADER_OPCODE_TXF:
       case SHADER_OPCODE_TXF_CMS:
+      case SHADER_OPCODE_TXF_CMS_W:
       case SHADER_OPCODE_TXF_MCS:
       case SHADER_OPCODE_TXL:
       case SHADER_OPCODE_TXS:
@@ -1416,38 +1415,38 @@ generate_code(struct brw_codegen *p,
 
       case SHADER_OPCODE_UNTYPED_ATOMIC:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen,
+         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                             !inst->dst.is_null());
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
-                                  src[2].dw1.ud);
+                                  src[2].ud);
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
-                                   src[2].dw1.ud);
+                                   src[2].ud);
          break;
 
       case SHADER_OPCODE_TYPED_ATOMIC:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
-         brw_typed_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen,
+         brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
                           !inst->dst.is_null());
          break;
 
       case SHADER_OPCODE_TYPED_SURFACE_READ:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
-                                src[2].dw1.ud);
+                                src[2].ud);
          break;
 
       case SHADER_OPCODE_TYPED_SURFACE_WRITE:
          assert(src[2].file == BRW_IMMEDIATE_VALUE);
          brw_typed_surface_write(p, src[0], src[1], inst->mlen,
-                                 src[2].dw1.ud);
+                                 src[2].ud);
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
@@ -1495,9 +1494,9 @@ generate_code(struct brw_codegen *p,
           *
           * where they pack the four bytes from the low and high four DW.
           */
-         assert(_mesa_is_pow_two(dst.dw1.bits.writemask) &&
-                dst.dw1.bits.writemask != 0);
-         unsigned offset = __builtin_ctz(dst.dw1.bits.writemask);
+         assert(_mesa_is_pow_two(dst.writemask) &&
+                dst.writemask != 0);
+         unsigned offset = __builtin_ctz(dst.writemask);
 
          dst.type = BRW_REGISTER_TYPE_UB;
 
@@ -1549,6 +1548,13 @@ generate_code(struct brw_codegen *p,
    brw_set_uip_jip(p);
    annotation_finalize(&annotation, p->next_insn_offset);
 
+#ifndef NDEBUG
+   bool validated = brw_validate_instructions(p, 0, &annotation);
+#else
+   if (unlikely(debug_flag))
+      brw_validate_instructions(p, 0, &annotation);
+#endif
+
    int before_size = p->next_insn_offset;
    brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann);
    int after_size = p->next_insn_offset;
@@ -1566,8 +1572,9 @@ generate_code(struct brw_codegen *p,
 
       dump_assembly(p->store, annotation.ann_count, annotation.ann,
                     p->devinfo);
-      ralloc_free(annotation.ann);
+      ralloc_free(annotation.mem_ctx);
    }
+   assert(validated);
 
    compiler->shader_debug_log(log_data,
                               "%s vec4 shader: %d inst, %d loops, %u cycles, "
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index cfb5cd95cb1..1a09f76a20c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -29,6 +29,7 @@
 
 #include "brw_vec4_gs_visitor.h"
 #include "gen6_gs_visitor.h"
+#include "brw_fs.h"
 
 namespace brw {
 
@@ -811,6 +812,36 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
    /* Now that prog_data setup is done, we are ready to actually compile the
     * program.
     */
+   if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
+      fprintf(stderr, "GS Input ");
+      brw_print_vue_map(stderr, &c.input_vue_map);
+      fprintf(stderr, "GS Output ");
+      brw_print_vue_map(stderr, &prog_data->base.vue_map);
+   }
+
+   if (compiler->scalar_gs) {
+      /* TODO: Support instanced GS.  We have basically no tests... */
+      assert(prog_data->invocations == 1);
+
+      fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader,
+                   shader_time_index);
+      if (v.run_gs()) {
+         prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+         fs_generator g(compiler, log_data, mem_ctx, &c.key,
+                        &prog_data->base.base, v.promoted_constants,
+                        false, "GS");
+         if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
+            const char *label =
+               shader->info.label ? shader->info.label : "unnamed";
+            char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s",
+                                         label, shader->info.name);
+            g.enable_debug(name);
+         }
+         g.generate_code(v.cfg, 8);
+         return g.get_assembly(final_assembly_size);
+      }
+   }
 
    if (compiler->devinfo->gen >= 7) {
       /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
index aa9a6572eee..57d5fbb75dd 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
@@ -75,7 +75,7 @@ vec4_live_variables::setup_def_use()
 
 	 /* Set use[] for this instruction */
 	 for (unsigned int i = 0; i < 3; i++) {
-	    if (inst->src[i].file == GRF) {
+	    if (inst->src[i].file == VGRF) {
                for (unsigned j = 0; j < inst->regs_read(i); j++) {
                   for (int c = 0; c < 4; c++) {
                      const unsigned v =
@@ -97,7 +97,7 @@ vec4_live_variables::setup_def_use()
 	  * are the things that screen off preceding definitions of a
 	  * variable, and thus qualify for being in def[].
 	  */
-	 if (inst->dst.file == GRF &&
+	 if (inst->dst.file == VGRF &&
 	     (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) {
             for (unsigned i = 0; i < inst->regs_written; i++) {
                for (int c = 0; c < 4; c++) {
@@ -256,7 +256,7 @@ vec4_visitor::calculate_live_intervals()
    int ip = 0;
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       for (unsigned int i = 0; i < 3; i++) {
-	 if (inst->src[i].file == GRF) {
+	 if (inst->src[i].file == VGRF) {
             for (unsigned j = 0; j < inst->regs_read(i); j++) {
                for (int c = 0; c < 4; c++) {
                   const unsigned v =
@@ -268,7 +268,7 @@ vec4_visitor::calculate_live_intervals()
 	 }
       }
 
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
          for (unsigned i = 0; i < inst->regs_written; i++) {
             for (int c = 0; c < 4; c++) {
                if (inst->dst.writemask & (1 << c)) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h
index e7929ec2189..12d281eb245 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h
@@ -82,9 +82,9 @@ inline unsigned
 var_from_reg(const simple_allocator &alloc, const src_reg &reg,
              unsigned c = 0)
 {
-   assert(reg.file == GRF && reg.reg < alloc.count &&
-          reg.reg_offset < alloc.sizes[reg.reg] && c < 4);
-   return (4 * (alloc.offsets[reg.reg] + reg.reg_offset) +
+   assert(reg.file == VGRF && reg.nr < alloc.count &&
+          reg.reg_offset < alloc.sizes[reg.nr] && c < 4);
+   return (4 * (alloc.offsets[reg.nr] + reg.reg_offset) +
            BRW_GET_SWZ(reg.swizzle, c));
 }
 
@@ -92,9 +92,9 @@ inline unsigned
 var_from_reg(const simple_allocator &alloc, const dst_reg &reg,
              unsigned c = 0)
 {
-   assert(reg.file == GRF && reg.reg < alloc.count &&
-          reg.reg_offset < alloc.sizes[reg.reg] && c < 4);
-   return 4 * (alloc.offsets[reg.reg] + reg.reg_offset) + c;
+   assert(reg.file == VGRF && reg.nr < alloc.count &&
+          reg.reg_offset < alloc.sizes[reg.nr] && c < 4);
+   return 4 * (alloc.offsets[reg.nr] + reg.reg_offset) + c;
 }
 
 } /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 1fb1773f856..258dd4f6548 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -106,6 +106,9 @@ void
 vec4_visitor::nir_setup_system_values()
 {
    nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX);
+   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
+      nir_system_values[i] = dst_reg();
+   }
 
    nir_foreach_overload(nir, overload) {
       assert(strcmp(overload->function->name, "main") == 0);
@@ -118,6 +121,9 @@ void
 vec4_visitor::nir_setup_inputs()
 {
    nir_inputs = ralloc_array(mem_ctx, src_reg, nir->num_inputs);
+   for (unsigned i = 0; i < nir->num_inputs; i++) {
+      nir_inputs[i] = dst_reg();
+   }
 
    nir_foreach_variable(var, &nir->inputs) {
       int offset = var->data.driver_location;
@@ -148,12 +154,15 @@ void
 vec4_visitor::nir_emit_impl(nir_function_impl *impl)
 {
    nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc);
+   for (unsigned i = 0; i < impl->reg_alloc; i++) {
+      nir_locals[i] = dst_reg();
+   }
 
    foreach_list_typed(nir_register, reg, node, &impl->registers) {
       unsigned array_elems =
          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
 
-      nir_locals[reg->index] = dst_reg(GRF, alloc.allocate(array_elems));
+      nir_locals[reg->index] = dst_reg(VGRF, alloc.allocate(array_elems));
    }
 
    nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);
@@ -282,7 +291,7 @@ dst_reg
 vec4_visitor::get_nir_dest(nir_dest dest)
 {
    if (dest.is_ssa) {
-      dst_reg dst = dst_reg(GRF, alloc.allocate(1));
+      dst_reg dst = dst_reg(VGRF, alloc.allocate(1));
       nir_ssa_values[dest.ssa.index] = dst;
       return dst;
    } else {
@@ -342,7 +351,7 @@ vec4_visitor::get_nir_src(nir_src src, unsigned num_components)
 void
 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
 {
-   dst_reg reg = dst_reg(GRF, alloc.allocate(1));
+   dst_reg reg = dst_reg(VGRF, alloc.allocate(1));
    reg.type =  BRW_REGISTER_TYPE_D;
 
    unsigned remaining = brw_writemask_for_size(instr->def.num_components);
@@ -427,15 +436,15 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
       unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
 
-      src_reg surf_index = src_reg(prog_data->base.binding_table.ssbo_start +
-                                   ssbo_index);
+      const unsigned index =
+         prog_data->base.binding_table.ssbo_start + ssbo_index;
       dst_reg result_dst = get_nir_dest(instr->dest);
       vec4_instruction *inst = new(mem_ctx)
          vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst);
 
       inst->base_mrf = 2;
       inst->mlen = 1; /* always at least one */
-      inst->src[1] = src_reg(surf_index);
+      inst->src[1] = src_reg(index);
 
       /* MRF for the first parameter */
       src_reg lod = src_reg(0);
@@ -444,6 +453,8 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod));
 
       emit(inst);
+
+      brw_mark_surface_used(&prog_data->base, index);
       break;
    }
 
@@ -749,8 +760,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          /* The block index is a constant, so just emit the binding table entry
           * as an immediate.
           */
-         surf_index = src_reg(prog_data->base.binding_table.ubo_start +
-                              const_block_index->u[0]);
+         const unsigned index = prog_data->base.binding_table.ubo_start +
+                                const_block_index->u[0];
+         surf_index = src_reg(index);
+         brw_mark_surface_used(&prog_data->base, index);
       } else {
          /* The block index is not a constant. Evaluate the index expression
           * per-channel and add the base UBO index; we have to select a value
@@ -1407,7 +1420,23 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_bcsel:
       emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
       inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
-      inst->predicate = BRW_PREDICATE_NORMAL;
+      switch (dst.writemask) {
+      case WRITEMASK_X:
+         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
+         break;
+      case WRITEMASK_Y:
+         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+         break;
+      case WRITEMASK_Z:
+         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+         break;
+      case WRITEMASK_W:
+         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
+         break;
+      default:
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         break;
+      }
       break;
 
    case nir_op_fdot_replicated2:
@@ -1708,7 +1737,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
 void
 vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr)
 {
-   nir_ssa_values[instr->def.index] = dst_reg(GRF, alloc.allocate(1));
+   nir_ssa_values[instr->def.index] = dst_reg(VGRF, alloc.allocate(1));
 }
 
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
index a49eca56118..6d27a4694d3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -34,8 +34,8 @@ namespace brw {
 static void
 assign(unsigned int *reg_hw_locations, backend_reg *reg)
 {
-   if (reg->file == GRF) {
-      reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset;
+   if (reg->file == VGRF) {
+      reg->nr = reg_hw_locations[reg->nr] + reg->reg_offset;
       reg->reg_offset = 0;
    }
 }
@@ -55,12 +55,12 @@ vec4_visitor::reg_allocate_trivial()
    }
 
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      if (inst->dst.file == GRF)
-	 virtual_grf_used[inst->dst.reg] = true;
+      if (inst->dst.file == VGRF)
+         virtual_grf_used[inst->dst.nr] = true;
 
       for (unsigned i = 0; i < 3; i++) {
-	 if (inst->src[i].file == GRF)
-	    virtual_grf_used[inst->src[i].reg] = true;
+	 if (inst->src[i].file == VGRF)
+            virtual_grf_used[inst->src[i].nr] = true;
       }
    }
 
@@ -292,12 +292,12 @@ static bool
 can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
                            unsigned scratch_reg)
 {
-   assert(inst->src[i].file == GRF);
+   assert(inst->src[i].file == VGRF);
    bool prev_inst_read_scratch_reg = false;
 
    /* See if any previous source in the same instructions reads scratch_reg */
    for (unsigned n = 0; n < i; n++) {
-      if (inst->src[n].file == GRF && inst->src[n].reg == scratch_reg)
+      if (inst->src[n].file == VGRF && inst->src[n].nr == scratch_reg)
          prev_inst_read_scratch_reg = true;
    }
 
@@ -310,7 +310,7 @@ can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
        * it if the write is not conditional and the channels we write are
        * compatible with our read mask
        */
-      if (prev_inst->dst.file == GRF && prev_inst->dst.reg == scratch_reg) {
+      if (prev_inst->dst.file == VGRF && prev_inst->dst.nr == scratch_reg) {
          return (!prev_inst->predicate || prev_inst->opcode == BRW_OPCODE_SEL) &&
                 (brw_mask_for_swizzle(inst->src[i].swizzle) &
                  ~prev_inst->dst.writemask) == 0;
@@ -329,8 +329,8 @@ can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
        */
       int n;
       for (n = 0; n < 3; n++) {
-         if (prev_inst->src[n].file == GRF &&
-             prev_inst->src[n].reg == scratch_reg) {
+         if (prev_inst->src[n].file == VGRF &&
+             prev_inst->src[n].nr == scratch_reg) {
             prev_inst_read_scratch_reg = true;
             break;
          }
@@ -374,23 +374,23 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
     */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       for (unsigned int i = 0; i < 3; i++) {
-         if (inst->src[i].file == GRF) {
+         if (inst->src[i].file == VGRF) {
             /* We will only unspill src[i] it it wasn't unspilled for the
              * previous instruction, in which case we'll just reuse the scratch
              * reg for this instruction.
              */
-            if (!can_use_scratch_for_source(inst, i, inst->src[i].reg)) {
-               spill_costs[inst->src[i].reg] += loop_scale;
+            if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) {
+               spill_costs[inst->src[i].nr] += loop_scale;
                if (inst->src[i].reladdr)
-                  no_spill[inst->src[i].reg] = true;
+                  no_spill[inst->src[i].nr] = true;
             }
          }
       }
 
-      if (inst->dst.file == GRF) {
-         spill_costs[inst->dst.reg] += loop_scale;
+      if (inst->dst.file == VGRF) {
+         spill_costs[inst->dst.nr] += loop_scale;
          if (inst->dst.reladdr)
-            no_spill[inst->dst.reg] = true;
+            no_spill[inst->dst.nr] = true;
       }
 
       switch (inst->opcode) {
@@ -406,11 +406,11 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
          for (int i = 0; i < 3; i++) {
-            if (inst->src[i].file == GRF)
-               no_spill[inst->src[i].reg] = true;
+            if (inst->src[i].file == VGRF)
+               no_spill[inst->src[i].nr] = true;
          }
-         if (inst->dst.file == GRF)
-            no_spill[inst->dst.reg] = true;
+         if (inst->dst.file == VGRF)
+            no_spill[inst->dst.nr] = true;
          break;
 
       default:
@@ -445,7 +445,7 @@ vec4_visitor::spill_reg(int spill_reg_nr)
    int scratch_reg = -1;
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
       for (unsigned int i = 0; i < 3; i++) {
-         if (inst->src[i].file == GRF && inst->src[i].reg == spill_reg_nr) {
+         if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) {
             if (scratch_reg == -1 ||
                 !can_use_scratch_for_source(inst, i, scratch_reg)) {
                /* We need to unspill anyway so make sure we read the full vec4
@@ -455,19 +455,19 @@ vec4_visitor::spill_reg(int spill_reg_nr)
                 */
                scratch_reg = alloc.allocate(1);
                src_reg temp = inst->src[i];
-               temp.reg = scratch_reg;
+               temp.nr = scratch_reg;
                temp.swizzle = BRW_SWIZZLE_XYZW;
                emit_scratch_read(block, inst,
                                  dst_reg(temp), inst->src[i], spill_offset);
             }
             assert(scratch_reg != -1);
-            inst->src[i].reg = scratch_reg;
+            inst->src[i].nr = scratch_reg;
          }
       }
 
-      if (inst->dst.file == GRF && inst->dst.reg == spill_reg_nr) {
+      if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) {
          emit_scratch_write(block, inst, spill_offset);
-         scratch_reg = inst->dst.reg;
+         scratch_reg = inst->dst.nr;
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 92b089d7ff6..70a1ea4f9d4 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -237,8 +237,6 @@ vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
     * type to match src0 so we can compact the instruction.
     */
    dst.type = src0.type;
-   if (dst.file == HW_REG)
-      dst.fixed_hw_reg.type = dst.type;
 
    resolve_ud_negate(&src0);
    resolve_ud_negate(&src1);
@@ -635,8 +633,8 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
 {
    init();
 
-   this->file = GRF;
-   this->reg = v->alloc.allocate(type_size_vec4(type));
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type));
 
    if (type->is_array() || type->is_record()) {
       this->swizzle = BRW_SWIZZLE_NOOP;
@@ -653,8 +651,8 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
 
    init();
 
-   this->file = GRF;
-   this->reg = v->alloc.allocate(type_size_vec4(type) * size);
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type) * size);
 
    this->swizzle = BRW_SWIZZLE_NOOP;
 
@@ -665,8 +663,8 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 {
    init();
 
-   this->file = GRF;
-   this->reg = v->alloc.allocate(type_size_vec4(type));
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type));
 
    if (type->is_array() || type->is_record()) {
       this->writemask = WRITEMASK_XYZW;
@@ -864,7 +862,7 @@ vec4_visitor::is_high_sampler(src_reg sampler)
    if (devinfo->gen < 8 && !devinfo->is_haswell)
       return false;
 
-   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
+   return sampler.file != IMM || sampler.ud >= 16;
 }
 
 void
@@ -901,7 +899,8 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
-   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
+   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
+                             SHADER_OPCODE_TXF_CMS); break;
    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
    case ir_tg4: opcode = offset_value.file != BAD_FILE
                          ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
@@ -993,7 +992,16 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
       } else if (op == ir_txf_ms) {
          emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                   sample_index));
-         if (devinfo->gen >= 7) {
+         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
+            /* MCS data is stored in the first two channels of ‘mcs’, but we
+             * need to get it into the .y and .z channels of the second vec4
+             * of params.
+             */
+            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
+            emit(MOV(dst_reg(MRF, param_base + 1,
+                             glsl_type::uint_type, WRITEMASK_YZ),
+                     mcs));
+         } else if (devinfo->gen >= 7) {
             /* MCS data is in the first channel of `mcs`, but we need to get it into
              * the .y channel of the second vec4 of params, so replicate .x across
              * the whole vec4 and then mask off everything except .y
@@ -1184,24 +1192,27 @@ vec4_visitor::gs_end_primitive()
 
 void
 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
-                                  dst_reg dst, src_reg offset,
+                                  dst_reg dst, src_reg surf_offset,
                                   src_reg src0, src_reg src1)
 {
-   unsigned mlen = 0;
+   unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+   src_reg src_payload(this, glsl_type::uint_type, mlen);
+   dst_reg payload(src_payload);
+   payload.writemask = WRITEMASK_X;
 
    /* Set the atomic operation offset. */
-   emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
-   mlen++;
+   emit(MOV(offset(payload, 0), surf_offset));
+   unsigned i = 1;
 
    /* Set the atomic operation arguments. */
    if (src0.file != BAD_FILE) {
-      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
-      mlen++;
+      emit(MOV(offset(payload, i), src0));
+      i++;
    }
 
    if (src1.file != BAD_FILE) {
-      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
-      mlen++;
+      emit(MOV(offset(payload, i), src1));
+      i++;
    }
 
    /* Emit the instruction.  Note that this maps to the normal SIMD8
@@ -1209,24 +1220,27 @@ vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
     * unused channels will be masked out.
     */
    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
-                                 brw_message_reg(0),
+                                 src_payload,
                                  src_reg(surf_index), src_reg(atomic_op));
    inst->mlen = mlen;
 }
 
 void
 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
-                                        src_reg offset)
+                                        src_reg surf_offset)
 {
+   dst_reg offset(this, glsl_type::uint_type);
+   offset.writemask = WRITEMASK_X;
+
    /* Set the surface read offset. */
-   emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
+   emit(MOV(offset, surf_offset));
 
    /* Emit the instruction.  Note that this maps to the normal SIMD8
     * untyped surface read message, but that's OK because unused
     * channels will be masked out.
     */
    vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
-                                 brw_message_reg(0),
+                                 src_reg(offset),
                                  src_reg(surf_index), src_reg(1));
    inst->mlen = 1;
 }
@@ -1602,7 +1616,7 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
    inst->insert_after(block, write);
 
    inst->dst.file = temp.file;
-   inst->dst.reg = temp.reg;
+   inst->dst.nr = temp.nr;
    inst->dst.reg_offset = temp.reg_offset;
    inst->dst.reladdr = NULL;
 }
@@ -1629,10 +1643,10 @@ vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                           *src.reladdr);
 
    /* Now handle scratch access on src */
-   if (src.file == GRF && scratch_loc[src.reg] != -1) {
+   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
       dst_reg temp = dst_reg(this, glsl_type::vec4_type);
-      emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
-      src.reg = temp.reg;
+      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
+      src.nr = temp.nr;
       src.reg_offset = temp.reg_offset;
       src.reladdr = NULL;
    }
@@ -1657,18 +1671,18 @@ vec4_visitor::move_grf_array_access_to_scratch()
     * scratch.
     */
    foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
-      if (inst->dst.file == GRF && inst->dst.reladdr) {
-         if (scratch_loc[inst->dst.reg] == -1) {
-            scratch_loc[inst->dst.reg] = last_scratch;
-            last_scratch += this->alloc.sizes[inst->dst.reg];
+      if (inst->dst.file == VGRF && inst->dst.reladdr) {
+         if (scratch_loc[inst->dst.nr] == -1) {
+            scratch_loc[inst->dst.nr] = last_scratch;
+            last_scratch += this->alloc.sizes[inst->dst.nr];
          }
 
          for (src_reg *iter = inst->dst.reladdr;
               iter->reladdr;
               iter = iter->reladdr) {
-            if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
-               scratch_loc[iter->reg] = last_scratch;
-               last_scratch += this->alloc.sizes[iter->reg];
+            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
+               scratch_loc[iter->nr] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->nr];
             }
          }
       }
@@ -1677,9 +1691,9 @@ vec4_visitor::move_grf_array_access_to_scratch()
          for (src_reg *iter = &inst->src[i];
               iter->reladdr;
               iter = iter->reladdr) {
-            if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
-               scratch_loc[iter->reg] = last_scratch;
-               last_scratch += this->alloc.sizes[iter->reg];
+            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
+               scratch_loc[iter->nr] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->nr];
             }
          }
       }
@@ -1705,8 +1719,8 @@ vec4_visitor::move_grf_array_access_to_scratch()
       /* Now that we have handled any (possibly recursive) reladdr scratch
        * accesses for dst we can safely do the scratch write for dst itself
        */
-      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
-         emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
+      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
+         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
 
       /* Now handle scratch access on any src. In this case, since inst->src[i]
        * already is a src_reg, we can just call emit_resolve_reladdr with
@@ -1730,14 +1744,16 @@ vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
 				      int base_offset)
 {
    int reg_offset = base_offset + orig_src.reg_offset;
-   src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
+   const unsigned index = prog_data->base.binding_table.pull_constants_start;
    src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
                                              reg_offset);
 
    emit_pull_constant_load_reg(temp,
-                               index,
+                               src_reg(index),
                                offset,
                                block, inst);
+
+   brw_mark_surface_used(&prog_data->base, index);
 }
 
 /**
@@ -1773,7 +1789,7 @@ vec4_visitor::move_uniform_array_access_to_pull_constants()
             if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
                continue;
 
-            int uniform = inst->src[i].reg;
+            int uniform = inst->src[i].nr;
 
             if (inst->src[i].reladdr->reladdr)
                nested_reladdr = true;  /* will need another pass */
@@ -1804,7 +1820,7 @@ vec4_visitor::move_uniform_array_access_to_pull_constants()
                                     pull_constant_loc[uniform]);
 
             inst->src[i].file = temp.file;
-            inst->src[i].reg = temp.reg;
+            inst->src[i].nr = temp.nr;
             inst->src[i].reg_offset = temp.reg_offset;
             inst->src[i].reladdr = NULL;
          }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 0b805b1c0c4..967448e0e41 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -159,9 +159,13 @@ brw_codegen_vs_prog(struct brw_context *brw,
       start_time = get_time();
    }
 
-   if (unlikely(INTEL_DEBUG & DEBUG_VS))
+   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
       brw_dump_ir("vertex", prog, vs ? &vs->base : NULL, &vp->program.Base);
 
+      fprintf(stderr, "VS Output ");
+      brw_print_vue_map(stderr, &prog_data.base.vue_map);
+   }
+
    int st_index = -1;
    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
       st_index = brw_get_shader_time_index(brw, prog, &vp->program.Base, ST_VS);
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
index 45662bd5afc..edb16087410 100644
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -178,3 +178,30 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
 
    vue_map->num_slots = separate ? slot + 1 : slot;
 }
+
+static const char *
+varying_name(brw_varying_slot slot)
+{
+   if (slot < VARYING_SLOT_MAX)
+      return gl_varying_slot_name(slot);
+
+   static const char *brw_names[] = {
+      [BRW_VARYING_SLOT_NDC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_NDC",
+      [BRW_VARYING_SLOT_PAD - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PAD",
+      [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC",
+   };
+
+   return brw_names[slot - VARYING_SLOT_MAX];
+}
+
+void
+brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map)
+{
+   fprintf(fp, "VUE map (%d slots, %s)\n",
+           vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO");
+   for (int i = 0; i < vue_map->num_slots; i++) {
+      fprintf(fp, "  [%d] %s\n", i,
+              varying_name(vue_map->slot_to_varying[i]));
+   }
+   fprintf(fp, "\n");
+}
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 5c49db9e63e..8d9ed3a6c33 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -212,6 +212,9 @@ brw_debug_recompile_sampler_key(struct brw_context *brw,
    found |= key_debug(brw, "compressed multisample layout",
                       old_key->compressed_multisample_layout_mask,
                       key->compressed_multisample_layout_mask);
+   found |= key_debug(brw, "16x msaa",
+                      old_key->msaa_16,
+                      key->msaa_16);
 
    for (unsigned int i = 0; i < MAX_SAMPLERS; i++) {
       found |= key_debug(brw, "textureGather workarounds",
@@ -371,6 +374,11 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx,
          if (brw->gen >= 7 &&
              intel_tex->mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) {
             key->compressed_multisample_layout_mask |= 1 << s;
+
+            if (intel_tex->mt->num_samples >= 16) {
+               assert(brw->gen >= 9);
+               key->msaa_16 |= 1 << s;
+            }
          }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index 8444c0c9bae..8eb620de56b 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -48,6 +48,9 @@ gen6_get_sample_position(struct gl_context *ctx,
    case 8:
       bits = brw_multisample_positions_8x[index >> 2] >> (8 * (index & 3));
       break;
+   case 16:
+      bits = brw_multisample_positions_16x[index >> 2] >> (8 * (index & 3));
+      break;
    default:
       unreachable("Not implemented");
    }
@@ -88,6 +91,17 @@ gen6_get_sample_position(struct gl_context *ctx,
  *           | 6 | 7 |                      | 7 | 1 |
  *           ---------                      ---------
  *
+ * 16X MSAA sample index layout  16x MSAA sample number layout
+ *         -----------------            -----------------
+ *         | 0 | 1 | 2 | 3 |            |15 |10 | 9 | 7 |
+ *         -----------------            -----------------
+ *         | 4 | 5 | 6 | 7 |            | 4 | 1 | 3 |13 |
+ *         -----------------            -----------------
+ *         | 8 | 9 |10 |11 |            |12 | 2 | 0 | 6 |
+ *         -----------------            -----------------
+ *         |12 |13 |14 |15 |            |11 | 8 | 5 |14 |
+ *         -----------------            -----------------
+ *
  * A sample map is used to map sample indices to sample numbers.
  */
 void
@@ -96,10 +110,13 @@ gen6_set_sample_maps(struct gl_context *ctx)
    uint8_t map_2x[2] = {0, 1};
    uint8_t map_4x[4] = {0, 1, 2, 3};
    uint8_t map_8x[8] = {5, 2, 4, 6, 0, 3, 7, 1};
+   uint8_t map_16x[16] = { 15, 10, 9, 7, 4, 1, 3, 13,
+                           12, 2, 0, 6, 11, 8, 5, 14 };
 
    memcpy(ctx->Const.SampleMap2x, map_2x, sizeof(map_2x));
    memcpy(ctx->Const.SampleMap4x, map_4x, sizeof(map_4x));
    memcpy(ctx->Const.SampleMap8x, map_8x, sizeof(map_8x));
+   memcpy(ctx->Const.SampleMap16x, map_16x, sizeof(map_16x));
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index 3899ce9451f..2f6eadffd2e 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -131,7 +131,7 @@ brw_gs_upload_binding_table(struct brw_context *brw)
       }
       if (!need_binding_table) {
          if (brw->ff_gs.bind_bo_offset != 0) {
-            brw->ctx.NewDriverState |= BRW_NEW_GS_BINDING_TABLE;
+            brw->ctx.NewDriverState |= BRW_NEW_BINDING_TABLE_POINTERS;
             brw->ff_gs.bind_bo_offset = 0;
          }
          return;
@@ -162,7 +162,7 @@ brw_gs_upload_binding_table(struct brw_context *brw)
       if (!need_binding_table) {
          if (brw->gs.base.bind_bo_offset != 0) {
             brw->gs.base.bind_bo_offset = 0;
-            brw->ctx.NewDriverState |= BRW_NEW_GS_BINDING_TABLE;
+            brw->ctx.NewDriverState |= BRW_NEW_BINDING_TABLE_POINTERS;
          }
          return;
       }
@@ -179,7 +179,7 @@ brw_gs_upload_binding_table(struct brw_context *brw)
              BRW_MAX_SURFACES * sizeof(uint32_t));
    }
 
-   brw->ctx.NewDriverState |= BRW_NEW_GS_BINDING_TABLE;
+   brw->ctx.NewDriverState |= BRW_NEW_BINDING_TABLE_POINTERS;
 }
 
 const struct brw_tracked_state gen6_gs_binding_table = {
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
index 5080f1c3fe4..438caefdd4a 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
@@ -78,7 +78,7 @@ gen7_surface_msaa_bits(unsigned num_samples, enum intel_msaa_layout layout)
 {
    uint32_t ss4 = 0;
 
-   assert(num_samples <= 8);
+   assert(num_samples <= 16);
 
    /* The SURFACE_MULTISAMPLECOUNT_X enums are simply log2(num_samples) << 3. */
    ss4 |= (ffs(MAX2(num_samples, 1)) - 1) << 3;
diff --git a/src/mesa/drivers/dri/i965/gen8_multisample_state.c b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
index 75cbe06c522..4427f15996d 100644
--- a/src/mesa/drivers/dri/i965/gen8_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_multisample_state.c
@@ -52,13 +52,11 @@ gen8_emit_3dstate_sample_pattern(struct brw_context *brw)
    BEGIN_BATCH(9);
    OUT_BATCH(_3DSTATE_SAMPLE_PATTERN << 16 | (9 - 2));
 
-   /* 16x MSAA
-    * XXX: Need to program these.
-    */
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
+   /* 16x MSAA */
+   OUT_BATCH(brw_multisample_positions_16x[0]); /* positions  3,  2,  1,  0 */
+   OUT_BATCH(brw_multisample_positions_16x[1]); /* positions  7,  6,  5,  4 */
+   OUT_BATCH(brw_multisample_positions_16x[2]); /* positions 11, 10,  9,  8 */
+   OUT_BATCH(brw_multisample_positions_16x[3]); /* positions 15, 14, 13, 12 */
 
    /* 8x MSAA */
    OUT_BATCH(brw_multisample_positions_8x[1]); /* sample positions 7654 */
diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.c b/src/mesa/drivers/dri/i965/intel_asm_annotation.c
index b3d6324a5fe..fdd605a7db0 100644
--- a/src/mesa/drivers/dri/i965/intel_asm_annotation.c
+++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.c
@@ -23,12 +23,8 @@
 
 #include "brw_cfg.h"
 #include "brw_eu.h"
-#include "brw_context.h"
 #include "intel_debug.h"
 #include "intel_asm_annotation.h"
-#include "program/prog_print.h"
-#include "program/prog_instruction.h"
-#include "main/macros.h"
 #include "glsl/nir/nir.h"
 
 void
@@ -69,6 +65,10 @@ dump_assembly(void *assembly, int num_annotations, struct annotation *annotation
 
       brw_disassemble(devinfo, assembly, start_offset, end_offset, stderr);
 
+      if (annotation[i].error) {
+         fputs(annotation[i].error, stderr);
+      }
+
       if (annotation[i].block_end) {
          fprintf(stderr, "   END B%d", annotation[i].block_end->num);
          foreach_list_typed(struct bblock_link, successor_link, link,
@@ -82,9 +82,8 @@ dump_assembly(void *assembly, int num_annotations, struct annotation *annotation
    fprintf(stderr, "\n");
 }
 
-void annotate(const struct brw_device_info *devinfo,
-              struct annotation_info *annotation, const struct cfg_t *cfg,
-              struct backend_instruction *inst, unsigned offset)
+static bool
+annotation_array_ensure_space(struct annotation_info *annotation)
 {
    if (annotation->ann_size <= annotation->ann_count) {
       int old_size = annotation->ann_size;
@@ -92,12 +91,25 @@ void annotate(const struct brw_device_info *devinfo,
       annotation->ann = reralloc(annotation->mem_ctx, annotation->ann,
                                  struct annotation, annotation->ann_size);
       if (!annotation->ann)
-         return;
+         return false;
 
       memset(annotation->ann + old_size, 0,
              (annotation->ann_size - old_size) * sizeof(struct annotation));
    }
 
+   return true;
+}
+
+void annotate(const struct brw_device_info *devinfo,
+              struct annotation_info *annotation, const struct cfg_t *cfg,
+              struct backend_instruction *inst, unsigned offset)
+{
+   if (annotation->mem_ctx == NULL)
+      annotation->mem_ctx = ralloc_context(NULL);
+
+   if (!annotation_array_ensure_space(annotation))
+      return;
+
    struct annotation *ann = &annotation->ann[annotation->ann_count++];
    ann->offset = offset;
    if ((INTEL_DEBUG & DEBUG_ANNOTATION) != 0) {
@@ -109,6 +121,24 @@ void annotate(const struct brw_device_info *devinfo,
       ann->block_start = cfg->blocks[annotation->cur_block];
    }
 
+   if (bblock_end(cfg->blocks[annotation->cur_block]) == inst) {
+      ann->block_end = cfg->blocks[annotation->cur_block];
+      annotation->cur_block++;
+   }
+
+   /* Merge this annotation with the previous if possible. */
+   struct annotation *prev = annotation->ann_count > 1 ?
+         &annotation->ann[annotation->ann_count - 2] : NULL;
+   if (prev != NULL &&
+       ann->ir == prev->ir &&
+       ann->annotation == prev->annotation &&
+       ann->block_start == NULL &&
+       prev->block_end == NULL) {
+      if (ann->block_end == NULL)
+         annotation->ann_count--;
+      return;
+   }
+
    /* There is no hardware DO instruction on Gen6+, so since DO always
     * starts a basic block, we need to set the .block_start of the next
     * instruction's annotation with a pointer to the bblock started by
@@ -120,11 +150,6 @@ void annotate(const struct brw_device_info *devinfo,
    if (devinfo->gen >= 6 && inst->opcode == BRW_OPCODE_DO) {
       annotation->ann_count--;
    }
-
-   if (bblock_end(cfg->blocks[annotation->cur_block]) == inst) {
-      ann->block_end = cfg->blocks[annotation->cur_block];
-      annotation->cur_block++;
-   }
 }
 
 void
@@ -140,3 +165,47 @@ annotation_finalize(struct annotation_info *annotation,
    }
    annotation->ann[annotation->ann_count].offset = next_inst_offset;
 }
+
+void
+annotation_insert_error(struct annotation_info *annotation, unsigned offset,
+                        const char *error)
+{
+   struct annotation *ann;
+
+   if (!annotation->ann_count)
+      return;
+
+   /* We may have to split an annotation, so ensure we have enough space
+    * allocated for that case up front.
+    */
+   if (!annotation_array_ensure_space(annotation))
+      return;
+
+   assume(annotation->ann_count > 0);
+
+   for (int i = 0; i < annotation->ann_count; i++) {
+      struct annotation *cur = &annotation->ann[i];
+      struct annotation *next = &annotation->ann[i + 1];
+      ann = cur;
+
+      if (next->offset <= offset)
+         continue;
+
+      if (offset + sizeof(brw_inst) != next->offset) {
+         memmove(next, cur,
+                 (annotation->ann_count - i + 2) * sizeof(struct annotation));
+         cur->error = NULL;
+         cur->error_length = 0;
+         cur->block_end = NULL;
+         next->offset = offset + sizeof(brw_inst);
+         next->block_start = NULL;
+         annotation->ann_count++;
+      }
+      break;
+   }
+
+   if (ann->error)
+      ralloc_strcat(&ann->error, error);
+   else
+      ann->error = ralloc_strdup(annotation->mem_ctx, error);
+}
diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.h b/src/mesa/drivers/dri/i965/intel_asm_annotation.h
index 6c72326f058..662a4b4e0f7 100644
--- a/src/mesa/drivers/dri/i965/intel_asm_annotation.h
+++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.h
@@ -37,6 +37,9 @@ struct cfg_t;
 struct annotation {
    int offset;
 
+   size_t error_length;
+   char *error;
+
    /* Pointers to the basic block in the CFG if the instruction group starts
     * or ends a basic block.
     */
@@ -69,6 +72,10 @@ annotate(const struct brw_device_info *devinfo,
 void
 annotation_finalize(struct annotation_info *annotation, unsigned offset);
 
+void
+annotation_insert_error(struct annotation_info *annotation, unsigned offset,
+                        const char *error);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 4643ea3e87b..386b63c123d 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -174,6 +174,7 @@ intelInitExtensions(struct gl_context *ctx)
 
    assert(brw->gen >= 4);
 
+   ctx->Extensions.ARB_arrays_of_arrays = true;
    ctx->Extensions.ARB_buffer_storage = true;
    ctx->Extensions.ARB_clear_texture = true;
    ctx->Extensions.ARB_clip_control = true;
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index b6e35205727..b1a7632d82f 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -416,9 +416,13 @@ intel_miptree_create_layout(struct brw_context *brw,
             width0 = ALIGN(width0, 2) * 4;
             height0 = ALIGN(height0, 2) * 2;
             break;
+         case 16:
+            width0 = ALIGN(width0, 2) * 4;
+            height0 = ALIGN(height0, 2) * 4;
+            break;
          default:
-            /* num_samples should already have been quantized to 0, 1, 2, 4, or
-             * 8.
+            /* num_samples should already have been quantized to 0, 1, 2, 4, 8
+             * or 16.
              */
             unreachable("not reached");
          }
@@ -1423,6 +1427,12 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
        */
       format = MESA_FORMAT_R_UINT32;
       break;
+   case 16:
+      /* 64 bits/pixel are required for MCS data when using 16x MSAA (4 bits
+       * for each sample).
+       */
+      format = MESA_FORMAT_RG_UINT32;
+      break;
    default:
       unreachable("Unrecognized sample count in intel_miptree_alloc_mcs");
    };
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index fb95fb629ad..d64ebade769 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1178,12 +1178,15 @@ intel_detect_timestamp(struct intel_screen *screen)
 const int*
 intel_supported_msaa_modes(const struct intel_screen  *screen)
 {
+   static const int gen9_modes[] = {16, 8, 4, 2, 0, -1};
    static const int gen8_modes[] = {8, 4, 2, 0, -1};
    static const int gen7_modes[] = {8, 4, 0, -1};
    static const int gen6_modes[] = {4, 0, -1};
    static const int gen4_modes[] = {0, -1};
 
-   if (screen->devinfo->gen >= 8) {
+   if (screen->devinfo->gen >= 9) {
+      return gen9_modes;
+   } else if (screen->devinfo->gen >= 8) {
       return gen8_modes;
    } else if (screen->devinfo->gen >= 7) {
       return gen7_modes;
diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
index e80b71b558d..a1f91d9c56a 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
@@ -144,7 +144,7 @@ TEST_F(copy_propagation_test, test_swizzle_swizzle)
 
    copy_propagation(v);
 
-   EXPECT_EQ(test_mov->src[0].reg, a.reg);
+   EXPECT_EQ(test_mov->src[0].nr, a.nr);
    EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_Z,
                                                     SWIZZLE_W,
                                                     SWIZZLE_X,
@@ -174,7 +174,7 @@ TEST_F(copy_propagation_test, test_swizzle_writemask)
    copy_propagation(v);
 
    /* should not copy propagate */
-   EXPECT_EQ(test_mov->src[0].reg, b.reg);
+   EXPECT_EQ(test_mov->src[0].nr, b.nr);
    EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_W,
                                                     SWIZZLE_W,
                                                     SWIZZLE_W,
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index 2f824617454..d84e2e98ec0 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -213,7 +213,7 @@ TEST_F(register_coalesce_test, test_dp4_grf)
 
    register_coalesce(v);
 
-   EXPECT_EQ(dp4->dst.reg, to.reg);
+   EXPECT_EQ(dp4->dst.nr, to.nr);
    EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
 }
 
@@ -239,5 +239,5 @@ TEST_F(register_coalesce_test, test_channel_mul_grf)
 
    register_coalesce(v);
 
-   EXPECT_EQ(mul->dst.reg, to.reg);
+   EXPECT_EQ(mul->dst.nr, to.nr);
 }
diff --git a/src/mesa/main/arrayobj.c b/src/mesa/main/arrayobj.c
index 061e557a397..897dac6e5db 100644
--- a/src/mesa/main/arrayobj.c
+++ b/src/mesa/main/arrayobj.c
@@ -149,8 +149,6 @@ unbind_array_object_vbos(struct gl_context *ctx, struct gl_vertex_array_object *
 
 /**
  * Allocate and initialize a new vertex array object.
- *
- * This function is intended to be called via
  */
 struct gl_vertex_array_object *
 _mesa_new_vao(struct gl_context *ctx, GLuint name)
@@ -164,9 +162,6 @@ _mesa_new_vao(struct gl_context *ctx, GLuint name)
 
 /**
  * Delete an array object.
- *
- * This function is intended to be called via
- * \c dd_function_table::DeleteArrayObject.
  */
 void
 _mesa_delete_vao(struct gl_context *ctx, struct gl_vertex_array_object *obj)
diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index 20aa4980935..ddf7f497f1e 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -639,7 +639,7 @@ _mesa_AlphaFunc( GLenum func, GLclampf ref )
  * \param opcode operation.
  *
  * Verifies that \p opcode is a valid enum and updates
-gl_colorbuffer_attrib::LogicOp.
+ * gl_colorbuffer_attrib::LogicOp.
  * On a change, flushes the vertices and notifies the driver via the
  * dd_function_table::LogicOpcode callback.
  */
diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h
index 1e7a12c8a84..4798b1f9b43 100644
--- a/src/mesa/main/context.h
+++ b/src/mesa/main/context.h
@@ -50,6 +50,7 @@
 
 
 #include "imports.h"
+#include "extensions.h"
 #include "mtypes.h"
 #include "vbo/vbo.h"
 
diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c
index f02e842f34d..d571d221bce 100644
--- a/src/mesa/main/copyimage.c
+++ b/src/mesa/main/copyimage.c
@@ -62,6 +62,8 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum target,
                struct gl_renderbuffer **renderbuffer,
                mesa_format *format,
                GLenum *internalFormat,
+               GLuint *width,
+               GLuint *height,
                const char *dbg_prefix)
 {
    if (name == 0) {
@@ -126,6 +128,8 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum target,
       *renderbuffer = rb;
       *format = rb->Format;
       *internalFormat = rb->InternalFormat;
+      *width = rb->Width;
+      *height = rb->Height;
       *tex_image = NULL;
    } else {
       struct gl_texture_object *texObj = _mesa_lookup_texture(ctx, name);
@@ -194,6 +198,8 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum target,
       *renderbuffer = NULL;
       *format = (*tex_image)->TexFormat;
       *internalFormat = (*tex_image)->InternalFormat;
+      *width = (*tex_image)->Width;
+      *height = (*tex_image)->Height;
    }
 
    return true;
@@ -423,6 +429,7 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
    struct gl_renderbuffer *srcRenderbuffer, *dstRenderbuffer;
    mesa_format srcFormat, dstFormat;
    GLenum srcIntFormat, dstIntFormat;
+   GLuint src_w, src_h, dst_w, dst_h;
    GLuint src_bw, src_bh, dst_bw, dst_bh;
    int dstWidth, dstHeight, dstDepth;
    int i;
@@ -445,17 +452,41 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
 
    if (!prepare_target(ctx, srcName, srcTarget, srcLevel, srcZ, srcDepth,
                        &srcTexImage, &srcRenderbuffer, &srcFormat,
-                       &srcIntFormat, "src"))
+                       &srcIntFormat, &src_w, &src_h, "src"))
       return;
 
    if (!prepare_target(ctx, dstName, dstTarget, dstLevel, dstZ, srcDepth,
                        &dstTexImage, &dstRenderbuffer, &dstFormat,
-                       &dstIntFormat, "dst"))
+                       &dstIntFormat, &dst_w, &dst_h, "dst"))
       return;
 
    _mesa_get_format_block_size(srcFormat, &src_bw, &src_bh);
+
+   /* Section 18.3.2 (Copying Between Images) of the OpenGL 4.5 Core Profile
+    * spec says:
+    *
+    *    An INVALID_VALUE error is generated if the dimensions of either
+    *    subregion exceeds the boundaries of the corresponding image object,
+    *    or if the image format is compressed and the dimensions of the
+    *    subregion fail to meet the alignment constraints of the format.
+    *
+    * and Section 8.7 (Compressed Texture Images) says:
+    *
+    *    An INVALID_OPERATION error is generated if any of the following
+    *    conditions occurs:
+    *
+    *      * width is not a multiple of four, and width + xoffset is not
+    *        equal to the value of TEXTURE_WIDTH.
+    *      * height is not a multiple of four, and height + yoffset is not
+    *        equal to the value of TEXTURE_HEIGHT.
+    *
+    * so we take that to mean that you can copy the "last" block of a
+    * compressed texture image even if it's smaller than the minimum block
+    * dimensions.
+    */
    if ((srcX % src_bw != 0) || (srcY % src_bh != 0) ||
-       (srcWidth % src_bw != 0) || (srcHeight % src_bh != 0)) {
+       (srcWidth % src_bw != 0 && (srcX + srcWidth) != src_w) ||
+       (srcHeight % src_bh != 0 && (srcY + srcHeight) != src_h)) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glCopyImageSubData(unaligned src rectangle)");
       return;
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index d964f030ecb..e94d2b74749 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -42,35 +42,6 @@ struct gl_extensions _mesa_extension_override_disables;
 static char *extra_extensions = NULL;
 static char *cant_disable_extensions = NULL;
 
-enum {
-   DISABLE = 0,
-   GLL = 1 << API_OPENGL_COMPAT,       /* GL Legacy / Compatibility */
-   GLC = 1 << API_OPENGL_CORE,  /* GL Core */
-   GL  = (1 << API_OPENGL_COMPAT) | (1 << API_OPENGL_CORE),
-   ES1 = 1 << API_OPENGLES,
-   ES2 = 1 << API_OPENGLES2,
-   ES3 = 1 << (API_OPENGL_LAST + 1),
-   ES31 = 1 << (API_OPENGL_LAST + 2),
-};
-
-/**
- * \brief An element of the \c extension_table.
- */
-struct extension {
-   /** Name of extension, such as "GL_ARB_depth_clamp". */
-   const char *name;
-
-   /** Offset (in bytes) of the corresponding member in struct gl_extensions. */
-   size_t offset;
-
-   /** Set of API's in which the extension exists, as a bitset. */
-   uint8_t api_set;
-
-   /** Year the extension was proposed or approved.  Used to sort the 
-    * extension string chronologically. */
-   uint16_t year;
-};
-
 
 /**
  * Given a member \c x of struct gl_extensions, return offset of
@@ -82,341 +53,26 @@ struct extension {
 /**
  * \brief Table of supported OpenGL extensions for all API's.
  */
-static const struct extension extension_table[] = {
-   /* ARB Extensions */
-   { "GL_ARB_ES2_compatibility",                   o(ARB_ES2_compatibility),                   GL,             2009 },
-   { "GL_ARB_ES3_compatibility",                   o(ARB_ES3_compatibility),                   GL,             2012 },
-   { "GL_ARB_arrays_of_arrays",                    o(ARB_arrays_of_arrays),                    GL,             2012 },
-   { "GL_ARB_base_instance",                       o(ARB_base_instance),                       GL,             2011 },
-   { "GL_ARB_blend_func_extended",                 o(ARB_blend_func_extended),                 GL,             2009 },
-   { "GL_ARB_buffer_storage",                      o(ARB_buffer_storage),                      GL,             2013 },
-   { "GL_ARB_clear_buffer_object",                 o(dummy_true),                              GL,             2012 },
-   { "GL_ARB_clear_texture",                       o(ARB_clear_texture),                       GL,             2013 },
-   { "GL_ARB_clip_control",                        o(ARB_clip_control),                        GL,             2014 },
-   { "GL_ARB_color_buffer_float",                  o(ARB_color_buffer_float),                  GL,             2004 },
-   { "GL_ARB_compressed_texture_pixel_storage",    o(dummy_true),                              GL,             2011 },
-   { "GL_ARB_compute_shader",                      o(ARB_compute_shader),                      GL,             2012 },
-   { "GL_ARB_conditional_render_inverted",         o(ARB_conditional_render_inverted),         GL,             2014 },
-   { "GL_ARB_copy_buffer",                         o(dummy_true),                              GL,             2008 },
-   { "GL_ARB_copy_image",                          o(ARB_copy_image),                          GL,             2012 },
-   { "GL_ARB_conservative_depth",                  o(ARB_conservative_depth),                  GL,             2011 },
-   { "GL_ARB_debug_output",                        o(dummy_true),                              GL,             2009 },
-   { "GL_ARB_depth_buffer_float",                  o(ARB_depth_buffer_float),                  GL,             2008 },
-   { "GL_ARB_depth_clamp",                         o(ARB_depth_clamp),                         GL,             2003 },
-   { "GL_ARB_depth_texture",                       o(ARB_depth_texture),                       GLL,            2001 },
-   { "GL_ARB_derivative_control",                  o(ARB_derivative_control),                  GL,             2014 },
-   { "GL_ARB_direct_state_access",                 o(dummy_true),                              GLC,            2014 },
-   { "GL_ARB_draw_buffers",                        o(dummy_true),                              GL,             2002 },
-   { "GL_ARB_draw_buffers_blend",                  o(ARB_draw_buffers_blend),                  GL,             2009 },
-   { "GL_ARB_draw_elements_base_vertex",           o(ARB_draw_elements_base_vertex),           GL,             2009 },
-   { "GL_ARB_draw_indirect",                       o(ARB_draw_indirect),                       GLC,            2010 },
-   { "GL_ARB_draw_instanced",                      o(ARB_draw_instanced),                      GL,             2008 },
-   { "GL_ARB_explicit_attrib_location",            o(ARB_explicit_attrib_location),            GL,             2009 },
-   { "GL_ARB_explicit_uniform_location",           o(ARB_explicit_uniform_location),           GL,             2012 },
-   { "GL_ARB_fragment_coord_conventions",          o(ARB_fragment_coord_conventions),          GL,             2009 },
-   { "GL_ARB_fragment_layer_viewport",             o(ARB_fragment_layer_viewport),             GLC,            2012 },
-   { "GL_ARB_fragment_program",                    o(ARB_fragment_program),                    GLL,            2002 },
-   { "GL_ARB_fragment_program_shadow",             o(ARB_fragment_program_shadow),             GLL,            2003 },
-   { "GL_ARB_fragment_shader",                     o(ARB_fragment_shader),                     GL,             2002 },
-   { "GL_ARB_framebuffer_no_attachments",          o(ARB_framebuffer_no_attachments),          GL,             2012 },
-   { "GL_ARB_framebuffer_object",                  o(ARB_framebuffer_object),                  GL,             2005 },
-   { "GL_ARB_framebuffer_sRGB",                    o(EXT_framebuffer_sRGB),                    GL,             1998 },
-   { "GL_ARB_get_program_binary",                  o(dummy_true),                              GL,             2010 },
-   { "GL_ARB_get_texture_sub_image",               o(dummy_true),                              GL,             2014 },
-   { "GL_ARB_gpu_shader5",                         o(ARB_gpu_shader5),                         GLC,            2010 },
-   { "GL_ARB_gpu_shader_fp64",                     o(ARB_gpu_shader_fp64),                     GLC,            2010 },
-   { "GL_ARB_half_float_pixel",                    o(dummy_true),                              GL,             2003 },
-   { "GL_ARB_half_float_vertex",                   o(ARB_half_float_vertex),                   GL,             2008 },
-   { "GL_ARB_instanced_arrays",                    o(ARB_instanced_arrays),                    GL,             2008 },
-   { "GL_ARB_internalformat_query",                o(ARB_internalformat_query),                GL,             2011 },
-   { "GL_ARB_invalidate_subdata",                  o(dummy_true),                              GL,             2012 },
-   { "GL_ARB_map_buffer_alignment",                o(dummy_true),                              GL,             2011 },
-   { "GL_ARB_map_buffer_range",                    o(ARB_map_buffer_range),                    GL,             2008 },
-   { "GL_ARB_multi_bind",                          o(dummy_true),                              GL,             2013 },
-   { "GL_ARB_multi_draw_indirect",                 o(ARB_draw_indirect),                       GLC,            2012 },
-   { "GL_ARB_multisample",                         o(dummy_true),                              GLL,            1994 },
-   { "GL_ARB_multitexture",                        o(dummy_true),                              GLL,            1998 },
-   { "GL_ARB_occlusion_query2",                    o(ARB_occlusion_query2),                    GL,             2003 },
-   { "GL_ARB_occlusion_query",                     o(ARB_occlusion_query),                     GLL,            2001 },
-   { "GL_ARB_pipeline_statistics_query",           o(ARB_pipeline_statistics_query),           GL,             2014 },
-   { "GL_ARB_pixel_buffer_object",                 o(EXT_pixel_buffer_object),                 GL,             2004 },
-   { "GL_ARB_point_parameters",                    o(EXT_point_parameters),                    GLL,            1997 },
-   { "GL_ARB_point_sprite",                        o(ARB_point_sprite),                        GL,             2003 },
-   { "GL_ARB_program_interface_query",             o(dummy_true),                              GL,             2012 },
-   { "GL_ARB_provoking_vertex",                    o(EXT_provoking_vertex),                    GL,             2009 },
-   { "GL_ARB_robustness",                          o(dummy_true),                              GL,             2010 },
-   { "GL_ARB_sample_shading",                      o(ARB_sample_shading),                      GL,             2009 },
-   { "GL_ARB_sampler_objects",                     o(dummy_true),                              GL,             2009 },
-   { "GL_ARB_seamless_cube_map",                   o(ARB_seamless_cube_map),                   GL,             2009 },
-   { "GL_ARB_seamless_cubemap_per_texture",        o(AMD_seamless_cubemap_per_texture),        GL,             2013 },
-   { "GL_ARB_separate_shader_objects",             o(dummy_true),                              GL,             2010 },
-   { "GL_ARB_shader_atomic_counters",              o(ARB_shader_atomic_counters),              GL,             2011 },
-   { "GL_ARB_shader_bit_encoding",                 o(ARB_shader_bit_encoding),                 GL,             2010 },
-   { "GL_ARB_shader_clock",                        o(ARB_shader_clock),                        GL,             2015 },
-   { "GL_ARB_shader_image_load_store",             o(ARB_shader_image_load_store),             GL,             2011 },
-   { "GL_ARB_shader_image_size",                   o(ARB_shader_image_size),                   GL,             2012 },
-   { "GL_ARB_shader_objects",                      o(dummy_true),                              GL,             2002 },
-   { "GL_ARB_shader_precision",                    o(ARB_shader_precision),                    GL,             2010 },
-   { "GL_ARB_shader_stencil_export",               o(ARB_shader_stencil_export),               GL,             2009 },
-   { "GL_ARB_shader_storage_buffer_object",        o(ARB_shader_storage_buffer_object),        GL,             2012 },
-   { "GL_ARB_shader_subroutine",                   o(ARB_shader_subroutine),                   GLC,            2010 },
-   { "GL_ARB_shader_texture_image_samples",        o(ARB_shader_texture_image_samples),        GL,             2014 },
-   { "GL_ARB_shader_texture_lod",                  o(ARB_shader_texture_lod),                  GL,             2009 },
-   { "GL_ARB_shading_language_100",                o(dummy_true),                              GLL,            2003 },
-   { "GL_ARB_shading_language_packing",            o(ARB_shading_language_packing),            GL,             2011 },
-   { "GL_ARB_shading_language_420pack",            o(ARB_shading_language_420pack),            GL,             2011 },
-   { "GL_ARB_shadow",                              o(ARB_shadow),                              GLL,            2001 },
-   { "GL_ARB_stencil_texturing",                   o(ARB_stencil_texturing),                   GL,             2012 },
-   { "GL_ARB_sync",                                o(ARB_sync),                                GL,             2003 },
-   { "GL_ARB_texture_barrier",                     o(NV_texture_barrier),                      GL,             2014 },
-   { "GL_ARB_tessellation_shader",                 o(ARB_tessellation_shader),                 GLC,            2009 },
-   { "GL_ARB_texture_border_clamp",                o(ARB_texture_border_clamp),                GLL,            2000 },
-   { "GL_ARB_texture_buffer_object",               o(ARB_texture_buffer_object),               GLC,            2008 },
-   { "GL_ARB_texture_buffer_object_rgb32",         o(ARB_texture_buffer_object_rgb32),         GLC,            2009 },
-   { "GL_ARB_texture_buffer_range",                o(ARB_texture_buffer_range),                GLC,            2012 },
-   { "GL_ARB_texture_compression",                 o(dummy_true),                              GLL,            2000 },
-   { "GL_ARB_texture_compression_bptc",            o(ARB_texture_compression_bptc),            GL,             2010 },
-   { "GL_ARB_texture_compression_rgtc",            o(ARB_texture_compression_rgtc),            GL,             2004 },
-   { "GL_ARB_texture_cube_map",                    o(ARB_texture_cube_map),                    GLL,            1999 },
-   { "GL_ARB_texture_cube_map_array",              o(ARB_texture_cube_map_array),              GL,             2009 },
-   { "GL_ARB_texture_env_add",                     o(dummy_true),                              GLL,            1999 },
-   { "GL_ARB_texture_env_combine",                 o(ARB_texture_env_combine),                 GLL,            2001 },
-   { "GL_ARB_texture_env_crossbar",                o(ARB_texture_env_crossbar),                GLL,            2001 },
-   { "GL_ARB_texture_env_dot3",                    o(ARB_texture_env_dot3),                    GLL,            2001 },
-   { "GL_ARB_texture_float",                       o(ARB_texture_float),                       GL,             2004 },
-   { "GL_ARB_texture_gather",                      o(ARB_texture_gather),                      GL,             2009 },
-   { "GL_ARB_texture_mirrored_repeat",             o(dummy_true),                              GLL,            2001 },
-   { "GL_ARB_texture_mirror_clamp_to_edge",        o(ARB_texture_mirror_clamp_to_edge),        GL,             2013 },
-   { "GL_ARB_texture_multisample",                 o(ARB_texture_multisample),                 GL,             2009 },
-   { "GL_ARB_texture_non_power_of_two",            o(ARB_texture_non_power_of_two),            GL,             2003 },
-   { "GL_ARB_texture_query_levels",                o(ARB_texture_query_levels),                GL,             2012 },
-   { "GL_ARB_texture_query_lod",                   o(ARB_texture_query_lod),                   GL,             2009 },
-   { "GL_ARB_texture_rectangle",                   o(NV_texture_rectangle),                    GL,             2004 },
-   { "GL_ARB_texture_rgb10_a2ui",                  o(ARB_texture_rgb10_a2ui),                  GL,             2009 },
-   { "GL_ARB_texture_rg",                          o(ARB_texture_rg),                          GL,             2008 },
-   { "GL_ARB_texture_stencil8",                    o(ARB_texture_stencil8),                    GL,             2013 },
-   { "GL_ARB_texture_storage",                     o(dummy_true),                              GL,             2011 },
-   { "GL_ARB_texture_storage_multisample",         o(ARB_texture_multisample),                 GL,             2012 },
-   { "GL_ARB_texture_view",                        o(ARB_texture_view),                        GL,             2012 },
-   { "GL_ARB_texture_swizzle",                     o(EXT_texture_swizzle),                     GL,             2008 },
-   { "GL_ARB_timer_query",                         o(ARB_timer_query),                         GL,             2010 },
-   { "GL_ARB_transform_feedback2",                 o(ARB_transform_feedback2),                 GL,             2010 },
-   { "GL_ARB_transform_feedback3",                 o(ARB_transform_feedback3),                 GL,             2010 },
-   { "GL_ARB_transform_feedback_instanced",        o(ARB_transform_feedback_instanced),        GL,             2011 },
-   { "GL_ARB_transpose_matrix",                    o(dummy_true),                              GLL,            1999 },
-   { "GL_ARB_uniform_buffer_object",               o(ARB_uniform_buffer_object),               GL,             2009 },
-   { "GL_ARB_vertex_array_bgra",                   o(EXT_vertex_array_bgra),                   GL,             2008 },
-   { "GL_ARB_vertex_array_object",                 o(dummy_true),                              GL,             2006 },
-   { "GL_ARB_vertex_attrib_binding",               o(dummy_true),                              GL,             2012 },
-   { "GL_ARB_vertex_buffer_object",                o(dummy_true),                              GLL,            2003 },
-   { "GL_ARB_vertex_program",                      o(ARB_vertex_program),                      GLL,            2002 },
-   { "GL_ARB_vertex_shader",                       o(ARB_vertex_shader),                       GL,             2002 },
-   { "GL_ARB_vertex_attrib_64bit",                 o(ARB_vertex_attrib_64bit),                 GLC,            2010 },
-   { "GL_ARB_vertex_type_10f_11f_11f_rev",         o(ARB_vertex_type_10f_11f_11f_rev),         GL,             2013 },
-   { "GL_ARB_vertex_type_2_10_10_10_rev",          o(ARB_vertex_type_2_10_10_10_rev),          GL,             2009 },
-   { "GL_ARB_viewport_array",                      o(ARB_viewport_array),                      GLC,            2010 },
-   { "GL_ARB_window_pos",                          o(dummy_true),                              GLL,            2001 },
-   /* EXT extensions */
-   { "GL_EXT_abgr",                                o(dummy_true),                              GL,             1995 },
-   { "GL_EXT_bgra",                                o(dummy_true),                              GLL,            1995 },
-   { "GL_EXT_blend_color",                         o(EXT_blend_color),                         GLL,            1995 },
-   { "GL_EXT_blend_equation_separate",             o(EXT_blend_equation_separate),             GL,             2003 },
-   { "GL_EXT_blend_func_separate",                 o(EXT_blend_func_separate),                 GLL,            1999 },
-   { "GL_EXT_discard_framebuffer",                 o(dummy_true),                                    ES1 | ES2, 2009 },
-   { "GL_EXT_blend_minmax",                        o(EXT_blend_minmax),                        GLL | ES1 | ES2, 1995 },
-   { "GL_EXT_blend_subtract",                      o(dummy_true),                              GLL,            1995 },
-   { "GL_EXT_compiled_vertex_array",               o(dummy_true),                              GLL,            1996 },
-   { "GL_EXT_copy_texture",                        o(dummy_true),                              GLL,            1995 },
-   { "GL_EXT_depth_bounds_test",                   o(EXT_depth_bounds_test),                   GL,             2002 },
-   { "GL_EXT_draw_buffers",                        o(dummy_true),                                         ES2, 2012 },
-   { "GL_EXT_draw_buffers2",                       o(EXT_draw_buffers2),                       GL,             2006 },
-   { "GL_EXT_draw_elements_base_vertex",           o(ARB_draw_elements_base_vertex),                      ES2, 2014 },
-   { "GL_EXT_draw_instanced",                      o(ARB_draw_instanced),                      GL,             2006 },
-   { "GL_EXT_draw_range_elements",                 o(dummy_true),                              GLL,            1997 },
-   { "GL_EXT_fog_coord",                           o(dummy_true),                              GLL,            1999 },
-   { "GL_EXT_framebuffer_blit",                    o(dummy_true),                              GL,             2005 },
-   { "GL_EXT_framebuffer_multisample",             o(EXT_framebuffer_multisample),             GL,             2005 },
-   { "GL_EXT_framebuffer_multisample_blit_scaled", o(EXT_framebuffer_multisample_blit_scaled), GL,             2011 },
-   { "GL_EXT_framebuffer_object",                  o(dummy_true),                              GLL,            2000 },
-   { "GL_EXT_framebuffer_sRGB",                    o(EXT_framebuffer_sRGB),                    GL,             1998 },
-   { "GL_EXT_gpu_program_parameters",              o(EXT_gpu_program_parameters),              GLL,            2006 },
-   { "GL_EXT_gpu_shader4",                         o(EXT_gpu_shader4),                         GL,             2006 },
-   { "GL_EXT_map_buffer_range",                    o(ARB_map_buffer_range),                          ES1 | ES2, 2012 },
-   { "GL_EXT_multi_draw_arrays",                   o(dummy_true),                              GLL | ES1 | ES2, 1999 },
-   { "GL_EXT_packed_depth_stencil",                o(dummy_true),                              GL,             2005 },
-   { "GL_EXT_packed_float",                        o(EXT_packed_float),                        GL,             2004 },
-   { "GL_EXT_packed_pixels",                       o(dummy_true),                              GLL,            1997 },
-   { "GL_EXT_pixel_buffer_object",                 o(EXT_pixel_buffer_object),                 GL,             2004 },
-   { "GL_EXT_point_parameters",                    o(EXT_point_parameters),                    GLL,            1997 },
-   { "GL_EXT_polygon_offset",                      o(dummy_true),                              GLL,            1995 },
-   { "GL_EXT_polygon_offset_clamp",                o(EXT_polygon_offset_clamp),                GL,             2014 },
-   { "GL_EXT_provoking_vertex",                    o(EXT_provoking_vertex),                    GL,             2009 },
-   { "GL_EXT_rescale_normal",                      o(dummy_true),                              GLL,            1997 },
-   { "GL_EXT_secondary_color",                     o(dummy_true),                              GLL,            1999 },
-   { "GL_EXT_separate_shader_objects",             o(dummy_true),                                         ES2, 2013 },
-   { "GL_EXT_separate_specular_color",             o(dummy_true),                              GLL,            1997 },
-   { "GL_EXT_shader_integer_mix",                  o(EXT_shader_integer_mix),                  GL       | ES3, 2013 },
-   { "GL_EXT_shadow_funcs",                        o(ARB_shadow),                              GLL,            2002 },
-   { "GL_EXT_stencil_two_side",                    o(EXT_stencil_two_side),                    GLL,            2001 },
-   { "GL_EXT_stencil_wrap",                        o(dummy_true),                              GLL,            2002 },
-   { "GL_EXT_subtexture",                          o(dummy_true),                              GLL,            1995 },
-   { "GL_EXT_texture3D",                           o(EXT_texture3D),                           GLL,            1996 },
-   { "GL_EXT_texture_array",                       o(EXT_texture_array),                       GL,             2006 },
-   { "GL_EXT_texture_compression_dxt1",            o(ANGLE_texture_compression_dxt),           GL | ES1 | ES2, 2004 },
-   { "GL_ANGLE_texture_compression_dxt3",          o(ANGLE_texture_compression_dxt),           GL | ES1 | ES2, 2011 },
-   { "GL_ANGLE_texture_compression_dxt5",          o(ANGLE_texture_compression_dxt),           GL | ES1 | ES2, 2011 },
-   { "GL_EXT_texture_compression_latc",            o(EXT_texture_compression_latc),            GLL,            2006 },
-   { "GL_EXT_texture_compression_rgtc",            o(ARB_texture_compression_rgtc),            GL,             2004 },
-   { "GL_EXT_texture_compression_s3tc",            o(EXT_texture_compression_s3tc),            GL,             2000 },
-   { "GL_EXT_texture_cube_map",                    o(ARB_texture_cube_map),                    GLL,            2001 },
-   { "GL_EXT_texture_edge_clamp",                  o(dummy_true),                              GLL,            1997 },
-   { "GL_EXT_texture_env_add",                     o(dummy_true),                              GLL,            1999 },
-   { "GL_EXT_texture_env_combine",                 o(dummy_true),                              GLL,            2000 },
-   { "GL_EXT_texture_env_dot3",                    o(EXT_texture_env_dot3),                    GLL,            2000 },
-   { "GL_EXT_texture_filter_anisotropic",          o(EXT_texture_filter_anisotropic),          GL | ES1 | ES2, 1999 },
-   { "GL_EXT_texture_format_BGRA8888",             o(dummy_true),                                   ES1 | ES2, 2005 },
-   { "GL_EXT_texture_rg",                          o(ARB_texture_rg),                                     ES2, 2011 },
-   { "GL_EXT_read_format_bgra",                    o(dummy_true),                                   ES1 | ES2, 2009 },
-   { "GL_EXT_texture_integer",                     o(EXT_texture_integer),                     GL,             2006 },
-   { "GL_EXT_texture_lod_bias",                    o(dummy_true),                              GLL | ES1,      1999 },
-   { "GL_EXT_texture_mirror_clamp",                o(EXT_texture_mirror_clamp),                GL,             2004 },
-   { "GL_EXT_texture_object",                      o(dummy_true),                              GLL,            1995 },
-   { "GL_EXT_texture",                             o(dummy_true),                              GLL,            1996 },
-   { "GL_EXT_texture_rectangle",                   o(NV_texture_rectangle),                    GLL,            2004 },
-   { "GL_EXT_texture_shared_exponent",             o(EXT_texture_shared_exponent),             GL,             2004 },
-   { "GL_EXT_texture_snorm",                       o(EXT_texture_snorm),                       GL,             2009 },
-   { "GL_EXT_texture_sRGB",                        o(EXT_texture_sRGB),                        GL,             2004 },
-   { "GL_EXT_texture_sRGB_decode",                 o(EXT_texture_sRGB_decode),                        GL,      2006 },
-   { "GL_EXT_texture_swizzle",                     o(EXT_texture_swizzle),                     GL,             2008 },
-   { "GL_EXT_texture_type_2_10_10_10_REV",         o(dummy_true),                                         ES2, 2008 },
-   { "GL_EXT_timer_query",                         o(EXT_timer_query),                         GL,             2006 },
-   { "GL_EXT_transform_feedback",                  o(EXT_transform_feedback),                  GL,             2011 },
-   { "GL_EXT_unpack_subimage",                     o(dummy_true),                                         ES2, 2011 },
-   { "GL_EXT_vertex_array_bgra",                   o(EXT_vertex_array_bgra),                   GL,             2008 },
-   { "GL_EXT_vertex_array",                        o(dummy_true),                              GLL,            1995 },
-   { "GL_EXT_color_buffer_float",                  o(dummy_true),                                         ES3, 2013 },
-
-   /* OES extensions */
-   { "GL_OES_blend_equation_separate",             o(EXT_blend_equation_separate),                  ES1,       2009 },
-   { "GL_OES_blend_func_separate",                 o(EXT_blend_func_separate),                      ES1,       2009 },
-   { "GL_OES_blend_subtract",                      o(dummy_true),                                   ES1,       2009 },
-   { "GL_OES_byte_coordinates",                    o(dummy_true),                                   ES1,       2002 },
-   { "GL_OES_compressed_ETC1_RGB8_texture",        o(OES_compressed_ETC1_RGB8_texture),             ES1 | ES2, 2005 },
-   { "GL_OES_compressed_paletted_texture",         o(dummy_true),                                   ES1,       2003 },
-   { "GL_OES_depth24",                             o(dummy_true),                                   ES1 | ES2, 2005 },
-   { "GL_OES_depth32",                             o(dummy_false),                     DISABLE,                2005 },
-   { "GL_OES_depth_texture",                       o(ARB_depth_texture),                                  ES2, 2006 },
-   { "GL_OES_depth_texture_cube_map",              o(OES_depth_texture_cube_map),                         ES2, 2012 },
-   { "GL_OES_draw_elements_base_vertex",           o(ARB_draw_elements_base_vertex),                      ES2, 2014 },
-   { "GL_OES_draw_texture",                        o(OES_draw_texture),                             ES1,       2004 },
-   { "GL_OES_EGL_sync",                            o(dummy_true),                                   ES1 | ES2, 2010 },
-   /*  FIXME: Mesa expects GL_OES_EGL_image to be available in OpenGL contexts. */
-   { "GL_OES_EGL_image",                           o(OES_EGL_image),                           GL | ES1 | ES2, 2006 },
-   { "GL_OES_EGL_image_external",                  o(OES_EGL_image_external),                       ES1 | ES2, 2010 },
-   { "GL_OES_element_index_uint",                  o(dummy_true),                                   ES1 | ES2, 2005 },
-   { "GL_OES_fbo_render_mipmap",                   o(dummy_true),                                   ES1 | ES2, 2005 },
-   { "GL_OES_fixed_point",                         o(dummy_true),                                   ES1,       2002 },
-   { "GL_OES_framebuffer_object",                  o(dummy_true),                                   ES1,       2005 },
-   { "GL_OES_get_program_binary",                  o(dummy_true),                                         ES2, 2008 },
-   { "GL_OES_mapbuffer",                           o(dummy_true),                                   ES1 | ES2, 2005 },
-   { "GL_OES_packed_depth_stencil",                o(dummy_true),                                   ES1 | ES2, 2007 },
-   { "GL_OES_point_size_array",                    o(dummy_true),                                   ES1,       2004 },
-   { "GL_OES_point_sprite",                        o(ARB_point_sprite),                             ES1,       2004 },
-   { "GL_OES_query_matrix",                        o(dummy_true),                                   ES1,       2003 },
-   { "GL_OES_read_format",                         o(dummy_true),                              GL | ES1,       2003 },
-   { "GL_OES_rgb8_rgba8",                          o(dummy_true),                                   ES1 | ES2, 2005 },
-   { "GL_OES_single_precision",                    o(dummy_true),                                   ES1,       2003 },
-   { "GL_OES_standard_derivatives",                o(OES_standard_derivatives),                           ES2, 2005 },
-   { "GL_OES_stencil1",                            o(dummy_false),                     DISABLE,                2005 },
-   { "GL_OES_stencil4",                            o(dummy_false),                     DISABLE,                2005 },
-   { "GL_OES_stencil8",                            o(dummy_true),                                   ES1 | ES2, 2005 },
-   { "GL_OES_stencil_wrap",                        o(dummy_true),                                   ES1,       2002 },
-   { "GL_OES_surfaceless_context",                 o(dummy_true),                                   ES1 | ES2, 2012 },
-   { "GL_OES_texture_3D",                          o(EXT_texture3D),                                      ES2, 2005 },
-   { "GL_OES_texture_cube_map",                    o(ARB_texture_cube_map),                         ES1,       2007 },
-   { "GL_OES_texture_env_crossbar",                o(ARB_texture_env_crossbar),                     ES1,       2005 },
-   { "GL_OES_texture_float",                       o(OES_texture_float),                                  ES2, 2005 },
-   { "GL_OES_texture_float_linear",                o(OES_texture_float_linear),                           ES2, 2005 },
-   { "GL_OES_texture_half_float",                  o(OES_texture_half_float),                             ES2, 2005 },
-   { "GL_OES_texture_half_float_linear",           o(OES_texture_half_float_linear),                      ES2, 2005 },
-   { "GL_OES_texture_mirrored_repeat",             o(dummy_true),                                   ES1,       2005 },
-   { "GL_OES_texture_storage_multisample_2d_array",o(ARB_texture_multisample),                           ES31, 2014 },
-   { "GL_OES_texture_npot",                        o(ARB_texture_non_power_of_two),                 ES1 | ES2, 2005 },
-   { "GL_OES_vertex_array_object",                 o(dummy_true),                                   ES1 | ES2, 2010 },
-
-   /* KHR extensions */
-   { "GL_KHR_debug",                               o(dummy_true),                              GL,             2012 },
-   { "GL_KHR_context_flush_control",               o(dummy_true),                              GL       | ES2, 2014 },
-   { "GL_KHR_texture_compression_astc_hdr",        o(KHR_texture_compression_astc_hdr),        GL       | ES2, 2012 },
-   { "GL_KHR_texture_compression_astc_ldr",        o(KHR_texture_compression_astc_ldr),        GL       | ES2, 2012 },
-
-   /* Vendor extensions */
-   { "GL_3DFX_texture_compression_FXT1",           o(TDFX_texture_compression_FXT1),           GL,             1999 },
-   { "GL_AMD_conservative_depth",                  o(ARB_conservative_depth),                  GL,             2009 },
-   { "GL_AMD_draw_buffers_blend",                  o(ARB_draw_buffers_blend),                  GL,             2009 },
-   { "GL_AMD_performance_monitor",                 o(AMD_performance_monitor),                 GL,             2007 },
-   { "GL_AMD_pinned_memory",                       o(AMD_pinned_memory),                       GL,             2013 },
-   { "GL_AMD_seamless_cubemap_per_texture",        o(AMD_seamless_cubemap_per_texture),        GL,             2009 },
-   { "GL_AMD_shader_stencil_export",               o(ARB_shader_stencil_export),               GL,             2009 },
-   { "GL_AMD_shader_trinary_minmax",               o(dummy_true),                              GL,             2012 },
-   { "GL_AMD_vertex_shader_layer",                 o(AMD_vertex_shader_layer),                 GLC,            2012 },
-   { "GL_AMD_vertex_shader_viewport_index",        o(AMD_vertex_shader_viewport_index),        GLC,            2012 },
-   { "GL_APPLE_object_purgeable",                  o(APPLE_object_purgeable),                  GL,             2006 },
-   { "GL_APPLE_packed_pixels",                     o(dummy_true),                              GLL,            2002 },
-   { "GL_APPLE_texture_max_level",                 o(dummy_true),                                   ES1 | ES2, 2009 },
-   { "GL_APPLE_vertex_array_object",               o(dummy_true),                              GLL,            2002 },
-   { "GL_ATI_blend_equation_separate",             o(EXT_blend_equation_separate),             GL,             2003 },
-   { "GL_ATI_draw_buffers",                        o(dummy_true),                              GLL,            2002 },
-   { "GL_ATI_fragment_shader",                     o(ATI_fragment_shader),                     GLL,            2001 },
-   { "GL_ATI_separate_stencil",                    o(ATI_separate_stencil),                    GLL,            2006 },
-   { "GL_ATI_texture_compression_3dc",             o(ATI_texture_compression_3dc),             GLL,            2004 },
-   { "GL_ATI_texture_env_combine3",                o(ATI_texture_env_combine3),                GLL,            2002 },
-   { "GL_ATI_texture_float",                       o(ARB_texture_float),                       GL,             2002 },
-   { "GL_ATI_texture_mirror_once",                 o(ATI_texture_mirror_once),                 GL,             2006 },
-   { "GL_IBM_multimode_draw_arrays",               o(dummy_true),                              GL,             1998 },
-   { "GL_IBM_rasterpos_clip",                      o(dummy_true),                              GLL,            1996 },
-   { "GL_IBM_texture_mirrored_repeat",             o(dummy_true),                              GLL,            1998 },
-   { "GL_INGR_blend_func_separate",                o(EXT_blend_func_separate),                 GLL,            1999 },
-   { "GL_INTEL_performance_query",                 o(INTEL_performance_query),                       GL | ES2, 2013 },
-   { "GL_MESA_pack_invert",                        o(MESA_pack_invert),                        GL,             2002 },
-   { "GL_MESA_texture_signed_rgba",                o(EXT_texture_snorm),                       GL,             2009 },
-   { "GL_MESA_window_pos",                         o(dummy_true),                              GLL,            2000 },
-   { "GL_MESA_ycbcr_texture",                      o(MESA_ycbcr_texture),                      GL,             2002 },
-   { "GL_NV_blend_square",                         o(dummy_true),                              GLL,            1999 },
-   { "GL_NV_conditional_render",                   o(NV_conditional_render),                   GL,             2008 },
-   { "GL_NV_depth_clamp",                          o(ARB_depth_clamp),                         GL,             2001 },
-   { "GL_NV_draw_buffers",                         o(dummy_true),                                         ES2, 2011 },
-   { "GL_NV_fbo_color_attachments",                o(dummy_true),                                         ES2, 2010 },
-   { "GL_NV_fog_distance",                         o(NV_fog_distance),                         GLL,            2001 },
-   { "GL_NV_fragment_program_option",              o(NV_fragment_program_option),              GLL,            2005 },
-   { "GL_NV_light_max_exponent",                   o(dummy_true),                              GLL,            1999 },
-   { "GL_NV_packed_depth_stencil",                 o(dummy_true),                              GL,             2000 },
-   { "GL_NV_point_sprite",                         o(NV_point_sprite),                         GL,             2001 },
-   { "GL_NV_primitive_restart",                    o(NV_primitive_restart),                    GLL,            2002 },
-   { "GL_NV_read_buffer",                          o(dummy_true),                              ES2,            2011 },
-   { "GL_NV_read_depth",                           o(dummy_true),                              ES2,            2011 },
-   { "GL_NV_read_depth_stencil",                   o(dummy_true),                              ES2,            2011 },
-   { "GL_NV_read_stencil",                         o(dummy_true),                              ES2,            2011 },
-   { "GL_NV_texgen_reflection",                    o(dummy_true),                              GLL,            1999 },
-   { "GL_NV_texture_barrier",                      o(NV_texture_barrier),                      GL,             2009 },
-   { "GL_NV_texture_env_combine4",                 o(NV_texture_env_combine4),                 GLL,            1999 },
-   { "GL_NV_texture_rectangle",                    o(NV_texture_rectangle),                    GLL,            2000 },
-   { "GL_NV_vdpau_interop",                        o(NV_vdpau_interop),                        GL,             2010 },
-   { "GL_S3_s3tc",                                 o(ANGLE_texture_compression_dxt),           GL,             1999 },
-   { "GL_SGIS_generate_mipmap",                    o(dummy_true),                              GLL,            1997 },
-   { "GL_SGIS_texture_border_clamp",               o(ARB_texture_border_clamp),                GLL,            1997 },
-   { "GL_SGIS_texture_edge_clamp",                 o(dummy_true),                              GLL,            1997 },
-   { "GL_SGIS_texture_lod",                        o(dummy_true),                              GLL,            1997 },
-   { "GL_SUN_multi_draw_arrays",                   o(dummy_true),                              GLL,            1999 },
-
-   { 0, 0, 0, 0 },
+const struct mesa_extension _mesa_extension_table[] = {
+#define EXT(name_str, driver_cap, gll_ver, glc_ver, gles_ver, gles2_ver, yyyy) \
+        { .name = "GL_" #name_str, .offset = o(driver_cap), \
+          .version = { \
+            [API_OPENGL_COMPAT] = gll_ver, \
+            [API_OPENGL_CORE]   = glc_ver, \
+            [API_OPENGLES]      = gles_ver, \
+            [API_OPENGLES2]     = gles2_ver, \
+           }, \
+           .year = yyyy \
+        },
+#include "extensions_table.h"
+#undef EXT
 };
 
 
 /**
  * Given an extension name, lookup up the corresponding member of struct
  * gl_extensions and return that member's offset (in bytes).  If the name is
- * not found in the \c extension_table, return 0.
+ * not found in the \c _mesa_extension_table, return 0.
  *
  * \param name Name of extension.
  * \return Offset of member in struct gl_extensions.
@@ -424,14 +80,14 @@ static const struct extension extension_table[] = {
 static size_t
 name_to_offset(const char* name)
 {
-   const struct extension *i;
+   unsigned i;
 
    if (name == 0)
       return 0;
 
-   for (i = extension_table; i->name != 0; ++i) {
-      if (strcmp(name, i->name) == 0)
-	 return i->offset;
+   for (i = 0; i < ARRAY_SIZE(_mesa_extension_table); ++i) {
+      if (strcmp(name, _mesa_extension_table[i].name) == 0)
+	 return _mesa_extension_table[i].offset;
    }
 
    return 0;
@@ -444,15 +100,16 @@ name_to_offset(const char* name)
 static void
 override_extensions_in_context(struct gl_context *ctx)
 {
-   const struct extension *i;
+   unsigned i;
    const GLboolean *enables =
       (GLboolean*) &_mesa_extension_override_enables;
    const GLboolean *disables =
       (GLboolean*) &_mesa_extension_override_disables;
    GLboolean *ctx_ext = (GLboolean*)&ctx->Extensions;
 
-   for (i = extension_table; i->name != 0; ++i) {
-      size_t offset = i->offset;
+   for (i = 0; i < ARRAY_SIZE(_mesa_extension_table); ++i) {
+      size_t offset = _mesa_extension_table[i].offset;
+
       assert(!enables[offset] || !disables[offset]);
       if (enables[offset]) {
          ctx_ext[offset] = 1;
@@ -726,7 +383,6 @@ _mesa_init_extensions(struct gl_extensions *extensions)
 
    /* Then, selectively turn default extensions on. */
    extensions->dummy_true = GL_TRUE;
-   extensions->EXT_texture3D = GL_TRUE;
 }
 
 
@@ -734,18 +390,33 @@ typedef unsigned short extension_index;
 
 
 /**
+ * Given an extension enum, return whether or not the extension is supported
+ * dependent on the following factors:
+ * There's driver support and the OpenGL/ES version is at least that
+ * specified in the _mesa_extension_table.
+ */
+static inline bool
+_mesa_extension_supported(const struct gl_context *ctx, extension_index i)
+{
+   const bool *base = (bool *) &ctx->Extensions;
+   const struct mesa_extension *ext = _mesa_extension_table + i;
+
+   return (ctx->Version >= ext->version[ctx->API]) && base[ext->offset];
+}
+
+/**
  * Compare two entries of the extensions table.  Sorts first by year,
  * then by name.
  *
- * Arguments are indices into extension_table.
+ * Arguments are indices into _mesa_extension_table.
  */
 static int
 extension_compare(const void *p1, const void *p2)
 {
    extension_index i1 = * (const extension_index *) p1;
    extension_index i2 = * (const extension_index *) p2;
-   const struct extension *e1 = &extension_table[i1];
-   const struct extension *e2 = &extension_table[i2];
+   const struct mesa_extension *e1 = &_mesa_extension_table[i1];
+   const struct mesa_extension *e2 = &_mesa_extension_table[i2];
    int res;
 
    res = (int)e1->year - (int)e2->year;
@@ -775,15 +446,9 @@ _mesa_make_extension_string(struct gl_context *ctx)
    extension_index *extension_indices;
    /* String of extra extensions. */
    char *extra_extensions = get_extension_override(ctx);
-   GLboolean *base = (GLboolean *) &ctx->Extensions;
-   const struct extension *i;
+   unsigned k;
    unsigned j;
    unsigned maxYear = ~0;
-   unsigned api_set = (1 << ctx->API);
-   if (_mesa_is_gles3(ctx))
-      api_set |= ES3;
-   if (_mesa_is_gles31(ctx))
-      api_set |= ES31;
 
    /* Check if the MESA_EXTENSION_MAX_YEAR env var is set */
    {
@@ -797,10 +462,11 @@ _mesa_make_extension_string(struct gl_context *ctx)
 
    /* Compute length of the extension string. */
    count = 0;
-   for (i = extension_table; i->name != 0; ++i) {
-      if (base[i->offset] &&
-          i->year <= maxYear &&
-          (i->api_set & api_set)) {
+   for (k = 0; k < ARRAY_SIZE(_mesa_extension_table); ++k) {
+      const struct mesa_extension *i = _mesa_extension_table + k;
+
+      if (i->year <= maxYear &&
+          _mesa_extension_supported(ctx, k)) {
 	 length += strlen(i->name) + 1; /* +1 for space */
 	 ++count;
       }
@@ -827,11 +493,10 @@ _mesa_make_extension_string(struct gl_context *ctx)
     * expect will fit into that buffer.
     */
    j = 0;
-   for (i = extension_table; i->name != 0; ++i) {
-      if (base[i->offset] &&
-          i->year <= maxYear &&
-          (i->api_set & api_set)) {
-         extension_indices[j++] = i - extension_table;
+   for (k = 0; k < ARRAY_SIZE(_mesa_extension_table); ++k) {
+      if (_mesa_extension_table[k].year <= maxYear &&
+         _mesa_extension_supported(ctx, k)) {
+         extension_indices[j++] = k;
       }
    }
    assert(j == count);
@@ -840,8 +505,8 @@ _mesa_make_extension_string(struct gl_context *ctx)
 
    /* Build the extension string.*/
    for (j = 0; j < count; ++j) {
-      i = &extension_table[extension_indices[j]];
-      assert(base[i->offset] && (i->api_set & api_set));
+      const struct mesa_extension *i = &_mesa_extension_table[extension_indices[j]];
+      assert(_mesa_extension_supported(ctx, extension_indices[j]));
       strcat(exts, i->name);
       strcat(exts, " ");
    }
@@ -860,23 +525,15 @@ _mesa_make_extension_string(struct gl_context *ctx)
 GLuint
 _mesa_get_extension_count(struct gl_context *ctx)
 {
-   GLboolean *base;
-   const struct extension *i;
-   unsigned api_set = (1 << ctx->API);
-   if (_mesa_is_gles3(ctx))
-      api_set |= ES3;
-   if (_mesa_is_gles31(ctx))
-      api_set |= ES31;
+   unsigned k;
 
    /* only count once */
    if (ctx->Extensions.Count != 0)
       return ctx->Extensions.Count;
 
-   base = (GLboolean *) &ctx->Extensions;
-   for (i = extension_table; i->name != 0; ++i) {
-      if (base[i->offset] && (i->api_set & api_set)) {
+   for (k = 0; k < ARRAY_SIZE(_mesa_extension_table); ++k) {
+      if (_mesa_extension_supported(ctx, k))
 	 ctx->Extensions.Count++;
-      }
    }
    return ctx->Extensions.Count;
 }
@@ -887,21 +544,13 @@ _mesa_get_extension_count(struct gl_context *ctx)
 const GLubyte *
 _mesa_get_enabled_extension(struct gl_context *ctx, GLuint index)
 {
-   const GLboolean *base;
-   size_t n;
-   const struct extension *i;
-   unsigned api_set = (1 << ctx->API);
-   if (_mesa_is_gles3(ctx))
-      api_set |= ES3;
-   if (_mesa_is_gles31(ctx))
-      api_set |= ES31;
-
-   base = (GLboolean*) &ctx->Extensions;
-   n = 0;
-   for (i = extension_table; i->name != 0; ++i) {
-      if (base[i->offset] && (i->api_set & api_set)) {
+   size_t n = 0;
+   unsigned i;
+
+   for (i = 0; i < ARRAY_SIZE(_mesa_extension_table); ++i) {
+      if (_mesa_extension_supported(ctx, i)) {
          if (n == index)
-            return (const GLubyte*) i->name;
+            return (const GLubyte*) _mesa_extension_table[i].name;
          else
             ++n;
       }
diff --git a/src/mesa/main/extensions.h b/src/mesa/main/extensions.h
index 595512a5d5c..1615e1cc0a4 100644
--- a/src/mesa/main/extensions.h
+++ b/src/mesa/main/extensions.h
@@ -55,6 +55,50 @@ _mesa_get_extension_count(struct gl_context *ctx);
 extern const GLubyte *
 _mesa_get_enabled_extension(struct gl_context *ctx, GLuint index);
 
+
+/**
+ * \brief An element of the \c extension_table.
+ */
+struct mesa_extension {
+   /** Name of extension, such as "GL_ARB_depth_clamp". */
+   const char *name;
+
+   /** Offset (in bytes) of the corresponding member in struct gl_extensions. */
+   size_t offset;
+
+   /** Minimum version the extension requires for the given API
+    * (see gl_api defined in mtypes.h). The value is equal to:
+    * 10 * major_version + minor_version
+    */
+   uint8_t version[API_OPENGL_LAST + 1];
+
+   /** Year the extension was proposed or approved.  Used to sort the 
+    * extension string chronologically. */
+   uint16_t year;
+};
+
+extern const struct mesa_extension _mesa_extension_table[];
+
+
+/* Generate enums for the functions below */
+enum {
+#define EXT(name_str, ...) MESA_EXTENSION_##name_str,
+#include "extensions_table.h"
+#undef EXT
+};
+
+
+/** Checks if the context suports a user-facing extension */
+#define EXT(name_str, driver_cap, ...) \
+static inline bool \
+_mesa_has_##name_str(const struct gl_context *ctx) \
+{ \
+   return ctx->Extensions.driver_cap && (ctx->Extensions.Version >= \
+          _mesa_extension_table[MESA_EXTENSION_##name_str].version[ctx->API]); \
+}
+#include "extensions_table.h"
+#undef EXT
+
 extern struct gl_extensions _mesa_extension_override_enables;
 extern struct gl_extensions _mesa_extension_override_disables;
 
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
new file mode 100644
index 00000000000..d12fd9f1c8d
--- /dev/null
+++ b/src/mesa/main/extensions_table.h
@@ -0,0 +1,335 @@
+#define GLL 0
+#define GLC 0
+#define ES1 0
+#define ES2 0
+#define  x ~0
+EXT(ARB_ES2_compatibility                   , ARB_ES2_compatibility                  , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_ES3_compatibility                   , ARB_ES3_compatibility                  , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_arrays_of_arrays                    , ARB_arrays_of_arrays                   , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_base_instance                       , ARB_base_instance                      , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_blend_func_extended                 , ARB_blend_func_extended                , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_buffer_storage                      , ARB_buffer_storage                     , GLL, GLC,  x ,  x , 2013)
+EXT(ARB_clear_buffer_object                 , dummy_true                             , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_clear_texture                       , ARB_clear_texture                      , GLL, GLC,  x ,  x , 2013)
+EXT(ARB_clip_control                        , ARB_clip_control                       , GLL, GLC,  x ,  x , 2014)
+EXT(ARB_color_buffer_float                  , ARB_color_buffer_float                 , GLL, GLC,  x ,  x , 2004)
+EXT(ARB_compressed_texture_pixel_storage    , dummy_true                             , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_compute_shader                      , ARB_compute_shader                     , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_conditional_render_inverted         , ARB_conditional_render_inverted        , GLL, GLC,  x ,  x , 2014)
+EXT(ARB_copy_buffer                         , dummy_true                             , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_copy_image                          , ARB_copy_image                         , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_conservative_depth                  , ARB_conservative_depth                 , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_debug_output                        , dummy_true                             , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_depth_buffer_float                  , ARB_depth_buffer_float                 , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_depth_clamp                         , ARB_depth_clamp                        , GLL, GLC,  x ,  x , 2003)
+EXT(ARB_depth_texture                       , ARB_depth_texture                      , GLL,  x ,  x ,  x , 2001)
+EXT(ARB_derivative_control                  , ARB_derivative_control                 , GLL, GLC,  x ,  x , 2014)
+EXT(ARB_direct_state_access                 , dummy_true                             ,  x , GLC,  x ,  x , 2014)
+EXT(ARB_draw_buffers                        , dummy_true                             , GLL, GLC,  x ,  x , 2002)
+EXT(ARB_draw_buffers_blend                  , ARB_draw_buffers_blend                 , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_draw_elements_base_vertex           , ARB_draw_elements_base_vertex          , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_draw_indirect                       , ARB_draw_indirect                      ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_draw_instanced                      , ARB_draw_instanced                     , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_enhanced_layouts                    , ARB_enhanced_layouts                   ,  x , GLC,  x ,  x , 2013)
+EXT(ARB_explicit_attrib_location            , ARB_explicit_attrib_location           , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_explicit_uniform_location           , ARB_explicit_uniform_location          , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_fragment_coord_conventions          , ARB_fragment_coord_conventions         , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_fragment_layer_viewport             , ARB_fragment_layer_viewport            ,  x , GLC,  x ,  x , 2012)
+EXT(ARB_fragment_program                    , ARB_fragment_program                   , GLL,  x ,  x ,  x , 2002)
+EXT(ARB_fragment_program_shadow             , ARB_fragment_program_shadow            , GLL,  x ,  x ,  x , 2003)
+EXT(ARB_fragment_shader                     , ARB_fragment_shader                    , GLL, GLC,  x ,  x , 2002)
+EXT(ARB_framebuffer_no_attachments          , ARB_framebuffer_no_attachments         , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_framebuffer_object                  , ARB_framebuffer_object                 , GLL, GLC,  x ,  x , 2005)
+EXT(ARB_framebuffer_sRGB                    , EXT_framebuffer_sRGB                   , GLL, GLC,  x ,  x , 1998)
+EXT(ARB_get_program_binary                  , dummy_true                             , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_get_texture_sub_image               , dummy_true                             , GLL, GLC,  x ,  x , 2014)
+EXT(ARB_gpu_shader5                         , ARB_gpu_shader5                        ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_gpu_shader_fp64                     , ARB_gpu_shader_fp64                    ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_half_float_pixel                    , dummy_true                             , GLL, GLC,  x ,  x , 2003)
+EXT(ARB_half_float_vertex                   , ARB_half_float_vertex                  , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_instanced_arrays                    , ARB_instanced_arrays                   , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_internalformat_query                , ARB_internalformat_query               , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_invalidate_subdata                  , dummy_true                             , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_map_buffer_alignment                , dummy_true                             , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_map_buffer_range                    , ARB_map_buffer_range                   , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_multi_bind                          , dummy_true                             , GLL, GLC,  x ,  x , 2013)
+EXT(ARB_multi_draw_indirect                 , ARB_draw_indirect                      ,  x , GLC,  x ,  x , 2012)
+EXT(ARB_multisample                         , dummy_true                             , GLL,  x ,  x ,  x , 1994)
+EXT(ARB_multitexture                        , dummy_true                             , GLL,  x ,  x ,  x , 1998)
+EXT(ARB_occlusion_query2                    , ARB_occlusion_query2                   , GLL, GLC,  x ,  x , 2003)
+EXT(ARB_occlusion_query                     , ARB_occlusion_query                    , GLL,  x ,  x ,  x , 2001)
+EXT(ARB_pipeline_statistics_query           , ARB_pipeline_statistics_query          , GLL, GLC,  x ,  x , 2014)
+EXT(ARB_pixel_buffer_object                 , EXT_pixel_buffer_object                , GLL, GLC,  x ,  x , 2004)
+EXT(ARB_point_parameters                    , EXT_point_parameters                   , GLL,  x ,  x ,  x , 1997)
+EXT(ARB_point_sprite                        , ARB_point_sprite                       , GLL, GLC,  x ,  x , 2003)
+EXT(ARB_program_interface_query             , dummy_true                             , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_provoking_vertex                    , EXT_provoking_vertex                   , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_robustness                          , dummy_true                             , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_sample_shading                      , ARB_sample_shading                     , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_sampler_objects                     , dummy_true                             , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_seamless_cube_map                   , ARB_seamless_cube_map                  , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_seamless_cubemap_per_texture        , AMD_seamless_cubemap_per_texture       , GLL, GLC,  x ,  x , 2013)
+EXT(ARB_separate_shader_objects             , dummy_true                             , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_shader_atomic_counters              , ARB_shader_atomic_counters             , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_shader_bit_encoding                 , ARB_shader_bit_encoding                , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_shader_clock                        , ARB_shader_clock                       , GLL, GLC,  x ,  x , 2015)
+EXT(ARB_shader_image_load_store             , ARB_shader_image_load_store            , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_shader_image_size                   , ARB_shader_image_size                  , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_shader_objects                      , dummy_true                             , GLL, GLC,  x ,  x , 2002)
+EXT(ARB_shader_precision                    , ARB_shader_precision                   , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_shader_stencil_export               , ARB_shader_stencil_export              , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_shader_storage_buffer_object        , ARB_shader_storage_buffer_object       , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_shader_subroutine                   , ARB_shader_subroutine                  ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_shader_texture_image_samples        , ARB_shader_texture_image_samples       , GLL, GLC,  x ,  x , 2014)
+EXT(ARB_shader_texture_lod                  , ARB_shader_texture_lod                 , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_shading_language_100                , dummy_true                             , GLL,  x ,  x ,  x , 2003)
+EXT(ARB_shading_language_packing            , ARB_shading_language_packing           , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_shading_language_420pack            , ARB_shading_language_420pack           , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_shadow                              , ARB_shadow                             , GLL,  x ,  x ,  x , 2001)
+EXT(ARB_stencil_texturing                   , ARB_stencil_texturing                  , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_sync                                , ARB_sync                               , GLL, GLC,  x ,  x , 2003)
+EXT(ARB_texture_barrier                     , NV_texture_barrier                     , GLL, GLC,  x ,  x , 2014)
+EXT(ARB_tessellation_shader                 , ARB_tessellation_shader                ,  x , GLC,  x ,  x , 2009)
+EXT(ARB_texture_border_clamp                , ARB_texture_border_clamp               , GLL,  x ,  x ,  x , 2000)
+EXT(ARB_texture_buffer_object               , ARB_texture_buffer_object              ,  x , GLC,  x ,  x , 2008)
+EXT(ARB_texture_buffer_object_rgb32         , ARB_texture_buffer_object_rgb32        ,  x , GLC,  x ,  x , 2009)
+EXT(ARB_texture_buffer_range                , ARB_texture_buffer_range               ,  x , GLC,  x ,  x , 2012)
+EXT(ARB_texture_compression                 , dummy_true                             , GLL,  x ,  x ,  x , 2000)
+EXT(ARB_texture_compression_bptc            , ARB_texture_compression_bptc           , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_texture_compression_rgtc            , ARB_texture_compression_rgtc           , GLL, GLC,  x ,  x , 2004)
+EXT(ARB_texture_cube_map                    , ARB_texture_cube_map                   , GLL,  x ,  x ,  x , 1999)
+EXT(ARB_texture_cube_map_array              , ARB_texture_cube_map_array             , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_texture_env_add                     , dummy_true                             , GLL,  x ,  x ,  x , 1999)
+EXT(ARB_texture_env_combine                 , ARB_texture_env_combine                , GLL,  x ,  x ,  x , 2001)
+EXT(ARB_texture_env_crossbar                , ARB_texture_env_crossbar               , GLL,  x ,  x ,  x , 2001)
+EXT(ARB_texture_env_dot3                    , ARB_texture_env_dot3                   , GLL,  x ,  x ,  x , 2001)
+EXT(ARB_texture_float                       , ARB_texture_float                      , GLL, GLC,  x ,  x , 2004)
+EXT(ARB_texture_gather                      , ARB_texture_gather                     , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_texture_mirrored_repeat             , dummy_true                             , GLL,  x ,  x ,  x , 2001)
+EXT(ARB_texture_mirror_clamp_to_edge        , ARB_texture_mirror_clamp_to_edge       , GLL, GLC,  x ,  x , 2013)
+EXT(ARB_texture_multisample                 , ARB_texture_multisample                , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_texture_non_power_of_two            , ARB_texture_non_power_of_two           , GLL, GLC,  x ,  x , 2003)
+EXT(ARB_texture_query_levels                , ARB_texture_query_levels               , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_texture_query_lod                   , ARB_texture_query_lod                  , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_texture_rectangle                   , NV_texture_rectangle                   , GLL, GLC,  x ,  x , 2004)
+EXT(ARB_texture_rgb10_a2ui                  , ARB_texture_rgb10_a2ui                 , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_texture_rg                          , ARB_texture_rg                         , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_texture_stencil8                    , ARB_texture_stencil8                   , GLL, GLC,  x ,  x , 2013)
+EXT(ARB_texture_storage                     , dummy_true                             , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_texture_storage_multisample         , ARB_texture_multisample                , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_texture_view                        , ARB_texture_view                       , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_texture_swizzle                     , EXT_texture_swizzle                    , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_timer_query                         , ARB_timer_query                        , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_transform_feedback2                 , ARB_transform_feedback2                , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_transform_feedback3                 , ARB_transform_feedback3                , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_transform_feedback_instanced        , ARB_transform_feedback_instanced       , GLL, GLC,  x ,  x , 2011)
+EXT(ARB_transpose_matrix                    , dummy_true                             , GLL,  x ,  x ,  x , 1999)
+EXT(ARB_uniform_buffer_object               , ARB_uniform_buffer_object              , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_vertex_array_bgra                   , EXT_vertex_array_bgra                  , GLL, GLC,  x ,  x , 2008)
+EXT(ARB_vertex_array_object                 , dummy_true                             , GLL, GLC,  x ,  x , 2006)
+EXT(ARB_vertex_attrib_binding               , dummy_true                             , GLL, GLC,  x ,  x , 2012)
+EXT(ARB_vertex_buffer_object                , dummy_true                             , GLL,  x ,  x ,  x , 2003)
+EXT(ARB_vertex_program                      , ARB_vertex_program                     , GLL,  x ,  x ,  x , 2002)
+EXT(ARB_vertex_shader                       , ARB_vertex_shader                      , GLL, GLC,  x ,  x , 2002)
+EXT(ARB_vertex_attrib_64bit                 , ARB_vertex_attrib_64bit                ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_vertex_type_10f_11f_11f_rev         , ARB_vertex_type_10f_11f_11f_rev        , GLL, GLC,  x ,  x , 2013)
+EXT(ARB_vertex_type_2_10_10_10_rev          , ARB_vertex_type_2_10_10_10_rev         , GLL, GLC,  x ,  x , 2009)
+EXT(ARB_viewport_array                      , ARB_viewport_array                     ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_window_pos                          , dummy_true                             , GLL,  x ,  x ,  x , 2001)
+
+EXT(EXT_abgr                                , dummy_true                             , GLL, GLC,  x ,  x , 1995)
+EXT(EXT_bgra                                , dummy_true                             , GLL,  x ,  x ,  x , 1995)
+EXT(EXT_blend_color                         , EXT_blend_color                        , GLL,  x ,  x ,  x , 1995)
+EXT(EXT_blend_equation_separate             , EXT_blend_equation_separate            , GLL, GLC,  x ,  x , 2003)
+EXT(EXT_blend_func_separate                 , EXT_blend_func_separate                , GLL,  x ,  x ,  x , 1999)
+EXT(EXT_buffer_storage                      , ARB_buffer_storage                     ,  x ,  x ,  x ,  31, 2015)
+EXT(EXT_discard_framebuffer                 , dummy_true                             ,  x ,  x , ES1, ES2, 2009)
+EXT(EXT_blend_minmax                        , EXT_blend_minmax                       , GLL,  x , ES1, ES2, 1995)
+EXT(EXT_blend_subtract                      , dummy_true                             , GLL,  x ,  x ,  x , 1995)
+EXT(EXT_compiled_vertex_array               , dummy_true                             , GLL,  x ,  x ,  x , 1996)
+EXT(EXT_copy_texture                        , dummy_true                             , GLL,  x ,  x ,  x , 1995)
+EXT(EXT_depth_bounds_test                   , EXT_depth_bounds_test                  , GLL, GLC,  x ,  x , 2002)
+EXT(EXT_draw_buffers                        , dummy_true                             ,  x ,  x ,  x , ES2, 2012)
+EXT(EXT_draw_buffers2                       , EXT_draw_buffers2                      , GLL, GLC,  x ,  x , 2006)
+EXT(EXT_draw_elements_base_vertex           , ARB_draw_elements_base_vertex          ,  x ,  x ,  x , ES2, 2014)
+EXT(EXT_draw_instanced                      , ARB_draw_instanced                     , GLL, GLC,  x ,  x , 2006)
+EXT(EXT_draw_range_elements                 , dummy_true                             , GLL,  x ,  x ,  x , 1997)
+EXT(EXT_fog_coord                           , dummy_true                             , GLL,  x ,  x ,  x , 1999)
+EXT(EXT_framebuffer_blit                    , dummy_true                             , GLL, GLC,  x ,  x , 2005)
+EXT(EXT_framebuffer_multisample             , EXT_framebuffer_multisample            , GLL, GLC,  x ,  x , 2005)
+EXT(EXT_framebuffer_multisample_blit_scaled , EXT_framebuffer_multisample_blit_scaled, GLL, GLC,  x ,  x , 2011)
+EXT(EXT_framebuffer_object                  , dummy_true                             , GLL,  x ,  x ,  x , 2000)
+EXT(EXT_framebuffer_sRGB                    , EXT_framebuffer_sRGB                   , GLL, GLC,  x ,  x , 1998)
+EXT(EXT_gpu_program_parameters              , EXT_gpu_program_parameters             , GLL,  x ,  x ,  x , 2006)
+EXT(EXT_gpu_shader4                         , EXT_gpu_shader4                        , GLL, GLC,  x ,  x , 2006)
+EXT(EXT_map_buffer_range                    , ARB_map_buffer_range                   ,  x ,  x , ES1, ES2, 2012)
+EXT(EXT_multi_draw_arrays                   , dummy_true                             , GLL,  x , ES1, ES2, 1999)
+EXT(EXT_packed_depth_stencil                , dummy_true                             , GLL, GLC,  x ,  x , 2005)
+EXT(EXT_packed_float                        , EXT_packed_float                       , GLL, GLC,  x ,  x , 2004)
+EXT(EXT_packed_pixels                       , dummy_true                             , GLL,  x ,  x ,  x , 1997)
+EXT(EXT_pixel_buffer_object                 , EXT_pixel_buffer_object                , GLL, GLC,  x ,  x , 2004)
+EXT(EXT_point_parameters                    , EXT_point_parameters                   , GLL,  x ,  x ,  x , 1997)
+EXT(EXT_polygon_offset                      , dummy_true                             , GLL,  x ,  x ,  x , 1995)
+EXT(EXT_polygon_offset_clamp                , EXT_polygon_offset_clamp               , GLL, GLC,  x ,  x , 2014)
+EXT(EXT_provoking_vertex                    , EXT_provoking_vertex                   , GLL, GLC,  x ,  x , 2009)
+EXT(EXT_rescale_normal                      , dummy_true                             , GLL,  x ,  x ,  x , 1997)
+EXT(EXT_secondary_color                     , dummy_true                             , GLL,  x ,  x ,  x , 1999)
+EXT(EXT_separate_shader_objects             , dummy_true                             ,  x ,  x ,  x , ES2, 2013)
+EXT(EXT_separate_specular_color             , dummy_true                             , GLL,  x ,  x ,  x , 1997)
+EXT(EXT_shader_integer_mix                  , EXT_shader_integer_mix                 , GLL, GLC, ES1,  30, 2013)
+EXT(EXT_shadow_funcs                        , ARB_shadow                             , GLL,  x ,  x ,  x , 2002)
+EXT(EXT_stencil_two_side                    , EXT_stencil_two_side                   , GLL,  x ,  x ,  x , 2001)
+EXT(EXT_stencil_wrap                        , dummy_true                             , GLL,  x ,  x ,  x , 2002)
+EXT(EXT_subtexture                          , dummy_true                             , GLL,  x ,  x ,  x , 1995)
+EXT(EXT_texture3D                           , dummy_true                             , GLL,  x ,  x ,  x , 1996)
+EXT(EXT_texture_array                       , EXT_texture_array                      , GLL, GLC,  x ,  x , 2006)
+EXT(EXT_texture_compression_dxt1            , ANGLE_texture_compression_dxt          , GLL, GLC, ES1, ES2, 2004)
+EXT(ANGLE_texture_compression_dxt3          , ANGLE_texture_compression_dxt          , GLL, GLC, ES1, ES2, 2011)
+EXT(ANGLE_texture_compression_dxt5          , ANGLE_texture_compression_dxt          , GLL, GLC, ES1, ES2, 2011)
+EXT(EXT_texture_compression_latc            , EXT_texture_compression_latc           , GLL,  x ,  x ,  x , 2006)
+EXT(EXT_texture_compression_rgtc            , ARB_texture_compression_rgtc           , GLL, GLC,  x ,  x , 2004)
+EXT(EXT_texture_compression_s3tc            , EXT_texture_compression_s3tc           , GLL, GLC,  x ,  x , 2000)
+EXT(EXT_texture_cube_map                    , ARB_texture_cube_map                   , GLL,  x ,  x ,  x , 2001)
+EXT(EXT_texture_edge_clamp                  , dummy_true                             , GLL,  x ,  x ,  x , 1997)
+EXT(EXT_texture_env_add                     , dummy_true                             , GLL,  x ,  x ,  x , 1999)
+EXT(EXT_texture_env_combine                 , dummy_true                             , GLL,  x ,  x ,  x , 2000)
+EXT(EXT_texture_env_dot3                    , EXT_texture_env_dot3                   , GLL,  x ,  x ,  x , 2000)
+EXT(EXT_texture_filter_anisotropic          , EXT_texture_filter_anisotropic         , GLL, GLC, ES1, ES2, 1999)
+EXT(EXT_texture_format_BGRA8888             , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(EXT_texture_rg                          , ARB_texture_rg                         ,  x ,  x ,  x , ES2, 2011)
+EXT(EXT_read_format_bgra                    , dummy_true                             ,  x ,  x , ES1, ES2, 2009)
+EXT(EXT_texture_integer                     , EXT_texture_integer                    , GLL, GLC,  x ,  x , 2006)
+EXT(EXT_texture_lod_bias                    , dummy_true                             , GLL,  x , ES1,  x , 1999)
+EXT(EXT_texture_mirror_clamp                , EXT_texture_mirror_clamp               , GLL, GLC,  x ,  x , 2004)
+EXT(EXT_texture_object                      , dummy_true                             , GLL,  x ,  x ,  x , 1995)
+EXT(EXT_texture                             , dummy_true                             , GLL,  x ,  x ,  x , 1996)
+EXT(EXT_texture_rectangle                   , NV_texture_rectangle                   , GLL,  x ,  x ,  x , 2004)
+EXT(EXT_texture_shared_exponent             , EXT_texture_shared_exponent            , GLL, GLC,  x ,  x , 2004)
+EXT(EXT_texture_snorm                       , EXT_texture_snorm                      , GLL, GLC,  x ,  x , 2009)
+EXT(EXT_texture_sRGB                        , EXT_texture_sRGB                       , GLL, GLC,  x ,  x , 2004)
+EXT(EXT_texture_sRGB_decode                 , EXT_texture_sRGB_decode                , GLL, GLC,  x ,  x , 2006)
+EXT(EXT_texture_swizzle                     , EXT_texture_swizzle                    , GLL, GLC,  x ,  x , 2008)
+EXT(EXT_texture_type_2_10_10_10_REV         , dummy_true                             ,  x ,  x ,  x , ES2, 2008)
+EXT(EXT_timer_query                         , EXT_timer_query                        , GLL, GLC,  x ,  x , 2006)
+EXT(EXT_transform_feedback                  , EXT_transform_feedback                 , GLL, GLC,  x ,  x , 2011)
+EXT(EXT_unpack_subimage                     , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
+EXT(EXT_vertex_array_bgra                   , EXT_vertex_array_bgra                  , GLL, GLC,  x ,  x , 2008)
+EXT(EXT_vertex_array                        , dummy_true                             , GLL,  x ,  x ,  x , 1995)
+EXT(EXT_color_buffer_float                  , dummy_true                             ,  x ,  x , ES1,  30, 2013)
+
+
+EXT(OES_blend_equation_separate             , EXT_blend_equation_separate            ,  x ,  x , ES1,  x , 2009)
+EXT(OES_blend_func_separate                 , EXT_blend_func_separate                ,  x ,  x , ES1,  x , 2009)
+EXT(OES_blend_subtract                      , dummy_true                             ,  x ,  x , ES1,  x , 2009)
+EXT(OES_byte_coordinates                    , dummy_true                             ,  x ,  x , ES1,  x , 2002)
+EXT(OES_compressed_ETC1_RGB8_texture        , OES_compressed_ETC1_RGB8_texture       ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_compressed_paletted_texture         , dummy_true                             ,  x ,  x , ES1,  x , 2003)
+EXT(OES_depth24                             , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_depth32                             , dummy_false                            ,  x ,  x ,  x ,  x , 2005)
+EXT(OES_depth_texture                       , ARB_depth_texture                      ,  x ,  x ,  x , ES2, 2006)
+EXT(OES_depth_texture_cube_map              , OES_depth_texture_cube_map             ,  x ,  x ,  x , ES2, 2012)
+EXT(OES_draw_elements_base_vertex           , ARB_draw_elements_base_vertex          ,  x ,  x ,  x , ES2, 2014)
+EXT(OES_draw_texture                        , OES_draw_texture                       ,  x ,  x , ES1,  x , 2004)
+EXT(OES_EGL_sync                            , dummy_true                             ,  x ,  x , ES1, ES2, 2010)
+EXT(OES_EGL_image                           , OES_EGL_image                          , GLL, GLC, ES1, ES2, 2006) /* FIXME: Mesa expects GL_OES_EGL_image to be available in OpenGL contexts. */
+EXT(OES_EGL_image_external                  , OES_EGL_image_external                 ,  x ,  x , ES1, ES2, 2010)
+EXT(OES_element_index_uint                  , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_fbo_render_mipmap                   , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_fixed_point                         , dummy_true                             ,  x ,  x , ES1,  x , 2002)
+EXT(OES_framebuffer_object                  , dummy_true                             ,  x ,  x , ES1,  x , 2005)
+EXT(OES_get_program_binary                  , dummy_true                             ,  x ,  x ,  x , ES2, 2008)
+EXT(OES_mapbuffer                           , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_packed_depth_stencil                , dummy_true                             ,  x ,  x , ES1, ES2, 2007)
+EXT(OES_point_size_array                    , dummy_true                             ,  x ,  x , ES1,  x , 2004)
+EXT(OES_point_sprite                        , ARB_point_sprite                       ,  x ,  x , ES1,  x , 2004)
+EXT(OES_query_matrix                        , dummy_true                             ,  x ,  x , ES1,  x , 2003)
+EXT(OES_read_format                         , dummy_true                             , GLL, GLC, ES1,  x , 2003)
+EXT(OES_rgb8_rgba8                          , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_single_precision                    , dummy_true                             ,  x ,  x , ES1,  x , 2003)
+EXT(OES_standard_derivatives                , OES_standard_derivatives               ,  x ,  x ,  x , ES2, 2005)
+EXT(OES_stencil1                            , dummy_false                            ,  x ,  x ,  x ,  x , 2005)
+EXT(OES_stencil4                            , dummy_false                            ,  x ,  x ,  x ,  x , 2005)
+EXT(OES_stencil8                            , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_stencil_wrap                        , dummy_true                             ,  x ,  x , ES1,  x , 2002)
+EXT(OES_surfaceless_context                 , dummy_true                             ,  x ,  x , ES1, ES2, 2012)
+EXT(OES_texture_3D                          , dummy_true                             ,  x ,  x ,  x , ES2, 2005)
+EXT(OES_texture_cube_map                    , ARB_texture_cube_map                   ,  x ,  x , ES1,  x , 2007)
+EXT(OES_texture_env_crossbar                , ARB_texture_env_crossbar               ,  x ,  x , ES1,  x , 2005)
+EXT(OES_texture_float                       , OES_texture_float                      ,  x ,  x ,  x , ES2, 2005)
+EXT(OES_texture_float_linear                , OES_texture_float_linear               ,  x ,  x ,  x , ES2, 2005)
+EXT(OES_texture_half_float                  , OES_texture_half_float                 ,  x ,  x ,  x , ES2, 2005)
+EXT(OES_texture_half_float_linear           , OES_texture_half_float_linear          ,  x ,  x ,  x , ES2, 2005)
+EXT(OES_texture_mirrored_repeat             , dummy_true                             ,  x ,  x , ES1,  x , 2005)
+EXT(OES_texture_storage_multisample_2d_array, ARB_texture_multisample                ,  x ,  x , ES1,  31, 2014)
+EXT(OES_texture_npot                        , ARB_texture_non_power_of_two           ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_vertex_array_object                 , dummy_true                             ,  x ,  x , ES1, ES2, 2010)
+
+
+EXT(KHR_debug                               , dummy_true                             , GLL, GLC,  x ,  x , 2012)
+EXT(KHR_context_flush_control               , dummy_true                             , GLL, GLC,  x , ES2, 2014)
+EXT(KHR_texture_compression_astc_hdr        , KHR_texture_compression_astc_hdr       , GLL, GLC,  x , ES2, 2012)
+EXT(KHR_texture_compression_astc_ldr        , KHR_texture_compression_astc_ldr       , GLL, GLC,  x , ES2, 2012)
+
+
+EXT(3DFX_texture_compression_FXT1           , TDFX_texture_compression_FXT1          , GLL, GLC,  x ,  x , 1999)
+EXT(AMD_conservative_depth                  , ARB_conservative_depth                 , GLL, GLC,  x ,  x , 2009)
+EXT(AMD_draw_buffers_blend                  , ARB_draw_buffers_blend                 , GLL, GLC,  x ,  x , 2009)
+EXT(AMD_performance_monitor                 , AMD_performance_monitor                , GLL, GLC,  x ,  x , 2007)
+EXT(AMD_pinned_memory                       , AMD_pinned_memory                      , GLL, GLC,  x ,  x , 2013)
+EXT(AMD_seamless_cubemap_per_texture        , AMD_seamless_cubemap_per_texture       , GLL, GLC,  x ,  x , 2009)
+EXT(AMD_shader_stencil_export               , ARB_shader_stencil_export              , GLL, GLC,  x ,  x , 2009)
+EXT(AMD_shader_trinary_minmax               , dummy_true                             , GLL, GLC,  x ,  x , 2012)
+EXT(AMD_vertex_shader_layer                 , AMD_vertex_shader_layer                ,  x , GLC,  x ,  x , 2012)
+EXT(AMD_vertex_shader_viewport_index        , AMD_vertex_shader_viewport_index       ,  x , GLC,  x ,  x , 2012)
+EXT(APPLE_object_purgeable                  , APPLE_object_purgeable                 , GLL, GLC,  x ,  x , 2006)
+EXT(APPLE_packed_pixels                     , dummy_true                             , GLL,  x ,  x ,  x , 2002)
+EXT(APPLE_texture_max_level                 , dummy_true                             ,  x ,  x , ES1, ES2, 2009)
+EXT(APPLE_vertex_array_object               , dummy_true                             , GLL,  x ,  x ,  x , 2002)
+EXT(ATI_blend_equation_separate             , EXT_blend_equation_separate            , GLL, GLC,  x ,  x , 2003)
+EXT(ATI_draw_buffers                        , dummy_true                             , GLL,  x ,  x ,  x , 2002)
+EXT(ATI_fragment_shader                     , ATI_fragment_shader                    , GLL,  x ,  x ,  x , 2001)
+EXT(ATI_separate_stencil                    , ATI_separate_stencil                   , GLL,  x ,  x ,  x , 2006)
+EXT(ATI_texture_compression_3dc             , ATI_texture_compression_3dc            , GLL,  x ,  x ,  x , 2004)
+EXT(ATI_texture_env_combine3                , ATI_texture_env_combine3               , GLL,  x ,  x ,  x , 2002)
+EXT(ATI_texture_float                       , ARB_texture_float                      , GLL, GLC,  x ,  x , 2002)
+EXT(ATI_texture_mirror_once                 , ATI_texture_mirror_once                , GLL, GLC,  x ,  x , 2006)
+EXT(IBM_multimode_draw_arrays               , dummy_true                             , GLL, GLC,  x ,  x , 1998)
+EXT(IBM_rasterpos_clip                      , dummy_true                             , GLL,  x ,  x ,  x , 1996)
+EXT(IBM_texture_mirrored_repeat             , dummy_true                             , GLL,  x ,  x ,  x , 1998)
+EXT(INGR_blend_func_separate                , EXT_blend_func_separate                , GLL,  x ,  x ,  x , 1999)
+EXT(INTEL_performance_query                 , INTEL_performance_query                , GLL, GLC,  x , ES2, 2013)
+EXT(MESA_pack_invert                        , MESA_pack_invert                       , GLL, GLC,  x ,  x , 2002)
+EXT(MESA_texture_signed_rgba                , EXT_texture_snorm                      , GLL, GLC,  x ,  x , 2009)
+EXT(MESA_window_pos                         , dummy_true                             , GLL,  x ,  x ,  x , 2000)
+EXT(MESA_ycbcr_texture                      , MESA_ycbcr_texture                     , GLL, GLC,  x ,  x , 2002)
+EXT(NV_blend_square                         , dummy_true                             , GLL,  x ,  x ,  x , 1999)
+EXT(NV_conditional_render                   , NV_conditional_render                  , GLL, GLC,  x ,  x , 2008)
+EXT(NV_depth_clamp                          , ARB_depth_clamp                        , GLL, GLC,  x ,  x , 2001)
+EXT(NV_draw_buffers                         , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
+EXT(NV_fbo_color_attachments                , dummy_true                             ,  x ,  x ,  x , ES2, 2010)
+EXT(NV_fog_distance                         , NV_fog_distance                        , GLL,  x ,  x ,  x , 2001)
+EXT(NV_fragment_program_option              , NV_fragment_program_option             , GLL,  x ,  x ,  x , 2005)
+EXT(NV_light_max_exponent                   , dummy_true                             , GLL,  x ,  x ,  x , 1999)
+EXT(NV_packed_depth_stencil                 , dummy_true                             , GLL, GLC,  x ,  x , 2000)
+EXT(NV_point_sprite                         , NV_point_sprite                        , GLL, GLC,  x ,  x , 2001)
+EXT(NV_primitive_restart                    , NV_primitive_restart                   , GLL,  x ,  x ,  x , 2002)
+EXT(NV_read_buffer                          , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
+EXT(NV_read_depth                           , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
+EXT(NV_read_depth_stencil                   , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
+EXT(NV_read_stencil                         , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
+EXT(NV_texgen_reflection                    , dummy_true                             , GLL,  x ,  x ,  x , 1999)
+EXT(NV_texture_barrier                      , NV_texture_barrier                     , GLL, GLC,  x ,  x , 2009)
+EXT(NV_texture_env_combine4                 , NV_texture_env_combine4                , GLL,  x ,  x ,  x , 1999)
+EXT(NV_texture_rectangle                    , NV_texture_rectangle                   , GLL,  x ,  x ,  x , 2000)
+EXT(NV_vdpau_interop                        , NV_vdpau_interop                       , GLL, GLC,  x ,  x , 2010)
+EXT(S3_s3tc                                 , ANGLE_texture_compression_dxt          , GLL, GLC,  x ,  x , 1999)
+EXT(SGIS_generate_mipmap                    , dummy_true                             , GLL,  x ,  x ,  x , 1997)
+EXT(SGIS_texture_border_clamp               , ARB_texture_border_clamp               , GLL,  x ,  x ,  x , 1997)
+EXT(SGIS_texture_edge_clamp                 , dummy_true                             , GLL,  x ,  x ,  x , 1997)
+EXT(SGIS_texture_lod                        , dummy_true                             , GLL,  x ,  x ,  x , 1997)
+EXT(SUN_multi_draw_arrays                   , dummy_true                             , GLL,  x ,  x ,  x , 1999)
+#undef GLL
+#undef GLC
+#undef ES1
+#undef ES2
+#undef  x
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 02dd257d79d..95cbba4ed57 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2721,13 +2721,14 @@ struct gl_shader_program
    struct gl_uniform_block **ShaderStorageBlocks;
 
    /**
-    * Indices into the _LinkedShaders's UniformBlocks[] array for each stage
-    * they're used in, or -1.
+    * Indices into the BufferInterfaceBlocks[] array for each stage they're
+    * used in, or -1.
     *
-    * This is used to maintain the Binding values of the stage's UniformBlocks[]
-    * and to answer the GL_UNIFORM_BLOCK_REFERENCED_BY_*_SHADER queries.
+    * This is used to maintain the Binding values of the stage's
+    * BufferInterfaceBlocks[] and to answer the
+    * GL_UNIFORM_BLOCK_REFERENCED_BY_*_SHADER queries.
     */
-   int *UniformBlockStageIndex[MESA_SHADER_STAGES];
+   int *InterfaceBlockStageIndex[MESA_SHADER_STAGES];
 
    /**
     * Map of active uniform names to locations
@@ -2879,6 +2880,8 @@ struct gl_shader_compiler_options
     */
    GLboolean OptimizeForAOS;
 
+   GLboolean LowerBufferInterfaceBlocks; /**< Lower UBO and SSBO access to intrinsics. */
+
    const struct nir_shader_compiler_options *NirOptions;
 };
 
@@ -3582,11 +3585,24 @@ struct gl_constants
     * below:
     *    SampleMap8x = {a, b, c, d, e, f, g, h};
     *
-    * Follow the logic for other sample counts.
+    * Follow the logic for sample counts 2-8.
+    *
+    * For 16x the sample indices layout as a 4x4 grid as follows:
+    *
+    *            -----------------
+    *            | 0 | 1 | 2 | 3 |
+    *            -----------------
+    *            | 4 | 5 | 6 | 7 |
+    *            -----------------
+    *            | 8 | 9 |10 |11 |
+    *            -----------------
+    *            |12 |13 |14 |15 |
+    *            -----------------
     */
    uint8_t SampleMap2x[2];
    uint8_t SampleMap4x[4];
    uint8_t SampleMap8x[8];
+   uint8_t SampleMap16x[16];
 
    /** GL_ARB_shader_atomic_counters */
    GLuint MaxAtomicBufferBindings;
@@ -3667,6 +3683,7 @@ struct gl_extensions
    GLboolean ARB_fragment_shader;
    GLboolean ARB_framebuffer_no_attachments;
    GLboolean ARB_framebuffer_object;
+   GLboolean ARB_enhanced_layouts;
    GLboolean ARB_explicit_attrib_location;
    GLboolean ARB_explicit_uniform_location;
    GLboolean ARB_geometry_shader4;
@@ -3750,7 +3767,6 @@ struct gl_extensions
    GLboolean EXT_provoking_vertex;
    GLboolean EXT_shader_integer_mix;
    GLboolean EXT_stencil_two_side;
-   GLboolean EXT_texture3D;
    GLboolean EXT_texture_array;
    GLboolean EXT_texture_compression_latc;
    GLboolean EXT_texture_compression_s3tc;
@@ -3808,6 +3824,12 @@ struct gl_extensions
    const GLubyte *String;
    /** Number of supported extensions */
    GLuint Count;
+   /**
+    * The context version which extension helper functions compare against.
+    * By default, the value is equal to ctx->Version. This changes to ~0
+    * while meta is in progress.
+    */
+   GLubyte Version;
 };
 
 
diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index 699a2ae47eb..90dff13485b 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -907,6 +907,21 @@ _mesa_ValidateProgramPipeline(GLuint pipeline)
 
    _mesa_validate_program_pipeline(ctx, pipe,
                                    (ctx->_Shader->Name == pipe->Name));
+
+   /* Validate inputs against outputs, this cannot be done during linking
+    * since programs have been linked separately from each other.
+    *
+    * From OpenGL 4.5 Core spec:
+    *     "Separable program objects may have validation failures that cannot be
+    *     detected without the complete program pipeline. Mismatched interfaces,
+    *     improper usage of program objects together, and the same
+    *     state-dependent failures can result in validation errors for such
+    *     program objects."
+    *
+    * OpenGL ES 3.1 specification has the same text.
+    */
+   if (!_mesa_validate_pipeline_io(pipe))
+      pipe->Validated = GL_FALSE;
 }
 
 void GLAPIENTRY
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index dd51bba3386..58ba04153e6 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -980,7 +980,7 @@ is_resource_referenced(struct gl_shader_program *shProg,
       return RESOURCE_ATC(res)->StageReferences[stage];
 
    if (res->Type == GL_UNIFORM_BLOCK || res->Type == GL_SHADER_STORAGE_BLOCK)
-      return shProg->UniformBlockStageIndex[stage][index] != -1;
+      return shProg->InterfaceBlockStageIndex[stage][index] != -1;
 
    return res->StageReferences & (1 << stage);
 }
@@ -1359,3 +1359,65 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg,
    if (length)
       *length = amount;
 }
+
+static bool
+validate_io(const struct gl_shader *input_stage,
+            const struct gl_shader *output_stage)
+{
+   assert(input_stage && output_stage);
+
+   /* For each output in a, find input in b and do any required checks. */
+   foreach_in_list(ir_instruction, out, input_stage->ir) {
+      ir_variable *out_var = out->as_variable();
+      if (!out_var || out_var->data.mode != ir_var_shader_out)
+         continue;
+
+      foreach_in_list(ir_instruction, in, output_stage->ir) {
+         ir_variable *in_var = in->as_variable();
+         if (!in_var || in_var->data.mode != ir_var_shader_in)
+            continue;
+
+         if (strcmp(in_var->name, out_var->name) == 0) {
+            /* From OpenGL ES 3.1 spec:
+             *     "When both shaders are in separate programs, mismatched
+             *     precision qualifiers will result in a program interface
+             *     mismatch that will result in program pipeline validation
+             *     failures, as described in section 7.4.1 (“Shader Interface
+             *     Matching”) of the OpenGL ES 3.1 Specification."
+             */
+            if (in_var->data.precision != out_var->data.precision)
+               return false;
+         }
+      }
+   }
+   return true;
+}
+
+/**
+ * Validate inputs against outputs in a program pipeline.
+ */
+extern "C" bool
+_mesa_validate_pipeline_io(struct gl_pipeline_object *pipeline)
+{
+   struct gl_shader_program **shProg =
+      (struct gl_shader_program **) pipeline->CurrentProgram;
+
+   /* Find first active stage in pipeline. */
+   unsigned idx, prev = 0;
+   for (idx = 0; idx < ARRAY_SIZE(pipeline->CurrentProgram); idx++) {
+      if (shProg[idx]) {
+         prev = idx;
+         break;
+      }
+   }
+
+   for (idx = prev + 1; idx < ARRAY_SIZE(pipeline->CurrentProgram); idx++) {
+      if (shProg[idx]) {
+         if (!validate_io(shProg[prev]->_LinkedShaders[prev],
+                          shProg[idx]->_LinkedShaders[idx]))
+            return false;
+         prev = idx;
+      }
+   }
+   return true;
+}
diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c
index ffc71931fec..203ccef7fc4 100644
--- a/src/mesa/main/shaderobj.c
+++ b/src/mesa/main/shaderobj.c
@@ -294,8 +294,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
    shProg->BufferInterfaceBlocks = NULL;
    shProg->NumBufferInterfaceBlocks = 0;
    for (i = 0; i < MESA_SHADER_STAGES; i++) {
-      ralloc_free(shProg->UniformBlockStageIndex[i]);
-      shProg->UniformBlockStageIndex[i] = NULL;
+      ralloc_free(shProg->InterfaceBlockStageIndex[i]);
+      shProg->InterfaceBlockStageIndex[i] = NULL;
    }
 
    ralloc_free(shProg->AtomicBuffers);
diff --git a/src/mesa/main/shaderobj.h b/src/mesa/main/shaderobj.h
index 796de470735..be80752d7f2 100644
--- a/src/mesa/main/shaderobj.h
+++ b/src/mesa/main/shaderobj.h
@@ -234,6 +234,9 @@ _mesa_shader_stage_to_subroutine_uniform(gl_shader_stage stage)
    }
 }
 
+extern bool
+_mesa_validate_pipeline_io(struct gl_pipeline_object *);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index ac2d2332df8..abe0f432572 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2506,5 +2506,8 @@ const struct function gles31_functions_possible[] = {
    /* GL_OES_texture_storage_multisample_2d_array */
    { "glTexStorage3DMultisampleOES", 31, -1 },
 
+   /* GL_EXT_buffer_storage */
+   { "glBufferStorageEXT", 31, -1 },
+
    { NULL, 0, -1 },
  };
diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c
index cb147fac476..9d88554d945 100644
--- a/src/mesa/main/texstate.c
+++ b/src/mesa/main/texstate.c
@@ -330,7 +330,8 @@ _mesa_ClientActiveTexture(GLenum texture)
       return;
 
    if (texUnit >= ctx->Const.MaxTextureCoordUnits) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture)");
+      _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture=%s)",
+                  _mesa_enum_to_string(texture));
       return;
    }
 
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index bc235380d97..758ca2456df 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1026,7 +1026,7 @@ _mesa_UniformBlockBinding(GLuint program,
       shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding = uniformBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index = shProg->UniformBlockStageIndex[i][uniformBlockIndex];
+	 int stage_index = shProg->InterfaceBlockStageIndex[i][uniformBlockIndex];
 
 	 if (stage_index != -1) {
 	    struct gl_shader *sh = shProg->_LinkedShaders[i];
@@ -1079,7 +1079,7 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
       shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding;
 
       for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index = shProg->UniformBlockStageIndex[i][shaderStorageBlockIndex];
+	 int stage_index = shProg->InterfaceBlockStageIndex[i][shaderStorageBlockIndex];
 
 	 if (stage_index != -1) {
 	    struct gl_shader *sh = shProg->_LinkedShaders[i];
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 5635a643200..314b26dc74f 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -524,6 +524,7 @@ _mesa_compute_version(struct gl_context *ctx)
       return;
 
    ctx->Version = _mesa_get_version(&ctx->Extensions, &ctx->Const, ctx->API);
+   ctx->Extensions.Version = ctx->Version;
 
    /* Make sure that the GLSL version lines up with the GL version. In some
     * cases it can be too high, e.g. if an extension is missing.
diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c
index 8afd336779f..5d20b26d26e 100644
--- a/src/mesa/state_tracker/st_cb_bufferobjects.c
+++ b/src/mesa/state_tracker/st_cb_bufferobjects.c
@@ -83,6 +83,7 @@ st_bufferobj_free(struct gl_context *ctx, struct gl_buffer_object *obj)
    if (st_obj->buffer)
       pipe_resource_reference(&st_obj->buffer, NULL);
 
+   mtx_destroy(&st_obj->Base.Mutex);
    free(st_obj->Base.Label);
    free(st_obj);
 }
diff --git a/src/mesa/state_tracker/st_cb_copyimage.c b/src/mesa/state_tracker/st_cb_copyimage.c
index 75114cdb712..03a7294e7c9 100644
--- a/src/mesa/state_tracker/st_cb_copyimage.c
+++ b/src/mesa/state_tracker/st_cb_copyimage.c
@@ -552,6 +552,10 @@ st_CopyImageSubData(struct gl_context *ctx,
       src_res = src->pt;
       src_level = src_image->Level;
       src_z += src_image->Face;
+      if (src_image->TexObject->Immutable) {
+         src_level += src_image->TexObject->MinLevel;
+         src_z += src_image->TexObject->MinLayer;
+      }
    } else {
       struct st_renderbuffer *src = st_renderbuffer(src_renderbuffer);
       src_res = src->texture;
@@ -563,6 +567,10 @@ st_CopyImageSubData(struct gl_context *ctx,
       dst_res = dst->pt;
       dst_level = dst_image->Level;
       dst_z += dst_image->Face;
+      if (dst_image->TexObject->Immutable) {
+         dst_level += dst_image->TexObject->MinLevel;
+         dst_z += dst_image->TexObject->MinLayer;
+      }
    } else {
       struct st_renderbuffer *dst = st_renderbuffer(dst_renderbuffer);
       dst_res = dst->texture;
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index d4c916e8057..62f149aa0fb 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -1873,6 +1873,34 @@ st_TextureView(struct gl_context *ctx,
    return GL_TRUE;
 }
 
+static void
+st_ClearTexSubImage(struct gl_context *ctx,
+                    struct gl_texture_image *texImage,
+                    GLint xoffset, GLint yoffset, GLint zoffset,
+                    GLsizei width, GLsizei height, GLsizei depth,
+                    const GLvoid *clearValue)
+{
+   static const char zeros[16] = {0};
+   struct st_texture_image *stImage = st_texture_image(texImage);
+   struct pipe_resource *pt = stImage->pt;
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+   unsigned level = texImage->Level;
+   struct pipe_box box;
+
+   if (!pt)
+      return;
+
+   u_box_3d(xoffset, yoffset, zoffset + texImage->Face,
+            width, height, depth, &box);
+   if (texImage->TexObject->Immutable) {
+      level += texImage->TexObject->MinLevel;
+      box.z += texImage->TexObject->MinLayer;
+   }
+
+   pipe->clear_texture(pipe, pt, level, &box, clearValue ? clearValue : zeros);
+}
+
 void
 st_init_texture_functions(struct dd_function_table *functions)
 {
@@ -1904,4 +1932,5 @@ st_init_texture_functions(struct dd_function_table *functions)
 
    functions->AllocTextureStorage = st_AllocTextureStorage;
    functions->TextureView = st_TextureView;
+   functions->ClearTexSubImage = st_ClearTexSubImage;
 }
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index bd7cbccc20c..99e96e1f3ae 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -254,6 +254,7 @@ void st_init_limits(struct pipe_screen *screen,
                                       PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT);
 
       options->LowerClipDistance = true;
+      options->LowerBufferInterfaceBlocks = true;
    }
 
    c->LowerTessLevel = true;
@@ -438,6 +439,7 @@ void st_init_extensions(struct pipe_screen *screen,
    static const struct st_extension_cap_mapping cap_mapping[] = {
       { o(ARB_base_instance),                PIPE_CAP_START_INSTANCE                   },
       { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT   },
+      { o(ARB_clear_texture),                PIPE_CAP_CLEAR_TEXTURE                    },
       { o(ARB_color_buffer_float),           PIPE_CAP_VERTEX_COLOR_UNCLAMPED           },
       { o(ARB_copy_image),                   PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS },
       { o(ARB_depth_clamp),                  PIPE_CAP_DEPTH_CLIP_DISABLE               },
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index f481e8902d8..3ad1afdecda 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -4408,6 +4408,7 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
    TGSI_SEMANTIC_SAMPLEID,
    TGSI_SEMANTIC_SAMPLEPOS,
    TGSI_SEMANTIC_SAMPLEMASK,
+   TGSI_SEMANTIC_HELPER_INVOCATION,
 
    /* Tessellation shaders
     */
@@ -5138,6 +5139,8 @@ st_translate_program(
           TGSI_SEMANTIC_BASEVERTEX);
    assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_TESS_COORD] ==
           TGSI_SEMANTIC_TESSCOORD);
+   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_HELPER_INVOCATION] ==
+          TGSI_SEMANTIC_HELPER_INVOCATION);
 
    t = CALLOC_STRUCT(st_translate);
    if (!t) {
@@ -5822,7 +5825,6 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                          (!ctx->Const.NativeIntegers ? INT_DIV_TO_MUL_RCP : 0) |
                          (options->EmitNoSat ? SAT_TO_CLAMP : 0));
 
-      lower_ubo_reference(prog->_LinkedShaders[i], ir);
       do_vec_index_to_cond_assign(ir);
       lower_vector_insert(ir, true);
       lower_quadop_vector(ir, false);
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 7abd128e719..d0d261f4fde 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -623,6 +623,58 @@ st_context_destroy(struct st_context_iface *stctxi)
    st_destroy_context(st);
 }
 
+static void
+st_debug_message(void *data,
+                 unsigned *id,
+                 enum pipe_debug_type ptype,
+                 const char *fmt,
+                 va_list args)
+{
+   struct st_context *st = data;
+   enum mesa_debug_source source;
+   enum mesa_debug_type type;
+   enum mesa_debug_severity severity;
+
+   switch (ptype) {
+   case PIPE_DEBUG_TYPE_OUT_OF_MEMORY:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_ERROR;
+      severity = MESA_DEBUG_SEVERITY_MEDIUM;
+      break;
+   case PIPE_DEBUG_TYPE_ERROR:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_ERROR;
+      severity = MESA_DEBUG_SEVERITY_MEDIUM;
+      break;
+   case PIPE_DEBUG_TYPE_SHADER_INFO:
+      source = MESA_DEBUG_SOURCE_SHADER_COMPILER;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_PERF_INFO:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_PERFORMANCE;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_INFO:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_FALLBACK:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_PERFORMANCE;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   case PIPE_DEBUG_TYPE_CONFORMANCE:
+      source = MESA_DEBUG_SOURCE_API;
+      type = MESA_DEBUG_TYPE_OTHER;
+      severity = MESA_DEBUG_SEVERITY_NOTIFICATION;
+      break;
+   }
+   _mesa_gl_vdebug(st->ctx, id, source, type, severity, fmt, args);
+}
+
 static struct st_context_iface *
 st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
                       const struct st_context_attribs *attribs,
@@ -677,6 +729,11 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
          return NULL;
       }
       st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_DEBUG_BIT;
+
+      if (pipe->set_debug_callback) {
+         struct pipe_debug_callback cb = { st_debug_message, st };
+         pipe->set_debug_callback(pipe, &cb);
+      }
    }
 
    if (attribs->flags & ST_CONTEXT_FLAG_FORWARD_COMPATIBLE)
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index a614b26cae4..7534599c313 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -114,6 +114,7 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
       if (_mesa_inside_begin_end(exec->ctx)) {
 	 exec->vtx.prim[0].mode = exec->ctx->Driver.CurrentExecPrimitive;
 	 exec->vtx.prim[0].begin = 0;
+         exec->vtx.prim[0].end = 0;
 	 exec->vtx.prim[0].start = 0;
 	 exec->vtx.prim[0].count = 0;
 	 exec->vtx.prim_count++;
@@ -846,17 +847,23 @@ static void GLAPIENTRY vbo_exec_End( void )
          /* We're finishing drawing a line loop.  Append 0th vertex onto
           * end of vertex buffer so we can draw it as a line strip.
           */
-         const fi_type *src = exec->vtx.buffer_map;
+         const fi_type *src = exec->vtx.buffer_map +
+            last_prim->start * exec->vtx.vertex_size;
          fi_type *dst = exec->vtx.buffer_map +
             exec->vtx.vert_count * exec->vtx.vertex_size;
 
          /* copy 0th vertex to end of buffer */
          memcpy(dst, src, exec->vtx.vertex_size * sizeof(fi_type));
 
-         assert(last_prim->start == 0);
          last_prim->start++;  /* skip vertex0 */
          /* note that last_prim->count stays unchanged */
          last_prim->mode = GL_LINE_STRIP;
+
+         /* Increment the vertex count so the next primitive doesn't
+          * overwrite the last vertex which we just added.
+          */
+         exec->vtx.vert_count++;
+         exec->vtx.buffer_ptr += exec->vtx.vertex_size;
       }
 
       try_vbo_merge(exec);
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index ed5d9e947b0..0d42618f246 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -117,6 +117,7 @@ vbo_copy_vertices( struct vbo_exec_context *exec )
           * subtract one from last_prim->start) so that we copy the 0th vertex
           * to the next vertex buffer.
           */
+         assert(last_prim->start > 0);
          src -= sz;
       }
       /* fall-through */
diff --git a/src/util/list.h b/src/util/list.h
index d4b485174fc..f0dec5da608 100644
--- a/src/util/list.h
+++ b/src/util/list.h
@@ -99,6 +99,14 @@ static inline bool list_empty(struct list_head *list)
    return list->next == list;
 }
 
+/**
+ * Returns whether the list has exactly one element.
+ */
+static inline bool list_is_singular(const struct list_head *list)
+{
+   return list->next != NULL && list->next->next == list;
+}
+
 static inline unsigned list_length(struct list_head *list)
 {
    struct list_head *node;
diff --git a/src/util/ralloc.c b/src/util/ralloc.c
index e07fce74f23..bb4cf9612eb 100644
--- a/src/util/ralloc.c
+++ b/src/util/ralloc.c
@@ -499,6 +499,7 @@ ralloc_vasprintf_rewrite_tail(char **str, size_t *start, const char *fmt,
    if (unlikely(*str == NULL)) {
       // Assuming a NULL context is probably bad, but it's expected behavior.
       *str = ralloc_vasprintf(NULL, fmt, args);
+      *start = strlen(*str);
       return true;
    }
author	Jason Ekstrand <[email protected]>	2015-11-14 07:56:10 -0800
committer	Jason Ekstrand <[email protected]>	2015-11-14 07:56:10 -0800
commit	1469ccb7464836c752fa2664c36d8fae7e80606c (patch)
tree	6f15e2eeb7e16e4085a0c58d50a36a4c12b231a5
parent	e8f51fe4deb5082fece5f8cb167b89b0f03eb244 (diff)
parent	f94e1d97381ec787c2abbbcd5265252596217e33 (diff)