223 files changed, 5971 insertions, 1461 deletions
diff --git a/configure.ac b/configure.ac
index 6cadd040194..577ca222f41 100644
--- a/configure.ac
+++ b/configure.ac
@@ -72,8 +72,8 @@ LIBDRM_REQUIRED=2.4.60
 LIBDRM_RADEON_REQUIRED=2.4.56
 LIBDRM_AMDGPU_REQUIRED=2.4.63
 LIBDRM_INTEL_REQUIRED=2.4.61
-LIBDRM_NVVIEUX_REQUIRED=2.4.33
-LIBDRM_NOUVEAU_REQUIRED=2.4.62
+LIBDRM_NVVIEUX_REQUIRED=2.4.66
+LIBDRM_NOUVEAU_REQUIRED=2.4.66
 LIBDRM_FREEDRENO_REQUIRED=2.4.65
 DRI2PROTO_REQUIRED=2.6
 DRI3PROTO_REQUIRED=1.0
@@ -98,8 +98,7 @@ AC_PROG_CXX
 AM_PROG_CC_C_O
 AM_PROG_AS
 AX_CHECK_GNU_MAKE
-AC_CHECK_PROGS([PYTHON2], [python2 python])
-AC_CHECK_PROGS([PYTHON3], [python3])
+AC_CHECK_PROGS([PYTHON2], [python2.7 python2 python])
 AC_PROG_SED
 AC_PROG_MKDIR_P
 
@@ -384,10 +383,11 @@ save_CFLAGS="$CFLAGS"
 CFLAGS="$SSE41_CFLAGS $CFLAGS"
 AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
 #include <smmintrin.h>
+int param;
 int main () {
-    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+    __m128i a = _mm_set1_epi32 (param), b = _mm_set1_epi32 (param + 1), c;
     c = _mm_max_epu32(a, b);
-    return 0;
+    return _mm_cvtsi128_si32(c);
 }]])], SSE41_SUPPORTED=1)
 CFLAGS="$save_CFLAGS"
 if test "x$SSE41_SUPPORTED" = x1; then
@@ -1715,7 +1715,15 @@ AC_ARG_WITH([clang-libdir],
    [CLANG_LIBDIR=''])
 
 PKG_CHECK_EXISTS([libclc], [have_libclc=yes], [have_libclc=no])
-AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;ELF_LIB=-lelf])
+PKG_CHECK_MODULES([LIBELF], [libelf], [have_libelf=yes], [have_libelf=no])
+
+if test "x$have_libelf" = xno; then
+   LIBELF_LIBS=''
+   LIBELF_CFLAGS=''
+   AC_CHECK_LIB([elf], [elf_memory], [have_libelf=yes;LIBELF_LIBS=-lelf], [have_libelf=no])
+   AC_SUBST([LIBELF_LIBS])
+   AC_SUBST([LIBELF_CFLAGS])
+fi
 
 if test "x$enable_opencl" = xyes; then
     if test -z "$with_gallium_drivers"; then
@@ -2302,8 +2310,6 @@ if test "x$USE_VC4_SIMULATOR" = xyes -a "x$HAVE_GALLIUM_ILO" = xyes; then
     AC_MSG_ERROR([VC4 simulator on x86 replaces i965 driver build, so ilo must be disabled.])
 fi
 
-AC_SUBST([ELF_LIB])
-
 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_X11_DRIVER, test "x$enable_xlib_glx" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
@@ -2584,7 +2590,6 @@ if test "x$MESA_LLVM" = x1; then
     echo ""
 fi
 echo "        PYTHON2:         $PYTHON2"
-echo "        PYTHON3:         $PYTHON3"
 
 echo ""
 echo "        Run '${MAKE-make}' to build Mesa"
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 84b5a172d58..58aace9a13c 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -112,7 +112,7 @@ GL 4.0, GLSL 4.00 --- all DONE: nvc0, r600, radeonsi
   GL_ARB_gpu_shader_fp64                               DONE (llvmpipe, softpipe)
   GL_ARB_sample_shading                                DONE (i965, nv50)
   GL_ARB_shader_subroutine                             DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_tessellation_shader                           DONE ()
+  GL_ARB_tessellation_shader                           DONE (i965/gen8+)
   GL_ARB_texture_buffer_object_rgb32                   DONE (i965, llvmpipe, softpipe)
   GL_ARB_texture_cube_map_array                        DONE (i965, nv50, llvmpipe, softpipe)
   GL_ARB_texture_gather                                DONE (i965, nv50, llvmpipe, softpipe)
@@ -184,7 +184,7 @@ GL 4.4, GLSL 4.40:
   - forced alignment within blocks                     in progress
   - specified vec4-slot component numbers              in progress
   - specified transform/feedback layout                in progress
-  - input/output block locations                       in progress
+  - input/output block locations                       DONE
   GL_ARB_multi_bind                                    DONE (all drivers)
   GL_ARB_query_buffer_object                           not started
   GL_ARB_texture_mirror_clamp_to_edge                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
@@ -233,7 +233,7 @@ GLES3.1, GLSL ES 3.1
       glMemoryBarrierByRegion                          DONE
       glGetTexLevelParameter[fi]v - needs updates      DONE
       glGetBooleani_v - restrict to GLES enums
-      gl_HelperInvocation support
+      gl_HelperInvocation support                      DONE (i965, nvc0, r600)
 
 GLES3.2, GLSL ES 3.2
   GL_EXT_color_buffer_float                            DONE (all drivers)
diff --git a/docs/contents.html b/docs/contents.html
index 6612cbefa84..5e0f514c1e1 100644
--- a/docs/contents.html
+++ b/docs/contents.html
@@ -96,8 +96,7 @@
 <br>
 <blockquote>
 <a href="http://sourceforge.net"
-target="_parent"><img src="http://sourceforge.net/sflogo.php?group_id=3&amp;type=1"
-width="88" height="31" align="bottom" alt="Sourceforge.net" border="0"></a>
+target="_parent">sourceforge.net</a>
 </blockquote>
 
 </body>
diff --git a/docs/index.html b/docs/index.html
index 6b1221d1bbd..bdb2845bc53 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,19 @@
 
 <h1>News</h1>
 
+<h2>December 21, 2015</h2>
+<p>
+<a href="relnotes/11.0.8.html">Mesa 11.0.8</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>December 15, 2015</h2>
+<p>
+<a href="relnotes/11.1.0.html">Mesa 11.1.0</a> is released.  This is a new
+development release.  See the release notes for more information about
+the release.
+</p>
+
 <h2>December 9, 2015</h2>
 <p>
 <a href="relnotes/11.0.7.html">Mesa 11.0.7</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index a8aaa5fd2e8..29438c98c15 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,8 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/11.0.8.html">11.0.8 release notes</a>
+<li><a href="relnotes/11.1.0.html">11.1.0 release notes</a>
 <li><a href="relnotes/11.0.7.html">11.0.7 release notes</a>
 <li><a href="relnotes/11.0.6.html">11.0.6 release notes</a>
 <li><a href="relnotes/11.0.5.html">11.0.5 release notes</a>
diff --git a/docs/relnotes/11.0.8.html b/docs/relnotes/11.0.8.html
new file mode 100644
index 00000000000..ce6e1d6a0b8
--- /dev/null
+++ b/docs/relnotes/11.0.8.html
@@ -0,0 +1,200 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.8 Release Notes / December 9, 2015</h1>
+
+<p>
+Mesa 11.0.8 is a bug fix release which fixes bugs found since the 11.0.7 release.
+</p>
+<p>
+Mesa 11.0.8 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+ab9db87b54d7525e4b611b82577ea9a9eae55927558df57b190059d5ecd9406f  mesa-11.0.8.tar.gz
+5696e4730518b6805d2ed5def393c4293f425a2c2c01bd5ed4bdd7ad62f7ad75  mesa-11.0.8.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91806">Bug 91806</a> - configure does not test whether assembler supports sse4.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92849">Bug 92849</a> - [IVB HSW BDW] piglit image load/store load-from-cleared-image.shader_test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92909">Bug 92909</a> - Offset/alignment issue with layout std140 and vec3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93004">Bug 93004</a> - Guild Wars 2 crash on nouveau DX11 cards</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93215">Bug 93215</a> - [Regression bisected] Ogles1conform Automatic mipmap generation test is fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93266">Bug 93266</a> - gl_arb_shading_language_420pack does not allow binding of image variables</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Boyuan Zhang (1):</p>
+<ul>
+  <li>radeon/uvd: uv pitch separation for stoney</li>
+</ul>
+
+<p>Dave Airlie (9):</p>
+<ul>
+  <li>r600: do SQ flush ES ring rolling workaround</li>
+  <li>r600: SMX returns CONTEXT_DONE early workaround</li>
+  <li>r600/shader: split address get out to a function.</li>
+  <li>r600/shader: add utility functions to do single slot arithmatic</li>
+  <li>r600g: fix geom shader input indirect indexing.</li>
+  <li>r600: handle geometry dynamic input array index</li>
+  <li>radeonsi: handle doubles in lds load path.</li>
+  <li>mesa/varray: set double arrays to non-normalised.</li>
+  <li>mesa/shader: return correct attribute location for double matrix arrays</li>
+</ul>
+
+<p>Emil Velikov (8):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.7</li>
+  <li>cherry-ignore: don't pick a specific i965 formats patch</li>
+  <li>Revert "i965/nir: Remove unused indirect handling"</li>
+  <li>Revert "i965/state: Get rid of dword_pitch arguments to buffer functions"</li>
+  <li>Revert "i965/vec4: Use a stride of 1 and byte offsets for UBOs"</li>
+  <li>Revert "i965/fs: Use a stride of 1 and byte offsets for UBOs"</li>
+  <li>Revert "i965/vec4: Use byte offsets for UBO pulls on Sandy Bridge"</li>
+  <li>Update version to 11.0.8</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965: Resolve color and flush for all active shader images in intel_update_state().</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>meta/generate_mipmap: Work-around GLES 1.x problem with GL_DRAW_FRAMEBUFFER</li>
+</ul>
+
+<p>Ilia Mirkin (17):</p>
+<ul>
+  <li>freedreno/a4xx: support lod_bias</li>
+  <li>freedreno/a4xx: fix 5_5_5_1 texture sampler format</li>
+  <li>freedreno/a4xx: point regid to "red" even for alpha-only rb formats</li>
+  <li>nvc0/ir: fold postfactor into immediate</li>
+  <li>nv50/ir: deal with loops with no breaks</li>
+  <li>nv50/ir: the mad source might not have a defining instruction</li>
+  <li>nv50/ir: fix instruction permutation logic</li>
+  <li>nv50/ir: don't forget to mark flagsDef on cvt in txb lowering</li>
+  <li>nv50/ir: fix DCE to not generate 96-bit loads</li>
+  <li>nv50/ir: avoid looking at uninitialized srcMods entries</li>
+  <li>gk110/ir: fix imul hi emission with limm arg</li>
+  <li>gk104/ir: sampler doesn't matter for txf</li>
+  <li>gk110/ir: fix imad sat/hi flag emission for immediate args</li>
+  <li>nv50/ir: fix cutoff for using r63 vs r127 when replacing zero</li>
+  <li>nv50/ir: can't have predication and immediates</li>
+  <li>glsl: assign varying locations to tess shaders when doing SSO</li>
+  <li>ttn: add TEX2 support</li>
+</ul>
+
+<p>Jason Ekstrand (5):</p>
+<ul>
+  <li>i965/vec4: Use byte offsets for UBO pulls on Sandy Bridge</li>
+  <li>i965/fs: Use a stride of 1 and byte offsets for UBOs</li>
+  <li>i965/vec4: Use a stride of 1 and byte offsets for UBOs</li>
+  <li>i965/state: Get rid of dword_pitch arguments to buffer functions</li>
+  <li>i965/nir: Remove unused indirect handling</li>
+</ul>
+
+<p>Jonathan Gray (2):</p>
+<ul>
+  <li>configure.ac: use pkg-config for libelf</li>
+  <li>configure: check for python2.7 for PYTHON2</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>i965: Fix fragment shader struct inputs.</li>
+  <li>i965: Fix scalar vertex shader struct outputs.</li>
+</ul>
+
+<p>Marek Olšák (8):</p>
+<ul>
+  <li>radeonsi: fix occlusion queries on Fiji</li>
+  <li>radeonsi: fix a hang due to uninitialized border color registers</li>
+  <li>radeonsi: fix Fiji for LLVM &lt;= 3.7</li>
+  <li>radeonsi: don't call of u_prims_for_vertices for patches and rectangles</li>
+  <li>radeonsi: apply the streamout workaround to Fiji as well</li>
+  <li>gallium/radeon: fix Hyper-Z hangs by programming PA_SC_MODE_CNTL_1 correctly</li>
+  <li>tgsi/scan: add flag colors_written</li>
+  <li>r600g: write all MRTs only if there is exactly one output (fixes a hang)</li>
+</ul>
+
+<p>Matt Turner (1):</p>
+<ul>
+  <li>glsl: Allow binding of image variables with 420pack.</li>
+</ul>
+
+<p>Neil Roberts (2):</p>
+<ul>
+  <li>i965: Add MESA_FORMAT_B8G8R8X8_SRGB to brw_format_for_mesa_format</li>
+  <li>i965: Add B8G8R8X8_SRGB to the alpha format override</li>
+</ul>
+
+<p>Oded Gabbay (1):</p>
+<ul>
+  <li>configura.ac: fix test for SSE4.1 assembler support</li>
+</ul>
+
+<p>Patrick Rudolph (2):</p>
+<ul>
+  <li>nv50,nvc0: fix use-after-free when vertex buffers are unbound</li>
+  <li>gallium/util: return correct number of bound vertex buffers</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>nvc0: free memory allocated by the prog which reads MP perf counters</li>
+</ul>
+
+<p>Tapani Pälli (1):</p>
+<ul>
+  <li>i965: use _Shader to get fragment program when updating surface state</li>
+</ul>
+
+<p>Tom Stellard (2):</p>
+<ul>
+  <li>radeonsi: Rename si_shader::ls_rsrc{1,2} to si_shader::rsrc{1,2}</li>
+  <li>radeonsi/compute: Use the compiler's COMPUTE_PGM_RSRC* register values</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html
index 542f368d36c..9dd21dd3173 100644
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -14,7 +14,7 @@
 <iframe src="../contents.html"></iframe>
 <div class="content">
 
-<h1>Mesa 11.1.0 Release Notes / TBD</h1>
+<h1>Mesa 11.1.0 Release Notes / 15 December 2015</h1>
 
 <p>
 Mesa 11.1.0 is a new development release.
@@ -33,7 +33,8 @@ because compatibility contexts are not supported.
 
 <h2>SHA256 checksums</h2>
 <pre>
-TBD.
+e3bc44be4df5e4dc728dfda7b55b1aaeadfce36eca6a367b76cc07598070cb2d  mesa-11.1.0.tar.gz
+9befe03b04223eb1ede177fa8cac001e2850292c8c12a3ec9929106afad9cf1f  mesa-11.1.0.tar.xz
 </pre>
 
 
@@ -84,11 +85,196 @@ Note: some of the new features are only available with certain drivers.
 
 <h2>Bug fixes</h2>
 
-TBD.
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=28130">Bug 28130</a> - vbo: premature flushing breaks GL_LINE_LOOP</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=38109">Bug 38109</a> - i915 driver crashes if too few vertices are submitted (Mesa 7.10.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=49779">Bug 49779</a> - Extra line segments in GL_LINE_LOOP</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=55552">Bug 55552</a> - Compile errors with --enable-mangling</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=71789">Bug 71789</a> - [r300g] Visuals not found in (default) depth = 24</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=79783">Bug 79783</a> - Distorted output in obs-studio where other vendors &quot;work&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=80821">Bug 80821</a> - When LIBGL_ALWAYS_SOFTWARE is set, KHR_create_context is not supported</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=81174">Bug 81174</a> - Gallium: GL_LINE_LOOP broken with more than 512 points</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=83508">Bug 83508</a> - [UBO] Assertion for array of blocks</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84677">Bug 84677</a> - Triangle disappears with glPolygonMode GL_LINE</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86281">Bug 86281</a> - brw_meta_fast_clear (brw=brw&#64;entry=0x7fffd4097a08, fb=fb&#64;entry=0x7fffd40fa900, buffers=buffers&#64;entry=2, partial_clear=partial_clear&#64;entry=false)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86469">Bug 86469</a> - Unreal Engine demo doesn't run</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86720">Bug 86720</a> - [radeon] Europa Universalis 4 freezing during game start (10.3.3+, still broken on 11.0.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89014">Bug 89014</a> - PIPE_QUERY_GPU_FINISHED is not acting as expected on SI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90175">Bug 90175</a> - [hsw bisected][PATCH] atomic counters doesn't work for a binding point different to zero</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90348">Bug 90348</a> - Spilling failure of b96 merged value</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90631">Bug 90631</a> - Compilation failure for fragment shader with many branches on Sandy Bridge</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90734">Bug 90734</a> - glBufferSubData is corrupting data when buffer is &gt; 32k</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90887">Bug 90887</a> - PhiMovesPass in register allocator broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91044">Bug 91044</a> - piglit spec/egl_khr_create_context/valid debug flag gles* fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91114">Bug 91114</a> - ES3-CTS.gtf.GL3Tests.shadow.shadow_execution_vert fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91254">Bug 91254</a> - (regresion) video using VA-API on Intel slow and freeze system with mesa 10.6 or 10.6.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91292">Bug 91292</a> - [BDW+] glVertexAttribDivisor not working in combination with glPolygonMode</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91342">Bug 91342</a> - Very dark textures on some objects in indoors environments in Postal 2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91526">Bug 91526</a> - World of Warcraft (on Wine) has UI corruption with nouveau</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91551">Bug 91551</a> - DXTn compressed normal maps produce severe artifacts on all NV5x and NVDx chipsets</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91596">Bug 91596</a> - EGL_KHR_gl_colorspace (v2) causes problem with Android-x86 GUI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91716">Bug 91716</a> - [bisected] piglit.shaders.glsl-vs-int-attrib regresses on 32 bit BYT, HSW, IVB, SNB</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91718">Bug 91718</a> - piglit.spec.arb_shader_image_load_store.invalid causes intermittent GPU HANG</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91719">Bug 91719</a> - [SNB,HSW,BYT] dEQP regressions associated with using NIR for vertex shaders</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91726">Bug 91726</a> - R600 asserts in tgsi_cmp/make_src_for_op3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91780">Bug 91780</a> - Rendering issues with geometry shader</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91785">Bug 91785</a> - make check DispatchSanity_test.GLES31 regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91788">Bug 91788</a> - [HSW Regression] Synmark2_v6 Multithread performance case FPS reduced by 36%</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91847">Bug 91847</a> - glGenerateTextureMipmap not working (no errors) unless glActiveTexture(GL_TEXTURE1) is called before</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91857">Bug 91857</a> - Mesa 10.6.3 linker is slow</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91881">Bug 91881</a> - regression: GPU lockups since mesa-11.0.0_rc1 on RV620 (r600) driver</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91890">Bug 91890</a> - [nve7] witcher2: blurry image &amp; DATA_ERRORs (class 0xa097 mthd 0x2380/0x238c)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91898">Bug 91898</a> - src/util/mesa-sha1.c:250:25: fatal error: openssl/sha.h: No such file or directory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91927">Bug 91927</a> - [SKL] [regression] piglit compressed textures tests fail  with kernel upgrade</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91930">Bug 91930</a> - Program with GtkGLArea widget does not redraw</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91970">Bug 91970</a> - [BSW regression] dEQP-GLES3.functional.shaders.precision.int.highp_mul_vertex</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91985">Bug 91985</a> - [regression, bisected] FTBFS with commit f9caabe8f1: R600_UCP_CONST_BUFFER is undefined</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91993">Bug 91993</a> - Graphical glitch in Astromenace (open-source game).</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92009">Bug 92009</a> - ES3-CTS.gtf.GL3Tests.packed_pixels.packed_pixels fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92033">Bug 92033</a> - [SNB,regression,dEQP,bisected] functional.shaders.random tests regressed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92052">Bug 92052</a> - nir/nir_builder.h:79: error: expected primary-expression before ‘.’ token</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92054">Bug 92054</a> - make check gbm-symbols-check regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92066">Bug 92066</a> - [ILK,G45,regression] New assertion on BRW_MAX_MRF breaks ilk and g45</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92072">Bug 92072</a> - Wine breakage since d082c5324 (st/mesa: don't call st_validate_state in BlitFramebuffer)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92095">Bug 92095</a> - [Regression, bisected] arb_shader_atomic_counters.compiler.builtins.frag</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92122">Bug 92122</a> - [bisected, cts] Regression with Assault Android Cactus</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92124">Bug 92124</a> - shader_query.cpp:841:34: error: ‘strndup’ was not declared in this scope</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92183">Bug 92183</a> - linker.cpp:3187:46: error: ‘strtok_r’ was not declared in this scope</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92193">Bug 92193</a> - [SKL] ES2-CTS.gtf.GL2ExtensionTests.compressed_astc_texture.compressed_astc_texture fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92214">Bug 92214</a> - Flightgear crashes during splashboot with R600 driver, LLVM 3.7.0 and mesa 11.0.2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92221">Bug 92221</a> - Unintended code changes in _mesa_base_tex_format commit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92265">Bug 92265</a> - Black windows in weston after update mesa to 11.0.2-1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92304">Bug 92304</a> - [cts] cts.shaders.negative conformance tests fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92363">Bug 92363</a> - [BSW/BDW] ogles1conform Gets test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92437">Bug 92437</a> - osmesa: Expose GL entry points for Windows build, via .def file</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92438">Bug 92438</a> - Segfault in pushbuf_kref when running the android emulator (qemu) on nv50</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92476">Bug 92476</a> - [cts] ES2-CTS.gtf.GL2ExtensionTests.egl_image.egl_image fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92588">Bug 92588</a> - [HSW,BDW,BSW,SKL-Y][GLES 3.1 CTS] ES31-CTS.arrays_of_arrays.InteractionFunctionCalls2 - assert</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92621">Bug 92621</a> - [G965 ILK G45] Regression: 24 piglit regressions in glsl-1.10</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92623">Bug 92623</a> - Differences in prog_data ignored when caching fragment programs (causes hangs)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92634">Bug 92634</a> - gallium's vl_mpeg12_decoder does not work with st/va</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92639">Bug 92639</a> - [Regression bisected] Ogles1conform mustpass.c fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92641">Bug 92641</a> - [SKL BSW] [Regression] Ogles1conform userclip.c fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92645">Bug 92645</a> - kodi vdpau interop fails since  mesa,meta: move gl_texture_object::TargetIndex initializations</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92705">Bug 92705</a> - [clover] fail to build with llvm-svn/clang-svn 3.8</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92709">Bug 92709</a> - &quot;LLVM triggered Diagnostic Handler: unsupported call to function ldexpf in main&quot; when starting race in stuntrally</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92738">Bug 92738</a> - Randon R7 240 doesn't work on 16KiB page size platform</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92744">Bug 92744</a> - [g965 Regression bisected] Performance regression and piglit assertions due to liveness analysis</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92770">Bug 92770</a> - [SNB, regression, dEQP] deqp-gles3.functional.shaders.discard.dynamic_loop_texture</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92824">Bug 92824</a> - [regression, bisected] `make check` dispatch-sanity broken by GL_EXT_buffer_storage</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92849">Bug 92849</a> - [IVB HSW BDW] piglit image load/store load-from-cleared-image.shader_test fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92859">Bug 92859</a> - [regression, bisected] validate_intrinsic_instr: Assertion triggered</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92860">Bug 92860</a> - [radeonsi][bisected] st/mesa: implement ARB_copy_image - Corruption in ARK Survival Evolved</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92900">Bug 92900</a> - [regression bisected] About 700 piglit regressions is what could go wrong</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92909">Bug 92909</a> - Offset/alignment issue with layout std140 and vec3</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92985">Bug 92985</a> - Mac OS X build error &quot;ar: no archive members specified&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93015">Bug 93015</a> - Tonga Elemental segfault + VM faults since  radeon: implement r600_query_hw_get_result via function pointers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93048">Bug 93048</a> - [CTS regression] mesa af2723 breaks GL Conformance for debug extension</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93063">Bug 93063</a> - drm_helper.h:227:1: error: static declaration of ‘pipe_virgl_create_screen’ follows non-static declaration</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93091">Bug 93091</a> - [opencl] segfault when running any opencl programs (like clinfo)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93126">Bug 93126</a> - wrongly claim supporting GL_EXT_texture_rg</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93180">Bug 93180</a> - [regression] arb_separate_shader_objects.active sampler conflict fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93235">Bug 93235</a> - [regression] dispatch sanity broken by GetPointerv</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93266">Bug 93266</a> - gl_arb_shading_language_420pack does not allow binding of image variables</li>
+
+</ul>
+
 
 <h2>Changes</h2>
 
-TBD.
+<li>MPEG4 decoding has been disabled by default in the VAAPI driver</li>
 
 </div>
 </body>
diff --git a/docs/relnotes/11.2.0.html b/docs/relnotes/11.2.0.html
index 12e0f07a860..6f1752cec3f 100644
--- a/docs/relnotes/11.2.0.html
+++ b/docs/relnotes/11.2.0.html
@@ -47,7 +47,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_base_instance on freedreno/a4xx</li>
 <li>GL_ARB_compute_shader on i965</li>
 <li>GL_ARB_copy_image on r600</li>
-<li>GL_ARB_tessellation_shader on r600 (evergreen/cayman only)</li>
+<li>GL_ARB_tessellation_shader on i965/gen8+ and r600 (evergreen/cayman only)</li>
 <li>GL_ARB_texture_buffer_object_rgb32 on freedreno/a4xx</li>
 <li>GL_ARB_texture_buffer_range on freedreno/a4xx</li>
 <li>GL_ARB_texture_query_lod on freedreno/a4xx</li>
@@ -56,6 +56,8 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_vertex_type_10f_11f_11f_rev on freedreno/a4xx</li>
 <li>GL_KHR_texture_compression_astc_ldr on freedreno/a4xx</li>
 <li>GL_AMD_performance_monitor on radeonsi (CIK+ only)</li>
+<li>New OSMesaCreateContextAttribs() function (for creating core profile
+    contexts)</li>
 </ul>
 
 <h2>Bug fixes</h2>
diff --git a/docs/thanks.html b/docs/thanks.html
index 29a41e49c7c..685cb31e640 100644
--- a/docs/thanks.html
+++ b/docs/thanks.html
@@ -42,9 +42,7 @@ Tungsten Graphics, Inc. have supported the ongoing development of Mesa.
 <li>The
 <a href="http://www.mesa3d.org">Mesa</a>
 website is hosted by
-<a href="http://sourceforge.net">
-<img src="http://sourceforge.net/sflogo.php?group_id=3&amp;type=1"
-width="88" height="31" align="bottom" alt="Sourceforge.net" border="0"></a>
+<a href="http://sourceforge.net">sourceforge.net</a>.
 <br>
 <br>
 
diff --git a/include/GL/osmesa.h b/include/GL/osmesa.h
index ca0d1675a9d..39cd54ef9b7 100644
--- a/include/GL/osmesa.h
+++ b/include/GL/osmesa.h
@@ -58,8 +58,8 @@ extern "C" {
 #include <GL/gl.h>
 
 
-#define OSMESA_MAJOR_VERSION 10
-#define OSMESA_MINOR_VERSION 0
+#define OSMESA_MAJOR_VERSION 11
+#define OSMESA_MINOR_VERSION 2
 #define OSMESA_PATCH_VERSION 0
 
 
@@ -95,6 +95,18 @@ extern "C" {
 #define OSMESA_MAX_WIDTH	0x24  /* new in 4.0 */
 #define OSMESA_MAX_HEIGHT	0x25  /* new in 4.0 */
 
+/*
+ * Accepted in OSMesaCreateContextAttrib's attribute list.
+ */
+#define OSMESA_DEPTH_BITS            0x30
+#define OSMESA_STENCIL_BITS          0x31
+#define OSMESA_ACCUM_BITS            0x32
+#define OSMESA_PROFILE               0x33
+#define OSMESA_CORE_PROFILE          0x34
+#define OSMESA_COMPAT_PROFILE        0x35
+#define OSMESA_CONTEXT_MAJOR_VERSION 0x36
+#define OSMESA_CONTEXT_MINOR_VERSION 0x37
+
 
 typedef struct osmesa_context *OSMesaContext;
 
@@ -128,6 +140,35 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits,
 
 
 /*
+ * Create an Off-Screen Mesa rendering context with attribute list.
+ * The list is composed of (attribute, value) pairs and terminated with
+ * attribute==0.  Supported Attributes:
+ *
+ * Attributes                    Values
+ * --------------------------------------------------------------------------
+ * OSMESA_FORMAT                 OSMESA_RGBA*, OSMESA_BGRA, OSMESA_ARGB, etc.
+ * OSMESA_DEPTH_BITS             0*, 16, 24, 32
+ * OSMESA_STENCIL_BITS           0*, 8
+ * OSMESA_ACCUM_BITS             0*, 16
+ * OSMESA_PROFILE                OSMESA_COMPAT_PROFILE*, OSMESA_CORE_PROFILE
+ * OSMESA_CONTEXT_MAJOR_VERSION  1*, 2, 3
+ * OSMESA_CONTEXT_MINOR_VERSION  0+
+ *
+ * Note: * = default value
+ *
+ * We return a context version >= what's specified by OSMESA_CONTEXT_MAJOR/
+ * MINOR_VERSION for the given profile.  For example, if you request a GL 1.4
+ * compat profile, you might get a GL 3.0 compat profile.
+ * Otherwise, null is returned if the version/profile is not supported.
+ *
+ * New in Mesa 11.2
+ */
+GLAPI OSMesaContext GLAPIENTRY
+OSMesaCreateContextAttribs( const int *attribList, OSMesaContext sharelist );
+
+
+
+/*
  * Destroy an Off-Screen Mesa rendering context.
  *
  * Input:  ctx - the context to destroy
diff --git a/src/gallium/auxiliary/Makefile.am b/src/gallium/auxiliary/Makefile.am
index 7b026b51c33..bcdf297030f 100644
--- a/src/gallium/auxiliary/Makefile.am
+++ b/src/gallium/auxiliary/Makefile.am
@@ -1,11 +1,10 @@
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
-noinst_LTLIBRARIES = libgallium.la
+noinst_LTLIBRARIES = libgallium_nir.la
 
 AM_CFLAGS = \
 	-I$(top_srcdir)/src/loader \
-	-I$(top_builddir)/src/glsl/nir \
 	-I$(top_srcdir)/src/gallium/auxiliary/util \
 	$(GALLIUM_CFLAGS) \
 	$(VISIBILITY_CFLAGS) \
@@ -15,11 +14,24 @@ AM_CXXFLAGS = \
 	$(VISIBILITY_CXXFLAGS) \
 	$(MSVC2008_COMPAT_CXXFLAGS)
 
+libgallium_nir_la_SOURCES = \
+	$(NIR_SOURCES)
+
+libgallium_nir_la_CFLAGS = \
+	-I$(top_builddir)/src/glsl/nir \
+	$(GALLIUM_CFLAGS) \
+	$(VISIBILITY_CFLAGS) \
+	$(MSVC2013_COMPAT_CFLAGS)
+
+noinst_LTLIBRARIES += libgallium.la
+
 libgallium_la_SOURCES = \
 	$(C_SOURCES) \
-	$(NIR_SOURCES) \
 	$(GENERATED_SOURCES)
 
+libgallium_la_LIBADD = \
+	libgallium_nir.la
+
 if HAVE_MESA_LLVM
 
 AM_CFLAGS += \
diff --git a/src/gallium/auxiliary/draw/draw_cliptest_tmp.h b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
index 779b2374e20..34add829353 100644
--- a/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h
@@ -91,34 +91,34 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
          }
 
          for (i = 0; i < 4; i++) {
-            out->clip[i] = clipvertex[i];
-            out->pre_clip_pos[i] = position[i];
+            out->clip_pos[i] = position[i];
          }
 
+         /* Be careful with NaNs. Comparisons must be true for them. */
          /* Do the hardwired planes first:
           */
          if (flags & DO_CLIP_XY_GUARD_BAND) {
-            if (-0.50 * position[0] + position[3] < 0) mask |= (1<<0);
-            if ( 0.50 * position[0] + position[3] < 0) mask |= (1<<1);
-            if (-0.50 * position[1] + position[3] < 0) mask |= (1<<2);
-            if ( 0.50 * position[1] + position[3] < 0) mask |= (1<<3);
+            if (!(-0.50 * position[0] + position[3] >= 0)) mask |= (1<<0);
+            if (!( 0.50 * position[0] + position[3] >= 0)) mask |= (1<<1);
+            if (!(-0.50 * position[1] + position[3] >= 0)) mask |= (1<<2);
+            if (!( 0.50 * position[1] + position[3] >= 0)) mask |= (1<<3);
          }
          else if (flags & DO_CLIP_XY) {
-            if (-position[0] + position[3] < 0) mask |= (1<<0);
-            if ( position[0] + position[3] < 0) mask |= (1<<1);
-            if (-position[1] + position[3] < 0) mask |= (1<<2);
-            if ( position[1] + position[3] < 0) mask |= (1<<3);
+            if (!(-position[0] + position[3] >= 0)) mask |= (1<<0);
+            if (!( position[0] + position[3] >= 0)) mask |= (1<<1);
+            if (!(-position[1] + position[3] >= 0)) mask |= (1<<2);
+            if (!( position[1] + position[3] >= 0)) mask |= (1<<3);
          }
 
          /* Clip Z planes according to full cube, half cube or none.
           */
          if (flags & DO_CLIP_FULL_Z) {
-            if ( position[2] + position[3] < 0) mask |= (1<<4);
-            if (-position[2] + position[3] < 0) mask |= (1<<5);
+            if (!( position[2] + position[3] >= 0)) mask |= (1<<4);
+            if (!(-position[2] + position[3] >= 0)) mask |= (1<<5);
          }
          else if (flags & DO_CLIP_HALF_Z) {
-            if ( position[2]               < 0) mask |= (1<<4);
-            if (-position[2] + position[3] < 0) mask |= (1<<5);
+            if (!( position[2]               >= 0)) mask |= (1<<4);
+            if (!(-position[2] + position[3] >= 0)) mask |= (1<<5);
          }
 
          if (flags & DO_CLIP_USER) {
@@ -137,7 +137,6 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
                if (have_cd && num_written_clipdistance) {
                   float clipdist;
                   i = plane_idx - 6;
-                  out->have_clipdist = 1;
                   /* first four clip distance in first vector etc. */
                   if (i < 4)
                      clipdist = out->data[cd[0]][i];
@@ -146,7 +145,7 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
                   if (clipdist < 0 || util_is_inf_or_nan(clipdist))
                      mask |= 1 << plane_idx;
                } else {
-                  if (dot4(clipvertex, plane[plane_idx]) < 0)
+                  if (!(dot4(clipvertex, plane[plane_idx]) >= 0))
                      mask |= 1 << plane_idx;
                }
             }
@@ -192,7 +191,6 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs,
 
       out = (struct vertex_header *)( (char *)out + info->stride );
    }
-
    return need_pipeline != 0;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index ee974243b3e..f25dafef954 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -188,6 +188,7 @@ create_jit_sampler_type(struct gallivm_state *gallivm, const char *struct_name)
    sampler_type = LLVMStructTypeInContext(gallivm->context, elem_types,
                                           Elements(elem_types), 0);
 
+   (void) target; /* silence unused var warning for non-debug build */
    LP_CHECK_MEMBER_OFFSET(struct draw_jit_sampler, min_lod,
                           target, sampler_type,
                           DRAW_JIT_SAMPLER_MIN_LOD);
@@ -234,6 +235,8 @@ create_jit_context_type(struct gallivm_state *gallivm,
                                  PIPE_MAX_SAMPLERS); /* samplers */
    context_type = LLVMStructTypeInContext(gallivm->context, elem_types,
                                           Elements(elem_types), 0);
+
+   (void) target; /* silence unused var warning for non-debug build */
    LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, vs_constants,
                           target, context_type, DRAW_JIT_CTX_CONSTANTS);
    LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, num_vs_constants,
@@ -375,15 +378,14 @@ static LLVMTypeRef
 create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems)
 {
    LLVMTargetDataRef target = gallivm->target;
-   LLVMTypeRef elem_types[4];
+   LLVMTypeRef elem_types[3];
    LLVMTypeRef vertex_header;
    char struct_name[24];
 
    util_snprintf(struct_name, 23, "vertex_header%d", data_elems);
 
    elem_types[DRAW_JIT_VERTEX_VERTEX_ID]  = LLVMIntTypeInContext(gallivm->context, 32);
-   elem_types[DRAW_JIT_VERTEX_CLIP]  = LLVMArrayType(LLVMFloatTypeInContext(gallivm->context), 4);
-   elem_types[DRAW_JIT_VERTEX_PRE_CLIP_POS]  = LLVMArrayType(LLVMFloatTypeInContext(gallivm->context), 4);
+   elem_types[DRAW_JIT_VERTEX_CLIP_POS]  = LLVMArrayType(LLVMFloatTypeInContext(gallivm->context), 4);
    elem_types[DRAW_JIT_VERTEX_DATA]  = LLVMArrayType(elem_types[1], data_elems);
 
    vertex_header = LLVMStructTypeInContext(gallivm->context, elem_types,
@@ -403,12 +405,10 @@ create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems)
       target, vertex_header,
       DRAW_JIT_VERTEX_VERTEX_ID);
    */
-   LP_CHECK_MEMBER_OFFSET(struct vertex_header, clip,
-                          target, vertex_header,
-                          DRAW_JIT_VERTEX_CLIP);
-   LP_CHECK_MEMBER_OFFSET(struct vertex_header, pre_clip_pos,
+   (void) target; /* silence unused var warning for non-debug build */
+   LP_CHECK_MEMBER_OFFSET(struct vertex_header, clip_pos,
                           target, vertex_header,
-                          DRAW_JIT_VERTEX_PRE_CLIP_POS);
+                          DRAW_JIT_VERTEX_CLIP_POS);
    LP_CHECK_MEMBER_OFFSET(struct vertex_header, data,
                           target, vertex_header,
                           DRAW_JIT_VERTEX_DATA);
@@ -826,7 +826,7 @@ store_aos(struct gallivm_state *gallivm,
  * struct vertex_header {
  *    unsigned clipmask:DRAW_TOTAL_CLIP_PLANES;
  *    unsigned edgeflag:1;
- *    unsigned have_clipdist:1;
+ *    unsigned pad:1;
  *    unsigned vertex_id:16;
  *    [...]
  * }
@@ -838,7 +838,7 @@ store_aos(struct gallivm_state *gallivm,
  * {
  *   return (x >> 16) |              // vertex_id
  *          ((x & 0x3fff) << 18) |   // clipmask
- *          ((x & 0x4000) << 3) |    // have_clipdist
+ *          ((x & 0x4000) << 3) |    // pad
  *          ((x & 0x8000) << 1);     // edgeflag
  * }
  */
@@ -850,19 +850,23 @@ adjust_mask(struct gallivm_state *gallivm,
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef vertex_id;
    LLVMValueRef clipmask;
-   LLVMValueRef have_clipdist;
+   LLVMValueRef pad;
    LLVMValueRef edgeflag;
 
    vertex_id = LLVMBuildLShr(builder, mask, lp_build_const_int32(gallivm, 16), "");
    clipmask  = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x3fff), "");
    clipmask  = LLVMBuildShl(builder, clipmask, lp_build_const_int32(gallivm, 18), "");
-   have_clipdist = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x4000), "");
-   have_clipdist = LLVMBuildShl(builder, have_clipdist, lp_build_const_int32(gallivm, 3), "");
+   if (0) {
+      pad = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x4000), "");
+      pad = LLVMBuildShl(builder, pad, lp_build_const_int32(gallivm, 3), "");
+   }
    edgeflag = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x8000), "");
    edgeflag = LLVMBuildShl(builder, edgeflag, lp_build_const_int32(gallivm, 1), "");
 
    mask = LLVMBuildOr(builder, vertex_id, clipmask, "");
-   mask = LLVMBuildOr(builder, mask, have_clipdist, "");
+   if (0) {
+      mask = LLVMBuildOr(builder, mask, pad, "");
+   }
    mask = LLVMBuildOr(builder, mask, edgeflag, "");
 #endif
    return mask;
@@ -877,7 +881,7 @@ store_aos_array(struct gallivm_state *gallivm,
                 int attrib,
                 int num_outputs,
                 LLVMValueRef clipmask,
-                boolean have_clipdist)
+                boolean need_edgeflag)
 {
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef attr_index = lp_build_const_int32(gallivm, attrib);
@@ -908,11 +912,15 @@ store_aos_array(struct gallivm_state *gallivm,
        * code here.  See struct vertex_header in draw_private.h.
        */
       assert(DRAW_TOTAL_CLIP_PLANES==14);
-      /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */
-      vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
-      if (have_clipdist)
-         vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1);
-      val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type), vertex_id_pad_edgeflag);
+      /* initialize vertex id:16 = 0xffff, pad:1 = 0, edgeflag:1 = 1 */
+      if (!need_edgeflag) {
+         vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES);
+      }
+      else {
+         vertex_id_pad_edgeflag = (0xffff << 16);
+      }
+      val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type),
+                                   vertex_id_pad_edgeflag);
       /* OR with the clipmask */
       cliptmp = LLVMBuildOr(builder, val, clipmask, "");
       for (i = 0; i < vector_length; i++) {
@@ -942,7 +950,7 @@ convert_to_aos(struct gallivm_state *gallivm,
                LLVMValueRef clipmask,
                int num_outputs,
                struct lp_type soa_type,
-               boolean have_clipdist)
+               boolean need_edgeflag)
 {
    LLVMBuilderRef builder = gallivm->builder;
    unsigned chan, attrib, i;
@@ -998,7 +1006,8 @@ convert_to_aos(struct gallivm_state *gallivm,
                       aos,
                       attrib,
                       num_outputs,
-                      clipmask, have_clipdist);
+                      clipmask,
+                      need_edgeflag);
    }
 #if DEBUG_STORE
    lp_build_printf(gallivm, "   # storing end\n");
@@ -1014,7 +1023,7 @@ store_clip(struct gallivm_state *gallivm,
            const struct lp_type vs_type,
            LLVMValueRef io_ptr,
            LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
-           boolean pre_clip_pos, int idx)
+           int idx)
 {
    LLVMBuilderRef builder = gallivm->builder;
    LLVMValueRef soa[4];
@@ -1041,14 +1050,8 @@ store_clip(struct gallivm_state *gallivm,
    soa[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 .. zn*/
    soa[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 .. wn*/
 
-   if (!pre_clip_pos) {
-      for (i = 0; i < vs_type.length; i++) {
-         clip_ptrs[i] = draw_jit_header_clip(gallivm, io_ptrs[i]);
-      }
-   } else {
-      for (i = 0; i < vs_type.length; i++) {
-         clip_ptrs[i] = draw_jit_header_pre_clip_pos(gallivm, io_ptrs[i]);
-      }
+   for (i = 0; i < vs_type.length; i++) {
+      clip_ptrs[i] = draw_jit_header_clip_pos(gallivm, io_ptrs[i]);
    }
 
    lp_build_transpose_aos(gallivm, vs_type, soa, soa);
@@ -1140,11 +1143,7 @@ generate_clipmask(struct draw_llvm *llvm,
                   struct gallivm_state *gallivm,
                   struct lp_type vs_type,
                   LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
-                  boolean clip_xy,
-                  boolean clip_z,
-                  boolean clip_user,
-                  boolean clip_halfz,
-                  unsigned ucp_enable,
+                  struct draw_llvm_variant_key *key,
                   LLVMValueRef context_ptr,
                   boolean *have_clipdist)
 {
@@ -1160,7 +1159,9 @@ generate_clipmask(struct draw_llvm *llvm,
    const unsigned pos = llvm->draw->vs.position_output;
    const unsigned cv = llvm->draw->vs.clipvertex_output;
    int num_written_clipdistance = llvm->draw->vs.vertex_shader->info.num_written_clipdistance;
-   bool have_cd = false;
+   boolean have_cd = false;
+   boolean clip_user = key->clip_user;
+   unsigned ucp_enable = key->ucp_enable;
    unsigned cd[2];
 
    cd[0] = llvm->draw->vs.clipdistance_output[0];
@@ -1200,8 +1201,16 @@ generate_clipmask(struct draw_llvm *llvm,
       cv_w = pos_w;
    }
 
+   /*
+    * Be careful with the comparisons and NaNs (using llvm's unordered
+    * comparisons here).
+    */
    /* Cliptest, for hardwired planes */
-   if (clip_xy) {
+   /*
+    * XXX should take guardband into account (currently not in key).
+    * Otherwise might run the draw pipeline stages for nothing.
+    */
+   if (key->clip_xy) {
       /* plane 1 */
       test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, pos_x , pos_w);
       temp = shift;
@@ -1229,9 +1238,9 @@ generate_clipmask(struct draw_llvm *llvm,
       mask = LLVMBuildOr(builder, mask, test, "");
    }
 
-   if (clip_z) {
+   if (key->clip_z) {
       temp = lp_build_const_int_vec(gallivm, i32_type, 16);
-      if (clip_halfz) {
+      if (key->clip_halfz) {
          /* plane 5 */
          test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, pos_z);
          test = LLVMBuildAnd(builder, test, temp, "");
@@ -1318,6 +1327,20 @@ generate_clipmask(struct draw_llvm *llvm,
          }
       }
    }
+   if (key->need_edgeflags) {
+      /*
+       * This isn't really part of clipmask but stored the same in vertex
+       * header later, so do it here.
+       */
+      unsigned edge_attr = llvm->draw->vs.edgeflag_output;
+      LLVMValueRef one = lp_build_const_vec(gallivm, f32_type, 1.0);
+      LLVMValueRef edgeflag = LLVMBuildLoad(builder, outputs[edge_attr][0], "");
+      test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_EQUAL, one, edgeflag);
+      temp = lp_build_const_int_vec(gallivm, i32_type,
+                                    1LL << DRAW_TOTAL_CLIP_PLANES);
+      test = LLVMBuildAnd(builder, test, temp, "");
+      mask = LLVMBuildOr(builder, mask, test, "");
+   }
    return mask;
 }
 
@@ -1329,7 +1352,8 @@ generate_clipmask(struct draw_llvm *llvm,
 static LLVMValueRef
 clipmask_booli32(struct gallivm_state *gallivm,
                  const struct lp_type vs_type,
-                 LLVMValueRef clipmask_bool_ptr)
+                 LLVMValueRef clipmask_bool_ptr,
+                 boolean edgeflag_in_clipmask)
 {
    LLVMBuilderRef builder = gallivm->builder;
    LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
@@ -1339,8 +1363,18 @@ clipmask_booli32(struct gallivm_state *gallivm,
    int i;
 
    /*
-    * Can do this with log2(vector length) pack instructions and one extract
-    * (as we don't actually need a or) with sse2 which would be way better.
+    * We need to invert the edgeflag bit from the clipmask here
+    * (because the result is really if we want to run the pipeline or not
+    * and we (may) need it if edgeflag was 0).
+    */
+   if (edgeflag_in_clipmask) {
+      struct lp_type i32_type = lp_int_type(vs_type);
+      LLVMValueRef edge = lp_build_const_int_vec(gallivm, i32_type,
+                                                 1LL << DRAW_TOTAL_CLIP_PLANES);
+      clipmask_bool = LLVMBuildXor(builder, clipmask_bool, edge, "");
+   }
+   /*
+    * Could do much better with just cmp/movmskps.
     */
    for (i=0; i < vs_type.length; i++) {
       temp = LLVMBuildExtractElement(builder, clipmask_bool,
@@ -1536,8 +1570,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
    const boolean bypass_viewport = key->has_gs || key->bypass_viewport ||
                                    llvm->draw->vs.vertex_shader->info.writes_viewport_index;
    const boolean enable_cliptest = !key->has_gs && (key->clip_xy ||
-                                                    key->clip_z  ||
-                                                    key->clip_user);
+                                                    key->clip_z ||
+                                                    key->clip_user ||
+                                                    key->need_edgeflags);
    LLVMValueRef variant_func;
    const unsigned pos = llvm->draw->vs.position_output;
    const unsigned cv = llvm->draw->vs.clipvertex_output;
@@ -1766,8 +1801,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
 
       if (pos != -1 && cv != -1) {
          /* store original positions in clip before further manipulation */
-         store_clip(gallivm, vs_type, io, outputs, FALSE, key->clip_user ? cv : pos);
-         store_clip(gallivm, vs_type, io, outputs, TRUE, pos);
+         store_clip(gallivm, vs_type, io, outputs, pos);
 
          /* do cliptest */
          if (enable_cliptest) {
@@ -1777,11 +1811,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
                                          gallivm,
                                          vs_type,
                                          outputs,
-                                         key->clip_xy,
-                                         key->clip_z,
-                                         key->clip_user,
-                                         key->clip_halfz,
-                                         key->ucp_enable,
+                                         key,
                                          context_ptr, &have_clipdist);
             temp = LLVMBuildOr(builder, clipmask, temp, "");
             /* store temporary clipping boolean value */
@@ -1806,14 +1836,15 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant,
        */
       convert_to_aos(gallivm, io, NULL, outputs, clipmask,
                      vs_info->num_outputs, vs_type,
-                     have_clipdist);
+                     enable_cliptest && key->need_edgeflags);
    }
    lp_build_loop_end_cond(&lp_loop, count, step, LLVMIntUGE);
 
    sampler->destroy(sampler);
 
    /* return clipping boolean value for function */
-   ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr);
+   ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr,
+                          enable_cliptest && key->need_edgeflags);
 
    LLVMBuildRet(builder, ret);
 
@@ -1847,6 +1878,7 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store)
    key->clip_user = llvm->draw->clip_user;
    key->bypass_viewport = llvm->draw->bypass_viewport;
    key->clip_halfz = llvm->draw->rasterizer->clip_halfz;
+   /* XXX assumes edgeflag output not at 0 */
    key->need_edgeflags = (llvm->draw->vs.edgeflag_output ? TRUE : FALSE);
    key->ucp_enable = llvm->draw->rasterizer->clip_plane_enable;
    key->has_gs = llvm->draw->gs.geometry_shader != NULL;
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index d153c166ead..f617a29e2e7 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -104,8 +104,7 @@ enum {
 
 enum {
    DRAW_JIT_VERTEX_VERTEX_ID = 0,
-   DRAW_JIT_VERTEX_CLIP,
-   DRAW_JIT_VERTEX_PRE_CLIP_POS,
+   DRAW_JIT_VERTEX_CLIP_POS,
    DRAW_JIT_VERTEX_DATA
 };
 
@@ -162,11 +161,8 @@ enum {
 #define draw_jit_header_id(_gallivm, _ptr)              \
    lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_VERTEX_ID, "id")
 
-#define draw_jit_header_clip(_gallivm, _ptr) \
-   lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_CLIP, "clip")
-
-#define draw_jit_header_pre_clip_pos(_gallivm, _ptr) \
-   lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_PRE_CLIP_POS, "pre_clip_pos")
+#define draw_jit_header_clip_pos(_gallivm, _ptr) \
+   lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_CLIP_POS, "clip_pos")
 
 #define draw_jit_header_data(_gallivm, _ptr)            \
    lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_DATA, "data")
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 877db5979b1..3ce550a3ae8 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -646,6 +646,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
    struct pipe_context *pipe = draw->pipe;
    const struct pipe_rasterizer_state *rast = draw->rasterizer;
    uint num_samplers;
+   uint num_sampler_views;
    void *r;
 
    assert(draw->rasterizer->line_smooth);
@@ -667,9 +668,9 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
    draw_aaline_prepare_outputs(draw, draw->pipeline.aaline);
 
    /* how many samplers? */
-   /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
-   num_samplers = MAX2(aaline->num_sampler_views, aaline->num_samplers);
-   num_samplers = MAX2(num_samplers, aaline->fs->sampler_unit + 1);
+   /* we'll use sampler/texture[aaline->sampler_unit] for the alpha texture */
+   num_samplers = MAX2(aaline->num_samplers, aaline->fs->sampler_unit + 1);
+   num_sampler_views = MAX2(num_samplers, aaline->num_sampler_views);
 
    aaline->state.sampler[aaline->fs->sampler_unit] = aaline->sampler_cso;
    pipe_sampler_view_reference(&aaline->state.sampler_views[aaline->fs->sampler_unit],
@@ -681,7 +682,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
                                       num_samplers, aaline->state.sampler);
 
    aaline->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0,
-                                    num_samplers, aaline->state.sampler_views);
+                                    num_sampler_views, aaline->state.sampler_views);
 
    /* Disable triangle culling, stippling, unfilled mode etc. */
    r = draw_get_rasterizer_no_cull(draw, rast->scissor, rast->flatshade);
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 47765cd0ef9..67d8ecaa35f 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -59,6 +59,8 @@ struct clip_stage {
    struct draw_stage stage;      /**< base class */
 
    unsigned pos_attr;
+   boolean have_clipdist;
+   int cv_attr;
 
    /* List of the attributes to be constant interpolated. */
    uint num_const_attribs;
@@ -145,20 +147,23 @@ static void interp(const struct clip_stage *clip,
     */
    dst->clipmask = 0;
    dst->edgeflag = 0;        /* will get overwritten later */
-   dst->have_clipdist = in->have_clipdist;
+   dst->pad = 0;
    dst->vertex_id = UNDEFINED_VERTEX_ID;
 
    /* Interpolate the clip-space coords.
     */
-   interp_attr(dst->clip, t, in->clip, out->clip);
+   if (clip->cv_attr >= 0) {
+      interp_attr(dst->data[clip->cv_attr], t,
+                  in->data[clip->cv_attr], out->data[clip->cv_attr]);
+   }
    /* interpolate the clip-space position */
-   interp_attr(dst->pre_clip_pos, t, in->pre_clip_pos, out->pre_clip_pos);
+   interp_attr(dst->clip_pos, t, in->clip_pos, out->clip_pos);
 
    /* Do the projective divide and viewport transformation to get
     * new window coordinates:
     */
    {
-      const float *pos = dst->pre_clip_pos;
+      const float *pos = dst->clip_pos;
       const float *scale =
          clip->stage.draw->viewports[viewport_index].scale;
       const float *trans =
@@ -192,11 +197,11 @@ static void interp(const struct clip_stage *clip,
       t_nopersp = t;
       /* find either in.x != out.x or in.y != out.y */
       for (k = 0; k < 2; k++) {
-         if (in->pre_clip_pos[k] != out->pre_clip_pos[k]) {
+         if (in->clip_pos[k] != out->clip_pos[k]) {
             /* do divide by W, then compute linear interpolation factor */
-            float in_coord = in->pre_clip_pos[k] / in->pre_clip_pos[3];
-            float out_coord = out->pre_clip_pos[k] / out->pre_clip_pos[3];
-            float dst_coord = dst->pre_clip_pos[k] / dst->pre_clip_pos[3];
+            float in_coord = in->clip_pos[k] / in->clip_pos[3];
+            float out_coord = out->clip_pos[k] / out->clip_pos[3];
+            float dst_coord = dst->clip_pos[k] / dst->clip_pos[3];
             t_nopersp = (dst_coord - out_coord) / (in_coord - out_coord);
             break;
          }
@@ -214,9 +219,9 @@ static void interp(const struct clip_stage *clip,
  * Triangle is considered null/empty if its area is equal to zero.
  */
 static inline boolean
-is_tri_null(struct draw_context *draw, const struct prim_header *header)
+is_tri_null(const struct clip_stage *clip, const struct prim_header *header)
 {
-   const unsigned pos_attr = draw_current_shader_position_output(draw);
+   const unsigned pos_attr = clip->pos_attr;
    float x1 = header->v[1]->data[pos_attr][0] - header->v[0]->data[pos_attr][0];
    float y1 = header->v[1]->data[pos_attr][1] - header->v[0]->data[pos_attr][1];
    float z1 = header->v[1]->data[pos_attr][2] - header->v[0]->data[pos_attr][2];
@@ -242,6 +247,7 @@ static void emit_poly(struct draw_stage *stage,
                       unsigned n,
                       const struct prim_header *origPrim)
 {
+   const struct clip_stage *clipper = clip_stage(stage);
    struct prim_header header;
    unsigned i;
    ushort edge_first, edge_middle, edge_last;
@@ -281,7 +287,7 @@ static void emit_poly(struct draw_stage *stage,
          header.v[2] = inlist[0];  /* the provoking vertex */
       }
 
-      tri_null = is_tri_null(stage->draw, &header);
+      tri_null = is_tri_null(clipper, &header);
       /* If we generated a triangle with an area, aka. non-null triangle,
        * or if the previous triangle was also null then skip all subsequent
        * null triangles */
@@ -306,11 +312,18 @@ static void emit_poly(struct draw_stage *stage,
          debug_printf("Clipped tri: (flat-shade-first = %d)\n",
                       stage->draw->rasterizer->flatshade_first);
          for (j = 0; j < 3; j++) {
-            debug_printf("  Vert %d: clip: %f %f %f %f\n", j,
-                         header.v[j]->clip[0],
-                         header.v[j]->clip[1],
-                         header.v[j]->clip[2],
-                         header.v[j]->clip[3]);
+            debug_printf("  Vert %d: clip pos: %f %f %f %f\n", j,
+                         header.v[j]->clip_pos[0],
+                         header.v[j]->clip_pos[1],
+                         header.v[j]->clip_pos[2],
+                         header.v[j]->clip_pos[3]);
+            if (clipper->cv_attr >= 0) {
+               debug_printf("  Vert %d: cv: %f %f %f %f\n", j,
+                            header.v[j]->data[clipper->cv_attr][0],
+                            header.v[j]->data[clipper->cv_attr][1],
+                            header.v[j]->data[clipper->cv_attr][2],
+                            header.v[j]->data[clipper->cv_attr][3]);
+            }
             for (k = 0; k < draw_num_shader_outputs(stage->draw); k++) {
                debug_printf("  Vert %d: Attr %d:  %f %f %f %f\n", j, k,
                             header.v[j]->data[k][0],
@@ -320,7 +333,7 @@ static void emit_poly(struct draw_stage *stage,
             }
          }
       }
-      stage->next->tri( stage->next, &header );
+      stage->next->tri(stage->next, &header);
    }
 }
 
@@ -345,15 +358,28 @@ static inline float getclipdist(const struct clip_stage *clipper,
 {
    const float *plane;
    float dp;
-   if (vert->have_clipdist && plane_idx >= 6) {
+   if (plane_idx < 6) {
+      /* ordinary xyz view volume clipping uses pos output */
+      plane = clipper->plane[plane_idx];
+      dp = dot4(vert->clip_pos, plane);
+   }
+   else if (clipper->have_clipdist) {
       /* pick the correct clipdistance element from the output vectors */
       int _idx = plane_idx - 6;
       int cdi = _idx >= 4;
       int vidx = cdi ? _idx - 4 : _idx;
       dp = vert->data[draw_current_shader_clipdistance_output(clipper->stage.draw, cdi)][vidx];
    } else {
+      /*
+       * legacy user clip planes or gl_ClipVertex
+       */
       plane = clipper->plane[plane_idx];
-      dp = dot4(vert->clip, plane);
+      if (clipper->cv_attr >= 0) {
+         dp = dot4(vert->data[clipper->cv_attr], plane);
+      }
+      else {
+         dp = dot4(vert->clip_pos, plane);
+      }
    }
    return dp;
 }
@@ -400,13 +426,22 @@ do_clip_tri(struct draw_stage *stage,
    viewport_index = draw_viewport_index(clipper->stage.draw, prov_vertex);
 
    if (DEBUG_CLIP) {
-      const float *v0 = header->v[0]->clip;
-      const float *v1 = header->v[1]->clip;
-      const float *v2 = header->v[2]->clip;
-      debug_printf("Clip triangle:\n");
+      const float *v0 = header->v[0]->clip_pos;
+      const float *v1 = header->v[1]->clip_pos;
+      const float *v2 = header->v[2]->clip_pos;
+      debug_printf("Clip triangle pos:\n");
       debug_printf(" %f, %f, %f, %f\n", v0[0], v0[1], v0[2], v0[3]);
       debug_printf(" %f, %f, %f, %f\n", v1[0], v1[1], v1[2], v1[3]);
       debug_printf(" %f, %f, %f, %f\n", v2[0], v2[1], v2[2], v2[3]);
+      if (clipper->cv_attr >= 0) {
+         const float *v0 = header->v[0]->data[clipper->cv_attr];
+         const float *v1 = header->v[1]->data[clipper->cv_attr];
+         const float *v2 = header->v[2]->data[clipper->cv_attr];
+         debug_printf("Clip triangle cv:\n");
+         debug_printf(" %f, %f, %f, %f\n", v0[0], v0[1], v0[2], v0[3]);
+         debug_printf(" %f, %f, %f, %f\n", v1[0], v1[1], v1[2], v1[3]);
+         debug_printf(" %f, %f, %f, %f\n", v2[0], v2[1], v2[2], v2[3]);
+      }
    }
 
    /*
@@ -555,7 +590,7 @@ do_clip_tri(struct draw_stage *stage,
 
       /* Emit the polygon as triangles to the setup stage:
        */
-      emit_poly( stage, inlist, inEdges, n, header );
+      emit_poly(stage, inlist, inEdges, n, header);
    }
 }
 
@@ -567,7 +602,7 @@ do_clip_line(struct draw_stage *stage,
              struct prim_header *header,
              unsigned clipmask)
 {
-   const struct clip_stage *clipper = clip_stage( stage );
+   const struct clip_stage *clipper = clip_stage(stage);
    struct vertex_header *v0 = header->v[0];
    struct vertex_header *v1 = header->v[1];
    struct vertex_header *prov_vertex;
@@ -671,9 +706,9 @@ clip_point_guard_xy(struct draw_stage *stage, struct prim_header *header)
           * automatically). These would usually be captured by depth clip
           * too but this can be disabled.
           */
-         if (header->v[0]->clip[3] <= 0.0f ||
-             util_is_inf_or_nan(header->v[0]->clip[0]) ||
-             util_is_inf_or_nan(header->v[0]->clip[1]))
+         if (header->v[0]->clip_pos[3] <= 0.0f ||
+             util_is_inf_or_nan(header->v[0]->clip_pos[0]) ||
+             util_is_inf_or_nan(header->v[0]->clip_pos[1]))
             return;
       }
       stage->next->point(stage->next, header);
@@ -773,7 +808,7 @@ find_interp(const struct draw_fragment_shader *fs, int *indexed_interp,
 static void 
 clip_init_state(struct draw_stage *stage)
 {
-   struct clip_stage *clipper = clip_stage( stage );
+   struct clip_stage *clipper = clip_stage(stage);
    const struct draw_context *draw = stage->draw;
    const struct draw_fragment_shader *fs = draw->fs.fragment_shader;
    const struct tgsi_shader_info *info = draw_get_shader_info(draw);
@@ -781,6 +816,13 @@ clip_init_state(struct draw_stage *stage)
    int indexed_interp[2];
 
    clipper->pos_attr = draw_current_shader_position_output(draw);
+   clipper->have_clipdist = draw_current_shader_num_written_clipdistances(draw) > 0;
+   if (draw_current_shader_clipvertex_output(draw) != clipper->pos_attr) {
+      clipper->cv_attr = (int)draw_current_shader_clipvertex_output(draw);
+   }
+   else {
+      clipper->cv_attr = -1;
+   }
 
    /* We need to know for each attribute what kind of interpolation is
     * done on it (flat, smooth or noperspective).  But the information
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index b58d7530783..cf52ca48b26 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -477,6 +477,7 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
    struct pipe_context *pipe = pstip->pipe;
    struct draw_context *draw = stage->draw;
    uint num_samplers;
+   uint num_sampler_views;
 
    assert(stage->draw->rasterizer->poly_stipple_enable);
 
@@ -490,8 +491,8 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
 
    /* how many samplers? */
    /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
-   num_samplers = MAX2(pstip->num_sampler_views, pstip->num_samplers);
-   num_samplers = MAX2(num_samplers, pstip->fs->sampler_unit + 1);
+   num_samplers = MAX2(pstip->num_samplers, pstip->fs->sampler_unit + 1);
+   num_sampler_views = MAX2(pstip->num_sampler_views, num_samplers);
 
    /* plug in our sampler, texture */
    pstip->state.samplers[pstip->fs->sampler_unit] = pstip->sampler_cso;
@@ -506,7 +507,7 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
                                      num_samplers, pstip->state.samplers);
 
    pstip->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0,
-                                   num_samplers, pstip->state.sampler_views);
+                                   num_sampler_views, pstip->state.sampler_views);
 
    draw->suspend_flushing = FALSE;
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 5584c4a222c..8774bebd5f9 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -86,11 +86,10 @@ struct draw_vertex_buffer {
 struct vertex_header {
    unsigned clipmask:DRAW_TOTAL_CLIP_PLANES;
    unsigned edgeflag:1;
-   unsigned have_clipdist:1;
+   unsigned pad:1;
    unsigned vertex_id:16;
 
-   float clip[4];
-   float pre_clip_pos[4];
+   float clip_pos[4];
 
    /* This will probably become float (*data)[4] soon:
     */
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index d1eafd84032..0b9fab5721c 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -60,7 +60,7 @@ draw_pt_emit_prepare(struct pt_emit *emit,
 
    /* XXX: need to flush to get prim_vbuf.c to release its allocation??
     */
-   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+   draw_do_flush(draw, DRAW_FLUSH_BACKEND);
 
    /* XXX: may need to defensively reset this later on as clipping can
     * clobber this state in the render backend.
@@ -80,7 +80,7 @@ draw_pt_emit_prepare(struct pt_emit *emit,
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       unsigned output_format;
-      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
+      unsigned src_offset = vinfo->attrib[i].src_index * 4 * sizeof(float);
 
       output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit);
       emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit);
@@ -89,8 +89,8 @@ draw_pt_emit_prepare(struct pt_emit *emit,
       assert(emit_sz != 0);
 
       if (vinfo->attrib[i].emit == EMIT_1F_PSIZE) {
-	 src_buffer = 1;
-	 src_offset = 0;
+         src_buffer = 1;
+         src_offset = 0;
       }
 
       hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL;
@@ -138,7 +138,7 @@ draw_pt_emit(struct pt_emit *emit,
 
    /* XXX: need to flush to get prim_vbuf.c to release its allocation??
     */
-   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+   draw_do_flush(draw, DRAW_FLUSH_BACKEND);
 
    if (vertex_count == 0)
       return;
@@ -152,31 +152,31 @@ draw_pt_emit(struct pt_emit *emit,
                              (ushort)translate->key.output_stride,
                              (ushort)vertex_count);
 
-   hw_verts = render->map_vertices( render );
+   hw_verts = render->map_vertices(render);
    if (!hw_verts) {
       debug_warn_once("map of vertex buffer failed (out of memory?)");
       return;
    }
 
    translate->set_buffer(translate,
-			 0,
-			 vertex_data,
-			 stride,
-			 ~0);
+                         0,
+                         vertex_data,
+                         stride,
+                         ~0);
 
    translate->set_buffer(translate,
-			 1,
-			 &draw->rasterizer->point_size,
-			 0,
-			 ~0);
+                         1,
+                         &draw->rasterizer->point_size,
+                         0,
+                         ~0);
 
    /* fetch/translate vertex attribs to fill hw_verts[] */
    translate->run(translate,
-		  0,
-		  vertex_count,
-                  draw->start_instance,
-                  draw->instance_id,
-		  hw_verts );
+                  0,
+                  vertex_count,
+                  0,
+                  0,
+                  hw_verts);
 
    render->unmap_vertices(render, 0, vertex_count - 1);
 
@@ -212,7 +212,7 @@ draw_pt_emit_linear(struct pt_emit *emit,
 #endif
    /* XXX: need to flush to get prim_vbuf.c to release its allocation??
     */
-   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+   draw_do_flush(draw, DRAW_FLUSH_BACKEND);
 
    /* XXX: and work out some way to coordinate the render primitive
     * between vbuf.c and here...
@@ -224,35 +224,35 @@ draw_pt_emit_linear(struct pt_emit *emit,
                                   (ushort)count))
       goto fail;
 
-   hw_verts = render->map_vertices( render );
+   hw_verts = render->map_vertices(render);
    if (!hw_verts)
       goto fail;
 
    translate->set_buffer(translate, 0,
-			 vertex_data, stride, count - 1);
+                         vertex_data, stride, count - 1);
 
    translate->set_buffer(translate, 1,
-			 &draw->rasterizer->point_size,
-			 0, ~0);
+                         &draw->rasterizer->point_size,
+                         0, ~0);
 
    translate->run(translate,
                   0,
                   count,
-                  draw->start_instance,
-                  draw->instance_id,
+                  0,
+                  0,
                   hw_verts);
 
    if (0) {
       unsigned i;
       for (i = 0; i < count; i++) {
          debug_printf("\n\n%s vertex %d:\n", __FUNCTION__, i);
-         draw_dump_emitted_vertex( emit->vinfo,
-                                   (const uint8_t *)hw_verts +
-                                   translate->key.output_stride * i );
+         draw_dump_emitted_vertex(emit->vinfo,
+                                  (const uint8_t *)hw_verts +
+                                  translate->key.output_stride * i);
       }
    }
 
-   render->unmap_vertices( render, 0, count - 1 );
+   render->unmap_vertices(render, 0, count - 1);
 
    for (start = i = 0;
         i < prim_info->primitive_count;
@@ -262,7 +262,7 @@ draw_pt_emit_linear(struct pt_emit *emit,
                           start,
                           prim_info->primitive_lengths[i]);
    }
-   
+
    render->release_vertices(render);
 
    return;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 2d7569b0fdf..edd4541270a 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -453,6 +453,7 @@ llvm_pipeline_generic(struct draw_pt_middle_end *middle,
                                draw->vs.vertex_shader->info.writes_viewport_index)) {
          clipped = draw_pt_post_vs_run( fpme->post_vs, vert_info, prim_info );
       }
+      /* "clipped" also includes non-one edgeflag */
       if (clipped) {
          opt |= PT_PIPELINE;
       }
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
index f0d5e0f5656..3a9101a44d0 100644
--- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -58,7 +58,7 @@ initialize_vertex_header(struct vertex_header *header)
 {
    header->clipmask = 0;
    header->edgeflag = 1;
-   header->have_clipdist = 0;
+   header->pad = 0;
    header->vertex_id = UNDEFINED_VERTEX_ID;
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
index 20de26fd08a..261bd3467f9 100644
--- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -275,7 +275,7 @@ void draw_pt_so_emit( struct pt_so_emit *emit,
    emit->generated_primitives = 0;
    emit->input_vertex_stride = input_verts->stride;
    if (emit->use_pre_clip_pos)
-      emit->pre_clip_pos = input_verts->verts->pre_clip_pos;
+      emit->pre_clip_pos = input_verts->verts->clip_pos;
 
    emit->inputs = (const float (*)[4])input_verts->verts->data;
 
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 5def6d3f32a..2cb723c5793 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -22,10 +22,6 @@
  * IN THE SOFTWARE.
  */
 
-#ifdef __GNUC__
-#pragma GCC diagnostic ignored "-Wdeclaration-after-statement"
-#endif
-
 #include "util/ralloc.h"
 #include "glsl/nir/nir.h"
 #include "glsl/nir/nir_control_flow.h"
@@ -1069,7 +1065,9 @@ ttn_kill(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
 static void
 ttn_kill_if(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
 {
-   nir_ssa_def *cmp = nir_bany4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)));
+   nir_ssa_def *cmp = nir_bany_inequal4(b, nir_flt(b, src[0],
+                                                   nir_imm_float(b, 0.0)),
+                                        nir_imm_int(b, 0));
    nir_intrinsic_instr *discard =
       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
    discard->src[0] = nir_src_for_ssa(cmp);
@@ -1901,6 +1899,7 @@ ttn_emit_instruction(struct ttn_compile *c)
                                            &tgsi_dst->Indirect : NULL;
 
       store->num_components = 4;
+      store->const_index[0] = 0xf;
       store->variables[0] = ttn_array_deref(c, store, var, offset, indirect);
       store->src[0] = nir_src_for_reg(dest.dest.reg.reg);
 
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index eab48c5f00d..ed4a2324896 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -398,9 +398,8 @@ util_blitter_save_stencil_ref(struct blitter_context *blitter,
    blitter->saved_stencil_ref = *state;
 }
 
-static inline
-void util_blitter_save_rasterizer(struct blitter_context *blitter,
-                                  void *state)
+static inline void
+util_blitter_save_rasterizer(struct blitter_context *blitter, void *state)
 {
    blitter->saved_rs_state = state;
 }
@@ -459,8 +458,8 @@ util_blitter_save_scissor(struct blitter_context *blitter,
    blitter->saved_scissor = *state;
 }
 
-static inline
-void util_blitter_save_fragment_sampler_states(
+static inline void
+util_blitter_save_fragment_sampler_states(
                   struct blitter_context *blitter,
                   unsigned num_sampler_states,
                   void **sampler_states)
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.h b/src/gallium/auxiliary/util/u_simple_shaders.h
index cda0f2e86ec..fe1917f06c9 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.h
+++ b/src/gallium/auxiliary/util/u_simple_shaders.h
@@ -31,6 +31,7 @@
 
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_shader_tokens.h"
 
 
 struct pipe_context;
diff --git a/src/gallium/drivers/ddebug/dd_draw.c b/src/gallium/drivers/ddebug/dd_draw.c
index b443c5b0b03..0d7ee9a1686 100644
--- a/src/gallium/drivers/ddebug/dd_draw.c
+++ b/src/gallium/drivers/ddebug/dd_draw.c
@@ -588,8 +588,11 @@ dd_context_flush(struct pipe_context *_pipe,
 static void
 dd_before_draw(struct dd_context *dctx)
 {
-   if (dd_screen(dctx->base.screen)->mode == DD_DETECT_HANGS &&
-       !dd_screen(dctx->base.screen)->no_flush)
+   struct dd_screen *dscreen = dd_screen(dctx->base.screen);
+
+   if (dscreen->mode == DD_DETECT_HANGS &&
+       !dscreen->no_flush &&
+       dctx->num_draw_calls >= dscreen->skip_count)
       dd_flush_and_handle_hang(dctx, NULL, 0,
                                "GPU hang most likely caused by internal "
                                "driver commands");
@@ -598,22 +601,31 @@ dd_before_draw(struct dd_context *dctx)
 static void
 dd_after_draw(struct dd_context *dctx, struct dd_call *call)
 {
-   switch (dd_screen(dctx->base.screen)->mode) {
-   case DD_DETECT_HANGS:
-      if (!dd_screen(dctx->base.screen)->no_flush &&
-          dd_flush_and_check_hang(dctx, NULL, 0)) {
-         dd_dump_call(dctx, call, PIPE_DEBUG_DEVICE_IS_HUNG);
+   struct dd_screen *dscreen = dd_screen(dctx->base.screen);
 
-         /* Terminate the process to prevent future hangs. */
-         dd_kill_process();
+   if (dctx->num_draw_calls >= dscreen->skip_count) {
+      switch (dscreen->mode) {
+      case DD_DETECT_HANGS:
+         if (!dscreen->no_flush &&
+            dd_flush_and_check_hang(dctx, NULL, 0)) {
+            dd_dump_call(dctx, call, PIPE_DEBUG_DEVICE_IS_HUNG);
+
+            /* Terminate the process to prevent future hangs. */
+            dd_kill_process();
+         }
+         break;
+      case DD_DUMP_ALL_CALLS:
+         dd_dump_call(dctx, call, 0);
+         break;
+      default:
+         assert(0);
       }
-      break;
-   case DD_DUMP_ALL_CALLS:
-      dd_dump_call(dctx, call, 0);
-      break;
-   default:
-      assert(0);
    }
+
+   ++dctx->num_draw_calls;
+   if (dscreen->skip_count && dctx->num_draw_calls % 10000 == 0)
+      fprintf(stderr, "Gallium debugger reached %u draw calls.\n",
+              dctx->num_draw_calls);
 }
 
 static void
diff --git a/src/gallium/drivers/ddebug/dd_pipe.h b/src/gallium/drivers/ddebug/dd_pipe.h
index 34f59203e4b..a045518dc16 100644
--- a/src/gallium/drivers/ddebug/dd_pipe.h
+++ b/src/gallium/drivers/ddebug/dd_pipe.h
@@ -45,6 +45,7 @@ struct dd_screen
    unsigned timeout_ms;
    enum dd_mode mode;
    bool no_flush;
+   unsigned skip_count;
 };
 
 struct dd_query
@@ -110,6 +111,8 @@ struct dd_context
    struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS];
    struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
    float tess_default_levels[6];
+
+   unsigned num_draw_calls;
 };
 
 
diff --git a/src/gallium/drivers/ddebug/dd_screen.c b/src/gallium/drivers/ddebug/dd_screen.c
index a776580c9bb..2716845f58f 100644
--- a/src/gallium/drivers/ddebug/dd_screen.c
+++ b/src/gallium/drivers/ddebug/dd_screen.c
@@ -290,6 +290,9 @@ ddebug_screen_create(struct pipe_screen *screen)
       puts("    $HOME/"DD_DIR"/ when a hang is detected.");
       puts("    If 'noflush' is specified, only detect hangs in pipe->flush.");
       puts("");
+      puts("  GALLIUM_DDEBUG_SKIP=[count]");
+      puts("    Skip flush and hang detection for the given initial number of draw calls.");
+      puts("");
       exit(0);
    }
 
@@ -349,5 +352,11 @@ ddebug_screen_create(struct pipe_screen *screen)
       assert(0);
    }
 
+   dscreen->skip_count = debug_get_num_option("GALLIUM_DDEBUG_SKIP", 0);
+   if (dscreen->skip_count > 0) {
+      fprintf(stderr, "Gallium debugger skipping the first %u draw calls.\n",
+              dscreen->skip_count);
+   }
+
    return &dscreen->base;
 }
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index 77f708f449c..d23111352b7 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index a6940dfefea..c4f253b836c 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
@@ -1421,15 +1421,23 @@ static inline uint32_t A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(enum adreno_pa_
 #define REG_A3XX_PC_RESTART_INDEX				0x000021ed
 
 #define REG_A3XX_HLSQ_CONTROL_0_REG				0x00002200
-#define A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK		0x00000010
+#define A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK		0x00000030
 #define A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT		4
 static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(enum a3xx_threadsize val)
 {
 	return ((val) << A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT) & A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK;
 }
 #define A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE		0x00000040
+#define A3XX_HLSQ_CONTROL_0_REG_COMPUTEMODE			0x00000100
 #define A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART			0x00000200
 #define A3XX_HLSQ_CONTROL_0_REG_RESERVED2			0x00000400
+#define A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__MASK	0x00fff000
+#define A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__SHIFT	12
+static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC(uint32_t val)
+{
+	return ((val) << A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__SHIFT) & A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__MASK;
+}
+#define A3XX_HLSQ_CONTROL_0_REG_FSONLYTEX			0x02000000
 #define A3XX_HLSQ_CONTROL_0_REG_CHUNKDISABLE			0x04000000
 #define A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__MASK			0x08000000
 #define A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__SHIFT		27
@@ -1443,17 +1451,39 @@ static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(uint32_t val)
 #define A3XX_HLSQ_CONTROL_0_REG_SINGLECONTEXT			0x80000000
 
 #define REG_A3XX_HLSQ_CONTROL_1_REG				0x00002201
-#define A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__MASK		0x00000040
+#define A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__MASK		0x000000c0
 #define A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__SHIFT		6
 static inline uint32_t A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(enum a3xx_threadsize val)
 {
 	return ((val) << A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__SHIFT) & A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__MASK;
 }
 #define A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE		0x00000100
-#define A3XX_HLSQ_CONTROL_1_REG_RESERVED1			0x00000200
-#define A3XX_HLSQ_CONTROL_1_REG_ZWCOORD				0x02000000
+#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__MASK		0x00ff0000
+#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__SHIFT		16
+static inline uint32_t A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(uint32_t val)
+{
+	return ((val) << A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__SHIFT) & A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__MASK;
+}
+#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__MASK		0xff000000
+#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__SHIFT		24
+static inline uint32_t A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(uint32_t val)
+{
+	return ((val) << A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__SHIFT) & A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__MASK;
+}
 
 #define REG_A3XX_HLSQ_CONTROL_2_REG				0x00002202
+#define A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__MASK		0x000003fc
+#define A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__SHIFT		2
+static inline uint32_t A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID(uint32_t val)
+{
+	return ((val) << A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__SHIFT) & A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__MASK;
+}
+#define A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__MASK		0x03fc0000
+#define A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__SHIFT		18
+static inline uint32_t A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID(uint32_t val)
+{
+	return ((val) << A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__SHIFT) & A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__MASK;
+}
 #define A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD__MASK	0xfc000000
 #define A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD__SHIFT	26
 static inline uint32_t A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(uint32_t val)
@@ -1470,13 +1500,13 @@ static inline uint32_t A3XX_HLSQ_CONTROL_3_REG_REGID(uint32_t val)
 }
 
 #define REG_A3XX_HLSQ_VS_CONTROL_REG				0x00002204
-#define A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK		0x00000fff
+#define A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK		0x000003ff
 #define A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT		0
 static inline uint32_t A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(uint32_t val)
 {
 	return ((val) << A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT) & A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK;
 }
-#define A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET__MASK		0x00fff000
+#define A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET__MASK		0x001ff000
 #define A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET__SHIFT	12
 static inline uint32_t A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(uint32_t val)
 {
@@ -1490,13 +1520,13 @@ static inline uint32_t A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(uint32_t val)
 }
 
 #define REG_A3XX_HLSQ_FS_CONTROL_REG				0x00002205
-#define A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__MASK		0x00000fff
+#define A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__MASK		0x000003ff
 #define A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__SHIFT		0
 static inline uint32_t A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(uint32_t val)
 {
 	return ((val) << A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__SHIFT) & A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__MASK;
 }
-#define A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET__MASK		0x00fff000
+#define A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET__MASK		0x001ff000
 #define A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET__SHIFT	12
 static inline uint32_t A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(uint32_t val)
 {
@@ -1510,13 +1540,13 @@ static inline uint32_t A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(uint32_t val)
 }
 
 #define REG_A3XX_HLSQ_CONST_VSPRESV_RANGE_REG			0x00002206
-#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__MASK	0x0000ffff
+#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__MASK	0x000001ff
 #define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__SHIFT	0
 static inline uint32_t A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY(uint32_t val)
 {
 	return ((val) << A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__SHIFT) & A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__MASK;
 }
-#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY__MASK	0xffff0000
+#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY__MASK	0x01ff0000
 #define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY__SHIFT	16
 static inline uint32_t A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY(uint32_t val)
 {
@@ -1524,13 +1554,13 @@ static inline uint32_t A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY(uint32_t val)
 }
 
 #define REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG			0x00002207
-#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__MASK	0x0000ffff
+#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__MASK	0x000001ff
 #define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__SHIFT	0
 static inline uint32_t A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(uint32_t val)
 {
 	return ((val) << A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__SHIFT) & A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__MASK;
 }
-#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY__MASK	0xffff0000
+#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY__MASK	0x01ff0000
 #define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY__SHIFT	16
 static inline uint32_t A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(uint32_t val)
 {
@@ -2012,24 +2042,19 @@ static inline uint32_t A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(enum a3xx_instrbuffe
 	return ((val) << A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE__SHIFT) & A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE__MASK;
 }
 #define A3XX_SP_VS_CTRL_REG0_CACHEINVALID			0x00000004
+#define A3XX_SP_VS_CTRL_REG0_ALUSCHMODE				0x00000008
 #define A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
 #define A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
 static inline uint32_t A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
 }
-#define A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK		0x0003fc00
+#define A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK		0x0000fc00
 #define A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT		10
 static inline uint32_t A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK;
 }
-#define A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__MASK		0x000c0000
-#define A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__SHIFT		18
-static inline uint32_t A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(uint32_t val)
-{
-	return ((val) << A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__SHIFT) & A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__MASK;
-}
 #define A3XX_SP_VS_CTRL_REG0_THREADSIZE__MASK			0x00100000
 #define A3XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT			20
 static inline uint32_t A3XX_SP_VS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
@@ -2037,8 +2062,6 @@ static inline uint32_t A3XX_SP_VS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
 	return ((val) << A3XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT) & A3XX_SP_VS_CTRL_REG0_THREADSIZE__MASK;
 }
 #define A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE			0x00200000
-#define A3XX_SP_VS_CTRL_REG0_PIXLODENABLE			0x00400000
-#define A3XX_SP_VS_CTRL_REG0_COMPUTEMODE			0x00800000
 #define A3XX_SP_VS_CTRL_REG0_LENGTH__MASK			0xff000000
 #define A3XX_SP_VS_CTRL_REG0_LENGTH__SHIFT			24
 static inline uint32_t A3XX_SP_VS_CTRL_REG0_LENGTH(uint32_t val)
@@ -2079,7 +2102,8 @@ static inline uint32_t A3XX_SP_VS_PARAM_REG_PSIZEREGID(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_PARAM_REG_PSIZEREGID__SHIFT) & A3XX_SP_VS_PARAM_REG_PSIZEREGID__MASK;
 }
-#define A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR__MASK		0xfff00000
+#define A3XX_SP_VS_PARAM_REG_POS2DMODE				0x00010000
+#define A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR__MASK		0x01f00000
 #define A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR__SHIFT		20
 static inline uint32_t A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(uint32_t val)
 {
@@ -2089,24 +2113,26 @@ static inline uint32_t A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(uint32_t val)
 static inline uint32_t REG_A3XX_SP_VS_OUT(uint32_t i0) { return 0x000022c7 + 0x1*i0; }
 
 static inline uint32_t REG_A3XX_SP_VS_OUT_REG(uint32_t i0) { return 0x000022c7 + 0x1*i0; }
-#define A3XX_SP_VS_OUT_REG_A_REGID__MASK			0x000001ff
+#define A3XX_SP_VS_OUT_REG_A_REGID__MASK			0x000000ff
 #define A3XX_SP_VS_OUT_REG_A_REGID__SHIFT			0
 static inline uint32_t A3XX_SP_VS_OUT_REG_A_REGID(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_OUT_REG_A_REGID__SHIFT) & A3XX_SP_VS_OUT_REG_A_REGID__MASK;
 }
+#define A3XX_SP_VS_OUT_REG_A_HALF				0x00000100
 #define A3XX_SP_VS_OUT_REG_A_COMPMASK__MASK			0x00001e00
 #define A3XX_SP_VS_OUT_REG_A_COMPMASK__SHIFT			9
 static inline uint32_t A3XX_SP_VS_OUT_REG_A_COMPMASK(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_OUT_REG_A_COMPMASK__SHIFT) & A3XX_SP_VS_OUT_REG_A_COMPMASK__MASK;
 }
-#define A3XX_SP_VS_OUT_REG_B_REGID__MASK			0x01ff0000
+#define A3XX_SP_VS_OUT_REG_B_REGID__MASK			0x00ff0000
 #define A3XX_SP_VS_OUT_REG_B_REGID__SHIFT			16
 static inline uint32_t A3XX_SP_VS_OUT_REG_B_REGID(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_OUT_REG_B_REGID__SHIFT) & A3XX_SP_VS_OUT_REG_B_REGID__MASK;
 }
+#define A3XX_SP_VS_OUT_REG_B_HALF				0x01000000
 #define A3XX_SP_VS_OUT_REG_B_COMPMASK__MASK			0x1e000000
 #define A3XX_SP_VS_OUT_REG_B_COMPMASK__SHIFT			25
 static inline uint32_t A3XX_SP_VS_OUT_REG_B_COMPMASK(uint32_t val)
@@ -2117,25 +2143,25 @@ static inline uint32_t A3XX_SP_VS_OUT_REG_B_COMPMASK(uint32_t val)
 static inline uint32_t REG_A3XX_SP_VS_VPC_DST(uint32_t i0) { return 0x000022d0 + 0x1*i0; }
 
 static inline uint32_t REG_A3XX_SP_VS_VPC_DST_REG(uint32_t i0) { return 0x000022d0 + 0x1*i0; }
-#define A3XX_SP_VS_VPC_DST_REG_OUTLOC0__MASK			0x000000ff
+#define A3XX_SP_VS_VPC_DST_REG_OUTLOC0__MASK			0x0000007f
 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC0__SHIFT			0
 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC0(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_VPC_DST_REG_OUTLOC0__SHIFT) & A3XX_SP_VS_VPC_DST_REG_OUTLOC0__MASK;
 }
-#define A3XX_SP_VS_VPC_DST_REG_OUTLOC1__MASK			0x0000ff00
+#define A3XX_SP_VS_VPC_DST_REG_OUTLOC1__MASK			0x00007f00
 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC1__SHIFT			8
 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC1(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_VPC_DST_REG_OUTLOC1__SHIFT) & A3XX_SP_VS_VPC_DST_REG_OUTLOC1__MASK;
 }
-#define A3XX_SP_VS_VPC_DST_REG_OUTLOC2__MASK			0x00ff0000
+#define A3XX_SP_VS_VPC_DST_REG_OUTLOC2__MASK			0x007f0000
 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC2__SHIFT			16
 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC2(uint32_t val)
 {
 	return ((val) << A3XX_SP_VS_VPC_DST_REG_OUTLOC2__SHIFT) & A3XX_SP_VS_VPC_DST_REG_OUTLOC2__MASK;
 }
-#define A3XX_SP_VS_VPC_DST_REG_OUTLOC3__MASK			0xff000000
+#define A3XX_SP_VS_VPC_DST_REG_OUTLOC3__MASK			0x7f000000
 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC3__SHIFT			24
 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC3(uint32_t val)
 {
@@ -2143,6 +2169,12 @@ static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC3(uint32_t val)
 }
 
 #define REG_A3XX_SP_VS_OBJ_OFFSET_REG				0x000022d4
+#define A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK	0x0000ffff
+#define A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT	0
+static inline uint32_t A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET(uint32_t val)
+{
+	return ((val) << A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT) & A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK;
+}
 #define A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
 static inline uint32_t A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(uint32_t val)
@@ -2159,8 +2191,38 @@ static inline uint32_t A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 #define REG_A3XX_SP_VS_OBJ_START_REG				0x000022d5
 
 #define REG_A3XX_SP_VS_PVT_MEM_PARAM_REG			0x000022d6
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK	0x000000ff
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT	0
+static inline uint32_t A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM(uint32_t val)
+{
+	return ((val) << A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT) & A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK;
+}
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK	0x00ffff00
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT	8
+static inline uint32_t A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET(uint32_t val)
+{
+	return ((val) << A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT) & A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK;
+}
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK	0xff000000
+#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT	24
+static inline uint32_t A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD(uint32_t val)
+{
+	return ((val) << A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT) & A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK;
+}
 
 #define REG_A3XX_SP_VS_PVT_MEM_ADDR_REG				0x000022d7
+#define A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__MASK		0x0000001f
+#define A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT		0
+static inline uint32_t A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN(uint32_t val)
+{
+	return ((val) << A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT) & A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__MASK;
+}
+#define A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK	0xffffffe0
+#define A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT	5
+static inline uint32_t A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS(uint32_t val)
+{
+	return ((val >> 5) << A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT) & A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK;
+}
 
 #define REG_A3XX_SP_VS_PVT_MEM_SIZE_REG				0x000022d8
 
@@ -2186,24 +2248,22 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(enum a3xx_instrbuffe
 	return ((val) << A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE__SHIFT) & A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE__MASK;
 }
 #define A3XX_SP_FS_CTRL_REG0_CACHEINVALID			0x00000004
+#define A3XX_SP_FS_CTRL_REG0_ALUSCHMODE				0x00000008
 #define A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
 #define A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
 static inline uint32_t A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
 {
 	return ((val) << A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
 }
-#define A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK		0x0003fc00
+#define A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK		0x0000fc00
 #define A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT		10
 static inline uint32_t A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
 {
 	return ((val) << A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK;
 }
-#define A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__MASK		0x000c0000
-#define A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__SHIFT		18
-static inline uint32_t A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(uint32_t val)
-{
-	return ((val) << A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__SHIFT) & A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__MASK;
-}
+#define A3XX_SP_FS_CTRL_REG0_FSBYPASSENABLE			0x00020000
+#define A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP			0x00040000
+#define A3XX_SP_FS_CTRL_REG0_OUTORDERED				0x00080000
 #define A3XX_SP_FS_CTRL_REG0_THREADSIZE__MASK			0x00100000
 #define A3XX_SP_FS_CTRL_REG0_THREADSIZE__SHIFT			20
 static inline uint32_t A3XX_SP_FS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
@@ -2239,7 +2299,7 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(uint32_t val)
 {
 	return ((val) << A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING__SHIFT) & A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING__MASK;
 }
-#define A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET__MASK		0x3f000000
+#define A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET__MASK		0x7f000000
 #define A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET__SHIFT		24
 static inline uint32_t A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(uint32_t val)
 {
@@ -2247,6 +2307,12 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(uint32_t val)
 }
 
 #define REG_A3XX_SP_FS_OBJ_OFFSET_REG				0x000022e2
+#define A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK	0x0000ffff
+#define A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT	0
+static inline uint32_t A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET(uint32_t val)
+{
+	return ((val) << A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT) & A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK;
+}
 #define A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK	0x01ff0000
 #define A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT	16
 static inline uint32_t A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(uint32_t val)
@@ -2263,8 +2329,38 @@ static inline uint32_t A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val)
 #define REG_A3XX_SP_FS_OBJ_START_REG				0x000022e3
 
 #define REG_A3XX_SP_FS_PVT_MEM_PARAM_REG			0x000022e4
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK	0x000000ff
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT	0
+static inline uint32_t A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM(uint32_t val)
+{
+	return ((val) << A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT) & A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK;
+}
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK	0x00ffff00
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT	8
+static inline uint32_t A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET(uint32_t val)
+{
+	return ((val) << A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT) & A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK;
+}
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK	0xff000000
+#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT	24
+static inline uint32_t A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD(uint32_t val)
+{
+	return ((val) << A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT) & A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK;
+}
 
 #define REG_A3XX_SP_FS_PVT_MEM_ADDR_REG				0x000022e5
+#define A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__MASK		0x0000001f
+#define A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT		0
+static inline uint32_t A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN(uint32_t val)
+{
+	return ((val) << A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT) & A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__MASK;
+}
+#define A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK	0xffffffe0
+#define A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT	5
+static inline uint32_t A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS(uint32_t val)
+{
+	return ((val >> 5) << A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT) & A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK;
+}
 
 #define REG_A3XX_SP_FS_PVT_MEM_SIZE_REG				0x000022e6
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 736151651b2..a64ecf16eab 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -229,7 +229,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 			A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE);
 	OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) |
 			A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE |
-			COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_ZWCOORD));
+			COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(regid(0,0)) |
+					A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(regid(0,2))));
 	OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31));
 	OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid));
 	OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) |
@@ -254,10 +255,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 			COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) |
 			A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) |
 			A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) |
-			A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) |
 			A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) |
 			A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE |
-			COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) |
 			A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz));
 	OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) |
 			A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) |
@@ -336,7 +335,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 				COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) |
 				A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) |
 				A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) |
-				A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) |
+				A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP |
 				A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) |
 				A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE |
 				COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) |
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index a450379e98d..e8df429441e 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
@@ -157,58 +157,62 @@ enum a4xx_vtx_fmt {
 	VFMT4_10_10_10_2_UNORM = 57,
 	VFMT4_10_10_10_2_SINT = 58,
 	VFMT4_10_10_10_2_SNORM = 59,
+	VFMT4_2_10_10_10_UINT = 60,
+	VFMT4_2_10_10_10_UNORM = 61,
+	VFMT4_2_10_10_10_SINT = 62,
+	VFMT4_2_10_10_10_SNORM = 63,
 };
 
 enum a4xx_tex_fmt {
-	TFMT4_5_6_5_UNORM = 11,
-	TFMT4_5_5_5_1_UNORM = 9,
-	TFMT4_4_4_4_4_UNORM = 8,
-	TFMT4_X8Z24_UNORM = 71,
-	TFMT4_10_10_10_2_UNORM = 33,
-	TFMT4_10_10_10_2_UINT = 34,
 	TFMT4_A8_UNORM = 3,
-	TFMT4_L8_A8_UNORM = 13,
 	TFMT4_8_UNORM = 4,
-	TFMT4_8_8_UNORM = 14,
-	TFMT4_8_8_8_8_UNORM = 28,
 	TFMT4_8_SNORM = 5,
-	TFMT4_8_8_SNORM = 15,
-	TFMT4_8_8_8_8_SNORM = 29,
 	TFMT4_8_UINT = 6,
-	TFMT4_8_8_UINT = 16,
-	TFMT4_8_8_8_8_UINT = 30,
 	TFMT4_8_SINT = 7,
+	TFMT4_4_4_4_4_UNORM = 8,
+	TFMT4_5_5_5_1_UNORM = 9,
+	TFMT4_5_6_5_UNORM = 11,
+	TFMT4_L8_A8_UNORM = 13,
+	TFMT4_8_8_UNORM = 14,
+	TFMT4_8_8_SNORM = 15,
+	TFMT4_8_8_UINT = 16,
 	TFMT4_8_8_SINT = 17,
-	TFMT4_8_8_8_8_SINT = 31,
 	TFMT4_16_UNORM = 18,
-	TFMT4_16_16_UNORM = 38,
-	TFMT4_16_16_16_16_UNORM = 51,
 	TFMT4_16_SNORM = 19,
-	TFMT4_16_16_SNORM = 39,
-	TFMT4_16_16_16_16_SNORM = 52,
+	TFMT4_16_FLOAT = 20,
 	TFMT4_16_UINT = 21,
-	TFMT4_16_16_UINT = 41,
-	TFMT4_16_16_16_16_UINT = 54,
 	TFMT4_16_SINT = 22,
+	TFMT4_8_8_8_8_UNORM = 28,
+	TFMT4_8_8_8_8_SNORM = 29,
+	TFMT4_8_8_8_8_UINT = 30,
+	TFMT4_8_8_8_8_SINT = 31,
+	TFMT4_9_9_9_E5_FLOAT = 32,
+	TFMT4_10_10_10_2_UNORM = 33,
+	TFMT4_10_10_10_2_UINT = 34,
+	TFMT4_11_11_10_FLOAT = 37,
+	TFMT4_16_16_UNORM = 38,
+	TFMT4_16_16_SNORM = 39,
+	TFMT4_16_16_FLOAT = 40,
+	TFMT4_16_16_UINT = 41,
 	TFMT4_16_16_SINT = 42,
-	TFMT4_16_16_16_16_SINT = 55,
+	TFMT4_32_FLOAT = 43,
 	TFMT4_32_UINT = 44,
-	TFMT4_32_32_UINT = 57,
-	TFMT4_32_32_32_32_UINT = 64,
 	TFMT4_32_SINT = 45,
-	TFMT4_32_32_SINT = 58,
-	TFMT4_32_32_32_32_SINT = 65,
-	TFMT4_16_FLOAT = 20,
-	TFMT4_16_16_FLOAT = 40,
+	TFMT4_16_16_16_16_UNORM = 51,
+	TFMT4_16_16_16_16_SNORM = 52,
 	TFMT4_16_16_16_16_FLOAT = 53,
-	TFMT4_32_FLOAT = 43,
+	TFMT4_16_16_16_16_UINT = 54,
+	TFMT4_16_16_16_16_SINT = 55,
 	TFMT4_32_32_FLOAT = 56,
-	TFMT4_32_32_32_32_FLOAT = 63,
+	TFMT4_32_32_UINT = 57,
+	TFMT4_32_32_SINT = 58,
 	TFMT4_32_32_32_FLOAT = 59,
 	TFMT4_32_32_32_UINT = 60,
 	TFMT4_32_32_32_SINT = 61,
-	TFMT4_9_9_9_E5_FLOAT = 32,
-	TFMT4_11_11_10_FLOAT = 37,
+	TFMT4_32_32_32_32_FLOAT = 63,
+	TFMT4_32_32_32_32_UINT = 64,
+	TFMT4_32_32_32_32_SINT = 65,
+	TFMT4_X8Z24_UNORM = 71,
 	TFMT4_DXT1 = 86,
 	TFMT4_DXT3 = 87,
 	TFMT4_DXT5 = 88,
@@ -800,6 +804,7 @@ static inline uint32_t A4XX_RB_DEPTH_CONTROL_ZFUNC(enum adreno_compare_func val)
 }
 #define A4XX_RB_DEPTH_CONTROL_BF_ENABLE				0x00000080
 #define A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE			0x00010000
+#define A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS			0x00020000
 #define A4XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE			0x80000000
 
 #define REG_A4XX_RB_DEPTH_CLEAR					0x00002102
@@ -1060,6 +1065,9 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_TP_REG(uint32_t i0) { return 0x
 
 #define REG_A4XX_RBBM_CFG_DEBBUS_SEL_D				0x0000004d
 
+#define REG_A4XX_RBBM_POWER_CNTL_IP				0x00000098
+#define A4XX_RBBM_POWER_CNTL_IP_SW_COLLAPSE			0x00000001
+
 #define REG_A4XX_RBBM_PERFCTR_CP_0_LO				0x0000009c
 
 static inline uint32_t REG_A4XX_RBBM_CLOCK_CTL_SP(uint32_t i0) { return 0x00000068 + 0x1*i0; }
@@ -1110,6 +1118,10 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1(uint32_t i0) { r
 
 static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1_REG(uint32_t i0) { return 0x0000008e + 0x1*i0; }
 
+#define REG_A4XX_RBBM_SP_REGFILE_SLEEP_CNTL_0			0x00000099
+
+#define REG_A4XX_RBBM_SP_REGFILE_SLEEP_CNTL_1			0x0000009a
+
 #define REG_A4XX_RBBM_PERFCTR_PWR_1_LO				0x00000168
 
 #define REG_A4XX_RBBM_PERFCTR_CTL				0x00000170
@@ -1163,6 +1175,11 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1_REG(uint32_t i0)
 
 #define REG_A4XX_RBBM_INTERFACE_RRDY_STATUS5			0x0000019f
 
+#define REG_A4XX_RBBM_POWER_STATUS				0x000001b0
+#define A4XX_RBBM_POWER_STATUS_SP_TP_PWR_ON			0x00100000
+
+#define REG_A4XX_RBBM_WAIT_IDLE_CLOCKS_CTL2			0x000001b8
+
 #define REG_A4XX_CP_SCRATCH_UMASK				0x00000228
 
 #define REG_A4XX_CP_SCRATCH_ADDR				0x00000229
@@ -1265,6 +1282,28 @@ static inline uint32_t REG_A4XX_CP_SCRATCH_REG(uint32_t i0) { return 0x00000578
 
 #define REG_A4XX_SP_MODE_CONTROL				0x00000ec3
 
+#define REG_A4XX_SP_PERFCTR_SP_SEL_0				0x00000ec4
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_1				0x00000ec5
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_2				0x00000ec6
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_3				0x00000ec7
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_4				0x00000ec8
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_5				0x00000ec9
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_6				0x00000eca
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_7				0x00000ecb
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_8				0x00000ecc
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_9				0x00000ecd
+
+#define REG_A4XX_SP_PERFCTR_SP_SEL_10				0x00000ece
+
 #define REG_A4XX_SP_PERFCTR_SP_SEL_11				0x00000ecf
 
 #define REG_A4XX_SP_SP_CTRL_REG					0x000022c0
@@ -2180,6 +2219,7 @@ static inline uint32_t A4XX_GRAS_SU_POINT_SIZE(float val)
 
 #define REG_A4XX_GRAS_ALPHA_CONTROL				0x00002073
 #define A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE		0x00000004
+#define A4XX_GRAS_ALPHA_CONTROL_FORCE_FRAGZ_TO_FS		0x00000008
 
 #define REG_A4XX_GRAS_SU_POLY_OFFSET_SCALE			0x00002074
 #define A4XX_GRAS_SU_POLY_OFFSET_SCALE__MASK			0xffffffff
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index f220fc7ac1f..b9a28149722 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -529,14 +529,16 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 		OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1);
 		OUT_RING(ring, zsa->rb_depth_control |
-				COND(fragz, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE));
+				COND(fragz, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE) |
+				COND(fragz && fp->frag_coord, A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS));
 
 		/* maybe this register/bitfield needs a better name.. this
 		 * appears to be just disabling early-z
 		 */
 		OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1);
 		OUT_RING(ring, zsa->gras_alpha_control |
-				COND(fragz, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE));
+				COND(fragz, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE) |
+				COND(fragz && fp->frag_coord, A4XX_GRAS_ALPHA_CONTROL_FORCE_FRAGZ_TO_FS));
 	}
 
 	if (dirty & FD_DIRTY_RASTERIZER) {
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index 0e861b90b12..32b8fce1613 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -420,9 +420,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 			COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) |
 			COND(s[FS].v->frag_coord, A4XX_RB_RENDER_CONTROL2_XCOORD |
 					A4XX_RB_RENDER_CONTROL2_YCOORD |
-// TODO enabling gl_FragCoord.z is causing lockups on 0ad (but seems
-// to work everywhere else).
-//					A4XX_RB_RENDER_CONTROL2_ZCOORD |
+					A4XX_RB_RENDER_CONTROL2_ZCOORD |
 					A4XX_RB_RENDER_CONTROL2_WCOORD));
 
 	OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1);
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index 0e0f0e65e9b..f9c0e6aaa83 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 4aabc086607..c6741890c69 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  68291 bytes, from 2015-11-17 16:39:59)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  64038 bytes, from 2015-11-17 16:37:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  11518 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  15149 bytes, from 2015-11-20 16:22:25)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  69600 bytes, from 2015-11-24 14:39:00)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  67220 bytes, from 2015-12-13 17:58:09)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
@@ -199,7 +199,11 @@ enum adreno_state_type {
 
 enum adreno_state_src {
 	SS_DIRECT = 0,
+	SS_INVALID_ALL_IC = 2,
+	SS_INVALID_PART_IC = 3,
 	SS_INDIRECT = 4,
+	SS_INDIRECT_TCM = 5,
+	SS_INDIRECT_STM = 6,
 };
 
 enum a4xx_index_size {
@@ -227,7 +231,7 @@ static inline uint32_t CP_LOAD_STATE_0_STATE_BLOCK(enum adreno_state_block val)
 {
 	return ((val) << CP_LOAD_STATE_0_STATE_BLOCK__SHIFT) & CP_LOAD_STATE_0_STATE_BLOCK__MASK;
 }
-#define CP_LOAD_STATE_0_NUM_UNIT__MASK				0x7fc00000
+#define CP_LOAD_STATE_0_NUM_UNIT__MASK				0xffc00000
 #define CP_LOAD_STATE_0_NUM_UNIT__SHIFT				22
 static inline uint32_t CP_LOAD_STATE_0_NUM_UNIT(uint32_t val)
 {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index e768e6133a8..d55daeefe06 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -114,8 +114,6 @@ int main(int argc, char **argv)
 	void *ptr;
 	size_t size;
 
-	fd_mesa_debug |= FD_DBG_DISASM;
-
 	memset(&s, 0, sizeof(s));
 	memset(&v, 0, sizeof(v));
 
@@ -128,7 +126,7 @@ int main(int argc, char **argv)
 
 	while (n < argc) {
 		if (!strcmp(argv[n], "--verbose")) {
-			fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS;
+			fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS | FD_DBG_DISASM;
 			n++;
 			continue;
 		}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index eea5c5e28db..44c74b84e34 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1264,7 +1264,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 
 /* handles array reads: */
 static void
-emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
+emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 		struct ir3_instruction **dst)
 {
 	nir_deref_var *dvar = intr->variables[0];
@@ -1305,7 +1305,7 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 
 /* handles array writes: */
 static void
-emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
@@ -1321,6 +1321,10 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	case nir_deref_array_type_direct:
 		/* direct access does not require anything special: */
 		for (int i = 0; i < intr->num_components; i++) {
+			/* ttn doesn't generate partial writemasks */
+			assert(intr->const_index[0] ==
+			       (1 << intr->num_components) - 1);
+
 			unsigned n = darr->base_offset * 4 + i;
 			compile_assert(ctx, n < arr->length);
 			arr->arr[n] = src[i];
@@ -1333,6 +1337,10 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		struct ir3_instruction *addr =
 				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
 		for (int i = 0; i < intr->num_components; i++) {
+			/* ttn doesn't generate partial writemasks */
+			assert(intr->const_index[0] ==
+			       (1 << intr->num_components) - 1);
+
 			struct ir3_instruction *store;
 			unsigned n = darr->base_offset * 4 + i;
 			compile_assert(ctx, n < arr->length);
@@ -1392,7 +1400,7 @@ static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot,
 }
 
 static void
-emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
+emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
 	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
 	struct ir3_instruction **dst, **src;
@@ -1454,10 +1462,10 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		}
 		break;
 	case nir_intrinsic_load_var:
-		emit_intrinisic_load_var(ctx, intr, dst);
+		emit_intrinsic_load_var(ctx, intr, dst);
 		break;
 	case nir_intrinsic_store_var:
-		emit_intrinisic_store_var(ctx, intr);
+		emit_intrinsic_store_var(ctx, intr);
 		break;
 	case nir_intrinsic_store_output:
 		const_offset = nir_src_as_const_value(intr->src[1]);
@@ -1927,7 +1935,7 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 		emit_alu(ctx, nir_instr_as_alu(instr));
 		break;
 	case nir_instr_type_intrinsic:
-		emit_intrinisic(ctx, nir_instr_as_intrinsic(instr));
+		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
 		break;
 	case nir_instr_type_load_const:
 		emit_load_const(ctx, nir_instr_as_load_const(instr));
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
index 07e03d26908..a84e7989cf8 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_print.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -143,7 +143,7 @@ block_id(struct ir3_block *block)
 #ifdef DEBUG
 	return block->serialno;
 #else
-	return (uint32_t)(uint64_t)block;
+	return (uint32_t)(unsigned long)block;
 #endif
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 9029d2a4180..0a52642e395 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -63,7 +63,8 @@ enum lp_interp {
    LP_INTERP_LINEAR,
    LP_INTERP_PERSPECTIVE,
    LP_INTERP_POSITION,
-   LP_INTERP_FACING
+   LP_INTERP_FACING,
+   LP_INTERP_ZERO
 };
 
 struct lp_shader_input {
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index c9a5d678244..9dcc102e758 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -108,22 +108,28 @@ struct llvmpipe_context {
    struct vertex_info vertex_info;
    
    /** Which vertex shader output slot contains color */
-   int color_slot[2];
+   uint8_t color_slot[2];
 
    /** Which vertex shader output slot contains bcolor */
-   int bcolor_slot[2];
+   uint8_t bcolor_slot[2];
 
    /** Which vertex shader output slot contains point size */
-   int psize_slot;
+   uint8_t psize_slot;
 
    /** Which vertex shader output slot contains viewport index */
-   int viewport_index_slot;
+   uint8_t viewport_index_slot;
 
    /** Which geometry shader output slot contains layer */
-   int layer_slot;
+   uint8_t layer_slot;
 
    /** A fake frontface output for unfilled primitives */
-   int face_slot;
+   uint8_t face_slot;
+
+   /** Which output slot is used for the fake vp index info */
+   uint8_t fake_vpindex_slot;
+
+   /** Which output slot is used for the fake layer info */
+   uint8_t fake_layer_slot;
 
    /** Depth format and bias settings. */
    boolean floating_point_depth;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 1778b13f9dd..ddbb88eb107 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -1207,7 +1207,7 @@ lp_setup_update_state( struct lp_setup_context *setup,
       /* Will probably need to move this somewhere else, just need  
        * to know about vertex shader point size attribute.
        */
-      setup->psize = lp->psize_slot;
+      setup->psize_slot = lp->psize_slot;
       setup->viewport_index_slot = lp->viewport_index_slot;
       setup->layer_slot = lp->layer_slot;
       setup->face_slot = lp->face_slot;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 2410e23840b..4451284c303 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -105,10 +105,10 @@ struct lp_setup_context
    float pixel_offset;
    float line_width;
    float point_size;
-   float psize;
-   unsigned viewport_index_slot;
-   unsigned layer_slot;
-   int face_slot;
+   uint8_t psize_slot;
+   uint8_t viewport_index_slot;
+   uint8_t layer_slot;
+   uint8_t face_slot;
 
    struct pipe_framebuffer_state fb;
    struct u_rect framebuffer;
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 75544b52493..14c389f7ce0 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -328,7 +328,7 @@ try_setup_point( struct lp_setup_context *setup,
    struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
    /* x/y positions in fixed point */
    const struct lp_setup_variant_key *key = &setup->setup.variant->key;
-   const int sizeAttr = setup->psize;
+   const int sizeAttr = setup->psize_slot;
    const float size
       = (setup->point_size_per_vertex && sizeAttr > 0) ? v0[sizeAttr][0]
       : setup->point_size;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index a25e8326188..f5bcfb2b511 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -55,10 +55,14 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 
    draw_prepare_shader_outputs(llvmpipe->draw);
 
-   llvmpipe->color_slot[0] = -1;
-   llvmpipe->color_slot[1] = -1;
-   llvmpipe->bcolor_slot[0] = -1;
-   llvmpipe->bcolor_slot[1] = -1;
+   llvmpipe->color_slot[0] = 0;
+   llvmpipe->color_slot[1] = 0;
+   llvmpipe->bcolor_slot[0] = 0;
+   llvmpipe->bcolor_slot[1] = 0;
+   llvmpipe->viewport_index_slot = 0;
+   llvmpipe->layer_slot = 0;
+   llvmpipe->face_slot = 0;
+   llvmpipe->psize_slot = 0;
 
    /*
     * Match FS inputs against VS outputs, emitting the necessary
@@ -86,7 +90,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_COLOR &&
           lpfs->info.base.input_semantic_index[i] < 2) {
          int idx = lpfs->info.base.input_semantic_index[i];
-         llvmpipe->color_slot[idx] = (int)vinfo->num_attribs;
+         llvmpipe->color_slot[idx] = vinfo->num_attribs;
       }
 
       if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_FACE) {
@@ -94,6 +98,30 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
       } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_PRIMID) {
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      /*
+       * For vp index and layer, if the fs requires them but the vs doesn't
+       * provide them, store the slot - we'll later replace the data directly
+       * with zero (as required by ARB_fragment_layer_viewport). This is
+       * because draw itself just redirects them to whatever was at output 0.
+       * We'll also store the real vpindex/layer slot for setup use.
+       */
+      } else if (lpfs->info.base.input_semantic_name[i] ==
+                 TGSI_SEMANTIC_VIEWPORT_INDEX) {
+         if (vs_index >= 0) {
+            llvmpipe->viewport_index_slot = vinfo->num_attribs;
+         }
+         else {
+            llvmpipe->fake_vpindex_slot = vinfo->num_attribs;
+         }
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_LAYER) {
+         if (vs_index >= 0) {
+            llvmpipe->layer_slot = vinfo->num_attribs;
+         }
+         else {
+            llvmpipe->fake_layer_slot = vinfo->num_attribs;
+         }
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
       } else {
          /*
           * Emit the requested fs attribute for all but position.
@@ -101,6 +129,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
       }
    }
+
    /* Figure out if we need bcolor as well.
     */
    for (i = 0; i < 2; i++) {
@@ -108,12 +137,11 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
                                          TGSI_SEMANTIC_BCOLOR, i);
 
       if (vs_index >= 0) {
-         llvmpipe->bcolor_slot[i] = (int)vinfo->num_attribs;
+         llvmpipe->bcolor_slot[i] = vinfo->num_attribs;
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
       }
    }
 
-
    /* Figure out if we need pointsize as well.
     */
    vs_index = draw_find_shader_output(llvmpipe->draw,
@@ -124,26 +152,26 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
    }
 
-   /* Figure out if we need viewport index */
-   vs_index = draw_find_shader_output(llvmpipe->draw,
-                                      TGSI_SEMANTIC_VIEWPORT_INDEX,
-                                      0);
-   if (vs_index >= 0) {
-      llvmpipe->viewport_index_slot = vinfo->num_attribs;
-      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-   } else {
-      llvmpipe->viewport_index_slot = 0;
+   /* Figure out if we need viewport index (if it wasn't already in fs input) */
+   if (llvmpipe->viewport_index_slot == 0) {
+      vs_index = draw_find_shader_output(llvmpipe->draw,
+                                         TGSI_SEMANTIC_VIEWPORT_INDEX,
+                                         0);
+      if (vs_index >= 0) {
+         llvmpipe->viewport_index_slot = vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      }
    }
 
-   /* Figure out if we need layer */
-   vs_index = draw_find_shader_output(llvmpipe->draw,
-                                      TGSI_SEMANTIC_LAYER,
-                                      0);
-   if (vs_index >= 0) {
-      llvmpipe->layer_slot = vinfo->num_attribs;
-      draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
-   } else {
-      llvmpipe->layer_slot = 0;
+   /* Figure out if we need layer (if it wasn't already in fs input) */
+   if (llvmpipe->layer_slot == 0) {
+      vs_index = draw_find_shader_output(llvmpipe->draw,
+                                         TGSI_SEMANTIC_LAYER,
+                                         0);
+      if (vs_index >= 0) {
+         llvmpipe->layer_slot = vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      }
    }
 
    draw_compute_vertex_size(vinfo);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
index 64215be91af..d7ba5c8ad8e 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -372,9 +372,9 @@ load_attribute(struct gallivm_state *gallivm,
    /* Potentially modify it according to twoside, etc:
     */
    if (key->twoside) {
-      if (vert_attr == key->color_slot && key->bcolor_slot >= 0)
+      if (vert_attr == key->color_slot && key->bcolor_slot > 0)
          lp_twoside(gallivm, args, key, key->bcolor_slot, attribv);
-      else if (vert_attr == key->spec_slot && key->bspec_slot >= 0)
+      else if (vert_attr == key->spec_slot && key->bspec_slot > 0)
          lp_twoside(gallivm, args, key, key->bspec_slot, attribv);
    }
 }
@@ -602,6 +602,13 @@ emit_tri_coef( struct gallivm_state *gallivm,
           */
          break;
 
+      case LP_INTERP_ZERO:
+         /*
+          * The information we get from the output is bogus, replace it
+          * with zero.
+          */
+         emit_constant_coef4(gallivm, args, slot+1, args->bld.zero);
+         break;
       case LP_INTERP_FACING:
          emit_facing_coef(gallivm, args, slot+1);
          break;
@@ -848,14 +855,10 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp,
    key->size = Offset(struct lp_setup_variant_key,
                       inputs[key->num_inputs]);
 
-   key->color_slot  = lp->color_slot [0];
+   key->color_slot = lp->color_slot[0];
    key->bcolor_slot = lp->bcolor_slot[0];
-   key->spec_slot   = lp->color_slot [1];
-   key->bspec_slot  = lp->bcolor_slot[1];
-   assert(key->color_slot  == lp->color_slot [0]);
-   assert(key->bcolor_slot == lp->bcolor_slot[0]);
-   assert(key->spec_slot   == lp->color_slot [1]);
-   assert(key->bspec_slot  == lp->bcolor_slot[1]);
+   key->spec_slot = lp->color_slot[1];
+   key->bspec_slot = lp->bcolor_slot[1];
 
    /*
     * If depth is floating point, depth bias is calculated with respect
@@ -876,7 +879,13 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp,
    key->pad = 0;
    memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]);
    for (i = 0; i < key->num_inputs; i++) {
-      if (key->inputs[i].interp == LP_INTERP_COLOR) {
+      if (key->inputs[i].interp == LP_INTERP_CONSTANT) {
+         if (key->inputs[i].src_index == lp->fake_vpindex_slot ||
+             key->inputs[i].src_index == lp->fake_layer_slot) {
+            key->inputs[i].interp = LP_INTERP_ZERO;
+         }
+      }
+      else if (key->inputs[i].interp == LP_INTERP_COLOR) {
          if (lp->rasterizer->flatshade)
             key->inputs[i].interp = LP_INTERP_CONSTANT;
          else
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h
index 82af8350fb7..6cee6fe5eb5 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -17,11 +17,10 @@ struct lp_setup_variant_list_item
 struct lp_setup_variant_key {
    unsigned size:16;
    unsigned num_inputs:8;
-   int color_slot:8;
-
-   int bcolor_slot:8;
-   int spec_slot:8;
-   int bspec_slot:8;
+   unsigned color_slot:8;
+   unsigned bcolor_slot:8;
+   unsigned spec_slot:8;
+   unsigned bspec_slot:8;
    unsigned flatshade_first:1;
    unsigned pixel_center_half:1;
    unsigned twoside:1;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 216e119af63..c126c085daf 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -1124,12 +1124,15 @@ CodeEmitterNV50::emitIMUL(const Instruction *i)
 {
    code[0] = 0x40000000;
 
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      if (i->sType == TYPE_S16)
+         code[0] |= 0x8100;
+      code[1] = 0;
+      emitForm_IMM(i);
+   } else
    if (i->encSize == 8) {
       code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
-      if (i->src(1).getFile() == FILE_IMMEDIATE)
-         emitForm_IMM(i);
-      else
-         emitForm_MAD(i);
+      emitForm_MAD(i);
    } else {
       if (i->sType == TYPE_S16)
          code[0] |= 0x8100;
@@ -1190,29 +1193,45 @@ CodeEmitterNV50::emitDMUL(const Instruction *i)
 void
 CodeEmitterNV50::emitIMAD(const Instruction *i)
 {
+   int mode;
    code[0] = 0x60000000;
-   if (isSignedType(i->sType))
-      code[1] = i->saturate ? 0x40000000 : 0x20000000;
-   else
-      code[1] = 0x00000000;
 
-   int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg();
-   int neg2 = i->src(2).mod.neg();
-
-   assert(!(neg1 & neg2));
-   code[1] |= neg1 << 27;
-   code[1] |= neg2 << 26;
+   assert(!i->src(0).mod && !i->src(1).mod && !i->src(2).mod);
+   if (!isSignedType(i->sType))
+      mode = 0;
+   else if (i->saturate)
+      mode = 2;
+   else
+      mode = 1;
 
-   if (i->src(1).getFile() == FILE_IMMEDIATE)
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
       emitForm_IMM(i);
-   else
+      code[0] |= (mode & 1) << 8 | (mode & 2) << 14;
+      if (i->flagsSrc >= 0) {
+         assert(!(code[0] & 0x10400000));
+         assert(SDATA(i->src(i->flagsSrc)).id == 0);
+         code[0] |= 0x10400000;
+      }
+   } else
+   if (i->encSize == 4) {
+      emitForm_MUL(i);
+      code[0] |= (mode & 1) << 8 | (mode & 2) << 14;
+      if (i->flagsSrc >= 0) {
+         assert(!(code[0] & 0x10400000));
+         assert(SDATA(i->src(i->flagsSrc)).id == 0);
+         code[0] |= 0x10400000;
+      }
+   } else {
+      code[1] = mode << 29;
       emitForm_MAD(i);
 
-   if (i->flagsSrc >= 0) {
-      // add with carry from $cX
-      assert(!(code[1] & 0x0c000000) && !i->getPredicate());
-      code[1] |= 0xc << 24;
-      srcId(i->src(i->flagsSrc), 32 + 12);
+      if (i->flagsSrc >= 0) {
+         // add with carry from $cX
+         assert(!(code[1] & 0x0c000000) && !i->getPredicate());
+         code[1] |= 0xc << 24;
+         srcId(i->src(i->flagsSrc), 32 + 12);
+      }
    }
 }
 
@@ -2054,8 +2073,9 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
 
    // check constraints on short MAD
    if (info.srcNr >= 2 && i->srcExists(2)) {
-      if (!i->defExists(0) || !isFloatType(i->dType) ||
-          i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id)
+      if (!i->defExists(0) ||
+          (i->flagsSrc >= 0 && SDATA(i->src(i->flagsSrc)).id > 0) ||
+          DDATA(i->def(0)).id != SDATA(i->src(2)).id)
          return 8;
    }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 1d2caaba72f..b23386040a7 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1897,7 +1897,7 @@ Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
       shd = fetchSrc(C >> 4, C & 3);
 
    if (texi->op == OP_TXD) {
-      for (c = 0; c < tgt.getDim(); ++c) {
+      for (c = 0; c < tgt.getDim() + tgt.isCube(); ++c) {
          texi->dPdx[c].set(fetchSrc(Dx >> 4, (Dx & 3) + c));
          texi->dPdy[c].set(fetchSrc(Dy >> 4, (Dy & 3) + c));
       }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 420cc4ef7c0..0b903780614 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -57,7 +57,7 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
    Instruction *tex, *add;
    Value *zero = bld.loadImm(bld.getSSA(), 0);
    int l, c;
-   const int dim = i->tex.target.getDim();
+   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
    const int array = i->tex.target.isArray();
 
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 64f5fc0031e..8752b0c8c54 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -44,6 +44,8 @@ static bool
 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
 {
    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
+   ImmediateValue src1;
+   bool src1imm = mul->src(1).getImmediate(src1);
 
    DataType fTy; // full type
    switch (mul->sType) {
@@ -72,24 +74,41 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
    for (int j = 0; j < 4; ++j)
       t[j] = bld->getSSA(fullSize);
 
-   s[0] = mul->getSrc(0);
-   s[1] = mul->getSrc(1);
-
    if (isSignedType(mul->sType) && highResult) {
       s[0] = bld->getSSA(fullSize);
       s[1] = bld->getSSA(fullSize);
       bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
       bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
+      src1.reg.data.s32 = abs(src1.reg.data.s32);
+   } else {
+      s[0] = mul->getSrc(0);
+      s[1] = mul->getSrc(1);
    }
 
    // split sources into halves
    i[0] = bld->mkSplit(a, halfSize, s[0]);
    i[1] = bld->mkSplit(b, halfSize, s[1]);
 
-   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
-   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
+      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
+                               bld->mkImm(src1.reg.data.u32 & 0xffff));
+   } else {
+      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
+                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
+      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
+         i[3] = i[2];
+         t[1] = t[0];
+      } else {
+         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+      }
+   }
    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
-   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
+      i[4] = i[3];
+      t[3] = t[2];
+   } else {
+      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+   }
 
    if (highResult) {
       Value *c[2];
@@ -911,7 +930,7 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
    Instruction *tex;
    Value *zero = bld.loadImm(bld.getSSA(), 0);
    int l, c;
-   const int dim = i->tex.target.getDim();
+   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
 
    handleTEX(i);
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
@@ -1225,7 +1244,7 @@ NV50LoweringPreSSA::handleEXPORT(Instruction *i)
          i->setDef(0, new_LValue(func, FILE_GPR));
          i->getDef(0)->reg.data.id = id;
 
-         prog->maxGPR = MAX2(prog->maxGPR, id);
+         prog->maxGPR = MAX2(prog->maxGPR, id * 2);
       }
    }
    return true;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 0f575f2eedd..e67bf3eca84 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -186,92 +186,68 @@ NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
       uses.push_back(TexUse(usei, texi));
 }
 
+// While it might be tempting to use the an algorithm that just looks at tex
+// uses, not all texture results are guaranteed to be used on all paths. In
+// the case where along some control flow path a texture result is never used,
+// we might reuse that register for something else, creating a
+// write-after-write hazard. So we have to manually look through all
+// instructions looking for ones that reference the registers in question.
 void
-NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
-                                        Instruction *insn,
-                                        const BasicBlock *term,
-                                        std::list<TexUse> &uses)
+NVC0LegalizePostRA::findFirstUses(
+   Instruction *texi, std::list<TexUse> &uses)
 {
-   while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
-      insn = insn->getSrc(0)->getUniqueInsn();
-
-   // NOTE: the tex itself is, of course, not an overwriting definition
-   if (insn == texi || !insn->bb->reachableBy(texi->bb, term))
-      return;
+   int minGPR = texi->def(0).rep()->reg.data.id;
+   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;
 
-   switch (insn->op) {
-   /* Values not connected to the tex's definition through any of these should
-    * not be conflicting.
-    */
-   case OP_SPLIT:
-   case OP_MERGE:
-   case OP_PHI:
-   case OP_UNION:
-      /* recurse again */
-      for (int s = 0; insn->srcExists(s); ++s)
-         findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
-                             uses);
-      break;
-   default:
-      // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
-      addTexUse(uses, insn, texi);
-      break;
-   }
+   unordered_set<const BasicBlock *> visited;
+   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
 }
 
 void
-NVC0LegalizePostRA::findFirstUses(
-      const Instruction *texi,
-      const Instruction *insn,
-      std::list<TexUse> &uses,
-      unordered_set<const Instruction *>& visited)
+NVC0LegalizePostRA::findFirstUsesBB(
+   int minGPR, int maxGPR, Instruction *start,
+   const Instruction *texi, std::list<TexUse> &uses,
+   unordered_set<const BasicBlock *> &visited)
 {
-   for (int d = 0; insn->defExists(d); ++d) {
-      Value *v = insn->getDef(d);
-      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
-         Instruction *usei = (*u)->getInsn();
-
-         // NOTE: In case of a loop that overwrites a value but never uses
-         // it, it can happen that we have a cycle of uses that consists only
-         // of phis and no-op moves and will thus cause an infinite loop here
-         // since these are not considered actual uses.
-         // The most obvious (and perhaps the only) way to prevent this is to
-         // remember which instructions we've already visited.
-
-         if (visited.find(usei) != visited.end())
-            continue;
+   const BasicBlock *bb = start->bb;
+
+   // We don't process the whole bb the first time around. This is correct,
+   // however we might be in a loop and hit this BB again, and need to process
+   // the full thing. So only mark a bb as visited if we processed it from the
+   // beginning.
+   if (start == bb->getEntry()) {
+      if (visited.find(bb) != visited.end())
+         return;
+      visited.insert(bb);
+   }
 
-         visited.insert(usei);
-
-         if (usei->op == OP_PHI || usei->op == OP_UNION) {
-            // need a barrier before WAW cases, like:
-            //   %r0 = tex
-            //   if ...
-            //     texbar <- is required or tex might replace x again
-            //     %r1 = x <- overwriting def
-            //   %r2 = phi %r0, %r1
-            for (int s = 0; usei->srcExists(s); ++s) {
-               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
-               if (defi && &usei->src(s) != *u)
-                  findOverwritingDefs(texi, defi, usei->bb, uses);
-            }
-         }
+   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
+      if (insn->isNop())
+         continue;
 
-         if (usei->op == OP_SPLIT ||
-             usei->op == OP_MERGE ||
-             usei->op == OP_PHI ||
-             usei->op == OP_UNION) {
-            // these uses don't manifest in the machine code
-            findFirstUses(texi, usei, uses, visited);
-         } else
-         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
-             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
-            findFirstUses(texi, usei, uses, visited);
-         } else {
-            addTexUse(uses, usei, texi);
-         }
+      for (int d = 0; insn->defExists(d); ++d) {
+         if (insn->def(d).getFile() != FILE_GPR ||
+             insn->def(d).rep()->reg.data.id < minGPR ||
+             insn->def(d).rep()->reg.data.id > maxGPR)
+            continue;
+         addTexUse(uses, insn, texi);
+         return;
+      }
+
+      for (int s = 0; insn->srcExists(s); ++s) {
+         if (insn->src(s).getFile() != FILE_GPR ||
+             insn->src(s).rep()->reg.data.id < minGPR ||
+             insn->src(s).rep()->reg.data.id > maxGPR)
+            continue;
+         addTexUse(uses, insn, texi);
+         return;
       }
    }
+
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
+                      texi, uses, visited);
+   }
 }
 
 // Texture barriers:
@@ -323,8 +299,7 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
    if (!uses)
       return false;
    for (size_t i = 0; i < texes.size(); ++i) {
-      unordered_set<const Instruction *> visited;
-      findFirstUses(texes[i], texes[i], uses[i], visited);
+      findFirstUses(texes[i], uses[i]);
    }
 
    // determine the barrier level at each use
@@ -870,7 +845,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
    Instruction *tex;
    Value *zero = bld.loadImm(bld.getSSA(), 0);
    int l, c;
-   const int dim = i->tex.target.getDim();
+   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
    const int array = i->tex.target.isArray();
 
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
@@ -917,7 +892,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
 bool
 NVC0LoweringPass::handleTXD(TexInstruction *txd)
 {
-   int dim = txd->tex.target.getDim();
+   int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
    unsigned arg = txd->tex.target.getArgCount();
    unsigned expected_args = arg;
    const int chipset = prog->getTarget()->getChipset();
@@ -937,8 +912,7 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
 
    if (expected_args > 4 ||
        dim > 2 ||
-       txd->tex.target.isShadow() ||
-       txd->tex.target.isCube())
+       txd->tex.target.isShadow())
       txd->op = OP_TEX;
 
    handleTEX(txd);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 2ce52e5c4f7..adb400a559a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -69,12 +69,10 @@ private:
    };
    bool insertTextureBarriers(Function *);
    inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
-   void findFirstUses(const Instruction *tex, const Instruction *def,
-                      std::list<TexUse>&,
-                      unordered_set<const Instruction *>&);
-   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
-                            const BasicBlock *term,
-                            std::list<TexUse>&);
+   void findFirstUses(Instruction *texi, std::list<TexUse> &uses);
+   void findFirstUsesBB(int minGPR, int maxGPR, Instruction *start,
+                        const Instruction *texi, std::list<TexUse> &uses,
+                        unordered_set<const BasicBlock *> &visited);
    void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
    const Instruction *recurseDef(const Instruction *);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 805be5fcb40..022626ccb8f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1501,6 +1501,7 @@ private:
    void handleSLCT(Instruction *);
    void handleLOGOP(Instruction *);
    void handleCVT_NEG(Instruction *);
+   void handleCVT_CVT(Instruction *);
    void handleCVT_EXTBF(Instruction *);
    void handleSUCLAMP(Instruction *);
 
@@ -1792,6 +1793,47 @@ AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
    delete_Instruction(prog, cvt);
 }
 
+// F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
+// does a type conversion, this becomes trickier as there might be range
+// changes/etc. We could handle those in theory as long as the range was being
+// reduced or kept the same.
+void
+AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
+{
+   Instruction *insn = cvt->getSrc(0)->getInsn();
+   RoundMode rnd = insn->rnd;
+
+   if (insn->saturate ||
+       insn->subOp ||
+       insn->dType != insn->sType ||
+       insn->dType != cvt->sType)
+      return;
+
+   switch (insn->op) {
+   case OP_CEIL:
+      rnd = ROUND_PI;
+      break;
+   case OP_FLOOR:
+      rnd = ROUND_MI;
+      break;
+   case OP_TRUNC:
+      rnd = ROUND_ZI;
+      break;
+   case OP_CVT:
+      break;
+   default:
+      return;
+   }
+
+   if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
+      rnd = (RoundMode)(rnd & 3);
+
+   cvt->rnd = rnd;
+   cvt->setSrc(0, insn->getSrc(0));
+   cvt->src(0).mod *= insn->src(0).mod;
+   cvt->sType = insn->sType;
+}
+
 // Some shaders extract packed bytes out of words and convert them to
 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
 // nv50 for word sizes.
@@ -1961,6 +2003,7 @@ AlgebraicOpt::visit(BasicBlock *bb)
          break;
       case OP_CVT:
          handleCVT_NEG(i);
+         handleCVT_CVT(i);
          if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
              handleCVT_EXTBF(i);
          break;
@@ -2532,6 +2575,7 @@ MemoryOpt::runOpt(BasicBlock *bb)
 class FlatteningPass : public Pass
 {
 private:
+   virtual bool visit(Function *);
    virtual bool visit(BasicBlock *);
 
    bool tryPredicateConditional(BasicBlock *);
@@ -2540,6 +2584,8 @@ private:
    inline bool isConstantCondition(Value *pred);
    inline bool mayPredicate(const Instruction *, const Value *pred) const;
    inline void removeFlow(Instruction *);
+
+   uint8_t gpr_unit;
 };
 
 bool
@@ -2561,9 +2607,15 @@ FlatteningPass::isConstantCondition(Value *pred)
          file = ld->src(0).getFile();
       } else {
          file = insn->src(s).getFile();
-         // catch $r63 on NVC0
-         if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
-            file = FILE_IMMEDIATE;
+         // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
+         // in register "units", which can vary between targets.
+         if (file == FILE_GPR) {
+            Value *v = insn->getSrc(s);
+            int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
+            int units = bytes >> gpr_unit;
+            if (units > prog->maxGPR)
+               file = FILE_IMMEDIATE;
+         }
       }
       if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
          return false;
@@ -2669,6 +2721,14 @@ FlatteningPass::tryPropagateBranch(BasicBlock *bb)
 }
 
 bool
+FlatteningPass::visit(Function *fn)
+{
+   gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
+
+   return true;
+}
+
+bool
 FlatteningPass::visit(BasicBlock *bb)
 {
    if (tryPredicateConditional(bb))
@@ -2774,6 +2834,15 @@ private:
    virtual bool visit(BasicBlock *);
 };
 
+static bool
+post_ra_dead(Instruction *i)
+{
+   for (int d = 0; i->defExists(d); ++d)
+      if (i->getDef(d)->refCount())
+         return false;
+   return true;
+}
+
 bool
 NV50PostRaConstantFolding::visit(BasicBlock *bb)
 {
@@ -2787,24 +2856,48 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb)
              i->src(0).getFile() != FILE_GPR ||
              i->src(1).getFile() != FILE_GPR ||
              i->src(2).getFile() != FILE_GPR ||
-             i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id ||
-             !isFloatType(i->dType))
+             i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
             break;
 
          if (i->getDef(0)->reg.data.id >= 64 ||
              i->getSrc(0)->reg.data.id >= 64)
             break;
 
+         if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
+            break;
+
+         if (i->getPredicate())
+            break;
+
          def = i->getSrc(1)->getInsn();
+         if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
+            def = def->getSrc(0)->getInsn();
          if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
             vtmp = i->getSrc(1);
-            i->setSrc(1, def->getSrc(0));
+            if (isFloatType(i->sType)) {
+               i->setSrc(1, def->getSrc(0));
+            } else {
+               ImmediateValue val;
+               bool ret = def->src(0).getImmediate(val);
+               assert(ret);
+               if (i->getSrc(1)->reg.data.id & 1)
+                  val.reg.data.u32 >>= 16;
+               val.reg.data.u32 &= 0xffff;
+               i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
+            }
 
             /* There's no post-RA dead code elimination, so do it here
              * XXX: if we add more code-removing post-RA passes, we might
              *      want to create a post-RA dead-code elim pass */
-            if (vtmp->refCount() == 0)
-               delete_Instruction(bb->getProgram(), def);
+            if (post_ra_dead(vtmp->getInsn())) {
+               Value *src = vtmp->getInsn()->getSrc(0);
+               // Careful -- splits will have already been removed from the
+               // functions. Don't double-delete.
+               if (vtmp->getInsn()->bb)
+                  delete_Instruction(prog, vtmp->getInsn());
+               if (src->getInsn() && post_ra_dead(src->getInsn()))
+                  delete_Instruction(prog, src->getInsn());
+            }
 
             break;
          }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index b32bc13f755..cd8c42ced5e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1473,7 +1473,6 @@ GCRA::allocateRegisters(ArrayList& insns)
                // Short encoding only possible if they're all GPRs, no need to
                // affect them otherwise.
                if (insn->flagsDef < 0 &&
-                   isFloatType(insn->dType) &&
                    insn->src(0).getFile() == FILE_GPR &&
                    insn->src(1).getFile() == FILE_GPR &&
                    insn->src(2).getFile() == FILE_GPR)
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index a6065e45aaa..4ca9e5c06cd 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -147,6 +147,12 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
    if (nv_dbg)
       nouveau_mesa_debug = atoi(nv_dbg);
 
+   /* These must be set before any failure is possible, as the cleanup
+    * paths assume they're responsible for deleting them.
+    */
+   screen->drm = nouveau_drm(&dev->object);
+   screen->device = dev;
+
    /*
     * this is initialized to 1 in nouveau_drm_screen_create after screen
     * is fully constructed and added to the global screen list.
@@ -175,7 +181,6 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
                             data, size, &screen->channel);
    if (ret)
       return ret;
-   screen->device = dev;
 
    ret = nouveau_client_new(screen->device, &screen->client);
    if (ret)
@@ -229,6 +234,8 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 void
 nouveau_screen_fini(struct nouveau_screen *screen)
 {
+   int fd = screen->drm->fd;
+
    nouveau_mm_destroy(screen->mm_GART);
    nouveau_mm_destroy(screen->mm_VRAM);
 
@@ -238,6 +245,8 @@ nouveau_screen_fini(struct nouveau_screen *screen)
    nouveau_object_del(&screen->channel);
 
    nouveau_device_del(&screen->device);
+   nouveau_drm_del(&screen->drm);
+   close(fd);
 }
 
 static void
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index 328646fe3ce..28c4760af20 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -17,6 +17,7 @@ struct nouveau_bo;
 
 struct nouveau_screen {
    struct pipe_screen base;
+   struct nouveau_drm *drm;
    struct nouveau_device *device;
    struct nouveau_object *channel;
    struct nouveau_client *client;
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index 1319c3290cf..f13988ea526 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -6,6 +6,7 @@
 
 #include "pipe/p_defines.h"
 
+#include <drm.h>
 #include <nouveau.h>
 
 #ifndef NV04_PFIFO_MAX_PACKET_LEN
@@ -79,13 +80,13 @@ nouveau_screen_transfer_flags(unsigned pipe)
    return flags;
 }
 
-extern struct pipe_screen *
+extern struct nouveau_screen *
 nv30_screen_create(struct nouveau_device *);
 
-extern struct pipe_screen *
+extern struct nouveau_screen *
 nv50_screen_create(struct nouveau_device *);
 
-extern struct pipe_screen *
+extern struct nouveau_screen *
 nvc0_screen_create(struct nouveau_device *);
 
 #endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 154c3d370f7..854f70cf34c 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -413,23 +413,20 @@ nv30_screen_destroy(struct pipe_screen *pscreen)
 #define FAIL_SCREEN_INIT(str, err)                    \
    do {                                               \
       NOUVEAU_ERR(str, err);                          \
-      nv30_screen_destroy(pscreen);                   \
-      return NULL;                                    \
+      screen->base.base.context_create = NULL;        \
+      return &screen->base;                           \
    } while(0)
 
-struct pipe_screen *
+struct nouveau_screen *
 nv30_screen_create(struct nouveau_device *dev)
 {
-   struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen);
+   struct nv30_screen *screen;
    struct pipe_screen *pscreen;
    struct nouveau_pushbuf *push;
    struct nv04_fifo *fifo;
    unsigned oclass = 0;
    int ret, i;
 
-   if (!screen)
-      return NULL;
-
    switch (dev->chipset & 0xf0) {
    case 0x30:
       if (RANKINE_0397_CHIPSET & (1 << (dev->chipset & 0x0f)))
@@ -458,10 +455,16 @@ nv30_screen_create(struct nouveau_device *dev)
 
    if (!oclass) {
       NOUVEAU_ERR("unknown 3d class for 0x%02x\n", dev->chipset);
-      FREE(screen);
       return NULL;
    }
 
+   screen = CALLOC_STRUCT(nv30_screen);
+   if (!screen)
+      return NULL;
+
+   pscreen = &screen->base.base;
+   pscreen->destroy = nv30_screen_destroy;
+
    /*
     * Some modern apps try to use msaa without keeping in mind the
     * restrictions on videomem of older cards. Resulting in dmesg saying:
@@ -479,8 +482,6 @@ nv30_screen_create(struct nouveau_device *dev)
    if (screen->max_sample_count > 4)
       screen->max_sample_count = 4;
 
-   pscreen = &screen->base.base;
-   pscreen->destroy = nv30_screen_destroy;
    pscreen->get_param = nv30_screen_get_param;
    pscreen->get_paramf = nv30_screen_get_paramf;
    pscreen->get_shader_param = nv30_screen_get_shader_param;
@@ -693,5 +694,5 @@ nv30_screen_create(struct nouveau_device *dev)
    nouveau_pushbuf_kick(push, push->channel);
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
-   return pscreen;
+   return &screen->base;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
index 812d10ce667..745011977b3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
@@ -336,9 +336,10 @@ nv50_miptree_create(struct pipe_screen *pscreen,
                     const struct pipe_resource *templ)
 {
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+   struct nouveau_drm *drm = nouveau_screen(pscreen)->drm;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   bool compressed = dev->drm_version >= 0x01000101;
+   bool compressed = drm->version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
index b6ebbbf1010..cccd3b71672 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -113,6 +113,12 @@ static void
 nv50_hw_destroy_query(struct nv50_context *nv50, struct nv50_query *q)
 {
    struct nv50_hw_query *hq = nv50_hw_query(q);
+
+   if (hq->funcs && hq->funcs->destroy_query) {
+      hq->funcs->destroy_query(nv50, hq);
+      return;
+   }
+
    nv50_hw_query_allocate(nv50, q, 0);
    nouveau_fence_ref(NULL, &hq->fence);
    FREE(hq);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
index d1bccb94193..4a605f21c45 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
@@ -71,7 +71,8 @@ nv50_hw_metric_destroy_query(struct nv50_context *nv50,
    unsigned i;
 
    for (i = 0; i < hmq->num_queries; i++)
-      hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]);
+      if (hmq->queries[i]->funcs->destroy_query)
+         hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]);
    FREE(hmq);
 }
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
index 8453ce76095..79c7023b2d4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
@@ -153,7 +153,9 @@ static void
 nv50_hw_sm_destroy_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
 {
    struct nv50_query *q = &hq->base;
-   q->funcs->destroy_query(nv50, q);
+   nv50_hw_query_allocate(nv50, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
 }
 
 static boolean
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 1e4b75f18e0..272e1d45bff 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -405,6 +405,11 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
 
    if (screen->blitter)
       nv50_blitter_destroy(screen);
+   if (screen->pm.prog) {
+      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
+      nv50_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
+   }
 
    nouveau_bo_ref(NULL, &screen->code);
    nouveau_bo_ref(NULL, &screen->tls_bo);
@@ -518,11 +523,11 @@ nv50_screen_init_hwctx(struct nv50_screen *screen)
    }
 
    BEGIN_NV04(push, NV50_3D(ZETA_COMP_ENABLE), 1);
-   PUSH_DATA(push, screen->base.device->drm_version >= 0x01000101);
+   PUSH_DATA(push, screen->base.drm->version >= 0x01000101);
 
    BEGIN_NV04(push, NV50_3D(RT_COMP_ENABLE(0)), 8);
    for (i = 0; i < 8; ++i)
-      PUSH_DATA(push, screen->base.device->drm_version >= 0x01000101);
+      PUSH_DATA(push, screen->base.drm->version >= 0x01000101);
 
    BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
    PUSH_DATA (push, 1);
@@ -747,7 +752,7 @@ int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space)
    return 1;
 }
 
-struct pipe_screen *
+struct nouveau_screen *
 nv50_screen_create(struct nouveau_device *dev)
 {
    struct nv50_screen *screen;
@@ -762,6 +767,7 @@ nv50_screen_create(struct nouveau_device *dev)
    if (!screen)
       return NULL;
    pscreen = &screen->base.base;
+   pscreen->destroy = nv50_screen_destroy;
 
    ret = nouveau_screen_init(&screen->base, dev);
    if (ret) {
@@ -782,7 +788,6 @@ nv50_screen_create(struct nouveau_device *dev)
 
    chan = screen->base.channel;
 
-   pscreen->destroy = nv50_screen_destroy;
    pscreen->context_create = nv50_create;
    pscreen->is_format_supported = nv50_screen_is_format_supported;
    pscreen->get_param = nv50_screen_get_param;
@@ -961,11 +966,11 @@ nv50_screen_create(struct nouveau_device *dev)
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
-   return pscreen;
+   return &screen->base;
 
 fail:
-   nv50_screen_destroy(pscreen);
-   return NULL;
+   screen->base.base.context_create = NULL;
+   return &screen->base;
 }
 
 int
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
index 6083ea995c8..c3f433608df 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
@@ -192,8 +192,7 @@ nv50_create_texture_view(struct pipe_context *pipe,
       tic[2] |= NV50_TIC_2_TARGET_BUFFER | NV50_TIC_2_LINEAR;
       break;
    default:
-      NOUVEAU_ERR("invalid texture target: %d\n", mt->base.base.target);
-      return false;
+      unreachable("unexpected/invalid texture target");
    }
 
    tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 85878d5fcc7..7de2f1f1d0f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -91,6 +91,9 @@ nv50_vertex_state_create(struct pipe_context *pipe,
             }
             so->element[i].state = nv50_format_table[fmt].vtx;
             so->need_conversion = true;
+            pipe_debug_message(&nouveau_context(pipe)->debug, FALLBACK,
+                               "Converting vertex element %d, no hw format %s",
+                               i, util_format_name(ve->src_format));
         }
         so->element[i].state |= i;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video.c b/src/gallium/drivers/nouveau/nv50/nv84_video.c
index 7a4670f663d..88655dbfae5 100644
--- a/src/gallium/drivers/nouveau/nv50/nv84_video.c
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video.c
@@ -756,8 +756,8 @@ firmware_present(struct pipe_screen *pscreen, enum pipe_video_format codec)
    int present, ret;
 
    if (!FIRMWARE_PRESENT(checked, VP_KERN)) {
-      nouveau_object_new(screen->channel, 0, 0x7476, NULL, 0, &obj);
-      if (obj)
+      ret = nouveau_object_new(screen->channel, 0, 0x7476, NULL, 0, &obj);
+      if (!ret)
          screen->firmware_info.profiles_present |= FIRMWARE_VP_KERN;
       nouveau_object_del(&obj);
       screen->firmware_info.profiles_checked |= FIRMWARE_VP_KERN;
@@ -765,8 +765,8 @@ firmware_present(struct pipe_screen *pscreen, enum pipe_video_format codec)
 
    if (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
       if (!FIRMWARE_PRESENT(checked, BSP_KERN)) {
-         nouveau_object_new(screen->channel, 0, 0x74b0, NULL, 0, &obj);
-         if (obj)
+         ret = nouveau_object_new(screen->channel, 0, 0x74b0, NULL, 0, &obj);
+         if (!ret)
             screen->firmware_info.profiles_present |= FIRMWARE_BSP_KERN;
          nouveau_object_del(&obj);
          screen->firmware_info.profiles_checked |= FIRMWARE_BSP_KERN;
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video.c b/src/gallium/drivers/nouveau/nv50/nv98_video.c
index 20ea547ff34..177a7e0274b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv98_video.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video.c
@@ -25,6 +25,8 @@
 #include "util/u_sampler.h"
 #include "util/u_format.h"
 
+#include <nvif/class.h>
+
 static void
 nv98_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               struct pipe_video_buffer *video_target,
@@ -56,6 +58,28 @@ nv98_decoder_decode_bitstream(struct pipe_video_codec *decoder,
    nv98_decoder_ppp(dec, desc, target, comm_seq);
 }
 
+static const struct nouveau_mclass
+nv98_decoder_msvld[] = {
+   { G98_MSVLD, -1 },
+   { IGT21A_MSVLD, -1 },
+   { GT212_MSVLD, -1 },
+   {}
+};
+
+static const struct nouveau_mclass
+nv98_decoder_mspdec[] = {
+   { G98_MSPDEC, -1 },
+   { GT212_MSPDEC, -1 },
+   {}
+};
+
+static const struct nouveau_mclass
+nv98_decoder_msppp[] = {
+   { G98_MSPPP, -1 },
+   { GT212_MSPPP, -1 },
+   {}
+};
+
 struct pipe_video_codec *
 nv98_create_decoder(struct pipe_context *context,
                     const struct pipe_video_codec *templ)
@@ -103,12 +127,33 @@ nv98_create_decoder(struct pipe_context *context,
    }
    push = dec->pushbuf;
 
-   if (!ret)
-      ret = nouveau_object_new(dec->channel[0], 0x390b1, 0x85b1, NULL, 0, &dec->bsp);
-   if (!ret)
-      ret = nouveau_object_new(dec->channel[1], 0x190b2, 0x85b2, NULL, 0, &dec->vp);
-   if (!ret)
-      ret = nouveau_object_new(dec->channel[2], 0x290b3, 0x85b3, NULL, 0, &dec->ppp);
+   if (!ret) {
+      ret = nouveau_object_mclass(dec->channel[0], nv98_decoder_msvld);
+      if (ret >= 0) {
+         ret = nouveau_object_new(dec->channel[0], 0xbeef85b1,
+                                  nv98_decoder_msvld[ret].oclass, NULL, 0,
+                                  &dec->bsp);
+      }
+   }
+
+   if (!ret) {
+      ret = nouveau_object_mclass(dec->channel[1], nv98_decoder_mspdec);
+      if (ret >= 0) {
+         ret = nouveau_object_new(dec->channel[1], 0xbeef85b2,
+                                  nv98_decoder_mspdec[ret].oclass, NULL, 0,
+                                  &dec->vp);
+      }
+   }
+
+   if (!ret) {
+      ret = nouveau_object_mclass(dec->channel[2], nv98_decoder_msppp);
+      if (ret >= 0) {
+         ret = nouveau_object_new(dec->channel[2], 0xbeef85b3,
+                                  nv98_decoder_msppp[ret].oclass, NULL, 0,
+                                  &dec->ppp);
+      }
+   }
+
    if (ret)
       goto fail;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
index 15991c3d2bd..ed1ac48315b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
@@ -248,9 +248,10 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
                     const struct pipe_resource *templ)
 {
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+   struct nouveau_drm *drm = nouveau_screen(pscreen)->drm;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   bool compressed = dev->drm_version >= 0x01000101;
+   bool compressed = drm->version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index 3845d616c3d..7497317c419 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -184,7 +184,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
    count++;
 #endif
 
-   if (screen->base.device->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             count += 2;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index 90ee82f21e5..a70d524ea85 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -116,6 +116,12 @@ static void
 nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
    struct nvc0_hw_query *hq = nvc0_hw_query(q);
+
+   if (hq->funcs && hq->funcs->destroy_query) {
+      hq->funcs->destroy_query(nvc0, hq);
+      return;
+   }
+
    nvc0_hw_query_allocate(nvc0, q, 0);
    nouveau_fence_ref(NULL, &hq->fence);
    FREE(hq);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
index 12fb609b62a..7a64b69b1c1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
@@ -293,7 +293,8 @@ nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
    unsigned i;
 
    for (i = 0; i < hmq->num_queries; i++)
-      hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
+      if (hmq->queries[i]->funcs->destroy_query)
+         hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
    FREE(hmq);
 }
 
@@ -420,7 +421,10 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
 {
    switch (hq->base.type - NVE4_HW_METRIC_QUERY(0)) {
    case NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
-      return sm20_hw_metric_calc_result(hq, res64);
+      /* (active_warps / active_cycles) / max. number of warps on a MP */
+      if (res64[1])
+         return (res64[0] / (double)res64[1]) / 64;
+      break;
    case NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
       return sm20_hw_metric_calc_result(hq, res64);
    case NVE4_HW_METRIC_QUERY_INST_ISSUED:
@@ -561,7 +565,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
    uint16_t class_3d = screen->base.class_3d;
    int count = 0;
 
-   if (screen->base.device->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             count += NVE4_HW_METRIC_QUERY_COUNT;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
index 7d1e75fd9d1..721857edecc 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -782,7 +782,9 @@ static void
 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
 {
    struct nvc0_query *q = &hq->base;
-   q->funcs->destroy_query(nvc0, q);
+   nvc0_hw_query_allocate(nvc0, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
 }
 
 static boolean
@@ -1075,17 +1077,6 @@ nve4_hw_sm_query_read_data(uint32_t count[32][8],
    return true;
 }
 
-/* Metric calculations:
- * sum(x) ... sum of x over all MPs
- * avg(x) ... average of x over all MPs
- *
- * IPC              : sum(inst_executed) / clock
- * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
- * MP_OCCUPANCY     : avg((active_warps / 64) / active_cycles)
- * MP_EFFICIENCY    : avg(active_cycles / clock)
- *
- * NOTE: Interpretation of IPC requires knowledge of MP count.
- */
 static boolean
 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
                             boolean wait, union pipe_query_result *result)
@@ -1130,7 +1121,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
    struct nvc0_hw_query *hq;
    unsigned space;
 
-   if (nvc0->screen->base.device->drm_version < 0x01000101)
+   if (nvc0->screen->base.drm->version < 0x01000101)
       return NULL;
 
    if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
@@ -1225,7 +1216,7 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
 {
    int count = 0;
 
-   if (screen->base.device->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             count += NVE4_HW_SM_QUERY_COUNT;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 461fcaaf677..39954464b9c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -22,6 +22,7 @@
 
 #include <xf86drm.h>
 #include <nouveau_drm.h>
+#include <nvif/class.h>
 #include "util/u_format.h"
 #include "util/u_format_s3tc.h"
 #include "pipe/p_screen.h"
@@ -428,6 +429,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
    if (screen->pm.prog) {
       screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
       nvc0_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
    }
 
    nouveau_bo_ref(NULL, &screen->text);
@@ -617,11 +619,10 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
 #define FAIL_SCREEN_INIT(str, err)                    \
    do {                                               \
       NOUVEAU_ERR(str, err);                          \
-      nvc0_screen_destroy(pscreen);                   \
-      return NULL;                                    \
+      goto fail;                                      \
    } while(0)
 
-struct pipe_screen *
+struct nouveau_screen *
 nvc0_screen_create(struct nouveau_device *dev)
 {
    struct nvc0_screen *screen;
@@ -650,6 +651,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    if (!screen)
       return NULL;
    pscreen = &screen->base.base;
+   pscreen->destroy = nvc0_screen_destroy;
 
    ret = nouveau_screen_init(&screen->base, dev);
    if (ret) {
@@ -672,7 +674,6 @@ nvc0_screen_create(struct nouveau_device *dev)
       screen->base.vidmem_bindings = 0;
    }
 
-   pscreen->destroy = nvc0_screen_destroy;
    pscreen->context_create = nvc0_create;
    pscreen->is_format_supported = nvc0_screen_is_format_supported;
    pscreen->get_param = nvc0_screen_get_param;
@@ -687,7 +688,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported;
 
    flags = NOUVEAU_BO_GART | NOUVEAU_BO_MAP;
-   if (dev->drm_version >= 0x01000202)
+   if (screen->base.drm->version >= 0x01000202)
       flags |= NOUVEAU_BO_COHERENT;
 
    ret = nouveau_bo_new(dev, flags, 0, 4096, NULL, &screen->fence.bo);
@@ -699,12 +700,13 @@ nvc0_screen_create(struct nouveau_device *dev)
    screen->base.fence.update = nvc0_screen_fence_update;
 
 
-   ret = nouveau_object_new(chan,
-                            (dev->chipset < 0xe0) ? 0x1f906e : 0x906e, 0x906e,
-                            NULL, 0, &screen->nvsw);
+   ret = nouveau_object_new(chan, (dev->chipset < 0xe0) ? 0x1f906e : 0x906e,
+                            NVIF_CLASS_SW_GF100, NULL, 0, &screen->nvsw);
    if (ret)
       FAIL_SCREEN_INIT("Error creating SW object: %d\n", ret);
 
+   BEGIN_NVC0(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->nvsw->handle);
 
    switch (dev->chipset & ~0xf) {
    case 0x110:
@@ -811,10 +813,11 @@ nvc0_screen_create(struct nouveau_device *dev)
       PUSH_DATA (push, 0x17);
    }
 
-   IMMED_NVC0(push, NVC0_3D(ZETA_COMP_ENABLE), dev->drm_version >= 0x01000101);
+   IMMED_NVC0(push, NVC0_3D(ZETA_COMP_ENABLE),
+                    screen->base.drm->version >= 0x01000101);
    BEGIN_NVC0(push, NVC0_3D(RT_COMP_ENABLE(0)), 8);
    for (i = 0; i < 8; ++i)
-           PUSH_DATA(push, dev->drm_version >= 0x01000101);
+           PUSH_DATA(push, screen->base.drm->version >= 0x01000101);
 
    BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
    PUSH_DATA (push, 1);
@@ -910,7 +913,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
    PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
 
-   if (dev->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
       if (ret) {
          NOUVEAU_ERR("NOUVEAU_GETPARAM_GRAPH_UNITS failed.\n");
@@ -1061,11 +1064,11 @@ nvc0_screen_create(struct nouveau_device *dev)
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
-   return pscreen;
+   return &screen->base;
 
 fail:
-   nvc0_screen_destroy(pscreen);
-   return NULL;
+   screen->base.base.context_create = NULL;
+   return &screen->base;
 }
 
 int
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 7e2e9992fe8..5e84ca9e0ea 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -236,11 +236,8 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0)
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_program *gp = nvc0->gmtyprog;
 
-   if (gp)
-      nvc0_program_validate(nvc0, gp);
-
    /* we allow GPs with no code for specifying stream output state only */
-   if (gp && gp->code_size) {
+   if (gp && nvc0_program_validate(nvc0, gp) && gp->code_size) {
       const bool gp_selects_layer = !!(gp->hdr[13] & (1 << 9));
 
       BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index f8e1efb9834..4e43c4e99fd 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1030,9 +1030,11 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit)
       nvc0->base.pipe.render_condition(&nvc0->base.pipe, nvc0->cond_query,
                                        nvc0->cond_cond, nvc0->cond_mode);
 
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP);
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0));
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1));
+   nouveau_scratch_done(&nvc0->base);
 
    nvc0->dirty = blit->saved.dirty |
       (NVC0_NEW_FRAMEBUFFER | NVC0_NEW_SCISSOR | NVC0_NEW_SAMPLE_MASK |
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 2dd100ffdc7..74090ce40a5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -193,9 +193,7 @@ nvc0_create_texture_view(struct pipe_context *pipe,
       tic[2] |= NV50_TIC_2_TARGET_CUBE_ARRAY;
       break;
    default:
-      NOUVEAU_ERR("unexpected/invalid texture target: %d\n",
-                  mt->base.base.target);
-      return false;
+      unreachable("unexpected/invalid texture target");
    }
 
    tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index c464904d6d4..54443bdccc0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -95,6 +95,9 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
             }
             so->element[i].state = nvc0_format_table[fmt].vtx;
             so->need_conversion = true;
+            pipe_debug_message(&nouveau_context(pipe)->debug, FALLBACK,
+                               "Converting vertex element %d, no hw format %s",
+                               i, util_format_name(ve->src_format));
         }
         size = util_format_get_blocksize(fmt);
 
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 02d0c7f6c11..1443bc0c4af 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -2414,7 +2414,7 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
 	struct r600_command_buffer *cb = &rctx->start_cs_cmd;
 	int tmp, i;
 
-	r600_init_command_buffer(cb, 342);
+	r600_init_command_buffer(cb, 338);
 
 	/* This must be first. */
 	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
@@ -2468,10 +2468,6 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
 
 	r600_store_context_reg(cb, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0);
 
-	r600_store_context_reg_seq(cb, R_028AB4_VGT_REUSE_OFF, 2);
-	r600_store_value(cb, 0); /* R_028AB4_VGT_REUSE_OFF */
-	r600_store_value(cb, 0); /* R_028AB8_VGT_VTX_CNT_EN */
-
 	r600_store_config_reg(cb, R_008A14_PA_CL_ENHANCE, (3 << 1) | 1);
 
 	r600_store_context_reg_seq(cb, CM_R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
@@ -2671,7 +2667,7 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx)
 		return;
 	}
 
-	r600_init_command_buffer(cb, 342);
+	r600_init_command_buffer(cb, 338);
 
 	/* This must be first. */
 	r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0));
@@ -2896,10 +2892,6 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx)
 	r600_store_value(cb, 0); /* R_028A3C_VGT_GROUP_VECT_1_FMT_CNTL */
 	r600_store_value(cb, 0); /* R_028A40_VGT_GS_MODE */
 
-	r600_store_context_reg_seq(cb, R_028AB4_VGT_REUSE_OFF, 2);
-	r600_store_value(cb, 0); /* R_028AB4_VGT_REUSE_OFF */
-	r600_store_value(cb, 0); /* R_028AB8_VGT_VTX_CNT_EN */
-
 	r600_store_config_reg(cb, R_008A14_PA_CL_ENHANCE, (3 << 1) | 1);
 
         r600_store_context_reg(cb, R_0288F0_SQ_VTX_SEMANTIC_CLEAR, ~0);
@@ -3731,7 +3723,7 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->blend_color.atom, id++, r600_emit_blend_color, 6);
 	r600_init_atom(rctx, &rctx->blend_state.atom, id++, r600_emit_cso_state, 0);
 	r600_init_atom(rctx, &rctx->cb_misc_state.atom, id++, evergreen_emit_cb_misc_state, 4);
-	r600_init_atom(rctx, &rctx->clip_misc_state.atom, id++, r600_emit_clip_misc_state, 6);
+	r600_init_atom(rctx, &rctx->clip_misc_state.atom, id++, r600_emit_clip_misc_state, 9);
 	r600_init_atom(rctx, &rctx->clip_state.atom, id++, evergreen_emit_clip_state, 26);
 	r600_init_atom(rctx, &rctx->db_misc_state.atom, id++, evergreen_emit_db_misc_state, 10);
 	r600_init_atom(rctx, &rctx->db_state.atom, id++, evergreen_emit_db_state, 14);
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 795fb9a9513..31f2a729494 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -152,6 +152,7 @@ struct r600_clip_misc_state {
 	unsigned clip_plane_enable; /* from rasterizer    */
 	unsigned clip_dist_write;   /* from vertex shader */
 	boolean clip_disable;       /* from vertex shader */
+	boolean vs_out_viewport;    /* from vertex shader */
 };
 
 struct r600_alphatest_state {
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 6a666343b06..ca589fa7759 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1377,11 +1377,13 @@ static void r600_update_clip_state(struct r600_context *rctx,
 {
 	if (current->pa_cl_vs_out_cntl != rctx->clip_misc_state.pa_cl_vs_out_cntl ||
 	    current->shader.clip_dist_write != rctx->clip_misc_state.clip_dist_write ||
-	    current->shader.vs_position_window_space != rctx->clip_misc_state.clip_disable) {
-				rctx->clip_misc_state.pa_cl_vs_out_cntl = current->pa_cl_vs_out_cntl;
-				rctx->clip_misc_state.clip_dist_write = current->shader.clip_dist_write;
-				rctx->clip_misc_state.clip_disable = current->shader.vs_position_window_space;
-				r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
+	    current->shader.vs_position_window_space != rctx->clip_misc_state.clip_disable ||
+	    current->shader.vs_out_viewport != rctx->clip_misc_state.vs_out_viewport) {
+		rctx->clip_misc_state.pa_cl_vs_out_cntl = current->pa_cl_vs_out_cntl;
+		rctx->clip_misc_state.clip_dist_write = current->shader.clip_dist_write;
+		rctx->clip_misc_state.clip_disable = current->shader.vs_position_window_space;
+		rctx->clip_misc_state.vs_out_viewport = current->shader.vs_out_viewport;
+		r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
 	}
 }
 
@@ -1656,6 +1658,10 @@ void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom
 	radeon_set_context_reg(cs, R_02881C_PA_CL_VS_OUT_CNTL,
 			       state->pa_cl_vs_out_cntl |
 			       (state->clip_plane_enable & state->clip_dist_write));
+	/* reuse needs to be set off if we write oViewport */
+	if (rctx->b.chip_class >= EVERGREEN)
+		radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF,
+				       S_028AB4_REUSE_OFF(state->vs_out_viewport));
 }
 
 static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
diff --git a/src/gallium/drivers/radeon/Makefile.am b/src/gallium/drivers/radeon/Makefile.am
index 13d8976de95..a6fc145cb5c 100644
--- a/src/gallium/drivers/radeon/Makefile.am
+++ b/src/gallium/drivers/radeon/Makefile.am
@@ -16,7 +16,8 @@ libradeon_la_SOURCES = \
 if NEED_RADEON_LLVM
 
 AM_CFLAGS += \
-	$(LLVM_CFLAGS)
+	$(LLVM_CFLAGS) \
+	$(LIBELF_CFLAGS)
 
 libradeon_la_SOURCES += \
 	$(LLVM_C_FILES)
@@ -24,7 +25,7 @@ libradeon_la_SOURCES += \
 libradeon_la_LIBADD = \
 	$(CLOCK_LIB) \
 	$(LLVM_LIBS) \
-	$(ELF_LIB)
+	$(LIBELF_LIBS)
 
 libradeon_la_LDFLAGS = \
 	$(LLVM_LDFLAGS)
diff --git a/src/gallium/drivers/radeon/r600_perfcounter.c b/src/gallium/drivers/radeon/r600_perfcounter.c
index a835aee993b..fad7bdec40a 100644
--- a/src/gallium/drivers/radeon/r600_perfcounter.c
+++ b/src/gallium/drivers/radeon/r600_perfcounter.c
@@ -202,9 +202,6 @@ static void r600_pc_query_add_result(struct r600_common_context *ctx,
 	for (i = 0; i < query->num_counters; ++i) {
 		struct r600_pc_counter *counter = &query->counters[i];
 
-		if (counter->base == ~0)
-			continue;
-
 		for (j = 0; j < counter->dwords; ++j) {
 			uint32_t value = results[counter->base + j * counter->stride];
 			result->batch[i].u32 += value;
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index ed0aefc9d32..0aa19cd54fe 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -119,7 +119,7 @@ static void r600_query_sw_end(struct r600_common_context *rctx,
 		rctx->b.flush(&rctx->b, &query->fence, 0);
 		break;
 	case R600_QUERY_DRAW_CALLS:
-		query->begin_result = rctx->num_draw_calls;
+		query->end_result = rctx->num_draw_calls;
 		break;
 	case R600_QUERY_REQUESTED_VRAM:
 	case R600_QUERY_REQUESTED_GTT:
@@ -141,10 +141,10 @@ static void r600_query_sw_end(struct r600_common_context *rctx,
 		query->begin_result = 0;
 		break;
 	case R600_QUERY_NUM_COMPILATIONS:
-		query->begin_result = p_atomic_read(&rctx->screen->num_compilations);
+		query->end_result = p_atomic_read(&rctx->screen->num_compilations);
 		break;
 	case R600_QUERY_NUM_SHADERS_CREATED:
-		query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created);
+		query->end_result = p_atomic_read(&rctx->screen->num_shaders_created);
 		break;
 	default:
 		unreachable("r600_query_sw_end: bad query type");
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 6b2ebdead38..61ed9402122 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -188,8 +188,8 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	if (mem_err) {
 		fprintf(stderr, "%s: %s", __FUNCTION__, err);
 		FREE(err);
-		LLVMDisposeTargetMachine(tm);
-		return 1;
+		rval = 1;
+		goto out;
 	}
 
 	if (0 != rval) {
@@ -205,6 +205,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar
 	/* Clean up */
 	LLVMDisposeMemoryBuffer(out_buffer);
 
+out:
 	if (dispose_tm) {
 		LLVMDisposeTargetMachine(tm);
 	}
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index a0ddff6c4e3..7ee1daee7bf 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -436,7 +436,7 @@ static void si_pc_emit_select(struct r600_common_context *ctx,
 
 		dw = count + regs->num_prelude;
 		if (count >= regs->num_multi)
-			count += regs->num_multi;
+			dw += regs->num_multi;
 		radeon_set_uconfig_reg_seq(cs, regs->select0, dw);
 		for (idx = 0; idx < regs->num_prelude; ++idx)
 			radeon_emit(cs, 0);
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 4086819c1ac..2a6d2c6ff36 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -605,6 +605,10 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
 		(clipdist_mask ? 0 :
 		 sctx->queued.named.rasterizer->clip_plane_enable & SIX_BITS) |
 		S_028810_CLIP_DISABLE(window_space));
+
+	/* reuse needs to be set off if we write oViewport */
+	radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF,
+			       S_028AB4_REUSE_OFF(info->writes_viewport_index));
 }
 
 static void si_set_scissor_states(struct pipe_context *ctx,
@@ -3468,7 +3472,6 @@ static void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
 
 	si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
-	si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0);
 	si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
 	if (sctx->b.chip_class < CIK)
 		si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index c4284cc3e5b..78e346a92b9 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -343,6 +343,13 @@ struct svga_hw_draw_state
    SVGA3dElementLayoutId layout_id;
    SVGA3dPrimitiveType topology;
 
+   struct svga_winsys_surface *ib;  /**< index buffer for drawing */
+   SVGA3dSurfaceFormat ib_format;
+   unsigned ib_offset;
+
+   unsigned num_samplers[PIPE_SHADER_TYPES];
+   SVGA3dSamplerId samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS];
+
    /* used for rebinding */
    unsigned num_sampler_views[PIPE_SHADER_TYPES];
    unsigned default_constbuf_size[PIPE_SHADER_TYPES];
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index cca499a65c7..2d3631d6f9c 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -539,11 +539,18 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
       SVGA3dSurfaceFormat indexFormat = xlate_index_format(range->indexWidth);
 
       /* setup index buffer */
-      ret = SVGA3D_vgpu10_SetIndexBuffer(svga->swc, ib_handle,
-                                         indexFormat,
-                                         range->indexArray.offset);
-      if (ret != PIPE_OK)
-         return ret;
+      if (ib_handle != svga->state.hw_draw.ib ||
+          indexFormat != svga->state.hw_draw.ib_format ||
+          range->indexArray.offset != svga->state.hw_draw.ib_offset) {
+         ret = SVGA3D_vgpu10_SetIndexBuffer(svga->swc, ib_handle,
+                                            indexFormat,
+                                            range->indexArray.offset);
+         if (ret != PIPE_OK)
+            return ret;
+         svga->state.hw_draw.ib = ib_handle;
+         svga->state.hw_draw.ib_format = indexFormat;
+         svga->state.hw_draw.ib_offset = range->indexArray.offset;
+      }
 
       if (instance_count > 1) {
          ret = SVGA3D_vgpu10_DrawIndexedInstanced(svga->swc,
diff --git a/src/gallium/drivers/svga/svga_state.c b/src/gallium/drivers/svga/svga_state.c
index 722b369fd4b..4479a271262 100644
--- a/src/gallium/drivers/svga/svga_state.c
+++ b/src/gallium/drivers/svga/svga_state.c
@@ -129,7 +129,11 @@ update_state(struct svga_context *svga,
              const struct svga_tracked_state *atoms[],
              unsigned *state)
 {
+#ifdef DEBUG
    boolean debug = TRUE;
+#else
+   boolean debug = FALSE;
+#endif
    enum pipe_error ret = PIPE_OK;
    unsigned i;
 
diff --git a/src/gallium/drivers/svga/svga_state_sampler.c b/src/gallium/drivers/svga/svga_state_sampler.c
index c5d52bbfd14..b070f65feaf 100644
--- a/src/gallium/drivers/svga/svga_state_sampler.c
+++ b/src/gallium/drivers/svga/svga_state_sampler.c
@@ -301,13 +301,21 @@ update_samplers(struct svga_context *svga, unsigned dirty )
       }
 
       if (count > 0) {
-         ret = SVGA3D_vgpu10_SetSamplers(svga->swc,
-                                         count,
-                                         0,                        /* start */
-                                         svga_shader_type(shader), /* type */
-                                         ids);
-         if (ret != PIPE_OK)
-            return ret;
+         if (count != svga->state.hw_draw.num_samplers[shader] ||
+             memcmp(ids, svga->state.hw_draw.samplers[shader],
+                    count * sizeof(ids[0])) != 0) {
+            /* HW state is really changing */
+            ret = SVGA3D_vgpu10_SetSamplers(svga->swc,
+                                            count,
+                                            0,                       /* start */
+                                            svga_shader_type(shader), /* type */
+                                            ids);
+            if (ret != PIPE_OK)
+               return ret;
+            memcpy(svga->state.hw_draw.samplers[shader], ids,
+                   count * sizeof(ids[0]));
+            svga->state.hw_draw.num_samplers[shader] = count;
+         }
       }
    }
 
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 24b577ae9f3..a9a2742ec66 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -32,6 +32,7 @@ C_SOURCES := \
 	vc4_program.c \
 	vc4_qir.c \
 	vc4_qir_lower_uniforms.c \
+	vc4_qir_schedule.c \
 	vc4_qir.h \
 	vc4_qpu.c \
 	vc4_qpu_defines.h \
diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
index 16dcece1ea1..876c296dc85 100644
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -54,8 +54,8 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
         bool old_msaa = vc4->msaa;
         int old_tile_width = vc4->tile_width;
         int old_tile_height = vc4->tile_height;
-        bool msaa = (info->src.resource->nr_samples ||
-                     info->dst.resource->nr_samples);
+        bool msaa = (info->src.resource->nr_samples > 1 ||
+                     info->dst.resource->nr_samples > 1);
         int tile_width = msaa ? 32 : 64;
         int tile_height = msaa ? 32 : 64;
 
@@ -110,9 +110,11 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
 
         pipe_surface_reference(&vc4->color_read, src_surf);
         pipe_surface_reference(&vc4->color_write,
-                               dst_surf->texture->nr_samples ? NULL : dst_surf);
+                               dst_surf->texture->nr_samples > 1 ?
+                               NULL : dst_surf);
         pipe_surface_reference(&vc4->msaa_color_write,
-                               dst_surf->texture->nr_samples ? dst_surf : NULL);
+                               dst_surf->texture->nr_samples > 1 ?
+                               dst_surf : NULL);
         pipe_surface_reference(&vc4->zs_read, NULL);
         pipe_surface_reference(&vc4->zs_write, NULL);
         pipe_surface_reference(&vc4->msaa_zs_write, NULL);
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index 1cd167600b9..312b006f96e 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -67,15 +67,13 @@ vc4_flush(struct pipe_context *pctx)
         cl_u8(&bcl, VC4_PACKET_FLUSH);
         cl_end(&vc4->bcl, bcl);
 
-        vc4->msaa = false;
         if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) {
                 pipe_surface_reference(&vc4->color_write,
-                                       cbuf->texture->nr_samples ? NULL : cbuf);
+                                       cbuf->texture->nr_samples > 1 ?
+                                       NULL : cbuf);
                 pipe_surface_reference(&vc4->msaa_color_write,
-                                       cbuf->texture->nr_samples ? cbuf : NULL);
-
-                if (cbuf->texture->nr_samples)
-                        vc4->msaa = true;
+                                       cbuf->texture->nr_samples > 1 ?
+                                       cbuf : NULL);
 
                 if (!(vc4->cleared & PIPE_CLEAR_COLOR0)) {
                         pipe_surface_reference(&vc4->color_read, cbuf);
@@ -92,15 +90,12 @@ vc4_flush(struct pipe_context *pctx)
         if (vc4->framebuffer.zsbuf &&
             (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
                 pipe_surface_reference(&vc4->zs_write,
-                                       zsbuf->texture->nr_samples ?
+                                       zsbuf->texture->nr_samples > 1 ?
                                        NULL : zsbuf);
                 pipe_surface_reference(&vc4->msaa_zs_write,
-                                       zsbuf->texture->nr_samples ?
+                                       zsbuf->texture->nr_samples > 1 ?
                                        zsbuf : NULL);
 
-                if (zsbuf->texture->nr_samples)
-                        vc4->msaa = true;
-
                 if (!(vc4->cleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
                         pipe_surface_reference(&vc4->zs_read, zsbuf);
                 } else {
diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h
index bf58c6cd078..110c783e5fd 100644
--- a/src/gallium/drivers/vc4/vc4_drm.h
+++ b/src/gallium/drivers/vc4/vc4_drm.h
@@ -32,6 +32,7 @@
 #define DRM_VC4_CREATE_BO                         0x03
 #define DRM_VC4_MMAP_BO                           0x04
 #define DRM_VC4_CREATE_SHADER_BO                  0x05
+#define DRM_VC4_GET_HANG_STATE                    0x06
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -39,6 +40,7 @@
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
 #define DRM_IOCTL_VC4_CREATE_SHADER_BO    DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo)
+#define DRM_IOCTL_VC4_GET_HANG_STATE      DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state)
 
 struct drm_vc4_submit_rcl_surface {
 	uint32_t hindex; /* Handle index, or ~0 if not present. */
@@ -231,4 +233,47 @@ struct drm_vc4_mmap_bo {
 	uint64_t offset;
 };
 
+struct drm_vc4_get_hang_state_bo {
+	uint32_t handle;
+	uint32_t paddr;
+	uint32_t size;
+	uint32_t pad;
+};
+
+/**
+ * struct drm_vc4_hang_state - ioctl argument for collecting state
+ * from a GPU hang for analysis.
+*/
+struct drm_vc4_get_hang_state {
+	/** Pointer to array of struct drm_vc4_get_hang_state_bo. */
+	uint64_t bo;
+	/**
+	 * On input, the size of the bo array.  Output is the number
+	 * of bos to be returned.
+	 */
+	uint32_t bo_count;
+
+	uint32_t start_bin, start_render;
+
+	uint32_t ct0ca, ct0ea;
+	uint32_t ct1ca, ct1ea;
+	uint32_t ct0cs, ct1cs;
+	uint32_t ct0ra0, ct1ra0;
+
+	uint32_t bpca, bpcs;
+	uint32_t bpoa, bpos;
+
+	uint32_t vpmbase;
+
+	uint32_t dbge;
+	uint32_t fdbgo;
+	uint32_t fdbgb;
+	uint32_t fdbgr;
+	uint32_t fdbgs;
+	uint32_t errstat;
+
+	/* Pad that we may save more registers into in the future. */
+	uint32_t pad[16];
+};
+
 #endif /* _UAPI_VC4_DRM_H_ */
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index 79bf2c49eec..5d071ec862f 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -89,7 +89,7 @@ vc4_submit_setup_rcl_surface(struct vc4_context *vc4,
         submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
         submit_surf->offset = surf->offset;
 
-        if (psurf->texture->nr_samples == 0) {
+        if (psurf->texture->nr_samples <= 1) {
                 if (is_depth) {
                         submit_surf->bits =
                                 VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS,
@@ -132,7 +132,7 @@ vc4_submit_setup_rcl_render_config_surface(struct vc4_context *vc4,
         submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
         submit_surf->offset = surf->offset;
 
-        if (psurf->texture->nr_samples == 0) {
+        if (psurf->texture->nr_samples <= 1) {
                 submit_surf->bits =
                         VC4_SET_FIELD(vc4_rt_format_is_565(surf->base.format) ?
                                       VC4_RENDER_CONFIG_FORMAT_BGR565 :
@@ -240,9 +240,11 @@ vc4_job_submit(struct vc4_context *vc4)
 #else
                 ret = vc4_simulator_flush(vc4, &submit);
 #endif
-                if (ret) {
-                        fprintf(stderr, "VC4 submit failed\n");
-                        abort();
+                static bool warned = false;
+                if (ret && !warned) {
+                        fprintf(stderr, "Draw call returned %s.  "
+                                        "Expect corruption.\n", strerror(errno));
+                        warned = true;
                 }
         }
 
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index caad05cb9f7..d11c4e3b7ca 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1848,12 +1848,15 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         qir_optimize(c);
         qir_lower_uniforms(c);
 
+        qir_schedule_instructions(c);
+
         if (vc4_debug & VC4_DEBUG_QIR) {
                 fprintf(stderr, "%s prog %d/%d QIR:\n",
                         qir_get_stage_name(c->stage),
                         c->program_id, c->variant_id);
                 qir_dump(c);
         }
+
         qir_reorder_uniforms(c);
         vc4_generate_code(vc4, c);
 
@@ -2043,7 +2046,7 @@ vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key,
                 key->tex[i].swizzle[2] = sampler->swizzle_b;
                 key->tex[i].swizzle[3] = sampler->swizzle_a;
 
-                if (sampler->texture->nr_samples) {
+                if (sampler->texture->nr_samples > 1) {
                         key->tex[i].msaa_width = sampler->texture->width0;
                         key->tex[i].msaa_height = sampler->texture->height0;
                 } else if (sampler){
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index c34dce3aec8..b0fbb4c1db2 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -459,6 +459,7 @@ void qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst);
 struct qreg qir_uniform(struct vc4_compile *c,
                         enum quniform_contents contents,
                         uint32_t data);
+void qir_schedule_instructions(struct vc4_compile *c);
 void qir_reorder_uniforms(struct vc4_compile *c);
 
 void qir_emit(struct vc4_compile *c, struct qinst *inst);
diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
new file mode 100644
index 00000000000..d20815f055e
--- /dev/null
+++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -0,0 +1,619 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vc4_qir_schedule.c
+ *
+ * The basic model of the list scheduler is to take a basic block, compute a
+ * DAG of the dependencies from the bottom up, and make a list of the DAG
+ * heads.  Heuristically pick a DAG head and schedule (remove) it, then put
+ * all the parents that are now DAG heads into the list of things to
+ * schedule.
+ *
+ * The goal of scheduling here, before register allocation and conversion to
+ * QPU instructions, is to reduce register pressure by reordering instructions
+ * to consume values when possible.
+ */
+
+#include "vc4_qir.h"
+
+static bool debug;
+
+struct schedule_node {
+        struct list_head link;
+        struct qinst *inst;
+
+        struct schedule_node **children;
+        uint32_t child_count;
+        uint32_t child_array_size;
+        uint32_t parent_count;
+
+        /* Length of the longest (latency) chain from a DAG head to the this
+         * instruction.
+         */
+        uint32_t delay;
+
+        /* Longest time + latency_between(parent, this) of any parent of this
+         * node.
+         */
+        uint32_t unblocked_time;
+};
+
+struct schedule_state {
+        /* List of struct schedule_node *.  This starts out with all
+         * instructions, and after dependency updates it's trimmed to be just
+         * the DAG heads.
+         */
+        struct list_head worklist;
+
+        uint32_t time;
+
+        uint32_t *temp_writes;
+
+        BITSET_WORD *temp_live;
+};
+
+/* When walking the instructions in reverse, we need to swap before/after in
+ * add_dep().
+ */
+enum direction { F, R };
+
+/**
+ * Marks a dependency between two intructions, that @after must appear after
+ * @before.
+ *
+ * Our dependencies are tracked as a DAG.  Since we're scheduling bottom-up,
+ * the latest instructions with nothing left to schedule are the DAG heads,
+ * and their inputs are their children.
+ */
+static void
+add_dep(enum direction dir,
+        struct schedule_node *before,
+        struct schedule_node *after)
+{
+        if (!before || !after)
+                return;
+
+        assert(before != after);
+
+        if (dir == R) {
+                struct schedule_node *t = before;
+                before = after;
+                after = t;
+        }
+
+        for (int i = 0; i < after->child_count; i++) {
+                if (after->children[i] == after)
+                        return;
+        }
+
+        if (after->child_array_size <= after->child_count) {
+                after->child_array_size = MAX2(after->child_array_size * 2, 16);
+                after->children = reralloc(after, after->children,
+                                           struct schedule_node *,
+                                           after->child_array_size);
+        }
+
+        after->children[after->child_count] = before;
+        after->child_count++;
+        before->parent_count++;
+}
+
+static void
+add_write_dep(enum direction dir,
+              struct schedule_node **before,
+              struct schedule_node *after)
+{
+        add_dep(dir, *before, after);
+        *before = after;
+}
+
+struct schedule_setup_state {
+        struct schedule_node **last_temp_write;
+        struct schedule_node *last_sf;
+        struct schedule_node *last_vary_read;
+        struct schedule_node *last_vpm_read;
+        struct schedule_node *last_vpm_write;
+        struct schedule_node *last_tex_coord;
+        struct schedule_node *last_tex_result;
+        struct schedule_node *last_tlb;
+        enum direction dir;
+
+	/**
+         * Texture FIFO tracking.  This is done top-to-bottom, and is used to
+         * track the QOP_TEX_RESULTs and add dependencies on previous ones
+         * when trying to submit texture coords with TFREQ full or new texture
+         * fetches with TXRCV full.
+         */
+        struct {
+                struct schedule_node *node;
+                int coords;
+        } tex_fifo[8];
+        int tfreq_count; /**< Number of texture coords outstanding. */
+        int tfrcv_count; /**< Number of texture results outstanding. */
+        int tex_fifo_pos;
+};
+
+static void
+block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n)
+{
+        add_dep(state->dir, state->tex_fifo[0].node, n);
+
+        state->tfreq_count -= state->tex_fifo[0].coords;
+        state->tfrcv_count--;
+
+        memmove(&state->tex_fifo[0],
+                &state->tex_fifo[1],
+                state->tex_fifo_pos * sizeof(state->tex_fifo[0]));
+        state->tex_fifo_pos--;
+}
+
+/**
+ * Common code for dependencies that need to be tracked both forward and
+ * backward.
+ *
+ * This is for things like "all VPM reads have to happen in order."
+ */
+static void
+calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
+{
+        struct qinst *inst = n->inst;
+        enum direction dir = state->dir;
+
+
+        /* Add deps for temp registers and varyings accesses.  Note that we
+         * ignore uniforms accesses, because qir_reorder_uniforms() happens
+         * after this.
+         */
+        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                switch (inst->src[i].file) {
+                case QFILE_TEMP:
+                        add_dep(dir,
+                                state->last_temp_write[inst->src[i].index], n);
+                        break;
+
+                case QFILE_VARY:
+                        add_write_dep(dir, &state->last_vary_read, n);
+                        break;
+
+                case QFILE_VPM:
+                        add_write_dep(dir, &state->last_vpm_read, n);
+                        break;
+
+                default:
+                        break;
+                }
+        }
+
+        switch (inst->op) {
+        case QOP_VARY_ADD_C:
+                add_dep(dir, state->last_vary_read, n);
+                break;
+
+        case QOP_TEX_S:
+        case QOP_TEX_T:
+        case QOP_TEX_R:
+        case QOP_TEX_B:
+        case QOP_TEX_DIRECT:
+                /* Texturing setup gets scheduled in order, because
+                 * the uniforms referenced by them have to land in a
+                 * specific order.
+                 */
+                add_write_dep(dir, &state->last_tex_coord, n);
+                break;
+
+        case QOP_TEX_RESULT:
+                /* Results have to be fetched in order. */
+                add_write_dep(dir, &state->last_tex_result, n);
+                break;
+
+        case QOP_TLB_COLOR_WRITE:
+        case QOP_TLB_COLOR_READ:
+        case QOP_TLB_Z_WRITE:
+        case QOP_TLB_STENCIL_SETUP:
+        case QOP_MS_MASK:
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
+        case QOP_TLB_DISCARD_SETUP:
+                add_write_dep(dir, &state->last_sf, n);
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
+        default:
+                break;
+        }
+
+        if (inst->dst.file == QFILE_VPM)
+                add_write_dep(dir, &state->last_vpm_write, n);
+        else if (inst->dst.file == QFILE_TEMP)
+                add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
+
+        if (inst->sf)
+                add_write_dep(dir, &state->last_sf, n);
+
+        if (qir_depends_on_flags(inst)) {
+                add_dep(dir, state->last_sf, n);
+        }
+}
+
+static void
+calculate_forward_deps(struct vc4_compile *c, void *mem_ctx,
+                       struct list_head *schedule_list)
+{
+        struct schedule_setup_state state;
+
+        memset(&state, 0, sizeof(state));
+        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
+                                              c->num_temps);
+        state.dir = F;
+
+        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
+                struct qinst *inst = n->inst;
+
+                calculate_deps(&state, n);
+
+                switch (inst->op) {
+                case QOP_TEX_S:
+                case QOP_TEX_T:
+                case QOP_TEX_R:
+                case QOP_TEX_B:
+                case QOP_TEX_DIRECT:
+                        /* If the texture coordinate fifo is full,
+                         * block this on the last QOP_TEX_RESULT.
+                         */
+                        if (state.tfreq_count == 8) {
+                                block_until_tex_result(&state, n);
+                        }
+
+                        /* If the texture result fifo is full, block
+                         * adding any more to it until the last
+                         * QOP_TEX_RESULT.
+                         */
+                        if (inst->op == QOP_TEX_S ||
+                            inst->op == QOP_TEX_DIRECT) {
+                                if (state.tfrcv_count == 4)
+                                        block_until_tex_result(&state, n);
+                                state.tfrcv_count++;
+                        }
+
+                        state.tex_fifo[state.tex_fifo_pos].coords++;
+                        state.tfreq_count++;
+                        break;
+
+                case QOP_TEX_RESULT:
+                        /* Results have to be fetched after the
+                         * coordinate setup.  Note that we're assuming
+                         * here that our input shader has the texture
+                         * coord setup and result fetch in order,
+                         * which is true initially but not of our
+                         * instruction stream after this pass.
+                         */
+                        add_dep(state.dir, state.last_tex_coord, n);
+
+                        state.tex_fifo[state.tex_fifo_pos].node = n;
+
+                        state.tex_fifo_pos++;
+                        memset(&state.tex_fifo[state.tex_fifo_pos], 0,
+                               sizeof(state.tex_fifo[0]));
+                        break;
+                default:
+                        assert(!qir_is_tex(inst));
+                        break;
+                }
+        }
+}
+
+static void
+calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx,
+                       struct list_head *schedule_list)
+{
+        struct schedule_setup_state state;
+
+        memset(&state, 0, sizeof(state));
+        state.dir = R;
+        state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *,
+                                              c->num_temps);
+
+        list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) {
+                calculate_deps(&state, n);
+        }
+}
+
+static int
+get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
+{
+        int cost = 0;
+
+        if (inst->dst.file == QFILE_TEMP &&
+            state->temp_writes[inst->dst.index] == 1)
+                cost--;
+
+        for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                if (inst->src[i].file == QFILE_TEMP &&
+                    !BITSET_TEST(state->temp_live, inst->src[i].index)) {
+                        cost++;
+                }
+        }
+
+        return cost;
+}
+
+static bool
+locks_scoreboard(struct qinst *inst)
+{
+        switch (inst->op) {
+        case QOP_TLB_Z_WRITE:
+        case QOP_TLB_COLOR_WRITE:
+        case QOP_TLB_COLOR_WRITE_MS:
+        case QOP_TLB_COLOR_READ:
+                return true;
+        default:
+                return false;
+        }
+}
+
+static struct schedule_node *
+choose_instruction(struct schedule_state *state)
+{
+        struct schedule_node *chosen = NULL;
+
+        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+                if (!chosen) {
+                        chosen = n;
+                        continue;
+                }
+
+                /* Prefer scheduling things that lock the scoreboard, so that
+                 * they appear late in the program and we get more parallelism
+                 * between shaders on multiple QPUs hitting the same fragment.
+                 */
+                if (locks_scoreboard(n->inst) &&
+                    !locks_scoreboard(chosen->inst)) {
+                        chosen = n;
+                        continue;
+                } else if (!locks_scoreboard(n->inst) &&
+                           locks_scoreboard(chosen->inst)) {
+                        continue;
+                }
+
+                /* If we would block on the previously chosen node, but would
+                 * block less on this one, then then prefer it.
+                 */
+                if (chosen->unblocked_time > state->time &&
+                    n->unblocked_time < chosen->unblocked_time) {
+                        chosen = n;
+                        continue;
+                } else if (n->unblocked_time > state->time &&
+                           n->unblocked_time > chosen->unblocked_time) {
+                        continue;
+                }
+
+                /* If we can definitely reduce register pressure, do so
+                 * immediately.
+                 */
+                int register_pressure_cost =
+                        get_register_pressure_cost(state, n->inst);
+                int chosen_register_pressure_cost =
+                        get_register_pressure_cost(state, chosen->inst);
+
+                if (register_pressure_cost < chosen_register_pressure_cost) {
+                        chosen = n;
+                        continue;
+                } else if (register_pressure_cost >
+                           chosen_register_pressure_cost) {
+                        continue;
+                }
+
+                /* Otherwise, prefer instructions with the deepest chain to
+                 * the end of the program.  This avoids the problem of
+                 * "everything generates a temp, nothing finishes freeing one,
+                 * guess I'll just keep emitting varying mul/adds".
+                 */
+                if (n->delay > chosen->delay) {
+                        chosen = n;
+                        continue;
+                } else if (n->delay < chosen->delay) {
+                        continue;
+                }
+        }
+
+        return chosen;
+}
+
+static void
+dump_state(struct vc4_compile *c, struct schedule_state *state)
+{
+        uint32_t i = 0;
+        list_for_each_entry(struct schedule_node, n, &state->worklist, link) {
+                fprintf(stderr, "%3d: ", i++);
+                qir_dump_inst(c, n->inst);
+                fprintf(stderr, " (%d cost)\n",
+                        get_register_pressure_cost(state, n->inst));
+
+                for (int i = 0; i < n->child_count; i++) {
+                        struct schedule_node *child = n->children[i];
+                        fprintf(stderr, "   - ");
+                        qir_dump_inst(c, child->inst);
+                        fprintf(stderr, " (%d parents)\n", child->parent_count);
+                }
+        }
+}
+
+/* Estimate of how many instructions we should schedule between operations.
+ *
+ * These aren't in real cycle counts, because we're just estimating cycle
+ * times anyway.  QIR instructions will get paired up when turned into QPU
+ * instructions, or extra NOP delays will have to be added due to register
+ * allocation choices.
+ */
+static uint32_t
+latency_between(struct schedule_node *before, struct schedule_node *after)
+{
+        if ((before->inst->op == QOP_TEX_S ||
+             before->inst->op == QOP_TEX_DIRECT) &&
+            after->inst->op == QOP_TEX_RESULT)
+                return 100;
+
+        return 1;
+}
+
+/** Recursive computation of the delay member of a node. */
+static void
+compute_delay(struct schedule_node *n)
+{
+        if (!n->child_count) {
+                /* The color read needs to be scheduled late, to avoid locking
+                 * the scoreboard early.  This is our best tool for
+                 * encouraging that.  The other scoreboard locking ops will
+                 * have this happen by default, since they are generally the
+                 * DAG heads or close to them.
+                 */
+                if (n->inst->op == QOP_TLB_COLOR_READ)
+                        n->delay = 1000;
+                else
+                        n->delay = 1;
+        } else {
+                for (int i = 0; i < n->child_count; i++) {
+                        if (!n->children[i]->delay)
+                                compute_delay(n->children[i]);
+                        n->delay = MAX2(n->delay,
+                                        n->children[i]->delay +
+                                        latency_between(n, n->children[i]));
+                }
+        }
+}
+
+static void
+schedule_instructions(struct vc4_compile *c, struct schedule_state *state)
+{
+        if (debug) {
+                fprintf(stderr, "initial deps:\n");
+                dump_state(c, state);
+        }
+
+        /* Remove non-DAG heads from the list. */
+        list_for_each_entry_safe(struct schedule_node, n,
+                                 &state->worklist, link) {
+                if (n->parent_count != 0)
+                        list_del(&n->link);
+        }
+
+        state->time = 0;
+        while (!list_empty(&state->worklist)) {
+                struct schedule_node *chosen = choose_instruction(state);
+                struct qinst *inst = chosen->inst;
+
+                if (debug) {
+                        fprintf(stderr, "current list:\n");
+                        dump_state(c, state);
+                        fprintf(stderr, "chose: ");
+                        qir_dump_inst(c, inst);
+                        fprintf(stderr, " (%d cost)\n",
+                                get_register_pressure_cost(state, inst));
+                }
+
+                state->time = MAX2(state->time, chosen->unblocked_time);
+
+                /* Schedule this instruction back onto the QIR list. */
+                list_del(&chosen->link);
+                list_add(&inst->link, &c->instructions);
+
+                /* Now that we've scheduled a new instruction, some of its
+                 * children can be promoted to the list of instructions ready to
+                 * be scheduled.  Update the children's unblocked time for this
+                 * DAG edge as we do so.
+                 */
+                for (int i = chosen->child_count - 1; i >= 0; i--) {
+                        struct schedule_node *child = chosen->children[i];
+
+                        child->unblocked_time = MAX2(child->unblocked_time,
+                                                     state->time +
+                                                     latency_between(chosen,
+                                                                     child));
+                        child->parent_count--;
+                        if (child->parent_count == 0)
+                                list_add(&child->link, &state->worklist);
+                }
+
+                /* Update our tracking of register pressure. */
+                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                        if (inst->src[i].file == QFILE_TEMP)
+                                BITSET_SET(state->temp_live, inst->src[i].index);
+                }
+                if (inst->dst.file == QFILE_TEMP) {
+                        state->temp_writes[inst->dst.index]--;
+                        if (state->temp_writes[inst->dst.index] == 0)
+                                BITSET_CLEAR(state->temp_live, inst->dst.index);
+                }
+
+                state->time++;
+        }
+}
+
+void
+qir_schedule_instructions(struct vc4_compile *c)
+{
+        void *mem_ctx = ralloc_context(NULL);
+        struct schedule_state state = { 0 };
+
+        if (debug) {
+                fprintf(stderr, "Pre-schedule instructions\n");
+                qir_dump(c);
+        }
+
+        state.temp_writes = rzalloc_array(mem_ctx, uint32_t, c->num_temps);
+        state.temp_live = rzalloc_array(mem_ctx, BITSET_WORD,
+                                        BITSET_WORDS(c->num_temps));
+        list_inithead(&state.worklist);
+
+        /* Wrap each instruction in a scheduler structure. */
+        list_for_each_entry_safe(struct qinst, inst, &c->instructions, link) {
+                struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
+
+                n->inst = inst;
+                list_del(&inst->link);
+                list_addtail(&n->link, &state.worklist);
+
+                if (inst->dst.file == QFILE_TEMP)
+                        state.temp_writes[inst->dst.index]++;
+        }
+
+        /* Dependencies tracked top-to-bottom. */
+        calculate_forward_deps(c, mem_ctx, &state.worklist);
+        /* Dependencies tracked bottom-to-top. */
+        calculate_reverse_deps(c, mem_ctx, &state.worklist);
+
+        list_for_each_entry(struct schedule_node, n, &state.worklist, link)
+                compute_delay(n);
+
+        schedule_instructions(c, &state);
+
+        if (debug) {
+                fprintf(stderr, "Post-schedule instructions\n");
+                qir_dump(c);
+        }
+
+        ralloc_free(mem_ctx);
+}
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 98b7b6070d7..09164b74932 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -50,7 +50,7 @@ struct schedule_node {
         uint32_t child_array_size;
         uint32_t parent_count;
 
-        /* Longest cycles + n->latency of any parent of this node. */
+        /* Longest cycles + instruction_latency() of any parent of this node. */
         uint32_t unblocked_time;
 
         /**
@@ -259,7 +259,8 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n,
                 }
         } else if (is_tmu_write(waddr)) {
                 add_write_dep(state, &state->last_tmu_write, n);
-        } else if (qpu_waddr_is_tlb(waddr)) {
+        } else if (qpu_waddr_is_tlb(waddr) ||
+                   waddr == QPU_W_MS_FLAGS) {
                 add_write_dep(state, &state->last_tlb, n);
         } else {
                 switch (waddr) {
@@ -623,6 +624,46 @@ dump_state(struct list_head *schedule_list)
         }
 }
 
+static uint32_t waddr_latency(uint32_t waddr, uint64_t after)
+{
+        if (waddr < 32)
+                return 2;
+
+        /* Apply some huge latency between texture fetch requests and getting
+         * their results back.
+         */
+        if (waddr == QPU_W_TMU0_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0)
+                        return 100;
+        }
+        if (waddr == QPU_W_TMU1_S) {
+                if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1)
+                        return 100;
+        }
+
+        switch(waddr) {
+        case QPU_W_SFU_RECIP:
+        case QPU_W_SFU_RECIPSQRT:
+        case QPU_W_SFU_EXP:
+        case QPU_W_SFU_LOG:
+                return 3;
+        default:
+                return 1;
+        }
+}
+
+static uint32_t
+instruction_latency(struct schedule_node *before, struct schedule_node *after)
+{
+        uint64_t before_inst = before->inst->inst;
+        uint64_t after_inst = after->inst->inst;
+
+        return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD),
+                                  after_inst),
+                    waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL),
+                                  after_inst));
+}
+
 /** Recursive computation of the delay member of a node. */
 static void
 compute_delay(struct schedule_node *n)
@@ -634,7 +675,8 @@ compute_delay(struct schedule_node *n)
                         if (!n->children[i].node->delay)
                                 compute_delay(n->children[i].node);
                         n->delay = MAX2(n->delay,
-                                        n->children[i].node->delay + n->latency);
+                                        n->children[i].node->delay +
+                                        instruction_latency(n, n->children[i].node));
                 }
         }
 }
@@ -663,9 +705,14 @@ mark_instruction_scheduled(struct list_head *schedule_list,
                  * immediately after (or paired with!) the thing reading the
                  * destination.
                  */
-                int latency_from_previous = war_only ? 0 : node->latency;
+                uint32_t latency = 0;
+                if (!war_only) {
+                        latency = instruction_latency(node,
+                                                      node->children[i].node);
+                }
+
                 child->unblocked_time = MAX2(child->unblocked_time,
-                                             time + latency_from_previous);
+                                             time + latency);
                 child->parent_count--;
                 if (child->parent_count == 0)
                         list_add(&child->link, schedule_list);
@@ -798,33 +845,6 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
         return time;
 }
 
-static uint32_t waddr_latency(uint32_t waddr)
-{
-        if (waddr < 32)
-                return 2;
-
-        /* Some huge number, really. */
-        if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
-                return 100;
-
-        switch(waddr) {
-        case QPU_W_SFU_RECIP:
-        case QPU_W_SFU_RECIPSQRT:
-        case QPU_W_SFU_EXP:
-        case QPU_W_SFU_LOG:
-                return 3;
-        default:
-                return 1;
-        }
-}
-
-static uint32_t
-instruction_latency(uint64_t inst)
-{
-        return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)),
-                    waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
-}
-
 uint32_t
 qpu_schedule_instructions(struct vc4_compile *c)
 {
@@ -851,7 +871,6 @@ qpu_schedule_instructions(struct vc4_compile *c)
                 struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
 
                 n->inst = inst;
-                n->latency = instruction_latency(inst->inst);
 
                 if (reads_uniform(inst->inst)) {
                         n->uniform = next_uniform++;
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index e7069a4e9ff..9e6678a0625 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -207,7 +207,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
         /* If the resource is multisampled, we need to resolve to single
          * sample.  This seems like it should be handled at a higher layer.
          */
-        if (prsc->nr_samples) {
+        if (prsc->nr_samples > 1) {
                 trans->ss_resource = vc4_get_temp_resource(pctx, prsc, box);
                 if (!trans->ss_resource)
                         goto fail;
@@ -377,7 +377,7 @@ vc4_setup_slices(struct vc4_resource *rsc)
 
                 if (!rsc->tiled) {
                         slice->tiling = VC4_TILING_FORMAT_LINEAR;
-                        if (prsc->nr_samples) {
+                        if (prsc->nr_samples > 1) {
                                 /* MSAA (4x) surfaces are stored as raw tile buffer contents. */
                                 level_width = align(level_width, 32);
                                 level_height = align(level_height, 32);
@@ -458,7 +458,7 @@ vc4_resource_setup(struct pipe_screen *pscreen,
         prsc->screen = pscreen;
 
         rsc->base.vtbl = &vc4_resource_vtbl;
-        if (prsc->nr_samples == 0)
+        if (prsc->nr_samples <= 1)
                 rsc->cpp = util_format_get_blocksize(tmpl->format);
         else
                 rsc->cpp = sizeof(uint32_t);
@@ -475,7 +475,7 @@ get_resource_texture_format(struct pipe_resource *prsc)
         uint8_t format = vc4_get_tex_format(prsc->format);
 
         if (!rsc->tiled) {
-                if (prsc->nr_samples) {
+                if (prsc->nr_samples > 1) {
                         return ~0;
                 } else {
                         assert(format == VC4_TEXTURE_TYPE_RGBA8888);
@@ -497,7 +497,7 @@ vc4_resource_create(struct pipe_screen *pscreen,
          * communicate metadata about tiling currently.
          */
         if (tmpl->target == PIPE_BUFFER ||
-            tmpl->nr_samples ||
+            tmpl->nr_samples > 1 ||
             (tmpl->bind & (PIPE_BIND_SCANOUT |
                            PIPE_BIND_LINEAR |
                            PIPE_BIND_SHARED |
@@ -832,7 +832,7 @@ vc4_dump_surface(struct pipe_surface *psurf)
         if (!psurf)
                 return;
 
-        if (psurf->texture->nr_samples)
+        if (psurf->texture->nr_samples > 1)
                 vc4_dump_surface_msaa(psurf);
         else
                 vc4_dump_surface_non_msaa(psurf);
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 090579c2c76..8ddf0865d21 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -57,6 +57,10 @@ static const struct debug_named_value debug_options[] = {
           "Flush after each draw call" },
         { "always_sync", VC4_DEBUG_ALWAYS_SYNC,
           "Wait for finish after each flush" },
+#if USE_VC4_SIMULATOR
+        { "dump", VC4_DEBUG_DUMP,
+          "Write a GPU command stream trace file" },
+#endif
         { NULL }
 };
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h
index 5992e371093..03f76b257e3 100644
--- a/src/gallium/drivers/vc4/vc4_screen.h
+++ b/src/gallium/drivers/vc4/vc4_screen.h
@@ -41,6 +41,7 @@ struct vc4_bo;
 #define VC4_DEBUG_ALWAYS_FLUSH 0x0080
 #define VC4_DEBUG_ALWAYS_SYNC  0x0100
 #define VC4_DEBUG_NIR       0x0200
+#define VC4_DEBUG_DUMP      0x0400
 
 #define VC4_MAX_MIP_LEVELS 12
 #define VC4_MAX_TEXTURE_SAMPLERS 16
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 4b1df9234b6..521ef50f814 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -131,6 +131,93 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
         return 0;
 }
 
+static void
+vc4_dump_to_file(struct vc4_exec_info *exec)
+{
+        static int dumpno = 0;
+        struct drm_vc4_get_hang_state *state;
+        struct drm_vc4_get_hang_state_bo *bo_state;
+        unsigned int dump_version = 0;
+
+        if (!(vc4_debug & VC4_DEBUG_DUMP))
+                return;
+
+        state = calloc(1, sizeof(*state));
+
+        int unref_count = 0;
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                unref_count++;
+        }
+
+        /* Add one more for the overflow area that isn't wrapped in a BO. */
+        state->bo_count = exec->bo_count + unref_count + 1;
+        bo_state = calloc(state->bo_count, sizeof(*bo_state));
+
+        char *filename = NULL;
+        asprintf(&filename, "vc4-dri-%d.dump", dumpno++);
+        FILE *f = fopen(filename, "w+");
+        if (!f) {
+                fprintf(stderr, "Couldn't open %s: %s", filename,
+                        strerror(errno));
+                return;
+        }
+
+        fwrite(&dump_version, sizeof(dump_version), 1, f);
+
+        state->ct0ca = exec->ct0ca;
+        state->ct0ea = exec->ct0ea;
+        state->ct1ca = exec->ct1ca;
+        state->ct1ea = exec->ct1ea;
+        state->start_bin = exec->ct0ca;
+        state->start_render = exec->ct1ca;
+        fwrite(state, sizeof(*state), 1, f);
+
+        int i;
+        for (i = 0; i < exec->bo_count; i++) {
+                struct drm_gem_cma_object *cma_bo = exec->bo[i];
+                bo_state[i].handle = i; /* Not used by the parser. */
+                bo_state[i].paddr = cma_bo->paddr;
+                bo_state[i].size = cma_bo->base.size;
+        }
+
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                struct drm_gem_cma_object *cma_bo = &bo->base;
+                bo_state[i].handle = 0;
+                bo_state[i].paddr = cma_bo->paddr;
+                bo_state[i].size = cma_bo->base.size;
+                i++;
+        }
+
+        /* Add the static overflow memory area. */
+        bo_state[i].handle = exec->bo_count;
+        bo_state[i].paddr = 0;
+        bo_state[i].size = OVERFLOW_SIZE;
+        i++;
+
+        fwrite(bo_state, sizeof(*bo_state), state->bo_count, f);
+
+        for (int i = 0; i < exec->bo_count; i++) {
+                struct drm_gem_cma_object *cma_bo = exec->bo[i];
+                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+        }
+
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
+                                 unref_head) {
+                struct drm_gem_cma_object *cma_bo = &bo->base;
+                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
+        }
+
+        void *overflow = calloc(1, OVERFLOW_SIZE);
+        fwrite(overflow, 1, OVERFLOW_SIZE, f);
+        free(overflow);
+
+        free(state);
+        free(bo_state);
+        fclose(f);
+}
+
 int
 vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
 {
@@ -183,6 +270,8 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
                             exec.ct1ea - exec.ct1ca, true);
         }
 
+        vc4_dump_to_file(&exec);
+
         if (exec.ct0ca != exec.ct0ea) {
                 int bfc = simpenrose_do_binning(exec.ct0ca, exec.ct0ea);
                 if (bfc != 1) {
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h
index 40d3ada6ca2..e1f8b5a305e 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -45,7 +45,7 @@ struct vc4_exec_info;
 #define roundup(x, y) align(x, y)
 #define round_up(x, y) align(x, y)
 #define max(x, y) MAX2(x, y)
-#define min(x, y) MiN2(x, y)
+#define min(x, y) MIN2(x, y)
 #define BUG_ON(condition) assert(!(condition))
 
 static inline int
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index d9c0f55b23d..7e0ada7d0fa 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -462,9 +462,9 @@ vc4_set_framebuffer_state(struct pipe_context *pctx,
 
         vc4->msaa = false;
         if (cso->cbufs[0])
-                vc4->msaa = cso->cbufs[0]->texture->nr_samples != 0;
+                vc4->msaa = cso->cbufs[0]->texture->nr_samples > 1;
         else if (cso->zsbuf)
-                vc4->msaa = cso->zsbuf->texture->nr_samples != 0;
+                vc4->msaa = cso->zsbuf->texture->nr_samples > 1;
 
         if (vc4->msaa) {
                 vc4->tile_width = 32;
diff --git a/src/gallium/state_trackers/osmesa/osmesa.c b/src/gallium/state_trackers/osmesa/osmesa.c
index 0f27ba87f8a..38d684b8e9c 100644
--- a/src/gallium/state_trackers/osmesa/osmesa.c
+++ b/src/gallium/state_trackers/osmesa/osmesa.c
@@ -544,11 +544,39 @@ GLAPI OSMesaContext GLAPIENTRY
 OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits,
                        GLint accumBits, OSMesaContext sharelist)
 {
+   int attribs[100], n = 0;
+
+   attribs[n++] = OSMESA_FORMAT;
+   attribs[n++] = format;
+   attribs[n++] = OSMESA_DEPTH_BITS;
+   attribs[n++] = depthBits;
+   attribs[n++] = OSMESA_STENCIL_BITS;
+   attribs[n++] = stencilBits;
+   attribs[n++] = OSMESA_ACCUM_BITS;
+   attribs[n++] = accumBits;
+   attribs[n++] = 0;
+
+   return OSMesaCreateContextAttribs(attribs, sharelist);
+}
+
+
+/**
+ * New in Mesa 11.2
+ *
+ * Create context with attribute list.
+ */
+GLAPI OSMesaContext GLAPIENTRY
+OSMesaCreateContextAttribs(const int *attribList, OSMesaContext sharelist)
+{
    OSMesaContext osmesa;
    struct st_context_iface *st_shared;
    enum st_context_error st_error = 0;
    struct st_context_attribs attribs;
    struct st_api *stapi = get_st_api();
+   GLenum format = GL_RGBA;
+   int depthBits = 0, stencilBits = 0, accumBits = 0;
+   int profile = OSMESA_COMPAT_PROFILE, version_major = 1, version_minor = 0;
+   int i;
 
    if (sharelist) {
       st_shared = sharelist->stctx;
@@ -557,6 +585,64 @@ OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits,
       st_shared = NULL;
    }
 
+   for (i = 0; attribList[i]; i += 2) {
+      switch (attribList[i]) {
+      case OSMESA_FORMAT:
+         format = attribList[i+1];
+         switch (format) {
+         case OSMESA_COLOR_INDEX:
+         case OSMESA_RGBA:
+         case OSMESA_BGRA:
+         case OSMESA_ARGB:
+         case OSMESA_RGB:
+         case OSMESA_BGR:
+         case OSMESA_RGB_565:
+            /* legal */
+            break;
+         default:
+            return NULL;
+         }
+         break;
+      case OSMESA_DEPTH_BITS:
+         depthBits = attribList[i+1];
+         if (depthBits < 0)
+            return NULL;
+         break;
+      case OSMESA_STENCIL_BITS:
+         stencilBits = attribList[i+1];
+         if (stencilBits < 0)
+            return NULL;
+         break;
+      case OSMESA_ACCUM_BITS:
+         accumBits = attribList[i+1];
+         if (accumBits < 0)
+            return NULL;
+         break;
+      case OSMESA_PROFILE:
+         profile = attribList[i+1];
+         if (profile != OSMESA_CORE_PROFILE &&
+             profile != OSMESA_COMPAT_PROFILE)
+            return NULL;
+         break;
+      case OSMESA_CONTEXT_MAJOR_VERSION:
+         version_major = attribList[i+1];
+         if (version_major < 1)
+            return NULL;
+         break;
+      case OSMESA_CONTEXT_MINOR_VERSION:
+         version_minor = attribList[i+1];
+         if (version_minor < 0)
+            return NULL;
+         break;
+      case 0:
+         /* end of list */
+         break;
+      default:
+         fprintf(stderr, "Bad attribute in OSMesaCreateContextAttribs()\n");
+         return NULL;
+      }
+   }
+
    osmesa = (OSMesaContext) CALLOC_STRUCT(osmesa_context);
    if (!osmesa)
       return NULL;
@@ -581,9 +667,11 @@ OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits,
    /*
     * Create the rendering context
     */
-   attribs.profile = ST_PROFILE_DEFAULT;
-   attribs.major = 2;
-   attribs.minor = 1;
+   memset(&attribs, 0, sizeof(attribs));
+   attribs.profile = (profile == OSMESA_CORE_PROFILE)
+      ? ST_PROFILE_OPENGL_CORE : ST_PROFILE_DEFAULT;
+   attribs.major = version_major;
+   attribs.minor = version_minor;
    attribs.flags = 0;  /* ST_CONTEXT_FLAG_x */
    attribs.options.force_glsl_extensions_warn = FALSE;
    attribs.options.disable_blend_func_extended = FALSE;
@@ -614,6 +702,7 @@ OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits,
 }
 
 
+
 /**
  * Destroy an Off-Screen Mesa rendering context.
  *
@@ -883,6 +972,7 @@ struct name_function
 static struct name_function functions[] = {
    { "OSMesaCreateContext", (OSMESAproc) OSMesaCreateContext },
    { "OSMesaCreateContextExt", (OSMESAproc) OSMesaCreateContextExt },
+   { "OSMesaCreateContextAttribs", (OSMESAproc) OSMesaCreateContextAttribs },
    { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext },
    { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent },
    { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext },
diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c
index 769305e2999..8de79352b7c 100644
--- a/src/gallium/state_trackers/va/buffer.c
+++ b/src/gallium/state_trackers/va/buffer.c
@@ -212,6 +212,7 @@ VAStatus
 vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
                         VABufferInfo *out_buf_info)
 {
+   vlVaDriver *drv;
    uint32_t i;
    uint32_t mem_type;
    vlVaBuffer *buf ;
@@ -255,13 +256,9 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
    if (!buf->derived_surface.resource)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
+   drv = VL_VA_DRIVER(ctx);
    screen = VL_VA_PSCREEN(ctx);
 
-   if (buf->derived_surface.fence) {
-      screen->fence_finish(screen, buf->derived_surface.fence, PIPE_TIMEOUT_INFINITE);
-      screen->fence_reference(screen, &buf->derived_surface.fence, NULL);
-   }
-
    if (buf->export_refcount > 0) {
       if (buf->export_state.mem_type != mem_type)
          return VA_STATUS_ERROR_INVALID_PARAMETER;
@@ -272,6 +269,8 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
       case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: {
          struct winsys_handle whandle;
 
+         drv->pipe->flush(drv->pipe, NULL, 0);
+
          memset(&whandle, 0, sizeof(whandle));
          whandle.type = DRM_API_HANDLE_TYPE_FD;
 
diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c
index ae07da857e1..ccc263f77a7 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -264,9 +264,8 @@ vlVaDeriveImage(VADriverContextP ctx, VASurfaceID surface, VAImage *image)
    img->image_id = handle_table_add(drv->htab, img);
 
    img_buf->type = VAImageBufferType;
-   img_buf->size = image->data_size;
+   img_buf->size = img->data_size;
    img_buf->num_elements = 1;
-   img_buf->derived_surface.fence = surf->fence;
 
    pipe_resource_reference(&img_buf->derived_surface.resource, surfaces[0]->texture);
 
diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index 8623139b5be..7b30bf87d75 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -92,7 +92,6 @@ vlVaGetReferenceFrame(vlVaDriver *drv, VASurfaceID surface_id,
 static VAStatus
 handlePictureParameterBuffer(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
 {
-   unsigned int i;
    VAStatus vaStatus = VA_STATUS_SUCCESS;
 
    switch (u_reduce_video_profile(context->templat.profile)) {
diff --git a/src/gallium/state_trackers/va/picture_hevc.c b/src/gallium/state_trackers/va/picture_hevc.c
index dc66b0f3fd5..28743ee7aa6 100644
--- a/src/gallium/state_trackers/va/picture_hevc.c
+++ b/src/gallium/state_trackers/va/picture_hevc.c
@@ -159,11 +159,6 @@ void vlVaHandlePictureParameterBufferHEVC(vlVaDriver *drv, vlVaContext *context,
    for (i = 0 ; i < 15 ; i++) {
       context->desc.h265.PicOrderCntVal[i] = hevc->ReferenceFrames[i].pic_order_cnt;
 
-      unsigned int index = hevc->ReferenceFrames[i].picture_id & 0x7F;
-
-      if (index == 0x7F)
-         continue;
-
       vlVaGetReferenceFrame(drv, hevc->ReferenceFrames[i].picture_id, &context->desc.h265.ref[i]);
 
       if ((hevc->ReferenceFrames[i].flags & VA_PICTURE_HEVC_RPS_ST_CURR_BEFORE) && (iBefore < 8)) {
diff --git a/src/gallium/state_trackers/va/postproc.c b/src/gallium/state_trackers/va/postproc.c
index 1fdb4e79f4c..15053a9ac3e 100644
--- a/src/gallium/state_trackers/va/postproc.c
+++ b/src/gallium/state_trackers/va/postproc.c
@@ -25,25 +25,35 @@
  *
  **************************************************************************/
 
-//#include "pipe/p_video_codec.h"
-
 #include "util/u_handle_table.h"
-//#include "util/u_video.h"
-
-//#include "vl/vl_vlc.h"
-//#include "vl/vl_winsys.h"
 
 #include "va_private.h"
 
+static const VARectangle *
+vlVaRegionDefault(const VARectangle *region, struct pipe_video_buffer *buf,
+		  VARectangle *def)
+{
+   if (region)
+      return region;
+
+   def->x = 0;
+   def->y = 0;
+   def->width = buf->width;
+   def->height = buf->height;
+
+   return def;
+}
+
 VAStatus
 vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
 {
+   VARectangle def_src_region, def_dst_region;
+   const VARectangle *src_region, *dst_region;
    struct u_rect src_rect;
    struct u_rect dst_rect;
    vlVaSurface *src_surface;
    VAProcPipelineParameterBuffer *pipeline_param;
    struct pipe_surface **surfaces;
-   struct pipe_screen *screen;
    struct pipe_surface *psurf;
 
    if (!drv || !context)
@@ -66,27 +76,28 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex
    if (!surfaces || !surfaces[0])
       return VA_STATUS_ERROR_INVALID_SURFACE;
 
-   screen = drv->pipe->screen;
-
    psurf = surfaces[0];
 
-   src_rect.x0 = pipeline_param->surface_region->x;
-   src_rect.y0 = pipeline_param->surface_region->y;
-   src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width;
-   src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height;
+   src_region = vlVaRegionDefault(pipeline_param->surface_region, src_surface->buffer, &def_src_region);
+   dst_region = vlVaRegionDefault(pipeline_param->output_region, context->target, &def_dst_region);
+
+   src_rect.x0 = src_region->x;
+   src_rect.y0 = src_region->y;
+   src_rect.x1 = src_region->x + src_region->width;
+   src_rect.y1 = src_region->y + src_region->height;
 
-   dst_rect.x0 = pipeline_param->output_region->x;
-   dst_rect.y0 = pipeline_param->output_region->y;
-   dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width;
-   dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height;
+   dst_rect.x0 = dst_region->x;
+   dst_rect.y0 = dst_region->y;
+   dst_rect.x1 = dst_region->x + dst_region->width;
+   dst_rect.y1 = dst_region->y + dst_region->height;
 
    vl_compositor_clear_layers(&drv->cstate);
    vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
    vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect);
    vl_compositor_render(&drv->cstate, &drv->compositor, psurf, NULL, false);
 
-   screen->fence_reference(screen, &src_surface->fence, NULL);
-   drv->pipe->flush(drv->pipe, &src_surface->fence, 0);
+   // TODO: figure out why this is necessary for DMA-buf sharing
+   drv->pipe->flush(drv->pipe, NULL, 0);
 
    return VA_STATUS_SUCCESS;
 }
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index c052c8f2284..5ddaf04a8c7 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -72,8 +72,6 @@ vlVaDestroySurfaces(VADriverContextP ctx, VASurfaceID *surface_list, int num_sur
       vlVaSurface *surf = handle_table_get(drv->htab, surface_list[i]);
       if (surf->buffer)
          surf->buffer->destroy(surf->buffer);
-      if(surf->fence)
-         drv->pipe->screen->fence_reference(drv->pipe->screen, &surf->fence, NULL);
       util_dynarray_fini(&surf->subpics);
       FREE(surf);
       handle_table_remove(drv->htab, surface_list[i]);
@@ -245,11 +243,6 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s
    screen = drv->pipe->screen;
    vscreen = drv->vscreen;
 
-   if(surf->fence) {
-      screen->fence_finish(screen, surf->fence, PIPE_TIMEOUT_INFINITE);
-      screen->fence_reference(screen, &surf->fence, NULL);
-   }
-
    tex = vscreen->texture_from_drawable(vscreen, draw);
    if (!tex)
       return VA_STATUS_ERROR_INVALID_DISPLAY;
@@ -281,8 +274,7 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s
    screen->flush_frontbuffer(screen, tex, 0, 0,
                              vscreen->get_private(vscreen), NULL);
 
-   screen->fence_reference(screen, &surf->fence, NULL);
-   drv->pipe->flush(drv->pipe, &surf->fence, 0);
+   drv->pipe->flush(drv->pipe, NULL, 0);
 
    pipe_resource_reference(&tex, NULL);
    pipe_surface_reference(&surf_draw, NULL);
@@ -697,11 +689,11 @@ vlVaQueryVideoProcFilterCaps(VADriverContextP ctx, VAContextID context,
    return VA_STATUS_SUCCESS;
 }
 
-static VAProcColorStandardType vpp_input_color_standards[VAProcColorStandardCount] = {
+static VAProcColorStandardType vpp_input_color_standards[] = {
    VAProcColorStandardBT601
 };
 
-static VAProcColorStandardType vpp_output_color_standards[VAProcColorStandardCount] = {
+static VAProcColorStandardType vpp_output_color_standards[] = {
    VAProcColorStandardBT601
 };
 
@@ -725,9 +717,9 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context,
    pipeline_cap->filter_flags = 0;
    pipeline_cap->num_forward_references = 0;
    pipeline_cap->num_backward_references = 0;
-   pipeline_cap->num_input_color_standards = 1;
+   pipeline_cap->num_input_color_standards = Elements(vpp_input_color_standards);
    pipeline_cap->input_color_standards = vpp_input_color_standards;
-   pipeline_cap->num_output_color_standards = 1;
+   pipeline_cap->num_output_color_standards = Elements(vpp_output_color_standards);
    pipeline_cap->output_color_standards = vpp_output_color_standards;
 
    for (i = 0; i < num_filters; i++) {
diff --git a/src/gallium/state_trackers/va/va_private.h b/src/gallium/state_trackers/va/va_private.h
index 6739efc8c3b..fa6e0fb301e 100644
--- a/src/gallium/state_trackers/va/va_private.h
+++ b/src/gallium/state_trackers/va/va_private.h
@@ -244,7 +244,6 @@ typedef struct {
    struct {
       struct pipe_resource *resource;
       struct pipe_transfer *transfer;
-      struct pipe_fence_handle *fence;
    } derived_surface;
    unsigned int export_refcount;
    VABufferInfo export_state;
@@ -252,7 +251,6 @@ typedef struct {
 
 typedef struct {
    struct pipe_video_buffer templat, *buffer;
-   struct pipe_fence_handle *fence;
    struct util_dynarray subpics; /* vlVaSubpicture */
 } vlVaSurface;
 
diff --git a/src/gallium/targets/opencl/Makefile.am b/src/gallium/targets/opencl/Makefile.am
index 08f95e8074c..f3ba1e32d1b 100644
--- a/src/gallium/targets/opencl/Makefile.am
+++ b/src/gallium/targets/opencl/Makefile.am
@@ -2,6 +2,9 @@ include $(top_srcdir)/src/gallium/Automake.inc
 
 lib_LTLIBRARIES = lib@[email protected]
 
+AM_CPPFLAGS = \
+        $(LIBELF_CFLAGS)
+
 lib@OPENCL_LIBNAME@_la_LDFLAGS = \
 	$(LLVM_LDFLAGS) \
 	-no-undefined \
@@ -19,7 +22,7 @@ lib@OPENCL_LIBNAME@_la_LIBADD = \
 	$(top_builddir)/src/gallium/state_trackers/clover/libclover.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
-	$(ELF_LIB) \
+	$(LIBELF_LIBS) \
 	$(DLOPEN_LIBS) \
 	-lclangCodeGen \
 	-lclangFrontendTool \
diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
index c6603e38a00..c44424f4f4a 100644
--- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
+++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
@@ -13,6 +13,9 @@
 #include "nouveau/nouveau_winsys.h"
 #include "nouveau/nouveau_screen.h"
 
+#include <nvif/class.h>
+#include <nvif/cl0080.h>
+
 static struct util_hash_table *fd_tab = NULL;
 
 pipe_static_mutex(nouveau_screen_mutex);
@@ -27,7 +30,7 @@ bool nouveau_drm_screen_unref(struct nouveau_screen *screen)
 	ret = --screen->refcount;
 	assert(ret >= 0);
 	if (ret == 0)
-		util_hash_table_remove(fd_tab, intptr_to_pointer(screen->device->fd));
+		util_hash_table_remove(fd_tab, intptr_to_pointer(screen->drm->fd));
 	pipe_mutex_unlock(nouveau_screen_mutex);
 	return ret == 0;
 }
@@ -57,16 +60,19 @@ static int compare_fd(void *key1, void *key2)
 PUBLIC struct pipe_screen *
 nouveau_drm_screen_create(int fd)
 {
+	struct nouveau_drm *drm = NULL;
 	struct nouveau_device *dev = NULL;
-	struct pipe_screen *(*init)(struct nouveau_device *);
-	struct nouveau_screen *screen;
-	int ret, dupfd = -1;
+	struct nouveau_screen *(*init)(struct nouveau_device *);
+	struct nouveau_screen *screen = NULL;
+	int ret, dupfd;
 
 	pipe_mutex_lock(nouveau_screen_mutex);
 	if (!fd_tab) {
 		fd_tab = util_hash_table_create(hash_fd, compare_fd);
-		if (!fd_tab)
-			goto err;
+		if (!fd_tab) {
+			pipe_mutex_unlock(nouveau_screen_mutex);
+			return NULL;
+		}
 	}
 
 	screen = util_hash_table_get(fd_tab, intptr_to_pointer(fd));
@@ -86,7 +92,15 @@ nouveau_drm_screen_create(int fd)
 	 * creation error.
 	 */
 	dupfd = dup(fd);
-	ret = nouveau_device_wrap(dupfd, 1, &dev);
+
+	ret = nouveau_drm_new(dupfd, &drm);
+	if (ret)
+		goto err;
+
+	ret = nouveau_device_new(&drm->client, NV_DEVICE,
+				 &(struct nv_device_v0) {
+					.device = ~0ULL,
+				 }, sizeof(struct nv_device_v0), &dev);
 	if (ret)
 		goto err;
 
@@ -116,8 +130,8 @@ nouveau_drm_screen_create(int fd)
 		goto err;
 	}
 
-	screen = (struct nouveau_screen*)init(dev);
-	if (!screen)
+	screen = init(dev);
+	if (!screen || !screen->base.context_create)
 		goto err;
 
 	/* Use dupfd in hash table, to avoid errors if the original fd gets
@@ -130,10 +144,13 @@ nouveau_drm_screen_create(int fd)
 	return &screen->base;
 
 err:
-	if (dev)
+	if (screen) {
+		screen->base.destroy(&screen->base);
+	} else {
 		nouveau_device_del(&dev);
-	else if (dupfd >= 0)
+		nouveau_drm_del(&drm);
 		close(dupfd);
+	}
 	pipe_mutex_unlock(nouveau_screen_mutex);
 	return NULL;
 }
diff --git a/src/glsl/ast.h b/src/glsl/ast.h
index adfc7938bff..f8ab0b71b7b 100644
--- a/src/glsl/ast.h
+++ b/src/glsl/ast.h
@@ -726,6 +726,7 @@ public:
 			  struct _mesa_glsl_parse_state *state);
 
    const char *name;
+   ast_type_qualifier *layout;
    /* List of ast_declarator_list * */
    exec_list declarations;
    bool is_declaration;
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index fc6bb3e31f1..376dfda2c84 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -6180,7 +6180,8 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           bool allow_reserved_names,
                                           ir_variable_mode var_mode,
                                           ast_type_qualifier *layout,
-                                          unsigned block_stream)
+                                          unsigned block_stream,
+                                          unsigned expl_location)
 {
    unsigned decl_count = 0;
 
@@ -6201,6 +6202,9 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
    glsl_struct_field *const fields = ralloc_array(state, glsl_struct_field,
                                                   decl_count);
 
+   bool first_member = true;
+   bool first_member_has_explicit_location;
+
    unsigned i = 0;
    foreach_list_typed (ast_declarator_list, decl_list, link, declarations) {
       const char *type_name;
@@ -6265,6 +6269,27 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                           "to struct or interface block members");
       }
 
+      if (is_interface) {
+         if (!first_member) {
+            if (!layout->flags.q.explicit_location &&
+                ((first_member_has_explicit_location &&
+                  !qual->flags.q.explicit_location) ||
+                 (!first_member_has_explicit_location &&
+                  qual->flags.q.explicit_location))) {
+               _mesa_glsl_error(&loc, state,
+                                "when block-level location layout qualifier "
+                                "is not supplied either all members must "
+                                "have a location layout qualifier or all "
+                                "members must not have a location layout "
+                                "qualifier");
+            }
+         } else {
+            first_member = false;
+            first_member_has_explicit_location =
+               qual->flags.q.explicit_location;
+         }
+      }
+
       if (qual->flags.q.std140 ||
           qual->flags.q.std430 ||
           qual->flags.q.packed ||
@@ -6339,7 +6364,6 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          validate_array_dimensions(field_type, state, &loc);
          fields[i].type = field_type;
          fields[i].name = decl->identifier;
-         fields[i].location = -1;
          fields[i].interpolation =
             interpret_interpolation_qualifier(qual, var_mode, state, &loc);
          fields[i].centroid = qual->flags.q.centroid ? 1 : 0;
@@ -6347,6 +6371,22 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          fields[i].patch = qual->flags.q.patch ? 1 : 0;
          fields[i].precision = qual->precision;
 
+         if (qual->flags.q.explicit_location) {
+            unsigned qual_location;
+            if (process_qualifier_constant(state, &loc, "location",
+                                           qual->location, &qual_location)) {
+               fields[i].location = VARYING_SLOT_VAR0 + qual_location;
+               expl_location = fields[i].location + 1;
+            }
+         } else {
+            if (layout && layout->flags.q.explicit_location) {
+               fields[i].location = expl_location;
+               expl_location = expl_location + 1;
+            } else {
+               fields[i].location = -1;
+            }
+         }
+
          /* Propogate row- / column-major information down the fields of the
           * structure or interface block.  Structures need this data because
           * the structure may contain a structure that contains ... a matrix
@@ -6445,6 +6485,16 @@ ast_struct_specifier::hir(exec_list *instructions,
 
    state->struct_specifier_depth++;
 
+   unsigned expl_location = -1;
+   if (layout && layout->flags.q.explicit_location) {
+      if (!process_qualifier_constant(state, &loc, "location",
+                                      layout->location, &expl_location)) {
+         return NULL;
+      } else {
+         expl_location = VARYING_SLOT_VAR0 + expl_location;
+      }
+   }
+
    glsl_struct_field *fields;
    unsigned decl_count =
       ast_process_struct_or_iface_block_members(instructions,
@@ -6455,8 +6505,9 @@ ast_struct_specifier::hir(exec_list *instructions,
                                                 GLSL_MATRIX_LAYOUT_INHERITED,
                                                 false /* allow_reserved_names */,
                                                 ir_var_auto,
-                                                NULL,
-                                                0 /* for interface only */);
+                                                layout,
+                                                0, /* for interface only */
+                                                expl_location);
 
    validate_identifier(this->name, loc, state);
 
@@ -6621,6 +6672,16 @@ ast_interface_block::hir(exec_list *instructions,
       return NULL;
    }
 
+   unsigned expl_location = -1;
+   if (layout.flags.q.explicit_location) {
+      if (!process_qualifier_constant(state, &loc, "location",
+                                      layout.location, &expl_location)) {
+         return NULL;
+      } else {
+         expl_location = VARYING_SLOT_VAR0 + expl_location;
+      }
+   }
+
    unsigned int num_variables =
       ast_process_struct_or_iface_block_members(&declared_variables,
                                                 state,
@@ -6631,7 +6692,8 @@ ast_interface_block::hir(exec_list *instructions,
                                                 redeclaring_per_vertex,
                                                 var_mode,
                                                 &this->layout,
-                                                qual_stream);
+                                                qual_stream,
+                                                expl_location);
 
    state->struct_specifier_depth--;
 
@@ -6970,6 +7032,10 @@ ast_interface_block::hir(exec_list *instructions,
          }
 
          var->data.stream = qual_stream;
+         if (layout.flags.q.explicit_location) {
+            var->data.location = expl_location;
+            var->data.explicit_location = true;
+         }
 
          state->symbols->add_variable(var);
          instructions->push_tail(var);
@@ -6990,6 +7056,9 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.sample = fields[i].sample;
          var->data.patch = fields[i].patch;
          var->data.stream = qual_stream;
+         var->data.location = fields[i].location;
+         if (fields[i].location != -1)
+            var->data.explicit_location = true;
          var->init_interface_type(block_type);
 
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index 9973a763087..602852a78d1 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -136,6 +136,12 @@ v140(const _mesa_glsl_parse_state *state)
 }
 
 static bool
+v140_or_es3(const _mesa_glsl_parse_state *state)
+{
+   return state->is_version(140, 300);
+}
+
+static bool
 v400_fs_only(const _mesa_glsl_parse_state *state)
 {
    return state->is_version(400, 0) &&
@@ -544,6 +550,7 @@ private:
    ir_variable *in_var(const glsl_type *type, const char *name);
    ir_variable *out_var(const glsl_type *type, const char *name);
    ir_constant *imm(float f, unsigned vector_elements=1);
+   ir_constant *imm(bool b, unsigned vector_elements=1);
    ir_constant *imm(int i, unsigned vector_elements=1);
    ir_constant *imm(unsigned u, unsigned vector_elements=1);
    ir_constant *imm(double d, unsigned vector_elements=1);
@@ -1438,9 +1445,9 @@ builtin_builder::create_builtins()
 
                 NULL);
    add_function("inverse",
-                _inverse_mat2(v120, glsl_type::mat2_type),
-                _inverse_mat3(v120, glsl_type::mat3_type),
-                _inverse_mat4(v120, glsl_type::mat4_type),
+                _inverse_mat2(v140_or_es3, glsl_type::mat2_type),
+                _inverse_mat3(v140_or_es3, glsl_type::mat3_type),
+                _inverse_mat4(v140_or_es3, glsl_type::mat4_type),
                 _inverse_mat2(fp64, glsl_type::dmat2_type),
                 _inverse_mat3(fp64, glsl_type::dmat3_type),
                 _inverse_mat4(fp64, glsl_type::dmat4_type),
@@ -3006,6 +3013,12 @@ builtin_builder::out_var(const glsl_type *type, const char *name)
 }
 
 ir_constant *
+builtin_builder::imm(bool b, unsigned vector_elements)
+{
+   return new(mem_ctx) ir_constant(b, vector_elements);
+}
+
+ir_constant *
 builtin_builder::imm(float f, unsigned vector_elements)
 {
    return new(mem_ctx) ir_constant(f, vector_elements);
@@ -4364,7 +4377,13 @@ builtin_builder::_notEqual(builtin_available_predicate avail,
 ir_function_signature *
 builtin_builder::_any(const glsl_type *type)
 {
-   return unop(always_available, ir_unop_any, glsl_type::bool_type, type);
+   ir_variable *v = in_var(type, "v");
+   MAKE_SIG(glsl_type::bool_type, always_available, 1, v);
+
+   const unsigned vec_elem = v->type->vector_elements;
+   body.emit(ret(expr(ir_binop_any_nequal, v, imm(false, vec_elem))));
+
+   return sig;
 }
 
 ir_function_signature *
@@ -4373,20 +4392,8 @@ builtin_builder::_all(const glsl_type *type)
    ir_variable *v = in_var(type, "v");
    MAKE_SIG(glsl_type::bool_type, always_available, 1, v);
 
-   switch (type->vector_elements) {
-   case 2:
-      body.emit(ret(logic_and(swizzle_x(v), swizzle_y(v))));
-      break;
-   case 3:
-      body.emit(ret(logic_and(logic_and(swizzle_x(v), swizzle_y(v)),
-                              swizzle_z(v))));
-      break;
-   case 4:
-      body.emit(ret(logic_and(logic_and(logic_and(swizzle_x(v), swizzle_y(v)),
-                                        swizzle_z(v)),
-                              swizzle_w(v))));
-      break;
-   }
+   const unsigned vec_elem = v->type->vector_elements;
+   body.emit(ret(expr(ir_binop_all_equal, v, imm(true, vec_elem))));
 
    return sig;
 }
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 7eb383ac60c..51796a65df9 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -1130,6 +1130,10 @@ fully_specified_type:
       $$->set_location_range(@1, @2);
       $$->qualifier = $1;
       $$->specifier = $2;
+      if ($$->specifier->structure != NULL &&
+          $$->specifier->structure->is_declaration) {
+            $$->specifier->structure->layout = &$$->qualifier;
+      }
    }
    ;
 
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index f989e9b6dff..70227070ca7 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -307,10 +307,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0)
       this->type = glsl_type::uvec2_type;
       break;
 
-   case ir_unop_any:
-      this->type = glsl_type::bool_type;
-      break;
-
    case ir_unop_pack_snorm_2x16:
    case ir_unop_pack_snorm_4x8:
    case ir_unop_pack_unorm_2x16:
@@ -538,7 +534,6 @@ static const char *const operator_strs[] = {
    "bitcast_f2i",
    "bitcast_u2f",
    "bitcast_f2u",
-   "any",
    "trunc",
    "ceil",
    "floor",
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index bdc932ef538..159f94d9edd 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -1355,7 +1355,6 @@ enum ir_expression_operation {
    ir_unop_bitcast_f2i, /**< Bit-identical float-to-int "conversion" */
    ir_unop_bitcast_u2f, /**< Bit-identical uint-to-float "conversion" */
    ir_unop_bitcast_f2u, /**< Bit-identical float-to-uint "conversion" */
-   ir_unop_any,
 
    /**
     * \name Unary floating-point rounding operations.
@@ -1726,7 +1725,6 @@ public:
    {
       return operation == ir_binop_all_equal ||
              operation == ir_binop_any_nequal ||
-             operation == ir_unop_any ||
              operation == ir_binop_dot ||
              operation == ir_quadop_vector;
    }
diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp
index ef705851613..5bf5ce54f78 100644
--- a/src/glsl/ir_constant_expression.cpp
+++ b/src/glsl/ir_constant_expression.cpp
@@ -648,14 +648,6 @@ ir_expression::constant_expression_value(struct hash_table *variable_context)
 	 data.u[c] = bitcast_f2u(op[0]->value.f[c]);
       }
       break;
-   case ir_unop_any:
-      assert(op[0]->type->is_boolean());
-      data.b[0] = false;
-      for (unsigned c = 0; c < op[0]->type->components(); c++) {
-	 if (op[0]->value.b[c])
-	    data.b[0] = true;
-      }
-      break;
    case ir_unop_d2f:
       assert(op[0]->type->base_type == GLSL_TYPE_DOUBLE);
       for (unsigned c = 0; c < op[0]->type->components(); c++) {
diff --git a/src/glsl/ir_set_program_inouts.cpp b/src/glsl/ir_set_program_inouts.cpp
index d7c29b00f88..a2dea67c6a9 100644
--- a/src/glsl/ir_set_program_inouts.cpp
+++ b/src/glsl/ir_set_program_inouts.cpp
@@ -81,16 +81,9 @@ is_shader_inout(ir_variable *var)
           var->data.mode == ir_var_system_value;
 }
 
-static inline bool
-is_dual_slot(ir_variable *var)
-{
-   const glsl_type *type = var->type->without_array();
-   return type == glsl_type::dvec4_type || type == glsl_type::dvec3_type;
-}
-
 static void
 mark(struct gl_program *prog, ir_variable *var, int offset, int len,
-     bool is_fragment_shader)
+     gl_shader_stage stage)
 {
    /* As of GLSL 1.20, varyings can only be floats, floating-point
     * vectors or matrices, or arrays of them.  For Mesa programs using
@@ -101,7 +94,6 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,
     */
 
    for (int i = 0; i < len; i++) {
-      bool dual_slot = is_dual_slot(var);
       int idx = var->data.location + var->data.index + offset + i;
       bool is_patch_generic = var->data.patch &&
                               idx != VARYING_SLOT_TESS_LEVEL_INNER &&
@@ -123,9 +115,12 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len,
          else
             prog->InputsRead |= bitfield;
 
-         if (dual_slot)
+         /* double inputs read is only for vertex inputs */
+         if (stage == MESA_SHADER_VERTEX &&
+             var->type->without_array()->is_dual_slot_double())
             prog->DoubleInputsRead |= bitfield;
-         if (is_fragment_shader) {
+
+         if (stage == MESA_SHADER_FRAGMENT) {
             gl_fragment_program *fprog = (gl_fragment_program *) prog;
             fprog->InterpQualifier[idx] =
                (glsl_interp_qualifier) var->data.interpolation;
@@ -154,6 +149,7 @@ void
 ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var)
 {
    const glsl_type *type = var->type;
+   bool vertex_input = false;
    if (this->shader_stage == MESA_SHADER_GEOMETRY &&
        var->data.mode == ir_var_shader_in && type->is_array()) {
       type = type->fields.array;
@@ -177,8 +173,12 @@ ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var)
       type = type->fields.array;
    }
 
-   mark(this->prog, var, 0, type->count_attribute_slots(),
-        this->shader_stage == MESA_SHADER_FRAGMENT);
+   if (this->shader_stage == MESA_SHADER_VERTEX &&
+       var->data.mode == ir_var_shader_in)
+      vertex_input = true;
+
+   mark(this->prog, var, 0, type->count_attribute_slots(vertex_input),
+        this->shader_stage);
 }
 
 /* Default handler: Mark all the locations in the variable as used. */
@@ -301,8 +301,15 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var,
       return false;
    }
 
+   /* double element width for double types that takes two slots */
+   if (this->shader_stage != MESA_SHADER_VERTEX ||
+       var->data.mode != ir_var_shader_in) {
+      if (type->without_array()->is_dual_slot_double())
+	 elem_width *= 2;
+   }
+
    mark(this->prog, var, index_as_constant->value.u[0] * elem_width,
-        elem_width, this->shader_stage == MESA_SHADER_FRAGMENT);
+        elem_width, this->shader_stage);
    return true;
 }
 
diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp
index e63b5c318e3..dcc079cfe37 100644
--- a/src/glsl/ir_validate.cpp
+++ b/src/glsl/ir_validate.cpp
@@ -320,11 +320,6 @@ ir_validate::visit_leave(ir_expression *ir)
       assert(ir->type->base_type == GLSL_TYPE_UINT);
       break;
 
-   case ir_unop_any:
-      assert(ir->operands[0]->type->base_type == GLSL_TYPE_BOOL);
-      assert(ir->type == glsl_type::bool_type);
-      break;
-
    case ir_unop_trunc:
    case ir_unop_round_even:
    case ir_unop_ceil:
diff --git a/src/glsl/link_interface_blocks.cpp b/src/glsl/link_interface_blocks.cpp
index 936e2e0ba21..64c30fea9a3 100644
--- a/src/glsl/link_interface_blocks.cpp
+++ b/src/glsl/link_interface_blocks.cpp
@@ -30,100 +30,52 @@
 #include "glsl_symbol_table.h"
 #include "linker.h"
 #include "main/macros.h"
-#include "program/hash_table.h"
+#include "util/hash_table.h"
 
 
 namespace {
 
 /**
- * Information about a single interface block definition that we need to keep
- * track of in order to check linkage rules.
- *
- * Note: this class is expected to be short lived, so it doesn't make copies
- * of the strings it references; it simply borrows the pointers from the
- * ir_variable class.
- */
-struct interface_block_definition
-{
-   /**
-    * Extract an interface block definition from an ir_variable that
-    * represents either the interface instance (for named interfaces), or a
-    * member of the interface (for unnamed interfaces).
-    */
-   explicit interface_block_definition(ir_variable *var)
-      : var(var),
-        type(var->get_interface_type()),
-        instance_name(NULL)
-   {
-      if (var->is_interface_instance()) {
-         instance_name = var->name;
-      }
-      explicitly_declared = (var->data.how_declared != ir_var_declared_implicitly);
-   }
-   /**
-    * Interface block ir_variable
-    */
-   ir_variable *var;
-
-   /**
-    * Interface block type
-    */
-   const glsl_type *type;
-
-   /**
-    * For a named interface block, the instance name.  Otherwise NULL.
-    */
-   const char *instance_name;
-
-   /**
-    * True if this interface block was explicitly declared in the shader;
-    * false if it was an implicitly declared built-in interface block.
-    */
-   bool explicitly_declared;
-};
-
-
-/**
  * Check if two interfaces match, according to intrastage interface matching
  * rules.  If they do, and the first interface uses an unsized array, it will
  * be updated to reflect the array size declared in the second interface.
  */
 bool
-intrastage_match(interface_block_definition *a,
-                 const interface_block_definition *b,
-                 ir_variable_mode mode,
+intrastage_match(ir_variable *a,
+                 ir_variable *b,
                  struct gl_shader_program *prog)
 {
    /* Types must match. */
-   if (a->type != b->type) {
+   if (a->get_interface_type() != b->get_interface_type()) {
       /* Exception: if both the interface blocks are implicitly declared,
        * don't force their types to match.  They might mismatch due to the two
        * shaders using different GLSL versions, and that's ok.
        */
-      if (a->explicitly_declared || b->explicitly_declared)
+      if (a->data.how_declared != ir_var_declared_implicitly ||
+          b->data.how_declared != ir_var_declared_implicitly)
          return false;
    }
 
    /* Presence/absence of interface names must match. */
-   if ((a->instance_name == NULL) != (b->instance_name == NULL))
+   if (a->is_interface_instance() != b->is_interface_instance())
       return false;
 
    /* For uniforms, instance names need not match.  For shader ins/outs,
     * it's not clear from the spec whether they need to match, but
     * Mesa's implementation relies on them matching.
     */
-   if (a->instance_name != NULL &&
-       mode != ir_var_uniform && mode != ir_var_shader_storage &&
-       strcmp(a->instance_name, b->instance_name) != 0) {
+   if (a->is_interface_instance() && b->data.mode != ir_var_uniform &&
+       b->data.mode != ir_var_shader_storage &&
+       strcmp(a->name, b->name) != 0) {
       return false;
    }
 
    /* If a block is an array then it must match across the shader.
     * Unsized arrays are also processed and matched agaist sized arrays.
     */
-   if (b->var->type != a->var->type &&
-       (b->instance_name != NULL || a->instance_name != NULL) &&
-       !validate_intrastage_arrays(prog, b->var, a->var))
+   if (b->type != a->type &&
+       (b->is_interface_instance() || a->is_interface_instance()) &&
+       !validate_intrastage_arrays(prog, b, a))
       return false;
 
    return true;
@@ -139,43 +91,44 @@ intrastage_match(interface_block_definition *a,
  * This is used for tessellation control and geometry shader consumers.
  */
 bool
-interstage_match(const interface_block_definition *producer,
-                 const interface_block_definition *consumer,
+interstage_match(ir_variable *producer,
+                 ir_variable *consumer,
                  bool extra_array_level)
 {
    /* Unsized arrays should not occur during interstage linking.  They
     * should have all been assigned a size by link_intrastage_shaders.
     */
-   assert(!consumer->var->type->is_unsized_array());
-   assert(!producer->var->type->is_unsized_array());
+   assert(!consumer->type->is_unsized_array());
+   assert(!producer->type->is_unsized_array());
 
    /* Types must match. */
-   if (consumer->type != producer->type) {
+   if (consumer->get_interface_type() != producer->get_interface_type()) {
       /* Exception: if both the interface blocks are implicitly declared,
        * don't force their types to match.  They might mismatch due to the two
        * shaders using different GLSL versions, and that's ok.
        */
-      if (consumer->explicitly_declared || producer->explicitly_declared)
+      if (consumer->data.how_declared != ir_var_declared_implicitly ||
+          producer->data.how_declared != ir_var_declared_implicitly)
          return false;
    }
 
    /* Ignore outermost array if geom shader */
    const glsl_type *consumer_instance_type;
    if (extra_array_level) {
-      consumer_instance_type = consumer->var->type->fields.array;
+      consumer_instance_type = consumer->type->fields.array;
    } else {
-      consumer_instance_type = consumer->var->type;
+      consumer_instance_type = consumer->type;
    }
 
    /* If a block is an array then it must match across shaders.
     * Since unsized arrays have been ruled out, we can check this by just
     * making sure the types are equal.
     */
-   if ((consumer->instance_name != NULL &&
+   if ((consumer->is_interface_instance() &&
         consumer_instance_type->is_array()) ||
-       (producer->instance_name != NULL &&
-        producer->var->type->is_array())) {
-      if (consumer_instance_type != producer->var->type)
+       (producer->is_interface_instance() &&
+        producer->type->is_array())) {
+      if (consumer_instance_type != producer->type)
          return false;
    }
 
@@ -197,35 +150,55 @@ class interface_block_definitions
 public:
    interface_block_definitions()
       : mem_ctx(ralloc_context(NULL)),
-        ht(hash_table_ctor(0, hash_table_string_hash,
-                           hash_table_string_compare))
+        ht(_mesa_hash_table_create(NULL, _mesa_key_hash_string,
+                                   _mesa_key_string_equal))
    {
    }
 
    ~interface_block_definitions()
    {
-      hash_table_dtor(ht);
       ralloc_free(mem_ctx);
+      _mesa_hash_table_destroy(ht, NULL);
    }
 
    /**
-    * Lookup the interface definition having the given block name.  Return
-    * NULL if none is found.
+    * Lookup the interface definition. Return NULL if none is found.
     */
-   interface_block_definition *lookup(const char *block_name)
+   ir_variable *lookup(ir_variable *var)
    {
-      return (interface_block_definition *) hash_table_find(ht, block_name);
+      if (var->data.explicit_location &&
+          var->data.location >= VARYING_SLOT_VAR0) {
+         char location_str[11];
+         snprintf(location_str, 11, "%d", var->data.location);
+
+         const struct hash_entry *entry =
+            _mesa_hash_table_search(ht, location_str);
+         return entry ? (ir_variable *) entry->data : NULL;
+      } else {
+         const struct hash_entry *entry =
+            _mesa_hash_table_search(ht, var->get_interface_type()->name);
+         return entry ? (ir_variable *) entry->data : NULL;
+      }
    }
 
    /**
     * Add a new interface definition.
     */
-   void store(const interface_block_definition &def)
+   void store(ir_variable *var)
    {
-      interface_block_definition *hash_entry =
-         rzalloc(mem_ctx, interface_block_definition);
-      *hash_entry = def;
-      hash_table_insert(ht, hash_entry, def.type->name);
+      if (var->data.explicit_location &&
+          var->data.location >= VARYING_SLOT_VAR0) {
+         /* If explicit location is given then lookup the variable by location.
+          * We turn the location into a string and use this as the hash key
+          * rather than the name. Note: We allocate enough space for a 32-bit
+          * unsigned location value which is overkill but future proof.
+          */
+         char location_str[11];
+         snprintf(location_str, 11, "%d", var->data.location);
+         _mesa_hash_table_insert(ht, ralloc_strdup(mem_ctx, location_str), var);
+      } else {
+         _mesa_hash_table_insert(ht, var->get_interface_type()->name, var);
+      }
    }
 
 private:
@@ -236,8 +209,7 @@ private:
 
    /**
     * Hash table mapping interface block name to an \c
-    * interface_block_definition struct.  interface_block_definition structs
-    * are allocated using \c mem_ctx.
+    * ir_variable.
     */
    hash_table *ht;
 };
@@ -292,18 +264,13 @@ validate_intrastage_interface_blocks(struct gl_shader_program *prog,
             continue;
          }
 
-         const interface_block_definition def(var);
-         interface_block_definition *prev_def =
-            definitions->lookup(iface_type->name);
-
+         ir_variable *prev_def = definitions->lookup(var);
          if (prev_def == NULL) {
             /* This is the first time we've seen the interface, so save
              * it into the appropriate data structure.
              */
-            definitions->store(def);
-         } else if (!intrastage_match(prev_def, &def,
-                                      (ir_variable_mode) var->data.mode,
-                                      prog)) {
+            definitions->store(var);
+         } else if (!intrastage_match(prev_def, var, prog)) {
             linker_error(prog, "definitions of interface block `%s' do not"
                          " match\n", iface_type->name);
             return;
@@ -329,7 +296,7 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog,
       if (!var || !var->get_interface_type() || var->data.mode != ir_var_shader_in)
          continue;
 
-      definitions.store(interface_block_definition(var));
+      definitions.store(var);
    }
 
    /* Verify that the producer's output interfaces match. */
@@ -338,16 +305,13 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog,
       if (!var || !var->get_interface_type() || var->data.mode != ir_var_shader_out)
          continue;
 
-      interface_block_definition *consumer_def =
-         definitions.lookup(var->get_interface_type()->name);
+      ir_variable *consumer_def = definitions.lookup(var);
 
       /* The consumer doesn't use this output block.  Ignore it. */
       if (consumer_def == NULL)
          continue;
 
-      const interface_block_definition producer_def(var);
-
-      if (!interstage_match(&producer_def, consumer_def, extra_array_level)) {
+      if (!interstage_match(var, consumer_def, extra_array_level)) {
          linker_error(prog, "definitions of interface block `%s' do not "
                       "match\n", var->get_interface_type()->name);
          return;
@@ -374,19 +338,15 @@ validate_interstage_uniform_blocks(struct gl_shader_program *prog,
               var->data.mode != ir_var_shader_storage))
             continue;
 
-         interface_block_definition *old_def =
-            definitions.lookup(var->get_interface_type()->name);
-         const interface_block_definition new_def(var);
+         ir_variable *old_def = definitions.lookup(var);
          if (old_def == NULL) {
-            definitions.store(new_def);
+            definitions.store(var);
          } else {
             /* Interstage uniform matching rules are the same as intrastage
              * uniform matchin rules (for uniforms, it is as though all
              * shaders are in the same shader stage).
              */
-            if (!intrastage_match(old_def, &new_def,
-                                  (ir_variable_mode) var->data.mode,
-                                  prog)) {
+            if (!intrastage_match(old_def, var, prog)) {
                linker_error(prog, "definitions of interface block `%s' do not "
                             "match\n", var->get_interface_type()->name);
                return;
diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 71750d1b42b..9cc77feb78a 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -432,6 +432,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
          this->matched_candidate->type->fields.array->matrix_columns;
       const unsigned vector_elements =
          this->matched_candidate->type->fields.array->vector_elements;
+      const unsigned dmul =
+         this->matched_candidate->type->fields.array->is_double() ? 2 : 1;
       unsigned actual_array_size;
       switch (this->lowered_builtin_array_variable) {
       case clip_distance:
@@ -459,7 +461,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
             return false;
          }
          unsigned array_elem_size = this->lowered_builtin_array_variable ?
-            1 : vector_elements * matrix_cols;
+            1 : vector_elements * matrix_cols * dmul;
          fine_location += array_elem_size * this->array_subscript;
          this->size = 1;
       } else {
@@ -519,7 +521,6 @@ tfeedback_decl::get_num_outputs() const
    if (!this->is_varying()) {
       return 0;
    }
-
    return (this->num_components() + this->location_frac + 3)/4;
 }
 
@@ -968,6 +969,8 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
       } else {
          slots = type->matrix_columns;
       }
+      if (type->without_array()->is_dual_slot_double())
+         slots *= 2;
       this->matches[this->num_matches].num_components = 4 * slots;
    } else {
       this->matches[this->num_matches].num_components
@@ -1712,7 +1715,8 @@ check_against_output_limit(struct gl_context *ctx,
 
       if (var && var->data.mode == ir_var_shader_out &&
           var_counts_against_varying_limit(producer->Stage, var)) {
-         output_vectors += var->type->count_attribute_slots();
+         /* outputs for fragment shader can't be doubles */
+         output_vectors += var->type->count_attribute_slots(false);
       }
    }
 
@@ -1753,7 +1757,8 @@ check_against_input_limit(struct gl_context *ctx,
 
       if (var && var->data.mode == ir_var_shader_in &&
           var_counts_against_varying_limit(consumer->Stage, var)) {
-         input_vectors += var->type->count_attribute_slots();
+         /* vertex inputs aren't varying counted */
+         input_vectors += var->type->count_attribute_slots(false);
       }
    }
 
diff --git a/src/glsl/link_varyings.h b/src/glsl/link_varyings.h
index 2ce72d43d84..1d12978fa30 100644
--- a/src/glsl/link_varyings.h
+++ b/src/glsl/link_varyings.h
@@ -131,7 +131,8 @@ public:
       if (this->lowered_builtin_array_variable)
          return this->size;
       else
-         return this->vector_elements * this->matrix_columns * this->size;
+         return this->vector_elements * this->matrix_columns * this->size *
+            (this->is_double() ? 2 : 1);
    }
 
    unsigned get_location() const {
@@ -139,6 +140,29 @@ public:
    }
 
 private:
+
+   bool is_double() const
+   {
+      switch (this->type) {
+      case GL_DOUBLE:
+      case GL_DOUBLE_VEC2:
+      case GL_DOUBLE_VEC3:
+      case GL_DOUBLE_VEC4:
+      case GL_DOUBLE_MAT2:
+      case GL_DOUBLE_MAT2x3:
+      case GL_DOUBLE_MAT2x4:
+      case GL_DOUBLE_MAT3:
+      case GL_DOUBLE_MAT3x2:
+      case GL_DOUBLE_MAT3x4:
+      case GL_DOUBLE_MAT4:
+      case GL_DOUBLE_MAT4x2:
+      case GL_DOUBLE_MAT4x3:
+         return true;
+      default:
+         return false;
+      }
+   }
+
    /**
     * The name that was supplied to glTransformFeedbackVaryings.  Used for
     * error reporting and glGetTransformFeedbackVarying().
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index a87bbb2b994..c7e69765335 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2466,7 +2466,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
          return false;
       }
 
-      const unsigned slots = var->type->count_attribute_slots();
+      const unsigned slots = var->type->count_attribute_slots(target_index == MESA_SHADER_VERTEX ? true : false);
 
       /* If the variable is not a built-in and has a location statically
        * assigned in the shader (presumably via a layout qualifier), make sure
@@ -2603,17 +2603,8 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
              * issue (3) of the GL_ARB_vertex_attrib_64bit behavior, this
              * is optional behavior, but it seems preferable.
              */
-            const glsl_type *type = var->type->without_array();
-            if (type == glsl_type::dvec3_type ||
-                type == glsl_type::dvec4_type ||
-                type == glsl_type::dmat2x3_type ||
-                type == glsl_type::dmat2x4_type ||
-                type == glsl_type::dmat3_type ||
-                type == glsl_type::dmat3x4_type ||
-                type == glsl_type::dmat4x3_type ||
-                type == glsl_type::dmat4_type) {
+            if (var->type->without_array()->is_dual_slot_double())
                double_storage_locations |= (use_mask << attr);
-            }
 	 }
 
 	 continue;
@@ -3004,7 +2995,8 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
             foreach_in_list(ir_instruction, node, sh->ir) {
                ir_variable *var = node->as_variable();
                if (var && var->data.mode == ir_var_shader_out)
-                  fragment_outputs += var->type->count_attribute_slots();
+                  /* since there are no double fs outputs - pass false */
+                  fragment_outputs += var->type->count_attribute_slots(false);
             }
          }
       }
@@ -4423,13 +4415,13 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
    if (first < MESA_SHADER_FRAGMENT) {
       gl_shader *const sh = prog->_LinkedShaders[last];
 
-      if (first == MESA_SHADER_GEOMETRY) {
+      if (first != MESA_SHADER_VERTEX) {
          /* There was no vertex shader, but we still have to assign varying
-          * locations for use by geometry shader inputs in SSO.
+          * locations for use by tessellation/geometry shader inputs in SSO.
           *
           * If the shader is not separable (i.e., prog->SeparateShader is
-          * false), linking will have already failed when first is
-          * MESA_SHADER_GEOMETRY.
+          * false), linking will have already failed when first is not
+          * MESA_SHADER_VERTEX.
           */
          if (!assign_varying_locations(ctx, mem_ctx, prog,
                                        NULL, prog->_LinkedShaders[first],
diff --git a/src/glsl/lower_mat_op_to_vec.cpp b/src/glsl/lower_mat_op_to_vec.cpp
index dda754f9149..e96cda216dd 100644
--- a/src/glsl/lower_mat_op_to_vec.cpp
+++ b/src/glsl/lower_mat_op_to_vec.cpp
@@ -277,7 +277,11 @@ ir_mat_op_to_vec_visitor::do_equal_mat_mat(ir_dereference *result,
    }
 
    ir_rvalue *const val = new(this->mem_ctx) ir_dereference_variable(tmp_bvec);
-   ir_expression *any = new(this->mem_ctx) ir_expression(ir_unop_any, val);
+   uint8_t vec_elems = val->type->vector_elements;
+   ir_expression *any =
+      new(this->mem_ctx) ir_expression(ir_binop_any_nequal, val,
+                                       new(this->mem_ctx) ir_constant(false,
+                                                                      vec_elems));
 
    if (test_equal)
       any = new(this->mem_ctx) ir_expression(ir_unop_logic_not, any);
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 9a25f2fc905..fe8aa278380 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -367,7 +367,6 @@ nir_visitor::visit(ir_variable *ir)
    var->data.explicit_index = ir->data.explicit_index;
    var->data.explicit_binding = ir->data.explicit_binding;
    var->data.has_initializer = ir->data.has_initializer;
-   var->data.is_unmatched_generic_inout = ir->data.is_unmatched_generic_inout;
    var->data.location_frac = ir->data.location_frac;
    var->data.from_named_ifc_block_array = ir->data.from_named_ifc_block_array;
    var->data.from_named_ifc_block_nonarray = ir->data.from_named_ifc_block_nonarray;
@@ -1072,6 +1071,7 @@ nir_visitor::visit(ir_call *ir)
          nir_intrinsic_instr *store_instr =
             nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
          store_instr->num_components = ir->return_deref->type->vector_elements;
+         store_instr->const_index[0] = (1 << store_instr->num_components) - 1;
 
          store_instr->variables[0] =
             evaluate_deref(&store_instr->instr, ir->return_deref);
@@ -1133,43 +1133,23 @@ nir_visitor::visit(ir_assignment *ir)
    nir_ssa_def *src = evaluate_rvalue(ir->rhs);
 
    if (ir->write_mask != (1 << num_components) - 1 && ir->write_mask != 0) {
-      /*
-       * We have no good way to update only part of a variable, so just load
-       * the LHS and do a vec operation to combine the old with the new, and
-       * then store it
-       * back into the LHS. Copy propagation should get rid of the mess.
+      /* GLSL IR will give us the input to the write-masked assignment in a
+       * single packed vector.  So, for example, if the writemask is xzw, then
+       * we have to swizzle x -> x, y -> z, and z -> w and get the y component
+       * from the load.
        */
-
-      nir_intrinsic_instr *load =
-         nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_var);
-      load->num_components = ir->lhs->type->vector_elements;
-      nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
-      load->variables[0] = lhs_deref;
-      ralloc_steal(load, load->variables[0]);
-      nir_builder_instr_insert(&b, &load->instr);
-
-      nir_ssa_def *srcs[4];
-
+      unsigned swiz[4];
       unsigned component = 0;
-      for (unsigned i = 0; i < ir->lhs->type->vector_elements; i++) {
-         if (ir->write_mask & (1 << i)) {
-            /* GLSL IR will give us the input to the write-masked assignment
-             * in a single packed vector.  So, for example, if the
-             * writemask is xzw, then we have to swizzle x -> x, y -> z,
-             * and z -> w and get the y component from the load.
-             */
-            srcs[i] = nir_channel(&b, src, component++);
-         } else {
-            srcs[i] = nir_channel(&b, &load->dest.ssa, i);
-         }
+      for (unsigned i = 0; i < 4; i++) {
+         swiz[i] = ir->write_mask & (1 << i) ? component++ : 0;
       }
-
-      src = nir_vec(&b, srcs, ir->lhs->type->vector_elements);
+      src = nir_swizzle(&b, src, swiz, num_components, !supports_ints);
    }
 
    nir_intrinsic_instr *store =
       nir_intrinsic_instr_create(this->shader, nir_intrinsic_store_var);
    store->num_components = ir->lhs->type->vector_elements;
+   store->const_index[0] = ir->write_mask;
    nir_deref *store_deref = nir_copy_deref(store, &lhs_deref->deref);
    store->variables[0] = nir_deref_as_var(store_deref);
    store->src[0] = nir_src_for_ssa(src);
@@ -1421,24 +1401,6 @@ nir_visitor::visit(ir_expression *ir)
       /* no-op */
       result = nir_imov(&b, srcs[0]);
       break;
-   case ir_unop_any:
-      switch (ir->operands[0]->type->vector_elements) {
-      case 2:
-         result = supports_ints ? nir_bany2(&b, srcs[0])
-                                : nir_fany2(&b, srcs[0]);
-         break;
-      case 3:
-         result = supports_ints ? nir_bany3(&b, srcs[0])
-                                : nir_fany3(&b, srcs[0]);
-         break;
-      case 4:
-         result = supports_ints ? nir_bany4(&b, srcs[0])
-                                : nir_fany4(&b, srcs[0]);
-         break;
-      default:
-         unreachable("not reached");
-      }
-      break;
    case ir_unop_trunc: result = nir_ftrunc(&b, srcs[0]); break;
    case ir_unop_ceil:  result = nir_fceil(&b, srcs[0]); break;
    case ir_unop_floor: result = nir_ffloor(&b, srcs[0]); break;
diff --git a/src/glsl/nir/glsl_types.cpp b/src/glsl/nir/glsl_types.cpp
index bc8677ba6fc..d86609718ea 100644
--- a/src/glsl/nir/glsl_types.cpp
+++ b/src/glsl/nir/glsl_types.cpp
@@ -1831,7 +1831,7 @@ glsl_type::std430_size(bool row_major) const
 }
 
 unsigned
-glsl_type::count_attribute_slots() const
+glsl_type::count_attribute_slots(bool vertex_input_slots) const
 {
    /* From page 31 (page 37 of the PDF) of the GLSL 1.50 spec:
     *
@@ -1852,27 +1852,35 @@ glsl_type::count_attribute_slots() const
     * allows varying structs, the number of varying slots taken up by a
     * varying struct is simply equal to the sum of the number of slots taken
     * up by each element.
+    *
+    * Doubles are counted different depending on whether they are vertex
+    * inputs or everything else. Vertex inputs from ARB_vertex_attrib_64bit
+    * take one location no matter what size they are, otherwise dvec3/4
+    * take two locations.
     */
    switch (this->base_type) {
    case GLSL_TYPE_UINT:
    case GLSL_TYPE_INT:
    case GLSL_TYPE_FLOAT:
    case GLSL_TYPE_BOOL:
-   case GLSL_TYPE_DOUBLE:
       return this->matrix_columns;
-
+   case GLSL_TYPE_DOUBLE:
+      if (this->vector_elements > 2 && !vertex_input_slots)
+         return this->matrix_columns * 2;
+      else
+         return this->matrix_columns;
    case GLSL_TYPE_STRUCT:
    case GLSL_TYPE_INTERFACE: {
       unsigned size = 0;
 
       for (unsigned i = 0; i < this->length; i++)
-         size += this->fields.structure[i].type->count_attribute_slots();
+         size += this->fields.structure[i].type->count_attribute_slots(vertex_input_slots);
 
       return size;
    }
 
    case GLSL_TYPE_ARRAY:
-      return this->length * this->fields.array->count_attribute_slots();
+      return this->length * this->fields.array->count_attribute_slots(vertex_input_slots);
 
    case GLSL_TYPE_FUNCTION:
    case GLSL_TYPE_SAMPLER:
diff --git a/src/glsl/nir/glsl_types.h b/src/glsl/nir/glsl_types.h
index 1aafa5cd547..ff8dcc7a5f6 100644
--- a/src/glsl/nir/glsl_types.h
+++ b/src/glsl/nir/glsl_types.h
@@ -334,8 +334,11 @@ struct glsl_type {
     * varying slots the type will use up in the absence of varying packing
     * (and thus, it can be used to measure the number of varying slots used by
     * the varyings that are generated by lower_packed_varyings).
+    *
+    * For vertex shader attributes - doubles only take one slot.
+    * For inter-shader varyings - dvec3/dvec4 take two slots.
     */
-   unsigned count_attribute_slots() const;
+   unsigned count_attribute_slots(bool vertex_input_slots) const;
 
    /**
     * Alignment in bytes of the start of this type in a std140 uniform
@@ -481,6 +484,14 @@ struct glsl_type {
    }
 
    /**
+    * Query whether a double takes two slots.
+    */
+   bool is_dual_slot_double() const
+   {
+      return base_type == GLSL_TYPE_DOUBLE && vector_elements > 2;
+   }
+
+   /**
     * Query whether or not a type is a non-array boolean type
     */
    bool is_boolean() const
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 904e444487a..c05df101f29 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -216,15 +216,6 @@ typedef struct {
       unsigned has_initializer:1;
 
       /**
-       * Is this variable a generic output or input that has not yet been matched
-       * up to a variable in another stage of the pipeline?
-       *
-       * This is used by the linker as scratch storage while assigning locations
-       * to generic inputs and outputs.
-       */
-      unsigned is_unmatched_generic_inout:1;
-
-      /**
        * If non-zero, then this variable may be packed along with other variables
        * into a single varying slot, so this offset should be applied when
        * accessing components.  For example, an offset of 1 means that the x
@@ -388,7 +379,9 @@ nir_variable_get_io_mask(nir_variable *var, gl_shader_stage stage)
          var_type = glsl_get_array_element(var_type);
    }
 
-   unsigned slots = glsl_count_attribute_slots(var_type);
+   bool is_vertex_input = (var->data.mode == nir_var_shader_in &&
+                           stage == MESA_SHADER_VERTEX);
+   unsigned slots = glsl_count_attribute_slots(var_type, is_vertex_input);
    return ((1ull << slots) - 1) << var->data.location;
 }
 
diff --git a/src/glsl/nir/nir_builder.h b/src/glsl/nir/nir_builder.h
index 423bddd7a9a..038d5d0e240 100644
--- a/src/glsl/nir/nir_builder.h
+++ b/src/glsl/nir/nir_builder.h
@@ -341,13 +341,15 @@ nir_load_var(nir_builder *build, nir_variable *var)
 }
 
 static inline void
-nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value)
+nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value,
+              unsigned writemask)
 {
    const unsigned num_components = glsl_get_vector_elements(var->type);
 
    nir_intrinsic_instr *store =
       nir_intrinsic_instr_create(build->shader, nir_intrinsic_store_var);
    store->num_components = num_components;
+   store->const_index[0] = writemask;
    store->variables[0] = nir_deref_var_create(store, var);
    store->src[0] = nir_src_for_ssa(value);
    nir_builder_instr_insert(build, &store->instr);
diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h
index 5086e297e8e..e30750804dc 100644
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -43,7 +43,7 @@
 
 
 INTRINSIC(load_var, 0, ARR(), true, 0, 1, 0, NIR_INTRINSIC_CAN_ELIMINATE)
-INTRINSIC(store_var, 1, ARR(0), false, 0, 1, 0, 0)
+INTRINSIC(store_var, 1, ARR(0), false, 0, 1, 1, 0)
 INTRINSIC(copy_var, 0, ARR(), false, 0, 2, 0, 0)
 
 /*
@@ -323,13 +323,13 @@ LOAD(push_constant, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDE
 #define STORE(name, srcs, indices, flags) \
    INTRINSIC(store_##name, srcs, ARR(0, 1, 1, 1), false, 0, 0, indices, flags)
 
-/* src[] = { value, offset }. const_index[] = { base } */
-STORE(output, 2, 1, 0)
-/* src[] = { value, vertex, offset }. const_index[] = { base } */
-STORE(per_vertex_output, 3, 1, 0)
+/* src[] = { value, offset }. const_index[] = { base, write_mask } */
+STORE(output, 2, 2, 0)
+/* src[] = { value, vertex, offset }. const_index[] = { base, write_mask } */
+STORE(per_vertex_output, 3, 2, 0)
 /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */
 STORE(ssbo, 3, 1, 0)
 /* src[] = { value, offset }. const_index[] = { base, write_mask } */
-STORE(shared, 2, 1, 0)
+STORE(shared, 2, 2, 0)
 
 LAST_INTRINSIC(store_shared)
diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c
index 9313fc0f97e..d267ca383ab 100644
--- a/src/glsl/nir/nir_lower_alu_to_scalar.c
+++ b/src/glsl/nir/nir_lower_alu_to_scalar.c
@@ -137,10 +137,6 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
       LOWER_REDUCTION(nir_op_bany_inequal, nir_op_ine, nir_op_ior);
       LOWER_REDUCTION(nir_op_fall_equal, nir_op_seq, nir_op_fand);
       LOWER_REDUCTION(nir_op_fany_nequal, nir_op_sne, nir_op_for);
-      LOWER_REDUCTION(nir_op_ball, nir_op_imov, nir_op_iand);
-      LOWER_REDUCTION(nir_op_bany, nir_op_imov, nir_op_ior);
-      LOWER_REDUCTION(nir_op_fall, nir_op_fmov, nir_op_fand);
-      LOWER_REDUCTION(nir_op_fany, nir_op_fmov, nir_op_for);
 
    default:
       break;
diff --git a/src/glsl/nir/nir_lower_gs_intrinsics.c b/src/glsl/nir/nir_lower_gs_intrinsics.c
index e0d067885d8..13254599088 100644
--- a/src/glsl/nir/nir_lower_gs_intrinsics.c
+++ b/src/glsl/nir/nir_lower_gs_intrinsics.c
@@ -99,7 +99,8 @@ rewrite_emit_vertex(nir_intrinsic_instr *intrin, struct state *state)
 
    /* Increment the vertex count by 1 */
    nir_store_var(b, state->vertex_count_var,
-                 nir_iadd(b, count, nir_imm_int(b, 1)));
+                 nir_iadd(b, count, nir_imm_int(b, 1)),
+                 0x1); /* .x */
 
    nir_instr_remove(&intrin->instr);
 
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index ec6d09d5b6d..36165a8b765 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -261,6 +261,9 @@ nir_lower_io_block(nir_block *block, void *void_state)
          store->const_index[0] =
             intrin->variables[0]->var->data.driver_location;
 
+         /* Copy the writemask */
+         store->const_index[1] = intrin->const_index[0];
+
          if (per_vertex)
             store->src[1] = nir_src_for_ssa(vertex_index);
 
diff --git a/src/glsl/nir/nir_lower_locals_to_regs.c b/src/glsl/nir/nir_lower_locals_to_regs.c
index 17b53ca36f3..3e21ac0cdd5 100644
--- a/src/glsl/nir/nir_lower_locals_to_regs.c
+++ b/src/glsl/nir/nir_lower_locals_to_regs.c
@@ -243,7 +243,7 @@ lower_locals_to_regs_block(nir_block *block, void *void_state)
 
          nir_alu_instr *mov = nir_alu_instr_create(state->shader, nir_op_imov);
          nir_src_copy(&mov->src[0].src, &intrin->src[0], mov);
-         mov->dest.write_mask = (1 << intrin->num_components) - 1;
+         mov->dest.write_mask = intrin->const_index[0];
          mov->dest.dest.is_ssa = false;
          mov->dest.dest.reg.reg = reg_src.reg.reg;
          mov->dest.dest.reg.base_offset = reg_src.reg.base_offset;
diff --git a/src/glsl/nir/nir_lower_returns.c b/src/glsl/nir/nir_lower_returns.c
index ce0512c770a..f1e8b143840 100644
--- a/src/glsl/nir/nir_lower_returns.c
+++ b/src/glsl/nir/nir_lower_returns.c
@@ -156,7 +156,7 @@ lower_returns_in_block(nir_block *block, struct lower_returns_state *state)
       state->return_flag->constant_initializer =
          rzalloc(state->return_flag, nir_constant);
    }
-   nir_store_var(b, state->return_flag, nir_imm_int(b, NIR_TRUE));
+   nir_store_var(b, state->return_flag, nir_imm_int(b, NIR_TRUE), 1);
 
    if (state->loop) {
       /* We're in a loop.  Make the return a break. */
diff --git a/src/glsl/nir/nir_lower_var_copies.c b/src/glsl/nir/nir_lower_var_copies.c
index 98c107aa50e..a9017de5449 100644
--- a/src/glsl/nir/nir_lower_var_copies.c
+++ b/src/glsl/nir/nir_lower_var_copies.c
@@ -128,6 +128,7 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr,
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_store_var);
       store->num_components = num_components;
+      store->const_index[0] = (1 << num_components) - 1;
       store->variables[0] = nir_deref_as_var(nir_copy_deref(store, &dest_head->deref));
 
       store->src[0].is_ssa = true;
diff --git a/src/glsl/nir/nir_lower_vars_to_ssa.c b/src/glsl/nir/nir_lower_vars_to_ssa.c
index e670dbdc7e7..3ec0e1d9960 100644
--- a/src/glsl/nir/nir_lower_vars_to_ssa.c
+++ b/src/glsl/nir/nir_lower_vars_to_ssa.c
@@ -26,6 +26,7 @@
  */
 
 #include "nir.h"
+#include "nir_builder.h"
 #include "nir_vla.h"
 
 
@@ -590,6 +591,9 @@ add_phi_sources(nir_block *block, nir_block *pred,
 static bool
 rename_variables_block(nir_block *block, struct lower_variables_state *state)
 {
+   nir_builder b;
+   nir_builder_init(&b, state->impl);
+
    nir_foreach_instr_safe(block, instr) {
       if (instr->type == nir_instr_type_phi) {
          nir_phi_instr *phi = nir_instr_as_phi(instr);
@@ -675,20 +679,40 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
 
             assert(intrin->src[0].is_ssa);
 
-            nir_alu_instr *mov = nir_alu_instr_create(state->shader,
-                                                      nir_op_imov);
-            mov->src[0].src.is_ssa = true;
-            mov->src[0].src.ssa = intrin->src[0].ssa;
-            for (unsigned i = intrin->num_components; i < 4; i++)
-               mov->src[0].swizzle[i] = 0;
+            nir_ssa_def *new_def;
+            b.cursor = nir_before_instr(&intrin->instr);
 
-            mov->dest.write_mask = (1 << intrin->num_components) - 1;
-            nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                              intrin->num_components, NULL);
+            if (intrin->const_index[0] == (1 << intrin->num_components) - 1) {
+               /* Whole variable store - just copy the source.  Note that
+                * intrin->num_components and intrin->src[0].ssa->num_components
+                * may differ.
+                */
+               unsigned swiz[4];
+               for (unsigned i = 0; i < 4; i++)
+                  swiz[i] = i < intrin->num_components ? i : 0;
+
+               new_def = nir_swizzle(&b, intrin->src[0].ssa, swiz,
+                                     intrin->num_components, false);
+            } else {
+               nir_ssa_def *old_def = get_ssa_def_for_block(node, block, state);
+               /* For writemasked store_var intrinsics, we combine the newly
+                * written values with the existing contents of unwritten
+                * channels, creating a new SSA value for the whole vector.
+                */
+               nir_ssa_def *srcs[4];
+               for (unsigned i = 0; i < intrin->num_components; i++) {
+                  if (intrin->const_index[0] & (1 << i)) {
+                     srcs[i] = nir_channel(&b, intrin->src[0].ssa, i);
+                  } else {
+                     srcs[i] = nir_channel(&b, old_def, i);
+                  }
+               }
+               new_def = nir_vec(&b, srcs, intrin->num_components);
+            }
 
-            nir_instr_insert_before(&intrin->instr, &mov->instr);
+            assert(new_def->num_components == intrin->num_components);
 
-            def_stack_push(node, &mov->dest.dest.ssa, state);
+            def_stack_push(node, new_def, state);
 
             /* We'll wait to remove the instruction until the next pass
              * where we pop the node we just pushed back off the stack.
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index 37d3dfc4588..1cd01a4fe92 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -167,13 +167,6 @@ unop_convert("i2b", tint, tbool, "src0 != 0")
 unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion
 unop_convert("u2f", tuint, tfloat, "src0") # Unsigned-to-float conversion.
 
-unop_reduce("bany", 1, tbool, tbool, "{src}", "{src0} || {src1}", "{src}")
-unop_reduce("ball", 1, tbool, tbool, "{src}", "{src0} && {src1}", "{src}")
-unop_reduce("fany", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} || {src1}",
-            "{src} ? 1.0f : 0.0f")
-unop_reduce("fall", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} && {src1}",
-            "{src} ? 1.0f : 0.0f")
-
 # Unary floating-point rounding operations.
 
 
diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c
index 26b1cbb467d..2691cbdf213 100644
--- a/src/glsl/nir/nir_print.c
+++ b/src/glsl/nir/nir_print.c
@@ -249,6 +249,53 @@ get_var_name(nir_variable *var, print_state *state)
 }
 
 static void
+print_constant(nir_constant *c, const struct glsl_type *type, print_state *state)
+{
+   FILE *fp = state->fp;
+   unsigned total_elems = glsl_get_components(type);
+   unsigned i;
+
+   switch (glsl_get_base_type(type)) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_BOOL:
+      for (i = 0; i < total_elems; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "0x%08x", c->value.u[i]);
+      }
+      break;
+
+   case GLSL_TYPE_FLOAT:
+      for (i = 0; i < total_elems; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "%f", c->value.f[i]);
+      }
+      break;
+
+   case GLSL_TYPE_STRUCT:
+      for (i = 0; i < c->num_elements; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "{ ");
+         print_constant(c->elements[i], glsl_get_struct_field(type, i), state);
+         fprintf(fp, " }");
+      }
+      break;
+
+   case GLSL_TYPE_ARRAY:
+      for (i = 0; i < c->num_elements; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "{ ");
+         print_constant(c->elements[i], glsl_get_array_element(type), state);
+         fprintf(fp, " }");
+      }
+      break;
+
+   default:
+      unreachable("not reached");
+   }
+}
+
+static void
 print_var_decl(nir_variable *var, print_state *state)
 {
    FILE *fp = state->fp;
@@ -311,6 +358,12 @@ print_var_decl(nir_variable *var, print_state *state)
       fprintf(fp, " (%s, %u)", loc, var->data.driver_location);
    }
 
+   if (var->constant_initializer) {
+      fprintf(fp, " = { ");
+      print_constant(var->constant_initializer, var->type, state);
+      fprintf(fp, " }");
+   }
+
    fprintf(fp, "\n");
 }
 
diff --git a/src/glsl/nir/nir_types.cpp b/src/glsl/nir/nir_types.cpp
index 54751cbcb5f..86f8508b859 100644
--- a/src/glsl/nir/nir_types.cpp
+++ b/src/glsl/nir/nir_types.cpp
@@ -125,9 +125,10 @@ glsl_get_aoa_size(const struct glsl_type *type)
 }
 
 unsigned
-glsl_count_attribute_slots(const struct glsl_type *type)
+glsl_count_attribute_slots(const struct glsl_type *type,
+                           bool vertex_input_slots)
 {
-   return type->count_attribute_slots();
+   return type->count_attribute_slots(vertex_input_slots);
 }
 
 const char *
@@ -238,6 +239,18 @@ glsl_float_type(void)
 }
 
 const glsl_type *
+glsl_vec_type(unsigned n)
+{
+   return glsl_type::vec(n);
+}
+
+const glsl_type *
+glsl_vec4_type(void)
+{
+   return glsl_type::vec4_type;
+}
+
+const glsl_type *
 glsl_int_type(void)
 {
    return glsl_type::int_type;
@@ -256,12 +269,6 @@ glsl_bool_type(void)
 }
 
 const glsl_type *
-glsl_vec4_type(void)
-{
-   return glsl_type::vec4_type;
-}
-
-const glsl_type *
 glsl_scalar_type(enum glsl_base_type base_type)
 {
    return glsl_type::get_instance(base_type, 1, 1);
diff --git a/src/glsl/nir/nir_types.h b/src/glsl/nir/nir_types.h
index 1bae84a356e..535d36373de 100644
--- a/src/glsl/nir/nir_types.h
+++ b/src/glsl/nir/nir_types.h
@@ -67,7 +67,8 @@ unsigned glsl_get_length(const struct glsl_type *type);
 
 unsigned glsl_get_aoa_size(const struct glsl_type *type);
 
-unsigned glsl_count_attribute_slots(const struct glsl_type *type);
+unsigned glsl_count_attribute_slots(const struct glsl_type *type,
+                                    bool vertex_input_slots);
 
 const char *glsl_get_struct_elem_name(const struct glsl_type *type,
                                       unsigned index);
@@ -92,11 +93,12 @@ bool glsl_sampler_type_is_array(const struct glsl_type *type);
 
 const struct glsl_type *glsl_void_type(void);
 const struct glsl_type *glsl_float_type(void);
+const struct glsl_type *glsl_vec_type(unsigned n);
+const struct glsl_type *glsl_vec4_type(void);
 const struct glsl_type *glsl_int_type(void);
 const struct glsl_type *glsl_uint_type(void);
 const struct glsl_type *glsl_bool_type(void);
 
-const struct glsl_type *glsl_vec4_type(void);
 const struct glsl_type *glsl_scalar_type(enum glsl_base_type base_type);
 const struct glsl_type *glsl_vector_type(enum glsl_base_type base_type,
                                          unsigned components);
diff --git a/src/glsl/nir/nir_validate.c b/src/glsl/nir/nir_validate.c
index 06879d64ee2..da920557d20 100644
--- a/src/glsl/nir/nir_validate.c
+++ b/src/glsl/nir/nir_validate.c
@@ -417,6 +417,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
       assert(instr->variables[0]->var->data.mode != nir_var_shader_in &&
              instr->variables[0]->var->data.mode != nir_var_uniform &&
              instr->variables[0]->var->data.mode != nir_var_shader_storage);
+      assert((instr->const_index[0] & ~((1 << instr->num_components) - 1)) == 0);
       break;
    }
    case nir_intrinsic_copy_var:
diff --git a/src/glsl/nir/spirv_to_nir.c b/src/glsl/nir/spirv_to_nir.c
index 48960b70373..91710ba45f9 100644
--- a/src/glsl/nir/spirv_to_nir.c
+++ b/src/glsl/nir/spirv_to_nir.c
@@ -1133,6 +1133,7 @@ _vtn_variable_store(struct vtn_builder *b,
       store->variables[0] =
          nir_deref_as_var(nir_copy_deref(store, &dest_deref->deref));
       store->num_components = glsl_get_vector_elements(src->type);
+      store->const_index[0] = (1 << store->num_components) - 1;
       store->src[0] = nir_src_for_ssa(src->def);
 
       nir_builder_instr_insert(&b->nb, &store->instr);
@@ -2574,20 +2575,30 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode,
    case SpvOpNot:                   op = nir_op_inot;    break;
 
    case SpvOpAny:
-      switch (src[0]->num_components) {
-      case 1:  op = nir_op_imov;    break;
-      case 2:  op = nir_op_bany2;   break;
-      case 3:  op = nir_op_bany3;   break;
-      case 4:  op = nir_op_bany4;   break;
+      if (src[0]->num_components == 1) {
+         op = nir_op_imov;
+      } else {
+         switch (src[0]->num_components) {
+         case 2:  op = nir_op_bany_inequal2; break;
+         case 3:  op = nir_op_bany_inequal3; break;
+         case 4:  op = nir_op_bany_inequal4; break;
+         }
+         num_inputs = 2;
+         src[1] = nir_imm_int(&b->nb, NIR_FALSE);
       }
       break;
 
    case SpvOpAll:
-      switch (src[0]->num_components) {
-      case 1:  op = nir_op_imov;    break;
-      case 2:  op = nir_op_ball2;   break;
-      case 3:  op = nir_op_ball3;   break;
-      case 4:  op = nir_op_ball4;   break;
+      if (src[0]->num_components == 1) {
+         op = nir_op_imov;
+      } else {
+         switch (src[0]->num_components) {
+         case 2:  op = nir_op_ball_iequal2;  break;
+         case 3:  op = nir_op_ball_iequal3;  break;
+         case 4:  op = nir_op_ball_iequal4;  break;
+         }
+         num_inputs = 2;
+         src[1] = nir_imm_int(&b->nb, NIR_TRUE);
       }
       break;
 
diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c
index ee243126731..8bdbb9caf56 100644
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -437,7 +437,8 @@ dri3_wait_x(struct glx_context *gc)
    struct dri3_drawable *priv = (struct dri3_drawable *)
       GetGLXDRIDrawable(gc->currentDpy, gc->currentDrawable);
 
-   loader_dri3_wait_x(&priv->loader_drawable);
+   if (priv)
+      loader_dri3_wait_x(&priv->loader_drawable);
 }
 
 static void
@@ -446,7 +447,8 @@ dri3_wait_gl(struct glx_context *gc)
    struct dri3_drawable *priv = (struct dri3_drawable *)
       GetGLXDRIDrawable(gc->currentDpy, gc->currentDrawable);
 
-   loader_dri3_wait_gl(&priv->loader_drawable);
+   if (priv)
+      loader_dri3_wait_gl(&priv->loader_drawable);
 }
 
 /**
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 5610e9ff80f..36bed77b481 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -1544,7 +1544,8 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear)
    const char *vs_source =
       "#extension GL_AMD_vertex_shader_layer : enable\n"
       "#extension GL_ARB_draw_instanced : enable\n"
-      "attribute vec4 position;\n"
+      "#extension GL_ARB_explicit_attrib_location :enable\n"
+      "layout(location = 0) in vec4 position;\n"
       "void main()\n"
       "{\n"
       "#ifdef GL_AMD_vertex_shader_layer\n"
@@ -1553,7 +1554,9 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear)
       "   gl_Position = position;\n"
       "}\n";
    const char *fs_source =
-      "uniform vec4 color;\n"
+      "#extension GL_ARB_explicit_attrib_location :enable\n"
+      "#extension GL_ARB_explicit_uniform_location :enable\n"
+      "layout(location = 0) uniform vec4 color;\n"
       "void main()\n"
       "{\n"
       "   gl_FragColor = color;\n"
@@ -1580,12 +1583,9 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear)
    _mesa_DeleteShader(fs);
    _mesa_AttachShader(clear->ShaderProg, vs);
    _mesa_DeleteShader(vs);
-   _mesa_BindAttribLocation(clear->ShaderProg, 0, "position");
    _mesa_ObjectLabel(GL_PROGRAM, clear->ShaderProg, -1, "meta clear");
    _mesa_LinkProgram(clear->ShaderProg);
 
-   clear->ColorLocation = _mesa_GetUniformLocation(clear->ShaderProg, "color");
-
    has_integer_textures = _mesa_is_gles3(ctx) ||
       (_mesa_is_desktop_gl(ctx) && ctx->Const.GLSLVersion >= 130);
 
@@ -1596,7 +1596,8 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear)
                          "#version 130\n"
                          "#extension GL_AMD_vertex_shader_layer : enable\n"
                          "#extension GL_ARB_draw_instanced : enable\n"
-                         "in vec4 position;\n"
+                         "#extension GL_ARB_explicit_attrib_location :enable\n"
+                         "layout(location = 0) in vec4 position;\n"
                          "void main()\n"
                          "{\n"
                          "#ifdef GL_AMD_vertex_shader_layer\n"
@@ -1607,7 +1608,9 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear)
       const char *fs_int_source =
          ralloc_asprintf(shader_source_mem_ctx,
                          "#version 130\n"
-                         "uniform ivec4 color;\n"
+                         "#extension GL_ARB_explicit_attrib_location :enable\n"
+                         "#extension GL_ARB_explicit_uniform_location :enable\n"
+                         "layout(location = 0) uniform ivec4 color;\n"
                          "out ivec4 out_color;\n"
                          "\n"
                          "void main()\n"
@@ -1626,7 +1629,6 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear)
       _mesa_DeleteShader(fs);
       _mesa_AttachShader(clear->IntegerShaderProg, vs);
       _mesa_DeleteShader(vs);
-      _mesa_BindAttribLocation(clear->IntegerShaderProg, 0, "position");
 
       /* Note that user-defined out attributes get automatically assigned
        * locations starting from 0, so we don't need to explicitly
@@ -1636,9 +1638,6 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear)
       _mesa_ObjectLabel(GL_PROGRAM, clear->IntegerShaderProg, -1,
                         "integer clear");
       _mesa_meta_link_program_with_debug(ctx, clear->IntegerShaderProg);
-
-      clear->IntegerColorLocation =
-	 _mesa_GetUniformLocation(clear->IntegerShaderProg, "color");
    }
 }
 
@@ -1770,12 +1769,10 @@ meta_clear(struct gl_context *ctx, GLbitfield buffers, bool glsl)
    if (fb->_IntegerColor) {
       assert(glsl);
       _mesa_UseProgram(clear->IntegerShaderProg);
-      _mesa_Uniform4iv(clear->IntegerColorLocation, 1,
-			  ctx->Color.ClearColor.i);
+      _mesa_Uniform4iv(0, 1, ctx->Color.ClearColor.i);
    } else if (glsl) {
       _mesa_UseProgram(clear->ShaderProg);
-      _mesa_Uniform4fv(clear->ColorLocation, 1,
-			  ctx->Color.ClearColor.f);
+      _mesa_Uniform4fv(0, 1, ctx->Color.ClearColor.f);
    }
 
    /* GL_COLOR_BUFFER_BIT */
diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h
index 21495eecd27..5b04755a63e 100644
--- a/src/mesa/drivers/common/meta.h
+++ b/src/mesa/drivers/common/meta.h
@@ -322,12 +322,7 @@ struct clear_state
    GLuint VAO;
    struct gl_buffer_object *buf_obj;
    GLuint ShaderProg;
-   GLint ColorLocation;
-   GLint LayerLocation;
-
    GLuint IntegerShaderProg;
-   GLint IntegerColorLocation;
-   GLint IntegerLayerLocation;
 };
 
 
diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c
index d38e6b88953..2b942d6fd71 100644
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -62,6 +62,15 @@ fallback_required(struct gl_context *ctx, GLenum target,
    GLuint srcLevel;
    GLenum status;
 
+   /* GL_DRAW_FRAMEBUFFER does not exist in OpenGL ES 1.x, and since
+    * _mesa_meta_begin hasn't been called yet, we have to work-around API
+    * difficulties.  The whole reason that GL_DRAW_FRAMEBUFFER is used instead
+    * of GL_FRAMEBUFFER is that the read framebuffer may be different.  This
+    * is moot in OpenGL ES 1.x.
+    */
+   const GLenum fbo_target = ctx->API == API_OPENGLES
+      ? GL_FRAMEBUFFER : GL_DRAW_FRAMEBUFFER;
+
    /* check for fallbacks */
    if (target == GL_TEXTURE_3D) {
       _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH,
@@ -102,13 +111,13 @@ fallback_required(struct gl_context *ctx, GLenum target,
     */
    if (!mipmap->FBO)
       _mesa_GenFramebuffers(1, &mipmap->FBO);
-   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, mipmap->FBO);
+   _mesa_BindFramebuffer(fbo_target, mipmap->FBO);
 
-   _mesa_meta_bind_fbo_image(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, baseImage, 0);
+   _mesa_meta_bind_fbo_image(fbo_target, GL_COLOR_ATTACHMENT0, baseImage, 0);
 
-   status = _mesa_CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER);
+   status = _mesa_CheckFramebufferStatus(fbo_target);
 
-   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, fboSave);
+   _mesa_BindFramebuffer(fbo_target, fboSave);
 
    if (status != GL_FRAMEBUFFER_COMPLETE_EXT) {
       _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH,
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index ac910525082..9ad04ee286b 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -76,6 +76,7 @@ i965_compiler_FILES = \
 	brw_vec4_reg_allocate.cpp \
 	brw_vec4_surface_builder.cpp \
 	brw_vec4_surface_builder.h \
+	brw_vec4_tcs.cpp \
 	brw_vec4_visitor.cpp \
 	brw_vec4_vs_visitor.cpp \
 	brw_vue_map.c \
@@ -151,7 +152,9 @@ i965_FILES = \
 	brw_state.h \
 	brw_state_upload.c \
 	brw_structs.h \
+	brw_tcs.c \
 	brw_tcs_surface_state.c \
+	brw_tes.c \
 	brw_tes_surface_state.c \
 	brw_tex.c \
 	brw_tex_layout.c \
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 17a95c6a5bb..7e33fa05632 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -194,6 +194,38 @@ struct brw_vs_prog_key {
    struct brw_sampler_prog_key_data tex;
 };
 
+/** The program key for Tessellation Control Shaders. */
+struct brw_tcs_prog_key
+{
+   unsigned program_string_id;
+
+   GLenum tes_primitive_mode;
+
+   unsigned input_vertices;
+
+   /** A bitfield of per-patch outputs written. */
+   uint32_t patch_outputs_written;
+
+   /** A bitfield of per-vertex outputs written. */
+   uint64_t outputs_written;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Tessellation Evaluation Shaders. */
+struct brw_tes_prog_key
+{
+   unsigned program_string_id;
+
+   /** A bitfield of per-patch inputs read. */
+   uint32_t patch_inputs_read;
+
+   /** A bitfield of per-vertex inputs read. */
+   uint64_t inputs_read;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
 /** The program key for Geometry Shaders. */
 struct brw_gs_prog_key
 {
@@ -445,7 +477,7 @@ struct brw_vue_map {
     * additional processing is applied before storing them in the VUE), the
     * value is -1.
     */
-   signed char varying_to_slot[BRW_VARYING_SLOT_COUNT];
+   signed char varying_to_slot[VARYING_SLOT_TESS_MAX];
 
    /**
     * Map from VUE slot to gl_varying_slot value.  For slots that do not
@@ -454,12 +486,24 @@ struct brw_vue_map {
     *
     * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
     */
-   signed char slot_to_varying[BRW_VARYING_SLOT_COUNT];
+   signed char slot_to_varying[VARYING_SLOT_TESS_MAX];
 
    /**
     * Total number of VUE slots in use
     */
    int num_slots;
+
+   /**
+    * Number of per-patch VUE slots. Only valid for tessellation control
+    * shader outputs and tessellation evaluation shader inputs.
+    */
+   int num_per_patch_slots;
+
+   /**
+    * Number of per-vertex VUE slots. Only valid for tessellation control
+    * shader outputs and tessellation evaluation shader inputs.
+    */
+   int num_per_vertex_slots;
 };
 
 void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map);
@@ -487,6 +531,10 @@ void brw_compute_vue_map(const struct brw_device_info *devinfo,
                          GLbitfield64 slots_valid,
                          bool separate_shader);
 
+void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map,
+                              const GLbitfield64 slots_valid,
+                              const GLbitfield is_patch);
+
 enum shader_dispatch_mode {
    DISPATCH_MODE_4X1_SINGLE = 0,
    DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
@@ -656,6 +704,38 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
                char **error_str);
 
 /**
+ * Compile a tessellation control shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tcs_prog_key *key,
+                struct brw_tcs_prog_data *prog_data,
+                const struct nir_shader *nir,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str);
+
+/**
+ * Compile a tessellation evaluation shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_tes(const struct brw_compiler *compiler, void *log_data,
+                void *mem_ctx,
+                const struct brw_tes_prog_key *key,
+                struct brw_tes_prog_data *prog_data,
+                const struct nir_shader *shader,
+                struct gl_shader_program *shader_prog,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str);
+
+/**
  * Compile a vertex shader.
  *
  * Returns the final assembly and the program's size.
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 0abe60124f4..005c3236c88 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -159,8 +159,10 @@ intel_viewport(struct gl_context *ctx)
    __DRIcontext *driContext = brw->driContext;
 
    if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
-      dri2InvalidateDrawable(driContext->driDrawablePriv);
-      dri2InvalidateDrawable(driContext->driReadablePriv);
+      if (driContext->driDrawablePriv)
+         dri2InvalidateDrawable(driContext->driDrawablePriv);
+      if (driContext->driReadablePriv)
+         dri2InvalidateDrawable(driContext->driReadablePriv);
    }
 }
 
@@ -377,7 +379,10 @@ brw_initialize_context_constants(struct brw_context *brw)
       [MESA_SHADER_GEOMETRY] = brw->gen >= 6,
       [MESA_SHADER_FRAGMENT] = true,
       [MESA_SHADER_COMPUTE] =
-         (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) ||
+         (ctx->API == API_OPENGL_CORE &&
+          ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) ||
+         (ctx->API == API_OPENGLES2 &&
+          ctx->Const.MaxComputeWorkGroupSize[0] >= 128) ||
          _mesa_extension_override_enables.ARB_compute_shader,
    };
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 1cc4c7b1282..0239b6214ae 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -179,8 +179,7 @@ enum brw_state_id {
    BRW_STATE_URB_FENCE = BRW_MAX_CACHE,
    BRW_STATE_FRAGMENT_PROGRAM,
    BRW_STATE_GEOMETRY_PROGRAM,
-   BRW_STATE_TESS_CTRL_PROGRAM,
-   BRW_STATE_TESS_EVAL_PROGRAM,
+   BRW_STATE_TESS_PROGRAMS,
    BRW_STATE_VERTEX_PROGRAM,
    BRW_STATE_CURBE_OFFSETS,
    BRW_STATE_REDUCED_PRIMITIVE,
@@ -192,6 +191,7 @@ enum brw_state_id {
    BRW_STATE_BINDING_TABLE_POINTERS,
    BRW_STATE_INDICES,
    BRW_STATE_VERTICES,
+   BRW_STATE_DEFAULT_TESS_LEVELS,
    BRW_STATE_BATCH,
    BRW_STATE_INDEX_BUFFER,
    BRW_STATE_VS_CONSTBUF,
@@ -262,8 +262,7 @@ enum brw_state_id {
 #define BRW_NEW_URB_FENCE               (1ull << BRW_STATE_URB_FENCE)
 #define BRW_NEW_FRAGMENT_PROGRAM        (1ull << BRW_STATE_FRAGMENT_PROGRAM)
 #define BRW_NEW_GEOMETRY_PROGRAM        (1ull << BRW_STATE_GEOMETRY_PROGRAM)
-#define BRW_NEW_TESS_EVAL_PROGRAM       (1ull << BRW_STATE_TESS_EVAL_PROGRAM)
-#define BRW_NEW_TESS_CTRL_PROGRAM       (1ull << BRW_STATE_TESS_CTRL_PROGRAM)
+#define BRW_NEW_TESS_PROGRAMS           (1ull << BRW_STATE_TESS_PROGRAMS)
 #define BRW_NEW_VERTEX_PROGRAM          (1ull << BRW_STATE_VERTEX_PROGRAM)
 #define BRW_NEW_CURBE_OFFSETS           (1ull << BRW_STATE_CURBE_OFFSETS)
 #define BRW_NEW_REDUCED_PRIMITIVE       (1ull << BRW_STATE_REDUCED_PRIMITIVE)
@@ -275,6 +274,7 @@ enum brw_state_id {
 #define BRW_NEW_BINDING_TABLE_POINTERS  (1ull << BRW_STATE_BINDING_TABLE_POINTERS)
 #define BRW_NEW_INDICES                 (1ull << BRW_STATE_INDICES)
 #define BRW_NEW_VERTICES                (1ull << BRW_STATE_VERTICES)
+#define BRW_NEW_DEFAULT_TESS_LEVELS     (1ull << BRW_STATE_DEFAULT_TESS_LEVELS)
 /**
  * Used for any batch entry with a relocated pointer that will be used
  * by any 3D rendering.
@@ -1008,6 +1008,8 @@ struct brw_context
    struct {
       GLuint vsize;		/* vertex size plus header in urb registers */
       GLuint gsize;	        /* GS output size in urb registers */
+      GLuint hsize;             /* Tessellation control output size in urb registers */
+      GLuint dsize;             /* Tessellation evaluation output size in urb registers */
       GLuint csize;		/* constant buffer size in urb registers */
       GLuint sfsize;		/* setup data size in urb registers */
 
@@ -1020,12 +1022,16 @@ struct brw_context
       GLuint max_gs_entries;	/* Maximum number of GS entries */
 
       GLuint nr_vs_entries;
+      GLuint nr_hs_entries;
+      GLuint nr_ds_entries;
       GLuint nr_gs_entries;
       GLuint nr_clip_entries;
       GLuint nr_sf_entries;
       GLuint nr_cs_entries;
 
       GLuint vs_start;
+      GLuint hs_start;
+      GLuint ds_start;
       GLuint gs_start;
       GLuint clip_start;
       GLuint sf_start;
@@ -1042,6 +1048,11 @@ struct brw_context
        * URB space for the GS.
        */
       bool gs_present;
+
+      /* True if the most recently sent _3DSTATE_URB message allocated
+       * URB space for the HS and DS.
+       */
+      bool tess_present;
    } urb;
 
 
@@ -1648,12 +1659,18 @@ void gen8_emit_3dstate_sample_pattern(struct brw_context *brw);
 /* gen7_urb.c */
 void
 gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
+                              unsigned hs_size, unsigned ds_size,
                               unsigned gs_size, unsigned fs_size);
 
 void
 gen7_emit_urb_state(struct brw_context *brw,
-                    unsigned nr_vs_entries, unsigned vs_size,
-                    unsigned vs_start, unsigned nr_gs_entries,
+                    unsigned nr_vs_entries,
+                    unsigned vs_size, unsigned vs_start,
+                    unsigned nr_hs_entries,
+                    unsigned hs_size, unsigned hs_start,
+                    unsigned nr_ds_entries,
+                    unsigned ds_size, unsigned ds_start,
+                    unsigned nr_gs_entries,
                     unsigned gs_size, unsigned gs_start);
 
 
@@ -1687,6 +1704,18 @@ brw_vertex_program_const(const struct gl_vertex_program *p)
    return (const struct brw_vertex_program *) p;
 }
 
+static inline struct brw_tess_ctrl_program *
+brw_tess_ctrl_program(struct gl_tess_ctrl_program *p)
+{
+   return (struct brw_tess_ctrl_program *) p;
+}
+
+static inline struct brw_tess_eval_program *
+brw_tess_eval_program(struct gl_tess_eval_program *p)
+{
+   return (struct brw_tess_eval_program *) p;
+}
+
 static inline struct brw_geometry_program *
 brw_geometry_program(struct gl_geometry_program *p)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 7d8723efd5e..5c18d671a0c 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1307,6 +1307,14 @@ enum opcode {
     *           UD immediate).
     */
    SHADER_OPCODE_MOV_INDIRECT,
+
+   VEC4_OPCODE_URB_READ,
+   TCS_OPCODE_GET_INSTANCE_ID,
+   TCS_OPCODE_URB_WRITE,
+   TCS_OPCODE_SET_INPUT_URB_OFFSETS,
+   TCS_OPCODE_SET_OUTPUT_URB_OFFSETS,
+   TCS_OPCODE_GET_PRIMITIVE_ID,
+   TCS_OPCODE_CREATE_BARRIER_HEADER,
 };
 
 enum brw_urb_write_flags {
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index cbc2f2fbf4b..f9a72903ef1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1687,6 +1687,21 @@ fs_visitor::assign_vs_urb_setup()
 }
 
 void
+fs_visitor::assign_tes_urb_setup()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
+
+   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
+
+   /* Rewrite all ATTR file references to HW_REGs. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      convert_attr_sources_to_hw_regs(inst);
+   }
+}
+
+void
 fs_visitor::assign_gs_urb_setup()
 {
    assert(stage == MESA_SHADER_GEOMETRY);
@@ -5257,6 +5272,40 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
 }
 
 bool
+fs_visitor::run_tes()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
+   payload.num_regs = 5;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   emit_nir_code();
+
+   if (failed)
+      return false;
+
+   emit_urb_writes();
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_tes_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers();
+
+   return !failed;
+}
+
+bool
 fs_visitor::run_gs()
 {
    assert(stage == MESA_SHADER_GEOMETRY);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index dff86a97a14..bdbfd0c4546 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -81,7 +81,8 @@ public:
               struct gl_program *prog,
               const nir_shader *shader,
               unsigned dispatch_width,
-              int shader_time_index);
+              int shader_time_index,
+              const struct brw_vue_map *input_vue_map = NULL);
    fs_visitor(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               struct brw_gs_compile *gs_compile,
@@ -109,6 +110,7 @@ public:
 
    bool run_fs(bool do_rep_send);
    bool run_vs(gl_clip_plane *clip_planes);
+   bool run_tes();
    bool run_gs();
    bool run_cs();
    void optimize();
@@ -124,6 +126,7 @@ public:
    void assign_urb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
    void assign_vs_urb_setup();
+   void assign_tes_urb_setup();
    void assign_gs_urb_setup();
    bool assign_regs(bool allow_spilling);
    void assign_regs_trivial();
@@ -251,6 +254,8 @@ public:
                               nir_intrinsic_instr *instr);
    void nir_emit_intrinsic(const brw::fs_builder &bld,
                            nir_intrinsic_instr *instr);
+   void nir_emit_tes_intrinsic(const brw::fs_builder &bld,
+                               nir_intrinsic_instr *instr);
    void nir_emit_ssbo_atomic(const brw::fs_builder &bld,
                              int op, nir_intrinsic_instr *instr);
    void nir_emit_shared_atomic(const brw::fs_builder &bld,
@@ -262,6 +267,7 @@ public:
    fs_reg get_nir_src(nir_src src);
    fs_reg get_nir_dest(nir_dest dest);
    fs_reg get_nir_image_deref(const nir_deref_var *deref);
+   fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
 
@@ -315,6 +321,8 @@ public:
    struct brw_stage_prog_data *prog_data;
    struct gl_program *prog;
 
+   const struct brw_vue_map *input_vue_map;
+
    int *param_size;
 
    int *virtual_grf_start;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index b3fb0c6fd6e..78a82406ad7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -288,23 +288,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
       }
       break;
 
-   case ir_unop_any: {
-      ir_expression *temp;
-      temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
-					element_type,
-					get_element(op_var[0], 0),
-					get_element(op_var[0], 1));
-
-      for (i = 2; i < vector_elements; i++) {
-	 temp = new(mem_ctx) ir_expression(ir_binop_logic_or,
-					   element_type,
-					   get_element(op_var[0], i),
-					   temp);
-      }
-      assign(ir, 0, temp);
-      break;
-   }
-
    case ir_binop_dot: {
       ir_expression *last = NULL;
       for (i = 0; i < vector_elements; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 4e0ff50ddcb..ba14ba54303 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -123,6 +123,7 @@ fs_visitor::nir_setup_outputs()
 
       switch (stage) {
       case MESA_SHADER_VERTEX:
+      case MESA_SHADER_TESS_EVAL:
       case MESA_SHADER_GEOMETRY: {
          unsigned location = var->data.location;
          nir_setup_single_output_varying(&reg, var->type, &location);
@@ -443,6 +444,9 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       case MESA_SHADER_VERTEX:
          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
          break;
+      case MESA_SHADER_TESS_EVAL:
+         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
       case MESA_SHADER_GEOMETRY:
          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
          break;
@@ -841,12 +845,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_fdot2:
    case nir_op_fdot3:
    case nir_op_fdot4:
-   case nir_op_bany2:
-   case nir_op_bany3:
-   case nir_op_bany4:
-   case nir_op_ball2:
-   case nir_op_ball3:
-   case nir_op_ball4:
    case nir_op_ball_fequal2:
    case nir_op_ball_iequal2:
    case nir_op_ball_fequal3:
@@ -1715,6 +1713,24 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
    }
 }
 
+fs_reg
+fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
+{
+   nir_src *offset_src = nir_get_io_offset_src(instr);
+   nir_const_value *const_value = nir_src_as_const_value(*offset_src);
+
+   if (const_value) {
+      /* The only constant offset we should find is 0.  brw_nir.c's
+       * add_const_offset_to_base() will fold other constant offsets
+       * into instr->const_index[0].
+       */
+      assert(const_value->u[0] == 0);
+      return fs_reg();
+   }
+
+   return get_nir_src(*offset_src);
+}
+
 void
 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
@@ -1747,6 +1763,106 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
 }
 
 void
+fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
+                                   nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+   struct brw_tes_prog_data *tes_prog_data = (struct brw_tes_prog_data *) prog_data;
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
+      break;
+   case nir_intrinsic_load_tess_coord:
+      /* gl_TessCoord is part of the payload in g1-3 */
+      for (unsigned i = 0; i < 3; i++) {
+         bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
+      }
+      break;
+
+   case nir_intrinsic_load_tess_level_outer:
+      /* When the TES reads gl_TessLevelOuter, we ensure that the patch header
+       * appears as a push-model input.  So, we can simply use the ATTR file
+       * rather than issuing URB read messages.  The data is stored in the
+       * high DWords in reverse order - DWord 7 contains .x, DWord 6 contains
+       * .y, and so on.
+       */
+      switch (tes_prog_data->domain) {
+      case BRW_TESS_DOMAIN_QUAD:
+         for (unsigned i = 0; i < 4; i++)
+            bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
+         break;
+      case BRW_TESS_DOMAIN_TRI:
+         for (unsigned i = 0; i < 3; i++)
+            bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
+         break;
+      case BRW_TESS_DOMAIN_ISOLINE:
+         for (unsigned i = 0; i < 2; i++)
+            bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i));
+         break;
+      }
+      break;
+
+   case nir_intrinsic_load_tess_level_inner:
+      /* When the TES reads gl_TessLevelInner, we ensure that the patch header
+       * appears as a push-model input.  So, we can simply use the ATTR file
+       * rather than issuing URB read messages.
+       */
+      switch (tes_prog_data->domain) {
+      case BRW_TESS_DOMAIN_QUAD:
+         bld.MOV(dest, component(fs_reg(ATTR, 0), 3));
+         bld.MOV(offset(dest, bld, 1), component(fs_reg(ATTR, 0), 2));
+         break;
+      case BRW_TESS_DOMAIN_TRI:
+         bld.MOV(dest, component(fs_reg(ATTR, 0), 4));
+         break;
+      case BRW_TESS_DOMAIN_ISOLINE:
+         /* ignore - value is undefined */
+         break;
+      }
+      break;
+
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_per_vertex_input: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      fs_inst *inst;
+      if (indirect_offset.file == BAD_FILE) {
+         /* Replicate the patch handle to all enabled channels */
+         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.MOV(patch_handle, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle);
+         inst->mlen = 1;
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = {
+            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
+            indirect_offset
+         };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, payload);
+         inst->mlen = 2;
+      }
+      inst->offset = imm_offset;
+      inst->base_mrf = -1;
+      inst->regs_written = instr->num_components;
+      break;
+   }
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 790f1009ca7..8f1fcbb233c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -702,7 +702,10 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
    fs_reg sources[8];
    fs_reg urb_handle;
 
-   urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+   if (stage == MESA_SHADER_TESS_EVAL)
+      urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD));
+   else
+      urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 
    /* If we don't have any valid slots to write, just do a minimal urb write
     * send to terminate the shader.  This includes 1 slot of undefined data,
@@ -936,9 +939,11 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                        struct gl_program *prog,
                        const nir_shader *shader,
                        unsigned dispatch_width,
-                       int shader_time_index)
+                       int shader_time_index,
+                       const struct brw_vue_map *input_vue_map)
    : backend_shader(compiler, log_data, mem_ctx, shader, prog_data),
      key(key), gs_compile(NULL), prog_data(prog_data), prog(prog),
+     input_vue_map(input_vue_map),
      dispatch_width(dispatch_width),
      shader_time_index(shader_time_index),
      bld(fs_builder(this, dispatch_width).at_end())
@@ -974,6 +979,9 @@ fs_visitor::init()
    case MESA_SHADER_VERTEX:
       key_tex = &((const brw_vs_prog_key *) key)->tex;
       break;
+   case MESA_SHADER_TESS_EVAL:
+      key_tex = &((const brw_tes_prog_key *) key)->tex;
+      break;
    case MESA_SHADER_GEOMETRY:
       key_tex = &((const brw_gs_prog_key *) key)->tex;
       break;
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index 31d29ec9045..7cdc830f6b8 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -42,6 +42,8 @@ brw_shader_precompile(struct gl_context *ctx,
                       struct gl_shader_program *sh_prog)
 {
    struct gl_shader *vs = sh_prog->_LinkedShaders[MESA_SHADER_VERTEX];
+   struct gl_shader *tcs = sh_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
+   struct gl_shader *tes = sh_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
    struct gl_shader *gs = sh_prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
    struct gl_shader *fs = sh_prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
    struct gl_shader *cs = sh_prog->_LinkedShaders[MESA_SHADER_COMPUTE];
@@ -52,6 +54,12 @@ brw_shader_precompile(struct gl_context *ctx,
    if (gs && !brw_gs_precompile(ctx, sh_prog, gs->Program))
       return false;
 
+   if (tes && !brw_tes_precompile(ctx, sh_prog, tes->Program))
+      return false;
+
+   if (tcs && !brw_tcs_precompile(ctx, sh_prog, tcs->Program))
+      return false;
+
    if (vs && !brw_vs_precompile(ctx, sh_prog, vs->Program))
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index fdfc4f661d1..eebd2a386b6 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -27,15 +27,43 @@
 #include "glsl/nir/nir_builder.h"
 #include "program/prog_to_nir.h"
 
-struct remap_vs_attrs_state {
+static bool
+is_input(nir_intrinsic_instr *intrin)
+{
+   return intrin->intrinsic == nir_intrinsic_load_input ||
+          intrin->intrinsic == nir_intrinsic_load_per_vertex_input;
+}
+
+static bool
+is_output(nir_intrinsic_instr *intrin)
+{
+   return intrin->intrinsic == nir_intrinsic_load_output ||
+          intrin->intrinsic == nir_intrinsic_load_per_vertex_output ||
+          intrin->intrinsic == nir_intrinsic_store_output ||
+          intrin->intrinsic == nir_intrinsic_store_per_vertex_output;
+}
+
+/**
+ * In many cases, we just add the base and offset together, so there's no
+ * reason to keep them separate.  Sometimes, combining them is essential:
+ * if a shader only accesses part of a compound variable (such as a matrix
+ * or array), the variable's base may not actually exist in the VUE map.
+ *
+ * This pass adds constant offsets to instr->const_index[0], and resets
+ * the offset source to 0.  Non-constant offsets remain unchanged - since
+ * we don't know what part of a compound variable is accessed, we allocate
+ * storage for the entire thing.
+ */
+struct add_const_offset_to_base_params {
    nir_builder b;
-   uint64_t inputs_read;
+   nir_variable_mode mode;
 };
 
 static bool
-remap_vs_attrs(nir_block *block, void *void_state)
+add_const_offset_to_base(nir_block *block, void *closure)
 {
-   struct remap_vs_attrs_state *state = void_state;
+   struct add_const_offset_to_base_params *params = closure;
+   nir_builder *b = &params->b;
 
    nir_foreach_instr_safe(block, instr) {
       if (instr->type != nir_instr_type_intrinsic)
@@ -43,30 +71,120 @@ remap_vs_attrs(nir_block *block, void *void_state)
 
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
+      if ((params->mode == nir_var_shader_in && is_input(intrin)) ||
+          (params->mode == nir_var_shader_out && is_output(intrin))) {
+         nir_src *offset = nir_get_io_offset_src(intrin);
+         nir_const_value *const_offset = nir_src_as_const_value(*offset);
+
+         if (const_offset) {
+            intrin->const_index[0] += const_offset->u[0];
+            b->cursor = nir_before_instr(&intrin->instr);
+            nir_instr_rewrite_src(&intrin->instr, offset,
+                                  nir_src_for_ssa(nir_imm_int(b, 0)));
+         }
+      }
+   }
+   return true;
+
+}
+
+static bool
+remap_vs_attrs(nir_block *block, void *closure)
+{
+   GLbitfield64 inputs_read = *((GLbitfield64 *) closure);
+
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
       if (intrin->intrinsic == nir_intrinsic_load_input) {
          /* Attributes come in a contiguous block, ordered by their
           * gl_vert_attrib value.  That means we can compute the slot
           * number for an attribute by masking out the enabled attributes
           * before it and counting the bits.
           */
-         nir_const_value *const_offset = nir_src_as_const_value(intrin->src[0]);
+         int attr = intrin->const_index[0];
+         int slot = _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(attr));
+
+         intrin->const_index[0] = 4 * slot;
+      }
+   }
+   return true;
+}
 
-         /* We set EmitNoIndirect for VS inputs, so there are no indirects. */
-         assert(const_offset);
+static bool
+remap_inputs_with_vue_map(nir_block *block, void *closure)
+{
+   const struct brw_vue_map *vue_map = closure;
 
-         int attr = intrin->const_index[0] + const_offset->u[0];
-         int slot = _mesa_bitcount_64(state->inputs_read &
-                                      BITFIELD64_MASK(attr));
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
 
-         /* The NIR -> FS pass will just add the base and offset together, so
-          * there's no reason to keep them separate.  Just put it all in
-          * const_index[0] and set the offset src[0] to load_const(0).
-          */
-         intrin->const_index[0] = 4 * slot;
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      if (intrin->intrinsic == nir_intrinsic_load_input ||
+          intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
+         int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
+         assert(vue_slot != -1);
+         intrin->const_index[0] = vue_slot;
+      }
+   }
+   return true;
+}
 
-         state->b.cursor = nir_before_instr(&intrin->instr);
-         nir_instr_rewrite_src(&intrin->instr, &intrin->src[0],
-                               nir_src_for_ssa(nir_imm_int(&state->b, 0)));
+struct remap_patch_urb_offsets_state {
+   nir_builder b;
+   struct brw_vue_map vue_map;
+};
+
+static bool
+remap_patch_urb_offsets(nir_block *block, void *closure)
+{
+   struct remap_patch_urb_offsets_state *state = closure;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      gl_shader_stage stage = state->b.shader->stage;
+
+      if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) ||
+          (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) {
+         int vue_slot = state->vue_map.varying_to_slot[intrin->const_index[0]];
+         assert(vue_slot != -1);
+         intrin->const_index[0] = vue_slot;
+
+         nir_src *vertex = nir_get_io_vertex_index_src(intrin);
+         if (vertex) {
+            nir_const_value *const_vertex = nir_src_as_const_value(*vertex);
+            if (const_vertex) {
+               intrin->const_index[0] += const_vertex->u[0] *
+                                         state->vue_map.num_per_vertex_slots;
+            } else {
+               state->b.cursor = nir_before_instr(&intrin->instr);
+
+               /* Multiply by the number of per-vertex slots. */
+               nir_ssa_def *vertex_offset =
+                  nir_imul(&state->b,
+                           nir_ssa_for_src(&state->b, *vertex, 1),
+                           nir_imm_int(&state->b,
+                                       state->vue_map.num_per_vertex_slots));
+
+               /* Add it to the existing offset */
+               nir_src *offset = nir_get_io_offset_src(intrin);
+               nir_ssa_def *total_offset =
+                  nir_iadd(&state->b, vertex_offset,
+                           nir_ssa_for_src(&state->b, *offset, 1));
+
+               nir_instr_rewrite_src(&intrin->instr, offset,
+                                     nir_src_for_ssa(total_offset));
+            }
+         }
       }
    }
    return true;
@@ -77,6 +195,10 @@ brw_nir_lower_inputs(nir_shader *nir,
                      const struct brw_device_info *devinfo,
                      bool is_scalar)
 {
+   struct add_const_offset_to_base_params params = {
+      .mode = nir_var_shader_in
+   };
+
    switch (nir->stage) {
    case MESA_SHADER_VERTEX:
       /* Start with the location of the variable's base. */
@@ -97,23 +219,23 @@ brw_nir_lower_inputs(nir_shader *nir,
           * key->inputs_read since the two are identical aside from Gen4-5
           * edge flag differences.
           */
-         struct remap_vs_attrs_state remap_state = {
-            .inputs_read = nir->info.inputs_read,
-         };
+         GLbitfield64 inputs_read = nir->info.inputs_read;
 
          /* This pass needs actual constants */
          nir_opt_constant_folding(nir);
 
          nir_foreach_overload(nir, overload) {
             if (overload->impl) {
-               nir_builder_init(&remap_state.b, overload->impl);
-               nir_foreach_block(overload->impl, remap_vs_attrs, &remap_state);
+               nir_builder_init(&params.b, overload->impl);
+               nir_foreach_block(overload->impl, add_const_offset_to_base, &params);
+               nir_foreach_block(overload->impl, remap_vs_attrs, &inputs_read);
             }
          }
       }
       break;
+   case MESA_SHADER_TESS_CTRL:
    case MESA_SHADER_GEOMETRY: {
-      if (!is_scalar) {
+      if (!is_scalar && nir->stage == MESA_SHADER_GEOMETRY) {
          foreach_list_typed(nir_variable, var, node, &nir->inputs) {
             var->data.driver_location = var->data.location;
          }
@@ -135,17 +257,52 @@ brw_nir_lower_inputs(nir_shader *nir,
          GLbitfield64 inputs_read =
             nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID;
          brw_compute_vue_map(devinfo, &input_vue_map, inputs_read,
-                             nir->info.separate_shader);
+                             nir->info.separate_shader ||
+                             nir->stage == MESA_SHADER_TESS_CTRL);
 
-         /* Start with the slot for the variable's base. */
          foreach_list_typed(nir_variable, var, node, &nir->inputs) {
-            assert(input_vue_map.varying_to_slot[var->data.location] != -1);
-            var->data.driver_location =
-               input_vue_map.varying_to_slot[var->data.location];
+            var->data.driver_location = var->data.location;
          }
 
          /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
          nir_lower_io(nir, nir_var_shader_in, type_size_vec4);
+
+         /* This pass needs actual constants */
+         nir_opt_constant_folding(nir);
+
+         nir_foreach_overload(nir, overload) {
+            if (overload->impl) {
+               nir_builder_init(&params.b, overload->impl);
+               nir_foreach_block(overload->impl, add_const_offset_to_base, &params);
+               nir_foreach_block(overload->impl, remap_inputs_with_vue_map,
+                                 &input_vue_map);
+            }
+         }
+      }
+      break;
+   }
+   case MESA_SHADER_TESS_EVAL: {
+      struct remap_patch_urb_offsets_state state;
+      brw_compute_tess_vue_map(&state.vue_map,
+                               nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
+                               nir->info.patch_inputs_read);
+
+      foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+         var->data.driver_location = var->data.location;
+      }
+
+      nir_lower_io(nir, nir_var_shader_in, type_size_vec4);
+
+      /* This pass needs actual constants */
+      nir_opt_constant_folding(nir);
+
+      nir_foreach_overload(nir, overload) {
+         if (overload->impl) {
+            nir_builder_init(&params.b, overload->impl);
+            nir_foreach_block(overload->impl, add_const_offset_to_base, &params);
+            nir_builder_init(&state.b, overload->impl);
+            nir_foreach_block(overload->impl, remap_patch_urb_offsets, &state);
+         }
       }
       break;
    }
@@ -164,10 +321,13 @@ brw_nir_lower_inputs(nir_shader *nir,
 }
 
 static void
-brw_nir_lower_outputs(nir_shader *nir, bool is_scalar)
+brw_nir_lower_outputs(nir_shader *nir,
+                      const struct brw_device_info *devinfo,
+                      bool is_scalar)
 {
    switch (nir->stage) {
    case MESA_SHADER_VERTEX:
+   case MESA_SHADER_TESS_EVAL:
    case MESA_SHADER_GEOMETRY:
       if (is_scalar) {
          nir_assign_var_locations(&nir->outputs, &nir->num_outputs,
@@ -178,6 +338,34 @@ brw_nir_lower_outputs(nir_shader *nir, bool is_scalar)
             var->data.driver_location = var->data.location;
       }
       break;
+   case MESA_SHADER_TESS_CTRL: {
+      struct add_const_offset_to_base_params params = {
+         .mode = nir_var_shader_out
+      };
+
+      struct remap_patch_urb_offsets_state state;
+      brw_compute_tess_vue_map(&state.vue_map, nir->info.outputs_written,
+                               nir->info.patch_outputs_written);
+
+      nir_foreach_variable(var, &nir->outputs) {
+         var->data.driver_location = var->data.location;
+      }
+
+      nir_lower_io(nir, nir_var_shader_out, type_size_vec4);
+
+      /* This pass needs actual constants */
+      nir_opt_constant_folding(nir);
+
+      nir_foreach_overload(nir, overload) {
+         if (overload->impl) {
+            nir_builder_init(&params.b, overload->impl);
+            nir_foreach_block(overload->impl, add_const_offset_to_base, &params);
+            nir_builder_init(&state.b, overload->impl);
+            nir_foreach_block(overload->impl, remap_patch_urb_offsets, &state);
+         }
+      }
+      break;
+   }
    case MESA_SHADER_FRAGMENT:
       nir_assign_var_locations(&nir->outputs, &nir->num_outputs,
                                type_size_scalar);
@@ -328,37 +516,19 @@ brw_preprocess_nir(nir_shader *nir, bool is_scalar)
    return nir;
 }
 
-/* Lowers inputs, outputs, uniforms, and samplers for i965
- *
- * This function does all of the standard lowering prior to post-processing.
- * The lowering done is highly gen, stage, and backend-specific.  The
- * shader_prog parameter is optional and is used only for lowering sampler
- * derefs and atomics for GLSL shaders.
- */
+/** Lower input and output loads and stores for i965. */
 nir_shader *
-brw_lower_nir(nir_shader *nir,
-              const struct brw_device_info *devinfo,
-              const struct gl_shader_program *shader_prog,
-              bool is_scalar)
+brw_nir_lower_io(nir_shader *nir,
+                 const struct brw_device_info *devinfo,
+                 bool is_scalar)
 {
    bool progress; /* Written by OPT and OPT_V */
    (void)progress;
 
    OPT_V(brw_nir_lower_inputs, devinfo, is_scalar);
-   OPT_V(brw_nir_lower_outputs, is_scalar);
-   //OPT_V(brw_nir_lower_uniforms, is_scalar);
+   OPT_V(brw_nir_lower_outputs, devinfo, is_scalar);
    OPT_V(nir_lower_io, nir_var_all, is_scalar ? type_size_scalar : type_size_vec4);
 
-   if (shader_prog) {
-      OPT_V(nir_lower_samplers, shader_prog);
-   }
-
-   OPT(nir_lower_system_values);
-
-   if (shader_prog) {
-      OPT_V(nir_lower_atomics, shader_prog);
-   }
-
    return nir_optimize(nir, is_scalar);
 }
 
@@ -457,7 +627,19 @@ brw_create_nir(struct brw_context *brw,
    (void)progress;
 
    nir = brw_preprocess_nir(nir, is_scalar);
-   nir = brw_lower_nir(nir, devinfo, shader_prog, is_scalar);
+
+   OPT(nir_lower_system_values);
+   OPT_V(brw_nir_lower_uniforms, is_scalar);
+
+   if (shader_prog) {
+      OPT_V(nir_lower_samplers, shader_prog);
+      OPT_V(nir_lower_atomics, shader_prog);
+   }
+
+   if (nir->stage != MESA_SHADER_TESS_CTRL &&
+       nir->stage != MESA_SHADER_TESS_EVAL) {
+      nir = brw_nir_lower_io(nir, devinfo, is_scalar);
+   }
 
    return nir;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index 0a8a5a280b1..78b139b991d 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -82,10 +82,9 @@ nir_shader *brw_create_nir(struct brw_context *brw,
                            bool is_scalar);
 
 nir_shader *brw_preprocess_nir(nir_shader *nir, bool is_scalar);
-nir_shader *brw_lower_nir(nir_shader *nir,
-                          const struct brw_device_info *devinfo,
-                          const struct gl_shader_program *shader_prog,
-                          bool is_scalar);
+nir_shader *brw_nir_lower_io(nir_shader *nir,
+                            const struct brw_device_info *devinfo,
+                            bool is_scalar);
 nir_shader *brw_postprocess_nir(nir_shader *nir,
                                 const struct brw_device_info *devinfo,
                                 bool is_scalar);
diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
index c995d2b7e2d..f4d23d81260 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
@@ -109,9 +109,6 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
          uint8_t resolve_status;
          nir_alu_instr *alu = nir_instr_as_alu(instr);
          switch (alu->op) {
-         case nir_op_bany2:
-         case nir_op_bany3:
-         case nir_op_bany4:
          case nir_op_ball_fequal2:
          case nir_op_ball_iequal2:
          case nir_op_ball_fequal3:
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index ae3d8188325..6c636d26139 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -97,7 +97,8 @@ void
 brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
 {
    if (brw->gen >= 8) {
-      gen8_add_cs_stall_workaround_bits(&flags);
+      if (brw->gen == 8)
+         gen8_add_cs_stall_workaround_bits(&flags);
 
       BEGIN_BATCH(6);
       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
@@ -141,7 +142,8 @@ brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                             uint32_t imm_lower, uint32_t imm_upper)
 {
    if (brw->gen >= 8) {
-      gen8_add_cs_stall_workaround_bits(&flags);
+      if (brw->gen == 8)
+         gen8_add_cs_stall_workaround_bits(&flags);
 
       BEGIN_BATCH(6);
       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h
index 339b8e19ec5..059ccf8bd39 100644
--- a/src/mesa/drivers/dri/i965/brw_program.h
+++ b/src/mesa/drivers/dri/i965/brw_program.h
@@ -56,6 +56,11 @@ void
 brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog,
             struct gl_shader *shader, struct gl_program *prog);
 
+void brw_upload_tcs_prog(struct brw_context *brw,
+                         uint64_t per_vertex_slots, uint32_t per_patch_slots);
+void brw_upload_tes_prog(struct brw_context *brw,
+                         uint64_t per_vertex_slots, uint32_t per_patch_slots);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index fa912c96c36..9f2ff9ae5ad 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -84,6 +84,7 @@ struct brw_device_info;
 #define BRW_SWIZZLE_YZXW      BRW_SWIZZLE4(1,2,0,3)
 #define BRW_SWIZZLE_ZXYW      BRW_SWIZZLE4(2,0,1,3)
 #define BRW_SWIZZLE_ZWZW      BRW_SWIZZLE4(2,3,2,3)
+#define BRW_SWIZZLE_WZYX      BRW_SWIZZLE4(3,2,1,0)
 
 static inline bool
 brw_is_single_value_swizzle(unsigned swiz)
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index 3f29e2fc105..d181468f5cb 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -654,7 +654,7 @@ const struct brw_tracked_state brw_gs_samplers = {
 static void
 brw_upload_tcs_samplers(struct brw_context *brw)
 {
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program;
    if (!tcs)
       return;
@@ -667,7 +667,7 @@ const struct brw_tracked_state brw_tcs_samplers = {
    .dirty = {
       .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_TESS_CTRL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tcs_samplers,
 };
@@ -676,7 +676,7 @@ const struct brw_tracked_state brw_tcs_samplers = {
 static void
 brw_upload_tes_samplers(struct brw_context *brw)
 {
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_program *tes = (struct gl_program *) brw->tess_eval_program;
    if (!tes)
       return;
@@ -689,7 +689,7 @@ const struct brw_tracked_state brw_tes_samplers = {
    .dirty = {
       .mesa = _NEW_TEXTURE,
       .brw = BRW_NEW_BATCH |
-             BRW_NEW_TESS_EVAL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tes_samplers,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index d051e124584..836cf0a69e9 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -24,6 +24,7 @@
 #include "brw_context.h"
 #include "brw_cfg.h"
 #include "brw_eu.h"
+#include "brw_fs.h"
 #include "brw_nir.h"
 #include "glsl/glsl_parser_extras.h"
 #include "main/shaderobj.h"
@@ -84,6 +85,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 
    compiler->scalar_stage[MESA_SHADER_VERTEX] =
       devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
+   compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = true;
    compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
       devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false);
    compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
@@ -137,6 +140,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
       compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
    }
 
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
+
    if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
       compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
 
@@ -548,6 +554,21 @@ brw_instruction_name(enum opcode op)
       return "mulh";
    case SHADER_OPCODE_MOV_INDIRECT:
       return "mov_indirect";
+
+   case VEC4_OPCODE_URB_READ:
+      return "urb_read";
+   case TCS_OPCODE_GET_INSTANCE_ID:
+      return "tcs_get_instance_id";
+   case TCS_OPCODE_URB_WRITE:
+      return "tcs_urb_write";
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+      return "tcs_set_input_urb_offsets";
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+      return "tcs_set_output_urb_offsets";
+   case TCS_OPCODE_GET_PRIMITIVE_ID:
+      return "tcs_get_primitive_id";
+   case TCS_OPCODE_CREATE_BARRIER_HEADER:
+      return "tcs_create_barrier_header";
    }
 
    unreachable("not reached");
@@ -1292,3 +1313,96 @@ gl_clip_plane *brw_select_clip_planes(struct gl_context *ctx)
    }
 }
 
+extern "C" const unsigned *
+brw_compile_tes(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tes_prog_key *key,
+                struct brw_tes_prog_data *prog_data,
+                const nir_shader *src_shader,
+                struct gl_shader_program *shader_prog,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str)
+{
+   const struct brw_device_info *devinfo = compiler->devinfo;
+   struct gl_shader *shader =
+      shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL];
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
+   nir->info.inputs_read = key->inputs_read;
+   nir->info.patch_inputs_read = key->patch_inputs_read;
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
+   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
+
+   brw_compute_vue_map(devinfo, &prog_data->base.vue_map,
+                       nir->info.outputs_written,
+                       nir->info.separate_shader);
+
+   unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;
+
+   assert(output_size_bytes >= 1);
+   if (output_size_bytes > GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, "DS outputs exceed maximum size");
+      return NULL;
+   }
+
+   /* URB entry sizes are stored as a multiple of 64 bytes. */
+   prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+
+   struct brw_vue_map input_vue_map;
+   brw_compute_tess_vue_map(&input_vue_map,
+                            nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
+                            nir->info.patch_inputs_read);
+
+   bool need_patch_header = nir->info.system_values_read &
+      (BITFIELD64_BIT(SYSTEM_VALUE_TESS_LEVEL_OUTER) |
+       BITFIELD64_BIT(SYSTEM_VALUE_TESS_LEVEL_INNER));
+
+   /* The TES will pull most inputs using URB read messages.
+    *
+    * However, we push the patch header for TessLevel factors when required,
+    * as it's a tiny amount of extra data.
+    */
+   prog_data->base.urb_read_length = need_patch_header ? 1 : 0;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
+      fprintf(stderr, "TES Input ");
+      brw_print_vue_map(stderr, &input_vue_map);
+      fprintf(stderr, "TES Output ");
+      brw_print_vue_map(stderr, &prog_data->base.vue_map);
+   }
+
+   if (is_scalar) {
+      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
+                   &prog_data->base.base, shader->Program, nir, 8,
+                   shader_time_index, &input_vue_map);
+      if (!v.run_tes()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+                     &prog_data->base.base, v.promoted_constants, false,
+                     "TES");
+      if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
+         g.enable_debug(ralloc_asprintf(mem_ctx,
+                                        "%s tessellation evaluation shader %s",
+                                        nir->info.label ? nir->info.label
+                                                        : "unnamed",
+                                        nir->info.name));
+      }
+
+      g.generate_code(v.cfg, 8);
+
+      return g.get_assembly(final_assembly_size);
+   } else {
+      unreachable("XXX: vec4 tessellation evalation shaders not merged yet.");
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 0b77dc24539..82374a46c18 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -270,6 +270,12 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage,
 bool brw_vs_precompile(struct gl_context *ctx,
                        struct gl_shader_program *shader_prog,
                        struct gl_program *prog);
+bool brw_tcs_precompile(struct gl_context *ctx,
+                        struct gl_shader_program *shader_prog,
+                        struct gl_program *prog);
+bool brw_tes_precompile(struct gl_context *ctx,
+                        struct gl_shader_program *shader_prog,
+                        struct gl_program *prog);
 bool brw_gs_precompile(struct gl_context *ctx,
                        struct gl_shader_program *shader_prog,
                        struct gl_program *prog);
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index 3d3a6cf943a..46667884125 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -319,10 +319,13 @@ dump_gen8_surface_state(struct brw_context *brw, uint32_t offset, int index)
              GET_FIELD(surf[4], GEN7_SURFACE_MIN_ARRAY_ELEMENT),
              GET_FIELD(surf[4], GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT) + 1,
              1 << GET_BITS(surf[4], 5, 3));
-   batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d\n",
+   batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d,"
+             " tr_mode (gen9+): %d, mip tail (gen9+): %d\n",
              GET_FIELD(surf[5], BRW_SURFACE_X_OFFSET),
              GET_FIELD(surf[5], BRW_SURFACE_Y_OFFSET),
-             GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD));
+             GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD),
+             GET_FIELD(surf[5], GEN9_SURFACE_TRMODE),
+             GET_FIELD(surf[5], GEN9_SURFACE_MIP_TAIL_START_LOD));
    batch_out(brw, name, offset, 6, "AUX pitch: %d qpitch: %d\n",
              GET_FIELD(surf[6], GEN8_SURFACE_AUX_QPITCH) << 2,
              GET_FIELD(surf[6], GEN8_SURFACE_AUX_PITCH) << 2);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index cf3cf97daea..81a67d284e0 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -517,6 +517,7 @@ void brw_init_state( struct brw_context *brw )
    ctx->DriverFlags.NewTextureBuffer = BRW_NEW_TEXTURE_BUFFER;
    ctx->DriverFlags.NewAtomicBuffer = BRW_NEW_ATOMIC_BUFFER;
    ctx->DriverFlags.NewImageUnits = BRW_NEW_IMAGE_UNITS;
+   ctx->DriverFlags.NewDefaultTessLevels = BRW_NEW_DEFAULT_TESS_LEVELS;
 }
 
 
@@ -607,8 +608,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_URB_FENCE),
    DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM),
    DEFINE_BIT(BRW_NEW_GEOMETRY_PROGRAM),
-   DEFINE_BIT(BRW_NEW_TESS_EVAL_PROGRAM),
-   DEFINE_BIT(BRW_NEW_TESS_CTRL_PROGRAM),
+   DEFINE_BIT(BRW_NEW_TESS_PROGRAMS),
    DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM),
    DEFINE_BIT(BRW_NEW_CURBE_OFFSETS),
    DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE),
@@ -620,6 +620,7 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_BINDING_TABLE_POINTERS),
    DEFINE_BIT(BRW_NEW_INDICES),
    DEFINE_BIT(BRW_NEW_VERTICES),
+   DEFINE_BIT(BRW_NEW_DEFAULT_TESS_LEVELS),
    DEFINE_BIT(BRW_NEW_BATCH),
    DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
    DEFINE_BIT(BRW_NEW_VS_CONSTBUF),
@@ -673,11 +674,40 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map)
 }
 
 static inline void
+brw_upload_tess_programs(struct brw_context *brw)
+{
+   if (brw->tess_eval_program) {
+      uint64_t per_vertex_slots = brw->tess_eval_program->Base.InputsRead;
+      uint32_t per_patch_slots =
+         brw->tess_eval_program->Base.PatchInputsRead;
+
+      /* The TCS may have additional outputs which aren't read by the
+       * TES (possibly for cross-thread communication).  These need to
+       * be stored in the Patch URB Entry as well.
+       */
+      if (brw->tess_ctrl_program) {
+         per_vertex_slots |= brw->tess_ctrl_program->Base.OutputsWritten;
+         per_patch_slots |=
+            brw->tess_ctrl_program->Base.PatchOutputsWritten;
+      }
+
+      brw_upload_tcs_prog(brw, per_vertex_slots, per_patch_slots);
+      brw_upload_tes_prog(brw, per_vertex_slots, per_patch_slots);
+   } else {
+      brw->tcs.prog_data = NULL;
+      brw->tcs.base.prog_data = NULL;
+      brw->tes.prog_data = NULL;
+      brw->tes.base.prog_data = NULL;
+   }
+}
+
+static inline void
 brw_upload_programs(struct brw_context *brw,
                     enum brw_pipeline pipeline)
 {
    if (pipeline == BRW_RENDER_PIPELINE) {
       brw_upload_vs_prog(brw);
+      brw_upload_tess_programs(brw);
 
       if (brw->gen < 6)
          brw_upload_ff_gs_prog(brw);
@@ -691,6 +721,8 @@ brw_upload_programs(struct brw_context *brw,
       bool old_separate = brw->vue_map_geom_out.separate;
       if (brw->geometry_program)
          brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map;
+      else if (brw->tess_eval_program)
+         brw->vue_map_geom_out = brw->tes.prog_data->base.vue_map;
       else
          brw->vue_map_geom_out = brw->vs.prog_data->base.vue_map;
 
@@ -750,12 +782,12 @@ brw_upload_pipeline_state(struct brw_context *brw,
 
       if (brw->tess_eval_program != ctx->TessEvalProgram._Current) {
          brw->tess_eval_program = ctx->TessEvalProgram._Current;
-         brw->ctx.NewDriverState |= BRW_NEW_TESS_EVAL_PROGRAM;
+         brw->ctx.NewDriverState |= BRW_NEW_TESS_PROGRAMS;
       }
 
       if (brw->tess_ctrl_program != ctx->TessCtrlProgram._Current) {
          brw->tess_ctrl_program = ctx->TessCtrlProgram._Current;
-         brw->ctx.NewDriverState |= BRW_NEW_TESS_CTRL_PROGRAM;
+         brw->ctx.NewDriverState |= BRW_NEW_TESS_PROGRAMS;
       }
 
       if (brw->geometry_program != ctx->GeometryProgram._Current) {
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c
index b7d9078d87d..9730078aeab 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -395,6 +395,7 @@ brw_format_for_mesa_format(mesa_format mesa_format)
       [MESA_FORMAT_A8R8G8B8_SRGB] = 0,
       [MESA_FORMAT_R8G8B8A8_SRGB] = BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB,
       [MESA_FORMAT_X8R8G8B8_SRGB] = 0,
+      [MESA_FORMAT_B8G8R8X8_SRGB] = BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB,
       [MESA_FORMAT_L_SRGB8] = BRW_SURFACEFORMAT_L8_UNORM_SRGB,
       [MESA_FORMAT_L8A8_SRGB] = BRW_SURFACEFORMAT_L8A8_UNORM_SRGB,
       [MESA_FORMAT_A8L8_SRGB] = 0,
@@ -660,6 +661,10 @@ brw_init_surface_formats(struct brw_context *brw)
          if (gen < tinfo->render_target)
             render = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 	 break;
+      case BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB:
+         if (gen < tinfo->render_target)
+            render = BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
+         break;
       case BRW_SURFACEFORMAT_R8G8B8X8_UNORM:
          render = BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
new file mode 100644
index 00000000000..2c925e7f572
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_tcs.c
+ *
+ * Tessellation control shader state upload code.
+ */
+
+#include "brw_context.h"
+#include "brw_nir.h"
+#include "brw_program.h"
+#include "brw_shader.h"
+#include "brw_state.h"
+#include "program/prog_parameter.h"
+
+static void
+brw_tcs_debug_recompile(struct brw_context *brw,
+                       struct gl_shader_program *shader_prog,
+                       const struct brw_tcs_prog_key *key)
+{
+   struct brw_cache_item *c = NULL;
+   const struct brw_tcs_prog_key *old_key = NULL;
+   bool found = false;
+
+   perf_debug("Recompiling tessellation control shader for program %d\n",
+              shader_prog->Name);
+
+   for (unsigned int i = 0; i < brw->cache.size; i++) {
+      for (c = brw->cache.items[i]; c; c = c->next) {
+         if (c->cache_id == BRW_CACHE_TCS_PROG) {
+            old_key = c->key;
+
+            if (old_key->program_string_id == key->program_string_id)
+               break;
+         }
+      }
+      if (c)
+         break;
+   }
+
+   if (!c) {
+      perf_debug("  Didn't find previous compile in the shader cache for "
+                 "debug\n");
+      return;
+   }
+
+   found |= key_debug(brw, "input vertices", old_key->input_vertices,
+                      key->input_vertices);
+   found |= key_debug(brw, "outputs written", old_key->outputs_written,
+                      key->outputs_written);
+   found |= key_debug(brw, "patch outputs written", old_key->patch_outputs_written,
+                      key->patch_outputs_written);
+   found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode,
+                      key->tes_primitive_mode);
+   found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
+
+   if (!found) {
+      perf_debug("  Something else\n");
+   }
+}
+
+static bool
+brw_codegen_tcs_prog(struct brw_context *brw,
+                     struct gl_shader_program *shader_prog,
+                     struct brw_tess_ctrl_program *tcp,
+                     struct brw_tcs_prog_key *key)
+{
+   struct gl_context *ctx = &brw->ctx;
+   const struct brw_compiler *compiler = brw->intelScreen->compiler;
+   struct brw_stage_state *stage_state = &brw->tcs.base;
+   nir_shader *nir;
+   struct brw_tcs_prog_data prog_data;
+   bool start_busy = false;
+   double start_time = 0;
+
+   if (tcp) {
+      nir = tcp->program.Base.nir;
+   } else {
+      /* Create a dummy nir_shader.  We won't actually use NIR code to
+       * generate assembly (it's easier to generate assembly directly),
+       * but the whole compiler assumes one of these exists.
+       */
+      const nir_shader_compiler_options *options =
+         ctx->Const.ShaderCompilerOptions[MESA_SHADER_TESS_CTRL].NirOptions;
+      nir = nir_shader_create(NULL, MESA_SHADER_TESS_CTRL, options);
+      nir->num_uniforms = 2; /* both halves of the patch header */
+      nir->info.outputs_written = key->outputs_written;
+      nir->info.inputs_read = key->outputs_written;
+      nir->info.tcs.vertices_out = key->input_vertices;
+      nir->info.name = ralloc_strdup(nir, "passthrough");
+   }
+
+   memset(&prog_data, 0, sizeof(prog_data));
+
+   /* Allocate the references to the uniforms that will end up in the
+    * prog_data associated with the compiled program, and which will be freed
+    * by the state cache.
+    *
+    * Note: param_count needs to be num_uniform_components * 4, since we add
+    * padding around uniform values below vec4 size, so the worst case is that
+    * every uniform is a float which gets padded to the size of a vec4.
+    */
+   struct gl_shader *tcs = shader_prog ?
+      shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL] : NULL;
+   int param_count = nir->num_uniforms;
+   if (!compiler->scalar_stage[MESA_SHADER_TESS_CTRL])
+      param_count *= 4;
+
+   prog_data.base.base.param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.pull_param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.nr_params = param_count;
+
+   if (tcs) {
+      prog_data.base.base.image_param =
+         rzalloc_array(NULL, struct brw_image_param, tcs->NumImages);
+      prog_data.base.base.nr_image_params = tcs->NumImages;
+
+      brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base,
+                                  &prog_data.base.base, false);
+   } else {
+      /* Upload the Patch URB Header as the first two uniforms.
+       * Do the annoying scrambling so the shader doesn't have to.
+       */
+      const float **param = (const float **) prog_data.base.base.param;
+      static float zero = 0.0f;
+      for (int i = 0; i < 4; i++) {
+         param[7 - i] = &ctx->TessCtrlProgram.patch_default_outer_level[i];
+      }
+
+      if (key->tes_primitive_mode == GL_QUADS) {
+         param[3] = &ctx->TessCtrlProgram.patch_default_inner_level[0];
+         param[2] = &ctx->TessCtrlProgram.patch_default_inner_level[1];
+         param[1] = &zero;
+         param[0] = &zero;
+      } else if (key->tes_primitive_mode == GL_TRIANGLES) {
+         param[4] = &ctx->TessCtrlProgram.patch_default_inner_level[0];
+         for (int i = 0; i < 4; i++)
+            param[i] = &zero;
+      }
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS) && tcs)
+      brw_dump_ir("tessellation control", shader_prog, tcs, NULL);
+
+   int st_index = -1;
+   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+      st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TCS);
+
+   if (unlikely(brw->perf_debug)) {
+      start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo);
+      start_time = get_time();
+   }
+
+   void *mem_ctx = ralloc_context(NULL);
+   unsigned program_size;
+   char *error_str;
+   const unsigned *program =
+      brw_compile_tcs(compiler, brw, mem_ctx, key, &prog_data, nir, st_index,
+                      &program_size, &error_str);
+   if (program == NULL) {
+      if (shader_prog) {
+         shader_prog->LinkStatus = false;
+         ralloc_strcat(&shader_prog->InfoLog, error_str);
+      } else {
+         ralloc_free(nir);
+      }
+
+      _mesa_problem(NULL, "Failed to compile tessellation control shader: "
+                    "%s\n", error_str);
+
+      ralloc_free(mem_ctx);
+      return false;
+   }
+
+   if (unlikely(brw->perf_debug)) {
+      struct brw_shader *btcs = (struct brw_shader *) tcs;
+      if (btcs->compiled_once) {
+         brw_tcs_debug_recompile(brw, shader_prog, key);
+      }
+      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
+         perf_debug("TCS compile took %.03f ms and stalled the GPU\n",
+                    (get_time() - start_time) * 1000);
+      }
+      btcs->compiled_once = true;
+   }
+
+   /* Scratch space is used for register spilling */
+   if (prog_data.base.base.total_scratch) {
+      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
+			 prog_data.base.base.total_scratch *
+                         brw->max_hs_threads);
+   }
+
+   brw_upload_cache(&brw->cache, BRW_CACHE_TCS_PROG,
+                    key, sizeof(*key),
+                    program, program_size,
+                    &prog_data, sizeof(prog_data),
+                    &stage_state->prog_offset, &brw->tcs.prog_data);
+   ralloc_free(mem_ctx);
+   if (!tcs)
+      ralloc_free(nir);
+
+   return true;
+}
+
+
+void
+brw_upload_tcs_prog(struct brw_context *brw,
+                    uint64_t per_vertex_slots,
+                    uint32_t per_patch_slots)
+{
+   struct gl_context *ctx = &brw->ctx;
+   struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
+   struct brw_stage_state *stage_state = &brw->tcs.base;
+   struct brw_tcs_prog_key key;
+   /* BRW_NEW_TESS_PROGRAMS */
+   struct brw_tess_ctrl_program *tcp =
+      (struct brw_tess_ctrl_program *) brw->tess_ctrl_program;
+   struct brw_tess_eval_program *tep =
+      (struct brw_tess_eval_program *) brw->tess_eval_program;
+   assert(tep);
+
+   if (!brw_state_dirty(brw,
+                        _NEW_TEXTURE,
+                        BRW_NEW_PATCH_PRIMITIVE |
+                        BRW_NEW_TESS_PROGRAMS))
+      return;
+
+   struct gl_program *prog = &tcp->program.Base;
+
+   memset(&key, 0, sizeof(key));
+
+   key.input_vertices = ctx->TessCtrlProgram.patch_vertices;
+   key.outputs_written = per_vertex_slots;
+   key.patch_outputs_written = per_patch_slots;
+
+   /* We need to specialize our code generation for tessellation levels
+    * based on the domain the DS is expecting to tessellate.
+    */
+   key.tes_primitive_mode = tep->program.PrimitiveMode;
+
+   if (tcp) {
+      key.program_string_id = tcp->id;
+
+      /* _NEW_TEXTURE */
+      brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count,
+                                         &key.tex);
+   } else {
+      key.outputs_written = tep->program.Base.InputsRead;
+   }
+
+
+   if (!brw_search_cache(&brw->cache, BRW_CACHE_TCS_PROG,
+                         &key, sizeof(key),
+                         &stage_state->prog_offset, &brw->tcs.prog_data)) {
+      bool success = brw_codegen_tcs_prog(brw, current[MESA_SHADER_TESS_CTRL],
+                                          tcp, &key);
+      assert(success);
+      (void)success;
+   }
+   brw->tcs.base.prog_data = &brw->tcs.prog_data->base.base;
+}
+
+
+bool
+brw_tcs_precompile(struct gl_context *ctx,
+                   struct gl_shader_program *shader_prog,
+                   struct gl_program *prog)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_tcs_prog_key key;
+   uint32_t old_prog_offset = brw->tcs.base.prog_offset;
+   struct brw_tcs_prog_data *old_prog_data = brw->tcs.prog_data;
+   bool success;
+
+   struct gl_tess_ctrl_program *tcp = (struct gl_tess_ctrl_program *)prog;
+   struct brw_tess_ctrl_program *btcp = brw_tess_ctrl_program(tcp);
+
+   memset(&key, 0, sizeof(key));
+
+   key.program_string_id = btcp->id;
+   brw_setup_tex_for_precompile(brw, &key.tex, prog);
+
+   /* Guess that the input and output patches have the same dimensionality. */
+   key.input_vertices = shader_prog->TessCtrl.VerticesOut;
+
+   key.tes_primitive_mode = GL_TRIANGLES;
+
+   key.outputs_written = prog->OutputsWritten;
+   key.patch_outputs_written = prog->PatchOutputsWritten;
+
+   success = brw_codegen_tcs_prog(brw, shader_prog, btcp, &key);
+
+   brw->tcs.base.prog_offset = old_prog_offset;
+   brw->tcs.prog_data = old_prog_data;
+
+   return success;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c b/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c
index 115c5abd391..28cef3ca589 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c
@@ -39,7 +39,7 @@ brw_upload_tcs_pull_constants(struct brw_context *brw)
 {
    struct brw_stage_state *stage_state = &brw->tcs.base;
 
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct brw_tess_ctrl_program *tcp =
       (struct brw_tess_ctrl_program *) brw->tess_ctrl_program;
 
@@ -59,7 +59,7 @@ const struct brw_tracked_state brw_tcs_pull_constants = {
       .mesa = _NEW_PROGRAM_CONSTANTS,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_TCS_PROG_DATA |
-             BRW_NEW_TESS_CTRL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tcs_pull_constants,
 };
@@ -122,7 +122,7 @@ static void
 brw_upload_tcs_image_surfaces(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_shader_program *prog =
       ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL];
 
@@ -138,7 +138,7 @@ const struct brw_tracked_state brw_tcs_image_surfaces = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_TCS_PROG_DATA |
              BRW_NEW_IMAGE_UNITS |
-             BRW_NEW_TESS_CTRL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tcs_image_surfaces,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c
new file mode 100644
index 00000000000..27dc7e59f5d
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_tes.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_tes.c
+ *
+ * Tessellation evaluation shader state upload code.
+ */
+
+#include "brw_context.h"
+#include "brw_nir.h"
+#include "brw_program.h"
+#include "brw_shader.h"
+#include "brw_state.h"
+#include "program/prog_parameter.h"
+
+static void
+brw_tes_debug_recompile(struct brw_context *brw,
+                       struct gl_shader_program *shader_prog,
+                       const struct brw_tes_prog_key *key)
+{
+   struct brw_cache_item *c = NULL;
+   const struct brw_tes_prog_key *old_key = NULL;
+   bool found = false;
+
+   perf_debug("Recompiling tessellation evaluation shader for program %d\n",
+              shader_prog->Name);
+
+   for (unsigned int i = 0; i < brw->cache.size; i++) {
+      for (c = brw->cache.items[i]; c; c = c->next) {
+         if (c->cache_id == BRW_CACHE_TES_PROG) {
+            old_key = c->key;
+
+            if (old_key->program_string_id == key->program_string_id)
+               break;
+         }
+      }
+      if (c)
+         break;
+   }
+
+   if (!c) {
+      perf_debug("  Didn't find previous compile in the shader cache for "
+                 "debug\n");
+      return;
+   }
+
+   found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
+   found |= key_debug(brw, "inputs read", old_key->inputs_read,
+                      key->inputs_read);
+   found |= key_debug(brw, "patch inputs read", old_key->patch_inputs_read,
+                      key->patch_inputs_read);
+
+   if (!found) {
+      perf_debug("  Something else\n");
+   }
+}
+
+static bool
+brw_codegen_tes_prog(struct brw_context *brw,
+                     struct gl_shader_program *shader_prog,
+                     struct brw_tess_eval_program *tep,
+                     struct brw_tes_prog_key *key)
+{
+   const struct brw_compiler *compiler = brw->intelScreen->compiler;
+   const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
+   struct brw_stage_state *stage_state = &brw->tes.base;
+   nir_shader *nir = tep->program.Base.nir;
+   struct brw_tes_prog_data prog_data;
+   bool start_busy = false;
+   double start_time = 0;
+
+   memset(&prog_data, 0, sizeof(prog_data));
+
+   brw_assign_common_binding_table_offsets(MESA_SHADER_TESS_EVAL, devinfo,
+                                           shader_prog, &tep->program.Base,
+                                           &prog_data.base.base, 0);
+
+   switch (tep->program.Spacing) {
+   case GL_EQUAL:
+      prog_data.partitioning = BRW_TESS_PARTITIONING_INTEGER;
+      break;
+   case GL_FRACTIONAL_ODD:
+      prog_data.partitioning = BRW_TESS_PARTITIONING_ODD_FRACTIONAL;
+      break;
+   case GL_FRACTIONAL_EVEN:
+      prog_data.partitioning = BRW_TESS_PARTITIONING_EVEN_FRACTIONAL;
+      break;
+   default:
+      unreachable("invalid domain shader spacing");
+   }
+
+   switch (tep->program.PrimitiveMode) {
+   case GL_QUADS:
+      prog_data.domain = BRW_TESS_DOMAIN_QUAD;
+      break;
+   case GL_TRIANGLES:
+      prog_data.domain = BRW_TESS_DOMAIN_TRI;
+      break;
+   case GL_ISOLINES:
+      prog_data.domain = BRW_TESS_DOMAIN_ISOLINE;
+      break;
+   default:
+      unreachable("invalid domain shader primitive mode");
+   }
+
+   if (tep->program.PointMode) {
+      prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+   } else if (tep->program.PrimitiveMode == GL_ISOLINES) {
+      prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_LINE;
+   } else {
+      /* Hardware winding order is backwards from OpenGL */
+      switch (tep->program.VertexOrder) {
+      case GL_CCW:
+         prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW;
+         break;
+      case GL_CW:
+         prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW;
+         break;
+      default:
+         unreachable("invalid domain shader vertex order");
+      }
+   }
+
+   /* Allocate the references to the uniforms that will end up in the
+    * prog_data associated with the compiled program, and which will be freed
+    * by the state cache.
+    *
+    * Note: param_count needs to be num_uniform_components * 4, since we add
+    * padding around uniform values below vec4 size, so the worst case is that
+    * every uniform is a float which gets padded to the size of a vec4.
+    */
+   struct gl_shader *tes = shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
+   int param_count = nir->num_uniforms;
+   if (!compiler->scalar_stage[MESA_SHADER_TESS_EVAL])
+      param_count *= 4;
+
+   prog_data.base.base.param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.pull_param =
+      rzalloc_array(NULL, const gl_constant_value *, param_count);
+   prog_data.base.base.image_param =
+      rzalloc_array(NULL, struct brw_image_param, tes->NumImages);
+   prog_data.base.base.nr_params = param_count;
+   prog_data.base.base.nr_image_params = tes->NumImages;
+
+   brw_nir_setup_glsl_uniforms(nir, shader_prog, &tep->program.Base,
+                               &prog_data.base.base,
+                               compiler->scalar_stage[MESA_SHADER_TESS_EVAL]);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TES))
+      brw_dump_ir("tessellation evaluation", shader_prog, tes, NULL);
+
+   int st_index = -1;
+   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+      st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TES);
+
+   if (unlikely(brw->perf_debug)) {
+      start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo);
+      start_time = get_time();
+   }
+
+   void *mem_ctx = ralloc_context(NULL);
+   unsigned program_size;
+   char *error_str;
+   const unsigned *program =
+      brw_compile_tes(compiler, brw, mem_ctx, key, &prog_data, nir,
+                      shader_prog, st_index, &program_size, &error_str);
+   if (program == NULL) {
+      if (shader_prog) {
+         shader_prog->LinkStatus = false;
+         ralloc_strcat(&shader_prog->InfoLog, error_str);
+      }
+
+      _mesa_problem(NULL, "Failed to compile tessellation evaluation shader: "
+                    "%s\n", error_str);
+
+      ralloc_free(mem_ctx);
+      return false;
+   }
+
+   if (unlikely(brw->perf_debug)) {
+      struct brw_shader *btes = (struct brw_shader *) tes;
+      if (btes->compiled_once) {
+         brw_tes_debug_recompile(brw, shader_prog, key);
+      }
+      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
+         perf_debug("TES compile took %.03f ms and stalled the GPU\n",
+                    (get_time() - start_time) * 1000);
+      }
+      btes->compiled_once = true;
+   }
+
+   /* Scratch space is used for register spilling */
+   if (prog_data.base.base.total_scratch) {
+      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
+			 prog_data.base.base.total_scratch *
+                         brw->max_ds_threads);
+   }
+
+   brw_upload_cache(&brw->cache, BRW_CACHE_TES_PROG,
+                    key, sizeof(*key),
+                    program, program_size,
+                    &prog_data, sizeof(prog_data),
+                    &stage_state->prog_offset, &brw->tes.prog_data);
+   ralloc_free(mem_ctx);
+
+   return true;
+}
+
+
+void
+brw_upload_tes_prog(struct brw_context *brw,
+                    uint64_t per_vertex_slots,
+                    uint32_t per_patch_slots)
+{
+   struct gl_context *ctx = &brw->ctx;
+   struct gl_shader_program **current = ctx->_Shader->CurrentProgram;
+   struct brw_stage_state *stage_state = &brw->tes.base;
+   struct brw_tes_prog_key key;
+   /* BRW_NEW_TESS_PROGRAMS */
+   struct brw_tess_eval_program *tep =
+      (struct brw_tess_eval_program *) brw->tess_eval_program;
+
+   if (!brw_state_dirty(brw,
+                        _NEW_TEXTURE,
+                        BRW_NEW_TESS_PROGRAMS))
+      return;
+
+   struct gl_program *prog = &tep->program.Base;
+
+   memset(&key, 0, sizeof(key));
+
+   key.program_string_id = tep->id;
+
+   /* Ignore gl_TessLevelInner/Outer - we treat them as system values,
+    * not inputs, and they're always present in the URB entry regardless
+    * of whether or not we read them.
+    */
+   key.inputs_read = per_vertex_slots &
+      ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
+   key.patch_inputs_read = per_patch_slots;
+
+   /* _NEW_TEXTURE */
+   brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count,
+                                      &key.tex);
+
+   if (!brw_search_cache(&brw->cache, BRW_CACHE_TES_PROG,
+                         &key, sizeof(key),
+                         &stage_state->prog_offset, &brw->tes.prog_data)) {
+      bool success = brw_codegen_tes_prog(brw, current[MESA_SHADER_TESS_EVAL],
+                                          tep, &key);
+      assert(success);
+      (void)success;
+   }
+   brw->tes.base.prog_data = &brw->tes.prog_data->base.base;
+}
+
+
+bool
+brw_tes_precompile(struct gl_context *ctx,
+                   struct gl_shader_program *shader_prog,
+                   struct gl_program *prog)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_tes_prog_key key;
+   uint32_t old_prog_offset = brw->tes.base.prog_offset;
+   struct brw_tes_prog_data *old_prog_data = brw->tes.prog_data;
+   bool success;
+
+   struct gl_tess_eval_program *tep = (struct gl_tess_eval_program *)prog;
+   struct brw_tess_eval_program *btep = brw_tess_eval_program(tep);
+
+   memset(&key, 0, sizeof(key));
+
+   key.program_string_id = btep->id;
+   key.inputs_read = prog->InputsRead;
+   key.patch_inputs_read = prog->PatchInputsRead;
+
+   if (shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]) {
+      struct gl_program *tcp =
+         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program;
+      key.inputs_read |= tcp->OutputsWritten;
+      key.patch_inputs_read |= tcp->PatchOutputsWritten;
+   }
+
+   /* Ignore gl_TessLevelInner/Outer - they're system values. */
+   key.inputs_read &= ~(VARYING_BIT_TESS_LEVEL_INNER |
+                        VARYING_BIT_TESS_LEVEL_OUTER);
+
+   brw_setup_tex_for_precompile(brw, &key.tex, prog);
+
+   success = brw_codegen_tes_prog(brw, shader_prog, btep, &key);
+
+   brw->tes.base.prog_offset = old_prog_offset;
+   brw->tes.prog_data = old_prog_data;
+
+   return success;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_tes_surface_state.c b/src/mesa/drivers/dri/i965/brw_tes_surface_state.c
index 142bd5a3109..eff1740c12f 100644
--- a/src/mesa/drivers/dri/i965/brw_tes_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_tes_surface_state.c
@@ -39,7 +39,7 @@ brw_upload_tes_pull_constants(struct brw_context *brw)
 {
    struct brw_stage_state *stage_state = &brw->tes.base;
 
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct brw_tess_eval_program *dp =
       (struct brw_tess_eval_program *) brw->tess_eval_program;
 
@@ -59,7 +59,7 @@ const struct brw_tracked_state brw_tes_pull_constants = {
       .mesa = _NEW_PROGRAM_CONSTANTS,
       .brw = BRW_NEW_BATCH |
              BRW_NEW_TES_PROG_DATA |
-             BRW_NEW_TESS_EVAL_PROGRAM,
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = brw_upload_tes_pull_constants,
 };
@@ -122,7 +122,7 @@ static void
 brw_upload_tes_image_surfaces(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_shader_program *prog =
       ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
 
@@ -137,7 +137,7 @@ const struct brw_tracked_state brw_tes_image_surfaces = {
    .dirty = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_IMAGE_UNITS |
-             BRW_NEW_TESS_EVAL_PROGRAM |
+             BRW_NEW_TESS_PROGRAMS |
              BRW_NEW_TES_PROG_DATA,
    },
    .emit = brw_upload_tes_image_surfaces,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index a697bdf84a0..0cded0c87c6 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -155,6 +155,9 @@ vec4_instruction::is_send_from_grf()
    case SHADER_OPCODE_TYPED_ATOMIC:
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case VEC4_OPCODE_URB_READ:
+   case TCS_OPCODE_URB_WRITE:
+   case SHADER_OPCODE_BARRIER:
       return true;
    default:
       return false;
@@ -184,7 +187,9 @@ bool
 vec4_instruction::has_source_and_destination_hazard() const
 {
    switch (opcode) {
-   /* Most opcodes in the vec4 world use MRFs. */
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+      return true;
    default:
       return false;
    }
@@ -204,6 +209,7 @@ vec4_instruction::regs_read(unsigned arg) const
    case SHADER_OPCODE_TYPED_ATOMIC:
    case SHADER_OPCODE_TYPED_SURFACE_READ:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case TCS_OPCODE_URB_WRITE:
       return arg == 0 ? mlen : 1;
 
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
@@ -281,6 +287,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
       return 0;
    case GS_OPCODE_FF_SYNC:
       return 1;
+   case TCS_OPCODE_URB_WRITE:
+      return 0;
    case SHADER_OPCODE_SHADER_TIME_ADD:
       return 0;
    case SHADER_OPCODE_TEX:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 27c72766f2d..531eb170419 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -314,6 +314,8 @@ public:
 
    bool is_high_sampler(src_reg sampler);
 
+   bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate);
+
    virtual void emit_nir_code();
    virtual void nir_setup_uniforms();
    virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
@@ -341,6 +343,7 @@ public:
                        unsigned num_components = 4);
    src_reg get_nir_src(nir_src src,
                        unsigned num_components = 4);
+   src_reg get_indirect_offset(nir_intrinsic_instr *instr);
 
    virtual dst_reg *make_reg_for_system_value(int location,
                                               const glsl_type *type) = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 85cbf24092e..0c1f0c31b0d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -75,6 +75,8 @@ is_expression(const vec4_instruction *const inst)
    case VEC4_OPCODE_UNPACK_UNIFORM:
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
    case SHADER_OPCODE_BROADCAST:
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
       return true;
    case SHADER_OPCODE_RCP:
    case SHADER_OPCODE_RSQ:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
index 2d0722aa1eb..c31e72def67 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
@@ -45,6 +45,9 @@ can_do_writemask(const struct brw_device_info *devinfo,
    case VS_OPCODE_PULL_CONSTANT_LOAD:
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
    case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+   case VEC4_OPCODE_URB_READ:
       return false;
    default:
       /* The MATH instruction on Gen6 only executes in align1 mode, which does
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 3299843bfa9..86ae9289df1 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -722,6 +722,220 @@ generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
 }
 
 static void
+generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
+    *
+    * Since we operate in SIMD4x2 mode, we need run half as many threads
+    * as necessary.  So we assign (2i + 1, 2i) as the thread counts.  We
+    * shift right by one less to accomplish the multiplication by two.
+    */
+   dst = retype(dst, BRW_REGISTER_TYPE_UD);
+   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+   const int mask = INTEL_MASK(23, 17);
+   const int shift = 17;
+
+   brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask));
+   brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
+           brw_imm_ud(shift - 1));
+   brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_urb_write(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg urb_header)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, brw_null_reg());
+   brw_set_src0(p, send, urb_header);
+
+   brw_set_message_descriptor(p, send, BRW_SFID_URB,
+                              inst->mlen /* mlen */, 0 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
+   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
+   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
+   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
+
+   /* what happens to swizzles? */
+}
+
+
+static void
+generate_tcs_input_urb_offsets(struct brw_codegen *p,
+                               struct brw_reg dst,
+                               struct brw_reg vertex,
+                               struct brw_reg offset)
+{
+   /* Generates an URB read/write message header for HS/DS operation.
+    * Inputs are a vertex index, and a byte offset from the beginning of
+    * the vertex. */
+
+   /* If `vertex` is not an immediate, we clobber a0.0 */
+
+   assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
+   assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);
+
+   assert(dst.file == BRW_GENERAL_REGISTER_FILE);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, dst, brw_imm_ud(0));
+
+   /* m0.5 bits 8-15 are channel enables */
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
+
+   /* m0.0-0.1: URB handles */
+   if (vertex.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t vertex_index = vertex.ud;
+      struct brw_reg index_reg = brw_vec1_grf(
+            1 + (vertex_index >> 3), vertex_index & 7);
+
+      brw_MOV(p, vec2(get_element_ud(dst, 0)),
+              retype(index_reg, BRW_REGISTER_TYPE_UD));
+   } else {
+      /* Use indirect addressing.  ICP Handles are DWords (single channels
+       * of a register) and start at g1.0.
+       *
+       * In order to start our region at g1.0, we add 8 to the vertex index,
+       * effectively skipping over the 8 channels in g0.0.  This gives us a
+       * DWord offset to the ICP Handle.
+       *
+       * Indirect addressing works in terms of bytes, so we then multiply
+       * the DWord offset by 4 (by shifting left by 2).
+       */
+      struct brw_reg addr = brw_address_reg(0);
+
+      /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
+      brw_ADD(p, addr, get_element_ud(vertex, 0), brw_imm_uw(0x8));
+      brw_SHL(p, addr, addr, brw_imm_ud(2));
+      brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));
+
+      /* top half: m0.1 = g[1.0 + vertex.4]UD */
+      brw_ADD(p, addr, get_element_ud(vertex, 4), brw_imm_uw(0x8));
+      brw_SHL(p, addr, addr, brw_imm_ud(2));
+      brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
+   }
+
+   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+   if (offset.file != ARF)
+      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+   brw_pop_insn_state(p);
+}
+
+
+static void
+generate_tcs_output_urb_offsets(struct brw_codegen *p,
+                                struct brw_reg dst,
+                                struct brw_reg write_mask,
+                                struct brw_reg offset)
+{
+   /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
+   assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);
+
+   assert(write_mask.file == BRW_IMMEDIATE_VALUE);
+   assert(write_mask.type == BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, dst, brw_imm_ud(0));
+
+   unsigned mask = write_mask.ud;
+
+   /* m0.5 bits 15:12 and 11:8 are channel enables */
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));
+
+   /* HS patch URB handle is delivered in r0.0 */
+   struct brw_reg urb_handle = brw_vec1_grf(0, 0);
+
+   /* m0.0-0.1: URB handles */
+   brw_MOV(p, vec2(get_element_ud(dst, 0)),
+           retype(urb_handle, BRW_REGISTER_TYPE_UD));
+
+   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+   if (offset.file != ARF)
+      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_vec4_urb_read(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg dst,
+                       struct brw_reg header)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+
+   assert(header.file == BRW_GENERAL_REGISTER_FILE);
+   assert(header.type == BRW_REGISTER_TYPE_UD);
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, dst);
+   brw_set_src0(p, send, header);
+
+   brw_set_message_descriptor(p, send, BRW_SFID_URB,
+                              1 /* mlen */, 1 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
+   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
+   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
+
+   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
+}
+
+static void
+generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_create_barrier_header(struct brw_codegen *p,
+                                   struct brw_vue_prog_data *prog_data,
+                                   struct brw_reg dst)
+{
+   struct brw_reg m0_2 = get_element_ud(dst, 2);
+   unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   /* Zero the message header */
+   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
+
+   /* Copy "Barrier ID" from DW0 bits 16:13 */
+   brw_AND(p, m0_2,
+           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+           brw_imm_ud(0x1e000));
+
+   /* Shift it into place */
+   brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(11));
+
+   /* Set the Barrier Count and the enable bit */
+   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));
+
+   brw_pop_insn_state(p);
+}
+
+static void
 generate_oword_dual_block_offsets(struct brw_codegen *p,
                                   struct brw_reg m1,
                                   struct brw_reg index)
@@ -1546,6 +1760,39 @@ generate_code(struct brw_codegen *p,
          break;
       }
 
+      case TCS_OPCODE_URB_WRITE:
+         generate_tcs_urb_write(p, inst, src[0]);
+         break;
+
+      case VEC4_OPCODE_URB_READ:
+         generate_vec4_urb_read(p, inst, dst, src[0]);
+         break;
+
+      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_GET_INSTANCE_ID:
+         generate_tcs_get_instance_id(p, dst);
+         break;
+
+      case TCS_OPCODE_GET_PRIMITIVE_ID:
+         generate_tcs_get_primitive_id(p, dst);
+         break;
+
+      case TCS_OPCODE_CREATE_BARRIER_HEADER:
+         generate_tcs_create_barrier_header(p, prog_data, dst);
+         break;
+
+      case SHADER_OPCODE_BARRIER:
+         brw_barrier(p, src[0]);
+         brw_WAIT(p);
+         break;
+
       default:
          unreachable("Unsupported opcode");
       }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index dcecd772ff6..7781d3c01ef 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -327,6 +327,24 @@ vec4_visitor::get_nir_src(nir_src src, unsigned num_components)
    return get_nir_src(src, nir_type_int, num_components);
 }
 
+src_reg
+vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
+{
+   nir_src *offset_src = nir_get_io_offset_src(instr);
+   nir_const_value *const_value = nir_src_as_const_value(*offset_src);
+
+   if (const_value) {
+      /* The only constant offset we should find is 0.  brw_nir.c's
+       * add_const_offset_to_base() will fold other constant offsets
+       * into instr->const_index[0].
+       */
+      assert(const_value->u[0] == 0);
+      return src_reg();
+   }
+
+   return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1);
+}
+
 void
 vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
 {
@@ -650,7 +668,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
    case nir_intrinsic_load_vertex_id_zero_base:
    case nir_intrinsic_load_base_vertex:
-   case nir_intrinsic_load_instance_id: {
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_invocation_id:
+   case nir_intrinsic_load_tess_level_inner:
+   case nir_intrinsic_load_tess_level_outer: {
       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
       src_reg val = src_reg(nir_system_values[sv]);
       assert(val.file != BAD_FILE);
@@ -888,6 +909,59 @@ brw_conditional_for_nir_comparison(nir_op op)
    }
 }
 
+bool
+vec4_visitor::optimize_predicate(nir_alu_instr *instr,
+                                 enum brw_predicate *predicate)
+{
+   if (!instr->src[0].src.is_ssa ||
+       instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *cmp_instr =
+      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+
+   switch (cmp_instr->op) {
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_inequal2:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_fnequal4:
+   case nir_op_bany_inequal4:
+      *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+   case nir_op_ball_fequal2:
+   case nir_op_ball_iequal2:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_fequal4:
+   case nir_op_ball_iequal4:
+      *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+      break;
+   default:
+      return false;
+   }
+
+   unsigned size_swizzle =
+      brw_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]);
+
+   src_reg op[2];
+   assert(nir_op_infos[cmp_instr->op].num_inputs == 2);
+   for (unsigned i = 0; i < 2; i++) {
+      op[i] = get_nir_src(cmp_instr->src[i].src,
+                          nir_op_infos[cmp_instr->op].input_types[i], 4);
+      unsigned base_swizzle =
+         brw_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle);
+      op[i].swizzle = brw_compose_swizzle(size_swizzle, base_swizzle);
+      op[i].abs = cmp_instr->src[i].abs;
+      op[i].negate = cmp_instr->src[i].negate;
+   }
+
+   emit(CMP(dst_null_d(), op[0], op[1],
+            brw_conditional_for_nir_comparison(cmp_instr->op)));
+
+   return true;
+}
+
 void
 vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 {
@@ -1378,25 +1452,29 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_bcsel:
-      emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
-      inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
-      switch (dst.writemask) {
-      case WRITEMASK_X:
-         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
-         break;
-      case WRITEMASK_Y:
-         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
-         break;
-      case WRITEMASK_Z:
-         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
-         break;
-      case WRITEMASK_W:
-         inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
-         break;
-      default:
-         inst->predicate = BRW_PREDICATE_NORMAL;
-         break;
+      enum brw_predicate predicate;
+      if (!optimize_predicate(instr, &predicate)) {
+         emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
+         switch (dst.writemask) {
+         case WRITEMASK_X:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
+            break;
+         case WRITEMASK_Y:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+            break;
+         case WRITEMASK_Z:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+            break;
+         case WRITEMASK_W:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
+            break;
+         default:
+            predicate = BRW_PREDICATE_NORMAL;
+            break;
+         }
       }
+      inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
+      inst->predicate = predicate;
       break;
 
    case nir_op_fdot_replicated2:
@@ -1419,20 +1497,6 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
-   case nir_op_bany2:
-   case nir_op_bany3:
-   case nir_op_bany4: {
-      unsigned swiz =
-         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
-
-      emit(CMP(dst_null_d(), swizzle(op[0], swiz), brw_imm_d(0),
-               BRW_CONDITIONAL_NZ));
-      emit(MOV(dst, brw_imm_d(0)));
-      inst = emit(MOV(dst, brw_imm_d(~0)));
-      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
-      break;
-   }
-
    case nir_op_fabs:
    case nir_op_iabs:
    case nir_op_fneg:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
new file mode 100644
index 00000000000..507db749e63
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.cpp
+ *
+ * Tessellaton control shader specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4_tcs.h"
+
+namespace brw {
+
+vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,
+                                   void *log_data,
+                                   const struct brw_tcs_prog_key *key,
+                                   struct brw_tcs_prog_data *prog_data,
+                                   const nir_shader *nir,
+                                   void *mem_ctx,
+                                   int shader_time_index,
+                                   const struct brw_vue_map *input_vue_map)
+   : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+                  nir, mem_ctx, false, shader_time_index),
+     input_vue_map(input_vue_map), key(key)
+{
+}
+
+
+void
+vec4_tcs_visitor::emit_nir_code()
+{
+   if (key->program_string_id != 0) {
+      /* We have a real application-supplied TCS, emit real code. */
+      vec4_visitor::emit_nir_code();
+   } else {
+      /* There is no TCS; automatically generate a passthrough shader
+       * that writes the API-specified default tessellation levels and
+       * copies VS outputs to TES inputs.
+       */
+      uniforms = 2;
+      uniform_size[0] = 1;
+      uniform_size[1] = 1;
+
+      uint64_t varyings = key->outputs_written;
+
+      src_reg vertex_offset(this, glsl_type::uint_type);
+      emit(MUL(dst_reg(vertex_offset), invocation_id,
+               brw_imm_ud(prog_data->vue_map.num_per_vertex_slots)));
+
+      while (varyings != 0) {
+         const int varying = ffsll(varyings) - 1;
+
+         unsigned in_offset = input_vue_map->varying_to_slot[varying];
+         unsigned out_offset = prog_data->vue_map.varying_to_slot[varying];
+         assert(out_offset >= 2);
+
+         dst_reg val(this, glsl_type::vec4_type);
+         emit_input_urb_read(val, invocation_id, in_offset, src_reg());
+         emit_urb_write(src_reg(val), WRITEMASK_XYZW, out_offset,
+                        vertex_offset);
+
+         varyings &= ~BITFIELD64_BIT(varying);
+      }
+
+      /* Only write the tessellation factors from invocation 0.
+       * There's no point in making other threads do redundant work.
+       */
+      emit(CMP(dst_null_d(), invocation_id, brw_imm_ud(0),
+               BRW_CONDITIONAL_EQ));
+      emit(IF(BRW_PREDICATE_NORMAL));
+      emit_urb_write(src_reg(UNIFORM, 0, glsl_type::vec4_type),
+                     WRITEMASK_XYZW, 0, src_reg());
+      emit_urb_write(src_reg(UNIFORM, 1, glsl_type::vec4_type),
+                     WRITEMASK_XYZW, 1, src_reg());
+      emit(BRW_OPCODE_ENDIF);
+   }
+}
+
+void
+vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+}
+
+dst_reg *
+vec4_tcs_visitor::make_reg_for_system_value(int location, const glsl_type *type)
+{
+   return NULL;
+}
+
+
+void
+vec4_tcs_visitor::setup_payload()
+{
+   int reg = 0;
+
+   /* The payload always contains important data in r0, which contains
+    * the URB handles that are passed on to the URB write at the end
+    * of the thread.
+    */
+   reg++;
+
+   /* r1.0 - r4.7 may contain the input control point URB handles,
+    * which we use to pull vertex data.
+    */
+   reg += 4;
+
+   /* Push constants may start at r5.0 */
+   reg = setup_uniforms(reg);
+
+   this->first_non_payload_grf = reg;
+}
+
+
+void
+vec4_tcs_visitor::emit_prolog()
+{
+   invocation_id = src_reg(this, glsl_type::uint_type);
+   emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
+
+   /* HS threads are dispatched with the dispatch mask set to 0xFF.
+    * If there are an odd number of output vertices, then the final
+    * HS instance dispatched will only have its bottom half doing real
+    * work, and so we need to disable the upper half:
+    */
+   if (nir->info.tcs.vertices_out % 2) {
+      emit(CMP(dst_null_d(), invocation_id,
+               brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L));
+
+      /* Matching ENDIF is in emit_thread_end() */
+      emit(IF(BRW_PREDICATE_NORMAL));
+   }
+}
+
+
+void
+vec4_tcs_visitor::emit_thread_end()
+{
+   current_annotation = "thread end";
+
+   if (nir->info.tcs.vertices_out % 2) {
+      emit(BRW_OPCODE_ENDIF);
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+      emit_shader_time_end();
+
+   vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
+   inst->mlen = 1;   /* just the header, no data. */
+   inst->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE;
+}
+
+
+void
+vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
+                                      const src_reg &vertex_index,
+                                      unsigned base_offset,
+                                      const src_reg &indirect_offset)
+{
+   vec4_instruction *inst;
+   dst_reg temp(this, glsl_type::ivec4_type);
+   temp.type = dst.type;
+
+   /* Set up the message header to reference the proper parts of the URB */
+   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+   inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
+               indirect_offset);
+   inst->force_writemask_all = true;
+
+   /* Read into a temporary, ignoring writemasking. */
+   inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
+   inst->offset = base_offset;
+   inst->mlen = 1;
+   inst->base_mrf = -1;
+
+   /* Copy the temporary to the destination to deal with writemasking.
+    *
+    * Also attempt to deal with gl_PointSize being in the .w component.
+    */
+   if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
+      emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
+   } else {
+      emit(MOV(dst, src_reg(temp)));
+   }
+}
+
+void
+vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
+                                       unsigned base_offset,
+                                       const src_reg &indirect_offset)
+{
+   vec4_instruction *inst;
+
+   /* Set up the message header to reference the proper parts of the URB */
+   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
+               brw_imm_ud(dst.writemask), indirect_offset);
+   inst->force_writemask_all = true;
+
+   /* Read into a temporary, ignoring writemasking. */
+   vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header));
+   read->offset = base_offset;
+   read->mlen = 1;
+   read->base_mrf = -1;
+}
+
+void
+vec4_tcs_visitor::emit_urb_write(const src_reg &value,
+                                 unsigned writemask,
+                                 unsigned base_offset,
+                                 const src_reg &indirect_offset)
+{
+   if (writemask == 0)
+      return;
+
+   src_reg message(this, glsl_type::uvec4_type, 2);
+   vec4_instruction *inst;
+
+   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
+               brw_imm_ud(writemask), indirect_offset);
+   inst->force_writemask_all = true;
+   inst = emit(MOV(offset(dst_reg(retype(message, value.type)), 1), value));
+   inst->force_writemask_all = true;
+
+   inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message);
+   inst->offset = base_offset;
+   inst->mlen = 2;
+   inst->base_mrf = -1;
+}
+
+static unsigned
+tesslevel_outer_components(GLenum tes_primitive_mode)
+{
+   switch (tes_primitive_mode) {
+   case GL_QUADS:
+      return 4;
+   case GL_TRIANGLES:
+      return 3;
+   case GL_ISOLINES:
+      return 2;
+   default:
+      unreachable("Bogus tessellation domain");
+   }
+   return 0;
+}
+
+static unsigned
+tesslevel_inner_components(GLenum tes_primitive_mode)
+{
+   switch (tes_primitive_mode) {
+   case GL_QUADS:
+      return 2;
+   case GL_TRIANGLES:
+      return 1;
+   case GL_ISOLINES:
+      return 0;
+   default:
+      unreachable("Bogus tessellation domain");
+   }
+   return 0;
+}
+
+/**
+ * Given a normal .xyzw writemask, convert it to a writemask for a vector
+ * that's stored backwards, i.e. .wzyx.
+ */
+static unsigned
+writemask_for_backwards_vector(unsigned mask)
+{
+   unsigned new_mask = 0;
+
+   for (int i = 0; i < 4; i++)
+      new_mask |= ((mask >> i) & 1) << (3 - i);
+
+   return new_mask;
+}
+
+void
+vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_invocation_id:
+      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
+               invocation_id));
+      break;
+   case nir_intrinsic_load_primitive_id:
+      emit(TCS_OPCODE_GET_PRIMITIVE_ID,
+           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
+               brw_imm_d(key->input_vertices)));
+      break;
+   case nir_intrinsic_load_per_vertex_input: {
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
+      src_reg vertex_index =
+         vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0]))
+                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
+
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+
+      emit_input_urb_read(dst, vertex_index, imm_offset, indirect_offset);
+      break;
+   }
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
+      break;
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output: {
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];;
+
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+
+      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
+         dst.type = BRW_REGISTER_TYPE_F;
+
+         /* This is a read of gl_TessLevelInner[], which lives in the
+          * Patch URB header.  The layout depends on the domain.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS: {
+            /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
+            dst_reg tmp(this, glsl_type::vec4_type);
+            emit_output_urb_read(tmp, 0, src_reg());
+            emit(MOV(writemask(dst, WRITEMASK_XY),
+                     swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
+            break;
+         }
+         case GL_TRIANGLES:
+            /* DWord 4; use offset 1 but normal swizzle/writemask. */
+            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, src_reg());
+            break;
+         case GL_ISOLINES:
+            /* All channels are undefined. */
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
+         }
+      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
+         dst.type = BRW_REGISTER_TYPE_F;
+
+         /* This is a read of gl_TessLevelOuter[], which lives in the
+          * high 4 DWords of the Patch URB header, in reverse order.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS:
+            dst.writemask = WRITEMASK_XYZW;
+            break;
+         case GL_TRIANGLES:
+            dst.writemask = WRITEMASK_XYZ;
+            break;
+         case GL_ISOLINES:
+            dst.writemask = WRITEMASK_XY;
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
+         }
+
+         dst_reg tmp(this, glsl_type::vec4_type);
+         emit_output_urb_read(tmp, 1, src_reg());
+         emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
+      } else {
+         emit_output_urb_read(dst, imm_offset, indirect_offset);
+      }
+      break;
+   }
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output: {
+      src_reg value = get_nir_src(instr->src[0]);
+      unsigned mask = instr->const_index[1];
+      unsigned swiz = BRW_SWIZZLE_XYZW;
+
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;
+
+         mask &= (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;
+
+         /* This is a write to gl_TessLevelInner[], which lives in the
+          * Patch URB header.  The layout depends on the domain.
+          */
+         switch (key->tes_primitive_mode) {
+         case GL_QUADS:
+            /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
+             * We use an XXYX swizzle to reverse put .xy in the .wz
+             * channels, and use a .zw writemask.
+             */
+            swiz = BRW_SWIZZLE4(0, 0, 1, 0);
+            mask = writemask_for_backwards_vector(mask);
+            break;
+         case GL_TRIANGLES:
+            /* gl_TessLevelInner[].x lives at DWord 4, so we set the
+             * writemask to X and bump the URB offset by 1.
+             */
+            imm_offset = 1;
+            break;
+         case GL_ISOLINES:
+            /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
+            return;
+         default:
+            unreachable("Bogus tessellation domain");
+         }
+      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
+         value.type = BRW_REGISTER_TYPE_F;
+
+         mask &= (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;
+
+         /* This is a write to gl_TessLevelOuter[] which lives in the
+          * Patch URB Header at DWords 4-7.  However, it's reversed, so
+          * instead of .xyzw we have .wzyx.
+          */
+         swiz = BRW_SWIZZLE_WZYX;
+         mask = writemask_for_backwards_vector(mask);
+      }
+
+      emit_urb_write(swizzle(value, swiz), mask,
+                     imm_offset, indirect_offset);
+      break;
+   }
+
+   case nir_intrinsic_barrier: {
+      dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+      emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
+      emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
+      break;
+   }
+
+   default:
+      vec4_visitor::nir_emit_intrinsic(instr);
+   }
+}
+
+
+extern "C" const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tcs_prog_key *key,
+                struct brw_tcs_prog_data *prog_data,
+                const nir_shader *src_shader,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str)
+{
+   const struct brw_device_info *devinfo = compiler->devinfo;
+   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+   nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
+   nir->info.outputs_written = key->outputs_written;
+   nir->info.patch_outputs_written = key->patch_outputs_written;
+   nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar);
+   nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
+
+   prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2);
+
+   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
+                            nir->info.outputs_written,
+                            nir->info.patch_outputs_written);
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     32 bytes for the patch header (tessellation factors)
+    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
+    *              gl_MaxTessPatchComponents = 120)
+    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
+    *              gl_MaxPatchVertices = 32 and
+    *              gl_MaxTessControlOutputComponents = 128)
+    *
+    *  15808 bytes left for varying packing overhead
+    */
+   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
+   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
+   unsigned output_size_bytes = 0;
+   /* Note that the patch header is counted in num_per_patch_slots. */
+   output_size_bytes += num_per_patch_slots * 16;
+   output_size_bytes += nir->info.tcs.vertices_out * num_per_vertex_slots * 16;
+
+   assert(output_size_bytes >= 1);
+   if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES)
+      return false;
+
+   /* URB entry sizes are stored as a multiple of 64 bytes. */
+   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+
+   struct brw_vue_map input_vue_map;
+   brw_compute_vue_map(devinfo, &input_vue_map,
+                       nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID,
+                       true);
+
+   /* HS does not use the usual payload pushing from URB to GRFs,
+    * because we don't have enough registers for a full-size payload, and
+    * the hardware is broken on Haswell anyway.
+    */
+   vue_prog_data->urb_read_length = 0;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
+      fprintf(stderr, "TCS Input ");
+      brw_print_vue_map(stderr, &input_vue_map);
+      fprintf(stderr, "TCS Output ");
+      brw_print_vue_map(stderr, &vue_prog_data->vue_map);
+   }
+
+   vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+                      nir, mem_ctx, shader_time_index, &input_vue_map);
+   if (!v.run()) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+      return NULL;
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+      v.dump_instructions();
+
+   return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                     &prog_data->base, v.cfg,
+                                     final_assembly_size);
+}
+
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.h b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
new file mode 100644
index 00000000000..2c6801b2ae3
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.h
+ *
+ * The vec4-mode tessellation control shader compiler backend.
+ */
+
+#ifndef BRW_VEC4_TCS_H
+#define BRW_VEC4_TCS_H
+
+#include "brw_compiler.h"
+#include "brw_vec4.h"
+
+#ifdef __cplusplus
+namespace brw {
+
+class vec4_tcs_visitor : public vec4_visitor
+{
+public:
+   vec4_tcs_visitor(const struct brw_compiler *compiler,
+                    void *log_data,
+                    const struct brw_tcs_prog_key *key,
+                    struct brw_tcs_prog_data *prog_data,
+                    const nir_shader *nir,
+                    void *mem_ctx,
+                    int shader_time_index,
+                    const struct brw_vue_map *input_vue_map);
+
+protected:
+   virtual void emit_nir_code();
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+   virtual void setup_payload();
+   virtual void emit_prolog();
+   virtual void emit_thread_end();
+
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+
+   void emit_input_urb_read(const dst_reg &dst,
+                            const src_reg &vertex_index,
+                            unsigned base_offset,
+                            const src_reg &indirect_offset);
+   void emit_output_urb_read(const dst_reg &dst,
+                             unsigned base_offset,
+                             const src_reg &indirect_offset);
+
+   void emit_urb_write(const src_reg &value, unsigned writemask,
+                       unsigned base_offset, const src_reg &indirect_offset);
+
+   /* we do not use the normal end-of-shader URB write mechanism -- but every vec4 stage
+    * must provide implementations of these:
+    */
+   virtual void emit_urb_write_header(int mrf) {}
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return NULL; }
+
+   const struct brw_vue_map *input_vue_map;
+
+   const struct brw_tcs_prog_key *key;
+   src_reg invocation_id;
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_TCS_H */
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 59b748f2055..3095d82d91e 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -148,7 +148,9 @@ brw_codegen_vs_prog(struct brw_context *brw,
 
    brw_compute_vue_map(brw->intelScreen->devinfo,
                        &prog_data.base.vue_map, outputs_written,
-                       prog ? prog->SeparateShader : false);
+                       prog ? prog->SeparateShader ||
+                              prog->_LinkedShaders[MESA_SHADER_TESS_EVAL]
+                            : false);
 
    if (0) {
       _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG,
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
index 6cb3da46995..09eadbcb54f 100644
--- a/src/mesa/drivers/dri/i965/brw_vue_map.c
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -176,6 +176,73 @@ brw_compute_vue_map(const struct brw_device_info *devinfo,
    }
 
    vue_map->num_slots = separate ? slot + 1 : slot;
+   vue_map->num_per_vertex_slots = 0;
+   vue_map->num_per_patch_slots = 0;
+}
+
+/**
+ * Compute the VUE map for tessellation control shader outputs and
+ * tessellation evaluation shader inputs.
+ */
+void
+brw_compute_tess_vue_map(struct brw_vue_map *vue_map,
+                         GLbitfield64 vertex_slots,
+                         GLbitfield patch_slots)
+{
+   /* I don't think anything actually uses this... */
+   vue_map->slots_valid = vertex_slots;
+
+   vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER |
+                     VARYING_BIT_TESS_LEVEL_INNER);
+
+   /* Make sure that the values we store in vue_map->varying_to_slot and
+    * vue_map->slot_to_varying won't overflow the signed chars that are used
+    * to store them.  Note that since vue_map->slot_to_varying sometimes holds
+    * values equal to VARYING_SLOT_TESS_MAX , we need to ensure that
+    * VARYING_SLOT_TESS_MAX is <= 127, not 128.
+    */
+   STATIC_ASSERT(VARYING_SLOT_TESS_MAX <= 127);
+
+   for (int i = 0; i < VARYING_SLOT_TESS_MAX ; ++i) {
+      vue_map->varying_to_slot[i] = -1;
+      vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD;
+   }
+
+   int slot = 0;
+
+   /* The first 8 DWords are reserved for the "Patch Header".
+    *
+    * VARYING_SLOT_TESS_LEVEL_OUTER / INNER live here, but the exact layout
+    * depends on the domain type.  They might not be in slots 0 and 1 as
+    * described here, but pretending they're separate allows us to uniquely
+    * identify them by distinct slot locations.
+    */
+   assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_INNER, slot++);
+   assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_OUTER, slot++);
+
+   /* first assign per-patch varyings */
+   while (patch_slots != 0) {
+      const int varying = ffsll(patch_slots) - 1;
+      if (vue_map->varying_to_slot[varying + VARYING_SLOT_PATCH0] == -1) {
+         assign_vue_slot(vue_map, varying + VARYING_SLOT_PATCH0, slot++);
+      }
+      patch_slots &= ~BITFIELD64_BIT(varying);
+   }
+
+   /* apparently, including the patch header... */
+   vue_map->num_per_patch_slots = slot;
+
+   /* then assign per-vertex varyings for each vertex in our patch */
+   while (vertex_slots != 0) {
+      const int varying = ffsll(vertex_slots) - 1;
+      if (vue_map->varying_to_slot[varying] == -1) {
+         assign_vue_slot(vue_map, varying, slot++);
+      }
+      vertex_slots &= ~BITFIELD64_BIT(varying);
+   }
+
+   vue_map->num_per_vertex_slots = slot - vue_map->num_per_patch_slots;
+   vue_map->num_slots = slot;
 }
 
 static const char *
@@ -196,11 +263,28 @@ varying_name(brw_varying_slot slot)
 void
 brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map)
 {
-   fprintf(fp, "VUE map (%d slots, %s)\n",
-           vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO");
-   for (int i = 0; i < vue_map->num_slots; i++) {
-      fprintf(fp, "  [%d] %s\n", i,
-              varying_name(vue_map->slot_to_varying[i]));
+   if (vue_map->num_per_vertex_slots > 0 || vue_map->num_per_patch_slots > 0) {
+      fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n",
+              vue_map->num_slots,
+              vue_map->num_per_patch_slots,
+              vue_map->num_per_vertex_slots,
+              vue_map->separate ? "SSO" : "non-SSO");
+      for (int i = 0; i < vue_map->num_slots; i++) {
+         if (vue_map->slot_to_varying[i] >= VARYING_SLOT_PATCH0) {
+            fprintf(fp, "  [%d] VARYING_SLOT_PATCH%d\n", i,
+                    vue_map->slot_to_varying[i] - VARYING_SLOT_PATCH0);
+         } else {
+            fprintf(fp, "  [%d] %s\n", i,
+                    varying_name(vue_map->slot_to_varying[i]));
+         }
+      }
+   } else {
+      fprintf(fp, "VUE map (%d slots, %s)\n",
+              vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO");
+      for (int i = 0; i < vue_map->num_slots; i++) {
+         fprintf(fp, "  [%d] %s\n", i,
+                 varying_name(vue_map->slot_to_varying[i]));
+      }
    }
    fprintf(fp, "\n");
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index c4ebbf3b48c..76dc5775121 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -862,10 +862,8 @@ brw_update_texture_surfaces(struct brw_context *brw)
    /* BRW_NEW_VERTEX_PROGRAM */
    struct gl_program *vs = (struct gl_program *) brw->vertex_program;
 
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program;
-
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
    struct gl_program *tes = (struct gl_program *) brw->tess_eval_program;
 
    /* BRW_NEW_GEOMETRY_PROGRAM */
@@ -915,8 +913,7 @@ const struct brw_tracked_state brw_texture_surfaces = {
              BRW_NEW_FS_PROG_DATA |
              BRW_NEW_GEOMETRY_PROGRAM |
              BRW_NEW_GS_PROG_DATA |
-             BRW_NEW_TESS_CTRL_PROGRAM |
-             BRW_NEW_TESS_EVAL_PROGRAM |
+             BRW_NEW_TESS_PROGRAMS |
              BRW_NEW_TCS_PROG_DATA |
              BRW_NEW_TES_PROG_DATA |
              BRW_NEW_TEXTURE_BUFFER |
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index 6653a6d759b..da3b4cd90e8 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -65,7 +65,8 @@ gen6_upload_push_constants(struct brw_context *brw,
        * basic type of PROGRAM_STATE_VAR.
        */
       /* XXX: Should this happen somewhere before to get our state flag set? */
-      _mesa_load_state_parameters(ctx, prog->Parameters);
+      if (prog)
+         _mesa_load_state_parameters(ctx, prog->Parameters);
 
       gl_constant_value *param;
       unsigned i;
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index e87b9d1657f..89b73ca7519 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -50,6 +50,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
    unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16;
    gen7_emit_push_constant_state(brw,
                                  urb_size / 2 /* vs_size */,
+                                 0 /* hs_size */,
+                                 0 /* ds_size */,
                                  0 /* gs_size */,
                                  urb_size / 2 /* fs_size */);
 
@@ -60,6 +62,12 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
                        32 /* num_vs_entries */,
                        2 /* vs_size */,
                        2 /* vs_start */,
+                       0 /* num_hs_entries */,
+                       1 /* hs_size */,
+                       2 /* hs_start */,
+                       0 /* num_ds_entries */,
+                       1 /* ds_size */,
+                       2 /* ds_start */,
                        0 /* num_gs_entries */,
                        1 /* gs_size */,
                        2 /* gs_start */);
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index 1fde69cf78e..a025bb9dd66 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -141,7 +141,7 @@ brw_upload_cs_state(struct brw_context *brw)
       BEGIN_BATCH(4);
       OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
       OUT_BATCH(0);
-      OUT_BATCH(reg_aligned_constant_size * threads);
+      OUT_BATCH(ALIGN(reg_aligned_constant_size * threads, 64));
       OUT_BATCH(stage_state->push_const_offset);
       ADVANCE_BATCH();
    }
@@ -249,8 +249,8 @@ brw_upload_cs_push_constants(struct brw_context *brw,
 
       param = (gl_constant_value*)
          brw_state_batch(brw, type,
-                         reg_aligned_constant_size * threads,
-                         32, &stage_state->push_const_offset);
+                         ALIGN(reg_aligned_constant_size * threads, 64),
+                         64, &stage_state->push_const_offset);
       assert(param);
 
       STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));
diff --git a/src/mesa/drivers/dri/i965/gen7_ds_state.c b/src/mesa/drivers/dri/i965/gen7_ds_state.c
index 4d3d94f68a6..9a697140386 100644
--- a/src/mesa/drivers/dri/i965/gen7_ds_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_ds_state.c
@@ -30,7 +30,7 @@ static void
 gen7_upload_tes_push_constants(struct brw_context *brw)
 {
    struct brw_stage_state *stage_state = &brw->tes.base;
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    const struct brw_tess_eval_program *tep =
       (struct brw_tess_eval_program *) brw->tess_eval_program;
 
@@ -49,7 +49,7 @@ const struct brw_tracked_state gen7_tes_push_constants = {
       .mesa  = _NEW_PROGRAM_CONSTANTS,
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
-               BRW_NEW_TESS_EVAL_PROGRAM |
+               BRW_NEW_TESS_PROGRAMS |
                BRW_NEW_TES_PROG_DATA,
    },
    .emit = gen7_upload_tes_push_constants,
diff --git a/src/mesa/drivers/dri/i965/gen7_hs_state.c b/src/mesa/drivers/dri/i965/gen7_hs_state.c
index fcaa9197857..6793617b9e2 100644
--- a/src/mesa/drivers/dri/i965/gen7_hs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_hs_state.c
@@ -30,26 +30,28 @@ static void
 gen7_upload_tcs_push_constants(struct brw_context *brw)
 {
    struct brw_stage_state *stage_state = &brw->tcs.base;
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    const struct brw_tess_ctrl_program *tcp =
       (struct brw_tess_ctrl_program *) brw->tess_ctrl_program;
+   bool active = brw->tess_eval_program;
 
-   if (tcp) {
+   if (active) {
       /* BRW_NEW_TCS_PROG_DATA */
       const struct brw_stage_prog_data *prog_data = &brw->tcs.prog_data->base.base;
       gen6_upload_push_constants(brw, &tcp->program.Base, prog_data,
                                       stage_state, AUB_TRACE_VS_CONSTANTS);
    }
 
-   gen7_upload_constant_state(brw, stage_state, tcp, _3DSTATE_CONSTANT_HS);
+   gen7_upload_constant_state(brw, stage_state, active, _3DSTATE_CONSTANT_HS);
 }
 
 const struct brw_tracked_state gen7_tcs_push_constants = {
    .dirty = {
       .mesa  = _NEW_PROGRAM_CONSTANTS,
       .brw   = BRW_NEW_BATCH |
+               BRW_NEW_DEFAULT_TESS_LEVELS |
                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
-               BRW_NEW_TESS_CTRL_PROGRAM |
+               BRW_NEW_TESS_PROGRAMS |
                BRW_NEW_TCS_PROG_DATA,
    },
    .emit = gen7_upload_tcs_push_constants,
diff --git a/src/mesa/drivers/dri/i965/gen7_te_state.c b/src/mesa/drivers/dri/i965/gen7_te_state.c
index 2650fa562ec..f221307bc43 100644
--- a/src/mesa/drivers/dri/i965/gen7_te_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_te_state.c
@@ -29,10 +29,8 @@
 static void
 upload_te_state(struct brw_context *brw)
 {
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    bool active = brw->tess_eval_program;
-   if (active)
-      assert(brw->tess_ctrl_program);
 
    const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data;
 
@@ -61,7 +59,7 @@ const struct brw_tracked_state gen7_te_state = {
       .mesa  = 0,
       .brw   = BRW_NEW_CONTEXT |
                BRW_NEW_TES_PROG_DATA |
-               BRW_NEW_TESS_EVAL_PROGRAM,
+               BRW_NEW_TESS_PROGRAMS,
    },
    .emit = upload_te_state,
 };
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index 99a9d3c6500..00edbcca662 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -34,7 +34,7 @@
  *   __________-__________   _________________-_________________
  *  /                     \ /                                   \
  * +-------------------------------------------------------------+
- * |     VS/FS/GS Push     |              VS/GS URB              |
+ * |  VS/HS/DS/GS/FS Push  |           VS/HS/DS/GS URB           |
  * |       Constants       |               Entries               |
  * +-------------------------------------------------------------+
  *
@@ -60,27 +60,32 @@
 static void
 gen7_allocate_push_constants(struct brw_context *brw)
 {
+   /* BRW_NEW_GEOMETRY_PROGRAM */
+   bool gs_present = brw->geometry_program;
+
+   /* BRW_NEW_TESS_PROGRAMS */
+   bool tess_present = brw->tess_eval_program;
+
    unsigned avail_size = 16;
    unsigned multiplier =
       (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1;
 
-   /* BRW_NEW_GEOMETRY_PROGRAM */
-   bool gs_present = brw->geometry_program;
+   int stages = 2 + gs_present + 2 * tess_present;
 
-   unsigned vs_size, gs_size;
-   if (gs_present) {
-      vs_size = avail_size / 3;
-      avail_size -= vs_size;
-      gs_size = avail_size / 2;
-      avail_size -= gs_size;
-   } else {
-      vs_size = avail_size / 2;
-      avail_size -= vs_size;
-      gs_size = 0;
-   }
-   unsigned fs_size = avail_size;
+   /* Divide up the available space equally between stages.  Because we
+    * round down (using floor division), there may be some left over
+    * space.  We allocate that to the pixel shader stage.
+    */
+   unsigned size_per_stage = avail_size / stages;
+
+   unsigned vs_size = size_per_stage;
+   unsigned hs_size = tess_present ? size_per_stage : 0;
+   unsigned ds_size = tess_present ? size_per_stage : 0;
+   unsigned gs_size = gs_present ? size_per_stage : 0;
+   unsigned fs_size = avail_size - size_per_stage * (stages - 1);
 
    gen7_emit_push_constant_state(brw, multiplier * vs_size,
+                                 multiplier * hs_size, multiplier * ds_size,
                                  multiplier * gs_size, multiplier * fs_size);
 
    /* From p115 of the Ivy Bridge PRM (3.2.1.4 3DSTATE_PUSH_CONSTANT_ALLOC_VS):
@@ -99,15 +104,24 @@ gen7_allocate_push_constants(struct brw_context *brw)
 
 void
 gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
+                              unsigned hs_size, unsigned ds_size,
                               unsigned gs_size, unsigned fs_size)
 {
    unsigned offset = 0;
 
-   BEGIN_BATCH(6);
+   BEGIN_BATCH(10);
    OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2));
    OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
    offset += vs_size;
 
+   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_HS << 16 | (2 - 2));
+   OUT_BATCH(hs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += hs_size;
+
+   OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_DS << 16 | (2 - 2));
+   OUT_BATCH(ds_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
+   offset += ds_size;
+
    OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2));
    OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT);
    offset += gs_size;
@@ -130,7 +144,9 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
 const struct brw_tracked_state gen7_push_constant_space = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM,
+      .brw = BRW_NEW_CONTEXT |
+             BRW_NEW_GEOMETRY_PROGRAM |
+             BRW_NEW_TESS_PROGRAMS,
    },
    .emit = gen7_allocate_push_constants,
 };
@@ -138,6 +154,7 @@ const struct brw_tracked_state gen7_push_constant_space = {
 static void
 gen7_upload_urb(struct brw_context *brw)
 {
+   const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
    const int push_size_kB =
       (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16;
 
@@ -149,6 +166,15 @@ gen7_upload_urb(struct brw_context *brw)
    unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1;
    unsigned gs_entry_size_bytes = gs_size * 64;
 
+   /* BRW_NEW_TESS_PROGRAMS */
+   const bool tess_present = brw->tess_eval_program;
+   /* BRW_NEW_TCS_PROG_DATA */
+   unsigned hs_size = tess_present ? brw->tcs.prog_data->base.urb_entry_size : 1;
+   unsigned hs_entry_size_bytes = hs_size * 64;
+   /* BRW_NEW_TES_PROG_DATA */
+   unsigned ds_size = tess_present ? brw->tes.prog_data->base.urb_entry_size : 1;
+   unsigned ds_entry_size_bytes = ds_size * 64;
+
    /* If we're just switching between programs with the same URB requirements,
     * skip the rest of the logic.
     */
@@ -156,21 +182,29 @@ gen7_upload_urb(struct brw_context *brw)
        !(brw->ctx.NewDriverState & BRW_NEW_URB_SIZE) &&
        brw->urb.vsize == vs_size &&
        brw->urb.gs_present == gs_present &&
-       brw->urb.gsize == gs_size) {
+       brw->urb.gsize == gs_size &&
+       brw->urb.tess_present == tess_present &&
+       brw->urb.hsize == hs_size &&
+       brw->urb.dsize == ds_size) {
       return;
    }
    brw->urb.vsize = vs_size;
    brw->urb.gs_present = gs_present;
    brw->urb.gsize = gs_size;
+   brw->urb.tess_present = tess_present;
+   brw->urb.hsize = hs_size;
+   brw->urb.dsize = ds_size;
 
    /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS):
     *
     *     VS Number of URB Entries must be divisible by 8 if the VS URB Entry
     *     Allocation Size is less than 9 512-bit URB entries.
     *
-    * Similar text exists for GS.
+    * Similar text exists for HS, DS and GS.
     */
    unsigned vs_granularity = (vs_size < 9) ? 8 : 1;
+   unsigned hs_granularity = (hs_size < 9) ? 8 : 1;
+   unsigned ds_granularity = (ds_size < 9) ? 8 : 1;
    unsigned gs_granularity = (gs_size < 9) ? 8 : 1;
 
    /* URB allocations must be done in 8k chunks. */
@@ -191,13 +225,20 @@ gen7_upload_urb(struct brw_context *brw)
     * additional space it could actually make use of).
     */
 
-   /* VS has a lower limit on the number of URB entries */
+   /* VS has a lower limit on the number of URB entries.
+    *
+    * From the Broadwell PRM, 3DSTATE_URB_VS instruction:
+    * "When tessellation is enabled, the VS Number of URB Entries must be
+    *  greater than or equal to 192."
+    */
+   unsigned vs_min_entries =
+      tess_present && brw->gen == 8 ? 192 : brw->urb.min_vs_entries;
+
    unsigned vs_chunks =
-      ALIGN(brw->urb.min_vs_entries * vs_entry_size_bytes, chunk_size_bytes) /
-      chunk_size_bytes;
+      DIV_ROUND_UP(vs_min_entries * vs_entry_size_bytes, chunk_size_bytes);
    unsigned vs_wants =
-      ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes,
-            chunk_size_bytes) / chunk_size_bytes - vs_chunks;
+      DIV_ROUND_UP(brw->urb.max_vs_entries * vs_entry_size_bytes,
+                   chunk_size_bytes) - vs_chunks;
 
    unsigned gs_chunks = 0;
    unsigned gs_wants = 0;
@@ -210,21 +251,42 @@ gen7_upload_urb(struct brw_context *brw)
        *
        * (2) We can't allocate less than nr_gs_entries_granularity.
        */
-      gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
-                        chunk_size_bytes) / chunk_size_bytes;
-      gs_wants =
-         ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes,
-               chunk_size_bytes) / chunk_size_bytes - gs_chunks;
+      gs_chunks = DIV_ROUND_UP(MAX2(gs_granularity, 2) * gs_entry_size_bytes,
+                               chunk_size_bytes);
+      gs_wants = DIV_ROUND_UP(brw->urb.max_gs_entries * gs_entry_size_bytes,
+                              chunk_size_bytes) - gs_chunks;
+   }
+
+   unsigned hs_chunks = 0;
+   unsigned hs_wants = 0;
+   unsigned ds_chunks = 0;
+   unsigned ds_wants = 0;
+
+   if (tess_present) {
+      hs_chunks =
+         DIV_ROUND_UP(hs_granularity * hs_entry_size_bytes,
+                      chunk_size_bytes);
+      hs_wants =
+         DIV_ROUND_UP(devinfo->urb.max_hs_entries * hs_entry_size_bytes,
+                      chunk_size_bytes) - hs_chunks;
+
+      ds_chunks =
+         DIV_ROUND_UP(devinfo->urb.min_ds_entries * ds_entry_size_bytes,
+                      chunk_size_bytes);
+      ds_wants =
+         DIV_ROUND_UP(brw->urb.max_ds_entries * ds_entry_size_bytes,
+                      chunk_size_bytes) - ds_chunks;
    }
 
    /* There should always be enough URB space to satisfy the minimum
     * requirements of each stage.
     */
-   unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks;
+   unsigned total_needs = push_constant_chunks +
+                          vs_chunks + hs_chunks + ds_chunks + gs_chunks;
    assert(total_needs <= urb_chunks);
 
    /* Mete out remaining space (if any) in proportion to "wants". */
-   unsigned total_wants = vs_wants + gs_wants;
+   unsigned total_wants = vs_wants + hs_wants + ds_wants + gs_wants;
    unsigned remaining_space = urb_chunks - total_needs;
    if (remaining_space > total_wants)
       remaining_space = total_wants;
@@ -233,61 +295,100 @@ gen7_upload_urb(struct brw_context *brw)
          roundf(vs_wants * (((float) remaining_space) / total_wants));
       vs_chunks += vs_additional;
       remaining_space -= vs_additional;
+      total_wants -= vs_wants;
+
+      unsigned hs_additional = (unsigned)
+         round(hs_wants * (((double) remaining_space) / total_wants));
+      hs_chunks += hs_additional;
+      remaining_space -= hs_additional;
+      total_wants -= hs_wants;
+
+      unsigned ds_additional = (unsigned)
+         round(ds_wants * (((double) remaining_space) / total_wants));
+      ds_chunks += ds_additional;
+      remaining_space -= ds_additional;
+      total_wants -= ds_wants;
+
       gs_chunks += remaining_space;
    }
 
    /* Sanity check that we haven't over-allocated. */
-   assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks);
+   assert(push_constant_chunks +
+          vs_chunks + hs_chunks + ds_chunks + gs_chunks <= urb_chunks);
 
    /* Finally, compute the number of entries that can fit in the space
     * allocated to each stage.
     */
    unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes;
+   unsigned nr_hs_entries = hs_chunks * chunk_size_bytes / hs_entry_size_bytes;
+   unsigned nr_ds_entries = ds_chunks * chunk_size_bytes / ds_entry_size_bytes;
    unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes;
 
    /* Since we rounded up when computing *_wants, this may be slightly more
     * than the maximum allowed amount, so correct for that.
     */
    nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries);
+   nr_hs_entries = MIN2(nr_hs_entries, brw->urb.max_hs_entries);
+   nr_ds_entries = MIN2(nr_ds_entries, brw->urb.max_ds_entries);
    nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries);
 
    /* Ensure that we program a multiple of the granularity. */
    nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity);
+   nr_hs_entries = ROUND_DOWN_TO(nr_hs_entries, hs_granularity);
+   nr_ds_entries = ROUND_DOWN_TO(nr_ds_entries, ds_granularity);
    nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity);
 
    /* Finally, sanity check to make sure we have at least the minimum number
     * of entries needed for each stage.
     */
-   assert(nr_vs_entries >= brw->urb.min_vs_entries);
+   assert(nr_vs_entries >= vs_min_entries);
    if (gs_present)
       assert(nr_gs_entries >= 2);
+   if (tess_present) {
+      assert(nr_hs_entries >= 1);
+      assert(nr_ds_entries >= devinfo->urb.min_ds_entries);
+   }
 
    /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems
     * better to put reasonable data in there rather than leave them
     * uninitialized.
     */
    brw->urb.nr_vs_entries = nr_vs_entries;
+   brw->urb.nr_hs_entries = nr_hs_entries;
+   brw->urb.nr_ds_entries = nr_ds_entries;
    brw->urb.nr_gs_entries = nr_gs_entries;
 
    /* Lay out the URB in the following order:
     * - push constants
     * - VS
+    * - HS
+    * - DS
     * - GS
     */
    brw->urb.vs_start = push_constant_chunks;
-   brw->urb.gs_start = push_constant_chunks + vs_chunks;
+   brw->urb.hs_start = push_constant_chunks + vs_chunks;
+   brw->urb.ds_start = push_constant_chunks + vs_chunks + hs_chunks;
+   brw->urb.gs_start = push_constant_chunks + vs_chunks + hs_chunks +
+                       ds_chunks;
 
    if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
       gen7_emit_vs_workaround_flush(brw);
    gen7_emit_urb_state(brw,
                        brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
+                       brw->urb.nr_hs_entries, hs_size, brw->urb.hs_start,
+                       brw->urb.nr_ds_entries, ds_size, brw->urb.ds_start,
                        brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start);
 }
 
 void
 gen7_emit_urb_state(struct brw_context *brw,
-                    unsigned nr_vs_entries, unsigned vs_size,
-                    unsigned vs_start, unsigned nr_gs_entries,
+                    unsigned nr_vs_entries,
+                    unsigned vs_size, unsigned vs_start,
+                    unsigned nr_hs_entries,
+                    unsigned hs_size, unsigned hs_start,
+                    unsigned nr_ds_entries,
+                    unsigned ds_size, unsigned ds_start,
+                    unsigned nr_gs_entries,
                     unsigned gs_size, unsigned gs_start)
 {
    BEGIN_BATCH(8);
@@ -301,14 +402,15 @@ gen7_emit_urb_state(struct brw_context *brw,
              ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
              (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
 
-   /* Allocate the HS and DS zero space - we don't use them. */
    OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
-   OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-             (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
+   OUT_BATCH(nr_hs_entries |
+             ((hs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
+             (hs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
 
    OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2));
-   OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) |
-             (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
+   OUT_BATCH(nr_ds_entries |
+             ((ds_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) |
+             (ds_start << GEN7_URB_STARTING_ADDRESS_SHIFT));
    ADVANCE_BATCH();
 }
 
@@ -318,7 +420,10 @@ const struct brw_tracked_state gen7_urb = {
       .brw = BRW_NEW_CONTEXT |
              BRW_NEW_URB_SIZE |
              BRW_NEW_GEOMETRY_PROGRAM |
+             BRW_NEW_TESS_PROGRAMS |
              BRW_NEW_GS_PROG_DATA |
+             BRW_NEW_TCS_PROG_DATA |
+             BRW_NEW_TES_PROG_DATA |
              BRW_NEW_VS_PROG_DATA,
    },
    .emit = gen7_upload_urb,
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index 06d5e65786b..7def5f5ad3c 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -77,14 +77,11 @@ upload_wm_state(struct brw_context *brw)
       dw1 |= GEN7_WM_KILL_ENABLE;
    }
 
-   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx)) {
-      dw1 |= GEN7_WM_DISPATCH_ENABLE;
-   }
-
    /* _NEW_BUFFERS | _NEW_COLOR */
+   const bool active_fs_has_side_effects =
+      _mesa_active_fragment_shader_has_side_effects(&brw->ctx);
    if (brw_color_buffer_write_enabled(brw) || writes_depth ||
-       prog_data->base.nr_image_params ||
-       dw1 & GEN7_WM_KILL_ENABLE) {
+       active_fs_has_side_effects || dw1 & GEN7_WM_KILL_ENABLE) {
       dw1 |= GEN7_WM_DISPATCH_ENABLE;
    }
    if (multisampled_fbo) {
@@ -110,7 +107,7 @@ upload_wm_state(struct brw_context *brw)
    /* BRW_NEW_FS_PROG_DATA */
    if (prog_data->early_fragment_tests)
       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
-   else if (prog_data->base.nr_image_params)
+   else if (active_fs_has_side_effects)
       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
 
    /* The "UAV access enable" bits are unnecessary on HSW because they only
@@ -123,7 +120,7 @@ upload_wm_state(struct brw_context *brw)
     */
    if (brw->is_haswell &&
        !(brw_color_buffer_write_enabled(brw) || writes_depth) &&
-       prog_data->base.nr_image_params)
+       active_fs_has_side_effects)
       dw2 |= HSW_WM_UAV_ONLY;
 
    BEGIN_BATCH(3);
diff --git a/src/mesa/drivers/dri/i965/gen8_ds_state.c b/src/mesa/drivers/dri/i965/gen8_ds_state.c
index a79e8aa020e..d91eb77e631 100644
--- a/src/mesa/drivers/dri/i965/gen8_ds_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ds_state.c
@@ -31,9 +31,8 @@ gen8_upload_ds_state(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    const struct brw_stage_state *stage_state = &brw->tes.base;
-   /* BRW_NEW_TESS_EVAL_PROGRAM */
+   /* BRW_NEW_TESS_PROGRAMS */
    bool active = brw->tess_eval_program;
-   assert(!active || brw->tess_ctrl_program);
 
    /* BRW_NEW_TES_PROG_DATA */
    const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data;
@@ -92,7 +91,7 @@ const struct brw_tracked_state gen8_ds_state = {
    .dirty = {
       .mesa  = 0,
       .brw   = BRW_NEW_BATCH |
-               BRW_NEW_TESS_EVAL_PROGRAM |
+               BRW_NEW_TESS_PROGRAMS |
                BRW_NEW_TES_PROG_DATA,
    },
    .emit = gen8_upload_ds_state,
diff --git a/src/mesa/drivers/dri/i965/gen8_hs_state.c b/src/mesa/drivers/dri/i965/gen8_hs_state.c
index 38e22359ffb..21f3d469553 100644
--- a/src/mesa/drivers/dri/i965/gen8_hs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_hs_state.c
@@ -30,9 +30,8 @@ static void
 gen8_upload_hs_state(struct brw_context *brw)
 {
    const struct brw_stage_state *stage_state = &brw->tcs.base;
-   /* BRW_NEW_TESS_CTRL_PROGRAM */
-   bool active = brw->tess_ctrl_program;
-   assert(!active || brw->tess_eval_program);
+   /* BRW_NEW_TESS_PROGRAMS */
+   bool active = brw->tess_eval_program;
    /* BRW_NEW_HS_PROG_DATA */
    const struct brw_vue_prog_data *prog_data = &brw->tcs.prog_data->base;
 
@@ -84,7 +83,7 @@ const struct brw_tracked_state gen8_hs_state = {
       .mesa  = 0,
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_TCS_PROG_DATA |
-               BRW_NEW_TESS_CTRL_PROGRAM,
+               BRW_NEW_TESS_PROGRAMS,
    },
    .emit = gen8_upload_hs_state,
 };
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 945f7106e3d..74cdcef015d 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -90,8 +90,7 @@ gen8_upload_ps_extra(struct brw_context *brw,
     *
     * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR
     */
-   if ((_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) ||
-        prog_data->base.nr_image_params) &&
+   if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx) &&
        !brw_color_buffer_write_enabled(brw))
       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 
@@ -157,7 +156,7 @@ upload_wm_state(struct brw_context *brw)
    /* BRW_NEW_FS_PROG_DATA */
    if (brw->wm.prog_data->early_fragment_tests)
       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS;
-   else if (brw->wm.prog_data->base.nr_image_params)
+   else if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx))
       dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC;
 
    BEGIN_BATCH(2);
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 24761a70638..06672c1b4db 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -362,6 +362,7 @@ intelInitExtensions(struct gl_context *ctx)
 
    if (brw->gen >= 8) {
       ctx->Extensions.ARB_stencil_texturing = true;
+      ctx->Extensions.ARB_tessellation_shader = true;
    }
 
    if (brw->gen >= 9) {
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index cc90efe4886..a9f58b09422 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1339,6 +1339,11 @@ set_max_gl_versions(struct intel_screen *screen)
    switch (screen->devinfo->gen) {
    case 9:
    case 8:
+      psp->max_gl_core_version = 33;
+      psp->max_gl_compat_version = 30;
+      psp->max_gl_es1_version = 11;
+      psp->max_gl_es2_version = 31;
+      break;
    case 7:
    case 6:
       psp->max_gl_core_version = 33;
@@ -1492,6 +1497,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
 
    intelScreen->compiler = brw_compiler_create(intelScreen,
                                                intelScreen->devinfo);
+   intelScreen->program_id = 1;
 
    if (intelScreen->devinfo->has_resource_streamer) {
       int val = -1;
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_screen.c b/src/mesa/drivers/dri/nouveau/nouveau_screen.c
index 153f18e4a34..6f61f66f3b0 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_screen.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_screen.c
@@ -40,6 +40,9 @@
 #include "main/renderbuffer.h"
 #include "swrast/s_renderbuffer.h"
 
+#include <nvif/class.h>
+#include <nvif/cl0080.h>
+
 static const __DRIextension *nouveau_screen_extensions[];
 
 static void
@@ -99,12 +102,22 @@ nouveau_init_screen2(__DRIscreen *dri_screen)
 	dri_screen->driverPrivate = screen;
 
 	/* Open the DRM device. */
-	ret = nouveau_device_wrap(dri_screen->fd, 0, &screen->device);
+	ret = nouveau_drm_new(dri_screen->fd, &screen->drm);
 	if (ret) {
 		nouveau_error("Error opening the DRM device.\n");
 		goto fail;
 	}
 
+	ret = nouveau_device_new(&screen->drm->client, NV_DEVICE,
+				 &(struct nv_device_v0) {
+					.device = ~0ULL,
+				 }, sizeof(struct nv_device_v0),
+				 &screen->device);
+	if (ret) {
+		nouveau_error("Error creating device object.\n");
+		goto fail;
+	}
+
 	/* Choose the card specific function pointers. */
 	switch (screen->device->chipset & 0xf0) {
 	case 0x00:
@@ -213,6 +226,7 @@ nouveau_destroy_screen(__DRIscreen *dri_screen)
 		return;
 
 	nouveau_device_del(&screen->device);
+	nouveau_drm_del(&screen->drm);
 
 	free(screen);
 	dri_screen->driverPrivate = NULL;
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_screen.h b/src/mesa/drivers/dri/nouveau/nouveau_screen.h
index 45b1ee928d6..e3c192802d4 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_screen.h
+++ b/src/mesa/drivers/dri/nouveau/nouveau_screen.h
@@ -33,6 +33,7 @@ struct nouveau_context;
 
 struct nouveau_screen {
 	__DRIscreen *dri_screen;
+	struct nouveau_drm *drm;
 	struct nouveau_device *device;
 	const struct nouveau_driver *driver;
 };
diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c
index 5c7dcac3841..8462ab6627d 100644
--- a/src/mesa/drivers/osmesa/osmesa.c
+++ b/src/mesa/drivers/osmesa/osmesa.c
@@ -645,10 +645,100 @@ GLAPI OSMesaContext GLAPIENTRY
 OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits,
                         GLint accumBits, OSMesaContext sharelist )
 {
+   int attribs[100], n = 0;
+
+   attribs[n++] = OSMESA_FORMAT;
+   attribs[n++] = format;
+   attribs[n++] = OSMESA_DEPTH_BITS;
+   attribs[n++] = depthBits;
+   attribs[n++] = OSMESA_STENCIL_BITS;
+   attribs[n++] = stencilBits;
+   attribs[n++] = OSMESA_ACCUM_BITS;
+   attribs[n++] = accumBits;
+   attribs[n++] = 0;
+
+   return OSMesaCreateContextAttribs(attribs, sharelist);
+}
+
+
+/**
+ * New in Mesa 11.2
+ *
+ * Create context with attribute list.
+ */
+GLAPI OSMesaContext GLAPIENTRY
+OSMesaCreateContextAttribs(const int *attribList, OSMesaContext sharelist)
+{
    OSMesaContext osmesa;
    struct dd_function_table functions;
    GLint rind, gind, bind, aind;
    GLint redBits = 0, greenBits = 0, blueBits = 0, alphaBits =0;
+   GLenum format = OSMESA_RGBA;
+   GLint depthBits = 0, stencilBits = 0, accumBits = 0;
+   int profile = OSMESA_COMPAT_PROFILE, version_major = 1, version_minor = 0;
+   gl_api api_profile = API_OPENGL_COMPAT;
+   int i;
+
+   for (i = 0; attribList[i]; i += 2) {
+      switch (attribList[i]) {
+      case OSMESA_FORMAT:
+         format = attribList[i+1];
+         switch (format) {
+         case OSMESA_COLOR_INDEX:
+         case OSMESA_RGBA:
+         case OSMESA_BGRA:
+         case OSMESA_ARGB:
+         case OSMESA_RGB:
+         case OSMESA_BGR:
+         case OSMESA_RGB_565:
+            /* legal */
+            break;
+         default:
+            return NULL;
+         }
+         break;
+      case OSMESA_DEPTH_BITS:
+         depthBits = attribList[i+1];
+         if (depthBits < 0)
+            return NULL;
+         break;
+      case OSMESA_STENCIL_BITS:
+         stencilBits = attribList[i+1];
+         if (stencilBits < 0)
+            return NULL;
+         break;
+      case OSMESA_ACCUM_BITS:
+         accumBits = attribList[i+1];
+         if (accumBits < 0)
+            return NULL;
+         break;
+      case OSMESA_PROFILE:
+         profile = attribList[i+1];
+         if (profile == OSMESA_COMPAT_PROFILE)
+            api_profile = API_OPENGL_COMPAT;
+         else if (profile == OSMESA_CORE_PROFILE)
+            api_profile = API_OPENGL_CORE;
+         else
+            return NULL;
+         break;
+      case OSMESA_CONTEXT_MAJOR_VERSION:
+         version_major = attribList[i+1];
+         if (version_major < 1)
+            return NULL;
+         break;
+      case OSMESA_CONTEXT_MINOR_VERSION:
+         version_minor = attribList[i+1];
+         if (version_minor < 0)
+            return NULL;
+         break;
+      case 0:
+         /* end of list */
+         break;
+      default:
+         fprintf(stderr, "Bad attribute in OSMesaCreateContextAttribs()\n");
+         return NULL;
+      }
+   }
 
    rind = gind = bind = aind = 0;
    if (format==OSMESA_RGBA) {
@@ -742,7 +832,7 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits,
       functions.UpdateState = osmesa_update_state;
 
       if (!_mesa_initialize_context(&osmesa->mesa,
-                                    API_OPENGL_COMPAT,
+                                    api_profile,
                                     osmesa->gl_visual,
                                     sharelist ? &sharelist->mesa
                                               : (struct gl_context *) NULL,
@@ -819,6 +909,13 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits,
 
          _mesa_compute_version(ctx);
 
+         if (ctx->Version < version_major * 10 + version_minor) {
+            _mesa_destroy_visual(osmesa->gl_visual);
+            _mesa_free_context_data(ctx);
+            free(osmesa);
+            return NULL;
+         }
+
          /* Exec table initialization requires the version to be computed */
          _mesa_initialize_dispatch_tables(ctx);
          _mesa_initialize_vbo_vtxfmt(ctx);
@@ -1121,6 +1218,7 @@ struct name_function
 static struct name_function functions[] = {
    { "OSMesaCreateContext", (OSMESAproc) OSMesaCreateContext },
    { "OSMesaCreateContextExt", (OSMESAproc) OSMesaCreateContextExt },
+   { "OSMesaCreateContextAttribs", (OSMESAproc) OSMesaCreateContextAttribs },
    { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext },
    { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent },
    { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext },
diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c
index 935ba05b7cc..8fcbff6a7a4 100644
--- a/src/mesa/main/atifragshader.c
+++ b/src/mesa/main/atifragshader.c
@@ -293,7 +293,7 @@ _mesa_DeleteFragmentShaderATI(GLuint id)
 	 prog->RefCount--;
 	 if (prog->RefCount <= 0) {
 	    assert(prog != &DummyShader);
-	    free(prog);
+            _mesa_delete_ati_fragment_shader(ctx, prog);
 	 }
       }
    }
@@ -345,6 +345,9 @@ _mesa_BeginFragmentShaderATI(void)
    ctx->ATIFragmentShader.Current->isValid = GL_FALSE;
    ctx->ATIFragmentShader.Current->swizzlerq = 0;
    ctx->ATIFragmentShader.Compiling = 1;
+#if MESA_DEBUG_ATI_FS
+   _mesa_debug(ctx, "%s %u\n", __func__, ctx->ATIFragmentShader.Current->Id);
+#endif
 }
 
 void GLAPIENTRY
diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c
index abc553966e9..5729e601c12 100644
--- a/src/mesa/main/blit.c
+++ b/src/mesa/main/blit.c
@@ -286,8 +286,17 @@ _mesa_blit_framebuffer(struct gl_context *ctx,
             }
             /* extra checks for multisample copies... */
             if (readFb->Visual.samples > 0 || drawFb->Visual.samples > 0) {
-               /* color formats must match */
-               if (!compatible_resolve_formats(colorReadRb, colorDrawRb)) {
+               /* color formats must match on GLES. This isn't checked on
+                * desktop GL because the GL 4.4 spec was changed to allow it.
+                * In the section entitled “Changes in the released
+                * Specification of July 22, 2013” it says:
+                *
+                * “Relax BlitFramebuffer in section 18.3.1 so that format
+                *  conversion can take place during multisample blits, since
+                *  drivers already allow this and some apps depend on it.”
+                */
+               if (_mesa_is_gles(ctx) &&
+                   !compatible_resolve_formats(colorReadRb, colorDrawRb)) {
                   _mesa_error(ctx, GL_INVALID_OPERATION,
                          "%s(bad src/dst multisample pixel formats)", func);
                   return;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index fabe9913c11..26dfac1ab90 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2155,8 +2155,6 @@ struct gl_compute_program_state
 /**
  * ATI_fragment_shader runtime state
  */
-#define ATI_FS_INPUT_PRIMARY 0
-#define ATI_FS_INPUT_SECONDARY 1
 
 struct atifs_instruction;
 struct atifs_setupinst;
@@ -4542,11 +4540,22 @@ enum _debug
    DEBUG_INCOMPLETE_FBO         = (1 << 3)
 };
 
+/**
+ * Checks if the active fragment shader program can have side effects due
+ * to use of things like atomic buffers or images
+ */
 static inline bool
-_mesa_active_fragment_shader_has_atomic_ops(const struct gl_context *ctx)
+_mesa_active_fragment_shader_has_side_effects(const struct gl_context *ctx)
 {
-   return ctx->Shader._CurrentFragmentProgram != NULL &&
-      ctx->Shader._CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT]->NumAtomicBuffers > 0;
+   const struct gl_shader *sh;
+
+   if (!ctx->_Shader->_CurrentFragmentProgram)
+      return false;
+
+   sh = ctx->_Shader->_CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT];
+   return sh->NumAtomicBuffers > 0 ||
+          sh->NumImages > 0 ||
+          sh->NumShaderStorageBlocks > 0;
 }
 
 #ifdef __cplusplus
diff --git a/src/mesa/main/performance_monitor.c b/src/mesa/main/performance_monitor.c
index 98dfbea083c..43529b2b35a 100644
--- a/src/mesa/main/performance_monitor.c
+++ b/src/mesa/main/performance_monitor.c
@@ -591,11 +591,10 @@ perf_monitor_result_size(const struct gl_context *ctx,
 
    for (group = 0; group < ctx->PerfMonitor.NumGroups; group++) {
       const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[group];
-      for (counter = 0; counter < g->NumCounters; counter++) {
-         const struct gl_perf_monitor_counter *c = &g->Counters[counter];
+      BITSET_WORD tmp;
 
-         if (!BITSET_TEST(m->ActiveCounters[group], counter))
-            continue;
+      BITSET_FOREACH_SET(counter, tmp, m->ActiveCounters[group], g->NumCounters) {
+         const struct gl_perf_monitor_counter *c = &g->Counters[counter];
 
          size += sizeof(uint32_t); /* Group ID */
          size += sizeof(uint32_t); /* Counter ID */
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index ced10a93b1d..e526119db19 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -1373,46 +1373,107 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg,
 }
 
 static bool
-validate_io(const struct gl_shader *input_stage,
-            const struct gl_shader *output_stage, bool isES)
+validate_io(const struct gl_shader *producer,
+            const struct gl_shader *consumer, bool isES)
 {
-   assert(input_stage && output_stage);
+   assert(producer && consumer);
+   unsigned inputs = 0, outputs = 0;
+
+   /* From OpenGL ES 3.1 spec (Interface matching):
+    *
+    *    "An output variable is considered to match an input variable in the
+    *    subsequent shader if:
+    *
+    *    - the two variables match in name, type, and qualification; or
+    *    - the two variables are declared with the same location qualifier and
+    *      match in type and qualification.
+    *
+    *    ...
+    *
+    *    At an interface between program objects, the set of inputs and outputs
+    *    are considered to match exactly if and only if:
+    *
+    *    - Every declared input variable has a matching output, as described
+    *    above.
+    *
+    *    - There are no user-defined output variables declared without a
+    *    matching input variable declaration.
+    *
+    *    - All matched input and output variables have identical precision
+    *    qualification.
+    *
+    *    When the set of inputs and outputs on an interface between programs
+    *    matches exactly, all inputs are well-defined except when the
+    *    corresponding outputs were not written in the previous shader. However,
+    *    any mismatch between inputs and outputs will result in a validation
+    *    failure."
+    *
+    * OpenGL Core 4.5 spec includes same paragraph as above but without check
+    * for precision and the last 'validation failure' clause. Therefore
+    * behaviour is more relaxed, input and output amount is not required by the
+    * spec to be validated.
+    *
+    * FIXME: Update once Khronos spec bug #15331 is resolved.
+    * FIXME: Add validation by type, currently information loss during varying
+    * packing makes this challenging.
+    */
+
+   /* Currently no matching done for desktop. */
+   if (!isES)
+      return true;
 
    /* For each output in a, find input in b and do any required checks. */
-   foreach_in_list(ir_instruction, out, input_stage->ir) {
+   foreach_in_list(ir_instruction, out, producer->ir) {
       ir_variable *out_var = out->as_variable();
-      if (!out_var || out_var->data.mode != ir_var_shader_out)
+      if (!out_var || out_var->data.mode != ir_var_shader_out ||
+          is_gl_identifier(out_var->name))
          continue;
 
-      foreach_in_list(ir_instruction, in, output_stage->ir) {
+      outputs++;
+
+      inputs = 0;
+      foreach_in_list(ir_instruction, in, consumer->ir) {
          ir_variable *in_var = in->as_variable();
-         if (!in_var || in_var->data.mode != ir_var_shader_in)
+         if (!in_var || in_var->data.mode != ir_var_shader_in ||
+             is_gl_identifier(in_var->name))
+            continue;
+
+         inputs++;
+
+         /* Match by location qualifier and precision.
+          *
+          * FIXME: Add explicit location matching validation here. Be careful
+          * not to match varyings with explicit locations to varyings without
+          * explicit locations.
+          */
+         if ((in_var->data.explicit_location &&
+             out_var->data.explicit_location) &&
+             in_var->data.location == out_var->data.location &&
+             in_var->data.precision == out_var->data.precision)
             continue;
 
-         if (strcmp(in_var->name, out_var->name) == 0) {
-            /* Since we now only validate precision, we can skip this step for
-             * desktop GLSL shaders, there precision qualifier is ignored.
-             *
-             * From OpenGL 4.50 Shading Language spec, section 4.7:
-             *     "For the purposes of determining if an output from one
-             *     shader stage matches an input of the next stage, the
-             *     precision qualifier need not match."
+         unsigned len = strlen(in_var->name);
+
+         /* Handle input swizzle in variable name. */
+         const char *dot = strchr(in_var->name, '.');
+         if (dot)
+            len = dot - in_var->name;
+
+         /* Match by name and precision. */
+         if (strncmp(in_var->name, out_var->name, len) == 0) {
+            /* From OpenGL ES 3.1 spec:
+             *     "When both shaders are in separate programs, mismatched
+             *     precision qualifiers will result in a program interface
+             *     mismatch that will result in program pipeline validation
+             *     failures, as described in section 7.4.1 (“Shader Interface
+             *     Matching”) of the OpenGL ES 3.1 Specification."
              */
-            if (isES) {
-               /* From OpenGL ES 3.1 spec:
-                *     "When both shaders are in separate programs, mismatched
-                *     precision qualifiers will result in a program interface
-                *     mismatch that will result in program pipeline validation
-                *     failures, as described in section 7.4.1 (“Shader Interface
-                *     Matching”) of the OpenGL ES 3.1 Specification."
-                */
-               if (in_var->data.precision != out_var->data.precision)
-                  return false;
-            }
+            if (in_var->data.precision != out_var->data.precision)
+               return false;
          }
       }
    }
-   return true;
+   return inputs == outputs;
 }
 
 /**
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index ac40891f435..e258ad9d1db 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -208,7 +208,7 @@ _mesa_validate_shader_target(const struct gl_context *ctx, GLenum type)
    case GL_TESS_EVALUATION_SHADER:
       return ctx == NULL || _mesa_has_tessellation(ctx);
    case GL_COMPUTE_SHADER:
-      return ctx == NULL || ctx->Extensions.ARB_compute_shader;
+      return ctx == NULL || _mesa_has_compute_shaders(ctx);
    default:
       return false;
    }
@@ -1514,6 +1514,8 @@ void GLAPIENTRY
 _mesa_LinkProgram(GLhandleARB programObj)
 {
    GET_CURRENT_CONTEXT(ctx);
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glLinkProgram %u\n", programObj);
    link_program(ctx, programObj);
 }
 
@@ -1731,6 +1733,9 @@ _mesa_UseProgram(GLhandleARB program)
    GET_CURRENT_CONTEXT(ctx);
    struct gl_shader_program *shProg;
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glUseProgram %u\n", program);
+
    if (_mesa_is_xfb_active_and_unpaused(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glUseProgram(transform feedback active)");
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index e92bb111cc4..112a73dc0e7 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -433,7 +433,8 @@ compute_version_es1(const struct gl_extensions *extensions)
 }
 
 static GLuint
-compute_version_es2(const struct gl_extensions *extensions)
+compute_version_es2(const struct gl_extensions *extensions,
+                    const struct gl_constants *consts)
 {
    /* OpenGL ES 2.0 is derived from OpenGL 2.0 */
    const bool ver_2_0 = (extensions->ARB_texture_cube_map &&
@@ -464,9 +465,11 @@ compute_version_es2(const struct gl_extensions *extensions)
                          extensions->EXT_texture_snorm &&
                          extensions->NV_primitive_restart &&
                          extensions->OES_depth_texture_cube_map);
+   const bool es31_compute_shader =
+      consts->MaxComputeWorkGroupInvocations >= 128;
    const bool ver_3_1 = (ver_3_0 &&
                          extensions->ARB_arrays_of_arrays &&
-                         extensions->ARB_compute_shader &&
+                         es31_compute_shader &&
                          extensions->ARB_draw_indirect &&
                          extensions->ARB_explicit_uniform_location &&
                          extensions->ARB_framebuffer_no_attachments &&
@@ -508,7 +511,7 @@ _mesa_get_version(const struct gl_extensions *extensions,
    case API_OPENGLES:
       return compute_version_es1(extensions);
    case API_OPENGLES2:
-      return compute_version_es2(extensions);
+      return compute_version_es2(extensions, consts);
    }
    return 0;
 }
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index c5d8c483429..c6f9ef68418 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -1114,7 +1114,13 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
       if (ir->operands[0]->type->is_vector() ||
 	  ir->operands[1]->type->is_vector()) {
 	 src_reg temp = get_temp(glsl_type::vec4_type);
-	 emit(ir, OPCODE_SNE, dst_reg(temp), op[0], op[1]);
+         if (ir->operands[0]->type->is_boolean() &&
+             ir->operands[1]->as_constant() &&
+             ir->operands[1]->as_constant()->is_zero()) {
+            temp = op[0];
+         } else {
+            emit(ir, OPCODE_SNE, dst_reg(temp), op[0], op[1]);
+         }
 
 	 /* After the dot-product, the value will be an integer on the
 	  * range [0,4].  Zero stays zero, and positive values become 1.0.
@@ -1140,32 +1146,6 @@ ir_to_mesa_visitor::visit(ir_expression *ir)
       }
       break;
 
-   case ir_unop_any: {
-      assert(ir->operands[0]->type->is_vector());
-
-      /* After the dot-product, the value will be an integer on the
-       * range [0,4].  Zero stays zero, and positive values become 1.0.
-       */
-      ir_to_mesa_instruction *const dp =
-	 emit_dp(ir, result_dst, op[0], op[0],
-		 ir->operands[0]->type->vector_elements);
-      if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
-	 /* The clamping to [0,1] can be done for free in the fragment
-	  * shader with a saturate.
-	  */
-	 dp->saturate = true;
-      } else {
-	 /* Negating the result of the dot-product gives values on the range
-	  * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
-	  * is achieved using SLT.
-	  */
-	 src_reg slt_src = result_src;
-	 slt_src.negate = ~slt_src.negate;
-	 emit(ir, OPCODE_SLT, result_dst, slt_src, src_reg_for_float(0.0));
-      }
-      break;
-   }
-
    case ir_binop_logic_xor:
       emit(ir, OPCODE_SNE, result_dst, op[0], op[1]);
       break;
diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c
index bdb335e4ba3..12490d0c380 100644
--- a/src/mesa/program/prog_statevars.c
+++ b/src/mesa/program/prog_statevars.c
@@ -474,7 +474,7 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[],
           * single MAD.
           * linear: fogcoord * -1/(end-start) + end/(end-start)
           * exp: 2^-(density/ln(2) * fogcoord)
-          * exp2: 2^-((density/(ln(2)^2) * fogcoord)^2)
+          * exp2: 2^-((density/(sqrt(ln(2))) * fogcoord)^2)
           */
          value[0] = (ctx->Fog.End == ctx->Fog.Start)
             ? 1.0f : (GLfloat)(-1.0F / (ctx->Fog.End - ctx->Fog.Start));
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index d5386ee70e8..8ca830a0b14 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -554,8 +554,8 @@ static void
 ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
 {
    nir_ssa_def *cmp = b->shader->options->native_integers ?
-      nir_bany4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0))) :
-      nir_fany4(b, nir_slt(b, src[0], nir_imm_float(b, 0.0)));
+      nir_bany_inequal4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_int(b, 0)) :
+      nir_fany_nequal4(b, nir_slt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_float(b, 0.0));
 
    nir_intrinsic_instr *discard =
       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
@@ -928,6 +928,7 @@ ptn_add_output_stores(struct ptn_compile *c)
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
       store->num_components = glsl_get_vector_elements(var->type);
+      store->const_index[0] = (1 << store->num_components) - 1;
       store->variables[0] =
          nir_deref_var_create(store, c->output_vars[var->data.location]);
 
@@ -998,6 +999,7 @@ setup_registers_and_variables(struct ptn_compile *c)
             nir_intrinsic_instr *store =
                nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
             store->num_components = 4;
+            store->const_index[0] = WRITEMASK_XYZW;
             store->variables[0] = nir_deref_var_create(store, fullvar);
             store->src[0] = nir_src_for_ssa(f001);
             nir_builder_instr_insert(b, &store->instr);
diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index 4252c27962e..94231cf1946 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -250,7 +250,7 @@ update_shader_samplers(struct st_context *st,
    samplers_used = prog->SamplersUsed;
 
    if (*num_samplers == 0 && samplers_used == 0x0)
-       return;
+      return;
 
    *num_samplers = 0;
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index b6a0e6b4f4f..89ad6cd8c28 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -1776,89 +1776,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       break;
 
-   case ir_unop_any: {
-      assert(ir->operands[0]->type->is_vector());
-
-      if (native_integers) {
-         int dst_swizzle = 0, op0_swizzle, i;
-         st_src_reg accum = op[0];
-
-         op0_swizzle = op[0].swizzle;
-         accum.swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 0),
-                                       GET_SWZ(op0_swizzle, 0),
-                                       GET_SWZ(op0_swizzle, 0),
-                                       GET_SWZ(op0_swizzle, 0));
-         for (i = 0; i < 4; i++) {
-            if (result_dst.writemask & (1 << i)) {
-               dst_swizzle = MAKE_SWIZZLE4(i, i, i, i);
-               break;
-            }
-         }
-         assert(i != 4);
-         assert(ir->operands[0]->type->is_boolean());
-
-         /* OR all the components together, since they should be either 0 or ~0
-          */
-         switch (ir->operands[0]->type->vector_elements) {
-         case 4:
-            op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 3),
-                                          GET_SWZ(op0_swizzle, 3),
-                                          GET_SWZ(op0_swizzle, 3),
-                                          GET_SWZ(op0_swizzle, 3));
-            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
-            accum = st_src_reg(result_dst);
-            accum.swizzle = dst_swizzle;
-            /* fallthrough */
-         case 3:
-            op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 2),
-                                          GET_SWZ(op0_swizzle, 2),
-                                          GET_SWZ(op0_swizzle, 2),
-                                          GET_SWZ(op0_swizzle, 2));
-            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
-            accum = st_src_reg(result_dst);
-            accum.swizzle = dst_swizzle;
-            /* fallthrough */
-         case 2:
-            op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 1),
-                                          GET_SWZ(op0_swizzle, 1),
-                                          GET_SWZ(op0_swizzle, 1),
-                                          GET_SWZ(op0_swizzle, 1));
-            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
-            break;
-         default:
-            assert(!"Unexpected vector size");
-            break;
-         }
-      } else {
-         /* After the dot-product, the value will be an integer on the
-          * range [0,4].  Zero stays zero, and positive values become 1.0.
-          */
-         glsl_to_tgsi_instruction *const dp =
-            emit_dp(ir, result_dst, op[0], op[0],
-                    ir->operands[0]->type->vector_elements);
-         if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
-             result_dst.type == GLSL_TYPE_FLOAT) {
-            /* The clamping to [0,1] can be done for free in the fragment
-             * shader with a saturate.
-             */
-            dp->saturate = true;
-         } else if (result_dst.type == GLSL_TYPE_FLOAT) {
-            /* Negating the result of the dot-product gives values on the range
-             * [-4, 0].  Zero stays zero, and negative values become 1.0.  This
-             * is achieved using SLT.
-             */
-            st_src_reg slt_src = result_src;
-            slt_src.negate = ~slt_src.negate;
-            emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
-         }
-         else {
-            /* Use SNE 0 if integers are being used as boolean values. */
-            emit_asm(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
-         }
-      }
-      break;
-   }
-
    case ir_binop_logic_xor:
       if (native_integers)
          emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
diff --git a/src/mesa/swrast/s_atifragshader.c b/src/mesa/swrast/s_atifragshader.c
index 2974deed41b..414a4144e25 100644
--- a/src/mesa/swrast/s_atifragshader.c
+++ b/src/mesa/swrast/s_atifragshader.c
@@ -26,6 +26,8 @@
 #include "swrast/s_atifragshader.h"
 #include "swrast/s_context.h"
 
+#define ATI_FS_INPUT_PRIMARY 0
+#define ATI_FS_INPUT_SECONDARY 1
 
 /**
  * State for executing ATI fragment shader.
diff --git a/src/util/ralloc.c b/src/util/ralloc.c
index bb4cf9612eb..6d4032bd4f6 100644
--- a/src/util/ralloc.c
+++ b/src/util/ralloc.c
@@ -293,6 +293,7 @@ ralloc_adopt(const void *new_ctx, void *old_ctx)
 
    /* Connect the two lists together; parent them to new_ctx; make old_ctx empty. */
    child->next = new_info->child;
+   child->parent = new_info;
    new_info->child = old_info->child;
    old_info->child = NULL;
 }
diff --git a/src/vulkan/anv_meta.c b/src/vulkan/anv_meta.c
index 63976cdfe42..cf6678d852f 100644
--- a/src/vulkan/anv_meta.c
+++ b/src/vulkan/anv_meta.c
@@ -107,7 +107,7 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim)
    nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out,
                                                  color_type, "f_color");
    color_out->data.location = FRAG_RESULT_DATA0;
-   nir_store_var(&b, color_out, &tex->dest.ssa);
+   nir_store_var(&b, color_out, &tex->dest.ssa, 4);
 
    return b.shader;
 }
diff --git a/src/vulkan/anv_pipeline.c b/src/vulkan/anv_pipeline.c
index 02be5a81984..c2070be10ec 100644
--- a/src/vulkan/anv_pipeline.c
+++ b/src/vulkan/anv_pipeline.c
@@ -116,6 +116,9 @@ anv_shader_compile_to_nir(struct anv_device *device,
 
       nir_inline_functions(nir);
       nir_validate_shader(nir);
+
+      nir_lower_system_values(nir);
+      nir_validate_shader(nir);
    }
 
    /* Vulkan uses the separate-shader linking model */
@@ -379,8 +382,8 @@ anv_pipeline_compile(struct anv_pipeline *pipeline,
    prog_data->binding_table.image_start = bias;
 
    /* Finish the optimization and compilation process */
-   nir = brw_lower_nir(nir, &pipeline->device->info, NULL,
-                       compiler->scalar_stage[stage]);
+   nir = brw_nir_lower_io(nir, &pipeline->device->info,
+                          compiler->scalar_stage[stage]);
 
    /* nir_lower_io will only handle the push constants; we need to set this
     * to the full number of possible uniforms.