Merge remote-tracking branch 'mesa-public/master' into vulkan

author: Jason Ekstrand <[email protected]> 2015-08-25 17:12:03 -0700
committer: Jason Ekstrand <[email protected]> 2015-08-25 18:41:21 -0700
commit: 9b387b5d3f4103c51079ea5298d33086af6da433 (patch)
tree: 4127f2284b6b4a5746bbc01bbfc6a97305057cb4
parent: 5360edcb304e147341b934567f3bbf40e9d5a3b5 (diff)
parent: 1d2a844e7d55645ea3d24fb589bec03695b3d2b1 (diff)
247 files changed, 7061 insertions, 3849 deletions
diff --git a/Makefile.am b/Makefile.am
index 9f49ce65353..149610c7c69 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -32,7 +32,9 @@ AM_DISTCHECK_CONFIGURE_FLAGS = \
 	--enable-vdpau \
 	--enable-xa \
 	--enable-xvmc \
-	--with-egl-platforms=x11,wayland,drm
+	--with-egl-platforms=x11,wayland,drm \
+	--with-dri-drivers=i915,i965,nouveau,radeon,r200,swrast \
+	--with-gallium-drivers=i915,ilo,nouveau,r300,r600,radeonsi,freedreno,svga,swrast
 
 ACLOCAL_AMFLAGS = -I m4
 
diff --git a/VERSION b/VERSION
index 2b1181ddc3f..8a1e8e32743 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-11.0.0-devel
+11.1.0-devel
diff --git a/configure.ac b/configure.ac
index 74e13b3fcb7..dd23ecadd08 100644
--- a/configure.ac
+++ b/configure.ac
@@ -74,7 +74,7 @@ LIBDRM_AMDGPU_REQUIRED=2.4.63
 LIBDRM_INTEL_REQUIRED=2.4.61
 LIBDRM_NVVIEUX_REQUIRED=2.4.33
 LIBDRM_NOUVEAU_REQUIRED=2.4.62
-LIBDRM_FREEDRENO_REQUIRED=2.4.57
+LIBDRM_FREEDRENO_REQUIRED=2.4.64
 DRI2PROTO_REQUIRED=2.6
 DRI3PROTO_REQUIRED=1.0
 PRESENTPROTO_REQUIRED=1.0
@@ -1639,6 +1639,10 @@ if test "x$enable_nine" = xyes; then
     if test "x$with_gallium_drivers" = xswrast; then
         AC_MSG_ERROR([nine requires at least one non-swrast gallium driver])
     fi
+    if test $GCC_VERSION_MAJOR -lt 4 -o $GCC_VERSION_MAJOR -eq 4 -a $GCC_VERSION_MINOR -lt 6; then
+        AC_MSG_ERROR([gcc >= 4.6 is required to build nine])
+    fi
+
     if test "x$enable_dri3" = xno; then
         AC_MSG_WARN([using nine together with wine requires DRI3 enabled system])
     fi
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 54c0c5aa6a8..331b2daaeb6 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -163,7 +163,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_multi_draw_indirect                           DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_program_interface_query                       DONE (all drivers)
   GL_ARB_robust_buffer_access_behavior                 not started
-  GL_ARB_shader_image_size                             in progress (Martin Peres)
+  GL_ARB_shader_image_size                             DONE (i965)
   GL_ARB_shader_storage_buffer_object                  in progress (Iago Toral, Samuel Iglesias)
   GL_ARB_stencil_texturing                             DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_buffer_range                          DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
@@ -210,8 +210,8 @@ GLES3.1, GLSL ES 3.1
   GL_ARB_framebuffer_no_attachments                    DONE (i965)
   GL_ARB_program_interface_query                       DONE (all drivers)
   GL_ARB_shader_atomic_counters                        DONE (i965)
-  GL_ARB_shader_image_load_store                       in progress (curro)
-  GL_ARB_shader_image_size                             in progress (Martin Peres)
+  GL_ARB_shader_image_load_store                       DONE (i965)
+  GL_ARB_shader_image_size                             DONE (i965)
   GL_ARB_shader_storage_buffer_object                  in progress (Iago Toral, Samuel Iglesias)
   GL_ARB_shading_language_packing                      DONE (all drivers)
   GL_ARB_separate_shader_objects                       DONE (all drivers)
diff --git a/docs/index.html b/docs/index.html
index b9e6148914e..b067256d7b2 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,12 @@
 
 <h1>News</h1>
 
+<h2>August 22 2015</h2>
+<p>
+<a href="relnotes/10.6.5.html">Mesa 10.6.5</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>August 11 2015</h2>
 <p>
 <a href="relnotes/10.6.4.html">Mesa 10.6.4</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 39e7f61e792..2cc4701d3ff 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/10.6.5.html">10.6.5 release notes</a>
 <li><a href="relnotes/10.6.4.html">10.6.4 release notes</a>
 <li><a href="relnotes/10.6.3.html">10.6.3 release notes</a>
 <li><a href="relnotes/10.6.2.html">10.6.2 release notes</a>
diff --git a/docs/relnotes/10.6.5.html b/docs/relnotes/10.6.5.html
new file mode 100644
index 00000000000..e7326ff1941
--- /dev/null
+++ b/docs/relnotes/10.6.5.html
@@ -0,0 +1,124 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.6.5 Release Notes / August 22, 2015</h1>
+
+<p>
+Mesa 10.6.5 is a bug fix release which fixes bugs found since the 10.6.4 release.
+</p>
+<p>
+Mesa 10.6.5 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+afe290fc7af75a25df5ee52396a9f09e5dba85fb3e159304bdda265b8564b0d4  mesa-10.6.5.tar.gz
+fb6fac3c85bcfa9d06b8dd439169f23f0c0924a88e44362e738b99b1feff762f  mesa-10.6.5.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=85252">Bug 85252</a> - Segfault in compiler while processing ternary operator with void arguments</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91570">Bug 91570</a> - Upgrading mesa to 10.6 causes segfault in OpenGL applications with GeForce4 MX 440 / AGP 8X</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91610">Bug 91610</a> - [BSW] GPU hang for spec.shaders.point-vertex-id gl_instanceid divisor</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Adam Jackson (1):</p>
+<ul>
+  <li>glx: Fix __glXWireToEvent for BufferSwapComplete</li>
+</ul>
+
+<p>Alex Deucher (2):</p>
+<ul>
+  <li>radeonsi: add new OLAND pci id</li>
+  <li>radeonsi: properly set the raster_config for KV</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 10.6.4</li>
+  <li>vc4: add missing nir include, to fix the build</li>
+  <li>Revert "radeonsi: properly set the raster_config for KV"</li>
+  <li>Update version to 10.6.5</li>
+</ul>
+
+<p>Frank Binns (1):</p>
+<ul>
+  <li>egl/x11: don't abort when creating a DRI2 drawable fails</li>
+</ul>
+
+<p>Ilia Mirkin (3):</p>
+<ul>
+  <li>nouveau: no need to do tnl wakeup, state updates are always hooked up</li>
+  <li>gm107/ir: indirect handle goes first on maxwell also</li>
+  <li>nv50,nvc0: take level into account when doing eng2d multi-layer blits</li>
+</ul>
+
+<p>Jason Ekstrand (4):</p>
+<ul>
+  <li>meta/copy_image: Stash off the scissor</li>
+  <li>mesa/formats: Only do byteswapping for packed formats</li>
+  <li>mesa/formats: Fix swizzle flipping for big-endian targets</li>
+  <li>mesa/formats: Don't flip channels of null array formats</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>radeonsi: fix polygon offset scale</li>
+  <li>r600g: fix polygon offset scale</li>
+  <li>r600g: allow setting geometry shader sampler states</li>
+</ul>
+
+<p>Neil Roberts (1):</p>
+<ul>
+  <li>i965/bdw: Fix setting the instancing state for the SGVS element</li>
+</ul>
+
+<p>Oded Gabbay (2):</p>
+<ul>
+  <li>mesa: clear existing swizzle info before bitwise-OR</li>
+  <li>mesa/formats: don't byteswap when building array formats</li>
+</ul>
+
+<p>Renaud Gaubert (1):</p>
+<ul>
+  <li>glsl: avoid compiler's segfault when processing operators with void arguments</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/11.0.0.html b/docs/relnotes/11.0.0.html
index 75967ac7eec..537b88341b5 100644
--- a/docs/relnotes/11.0.0.html
+++ b/docs/relnotes/11.0.0.html
@@ -46,9 +46,12 @@ Note: some of the new features are only available with certain drivers.
 <ul>
 <li>New hardware support for AMD GCN 1.2 GPUs: Tonga, Iceland, Carrizo, Fiji</li>
 <li>OpenGL 4.1 on radeonsi, nvc0</li>
+<li>OpenGL ES 3.0 on freedreno (a3xx, a4xx)
 <li>GL_AMD_vertex_shader_viewport_index on radeonsi</li>
 <li>GL_ARB_conditional_render_inverted on r600, radeonsi</li>
+<li>GL_ARB_depth_buffer_float on a4xx</li>
 <li>GL_ARB_derivative_control on radeonsi</li>
+<li>GL_ARB_draw_buffers, GL_ARB_draw_buffers_blend on a4xx</li>
 <li>GL_ARB_fragment_layer_viewport on radeonsi</li>
 <li>GL_ARB_framebuffer_no_attachments on i965</li>
 <li>GL_ARB_get_texture_sub_image for all drivers</li>
@@ -56,12 +59,15 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_gpu_shader_fp64 on llvmpipe, radeonsi</li>
 <li>GL_ARB_shader_image_load_store on i965</li>
 <li>GL_ARB_shader_precision on radeonsi, nvc0</li>
+<li>GL_ARB_shader_image_size on i965</li>
 <li>GL_ARB_shader_stencil_export on llvmpipe</li>
 <li>GL_ARB_shader_subroutine on core profile all drivers</li>
 <li>GL_ARB_tessellation_shader on nvc0, radeonsi</li>
+<li>GL_ARB_transform_feedback2, GL_ARB_transform_feedback_instanced, GL_EXT_transform_feedback on a3xx, a4xx</li>
 <li>GL_ARB_vertex_attrib_64bit on llvmpipe, radeonsi</li>
 <li>GL_ARB_viewport_array on radeonsi</li>
 <li>GL_EXT_depth_bounds_test on radeonsi, nv30, nv50, nvc0</li>
+<li>GL_EXT_texture_compression_s3tc on freedreno (a3xx)</li>
 <li>GL_NV_read_depth (GLES) on all drivers</li>
 <li>GL_NV_read_depth_stencil (GLES) on all drivers</li>
 <li>GL_NV_read_stencil (GLES) on all drivers</li>
@@ -69,6 +75,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_OES_texture_half_float on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
 <li>GL_OES_texture_float_linear on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
 <li>GL_OES_texture_half_float_linear on all r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe</li>
+<li>GL_EXT_draw_buffers2 on a4xx</li>
 <li>GLX_ARB_create_context_robustness on r600, radeonsi</li>
 <li>EGL_EXT_create_context_robustness on r600, radeonsi</li>
 <li>EGL_KHR_gl_colorspace on r600, radeonsi, nv50, nvc0</li>
diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html
new file mode 100644
index 00000000000..7f80206a9f2
--- /dev/null
+++ b/docs/relnotes/11.1.0.html
@@ -0,0 +1,60 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.1.0 Release Notes / TBD</h1>
+
+<p>
+Mesa 11.1.0 is a new development release.
+People who are concerned with stability and reliability should stick
+with a previous release or wait for Mesa 11.1.1.
+</p>
+<p>
+Mesa 11.1.0 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD.
+</pre>
+
+
+<h2>New features</h2>
+
+<p>
+Note: some of the new features are only available with certain drivers.
+</p>
+
+<ul>
+TBD.
+</ul>
+
+<h2>Bug fixes</h2>
+
+TBD.
+
+<h2>Changes</h2>
+
+TBD.
+
+</div>
+</body>
+</html>
diff --git a/include/GL/glext.h b/include/GL/glext.h
index e5f1d891ec5..907a5826d71 100644
--- a/include/GL/glext.h
+++ b/include/GL/glext.h
@@ -6,7 +6,7 @@ extern "C" {
 #endif
 
 /*
-** Copyright (c) 2013-2014 The Khronos Group Inc.
+** Copyright (c) 2013-2015 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -33,7 +33,7 @@ extern "C" {
 ** used to make the header, and the header can be found at
 **   http://www.opengl.org/registry/
 **
-** Khronos $Revision: 29735 $ on $Date: 2015-02-02 19:00:01 -0800 (Mon, 02 Feb 2015) $
+** Khronos $Revision: 31811 $ on $Date: 2015-08-10 17:01:11 +1000 (Mon, 10 Aug 2015) $
 */
 
 #if defined(_WIN32) && !defined(APIENTRY) && !defined(__CYGWIN__) && !defined(__SCITECH_SNAP__)
@@ -53,7 +53,7 @@ extern "C" {
 #define GLAPI extern
 #endif
 
-#define GL_GLEXT_VERSION 20150202
+#define GL_GLEXT_VERSION 20150809
 
 /* Generated C header for:
  * API: gl
@@ -1041,6 +1041,22 @@ typedef unsigned short GLhalf;
 #define GL_COLOR_ATTACHMENT13             0x8CED
 #define GL_COLOR_ATTACHMENT14             0x8CEE
 #define GL_COLOR_ATTACHMENT15             0x8CEF
+#define GL_COLOR_ATTACHMENT16             0x8CF0
+#define GL_COLOR_ATTACHMENT17             0x8CF1
+#define GL_COLOR_ATTACHMENT18             0x8CF2
+#define GL_COLOR_ATTACHMENT19             0x8CF3
+#define GL_COLOR_ATTACHMENT20             0x8CF4
+#define GL_COLOR_ATTACHMENT21             0x8CF5
+#define GL_COLOR_ATTACHMENT22             0x8CF6
+#define GL_COLOR_ATTACHMENT23             0x8CF7
+#define GL_COLOR_ATTACHMENT24             0x8CF8
+#define GL_COLOR_ATTACHMENT25             0x8CF9
+#define GL_COLOR_ATTACHMENT26             0x8CFA
+#define GL_COLOR_ATTACHMENT27             0x8CFB
+#define GL_COLOR_ATTACHMENT28             0x8CFC
+#define GL_COLOR_ATTACHMENT29             0x8CFD
+#define GL_COLOR_ATTACHMENT30             0x8CFE
+#define GL_COLOR_ATTACHMENT31             0x8CFF
 #define GL_DEPTH_ATTACHMENT               0x8D00
 #define GL_STENCIL_ATTACHMENT             0x8D20
 #define GL_FRAMEBUFFER                    0x8D40
@@ -2859,6 +2875,17 @@ GLAPI void APIENTRY glTextureBarrier (void);
 #define GL_ARB_ES3_1_compatibility 1
 #endif /* GL_ARB_ES3_1_compatibility */
 
+#ifndef GL_ARB_ES3_2_compatibility
+#define GL_ARB_ES3_2_compatibility 1
+#define GL_PRIMITIVE_BOUNDING_BOX_ARB     0x92BE
+#define GL_MULTISAMPLE_LINE_WIDTH_RANGE_ARB 0x9381
+#define GL_MULTISAMPLE_LINE_WIDTH_GRANULARITY_ARB 0x9382
+typedef void (APIENTRYP PFNGLPRIMITIVEBOUNDINGBOXARBPROC) (GLfloat minX, GLfloat minY, GLfloat minZ, GLfloat minW, GLfloat maxX, GLfloat maxY, GLfloat maxZ, GLfloat maxW);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glPrimitiveBoundingBoxARB (GLfloat minX, GLfloat minY, GLfloat minZ, GLfloat minW, GLfloat maxX, GLfloat maxY, GLfloat maxZ, GLfloat maxW);
+#endif
+#endif /* GL_ARB_ES3_2_compatibility */
+
 #ifndef GL_ARB_ES3_compatibility
 #define GL_ARB_ES3_compatibility 1
 #endif /* GL_ARB_ES3_compatibility */
@@ -3272,6 +3299,10 @@ GLAPI GLboolean APIENTRY glIsProgramARB (GLuint program);
 #define GL_FRAGMENT_SHADER_DERIVATIVE_HINT_ARB 0x8B8B
 #endif /* GL_ARB_fragment_shader */
 
+#ifndef GL_ARB_fragment_shader_interlock
+#define GL_ARB_fragment_shader_interlock 1
+#endif /* GL_ARB_fragment_shader_interlock */
+
 #ifndef GL_ARB_framebuffer_no_attachments
 #define GL_ARB_framebuffer_no_attachments 1
 #endif /* GL_ARB_framebuffer_no_attachments */
@@ -3332,6 +3363,91 @@ GLAPI void APIENTRY glFramebufferTextureFaceARB (GLenum target, GLenum attachmen
 #define GL_ARB_gpu_shader_fp64 1
 #endif /* GL_ARB_gpu_shader_fp64 */
 
+#ifndef GL_ARB_gpu_shader_int64
+#define GL_ARB_gpu_shader_int64 1
+#define GL_INT64_ARB                      0x140E
+#define GL_INT64_VEC2_ARB                 0x8FE9
+#define GL_INT64_VEC3_ARB                 0x8FEA
+#define GL_INT64_VEC4_ARB                 0x8FEB
+#define GL_UNSIGNED_INT64_VEC2_ARB        0x8FF5
+#define GL_UNSIGNED_INT64_VEC3_ARB        0x8FF6
+#define GL_UNSIGNED_INT64_VEC4_ARB        0x8FF7
+typedef void (APIENTRYP PFNGLUNIFORM1I64ARBPROC) (GLint location, GLint64 x);
+typedef void (APIENTRYP PFNGLUNIFORM2I64ARBPROC) (GLint location, GLint64 x, GLint64 y);
+typedef void (APIENTRYP PFNGLUNIFORM3I64ARBPROC) (GLint location, GLint64 x, GLint64 y, GLint64 z);
+typedef void (APIENTRYP PFNGLUNIFORM4I64ARBPROC) (GLint location, GLint64 x, GLint64 y, GLint64 z, GLint64 w);
+typedef void (APIENTRYP PFNGLUNIFORM1I64VARBPROC) (GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM2I64VARBPROC) (GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM3I64VARBPROC) (GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM4I64VARBPROC) (GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM1UI64ARBPROC) (GLint location, GLuint64 x);
+typedef void (APIENTRYP PFNGLUNIFORM2UI64ARBPROC) (GLint location, GLuint64 x, GLuint64 y);
+typedef void (APIENTRYP PFNGLUNIFORM3UI64ARBPROC) (GLint location, GLuint64 x, GLuint64 y, GLuint64 z);
+typedef void (APIENTRYP PFNGLUNIFORM4UI64ARBPROC) (GLint location, GLuint64 x, GLuint64 y, GLuint64 z, GLuint64 w);
+typedef void (APIENTRYP PFNGLUNIFORM1UI64VARBPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM2UI64VARBPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM3UI64VARBPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLUNIFORM4UI64VARBPROC) (GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLGETUNIFORMI64VARBPROC) (GLuint program, GLint location, GLint64 *params);
+typedef void (APIENTRYP PFNGLGETUNIFORMUI64VARBPROC) (GLuint program, GLint location, GLuint64 *params);
+typedef void (APIENTRYP PFNGLGETNUNIFORMI64VARBPROC) (GLuint program, GLint location, GLsizei bufSize, GLint64 *params);
+typedef void (APIENTRYP PFNGLGETNUNIFORMUI64VARBPROC) (GLuint program, GLint location, GLsizei bufSize, GLuint64 *params);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1I64ARBPROC) (GLuint program, GLint location, GLint64 x);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2I64ARBPROC) (GLuint program, GLint location, GLint64 x, GLint64 y);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3I64ARBPROC) (GLuint program, GLint location, GLint64 x, GLint64 y, GLint64 z);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4I64ARBPROC) (GLuint program, GLint location, GLint64 x, GLint64 y, GLint64 z, GLint64 w);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1I64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2I64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3I64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4I64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1UI64ARBPROC) (GLuint program, GLint location, GLuint64 x);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2UI64ARBPROC) (GLuint program, GLint location, GLuint64 x, GLuint64 y);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3UI64ARBPROC) (GLuint program, GLint location, GLuint64 x, GLuint64 y, GLuint64 z);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4UI64ARBPROC) (GLuint program, GLint location, GLuint64 x, GLuint64 y, GLuint64 z, GLuint64 w);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM1UI64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM2UI64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM3UI64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+typedef void (APIENTRYP PFNGLPROGRAMUNIFORM4UI64VARBPROC) (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glUniform1i64ARB (GLint location, GLint64 x);
+GLAPI void APIENTRY glUniform2i64ARB (GLint location, GLint64 x, GLint64 y);
+GLAPI void APIENTRY glUniform3i64ARB (GLint location, GLint64 x, GLint64 y, GLint64 z);
+GLAPI void APIENTRY glUniform4i64ARB (GLint location, GLint64 x, GLint64 y, GLint64 z, GLint64 w);
+GLAPI void APIENTRY glUniform1i64vARB (GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glUniform2i64vARB (GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glUniform3i64vARB (GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glUniform4i64vARB (GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glUniform1ui64ARB (GLint location, GLuint64 x);
+GLAPI void APIENTRY glUniform2ui64ARB (GLint location, GLuint64 x, GLuint64 y);
+GLAPI void APIENTRY glUniform3ui64ARB (GLint location, GLuint64 x, GLuint64 y, GLuint64 z);
+GLAPI void APIENTRY glUniform4ui64ARB (GLint location, GLuint64 x, GLuint64 y, GLuint64 z, GLuint64 w);
+GLAPI void APIENTRY glUniform1ui64vARB (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glUniform2ui64vARB (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glUniform3ui64vARB (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glUniform4ui64vARB (GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glGetUniformi64vARB (GLuint program, GLint location, GLint64 *params);
+GLAPI void APIENTRY glGetUniformui64vARB (GLuint program, GLint location, GLuint64 *params);
+GLAPI void APIENTRY glGetnUniformi64vARB (GLuint program, GLint location, GLsizei bufSize, GLint64 *params);
+GLAPI void APIENTRY glGetnUniformui64vARB (GLuint program, GLint location, GLsizei bufSize, GLuint64 *params);
+GLAPI void APIENTRY glProgramUniform1i64ARB (GLuint program, GLint location, GLint64 x);
+GLAPI void APIENTRY glProgramUniform2i64ARB (GLuint program, GLint location, GLint64 x, GLint64 y);
+GLAPI void APIENTRY glProgramUniform3i64ARB (GLuint program, GLint location, GLint64 x, GLint64 y, GLint64 z);
+GLAPI void APIENTRY glProgramUniform4i64ARB (GLuint program, GLint location, GLint64 x, GLint64 y, GLint64 z, GLint64 w);
+GLAPI void APIENTRY glProgramUniform1i64vARB (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glProgramUniform2i64vARB (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glProgramUniform3i64vARB (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glProgramUniform4i64vARB (GLuint program, GLint location, GLsizei count, const GLint64 *value);
+GLAPI void APIENTRY glProgramUniform1ui64ARB (GLuint program, GLint location, GLuint64 x);
+GLAPI void APIENTRY glProgramUniform2ui64ARB (GLuint program, GLint location, GLuint64 x, GLuint64 y);
+GLAPI void APIENTRY glProgramUniform3ui64ARB (GLuint program, GLint location, GLuint64 x, GLuint64 y, GLuint64 z);
+GLAPI void APIENTRY glProgramUniform4ui64ARB (GLuint program, GLint location, GLuint64 x, GLuint64 y, GLuint64 z, GLuint64 w);
+GLAPI void APIENTRY glProgramUniform1ui64vARB (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glProgramUniform2ui64vARB (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glProgramUniform3ui64vARB (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+GLAPI void APIENTRY glProgramUniform4ui64vARB (GLuint program, GLint location, GLsizei count, const GLuint64 *value);
+#endif
+#endif /* GL_ARB_gpu_shader_int64 */
+
 #ifndef GL_ARB_half_float_pixel
 #define GL_ARB_half_float_pixel 1
 typedef unsigned short GLhalfARB;
@@ -3711,6 +3827,16 @@ GLAPI void APIENTRY glGetQueryObjectuivARB (GLuint id, GLenum pname, GLuint *par
 #define GL_ARB_occlusion_query2 1
 #endif /* GL_ARB_occlusion_query2 */
 
+#ifndef GL_ARB_parallel_shader_compile
+#define GL_ARB_parallel_shader_compile 1
+#define GL_MAX_SHADER_COMPILER_THREADS_ARB 0x91B0
+#define GL_COMPLETION_STATUS_ARB          0x91B1
+typedef void (APIENTRYP PFNGLMAXSHADERCOMPILERTHREADSARBPROC) (GLuint count);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glMaxShaderCompilerThreadsARB (GLuint count);
+#endif
+#endif /* GL_ARB_parallel_shader_compile */
+
 #ifndef GL_ARB_pipeline_statistics_query
 #define GL_ARB_pipeline_statistics_query 1
 #define GL_VERTICES_SUBMITTED_ARB         0x82EE
@@ -3753,6 +3879,10 @@ GLAPI void APIENTRY glPointParameterfvARB (GLenum pname, const GLfloat *params);
 #define GL_COORD_REPLACE_ARB              0x8862
 #endif /* GL_ARB_point_sprite */
 
+#ifndef GL_ARB_post_depth_coverage
+#define GL_ARB_post_depth_coverage 1
+#endif /* GL_ARB_post_depth_coverage */
+
 #ifndef GL_ARB_program_interface_query
 #define GL_ARB_program_interface_query 1
 #endif /* GL_ARB_program_interface_query */
@@ -3826,6 +3956,26 @@ GLAPI void APIENTRY glGetnMinmaxARB (GLenum target, GLboolean reset, GLenum form
 #define GL_ARB_robustness_isolation 1
 #endif /* GL_ARB_robustness_isolation */
 
+#ifndef GL_ARB_sample_locations
+#define GL_ARB_sample_locations 1
+#define GL_SAMPLE_LOCATION_SUBPIXEL_BITS_ARB 0x933D
+#define GL_SAMPLE_LOCATION_PIXEL_GRID_WIDTH_ARB 0x933E
+#define GL_SAMPLE_LOCATION_PIXEL_GRID_HEIGHT_ARB 0x933F
+#define GL_PROGRAMMABLE_SAMPLE_LOCATION_TABLE_SIZE_ARB 0x9340
+#define GL_SAMPLE_LOCATION_ARB            0x8E50
+#define GL_PROGRAMMABLE_SAMPLE_LOCATION_ARB 0x9341
+#define GL_FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_ARB 0x9342
+#define GL_FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_ARB 0x9343
+typedef void (APIENTRYP PFNGLFRAMEBUFFERSAMPLELOCATIONSFVARBPROC) (GLenum target, GLuint start, GLsizei count, const GLfloat *v);
+typedef void (APIENTRYP PFNGLNAMEDFRAMEBUFFERSAMPLELOCATIONSFVARBPROC) (GLuint framebuffer, GLuint start, GLsizei count, const GLfloat *v);
+typedef void (APIENTRYP PFNGLEVALUATEDEPTHVALUESARBPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glFramebufferSampleLocationsfvARB (GLenum target, GLuint start, GLsizei count, const GLfloat *v);
+GLAPI void APIENTRY glNamedFramebufferSampleLocationsfvARB (GLuint framebuffer, GLuint start, GLsizei count, const GLfloat *v);
+GLAPI void APIENTRY glEvaluateDepthValuesARB (void);
+#endif
+#endif /* GL_ARB_sample_locations */
+
 #ifndef GL_ARB_sample_shading
 #define GL_ARB_sample_shading 1
 #define GL_SAMPLE_SHADING_ARB             0x8C36
@@ -3852,14 +4002,26 @@ GLAPI void APIENTRY glMinSampleShadingARB (GLfloat value);
 #define GL_ARB_separate_shader_objects 1
 #endif /* GL_ARB_separate_shader_objects */
 
+#ifndef GL_ARB_shader_atomic_counter_ops
+#define GL_ARB_shader_atomic_counter_ops 1
+#endif /* GL_ARB_shader_atomic_counter_ops */
+
 #ifndef GL_ARB_shader_atomic_counters
 #define GL_ARB_shader_atomic_counters 1
 #endif /* GL_ARB_shader_atomic_counters */
 
+#ifndef GL_ARB_shader_ballot
+#define GL_ARB_shader_ballot 1
+#endif /* GL_ARB_shader_ballot */
+
 #ifndef GL_ARB_shader_bit_encoding
 #define GL_ARB_shader_bit_encoding 1
 #endif /* GL_ARB_shader_bit_encoding */
 
+#ifndef GL_ARB_shader_clock
+#define GL_ARB_shader_clock 1
+#endif /* GL_ARB_shader_clock */
+
 #ifndef GL_ARB_shader_draw_parameters
 #define GL_ARB_shader_draw_parameters 1
 #endif /* GL_ARB_shader_draw_parameters */
@@ -4029,6 +4191,10 @@ GLAPI void APIENTRY glGetShaderSourceARB (GLhandleARB obj, GLsizei maxLength, GL
 #define GL_ARB_shader_texture_lod 1
 #endif /* GL_ARB_shader_texture_lod */
 
+#ifndef GL_ARB_shader_viewport_layer_array
+#define GL_ARB_shader_viewport_layer_array 1
+#endif /* GL_ARB_shader_viewport_layer_array */
+
 #ifndef GL_ARB_shading_language_100
 #define GL_ARB_shading_language_100 1
 #define GL_SHADING_LANGUAGE_VERSION_ARB   0x8B8C
@@ -4102,12 +4268,20 @@ GLAPI void APIENTRY glNamedBufferPageCommitmentARB (GLuint buffer, GLintptr offs
 #define GL_MAX_SPARSE_3D_TEXTURE_SIZE_ARB 0x9199
 #define GL_MAX_SPARSE_ARRAY_TEXTURE_LAYERS_ARB 0x919A
 #define GL_SPARSE_TEXTURE_FULL_ARRAY_CUBE_MIPMAPS_ARB 0x91A9
-typedef void (APIENTRYP PFNGLTEXPAGECOMMITMENTARBPROC) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean resident);
+typedef void (APIENTRYP PFNGLTEXPAGECOMMITMENTARBPROC) (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean commit);
 #ifdef GL_GLEXT_PROTOTYPES
-GLAPI void APIENTRY glTexPageCommitmentARB (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean resident);
+GLAPI void APIENTRY glTexPageCommitmentARB (GLenum target, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean commit);
 #endif
 #endif /* GL_ARB_sparse_texture */
 
+#ifndef GL_ARB_sparse_texture2
+#define GL_ARB_sparse_texture2 1
+#endif /* GL_ARB_sparse_texture2 */
+
+#ifndef GL_ARB_sparse_texture_clamp
+#define GL_ARB_sparse_texture_clamp 1
+#endif /* GL_ARB_sparse_texture_clamp */
+
 #ifndef GL_ARB_stencil_texturing
 #define GL_ARB_stencil_texturing 1
 #endif /* GL_ARB_stencil_texturing */
@@ -4260,6 +4434,12 @@ GLAPI void APIENTRY glGetCompressedTexImageARB (GLenum target, GLint level, void
 #define GL_DOT3_RGBA_ARB                  0x86AF
 #endif /* GL_ARB_texture_env_dot3 */
 
+#ifndef GL_ARB_texture_filter_minmax
+#define GL_ARB_texture_filter_minmax 1
+#define GL_TEXTURE_REDUCTION_MODE_ARB     0x9366
+#define GL_WEIGHTED_AVERAGE_ARB           0x9367
+#endif /* GL_ARB_texture_filter_minmax */
+
 #ifndef GL_ARB_texture_float
 #define GL_ARB_texture_float 1
 #define GL_TEXTURE_RED_TYPE_ARB           0x8C10
@@ -4754,6 +4934,11 @@ GLAPI void APIENTRY glBlendBarrierKHR (void);
 #define GL_KHR_debug 1
 #endif /* GL_KHR_debug */
 
+#ifndef GL_KHR_no_error
+#define GL_KHR_no_error 1
+#define GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR  0x00000008
+#endif /* GL_KHR_no_error */
+
 #ifndef GL_KHR_robust_buffer_access_behavior
 #define GL_KHR_robust_buffer_access_behavior 1
 #endif /* GL_KHR_robust_buffer_access_behavior */
@@ -4896,7 +5081,6 @@ typedef void (APIENTRYP PFNGLPOINTPARAMETERXVOESPROC) (GLenum pname, const GLfix
 typedef void (APIENTRYP PFNGLPOINTSIZEXOESPROC) (GLfixed size);
 typedef void (APIENTRYP PFNGLPOLYGONOFFSETXOESPROC) (GLfixed factor, GLfixed units);
 typedef void (APIENTRYP PFNGLROTATEXOESPROC) (GLfixed angle, GLfixed x, GLfixed y, GLfixed z);
-typedef void (APIENTRYP PFNGLSAMPLECOVERAGEOESPROC) (GLfixed value, GLboolean invert);
 typedef void (APIENTRYP PFNGLSCALEXOESPROC) (GLfixed x, GLfixed y, GLfixed z);
 typedef void (APIENTRYP PFNGLTEXENVXOESPROC) (GLenum target, GLenum pname, GLfixed param);
 typedef void (APIENTRYP PFNGLTEXENVXVOESPROC) (GLenum target, GLenum pname, const GLfixed *params);
@@ -5001,7 +5185,6 @@ GLAPI void APIENTRY glPointParameterxvOES (GLenum pname, const GLfixed *params);
 GLAPI void APIENTRY glPointSizexOES (GLfixed size);
 GLAPI void APIENTRY glPolygonOffsetxOES (GLfixed factor, GLfixed units);
 GLAPI void APIENTRY glRotatexOES (GLfixed angle, GLfixed x, GLfixed y, GLfixed z);
-GLAPI void APIENTRY glSampleCoverageOES (GLfixed value, GLboolean invert);
 GLAPI void APIENTRY glScalexOES (GLfixed x, GLfixed y, GLfixed z);
 GLAPI void APIENTRY glTexEnvxOES (GLenum target, GLenum pname, GLfixed param);
 GLAPI void APIENTRY glTexEnvxvOES (GLenum target, GLenum pname, const GLfixed *params);
@@ -6715,7 +6898,7 @@ typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBLFORMATEXTPROC) (GLuint vaob
 typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBBINDINGEXTPROC) (GLuint vaobj, GLuint attribindex, GLuint bindingindex);
 typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXBINDINGDIVISOREXTPROC) (GLuint vaobj, GLuint bindingindex, GLuint divisor);
 typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBLOFFSETEXTPROC) (GLuint vaobj, GLuint buffer, GLuint index, GLint size, GLenum type, GLsizei stride, GLintptr offset);
-typedef void (APIENTRYP PFNGLTEXTUREPAGECOMMITMENTEXTPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean resident);
+typedef void (APIENTRYP PFNGLTEXTUREPAGECOMMITMENTEXTPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean commit);
 typedef void (APIENTRYP PFNGLVERTEXARRAYVERTEXATTRIBDIVISOREXTPROC) (GLuint vaobj, GLuint index, GLuint divisor);
 #ifdef GL_GLEXT_PROTOTYPES
 GLAPI void APIENTRY glMatrixLoadfEXT (GLenum mode, const GLfloat *m);
@@ -6971,7 +7154,7 @@ GLAPI void APIENTRY glVertexArrayVertexAttribLFormatEXT (GLuint vaobj, GLuint at
 GLAPI void APIENTRY glVertexArrayVertexAttribBindingEXT (GLuint vaobj, GLuint attribindex, GLuint bindingindex);
 GLAPI void APIENTRY glVertexArrayVertexBindingDivisorEXT (GLuint vaobj, GLuint bindingindex, GLuint divisor);
 GLAPI void APIENTRY glVertexArrayVertexAttribLOffsetEXT (GLuint vaobj, GLuint buffer, GLuint index, GLint size, GLenum type, GLsizei stride, GLintptr offset);
-GLAPI void APIENTRY glTexturePageCommitmentEXT (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean resident);
+GLAPI void APIENTRY glTexturePageCommitmentEXT (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLboolean commit);
 GLAPI void APIENTRY glVertexArrayVertexAttribDivisorEXT (GLuint vaobj, GLuint index, GLuint divisor);
 #endif
 #endif /* GL_EXT_direct_state_access */
@@ -8635,6 +8818,14 @@ GLAPI void APIENTRY glBlendFuncSeparateINGR (GLenum sfactorRGB, GLenum dfactorRG
 #define GL_INTEL_fragment_shader_ordering 1
 #endif /* GL_INTEL_fragment_shader_ordering */
 
+#ifndef GL_INTEL_framebuffer_CMAA
+#define GL_INTEL_framebuffer_CMAA 1
+typedef void (APIENTRYP PFNGLAPPLYFRAMEBUFFERATTACHMENTCMAAINTELPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glApplyFramebufferAttachmentCMAAINTEL (void);
+#endif
+#endif /* GL_INTEL_framebuffer_CMAA */
+
 #ifndef GL_INTEL_map_texture
 #define GL_INTEL_map_texture 1
 #define GL_TEXTURE_MEMORY_LAYOUT_INTEL    0x83FF
@@ -8939,6 +9130,65 @@ GLAPI void APIENTRY glBlendBarrierNV (void);
 #define GL_NV_blend_square 1
 #endif /* GL_NV_blend_square */
 
+#ifndef GL_NV_command_list
+#define GL_NV_command_list 1
+#define GL_TERMINATE_SEQUENCE_COMMAND_NV  0x0000
+#define GL_NOP_COMMAND_NV                 0x0001
+#define GL_DRAW_ELEMENTS_COMMAND_NV       0x0002
+#define GL_DRAW_ARRAYS_COMMAND_NV         0x0003
+#define GL_DRAW_ELEMENTS_STRIP_COMMAND_NV 0x0004
+#define GL_DRAW_ARRAYS_STRIP_COMMAND_NV   0x0005
+#define GL_DRAW_ELEMENTS_INSTANCED_COMMAND_NV 0x0006
+#define GL_DRAW_ARRAYS_INSTANCED_COMMAND_NV 0x0007
+#define GL_ELEMENT_ADDRESS_COMMAND_NV     0x0008
+#define GL_ATTRIBUTE_ADDRESS_COMMAND_NV   0x0009
+#define GL_UNIFORM_ADDRESS_COMMAND_NV     0x000A
+#define GL_BLEND_COLOR_COMMAND_NV         0x000B
+#define GL_STENCIL_REF_COMMAND_NV         0x000C
+#define GL_LINE_WIDTH_COMMAND_NV          0x000D
+#define GL_POLYGON_OFFSET_COMMAND_NV      0x000E
+#define GL_ALPHA_REF_COMMAND_NV           0x000F
+#define GL_VIEWPORT_COMMAND_NV            0x0010
+#define GL_SCISSOR_COMMAND_NV             0x0011
+#define GL_FRONT_FACE_COMMAND_NV          0x0012
+typedef void (APIENTRYP PFNGLCREATESTATESNVPROC) (GLsizei n, GLuint *states);
+typedef void (APIENTRYP PFNGLDELETESTATESNVPROC) (GLsizei n, const GLuint *states);
+typedef GLboolean (APIENTRYP PFNGLISSTATENVPROC) (GLuint state);
+typedef void (APIENTRYP PFNGLSTATECAPTURENVPROC) (GLuint state, GLenum mode);
+typedef GLuint (APIENTRYP PFNGLGETCOMMANDHEADERNVPROC) (GLenum tokenID, GLuint size);
+typedef GLushort (APIENTRYP PFNGLGETSTAGEINDEXNVPROC) (GLenum shadertype);
+typedef void (APIENTRYP PFNGLDRAWCOMMANDSNVPROC) (GLenum primitiveMode, GLuint buffer, const GLintptr *indirects, const GLsizei *sizes, GLuint count);
+typedef void (APIENTRYP PFNGLDRAWCOMMANDSADDRESSNVPROC) (GLenum primitiveMode, const GLuint64 *indirects, const GLsizei *sizes, GLuint count);
+typedef void (APIENTRYP PFNGLDRAWCOMMANDSSTATESNVPROC) (GLuint buffer, const GLintptr *indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+typedef void (APIENTRYP PFNGLDRAWCOMMANDSSTATESADDRESSNVPROC) (const GLuint64 *indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+typedef void (APIENTRYP PFNGLCREATECOMMANDLISTSNVPROC) (GLsizei n, GLuint *lists);
+typedef void (APIENTRYP PFNGLDELETECOMMANDLISTSNVPROC) (GLsizei n, const GLuint *lists);
+typedef GLboolean (APIENTRYP PFNGLISCOMMANDLISTNVPROC) (GLuint list);
+typedef void (APIENTRYP PFNGLLISTDRAWCOMMANDSSTATESCLIENTNVPROC) (GLuint list, GLuint segment, const void **indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+typedef void (APIENTRYP PFNGLCOMMANDLISTSEGMENTSNVPROC) (GLuint list, GLuint segments);
+typedef void (APIENTRYP PFNGLCOMPILECOMMANDLISTNVPROC) (GLuint list);
+typedef void (APIENTRYP PFNGLCALLCOMMANDLISTNVPROC) (GLuint list);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glCreateStatesNV (GLsizei n, GLuint *states);
+GLAPI void APIENTRY glDeleteStatesNV (GLsizei n, const GLuint *states);
+GLAPI GLboolean APIENTRY glIsStateNV (GLuint state);
+GLAPI void APIENTRY glStateCaptureNV (GLuint state, GLenum mode);
+GLAPI GLuint APIENTRY glGetCommandHeaderNV (GLenum tokenID, GLuint size);
+GLAPI GLushort APIENTRY glGetStageIndexNV (GLenum shadertype);
+GLAPI void APIENTRY glDrawCommandsNV (GLenum primitiveMode, GLuint buffer, const GLintptr *indirects, const GLsizei *sizes, GLuint count);
+GLAPI void APIENTRY glDrawCommandsAddressNV (GLenum primitiveMode, const GLuint64 *indirects, const GLsizei *sizes, GLuint count);
+GLAPI void APIENTRY glDrawCommandsStatesNV (GLuint buffer, const GLintptr *indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+GLAPI void APIENTRY glDrawCommandsStatesAddressNV (const GLuint64 *indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+GLAPI void APIENTRY glCreateCommandListsNV (GLsizei n, GLuint *lists);
+GLAPI void APIENTRY glDeleteCommandListsNV (GLsizei n, const GLuint *lists);
+GLAPI GLboolean APIENTRY glIsCommandListNV (GLuint list);
+GLAPI void APIENTRY glListDrawCommandsStatesClientNV (GLuint list, GLuint segment, const void **indirects, const GLsizei *sizes, const GLuint *states, const GLuint *fbos, GLuint count);
+GLAPI void APIENTRY glCommandListSegmentsNV (GLuint list, GLuint segments);
+GLAPI void APIENTRY glCompileCommandListNV (GLuint list);
+GLAPI void APIENTRY glCallCommandListNV (GLuint list);
+#endif
+#endif /* GL_NV_command_list */
+
 #ifndef GL_NV_compute_program5
 #define GL_NV_compute_program5 1
 #define GL_COMPUTE_PROGRAM_NV             0x90FB
@@ -8971,6 +9221,17 @@ GLAPI void APIENTRY glSubpixelPrecisionBiasNV (GLuint xbits, GLuint ybits);
 #endif
 #endif /* GL_NV_conservative_raster */
 
+#ifndef GL_NV_conservative_raster_dilate
+#define GL_NV_conservative_raster_dilate 1
+#define GL_CONSERVATIVE_RASTER_DILATE_NV  0x9379
+#define GL_CONSERVATIVE_RASTER_DILATE_RANGE_NV 0x937A
+#define GL_CONSERVATIVE_RASTER_DILATE_GRANULARITY_NV 0x937B
+typedef void (APIENTRYP PFNGLCONSERVATIVERASTERPARAMETERFNVPROC) (GLenum pname, GLfloat value);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glConservativeRasterParameterfNV (GLenum pname, GLfloat value);
+#endif
+#endif /* GL_NV_conservative_raster_dilate */
+
 #ifndef GL_NV_copy_depth_to_color
 #define GL_NV_copy_depth_to_color 1
 #define GL_DEPTH_STENCIL_TO_RGBA_NV       0x886E
@@ -10850,6 +11111,21 @@ GLAPI void APIENTRY glVideoCaptureStreamParameterdvNV (GLuint video_capture_slot
 #define GL_FORMAT_SUBSAMPLE_244_244_OML   0x8983
 #endif /* GL_OML_subsample */
 
+#ifndef GL_OVR_multiview
+#define GL_OVR_multiview 1
+#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_NUM_VIEWS_OVR 0x9630
+#define GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_BASE_VIEW_INDEX_OVR 0x9632
+#define GL_MAX_VIEWS_OVR                  0x9631
+typedef void (APIENTRYP PFNGLFRAMEBUFFERTEXTUREMULTIVIEWOVRPROC) (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glFramebufferTextureMultiviewOVR (GLenum target, GLenum attachment, GLuint texture, GLint level, GLint baseViewIndex, GLsizei numViews);
+#endif
+#endif /* GL_OVR_multiview */
+
+#ifndef GL_OVR_multiview2
+#define GL_OVR_multiview2 1
+#endif /* GL_OVR_multiview2 */
+
 #ifndef GL_PGI_misc_hints
 #define GL_PGI_misc_hints 1
 #define GL_PREFER_DOUBLEBUFFER_HINT_PGI   0x1A1F8
diff --git a/scons/gallium.py b/scons/gallium.py
index 51b84d7652b..46dbf0ebd0e 100755
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -300,6 +300,7 @@ def generate(env):
 
     # C preprocessor options
     cppdefines = []
+    cppdefines += ['__STDC_LIMIT_MACROS']
     if env['build'] in ('debug', 'checked'):
         cppdefines += ['DEBUG']
     else:
diff --git a/src/egl/SConscript b/src/egl/SConscript
index 1b2a4271ef7..f8102db6f3a 100644
--- a/src/egl/SConscript
+++ b/src/egl/SConscript
@@ -15,7 +15,6 @@ env.Append(CPPPATH = [
 
 # parse Makefile.sources
 egl_sources = env.ParseSourceList('Makefile.sources', 'LIBEGL_C_FILES')
-egl_sources.append(env.ParseSourceList('Makefile.sources', 'dri2_backend_core_FILES'))
 
 env.Append(CPPDEFINES = [
     '_EGL_NATIVE_PLATFORM=_EGL_PLATFORM_HAIKU',
diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index a439a3be6b6..eda50875e02 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -68,7 +68,7 @@ release_buffer(struct gbm_surface *_surf, struct gbm_bo *bo)
 {
    struct gbm_dri_surface *surf = (struct gbm_dri_surface *) _surf;
    struct dri2_egl_surface *dri2_surf = surf->dri_private;
-   int i;
+   unsigned i;
 
    for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
       if (dri2_surf->color_buffers[i].bo == bo) {
@@ -82,7 +82,7 @@ has_free_buffers(struct gbm_surface *_surf)
 {
    struct gbm_dri_surface *surf = (struct gbm_dri_surface *) _surf;
    struct dri2_egl_surface *dri2_surf = surf->dri_private;
-   int i;
+   unsigned i;
 
    for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++)
       if (!dri2_surf->color_buffers[i].locked)
@@ -189,7 +189,7 @@ dri2_drm_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    struct dri2_egl_surface *dri2_surf = dri2_egl_surface(surf);
-   int i;
+   unsigned i;
 
    if (!_eglPutSurface(surf))
       return EGL_TRUE;
@@ -218,7 +218,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf)
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
    struct gbm_dri_surface *surf = dri2_surf->gbm_surf;
-   int i;
+   unsigned i;
 
    if (dri2_surf->back == NULL) {
       for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
@@ -414,7 +414,7 @@ dri2_drm_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    struct dri2_egl_surface *dri2_surf = dri2_egl_surface(draw);
-   int i;
+   unsigned i;
 
    if (dri2_dpy->swrast) {
       (*dri2_dpy->core->swapBuffers)(dri2_surf->dri_drawable);
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index dabaf1ebbd1..dbc64ba2d8a 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -1227,6 +1227,8 @@ dri2_wl_swrast_get_stride_for_format(int format, int w)
  * Taken from weston shared/os-compatibility.c
  */
 
+#ifndef HAVE_MKOSTEMP
+
 static int
 set_cloexec_or_close(int fd)
 {
@@ -1249,6 +1251,8 @@ err:
    return -1;
 }
 
+#endif
+
 /*
  * Taken from weston shared/os-compatibility.c
  */
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 93dfb803389..278d5e9bf5b 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -24,6 +24,7 @@
 
 #include "util/ralloc.h"
 #include "glsl/nir/nir.h"
+#include "glsl/nir/nir_control_flow.h"
 #include "glsl/nir/nir_builder.h"
 #include "glsl/list.h"
 #include "glsl/shader_enums.h"
@@ -307,7 +308,7 @@ ttn_emit_immediate(struct ttn_compile *c)
    for (i = 0; i < 4; i++)
       load_const->value.u[i] = tgsi_imm->u[i].Uint;
 
-   nir_instr_insert_after_cf_list(b->cf_node_list, &load_const->instr);
+   nir_builder_instr_insert(b, &load_const->instr);
 }
 
 static nir_src
@@ -363,7 +364,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
          load->variables[0] = ttn_array_deref(c, load, var, offset, indirect);
 
          nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
-         nir_instr_insert_after_cf_list(b->cf_node_list, &load->instr);
+         nir_builder_instr_insert(b, &load->instr);
 
          src = nir_src_for_ssa(&load->dest.ssa);
 
@@ -414,7 +415,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
       load->num_components = ncomp;
 
       nir_ssa_dest_init(&load->instr, &load->dest, ncomp, NULL);
-      nir_instr_insert_after_cf_list(b->cf_node_list, &load->instr);
+      nir_builder_instr_insert(b, &load->instr);
 
       src = nir_src_for_ssa(&load->dest.ssa);
       break;
@@ -476,7 +477,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
          srcn++;
       }
       nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
-      nir_instr_insert_after_cf_list(b->cf_node_list, &load->instr);
+      nir_builder_instr_insert(b, &load->instr);
 
       src = nir_src_for_ssa(&load->dest.ssa);
       break;
@@ -552,7 +553,7 @@ ttn_get_dest(struct ttn_compile *c, struct tgsi_full_dst_register *tgsi_fdst)
 
          load->dest = nir_dest_for_reg(reg);
 
-         nir_instr_insert_after_cf_list(b->cf_node_list, &load->instr);
+         nir_builder_instr_insert(b, &load->instr);
       } else {
          assert(!tgsi_dst->Indirect);
          dest.dest.reg.reg = c->temp_regs[index].reg;
@@ -667,7 +668,7 @@ ttn_alu(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
       instr->src[i].src = nir_src_for_ssa(src[i]);
 
    instr->dest = dest;
-   nir_instr_insert_after_cf_list(b->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(b, &instr->instr);
 }
 
 static void
@@ -683,7 +684,7 @@ ttn_move_dest_masked(nir_builder *b, nir_alu_dest dest,
    mov->src[0].src = nir_src_for_ssa(def);
    for (unsigned i = def->num_components; i < 4; i++)
       mov->src[0].swizzle[i] = def->num_components - 1;
-   nir_instr_insert_after_cf_list(b->cf_node_list, &mov->instr);
+   nir_builder_instr_insert(b, &mov->instr);
 }
 
 static void
@@ -902,7 +903,7 @@ ttn_kill(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
 {
    nir_intrinsic_instr *discard =
       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard);
-   nir_instr_insert_after_cf_list(b->cf_node_list, &discard->instr);
+   nir_builder_instr_insert(b, &discard->instr);
 }
 
 static void
@@ -912,7 +913,7 @@ ttn_kill_if(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
    nir_intrinsic_instr *discard =
       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
    discard->src[0] = nir_src_for_ssa(cmp);
-   nir_instr_insert_after_cf_list(b->cf_node_list, &discard->instr);
+   nir_builder_instr_insert(b, &discard->instr);
 }
 
 static void
@@ -976,14 +977,14 @@ static void
 ttn_cont(nir_builder *b)
 {
    nir_jump_instr *instr = nir_jump_instr_create(b->shader, nir_jump_continue);
-   nir_instr_insert_after_cf_list(b->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(b, &instr->instr);
 }
 
 static void
 ttn_brk(nir_builder *b)
 {
    nir_jump_instr *instr = nir_jump_instr_create(b->shader, nir_jump_break);
-   nir_instr_insert_after_cf_list(b->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(b, &instr->instr);
 }
 
 static void
@@ -1279,7 +1280,7 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
    assert(src_number == num_srcs);
 
    nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL);
-   nir_instr_insert_after_cf_list(b->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(b, &instr->instr);
 
    /* Resolve the writemask on the texture op. */
    ttn_move_dest(b, dest, &instr->dest.ssa);
@@ -1318,10 +1319,10 @@ ttn_txq(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
    txs->src[0].src_type = nir_tex_src_lod;
 
    nir_ssa_dest_init(&txs->instr, &txs->dest, 3, NULL);
-   nir_instr_insert_after_cf_list(b->cf_node_list, &txs->instr);
+   nir_builder_instr_insert(b, &txs->instr);
 
    nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, NULL);
-   nir_instr_insert_after_cf_list(b->cf_node_list, &qlv->instr);
+   nir_builder_instr_insert(b, &qlv->instr);
 
    ttn_move_dest_masked(b, dest, &txs->dest.ssa, TGSI_WRITEMASK_XYZ);
    ttn_move_dest_masked(b, dest, &qlv->dest.ssa, TGSI_WRITEMASK_W);
@@ -1730,7 +1731,7 @@ ttn_emit_instruction(struct ttn_compile *c)
       store->variables[0] = ttn_array_deref(c, store, var, offset, indirect);
       store->src[0] = nir_src_for_reg(dest.dest.reg.reg);
 
-      nir_instr_insert_after_cf_list(b->cf_node_list, &store->instr);
+      nir_builder_instr_insert(b, &store->instr);
    }
 }
 
@@ -1759,11 +1760,26 @@ ttn_add_output_stores(struct ttn_compile *c)
          store->const_index[0] = loc;
          store->src[0].reg.reg = c->output_regs[loc].reg;
          store->src[0].reg.base_offset = c->output_regs[loc].offset;
-         nir_instr_insert_after_cf_list(b->cf_node_list, &store->instr);
+         nir_builder_instr_insert(b, &store->instr);
       }
    }
 }
 
+static gl_shader_stage
+tgsi_processor_to_shader_stage(unsigned processor)
+{
+   switch (processor) {
+   case TGSI_PROCESSOR_FRAGMENT:  return MESA_SHADER_FRAGMENT;
+   case TGSI_PROCESSOR_VERTEX:    return MESA_SHADER_VERTEX;
+   case TGSI_PROCESSOR_GEOMETRY:  return MESA_SHADER_GEOMETRY;
+   case TGSI_PROCESSOR_TESS_CTRL: return MESA_SHADER_TESS_CTRL;
+   case TGSI_PROCESSOR_TESS_EVAL: return MESA_SHADER_TESS_EVAL;
+   case TGSI_PROCESSOR_COMPUTE:   return MESA_SHADER_COMPUTE;
+   default:
+      unreachable("invalid TGSI processor");
+   };
+}
+
 struct nir_shader *
 tgsi_to_nir(const void *tgsi_tokens,
             const nir_shader_compiler_options *options)
@@ -1775,7 +1791,12 @@ tgsi_to_nir(const void *tgsi_tokens,
    int ret;
 
    c = rzalloc(NULL, struct ttn_compile);
-   s = nir_shader_create(NULL, options);
+
+   tgsi_scan_shader(tgsi_tokens, &scan);
+   c->scan = &scan;
+
+   s = nir_shader_create(NULL, tgsi_processor_to_shader_stage(scan.processor),
+                         options);
 
    nir_function *func = nir_function_create(s, "main");
    nir_function_overload *overload = nir_function_overload_create(func);
@@ -1784,9 +1805,6 @@ tgsi_to_nir(const void *tgsi_tokens,
    nir_builder_init(&c->build, impl);
    nir_builder_insert_after_cf_list(&c->build, &impl->body);
 
-   tgsi_scan_shader(tgsi_tokens, &scan);
-   c->scan = &scan;
-
    s->num_inputs = scan.file_max[TGSI_FILE_INPUT] + 1;
    s->num_uniforms = scan.const_file_max[0] + 1;
    s->num_outputs = scan.file_max[TGSI_FILE_OUTPUT] + 1;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index a6675c5168d..3e3ed5b19d1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -259,7 +259,7 @@ struct translate_ctx
    struct tgsi_token *tokens_end;
    struct tgsi_header *header;
    unsigned processor : 4;
-   int implied_array_size : 5;
+   unsigned implied_array_size : 6;
    unsigned num_immediates;
 };
 
@@ -675,6 +675,9 @@ parse_register_dcl(
    eat_opt_white( &cur );
 
    if (cur[0] == '[') {
+      bool is_in = *file == TGSI_FILE_INPUT;
+      bool is_out = *file == TGSI_FILE_OUTPUT;
+
       ++cur;
       ctx->cur = cur;
       if (!parse_register_dcl_bracket( ctx, &brackets[1] ))
@@ -684,7 +687,11 @@ parse_register_dcl(
        * input primitive. so we want to declare just
        * the index relevant to the semantics which is in
        * the second bracket */
-      if (ctx->processor == TGSI_PROCESSOR_GEOMETRY && *file == TGSI_FILE_INPUT) {
+
+      /* tessellation has similar constraints to geometry shader */
+      if ((ctx->processor == TGSI_PROCESSOR_GEOMETRY && is_in) ||
+          (ctx->processor == TGSI_PROCESSOR_TESS_EVAL && is_in) ||
+          (ctx->processor == TGSI_PROCESSOR_TESS_CTRL && (is_in || is_out))) {
          brackets[0] = brackets[1];
          *num_brackets = 1;
       } else {
@@ -740,6 +747,14 @@ parse_dst_operand(
       dst->Dimension.Indirect = 0;
       dst->Dimension.Dimension = 0;
       dst->Dimension.Index = bracket[0].index;
+
+      if (bracket[0].ind_file != TGSI_FILE_NULL) {
+         dst->Dimension.Indirect = 1;
+         dst->DimIndirect.File = bracket[0].ind_file;
+         dst->DimIndirect.Index = bracket[0].ind_index;
+         dst->DimIndirect.Swizzle = bracket[0].ind_comp;
+         dst->DimIndirect.ArrayID = bracket[0].ind_array;
+      }
       bracket[0] = bracket[1];
    }
    dst->Register.Index = bracket[0].index;
@@ -1623,6 +1638,10 @@ static boolean translate( struct translate_ctx *ctx )
    if (!parse_header( ctx ))
       return FALSE;
 
+   if (ctx->processor == TGSI_PROCESSOR_TESS_CTRL ||
+       ctx->processor == TGSI_PROCESSOR_TESS_EVAL)
+       ctx->implied_array_size = 32;
+
    while (*ctx->cur != '\0') {
       uint label_val = 0;
       if (!eat_white( &ctx->cur )) {
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 85206eab1a7..9bba07aa18e 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -104,7 +104,7 @@ struct blitter_context_priv
    void *fs_resolve_uint[PIPE_MAX_TEXTURE_TYPES][NUM_RESOLVE_FRAG_SHADERS][2];
 
    /* Blend state. */
-   void *blend[PIPE_MASK_RGBA+1]; /**< blend state with writemask */
+   void *blend[PIPE_MASK_RGBA+1][2]; /**< blend state with writemask */
    void *blend_clear[GET_CLEAR_BLEND_STATE_IDX(PIPE_CLEAR_COLOR)+1];
 
    /* Depth stencil alpha state. */
@@ -159,7 +159,7 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    struct pipe_rasterizer_state rs_state;
    struct pipe_sampler_state sampler_state;
    struct pipe_vertex_element velem[2];
-   unsigned i;
+   unsigned i, j;
 
    ctx = CALLOC_STRUCT(blitter_context_priv);
    if (!ctx)
@@ -208,8 +208,20 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    memset(&blend, 0, sizeof(blend));
 
    for (i = 0; i <= PIPE_MASK_RGBA; i++) {
-      blend.rt[0].colormask = i;
-      ctx->blend[i] = pipe->create_blend_state(pipe, &blend);
+      for (j = 0; j < 2; j++) {
+         memset(&blend.rt[0], 0, sizeof(blend.rt[0]));
+         blend.rt[0].colormask = i;
+         if (j) {
+            blend.rt[0].blend_enable = 1;
+            blend.rt[0].rgb_func = PIPE_BLEND_ADD;
+            blend.rt[0].rgb_src_factor = PIPE_BLENDFACTOR_SRC_ALPHA;
+            blend.rt[0].rgb_dst_factor = PIPE_BLENDFACTOR_INV_SRC_ALPHA;
+            blend.rt[0].alpha_func = PIPE_BLEND_ADD;
+            blend.rt[0].alpha_src_factor = PIPE_BLENDFACTOR_SRC_ALPHA;
+            blend.rt[0].alpha_dst_factor = PIPE_BLENDFACTOR_INV_SRC_ALPHA;
+         }
+         ctx->blend[i][j] = pipe->create_blend_state(pipe, &blend);
+      }
    }
 
    /* depth stencil alpha state objects */
@@ -409,9 +421,10 @@ void util_blitter_destroy(struct blitter_context *blitter)
    struct pipe_context *pipe = blitter->pipe;
    int i, j, f;
 
-   for (i = 0; i <= PIPE_MASK_RGBA; i++) {
-      pipe->delete_blend_state(pipe, ctx->blend[i]);
-   }
+   for (i = 0; i <= PIPE_MASK_RGBA; i++)
+      for (j = 0; j < 2; j++)
+         pipe->delete_blend_state(pipe, ctx->blend[i][j]);
+
    for (i = 0; i < Elements(ctx->blend_clear); i++) {
       if (ctx->blend_clear[i])
          pipe->delete_blend_state(pipe, ctx->blend_clear[i]);
@@ -1217,7 +1230,7 @@ static void *get_clear_blend_state(struct blitter_context_priv *ctx,
 
    /* Return an existing blend state. */
    if (!clear_buffers)
-      return ctx->blend[0];
+      return ctx->blend[0][0];
 
    index = GET_CLEAR_BLEND_STATE_IDX(clear_buffers);
 
@@ -1483,7 +1496,8 @@ void util_blitter_copy_texture(struct blitter_context *blitter,
    /* Copy. */
    util_blitter_blit_generic(blitter, dst_view, &dstbox,
                              src_view, srcbox, src->width0, src->height0,
-                             PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+                             PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+                             FALSE);
 
    pipe_surface_reference(&dst_view, NULL);
    pipe_sampler_view_reference(&src_view, NULL);
@@ -1496,7 +1510,8 @@ void util_blitter_blit_generic(struct blitter_context *blitter,
                                const struct pipe_box *srcbox,
                                unsigned src_width0, unsigned src_height0,
                                unsigned mask, unsigned filter,
-                               const struct pipe_scissor_state *scissor)
+                               const struct pipe_scissor_state *scissor,
+                               boolean alpha_blend)
 {
    struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter;
    struct pipe_context *pipe = ctx->base.pipe;
@@ -1550,7 +1565,7 @@ void util_blitter_blit_generic(struct blitter_context *blitter,
    fb_state.zsbuf = NULL;
 
    if (blit_depth || blit_stencil) {
-      pipe->bind_blend_state(pipe, ctx->blend[0]);
+      pipe->bind_blend_state(pipe, ctx->blend[0][0]);
 
       if (blit_depth && blit_stencil) {
          pipe->bind_depth_stencil_alpha_state(pipe,
@@ -1573,7 +1588,9 @@ void util_blitter_blit_generic(struct blitter_context *blitter,
       }
 
    } else {
-      pipe->bind_blend_state(pipe, ctx->blend[mask & PIPE_MASK_RGBA]);
+      unsigned colormask = mask & PIPE_MASK_RGBA;
+
+      pipe->bind_blend_state(pipe, ctx->blend[colormask][alpha_blend]);
       pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
       ctx->bind_fs_state(pipe,
             blitter_get_fs_texfetch_col(ctx, src->format, src_target,
@@ -1786,7 +1803,8 @@ util_blitter_blit(struct blitter_context *blitter,
    util_blitter_blit_generic(blitter, dst_view, &info->dst.box,
                              src_view, &info->src.box, src->width0, src->height0,
                              info->mask, info->filter,
-                             info->scissor_enable ? &info->scissor : NULL);
+                             info->scissor_enable ? &info->scissor : NULL,
+                             info->alpha_blend);
 
    pipe_surface_reference(&dst_view, NULL);
    pipe_sampler_view_reference(&src_view, NULL);
@@ -1815,7 +1833,7 @@ void util_blitter_clear_render_target(struct blitter_context *blitter,
    blitter_disable_render_cond(ctx);
 
    /* bind states */
-   pipe->bind_blend_state(pipe, ctx->blend[PIPE_MASK_RGBA]);
+   pipe->bind_blend_state(pipe, ctx->blend[PIPE_MASK_RGBA][0]);
    pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
    bind_fs_write_one_cbuf(ctx);
    pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
@@ -1867,7 +1885,7 @@ void util_blitter_clear_depth_stencil(struct blitter_context *blitter,
    blitter_disable_render_cond(ctx);
 
    /* bind states */
-   pipe->bind_blend_state(pipe, ctx->blend[0]);
+   pipe->bind_blend_state(pipe, ctx->blend[0][0]);
    if ((clear_flags & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL) {
       sr.ref_value[0] = stencil & 0xff;
       pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil);
@@ -1933,8 +1951,8 @@ void util_blitter_custom_depth_stencil(struct blitter_context *blitter,
    blitter_disable_render_cond(ctx);
 
    /* bind states */
-   pipe->bind_blend_state(pipe, cbsurf ? ctx->blend[PIPE_MASK_RGBA] :
-                                         ctx->blend[0]);
+   pipe->bind_blend_state(pipe, cbsurf ? ctx->blend[PIPE_MASK_RGBA][0] :
+                                         ctx->blend[0][0]);
    pipe->bind_depth_stencil_alpha_state(pipe, dsa_stage);
    if (cbsurf)
       bind_fs_write_one_cbuf(ctx);
@@ -2187,7 +2205,7 @@ void util_blitter_custom_color(struct blitter_context *blitter,
 
    /* bind states */
    pipe->bind_blend_state(pipe, custom_blend ? custom_blend
-                                             : ctx->blend[PIPE_MASK_RGBA]);
+                                             : ctx->blend[PIPE_MASK_RGBA][0]);
    pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil);
    bind_fs_write_one_cbuf(ctx);
    pipe->bind_vertex_elements_state(pipe, ctx->velem_state);
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index 0cd173d6284..becdb029f13 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -246,7 +246,8 @@ void util_blitter_blit_generic(struct blitter_context *blitter,
                                const struct pipe_box *srcbox,
                                unsigned src_width0, unsigned src_height0,
                                unsigned mask, unsigned filter,
-                               const struct pipe_scissor_state *scissor);
+                               const struct pipe_scissor_state *scissor,
+                               boolean alpha_blend);
 
 void util_blitter_blit(struct blitter_context *blitter,
 		       const struct pipe_blit_info *info);
diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c
index 654b5bbc552..70ed911b7f1 100644
--- a/src/gallium/auxiliary/util/u_surface.c
+++ b/src/gallium/auxiliary/util/u_surface.c
@@ -676,6 +676,9 @@ util_try_blit_via_copy_region(struct pipe_context *ctx,
       return FALSE;
    }
 
+   if (blit->alpha_blend)
+      return FALSE;
+
    ctx->resource_copy_region(ctx, blit->dst.resource, blit->dst.level,
                              blit->dst.box.x, blit->dst.box.y, blit->dst.box.z,
                              blit->src.resource, blit->src.level,
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index c4516baf2ec..dd489568a77 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index 8e8cf6a03f2..441bfec5756 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
@@ -326,6 +326,13 @@ enum a3xx_tex_type {
 	A3XX_TEX_3D = 3,
 };
 
+enum a3xx_tex_msaa {
+	A3XX_TPL1_MSAA1X = 0,
+	A3XX_TPL1_MSAA2X = 1,
+	A3XX_TPL1_MSAA4X = 2,
+	A3XX_TPL1_MSAA8X = 3,
+};
+
 #define A3XX_INT0_RBBM_GPU_IDLE					0x00000001
 #define A3XX_INT0_RBBM_AHB_ERROR				0x00000002
 #define A3XX_INT0_RBBM_REG_TIMEOUT				0x00000004
@@ -2652,6 +2659,7 @@ static inline uint32_t A3XX_VGT_DRAW_INITIATOR_NUM_INSTANCES(uint32_t val)
 #define REG_A3XX_VGT_IMMED_DATA					0x000021fd
 
 #define REG_A3XX_TEX_SAMP_0					0x00000000
+#define A3XX_TEX_SAMP_0_CLAMPENABLE				0x00000001
 #define A3XX_TEX_SAMP_0_MIPFILTER_LINEAR			0x00000002
 #define A3XX_TEX_SAMP_0_XY_MAG__MASK				0x0000000c
 #define A3XX_TEX_SAMP_0_XY_MAG__SHIFT				2
@@ -2695,6 +2703,7 @@ static inline uint32_t A3XX_TEX_SAMP_0_COMPARE_FUNC(enum adreno_compare_func val
 {
 	return ((val) << A3XX_TEX_SAMP_0_COMPARE_FUNC__SHIFT) & A3XX_TEX_SAMP_0_COMPARE_FUNC__MASK;
 }
+#define A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF			0x01000000
 #define A3XX_TEX_SAMP_0_UNNORM_COORDS				0x80000000
 
 #define REG_A3XX_TEX_SAMP_1					0x00000001
@@ -2750,6 +2759,12 @@ static inline uint32_t A3XX_TEX_CONST_0_MIPLVLS(uint32_t val)
 {
 	return ((val) << A3XX_TEX_CONST_0_MIPLVLS__SHIFT) & A3XX_TEX_CONST_0_MIPLVLS__MASK;
 }
+#define A3XX_TEX_CONST_0_MSAATEX__MASK				0x00300000
+#define A3XX_TEX_CONST_0_MSAATEX__SHIFT				20
+static inline uint32_t A3XX_TEX_CONST_0_MSAATEX(enum a3xx_tex_msaa val)
+{
+	return ((val) << A3XX_TEX_CONST_0_MSAATEX__SHIFT) & A3XX_TEX_CONST_0_MSAATEX__MASK;
+}
 #define A3XX_TEX_CONST_0_FMT__MASK				0x1fc00000
 #define A3XX_TEX_CONST_0_FMT__SHIFT				22
 static inline uint32_t A3XX_TEX_CONST_0_FMT(enum a3xx_tex_fmt val)
@@ -2785,7 +2800,7 @@ static inline uint32_t A3XX_TEX_CONST_1_FETCHSIZE(enum a3xx_tex_fetchsize val)
 }
 
 #define REG_A3XX_TEX_CONST_2					0x00000002
-#define A3XX_TEX_CONST_2_INDX__MASK				0x000000ff
+#define A3XX_TEX_CONST_2_INDX__MASK				0x000001ff
 #define A3XX_TEX_CONST_2_INDX__SHIFT				0
 static inline uint32_t A3XX_TEX_CONST_2_INDX(uint32_t val)
 {
@@ -2805,7 +2820,7 @@ static inline uint32_t A3XX_TEX_CONST_2_SWAP(enum a3xx_color_swap val)
 }
 
 #define REG_A3XX_TEX_CONST_3					0x00000003
-#define A3XX_TEX_CONST_3_LAYERSZ1__MASK				0x00007fff
+#define A3XX_TEX_CONST_3_LAYERSZ1__MASK				0x0001ffff
 #define A3XX_TEX_CONST_3_LAYERSZ1__SHIFT			0
 static inline uint32_t A3XX_TEX_CONST_3_LAYERSZ1(uint32_t val)
 {
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.c b/src/gallium/drivers/freedreno/a3xx/fd3_format.c
index ec87aa979e3..04cb9b98fb7 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_format.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.c
@@ -262,6 +262,15 @@ static struct fd3_format formats[PIPE_FORMAT_COUNT] = {
 	_T(ETC2_R11_SNORM, ETC2_R11_SNORM, NONE, WZYX),
 	_T(ETC2_RG11_UNORM, ETC2_RG11_UNORM, NONE, WZYX),
 	_T(ETC2_RG11_SNORM, ETC2_RG11_SNORM, NONE, WZYX),
+
+	_T(DXT1_RGB,   DXT1, NONE, WZYX),
+	_T(DXT1_SRGB,  DXT1, NONE, WZYX),
+	_T(DXT1_RGBA,  DXT1, NONE, WZYX),
+	_T(DXT1_SRGBA, DXT1, NONE, WZYX),
+	_T(DXT3_RGBA,  DXT3, NONE, WZYX),
+	_T(DXT3_SRGBA, DXT3, NONE, WZYX),
+	_T(DXT5_RGBA,  DXT5, NONE, WZYX),
+	_T(DXT5_SRGBA, DXT5, NONE, WZYX),
 };
 
 enum a3xx_vtx_fmt
@@ -301,7 +310,7 @@ fd3_pipe2fetchsize(enum pipe_format format)
 {
 	if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
 		format = PIPE_FORMAT_Z32_FLOAT;
-	switch (util_format_get_blocksizebits(format)) {
+	switch (util_format_get_blocksizebits(format) / util_format_get_blockwidth(format)) {
 	case 8: return TFETCH_1_BYTE;
 	case 16: return TFETCH_2_BYTE;
 	case 32: return TFETCH_4_BYTE;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
index 9c16804fff5..583caaa806f 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
@@ -73,7 +73,7 @@ fd3_rasterizer_state_create(struct pipe_context *pctx,
 	so->gras_su_poly_offset_scale =
 			A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL(cso->offset_scale);
 	so->gras_su_poly_offset_offset =
-			A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units);
+			A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units * 2.0f);
 
 	so->gras_su_mode_control =
 			A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(cso->line_width/2.0);
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index c30658d0e7b..2d6ecb2c050 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -115,6 +115,7 @@ fd3_sampler_state_create(struct pipe_context *pctx,
 
 	so->texsamp0 =
 			COND(!cso->normalized_coords, A3XX_TEX_SAMP_0_UNNORM_COORDS) |
+			COND(!cso->seamless_cube_map, A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF) |
 			COND(miplinear, A3XX_TEX_SAMP_0_MIPFILTER_LINEAR) |
 			A3XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) |
 			A3XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) |
@@ -239,7 +240,7 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 			A3XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl));
 	/* when emitted, A3XX_TEX_CONST_2_INDX() must be OR'd in: */
 	so->texconst2 =
-			A3XX_TEX_CONST_2_PITCH(rsc->slices[lvl].pitch * rsc->cpp);
+			A3XX_TEX_CONST_2_PITCH(util_format_get_nblocksx(cso->format, rsc->slices[lvl].pitch) * rsc->cpp);
 	switch (prsc->target) {
 	case PIPE_TEXTURE_1D_ARRAY:
 	case PIPE_TEXTURE_2D_ARRAY:
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 563f70ac5eb..2e1d712a28c 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
@@ -162,10 +162,13 @@ enum a4xx_tex_fmt {
 	TFMT4_8_UNORM = 4,
 	TFMT4_8_8_UNORM = 14,
 	TFMT4_8_8_8_8_UNORM = 28,
+	TFMT4_8_SNORM = 5,
 	TFMT4_8_8_SNORM = 15,
 	TFMT4_8_8_8_8_SNORM = 29,
+	TFMT4_8_UINT = 6,
 	TFMT4_8_8_UINT = 16,
 	TFMT4_8_8_8_8_UINT = 30,
+	TFMT4_8_SINT = 7,
 	TFMT4_8_8_SINT = 17,
 	TFMT4_8_8_8_8_SINT = 31,
 	TFMT4_16_UINT = 21,
@@ -430,7 +433,7 @@ static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(enum a3xx_color_swap val)
 	return ((val) << A4XX_RB_MRT_BUF_INFO_COLOR_SWAP__SHIFT) & A4XX_RB_MRT_BUF_INFO_COLOR_SWAP__MASK;
 }
 #define A4XX_RB_MRT_BUF_INFO_COLOR_SRGB				0x00002000
-#define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__MASK		0x007fc000
+#define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__MASK		0xffffc000
 #define A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH__SHIFT		14
 static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(uint32_t val)
 {
@@ -440,7 +443,7 @@ static inline uint32_t A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(uint32_t val)
 static inline uint32_t REG_A4XX_RB_MRT_BASE(uint32_t i0) { return 0x000020a6 + 0x5*i0; }
 
 static inline uint32_t REG_A4XX_RB_MRT_CONTROL3(uint32_t i0) { return 0x000020a7 + 0x5*i0; }
-#define A4XX_RB_MRT_CONTROL3_STRIDE__MASK			0x0001fff8
+#define A4XX_RB_MRT_CONTROL3_STRIDE__MASK			0x03fffff8
 #define A4XX_RB_MRT_CONTROL3_STRIDE__SHIFT			3
 static inline uint32_t A4XX_RB_MRT_CONTROL3_STRIDE(uint32_t val)
 {
@@ -1460,6 +1463,7 @@ static inline uint32_t A4XX_SP_FS_MRT_REG_MRTFORMAT(enum a4xx_color_fmt val)
 {
 	return ((val) << A4XX_SP_FS_MRT_REG_MRTFORMAT__SHIFT) & A4XX_SP_FS_MRT_REG_MRTFORMAT__MASK;
 }
+#define A4XX_SP_FS_MRT_REG_COLOR_SRGB				0x00040000
 
 #define REG_A4XX_SP_CS_CTRL_REG0				0x00002300
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index ab7850e50b0..3a1d4b617d3 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -56,6 +56,7 @@ struct fd4_emit {
 	uint32_t sprite_coord_enable;  /* bitmask */
 	bool sprite_coord_mode;
 	bool rasterflat;
+	bool no_decode_srgb;
 
 	/* cached to avoid repeated lookups of same variants: */
 	struct ir3_shader_variant *vp, *fp;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
index 3e0045449eb..6c9e217d5ad 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c
@@ -79,9 +79,9 @@ struct fd4_format {
 static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 	/* 8-bit */
 	VT(R8_UNORM,   8_UNORM, R8_UNORM, WZYX),
-	V_(R8_SNORM,   8_SNORM, NONE,     WZYX),
-	V_(R8_UINT,    8_UINT,  NONE,     WZYX),
-	V_(R8_SINT,    8_SINT,  NONE,     WZYX),
+	VT(R8_SNORM,   8_SNORM, NONE,     WZYX),
+	VT(R8_UINT,    8_UINT,  NONE,     WZYX),
+	VT(R8_SINT,    8_SINT,  NONE,     WZYX),
 	V_(R8_USCALED, 8_UINT,  NONE,     WZYX),
 	V_(R8_SSCALED, 8_UINT,  NONE,     WZYX),
 
@@ -115,8 +115,8 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = {
 
 	VT(R8G8_UNORM,   8_8_UNORM, R8G8_UNORM, WZYX),
 	VT(R8G8_SNORM,   8_8_SNORM, R8G8_SNORM, WZYX),
-	VT(R8G8_UINT,    8_8_UINT,  NONE,       WZYX),
-	VT(R8G8_SINT,    8_8_SINT,  NONE,       WZYX),
+	VT(R8G8_UINT,    8_8_UINT,  R8G8_UINT,  WZYX),
+	VT(R8G8_SINT,    8_8_SINT,  R8G8_SINT,  WZYX),
 	V_(R8G8_USCALED, 8_8_UINT,  NONE,       WZYX),
 	V_(R8G8_SSCALED, 8_8_SINT,  NONE,       WZYX),
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
index 81c37f72565..3f8bbf3a124 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c
@@ -46,7 +46,8 @@
 
 static void
 emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
-		struct pipe_surface **bufs, uint32_t *bases, uint32_t bin_w)
+		struct pipe_surface **bufs, uint32_t *bases,
+		uint32_t bin_w, bool decode_srgb)
 {
 	enum a4xx_tile_mode tile_mode;
 	unsigned i;
@@ -60,6 +61,7 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 	for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) {
 		enum a4xx_color_fmt format = 0;
 		enum a3xx_color_swap swap = WZYX;
+		bool srgb = false;
 		struct fd_resource *rsc = NULL;
 		struct fd_resource_slice *slice = NULL;
 		uint32_t stride = 0;
@@ -68,10 +70,9 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 
 		if ((i < nr_bufs) && bufs[i]) {
 			struct pipe_surface *psurf = bufs[i];
-			enum pipe_format pformat = 0;
+			enum pipe_format pformat = psurf->format;
 
 			rsc = fd_resource(psurf->texture);
-			pformat = psurf->format;
 
 			/* In case we're drawing to Z32F_S8, the "color" actually goes to
 			 * the stencil
@@ -86,6 +87,11 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 			format = fd4_pipe2color(pformat);
 			swap = fd4_pipe2swap(pformat);
 
+			if (decode_srgb)
+				srgb = util_format_is_srgb(pformat);
+			else
+				pformat = util_format_linear(pformat);
+
 			debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer);
 
 			offset = fd_resource_offset(rsc, psurf->u.tex.level,
@@ -108,7 +114,8 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs,
 		OUT_RING(ring, A4XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) |
 				A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) |
 				A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) |
-				A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap));
+				A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) |
+				COND(srgb, A4XX_RB_MRT_BUF_INFO_COLOR_SRGB));
 		if (bin_w || (i >= nr_bufs) || !bufs[i]) {
 			OUT_RING(ring, base);
 			OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride));
@@ -282,7 +289,7 @@ emit_mem2gmem_surf(struct fd_context *ctx, uint32_t *bases,
 	struct fd_ringbuffer *ring = ctx->ring;
 	struct pipe_surface *zsbufs[2];
 
-	emit_mrt(ring, nr_bufs, bufs, bases, bin_w);
+	emit_mrt(ring, nr_bufs, bufs, bases, bin_w, false);
 
 	if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) {
 		/* The gmem_restore_tex logic will put the first buffer's stencil
@@ -315,6 +322,7 @@ fd4_emit_tile_mem2gmem(struct fd_context *ctx, struct fd_tile *tile)
 			.key = {
 				.half_precision = fd_half_precision(pfb),
 			},
+			.no_decode_srgb = true,
 	};
 	unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0};
 	float x0, y0, x1, y1;
@@ -520,7 +528,7 @@ fd4_emit_sysmem_prep(struct fd_context *ctx)
 	OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) |
 			A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height));
 
-	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0);
+	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0, true);
 
 	/* setup scissor/offset for current tile: */
 	OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
@@ -677,7 +685,7 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile)
 	OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1));
 	OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2));
 
-	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w);
+	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w, true);
 
 	/* setup scissor/offset for current tile: */
 	OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index 1a6d0142132..a3d7123ccee 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -450,10 +450,15 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
 	OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8);
 	for (i = 0; i < 8; i++) {
 		enum a4xx_color_fmt format = 0;
-		if (i < nr)
+		bool srgb = false;
+		if (i < nr) {
 			format = fd4_emit_format(bufs[i]);
+			if (bufs[i] && !emit->no_decode_srgb)
+				srgb = util_format_is_srgb(bufs[i]->format);
+		}
 		OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) |
 				A4XX_SP_FS_MRT_REG_MRTFORMAT(format) |
+				COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) |
 				COND(emit->key.half_precision,
 					A4XX_SP_FS_MRT_REG_HALF_PRECISION));
 	}
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index d2bc5fee6c0..213b29c9181 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -187,9 +187,9 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 			A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size);
 		break;
 	case PIPE_TEXTURE_CUBE:
-	case PIPE_TEXTURE_CUBE_ARRAY:  /* ?? not sure about _CUBE_ARRAY */
+	case PIPE_TEXTURE_CUBE_ARRAY:
 		so->texconst3 =
-			A4XX_TEX_CONST_3_DEPTH(1) |
+			A4XX_TEX_CONST_3_DEPTH(prsc->array_size / 6) |
 			A4XX_TEX_CONST_3_LAYERSZ(rsc->layer_size);
 		break;
 	case PIPE_TEXTURE_3D:
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index 00b6acba065..29944b7ac08 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 98a90e26679..432dce3dfb3 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -13,8 +13,8 @@ The rules-ng-ng source files this header was generated from are:
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10551 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  66709 bytes, from 2015-05-20 20:03:14)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63728 bytes, from 2015-08-05 18:07:28)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67120 bytes, from 2015-08-14 23:22:03)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63915 bytes, from 2015-08-24 16:56:28)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <[email protected]> (robclark)
diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 8e6d43150ce..0b6b9fbbe7a 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -131,11 +131,13 @@ static void
 fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
 		unsigned flags)
 {
+	struct fd_ringbuffer *ring = fd_context(pctx)->ring;
+
 	fd_context_render(pctx);
 
 	if (fence) {
 		fd_screen_fence_ref(pctx->screen, fence, NULL);
-		*fence = fd_fence_create(pctx);
+		*fence = fd_fence_create(pctx, fd_ringbuffer_timestamp(ring));
 	}
 }
 
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index 04a9feacd58..5125f091860 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -50,35 +50,18 @@ fd_screen_fence_ref(struct pipe_screen *pscreen,
 	*ptr = pfence;
 }
 
-/* TODO we need to spiff out libdrm_freedreno a bit to allow passing
- * the timeout.. and maybe a better way to check if fence has been
- * signaled.  The current implementation is a bit lame for now to
- * avoid bumping libdrm version requirement.
- */
-
-boolean fd_screen_fence_signalled(struct pipe_screen *screen,
-		struct pipe_fence_handle *fence)
-{
-	uint32_t timestamp = fd_ringbuffer_timestamp(fence->ctx->ring);
-
-	/* TODO util helper for compare w/ rollover? */
-	return timestamp >= fence->timestamp;
-}
-
 boolean fd_screen_fence_finish(struct pipe_screen *screen,
 		struct pipe_fence_handle *fence,
 		uint64_t timeout)
 {
-	if (!timeout)
-		return fd_screen_fence_signalled(screen, fence);
-
-	if (fd_pipe_wait(fence->screen->pipe, fence->timestamp))
+	if (fd_pipe_wait_timeout(fence->screen->pipe, fence->timestamp, timeout))
 		return false;
 
 	return true;
 }
 
-struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx)
+struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx,
+		uint32_t timestamp)
 {
 	struct pipe_fence_handle *fence;
 	struct fd_context *ctx = fd_context(pctx);
@@ -91,7 +74,7 @@ struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx)
 
 	fence->ctx = ctx;
 	fence->screen = ctx->screen;
-	fence->timestamp = fd_ringbuffer_timestamp(ctx->ring);
+	fence->timestamp = timestamp;
 
 	return fence;
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.h b/src/gallium/drivers/freedreno/freedreno_fence.h
index e36bcc4d1f2..06c314a6116 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.h
+++ b/src/gallium/drivers/freedreno/freedreno_fence.h
@@ -34,11 +34,10 @@
 void fd_screen_fence_ref(struct pipe_screen *pscreen,
 		struct pipe_fence_handle **ptr,
 		struct pipe_fence_handle *pfence);
-boolean fd_screen_fence_signalled(struct pipe_screen *screen,
-		struct pipe_fence_handle *pfence);
 boolean fd_screen_fence_finish(struct pipe_screen *screen,
 		struct pipe_fence_handle *pfence,
 		uint64_t timeout);
-struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx);
+struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx,
+		uint32_t timestamp);
 
 #endif /* FREEDRENO_FENCE_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index 709ad4eb55b..98de0969cab 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -222,7 +222,7 @@ fd_resource_transfer_map(struct pipe_context *pctx,
 	ptrans->level = level;
 	ptrans->usage = usage;
 	ptrans->box = *box;
-	ptrans->stride = slice->pitch * rsc->cpp;
+	ptrans->stride = util_format_get_nblocksx(format, slice->pitch) * rsc->cpp;
 	ptrans->layer_stride = slice->size0;
 
 	if (usage & PIPE_TRANSFER_READ)
@@ -375,9 +375,11 @@ setup_slices(struct fd_resource *rsc, uint32_t alignment)
 
 	for (level = 0; level <= prsc->last_level; level++) {
 		struct fd_resource_slice *slice = fd_resource_slice(rsc, level);
+		uint32_t blocks;
 
 		slice->pitch = width = align(width, 32);
 		slice->offset = size;
+		blocks = util_format_get_nblocks(prsc->format, width, height);
 		/* 1d array and 2d array textures must all have the same layer size
 		 * for each miplevel on a3xx. 3d textures can have different layer
 		 * sizes for high levels, but the hw auto-sizer is buggy (or at least
@@ -387,9 +389,9 @@ setup_slices(struct fd_resource *rsc, uint32_t alignment)
 		if (prsc->target == PIPE_TEXTURE_3D && (
 					level == 1 ||
 					(level > 1 && rsc->slices[level - 1].size0 > 0xf000)))
-			slice->size0 = align(slice->pitch * height * rsc->cpp, alignment);
+			slice->size0 = align(blocks * rsc->cpp, alignment);
 		else if (level == 0 || rsc->layer_first || alignment == 1)
-			slice->size0 = align(slice->pitch * height * rsc->cpp, alignment);
+			slice->size0 = align(blocks * rsc->cpp, alignment);
 		else
 			slice->size0 = rsc->slices[level - 1].size0;
 
@@ -459,7 +461,6 @@ fd_resource_create(struct pipe_screen *pscreen,
 	if (is_a4xx(fd_screen(pscreen))) {
 		switch (tmpl->target) {
 		case PIPE_TEXTURE_3D:
-			/* TODO 3D_ARRAY? */
 			rsc->layer_first = false;
 			break;
 		default:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index b55f5b36ca9..86e9a21da2f 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -163,7 +163,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 	case PIPE_CAP_TEXTURE_BARRIER:
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-	case PIPE_CAP_CUBE_MAP_ARRAY:
 	case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
 	case PIPE_CAP_START_INSTANCE:
 	case PIPE_CAP_COMPUTE:
@@ -176,6 +175,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_INDEP_BLEND_ENABLE:
 	case PIPE_CAP_INDEP_BLEND_FUNC:
 	case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 		return is_a3xx(screen) || is_a4xx(screen);
 
 	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
@@ -191,8 +191,13 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 16383;
 
 	case PIPE_CAP_DEPTH_CLIP_DISABLE:
+	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
 		return is_a3xx(screen);
 
+	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+	case PIPE_CAP_CUBE_MAP_ARRAY:
+		return is_a4xx(screen);
+
 	case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
 		return 256;
 
@@ -202,7 +207,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return is_ir3(screen) ? 130 : 120;
 
 	/* Unsupported features. */
-	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
 	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
 	case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
@@ -230,8 +234,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
-	case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
-	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 		return 0;
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 0ab33455ed1..071901a3cc7 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1636,6 +1636,11 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 			coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
 	}
 
+	/* the array coord for cube arrays needs 0.5 added to it */
+	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE && tex->is_array &&
+		opc != OPC_ISAML)
+		coord[3] = ir3_ADD_F(b, coord[3], 0, create_immed(b, fui(0.5)), 0);
+
 	/*
 	 * lay out the first argument in the proper order:
 	 *  - actual coordinates first
@@ -1759,6 +1764,12 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
 
 	tex_info(tex, &flags, &coords);
 
+	/* Actually we want the number of dimensions, not coordinates. This
+	 * distinction only matters for cubes.
+	 */
+	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+		coords = 2;
+
 	dst = get_dst(ctx, &tex->dest, 4);
 
 	compile_assert(ctx, tex->num_srcs == 1);
@@ -2301,7 +2312,7 @@ emit_instructions(struct ir3_compile *ctx)
 	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
 
 	/* Create inputs in first block: */
-	ctx->block = get_block(ctx, fxn->start_block);
+	ctx->block = get_block(ctx, nir_start_block(fxn));
 	ctx->in_block = ctx->block;
 	list_addtail(&ctx->block->node, &ctx->ir->block_list);
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
index dc9e4626f27..bed7b7b826a 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
@@ -29,6 +29,7 @@
 
 #include "ir3_nir.h"
 #include "glsl/nir/nir_builder.h"
+#include "glsl/nir/nir_control_flow.h"
 
 /* Based on nir_opt_peephole_select, and hacked up to more aggressively
  * flatten anything that can be flattened
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index eaf3b3c35e8..88018398e96 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -189,7 +189,7 @@ ir3_ra_alloc_reg_set(void *memctx)
 	}
 
 	/* allocate the reg-set.. */
-	set->regs = ra_alloc_reg_set(set, ra_reg_count);
+	set->regs = ra_alloc_reg_set(set, ra_reg_count, true);
 	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
 	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
 
diff --git a/src/gallium/drivers/i915/i915_surface.c b/src/gallium/drivers/i915/i915_surface.c
index 24e01560110..b2a639cb0f1 100644
--- a/src/gallium/drivers/i915/i915_surface.c
+++ b/src/gallium/drivers/i915/i915_surface.c
@@ -120,7 +120,8 @@ i915_surface_copy_render(struct pipe_context *pipe,
 
    util_blitter_blit_generic(i915->blitter, dst_view, &dstbox,
                              src_view, src_box, src_width0, src_height0,
-                             PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+                             PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+                             FALSE);
    return;
 
 fallback:
diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
index 3fae3bc9cba..9346ea3204d 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -121,7 +121,8 @@ NV50_CODEGEN_SOURCES := \
 	codegen/nv50_ir_target_nv50.cpp \
 	codegen/nv50_ir_target_nv50.h \
 	codegen/nv50_ir_util.cpp \
-	codegen/nv50_ir_util.h
+	codegen/nv50_ir_util.h \
+	codegen/unordered_set.h
 
 NVC0_CODEGEN_SOURCES := \
 	codegen/nv50_ir_emit_gk110.cpp \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index 3ddaeafebbd..ba1b0851927 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -29,8 +29,8 @@
 #include <deque>
 #include <list>
 #include <vector>
-#include <tr1/unordered_set>
 
+#include "codegen/unordered_set.h"
 #include "codegen/nv50_ir_util.h"
 #include "codegen/nv50_ir_graph.h"
 
@@ -585,10 +585,10 @@ public:
 
    static inline Value *get(Iterator&);
 
-   std::tr1::unordered_set<ValueRef *> uses;
+   unordered_set<ValueRef *> uses;
    std::list<ValueDef *> defs;
-   typedef std::tr1::unordered_set<ValueRef *>::iterator UseIterator;
-   typedef std::tr1::unordered_set<ValueRef *>::const_iterator UseCIterator;
+   typedef unordered_set<ValueRef *>::iterator UseIterator;
+   typedef unordered_set<ValueRef *>::const_iterator UseCIterator;
    typedef std::list<ValueDef *>::iterator DefIterator;
    typedef std::list<ValueDef *>::const_iterator DefCIterator;
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index f06056f8f17..8f1542959c9 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -933,6 +933,7 @@ CodeEmitterGK110::emitCVT(const Instruction *i)
 
    code[0] |= typeSizeofLog2(dType) << 10;
    code[0] |= typeSizeofLog2(i->sType) << 12;
+   code[1] |= i->subOp << 12;
 
    if (isSignedIntType(dType))
       code[0] |= 0x4000;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index ef5c87d0437..6e22788341f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -818,6 +818,7 @@ CodeEmitterGM107::emitI2F()
    emitField(0x31, 1, (insn->op == OP_ABS) || insn->src(0).mod.abs());
    emitCC   (0x2f);
    emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
+   emitField(0x29, 2, insn->subOp);
    emitRND  (0x27, rnd, -1);
    emitField(0x0d, 1, isSignedType(insn->sType));
    emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
@@ -850,6 +851,7 @@ CodeEmitterGM107::emitI2I()
    emitField(0x31, 1, (insn->op == OP_ABS) || insn->src(0).mod.abs());
    emitCC   (0x2f);
    emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
+   emitField(0x29, 2, insn->subOp);
    emitField(0x0d, 1, isSignedType(insn->sType));
    emitField(0x0c, 1, isSignedType(insn->dType));
    emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index f607f3ba3ec..6bf5219d346 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1020,6 +1020,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i)
       code[0] |= util_logbase2(typeSizeof(dType)) << 20;
       code[0] |= util_logbase2(typeSizeof(i->sType)) << 23;
 
+      // for 8/16 source types, the byte/word is in subOp. word 1 is
+      // represented as 2.
+      code[1] |= i->subOp << 0x17;
+
       if (sat)
          code[0] |= 0x20;
       if (abs)
@@ -2614,11 +2618,12 @@ private:
          int imul; // integer MUL to MUL delay 3
       } res;
       struct ScoreData {
-         int r[64];
+         int r[256];
          int p[8];
          int c;
       } rd, wr;
       int base;
+      int regs;
 
       void rebase(const int base)
       {
@@ -2627,7 +2632,7 @@ private:
             return;
          this->base = 0;
 
-         for (int i = 0; i < 64; ++i) {
+         for (int i = 0; i < regs; ++i) {
             rd.r[i] += delta;
             wr.r[i] += delta;
          }
@@ -2646,16 +2651,17 @@ private:
          res.imul += delta;
          res.tex += delta;
       }
-      void wipe()
+      void wipe(int regs)
       {
          memset(&rd, 0, sizeof(rd));
          memset(&wr, 0, sizeof(wr));
          memset(&res, 0, sizeof(res));
+         this->regs = regs;
       }
       int getLatest(const ScoreData& d) const
       {
          int max = 0;
-         for (int i = 0; i < 64; ++i)
+         for (int i = 0; i < regs; ++i)
             if (d.r[i] > max)
                max = d.r[i];
          for (int i = 0; i < 8; ++i)
@@ -2690,7 +2696,7 @@ private:
       }
       void setMax(const RegScores *that)
       {
-         for (int i = 0; i < 64; ++i) {
+         for (int i = 0; i < regs; ++i) {
             rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
             wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
          }
@@ -2711,7 +2717,7 @@ private:
       }
       void print(int cycle)
       {
-         for (int i = 0; i < 64; ++i) {
+         for (int i = 0; i < regs; ++i) {
             if (rd.r[i] > cycle)
                INFO("rd $r%i @ %i\n", i, rd.r[i]);
             if (wr.r[i] > cycle)
@@ -2806,9 +2812,10 @@ SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const
 bool
 SchedDataCalculator::visit(Function *func)
 {
+   int regs = targ->getFileSize(FILE_GPR) + 1;
    scoreBoards.resize(func->cfg.getSize());
    for (size_t i = 0; i < scoreBoards.size(); ++i)
-      scoreBoards[i].wipe();
+      scoreBoards[i].wipe(regs);
    return true;
 }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 4847a0f3355..f153674e9ce 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2990,9 +2990,15 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
    case TGSI_OPCODE_UBFE:
       FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
          src0 = fetchSrc(0, c);
-         src1 = fetchSrc(1, c);
-         src2 = fetchSrc(2, c);
-         mkOp3(OP_INSBF, TYPE_U32, src1, src2, mkImm(0x808), src1);
+         if (tgsi.getSrc(1).getFile() == TGSI_FILE_IMMEDIATE &&
+             tgsi.getSrc(2).getFile() == TGSI_FILE_IMMEDIATE) {
+            src1 = loadImm(NULL, tgsi.getSrc(2).getValueU32(c, info) << 8 |
+                           tgsi.getSrc(1).getValueU32(c, info));
+         } else {
+            src1 = fetchSrc(1, c);
+            src2 = fetchSrc(2, c);
+            mkOp3(OP_INSBF, TYPE_U32, src1, src2, mkImm(0x808), src1);
+         }
          mkOp2(OP_EXTBF, dstTy, dst0[c], src0, src1);
       }
       break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 1f3fce2bb9a..420cc4ef7c0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -193,100 +193,16 @@ GM107LoweringPass::visit(Instruction *i)
       checkPredicate(i);
 
    switch (i->op) {
-   case OP_TEX:
-   case OP_TXB:
-   case OP_TXL:
-   case OP_TXF:
-   case OP_TXG:
-      return handleTEX(i->asTex());
-   case OP_TXD:
-      return handleTXD(i->asTex());
-   case OP_TXLQ:
-      return handleTXLQ(i->asTex());
-   case OP_TXQ:
-      return handleTXQ(i->asTex());
-   case OP_EX2:
-      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
-      i->setSrc(0, i->getDef(0));
-      break;
-   case OP_POW:
-      return handlePOW(i);
-   case OP_DIV:
-      return handleDIV(i);
-   case OP_MOD:
-      return handleMOD(i);
-   case OP_SQRT:
-      return handleSQRT(i);
-   case OP_EXPORT:
-      return handleEXPORT(i);
    case OP_PFETCH:
       return handlePFETCH(i);
-   case OP_EMIT:
-   case OP_RESTART:
-      return handleOUT(i);
-   case OP_RDSV:
-      return handleRDSV(i);
-   case OP_WRSV:
-      return handleWRSV(i);
-   case OP_LOAD:
-      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
-         if (prog->getType() == Program::TYPE_COMPUTE) {
-            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
-            i->getSrc(0)->reg.fileIndex = 0;
-         } else
-         if (prog->getType() == Program::TYPE_GEOMETRY &&
-             i->src(0).isIndirect(0)) {
-            // XXX: this assumes vec4 units
-            Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                    i->getIndirect(0, 0), bld.mkImm(4));
-            i->setIndirect(0, 0, ptr);
-            i->op = OP_VFETCH;
-         } else {
-            i->op = OP_VFETCH;
-            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
-         }
-      } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
-         if (i->src(0).isIndirect(1)) {
-            Value *ptr;
-            if (i->src(0).isIndirect(0))
-               ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(0x1010),
-                                i->getIndirect(0, 0));
-            else
-               ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(16));
-            i->setIndirect(0, 1, NULL);
-            i->setIndirect(0, 0, ptr);
-            i->subOp = NV50_IR_SUBOP_LDC_IS;
-         }
-      }
-      break;
-   case OP_ATOM:
-   {
-      const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
-      handleATOM(i);
-      handleCasExch(i, cctl);
-   }
-      break;
-   case OP_SULDB:
-   case OP_SULDP:
-   case OP_SUSTB:
-   case OP_SUSTP:
-   case OP_SUREDB:
-   case OP_SUREDP:
-      handleSurfaceOpNVE4(i->asTex());
-      break;
    case OP_DFDX:
    case OP_DFDY:
-      handleDFDX(i);
-      break;
+      return handleDFDX(i);
    case OP_POPCNT:
-      handlePOPCNT(i);
-      break;
+      return handlePOPCNT(i);
    default:
-      break;
+      return NVC0LoweringPass::visit(i);
    }
-   return true;
 }
 
 } // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index c3c302da5c8..b1f406585a9 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -224,7 +224,7 @@ NVC0LegalizePostRA::findFirstUses(
       const Instruction *texi,
       const Instruction *insn,
       std::list<TexUse> &uses,
-      std::tr1::unordered_set<const Instruction *>& visited)
+      unordered_set<const Instruction *>& visited)
 {
    for (int d = 0; insn->defExists(d); ++d) {
       Value *v = insn->getDef(d);
@@ -323,7 +323,7 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
    if (!uses)
       return false;
    for (size_t i = 0; i < texes.size(); ++i) {
-      std::tr1::unordered_set<const Instruction *> visited;
+      unordered_set<const Instruction *> visited;
       findFirstUses(texes[i], texes[i], uses[i], visited);
    }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 260e101a1fb..2ce52e5c4f7 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -20,8 +20,6 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include <tr1/unordered_set>
-
 #include "codegen/nv50_ir.h"
 #include "codegen/nv50_ir_build_util.h"
 
@@ -73,7 +71,7 @@ private:
    inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
    void findFirstUses(const Instruction *tex, const Instruction *def,
                       std::list<TexUse>&,
-                      std::tr1::unordered_set<const Instruction *>&);
+                      unordered_set<const Instruction *>&);
    void findOverwritingDefs(const Instruction *tex, Instruction *insn,
                             const BasicBlock *term,
                             std::list<TexUse>&);
@@ -111,10 +109,11 @@ protected:
 
    void checkPredicate(Instruction *);
 
+   virtual bool visit(Instruction *);
+
 private:
    virtual bool visit(Function *);
    virtual bool visit(BasicBlock *);
-   virtual bool visit(Instruction *);
 
    void readTessCoord(LValue *dst, int c);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index cea96dcdfc5..b01ef4128cd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1023,27 +1023,53 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
 
    case OP_AND:
    {
-      CmpInstruction *cmp = i->getSrc(t)->getInsn()->asCmp();
-      if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
-         return;
-      if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
-         return;
-      if (imm0.reg.data.f32 != 1.0)
-         return;
-      if (i->getSrc(t)->getInsn()->dType != TYPE_U32)
-         return;
+      Instruction *src = i->getSrc(t)->getInsn();
+      ImmediateValue imm1;
+      if (imm0.reg.data.u32 == 0) {
+         i->op = OP_MOV;
+         i->setSrc(0, new_ImmediateValue(prog, 0u));
+         i->src(0).mod = Modifier(0);
+         i->setSrc(1, NULL);
+      } else if (imm0.reg.data.u32 == ~0U) {
+         i->op = i->src(t).mod.getOp();
+         if (t) {
+            i->setSrc(0, i->getSrc(t));
+            i->src(0).mod = i->src(t).mod;
+         }
+         i->setSrc(1, NULL);
+      } else if (src->asCmp()) {
+         CmpInstruction *cmp = src->asCmp();
+         if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
+            return;
+         if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
+            return;
+         if (imm0.reg.data.f32 != 1.0)
+            return;
+         if (cmp->dType != TYPE_U32)
+            return;
 
-      i->getSrc(t)->getInsn()->dType = TYPE_F32;
-      if (i->src(t).mod != Modifier(0)) {
-         assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
-         i->src(t).mod = Modifier(0);
-         cmp->setCond = inverseCondCode(cmp->setCond);
-      }
-      i->op = OP_MOV;
-      i->setSrc(s, NULL);
-      if (t) {
-         i->setSrc(0, i->getSrc(t));
-         i->setSrc(t, NULL);
+         cmp->dType = TYPE_F32;
+         if (i->src(t).mod != Modifier(0)) {
+            assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
+            i->src(t).mod = Modifier(0);
+            cmp->setCond = inverseCondCode(cmp->setCond);
+         }
+         i->op = OP_MOV;
+         i->setSrc(s, NULL);
+         if (t) {
+            i->setSrc(0, i->getSrc(t));
+            i->setSrc(t, NULL);
+         }
+      } else if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32) &&
+                 src->op == OP_SHR &&
+                 src->src(1).getImmediate(imm1) &&
+                 i->src(t).mod == Modifier(0) &&
+                 util_is_power_of_two(imm0.reg.data.u32 + 1)) {
+         // low byte = offset, high byte = width
+         uint32_t ext = (util_last_bit(imm0.reg.data.u32) << 8) | imm1.reg.data.u32;
+         i->op = OP_EXTBF;
+         i->setSrc(0, src->getSrc(0));
+         i->setSrc(1, new_ImmediateValue(prog, ext));
       }
    }
       break;
@@ -1106,6 +1132,84 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
       i->op = OP_MOV;
       break;
    }
+   case OP_CVT: {
+      Storage res;
+
+      // TODO: handle 64-bit values properly
+      if (typeSizeof(i->dType) == 8 || typeSizeof(i->sType) == 8)
+         return;
+
+      // TODO: handle single byte/word extractions
+      if (i->subOp)
+         return;
+
+      bld.setPosition(i, true); /* make sure bld is init'ed */
+
+#define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
+   case type: \
+      switch (i->sType) { \
+      case TYPE_F32: \
+         res.data.dst = util_iround(i->saturate ? \
+                                    CLAMP(imm0.reg.data.f32, fmin, fmax) : \
+                                    imm0.reg.data.f32); \
+         break; \
+      case TYPE_S32: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.s32, imin, imax) : \
+                        imm0.reg.data.s32; \
+         break; \
+      case TYPE_U32: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.u32, umin, umax) : \
+                        imm0.reg.data.u32; \
+         break; \
+      case TYPE_S16: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.s16, imin, imax) : \
+                        imm0.reg.data.s16; \
+         break; \
+      case TYPE_U16: \
+         res.data.dst = i->saturate ? \
+                        CLAMP(imm0.reg.data.u16, umin, umax) : \
+                        imm0.reg.data.u16; \
+         break; \
+      default: return; \
+      } \
+      i->setSrc(0, bld.mkImm(res.data.dst)); \
+      break
+
+      switch(i->dType) {
+      CASE(TYPE_U16, u16, 0, UINT16_MAX, 0, UINT16_MAX, 0, UINT16_MAX);
+      CASE(TYPE_S16, s16, INT16_MIN, INT16_MAX, INT16_MIN, INT16_MAX, 0, INT16_MAX);
+      CASE(TYPE_U32, u32, 0, UINT32_MAX, 0, INT32_MAX, 0, UINT32_MAX);
+      CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
+      case TYPE_F32:
+         switch (i->sType) {
+         case TYPE_F32:
+            res.data.f32 = i->saturate ?
+               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+               imm0.reg.data.f32;
+            break;
+         case TYPE_U16: res.data.f32 = (float) imm0.reg.data.u16; break;
+         case TYPE_U32: res.data.f32 = (float) imm0.reg.data.u32; break;
+         case TYPE_S16: res.data.f32 = (float) imm0.reg.data.s16; break;
+         case TYPE_S32: res.data.f32 = (float) imm0.reg.data.s32; break;
+         default:
+            return;
+         }
+         i->setSrc(0, bld.mkImm(res.data.f32));
+         break;
+      default:
+         return;
+      }
+#undef CASE
+
+      i->setType(i->dType); /* Remove i->sType, which we don't need anymore */
+      i->op = OP_MOV;
+      i->saturate = 0;
+      i->src(0).mod = Modifier(0); /* Clear the already applied modifier */
+      break;
+   }
    default:
       return;
    }
@@ -1212,7 +1316,8 @@ private:
    void handleRCP(Instruction *);
    void handleSLCT(Instruction *);
    void handleLOGOP(Instruction *);
-   void handleCVT(Instruction *);
+   void handleCVT_NEG(Instruction *);
+   void handleCVT_EXTBF(Instruction *);
    void handleSUCLAMP(Instruction *);
 
    BuildUtil bld;
@@ -1463,12 +1568,12 @@ AlgebraicOpt::handleLOGOP(Instruction *logop)
 // nv50:
 //  F2I(NEG(I2F(ABS(SET))))
 void
-AlgebraicOpt::handleCVT(Instruction *cvt)
+AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
 {
+   Instruction *insn = cvt->getSrc(0)->getInsn();
    if (cvt->sType != TYPE_F32 ||
        cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
       return;
-   Instruction *insn = cvt->getSrc(0)->getInsn();
    if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
       return;
    if (insn->src(0).mod != Modifier(0))
@@ -1498,6 +1603,104 @@ AlgebraicOpt::handleCVT(Instruction *cvt)
    delete_Instruction(prog, cvt);
 }
 
+// Some shaders extract packed bytes out of words and convert them to
+// e.g. float. The Fermi+ CVT instruction can extract those directly, as can
+// nv50 for word sizes.
+//
+// CVT(EXTBF(x, byte/word))
+// CVT(AND(bytemask, x))
+// CVT(AND(bytemask, SHR(x, 8/16/24)))
+// CVT(SHR(x, 16/24))
+void
+AlgebraicOpt::handleCVT_EXTBF(Instruction *cvt)
+{
+   Instruction *insn = cvt->getSrc(0)->getInsn();
+   ImmediateValue imm;
+   Value *arg = NULL;
+   unsigned width, offset;
+   if ((cvt->sType != TYPE_U32 && cvt->sType != TYPE_S32) || !insn)
+      return;
+   if (insn->op == OP_EXTBF && insn->src(1).getImmediate(imm)) {
+      width = (imm.reg.data.u32 >> 8) & 0xff;
+      offset = imm.reg.data.u32 & 0xff;
+      arg = insn->getSrc(0);
+
+      if (width != 8 && width != 16)
+         return;
+      if (width == 8 && offset & 0x7)
+         return;
+      if (width == 16 && offset & 0xf)
+         return;
+   } else if (insn->op == OP_AND) {
+      int s;
+      if (insn->src(0).getImmediate(imm))
+         s = 0;
+      else if (insn->src(1).getImmediate(imm))
+         s = 1;
+      else
+         return;
+
+      if (imm.reg.data.u32 == 0xff)
+         width = 8;
+      else if (imm.reg.data.u32 == 0xffff)
+         width = 16;
+      else
+         return;
+
+      arg = insn->getSrc(!s);
+      Instruction *shift = arg->getInsn();
+      offset = 0;
+      if (shift && shift->op == OP_SHR &&
+          shift->sType == cvt->sType &&
+          shift->src(1).getImmediate(imm) &&
+          ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+           (width == 16 && (imm.reg.data.u32 & 0xf) == 0))) {
+         arg = shift->getSrc(0);
+         offset = imm.reg.data.u32;
+      }
+   } else if (insn->op == OP_SHR &&
+              insn->sType == cvt->sType &&
+              insn->src(1).getImmediate(imm)) {
+      arg = insn->getSrc(0);
+      if (imm.reg.data.u32 == 24) {
+         width = 8;
+         offset = 24;
+      } else if (imm.reg.data.u32 == 16) {
+         width = 16;
+         offset = 16;
+      } else {
+         return;
+      }
+   }
+
+   if (!arg)
+      return;
+
+   // Irrespective of what came earlier, we can undo a shift on the argument
+   // by adjusting the offset.
+   Instruction *shift = arg->getInsn();
+   if (shift && shift->op == OP_SHL &&
+       shift->src(1).getImmediate(imm) &&
+       ((width == 8 && (imm.reg.data.u32 & 0x7) == 0) ||
+        (width == 16 && (imm.reg.data.u32 & 0xf) == 0)) &&
+       imm.reg.data.u32 <= offset) {
+      arg = shift->getSrc(0);
+      offset -= imm.reg.data.u32;
+   }
+
+   // The unpackSnorm lowering still leaves a few shifts behind, but it's too
+   // annoying to detect them.
+
+   if (width == 8) {
+      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U8 : TYPE_S8;
+   } else {
+      assert(width == 16);
+      cvt->sType = cvt->sType == TYPE_U32 ? TYPE_U16 : TYPE_S16;
+   }
+   cvt->setSrc(0, arg);
+   cvt->subOp = offset >> 3;
+}
+
 // SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
 void
 AlgebraicOpt::handleSUCLAMP(Instruction *insn)
@@ -1568,7 +1771,9 @@ AlgebraicOpt::visit(BasicBlock *bb)
          handleLOGOP(i);
          break;
       case OP_CVT:
-         handleCVT(i);
+         handleCVT_NEG(i);
+         if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
+             handleCVT_EXTBF(i);
          break;
       case OP_SUCLAMP:
          handleSUCLAMP(i);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 78bc97f4397..0cd21cf47f5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -25,7 +25,6 @@
 
 #include <stack>
 #include <limits>
-#include <tr1/unordered_set>
 
 namespace nv50_ir {
 
@@ -1551,7 +1550,7 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
       // Keep track of which instructions to delete later. Deleting them
       // inside the loop is unsafe since a single instruction may have
       // multiple destinations that all need to be spilled (like OP_SPLIT).
-      std::tr1::unordered_set<Instruction *> to_del;
+      unordered_set<Instruction *> to_del;
 
       for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
            ++d) {
@@ -1593,7 +1592,7 @@ SpillCodeInserter::run(const std::list<ValuePair>& lst)
          }
       }
 
-      for (std::tr1::unordered_set<Instruction *>::const_iterator it = to_del.begin();
+      for (unordered_set<Instruction *>::const_iterator it = to_del.begin();
            it != to_del.end(); ++it)
          delete_Instruction(func->getProgram(), *it);
    }
diff --git a/src/gallium/drivers/nouveau/codegen/unordered_set.h b/src/gallium/drivers/nouveau/codegen/unordered_set.h
new file mode 100644
index 00000000000..8ef6d462ffd
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/unordered_set.h
@@ -0,0 +1,48 @@
+#ifndef __NV50_UNORDERED_SET_H__
+#define __NV50_UNORDERED_SET_H__
+
+#if (__cplusplus >= 201103L) || defined(ANDROID)
+#include <unordered_set>
+#else
+#include <tr1/unordered_set>
+#endif
+
+namespace nv50_ir {
+
+#if __cplusplus >= 201103L
+using std::unordered_set;
+#elif !defined(ANDROID)
+using std::tr1::unordered_set;
+#else // Android release before lollipop
+using std::isfinite;
+typedef std::tr1::unordered_set<void *> voidptr_unordered_set;
+
+template <typename V>
+class unordered_set : public voidptr_unordered_set {
+  public:
+    typedef voidptr_unordered_set _base;
+    typedef _base::iterator _biterator;
+    typedef _base::const_iterator const_biterator;
+
+    class iterator : public _biterator {
+      public:
+        iterator(const _biterator & i) : _biterator(i) {}
+        V operator*() const { return reinterpret_cast<V>(*_biterator(*this)); }
+    };
+    class const_iterator : public const_biterator {
+      public:
+        const_iterator(const iterator & i) : const_biterator(i) {}
+        const_iterator(const const_biterator & i) : const_biterator(i) {}
+        const V operator*() const { return reinterpret_cast<const V>(*const_biterator(*this)); }
+    };
+
+    iterator begin() { return _base::begin(); }
+    iterator end() { return _base::end(); }
+    const_iterator begin() const { return _base::begin(); }
+    const_iterator end() const { return _base::end(); }
+};
+#endif
+
+} // namespace nv50_ir
+
+#endif // __NV50_UNORDERED_SET_H__
diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c
index 86604984220..495450b317a 100644
--- a/src/gallium/drivers/nouveau/nouveau_compiler.c
+++ b/src/gallium/drivers/nouveau/nouveau_compiler.c
@@ -190,6 +190,10 @@ main(int argc, char *argv[])
       type = PIPE_SHADER_GEOMETRY;
    else if (!strncmp(text, "COMP", 4))
       type = PIPE_SHADER_COMPUTE;
+   else if (!strncmp(text, "TESS_CTRL", 9))
+      type = PIPE_SHADER_TESS_CTRL;
+   else if (!strncmp(text, "TESS_EVAL", 9))
+      type = PIPE_SHADER_TESS_EVAL;
    else {
       _debug_printf("Unrecognized TGSI header\n");
       return 1;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 9505a0b4085..410e6311e60 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -117,7 +117,6 @@ nv50_blend_state_create(struct pipe_context *pipe,
    struct nv50_blend_stateobj *so = CALLOC_STRUCT(nv50_blend_stateobj);
    int i;
    bool emit_common_func = cso->rt[0].blend_enable;
-   uint32_t ms;
 
    if (nv50_context(pipe)->screen->tesla->oclass >= NVA3_3D_CLASS) {
       SB_BEGIN_3D(so, BLEND_INDEPENDENT, 1);
@@ -189,15 +188,6 @@ nv50_blend_state_create(struct pipe_context *pipe,
       SB_DATA    (so, nv50_colormask(cso->rt[0].colormask));
    }
 
-   ms = 0;
-   if (cso->alpha_to_coverage)
-      ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
-   if (cso->alpha_to_one)
-      ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
-
-   SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1);
-   SB_DATA    (so, ms);
-
    assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
    return so;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 985603df5fa..b304a177b50 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -1,4 +1,6 @@
 
+#include "util/u_format.h"
+
 #include "nv50/nv50_context.h"
 #include "nv50/nv50_defs.xml.h"
 
@@ -314,6 +316,25 @@ nv50_validate_derived_2(struct nv50_context *nv50)
 }
 
 static void
+nv50_validate_derived_3(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct pipe_framebuffer_state *fb = &nv50->framebuffer;
+   uint32_t ms = 0;
+
+   if ((!fb->nr_cbufs || !fb->cbufs[0] ||
+        !util_format_is_pure_integer(fb->cbufs[0]->format)) && nv50->blend) {
+      if (nv50->blend->pipe.alpha_to_coverage)
+         ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
+      if (nv50->blend->pipe.alpha_to_one)
+         ms |= NV50_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
+   }
+
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_CTRL), 1);
+   PUSH_DATA (push, ms);
+}
+
+static void
 nv50_validate_clip(struct nv50_context *nv50)
 {
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
@@ -474,6 +495,7 @@ static struct state_validate {
     { nv50_validate_derived_rs,    NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
                                    NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
     { nv50_validate_derived_2,     NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER },
+    { nv50_validate_derived_3,     NV50_NEW_BLEND | NV50_NEW_FRAMEBUFFER },
     { nv50_validate_clip,          NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
                                    NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
     { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
index cf75d1eb11b..4b1d00c7155 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_stateobj.h
@@ -19,7 +19,7 @@
 struct nv50_blend_stateobj {
    struct pipe_blend_state pipe;
    int size;
-   uint32_t state[84]; // TODO: allocate less if !independent_blend_enable
+   uint32_t state[82]; // TODO: allocate less if !independent_blend_enable
 };
 
 struct nv50_rasterizer_stateobj {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index b1ae01692cb..64348b3c378 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -68,6 +68,10 @@ nv50_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
       return NV50_SURFACE_FORMAT_R16_UNORM;
    case 4:
       return NV50_SURFACE_FORMAT_BGRA8_UNORM;
+   case 8:
+      return NV50_SURFACE_FORMAT_RGBA16_FLOAT;
+   case 16:
+      return NV50_SURFACE_FORMAT_RGBA32_FLOAT;
    default:
       return 0;
    }
@@ -1003,6 +1007,8 @@ nv50_blitctx_prepare_state(struct nv50_blitctx *blit)
    /* zsa state */
    BEGIN_NV04(push, NV50_3D(DEPTH_TEST_ENABLE), 1);
    PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(DEPTH_BOUNDS_EN), 1);
+   PUSH_DATA (push, 0);
    BEGIN_NV04(push, NV50_3D(STENCIL_ENABLE), 1);
    PUSH_DATA (push, 0);
    BEGIN_NV04(push, NV50_3D(ALPHA_TEST_ENABLE), 1);
@@ -1387,18 +1393,24 @@ nv50_blit_eng2d(struct nv50_context *nv50, const struct pipe_blit_info *info)
             PUSH_DATA (push, info->dst.box.z + i);
          } else {
             const unsigned z = info->dst.box.z + i;
+            const uint64_t address = dst->base.address +
+               dst->level[info->dst.level].offset +
+               z * dst->layer_stride;
             BEGIN_NV04(push, NV50_2D(DST_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, dst->base.address + z * dst->layer_stride);
-            PUSH_DATA (push, dst->base.address + z * dst->layer_stride);
+            PUSH_DATAh(push, address);
+            PUSH_DATA (push, address);
          }
          if (src->layout_3d) {
             /* not possible because of depth tiling */
             assert(0);
          } else {
             const unsigned z = info->src.box.z + i;
+            const uint64_t address = src->base.address +
+               src->level[info->src.level].offset +
+               z * src->layer_stride;
             BEGIN_NV04(push, NV50_2D(SRC_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, src->base.address + z * src->layer_stride);
-            PUSH_DATA (push, src->base.address + z * src->layer_stride);
+            PUSH_DATAh(push, address);
+            PUSH_DATA (push, address);
          }
          BEGIN_NV04(push, NV50_2D(BLIT_SRC_Y_INT), 1); /* trigger */
          PUSH_DATA (push, srcy >> 32);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 84f8db6a8ac..7a15a11f560 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -132,6 +132,9 @@ nvc0_context_unreference_resources(struct nvc0_context *nvc0)
       pipe_resource_reference(res, NULL);
    }
    util_dynarray_fini(&nvc0->global_residents);
+
+   if (nvc0->tcp_empty)
+      nvc0->base.pipe.delete_tcs_state(&nvc0->base.pipe, nvc0->tcp_empty);
 }
 
 static void
@@ -306,13 +309,6 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
    pipe->memory_barrier = nvc0_memory_barrier;
    pipe->get_sample_position = nvc0_context_get_sample_position;
 
-   if (!screen->cur_ctx) {
-      nvc0->state = screen->save_state;
-      screen->cur_ctx = nvc0;
-      nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx);
-   }
-   screen->base.pushbuf->kick_notify = nvc0_default_kick_notify;
-
    nvc0_init_query_functions(nvc0);
    nvc0_init_surface_functions(nvc0);
    nvc0_init_state_functions(nvc0);
@@ -326,6 +322,21 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
 
    /* shader builtin library is per-screen, but we need a context for m2mf */
    nvc0_program_library_upload(nvc0);
+   nvc0_program_init_tcp_empty(nvc0);
+   if (!nvc0->tcp_empty)
+      goto out_err;
+   /* set the empty tctl prog on next draw in case one is never set */
+   nvc0->dirty |= NVC0_NEW_TCTLPROG;
+
+   /* now that there are no more opportunities for errors, set the current
+    * context if there isn't already one.
+    */
+   if (!screen->cur_ctx) {
+      nvc0->state = screen->save_state;
+      screen->cur_ctx = nvc0;
+      nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx);
+   }
+   screen->base.pushbuf->kick_notify = nvc0_default_kick_notify;
 
    /* add permanently resident buffers to bufctxts */
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index f4499423a10..df1a891a43e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -128,6 +128,8 @@ struct nvc0_context {
    struct nvc0_program *fragprog;
    struct nvc0_program *compprog;
 
+   struct nvc0_program *tcp_empty;
+
    struct nvc0_constbuf constbuf[6][NVC0_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[6];
    uint16_t constbuf_valid[6];
@@ -227,6 +229,7 @@ void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_library_upload(struct nvc0_context *);
 uint32_t nvc0_program_symbol_offset(const struct nvc0_program *,
                                     uint32_t label);
+void nvc0_program_init_tcp_empty(struct nvc0_context *);
 
 /* nvc0_query.c */
 void nvc0_init_query_functions(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 507a2507fe3..12f1bb728d7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -22,6 +22,8 @@
 
 #include "pipe/p_defines.h"
 
+#include "tgsi/tgsi_ureg.h"
+
 #include "nvc0/nvc0_context.h"
 
 #include "codegen/nv50_ir_driver.h"
@@ -799,3 +801,18 @@ nvc0_program_symbol_offset(const struct nvc0_program *prog, uint32_t label)
          return prog->code_base + base + syms[i].offset;
    return prog->code_base; /* no symbols or symbol not found */
 }
+
+void
+nvc0_program_init_tcp_empty(struct nvc0_context *nvc0)
+{
+   struct ureg_program *ureg;
+
+   ureg = ureg_create(TGSI_PROCESSOR_TESS_CTRL);
+   if (!ureg)
+      return;
+
+   ureg_property(ureg, TGSI_PROPERTY_TCS_VERTICES_OUT, 1);
+   ureg_END(ureg);
+
+   nvc0->tcp_empty = ureg_create_shader_and_destroy(ureg, &nvc0->base.pipe);
+}
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 8aa127adc0a..8f8ac2d34b9 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -148,8 +148,13 @@ nvc0_tctlprog_validate(struct nvc0_context *nvc0)
       BEGIN_NVC0(push, NVC0_3D(SP_GPR_ALLOC(2)), 1);
       PUSH_DATA (push, tp->num_gprs);
    } else {
-      BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 1);
+      tp = nvc0->tcp_empty;
+      /* not a whole lot we can do to handle this failure */
+      if (!nvc0_program_validate(nvc0, tp))
+         assert(!"unable to validate empty tcp");
+      BEGIN_NVC0(push, NVC0_3D(SP_SELECT(2)), 2);
       PUSH_DATA (push, 0x20);
+      PUSH_DATA (push, tp->code_base);
    }
    nvc0_program_update_context_state(nvc0, tp, 1);
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 2a33857d9df..ee29912eb40 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -90,7 +90,6 @@ nvc0_blend_state_create(struct pipe_context *pipe,
    struct nvc0_blend_stateobj *so = CALLOC_STRUCT(nvc0_blend_stateobj);
    int i;
    int r; /* reference */
-   uint32_t ms;
    uint8_t blend_en = 0;
    bool indep_masks = false;
    bool indep_funcs = false;
@@ -176,15 +175,6 @@ nvc0_blend_state_create(struct pipe_context *pipe,
       }
    }
 
-   ms = 0;
-   if (cso->alpha_to_coverage)
-      ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
-   if (cso->alpha_to_one)
-      ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
-
-   SB_BEGIN_3D(so, MULTISAMPLE_CTRL, 1);
-   SB_DATA    (so, ms);
-
    assert(so->size <= (sizeof(so->state) / sizeof(so->state[0])));
    return so;
 }
@@ -234,7 +224,7 @@ nvc0_rasterizer_state_create(struct pipe_context *pipe,
     SB_IMMED_3D(so, MULTISAMPLE_ENABLE, cso->multisample);
 
     SB_IMMED_3D(so, LINE_SMOOTH_ENABLE, cso->line_smooth);
-    if (cso->line_smooth)
+    if (cso->line_smooth || cso->multisample)
        SB_BEGIN_3D(so, LINE_WIDTH_SMOOTH, 1);
     else
        SB_BEGIN_3D(so, LINE_WIDTH_ALIASED, 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index ce1119c284d..47bd66d1e35 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -1,4 +1,5 @@
 
+#include "util/u_format.h"
 #include "util/u_math.h"
 
 #include "nvc0/nvc0_context.h"
@@ -555,6 +556,25 @@ nvc0_validate_derived_2(struct nvc0_context *nvc0)
 }
 
 static void
+nvc0_validate_derived_3(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
+   uint32_t ms = 0;
+
+   if ((!fb->nr_cbufs || !fb->cbufs[0] ||
+        !util_format_is_pure_integer(fb->cbufs[0]->format)) && nvc0->blend) {
+      if (nvc0->blend->pipe.alpha_to_coverage)
+         ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_COVERAGE;
+      if (nvc0->blend->pipe.alpha_to_one)
+         ms |= NVC0_3D_MULTISAMPLE_CTRL_ALPHA_TO_ONE;
+   }
+
+   BEGIN_NVC0(push, NVC0_3D(MULTISAMPLE_CTRL), 1);
+   PUSH_DATA (push, ms);
+}
+
+static void
 nvc0_validate_tess_state(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -628,6 +648,7 @@ static struct state_validate {
     { nvc0_validate_derived_1,     NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |
                                    NVC0_NEW_RASTERIZER },
     { nvc0_validate_derived_2,     NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER },
+    { nvc0_validate_derived_3,     NVC0_NEW_BLEND | NVC0_NEW_FRAMEBUFFER },
     { nvc0_validate_clip,          NVC0_NEW_CLIP | NVC0_NEW_RASTERIZER |
                                    NVC0_NEW_VERTPROG |
                                    NVC0_NEW_TEVLPROG |
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
index 18fcc12dea3..8bc33c6a0e0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -17,7 +17,7 @@
 struct nvc0_blend_stateobj {
    struct pipe_blend_state pipe;
    int size;
-   uint32_t state[72];
+   uint32_t state[70];
 };
 
 struct nvc0_rasterizer_stateobj {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 51a6f93f891..dbdf292c862 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -887,6 +887,7 @@ nvc0_blitctx_prepare_state(struct nvc0_blitctx *blit)
 
    /* zsa state */
    IMMED_NVC0(push, NVC0_3D(DEPTH_TEST_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(DEPTH_BOUNDS_EN), 0);
    IMMED_NVC0(push, NVC0_3D(STENCIL_ENABLE), 0);
    IMMED_NVC0(push, NVC0_3D(ALPHA_TEST_ENABLE), 0);
 
@@ -1336,18 +1337,24 @@ nvc0_blit_eng2d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
             PUSH_DATA (push, info->dst.box.z + i);
          } else {
             const unsigned z = info->dst.box.z + i;
+            const uint64_t address = dst->base.address +
+               dst->level[info->dst.level].offset +
+               z * dst->layer_stride;
             BEGIN_NVC0(push, NVC0_2D(DST_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, dst->base.address + z * dst->layer_stride);
-            PUSH_DATA (push, dst->base.address + z * dst->layer_stride);
+            PUSH_DATAh(push, address);
+            PUSH_DATA (push, address);
          }
          if (src->layout_3d) {
             /* not possible because of depth tiling */
             assert(0);
          } else {
             const unsigned z = info->src.box.z + i;
+            const uint64_t address = src->base.address +
+               src->level[info->src.level].offset +
+               z * src->layer_stride;
             BEGIN_NVC0(push, NVC0_2D(SRC_ADDRESS_HIGH), 2);
-            PUSH_DATAh(push, src->base.address + z * src->layer_stride);
-            PUSH_DATA (push, src->base.address + z * src->layer_stride);
+            PUSH_DATAh(push, address);
+            PUSH_DATA (push, address);
          }
          BEGIN_NVC0(push, NVC0_2D(BLIT_SRC_Y_INT), 1); /* trigger */
          PUSH_DATA (push, srcy >> 32);
diff --git a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
index 14f93fba986..e8f4087cbc4 100644
--- a/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
+++ b/src/gallium/drivers/r300/compiler/radeon_pair_regalloc.c
@@ -693,7 +693,8 @@ void rc_init_regalloc_state(struct rc_regalloc_state *s)
 	};
 
 	/* Allocate the main ra data structure */
-	s->regs = ra_alloc_reg_set(NULL, R500_PFS_NUM_TEMP_REGS * RC_MASK_XYZW);
+	s->regs = ra_alloc_reg_set(NULL, R500_PFS_NUM_TEMP_REGS * RC_MASK_XYZW,
+                                   true);
 
 	/* Create the register classes */
 	for (i = 0; i < RC_REG_CLASS_COUNT; i++) {
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index 6ea8f24cc14..b8cc316c2ac 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -667,7 +667,8 @@ static void r300_resource_copy_region(struct pipe_context *pipe,
     r300_blitter_begin(r300, R300_COPY);
     util_blitter_blit_generic(r300->blitter, dst_view, &dstbox,
                               src_view, src_box, src_width0, src_height0,
-                              PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+                              PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+                              FALSE);
     r300_blitter_end(r300);
 
     pipe_surface_reference(&dst_view, NULL);
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index b0002c3b50f..22a0950a491 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -732,7 +732,8 @@ void r600_resource_copy_region(struct pipe_context *ctx,
 	r600_blitter_begin(ctx, R600_COPY_TEXTURE);
 	util_blitter_blit_generic(rctx->blitter, dst_view, &dstbox,
 				  src_view, src_box, src_width0, src_height0,
-				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+				  FALSE);
 	r600_blitter_end(ctx);
 
 	pipe_surface_reference(&dst_view, NULL);
diff --git a/src/gallium/drivers/r600/r600_isa.h b/src/gallium/drivers/r600/r600_isa.h
index 381f06d5924..fdbe1c00d20 100644
--- a/src/gallium/drivers/r600/r600_isa.h
+++ b/src/gallium/drivers/r600/r600_isa.h
@@ -262,7 +262,7 @@ static const struct alu_op_info alu_op_table[] = {
 		{"PRED_SETNE_PUSH_INT",       2, { 0x4D, 0x4D },{  AF_VS, AF_VS, AF_VS, AF_VS},  AF_PRED_PUSH | AF_CC_NE | AF_INT_CMP },
 		{"PRED_SETLT_PUSH_INT",       2, { 0x4E, 0x4E },{  AF_VS, AF_VS, AF_VS, AF_VS},  AF_PRED_PUSH | AF_CC_LT | AF_INT_CMP },
 		{"PRED_SETLE_PUSH_INT",       2, { 0x4F, 0x4F },{  AF_VS, AF_VS, AF_VS, AF_VS},  AF_PRED_PUSH | AF_CC_LE | AF_INT_CMP },
-		{"FLT_TO_INT",                1, { 0x6B, 0x50 },{   AF_S,  AF_S, AF_VS, AF_VS},  AF_INT_DST | AF_CVT },
+		{"FLT_TO_INT",                1, { 0x6B, 0x50 },{   AF_S,  AF_S,  AF_V,  AF_V},  AF_INT_DST | AF_CVT },
 		{"BFREV_INT",                 1, {   -1, 0x51 },{      0,     0, AF_VS, AF_VS},  AF_INT_DST },
 		{"ADDC_UINT",                 2, {   -1, 0x52 },{      0,     0, AF_VS, AF_VS},  AF_UINT_DST },
 		{"SUBB_UINT",                 2, {   -1, 0x53 },{      0,     0, AF_VS, AF_VS},  AF_UINT_DST },
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 9b66105641a..384ba800a79 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -90,7 +90,7 @@
 
 struct r600_context;
 struct r600_bytecode;
-struct r600_shader_key;
+union  r600_shader_key;
 
 /* This is an atom containing GPU commands that never change.
  * This is supposed to be copied directly into the CS. */
@@ -643,7 +643,7 @@ void r600_resource_copy_region(struct pipe_context *ctx,
 /* r600_shader.c */
 int r600_pipe_shader_create(struct pipe_context *ctx,
 			    struct r600_pipe_shader *shader,
-			    struct r600_shader_key key);
+			    union r600_shader_key key);
 
 void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader);
 
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 8d1f95abddc..4c4b6005981 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -62,7 +62,7 @@ The compiler must issue the source argument to slots z, y, and x
 
 static int r600_shader_from_tgsi(struct r600_context *rctx,
 				 struct r600_pipe_shader *pipeshader,
-				 struct r600_shader_key key);
+				 union r600_shader_key key);
 
 
 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
@@ -133,7 +133,7 @@ static int store_shader(struct pipe_context *ctx,
 
 int r600_pipe_shader_create(struct pipe_context *ctx,
 			    struct r600_pipe_shader *shader,
-			    struct r600_shader_key key)
+			    union r600_shader_key key)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_pipe_shader_selector *sel = shader->selector;
@@ -141,7 +141,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
 	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
 	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
-	unsigned export_shader = key.vs_as_es;
+	unsigned export_shader = key.vs.as_es;
 
 	shader->shader.bc.isa = rctx->isa;
 
@@ -1802,7 +1802,7 @@ static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
 
 static int r600_shader_from_tgsi(struct r600_context *rctx,
 				 struct r600_pipe_shader *pipeshader,
-				 struct r600_shader_key key)
+				 union r600_shader_key key)
 {
 	struct r600_screen *rscreen = rctx->screen;
 	struct r600_shader *shader = &pipeshader->shader;
@@ -1816,7 +1816,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	unsigned opcode;
 	int i, j, k, r = 0;
 	int next_param_base = 0, next_clip_base;
-	int max_color_exports = MAX2(key.nr_cbufs, 1);
+	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
 	/* Declarations used by llvm code */
 	bool use_llvm = false;
 	bool indirect_gprs;
@@ -1830,8 +1830,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.shader = shader;
 	ctx.native_integers = true;
 
-	shader->vs_as_gs_a = key.vs_as_gs_a;
-	shader->vs_as_es = key.vs_as_es;
+	shader->vs_as_gs_a = key.vs.as_gs_a;
+	shader->vs_as_es = key.vs.as_es;
 
 	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
 			   rscreen->has_compressed_msaa_texturing);
@@ -1844,9 +1844,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	shader->processor_type = ctx.type;
 	ctx.bc->type = shader->processor_type;
 
-	ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
+	ring_outputs = key.vs.as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
 
-	if (key.vs_as_es) {
+	if (key.vs.as_es) {
 		ctx.gs_for_vs = &rctx->gs_shader->current->shader;
 	} else {
 		ctx.gs_for_vs = NULL;
@@ -1866,7 +1866,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	shader->nr_ps_color_exports = 0;
 	shader->nr_ps_max_color_exports = 0;
 
-	shader->two_side = key.color_two_side;
+	shader->two_side = key.ps.color_two_side;
 
 	/* register allocations */
 	/* Values [0,127] correspond to GPR[0..127].
@@ -1970,7 +1970,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	shader->fs_write_all = FALSE;
 
 	if (shader->vs_as_gs_a)
-		vs_add_primid_output(&ctx, key.vs_prim_id_out);
+		vs_add_primid_output(&ctx, key.vs.prim_id_out);
 
 	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
 		tgsi_parse_token(&ctx.parse);
@@ -2091,7 +2091,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
 		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
 		radeon_llvm_ctx.stream_outputs = &so;
-		radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
+		radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
 		radeon_llvm_ctx.has_compressed_msaa_texturing =
 			ctx.bc->has_compressed_msaa_texturing;
 		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
@@ -2270,7 +2270,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	convert_edgeflag_to_int(&ctx);
 
 	if (ring_outputs) {
-		if (key.vs_as_es)
+		if (key.vs.as_es)
 			emit_gs_ring_writes(&ctx, FALSE);
 	} else {
 		/* Export output */
@@ -2386,7 +2386,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 						j--;
 						continue;
 					}
-					output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
+					output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
 					output[j].array_base = shader->output[i].sid;
 					output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 					shader->nr_ps_color_exports++;
@@ -2399,7 +2399,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 							output[j].swizzle_x = 0;
 							output[j].swizzle_y = 1;
 							output[j].swizzle_z = 2;
-							output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
+							output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
 							output[j].burst_count = 1;
 							output[j].array_base = k;
 							output[j].op = CF_OP_EXPORT;
@@ -6151,10 +6151,10 @@ static int tgsi_cmp(struct r600_shader_ctx *ctx)
 		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
 		if (r)
 			return r;
-		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[2]);
+		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
 		if (r)
 			return r;
-		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[2], &ctx->src[1]);
+		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
 		if (r)
 			return r;
 		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 5d05c8153d7..927bac57673 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -95,13 +95,17 @@ struct r600_shader {
 	struct r600_shader_array * arrays;
 };
 
-struct r600_shader_key {
-	unsigned color_two_side:1;
-	unsigned alpha_to_one:1;
-	unsigned nr_cbufs:4;
-	unsigned vs_as_es:1;
-	unsigned vs_as_gs_a:1;
-	unsigned vs_prim_id_out:8;
+union r600_shader_key {
+	struct {
+		unsigned	nr_cbufs:4;
+		unsigned	color_two_side:1;
+		unsigned	alpha_to_one:1;
+	} ps;
+	struct {
+		unsigned	prim_id_out:8;
+		unsigned	as_es:1; /* export shader */
+		unsigned	as_gs_a:1;
+	} vs;
 };
 
 struct r600_shader_array {
@@ -122,7 +126,7 @@ struct r600_pipe_shader {
 	unsigned		flatshade;
 	unsigned		pa_cl_vs_out_cntl;
 	unsigned		nr_ps_color_outputs;
-	struct r600_shader_key	key;
+	union r600_shader_key	key;
 	unsigned		db_shader_control;
 	unsigned		ps_depth_export;
 	unsigned		enabled_stream_buffers_mask;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index aa4a8d0240f..a05dd8352c7 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -702,29 +702,39 @@ void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom)
 }
 
 /* Compute the key for the hw shader variant */
-static inline struct r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
+static inline union r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
 		struct r600_pipe_shader_selector * sel)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct r600_shader_key key;
+	union r600_shader_key key;
 	memset(&key, 0, sizeof(key));
 
-	if (sel->type == PIPE_SHADER_FRAGMENT) {
-		key.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side;
-		key.alpha_to_one = rctx->alpha_to_one &&
-				   rctx->rasterizer && rctx->rasterizer->multisample_enable &&
-				   !rctx->framebuffer.cb0_is_integer;
-		key.nr_cbufs = rctx->framebuffer.state.nr_cbufs;
-		/* Dual-source blending only makes sense with nr_cbufs == 1. */
-		if (key.nr_cbufs == 1 && rctx->dual_src_blend)
-			key.nr_cbufs = 2;
-	} else if (sel->type == PIPE_SHADER_VERTEX) {
-		key.vs_as_es = (rctx->gs_shader != NULL);
+	switch (sel->type) {
+	case PIPE_SHADER_VERTEX: {
+		key.vs.as_es = (rctx->gs_shader != NULL);
 		if (rctx->ps_shader->current->shader.gs_prim_id_input && !rctx->gs_shader) {
-			key.vs_as_gs_a = true;
-			key.vs_prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid;
+			key.vs.as_gs_a = true;
+			key.vs.prim_id_out = rctx->ps_shader->current->shader.input[rctx->ps_shader->current->shader.ps_prim_id_input].spi_sid;
 		}
+		break;
+	}
+	case PIPE_SHADER_GEOMETRY:
+		break;
+	case PIPE_SHADER_FRAGMENT: {
+		key.ps.color_two_side = rctx->rasterizer && rctx->rasterizer->two_side;
+		key.ps.alpha_to_one = rctx->alpha_to_one &&
+				      rctx->rasterizer && rctx->rasterizer->multisample_enable &&
+				      !rctx->framebuffer.cb0_is_integer;
+		key.ps.nr_cbufs = rctx->framebuffer.state.nr_cbufs;
+		/* Dual-source blending only makes sense with nr_cbufs == 1. */
+		if (key.ps.nr_cbufs == 1 && rctx->dual_src_blend)
+			key.ps.nr_cbufs = 2;
+		break;
 	}
+	default:
+		assert(0);
+	}
+
 	return key;
 }
 
@@ -734,7 +744,7 @@ static int r600_shader_select(struct pipe_context *ctx,
         struct r600_pipe_shader_selector* sel,
         bool *dirty)
 {
-	struct r600_shader_key key;
+	union r600_shader_key key;
 	struct r600_pipe_shader * shader = NULL;
 	int r;
 
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 16ee5410273..81f3f45db9f 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -209,8 +209,6 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family)
 
 static unsigned calc_ctx_size(struct ruvd_decoder *dec)
 {
-	unsigned width_in_mb, height_in_mb, ctx_size;
-
 	unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH);
 	unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT);
 
@@ -223,8 +221,7 @@ static unsigned calc_ctx_size(struct ruvd_decoder *dec)
 
 	width = align (width, 16);
 	height = align (height, 16);
-	ctx_size = ((width + 255) / 16)*((height + 255) / 16) * 16 * max_references + 52 * 1024;
-	return ctx_size;
+	return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024;
 }
 
 /* calculate size of reference picture buffer */
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 48972bd170c..b7450b6fcec 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -586,7 +586,8 @@ void si_resource_copy_region(struct pipe_context *ctx,
 	si_blitter_begin(ctx, SI_COPY);
 	util_blitter_blit_generic(sctx->blitter, dst_view, &dstbox,
 				  src_view, src_box, src_width0, src_height0,
-				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL);
+				  PIPE_MASK_RGBAZS, PIPE_TEX_FILTER_NEAREST, NULL,
+				  FALSE);
 	si_blitter_end(ctx);
 
 	pipe_surface_reference(&dst_view, NULL);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4288e9b2ab1..fa6c15a6591 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2277,7 +2277,7 @@ static void tex_fetch_args(
 	unsigned sampler_index;
 	unsigned num_deriv_channels = 0;
 	bool has_offset = HAVE_LLVM >= 0x0305 ? inst->Texture.NumOffsets > 0 : false;
-	LLVMValueRef res_ptr, samp_ptr;
+	LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL;
 
 	sampler_src = emit_data->inst->Instruction.NumSrcRegs - 1;
 	sampler_index = emit_data->inst->Src[sampler_src].Register.Index;
@@ -2293,9 +2293,19 @@ static void tex_fetch_args(
 
 		samp_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_SAMPLER);
 		samp_ptr = build_indexed_load_const(si_shader_ctx, samp_ptr, ind_index);
+
+		if (target == TGSI_TEXTURE_2D_MSAA ||
+		    target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+			ind_index = LLVMBuildAdd(gallivm->builder, ind_index,
+						 lp_build_const_int32(gallivm,
+								      SI_FMASK_TEX_OFFSET), "");
+			fmask_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_RESOURCE);
+			fmask_ptr = build_indexed_load_const(si_shader_ctx, res_ptr, ind_index);
+		}
 	} else {
 		res_ptr = si_shader_ctx->resources[sampler_index];
 		samp_ptr = si_shader_ctx->samplers[sampler_index];
+		fmask_ptr = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
 	}
 
 	if (target == TGSI_TEXTURE_BUFFER) {
@@ -2493,7 +2503,7 @@ static void tex_fetch_args(
 		txf_emit_data.dst_type = LLVMVectorType(
 			LLVMInt32TypeInContext(gallivm->context), 4);
 		txf_emit_data.args[0] = lp_build_gather_values(gallivm, txf_address, txf_count);
-		txf_emit_data.args[1] = si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index];
+		txf_emit_data.args[1] = fmask_ptr;
 		txf_emit_data.args[2] = lp_build_const_int32(gallivm, inst.Texture.Texture);
 		txf_emit_data.arg_count = 3;
 
@@ -2524,8 +2534,7 @@ static void tex_fetch_args(
 		 * resource descriptor is 0 (invalid),
 		 */
 		LLVMValueRef fmask_desc =
-			LLVMBuildBitCast(gallivm->builder,
-					 si_shader_ctx->resources[SI_FMASK_TEX_OFFSET + sampler_index],
+			LLVMBuildBitCast(gallivm->builder, fmask_ptr,
 					 LLVMVectorType(uint_bld->elem_type, 8), "");
 
 		LLVMValueRef fmask_word1 =
@@ -3973,7 +3982,7 @@ static void si_dump_key(unsigned shader, union si_shader_key *key)
 			fprintf(stderr, "  es_enabled_outputs = 0x%"PRIx64"\n",
 				key->vs.es_enabled_outputs);
 		fprintf(stderr, "  as_es = %u\n", key->vs.as_es);
-		fprintf(stderr, "  as_es = %u\n", key->vs.as_ls);
+		fprintf(stderr, "  as_ls = %u\n", key->vs.as_ls);
 		break;
 
 	case PIPE_SHADER_TESS_CTRL:
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 654c46f3c0d..3a63af8f2b0 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -270,6 +270,7 @@ struct vc4_context {
 
         struct ra_regs *regs;
         unsigned int reg_class_any;
+        unsigned int reg_class_a_or_b_or_acc;
         unsigned int reg_class_r4_or_a;
         unsigned int reg_class_a;
 
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index 7978ea1829f..5b435832b92 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -143,15 +143,6 @@ qir_opt_algebraic(struct vc4_compile *c)
                 case QOP_SEL_X_Y_ZC:
                 case QOP_SEL_X_Y_NS:
                 case QOP_SEL_X_Y_NC:
-                        if (qir_reg_equals(inst->src[0], inst->src[1])) {
-                                /* Turn "dst = (sf == x) ? a : a)" into
-                                 * "dst = a"
-                                 */
-                                replace_with_mov(c, inst, inst->src[1]);
-                                progress = true;
-                                break;
-                        }
-
                         if (is_zero(c, inst->src[1])) {
                                 /* Replace references to a 0 uniform value
                                  * with the SEL_X_0 equivalent.
@@ -207,6 +198,7 @@ qir_opt_algebraic(struct vc4_compile *c)
 
                         /* FADD(a, FSUB(0, b)) -> FSUB(a, b) */
                         if (inst->src[1].file == QFILE_TEMP &&
+                            c->defs[inst->src[1].index] &&
                             c->defs[inst->src[1].index]->op == QOP_FSUB) {
                                 struct qinst *fsub = c->defs[inst->src[1].index];
                                 if (is_zero(c, fsub->src[0])) {
@@ -221,6 +213,7 @@ qir_opt_algebraic(struct vc4_compile *c)
 
                         /* FADD(FSUB(0, b), a) -> FSUB(a, b) */
                         if (inst->src[0].file == QFILE_TEMP &&
+                            c->defs[inst->src[0].index] &&
                             c->defs[inst->src[0].index]->op == QOP_FSUB) {
                                 struct qinst *fsub = c->defs[inst->src[0].index];
                                 if (is_zero(c, fsub->src[0])) {
@@ -236,18 +229,20 @@ qir_opt_algebraic(struct vc4_compile *c)
                         break;
 
                 case QOP_FMUL:
-                        if (replace_x_0_with_0(c, inst, 0) ||
-                            replace_x_0_with_0(c, inst, 1) ||
-                            fmul_replace_one(c, inst, 0) ||
-                            fmul_replace_one(c, inst, 1)) {
+                        if (!inst->dst.pack &&
+                            (replace_x_0_with_0(c, inst, 0) ||
+                             replace_x_0_with_0(c, inst, 1) ||
+                             fmul_replace_one(c, inst, 0) ||
+                             fmul_replace_one(c, inst, 1))) {
                                 progress = true;
                                 break;
                         }
                         break;
 
                 case QOP_MUL24:
-                        if (replace_x_0_with_0(c, inst, 0) ||
-                            replace_x_0_with_0(c, inst, 1)) {
+                        if (!inst->dst.pack &&
+                            (replace_x_0_with_0(c, inst, 0) ||
+                             replace_x_0_with_0(c, inst, 1))) {
                                 progress = true;
                                 break;
                         }
@@ -280,6 +275,14 @@ qir_opt_algebraic(struct vc4_compile *c)
                         }
                         break;
 
+                case QOP_RCP:
+                        if (is_1f(c, inst->src[0])) {
+                                replace_with_mov(c, inst, inst->src[0]);
+                                progress = true;
+                                break;
+                        }
+                        break;
+
                 default:
                         break;
                 }
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index a755de9aa41..fd2539aed95 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -39,21 +39,27 @@ qir_opt_copy_propagation(struct vc4_compile *c)
 {
         bool progress = false;
         bool debug = false;
-        struct qreg *movs = calloc(c->num_temps, sizeof(struct qreg));
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                         int index = inst->src[i].index;
                         if (inst->src[i].file == QFILE_TEMP &&
-                            (movs[index].file == QFILE_TEMP ||
-                             movs[index].file == QFILE_UNIF)) {
+                            c->defs[index] &&
+                            c->defs[index]->op == QOP_MOV &&
+                            (c->defs[index]->src[0].file == QFILE_TEMP ||
+                             c->defs[index]->src[0].file == QFILE_UNIF)) {
+                                /* If it has a pack, it shouldn't be an SSA
+                                 * def.
+                                 */
+                                assert(!c->defs[index]->dst.pack);
+
                                 if (debug) {
                                         fprintf(stderr, "Copy propagate: ");
                                         qir_dump_inst(c, inst);
                                         fprintf(stderr, "\n");
                                 }
 
-                                inst->src[i] = movs[index];
+                                inst->src[i] = c->defs[index]->src[0];
 
                                 if (debug) {
                                         fprintf(stderr, "to: ");
@@ -64,14 +70,6 @@ qir_opt_copy_propagation(struct vc4_compile *c)
                                 progress = true;
                         }
                 }
-
-                if (inst->op == QOP_MOV &&
-                    inst->dst.file == QFILE_TEMP &&
-                    inst->src[0].file != QFILE_VPM) {
-                        movs[inst->dst.index] = inst->src[0];
-                }
         }
-
-        free(movs);
         return progress;
 }
diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
index e04f02859d5..f2cdf8f694f 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
@@ -68,7 +68,7 @@ qir_opt_vpm_writes(struct vc4_compile *c)
                         continue;
 
                 struct qinst *inst = c->defs[temp];
-                if (qir_is_multi_instruction(inst))
+                if (!inst || qir_is_multi_instruction(inst))
                         continue;
 
                 if (qir_depends_on_flags(inst) || inst->sf)
@@ -79,22 +79,6 @@ qir_opt_vpm_writes(struct vc4_compile *c)
                         continue;
                 }
 
-                /* A QOP_TEX_RESULT destination is r4, so we can't move
-                 * accesses to it past another QOP_TEX_RESULT which would
-                 * update it.
-                 */
-                int src;
-                for (src = 0; src < qir_get_op_nsrc(inst->op); src++) {
-                        if (inst->src[src].file == QFILE_TEMP) {
-                                if (c->defs[inst->src[src].index]->op ==
-                                    QOP_TEX_RESULT) {
-                                        break;
-                                }
-                        }
-                }
-                if (src != qir_get_op_nsrc(inst->op))
-                        continue;
-
                 /* Move the generating instruction to the end of the program
                  * to maintain the order of the VPM writes.
                  */
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 13c472152d8..e002983fdbb 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -818,6 +818,72 @@ declare_uniform_range(struct vc4_compile *c, uint32_t start, uint32_t size)
         c->ubo_ranges[array_id].used = false;
 }
 
+static bool
+ntq_src_is_only_ssa_def_user(nir_src *src)
+{
+        if (!src->is_ssa)
+                return false;
+
+        if (!list_empty(&src->ssa->if_uses))
+                return false;
+
+        return (src->ssa->uses.next == &src->use_link &&
+                src->ssa->uses.next->next == &src->ssa->uses);
+}
+
+/**
+ * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
+ * bit set.
+ *
+ * However, as an optimization, it tries to find the instructions generating
+ * the sources to be packed and just emit the pack flag there, if possible.
+ */
+static void
+ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
+{
+        struct qreg result = qir_get_temp(c);
+        struct nir_alu_instr *vec4 = NULL;
+
+        /* If packing from a vec4 op (as expected), identify it so that we can
+         * peek back at what generated its sources.
+         */
+        if (instr->src[0].src.is_ssa &&
+            instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
+            nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
+            nir_op_vec4) {
+                vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
+        }
+
+        for (int i = 0; i < 4; i++) {
+                int swiz = instr->src[0].swizzle[i];
+                struct qreg src;
+                if (vec4) {
+                        src = ntq_get_src(c, vec4->src[swiz].src,
+                                          vec4->src[swiz].swizzle[0]);
+                } else {
+                        src = ntq_get_src(c, instr->src[0].src, swiz);
+                }
+
+                if (vec4 &&
+                    ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
+                    src.file == QFILE_TEMP &&
+                    c->defs[src.index] &&
+                    qir_is_mul(c->defs[src.index]) &&
+                    !c->defs[src.index]->dst.pack) {
+                        struct qinst *rewrite = c->defs[src.index];
+                        c->defs[src.index] = NULL;
+                        rewrite->dst = result;
+                        rewrite->dst.pack = QPU_PACK_MUL_8A + i;
+                        continue;
+                }
+
+                qir_PACK_8_F(c, result, src, i);
+        }
+
+        struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+        *dest = result;
+}
+
 static void
 ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
 {
@@ -839,17 +905,7 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
         }
 
         if (instr->op == nir_op_pack_unorm_4x8) {
-                struct qreg result;
-                for (int i = 0; i < 4; i++) {
-                        struct qreg src = ntq_get_src(c, instr->src[0].src,
-                                                      instr->src[0].swizzle[i]);
-                        if (i == 0)
-                                result = qir_PACK_8888_F(c, src);
-                        else
-                                result = qir_PACK_8_F(c, result, src, i);
-                }
-                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
-                *dest = result;
+                ntq_emit_pack_unorm_4x8(c, instr);
                 return;
         }
 
@@ -1130,20 +1186,24 @@ emit_frag_end(struct vc4_compile *c)
 static void
 emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
 {
-        struct qreg xyi[2];
+        struct qreg packed = qir_get_temp(c);
 
         for (int i = 0; i < 2; i++) {
                 struct qreg scale =
                         qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);
 
-                xyi[i] = qir_FTOI(c, qir_FMUL(c,
-                                              qir_FMUL(c,
-                                                       c->outputs[c->output_position_index + i],
-                                                       scale),
-                                              rcp_w));
+                struct qreg packed_chan = packed;
+                packed_chan.pack = QPU_PACK_A_16A + i;
+
+                qir_FTOI_dest(c, packed_chan,
+                              qir_FMUL(c,
+                                       qir_FMUL(c,
+                                                c->outputs[c->output_position_index + i],
+                                                scale),
+                                       rcp_w));
         }
 
-        qir_VPM_WRITE(c, qir_PACK_SCALED(c, xyi[0], xyi[1]));
+        qir_VPM_WRITE(c, packed);
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 254140a72f5..9d930715f9b 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -71,12 +71,11 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_RSQ] = { "rsq", 1, 1, false, true },
         [QOP_EXP2] = { "exp2", 1, 2, false, true },
         [QOP_LOG2] = { "log2", 1, 2, false, true },
-        [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1, false, true },
-        [QOP_PACK_8A_F] = { "pack_8a_f", 1, 2, false, true },
-        [QOP_PACK_8B_F] = { "pack_8b_f", 1, 2, false, true },
-        [QOP_PACK_8C_F] = { "pack_8c_f", 1, 2, false, true },
-        [QOP_PACK_8D_F] = { "pack_8d_f", 1, 2, false, true },
-        [QOP_PACK_SCALED] = { "pack_scaled", 1, 2, false, true },
+        [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1 },
+        [QOP_PACK_8A_F] = { "pack_8a_f", 1, 1 },
+        [QOP_PACK_8B_F] = { "pack_8b_f", 1, 1 },
+        [QOP_PACK_8C_F] = { "pack_8c_f", 1, 1 },
+        [QOP_PACK_8D_F] = { "pack_8d_f", 1, 1 },
         [QOP_TLB_DISCARD_SETUP] = { "discard", 0, 1, true },
         [QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true },
         [QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true },
@@ -169,6 +168,18 @@ qir_is_multi_instruction(struct qinst *inst)
 }
 
 bool
+qir_is_mul(struct qinst *inst)
+{
+        switch (inst->op) {
+        case QOP_FMUL:
+        case QOP_MUL24:
+                return true;
+        default:
+                return false;
+        }
+}
+
+bool
 qir_is_tex(struct qinst *inst)
 {
         return inst->op >= QOP_TEX_S && inst->op <= QOP_TEX_DIRECT;
@@ -273,6 +284,14 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
                 inst->sf ? ".sf" : "");
 
         qir_print_reg(c, inst->dst, true);
+        if (inst->dst.pack) {
+                if (inst->dst.pack) {
+                        if (qir_is_mul(inst))
+                                vc4_qpu_disasm_pack_mul(stderr, inst->dst.pack);
+                        else
+                                vc4_qpu_disasm_pack_a(stderr, inst->dst.pack);
+                }
+        }
         for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                 fprintf(stderr, ", ");
                 qir_print_reg(c, inst->src[i], false);
@@ -348,7 +367,7 @@ qir_emit(struct vc4_compile *c, struct qinst *inst)
         if (inst->dst.file == QFILE_TEMP)
                 c->defs[inst->dst.index] = inst;
 
-        list_addtail(&inst->link, &c->instructions);
+        qir_emit_nodef(c, inst);
 }
 
 bool
@@ -389,8 +408,11 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst)
 struct qreg
 qir_follow_movs(struct vc4_compile *c, struct qreg reg)
 {
-        while (reg.file == QFILE_TEMP && c->defs[reg.index]->op == QOP_MOV)
+        while (reg.file == QFILE_TEMP &&
+               c->defs[reg.index] &&
+               c->defs[reg.index]->op == QOP_MOV) {
                 reg = c->defs[reg.index]->src[0];
+        }
 
         return reg;
 }
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index cade795c12a..a2b21fa17bb 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -58,6 +58,7 @@ enum qfile {
 struct qreg {
         enum qfile file;
         uint32_t index;
+        int pack;
 };
 
 enum qop {
@@ -104,7 +105,6 @@ enum qop {
         QOP_LOG2,
         QOP_VW_SETUP,
         QOP_VR_SETUP,
-        QOP_PACK_SCALED,
         QOP_PACK_8888_F,
         QOP_PACK_8A_F,
         QOP_PACK_8B_F,
@@ -444,13 +444,20 @@ struct qreg qir_uniform(struct vc4_compile *c,
                         enum quniform_contents contents,
                         uint32_t data);
 void qir_reorder_uniforms(struct vc4_compile *c);
+
 void qir_emit(struct vc4_compile *c, struct qinst *inst);
+static inline void qir_emit_nodef(struct vc4_compile *c, struct qinst *inst)
+{
+        list_addtail(&inst->link, &c->instructions);
+}
+
 struct qreg qir_get_temp(struct vc4_compile *c);
 int qir_get_op_nsrc(enum qop qop);
 bool qir_reg_equals(struct qreg a, struct qreg b);
 bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
 bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst);
 bool qir_is_multi_instruction(struct qinst *inst);
+bool qir_is_mul(struct qinst *inst);
 bool qir_is_tex(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
@@ -509,6 +516,12 @@ qir_##name(struct vc4_compile *c, struct qreg a)                         \
         struct qreg t = qir_get_temp(c);                                 \
         qir_emit(c, qir_inst(QOP_##name, t, a, c->undef));               \
         return t;                                                        \
+}                                                                        \
+static inline void                                                       \
+qir_##name##_dest(struct vc4_compile *c, struct qreg dest,               \
+                  struct qreg a)                                         \
+{                                                                        \
+        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef));      \
 }
 
 #define QIR_ALU2(name)                                                   \
@@ -518,6 +531,12 @@ qir_##name(struct vc4_compile *c, struct qreg a, struct qreg b)          \
         struct qreg t = qir_get_temp(c);                                 \
         qir_emit(c, qir_inst(QOP_##name, t, a, b));                      \
         return t;                                                        \
+}                                                                        \
+static inline void                                                       \
+qir_##name##_dest(struct vc4_compile *c, struct qreg dest,               \
+                  struct qreg a, struct qreg b)                          \
+{                                                                        \
+        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, b));             \
 }
 
 #define QIR_NODST_1(name)                                               \
@@ -534,6 +553,14 @@ qir_##name(struct vc4_compile *c, struct qreg a, struct qreg b)         \
         qir_emit(c, qir_inst(QOP_##name, c->undef, a, b));       \
 }
 
+#define QIR_PACK(name)                                                   \
+static inline struct qreg                                                \
+qir_##name(struct vc4_compile *c, struct qreg dest, struct qreg a)       \
+{                                                                        \
+        qir_emit_nodef(c, qir_inst(QOP_##name, dest, a, c->undef));      \
+        return dest;                                                     \
+}
+
 QIR_ALU1(MOV)
 QIR_ALU2(FADD)
 QIR_ALU2(FSUB)
@@ -570,12 +597,11 @@ QIR_ALU1(RCP)
 QIR_ALU1(RSQ)
 QIR_ALU1(EXP2)
 QIR_ALU1(LOG2)
-QIR_ALU2(PACK_SCALED)
 QIR_ALU1(PACK_8888_F)
-QIR_ALU2(PACK_8A_F)
-QIR_ALU2(PACK_8B_F)
-QIR_ALU2(PACK_8C_F)
-QIR_ALU2(PACK_8D_F)
+QIR_PACK(PACK_8A_F)
+QIR_PACK(PACK_8B_F)
+QIR_PACK(PACK_8C_F)
+QIR_PACK(PACK_8D_F)
 QIR_ALU1(VARY_ADD_C)
 QIR_NODST_2(TEX_S)
 QIR_NODST_2(TEX_T)
@@ -627,11 +653,12 @@ qir_UNPACK_16_I(struct vc4_compile *c, struct qreg src, int i)
 }
 
 static inline struct qreg
-qir_PACK_8_F(struct vc4_compile *c, struct qreg rest, struct qreg val, int chan)
+qir_PACK_8_F(struct vc4_compile *c, struct qreg dest, struct qreg val, int chan)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, t, rest, val));
-        return t;
+        qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, dest, val, c->undef));
+        if (dest.file == QFILE_TEMP)
+                c->defs[dest.index] = NULL;
+        return dest;
 }
 
 static inline struct qreg
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index fbb90ba12a0..0719d2828b5 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -24,6 +24,7 @@
 #ifndef VC4_QPU_H
 #define VC4_QPU_H
 
+#include <stdio.h>
 #include <stdint.h>
 
 #include "util/u_math.h"
@@ -206,6 +207,12 @@ void
 vc4_qpu_disasm(const uint64_t *instructions, int num_instructions);
 
 void
+vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack);
+
+void
+vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack);
+
+void
 vc4_qpu_validate(uint64_t *insts, uint32_t num_inst);
 
 #endif /* VC4_QPU_H */
diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
index 00aeb300a9b..0879787ec03 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
@@ -245,6 +245,18 @@ get_special_write_desc(int reg, bool is_a)
         return special_write[reg];
 }
 
+void
+vc4_qpu_disasm_pack_mul(FILE *out, uint32_t pack)
+{
+        fprintf(out, ".%s", DESC(qpu_pack_mul, pack));
+}
+
+void
+vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack)
+{
+        fprintf(out, "%s", DESC(qpu_pack_a, pack));
+}
+
 static void
 print_alu_dst(uint64_t inst, bool is_mul)
 {
@@ -263,9 +275,9 @@ print_alu_dst(uint64_t inst, bool is_mul)
                 fprintf(stderr, "%s%d?", file, waddr);
 
         if (is_mul && (inst & QPU_PM)) {
-                fprintf(stderr, ".%s", DESC(qpu_pack_mul, pack));
+                vc4_qpu_disasm_pack_mul(stderr, pack);
         } else if (is_a && !(inst & QPU_PM)) {
-                fprintf(stderr, "%s", DESC(qpu_pack_a, pack));
+                vc4_qpu_disasm_pack_a(stderr, pack);
         }
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index f324056258c..adf3a8b3658 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -179,10 +179,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                 static const struct {
                         uint32_t op;
-                        bool is_mul;
                 } translate[] = {
-#define A(name) [QOP_##name] = {QPU_A_##name, false}
-#define M(name) [QOP_##name] = {QPU_M_##name, true}
+#define A(name) [QOP_##name] = {QPU_A_##name}
+#define M(name) [QOP_##name] = {QPU_M_##name}
                         A(FADD),
                         A(FSUB),
                         A(FMIN),
@@ -336,28 +335,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QOP_PACK_8B_F:
                 case QOP_PACK_8C_F:
                 case QOP_PACK_8D_F:
-                        /* If dst doesn't happen to already contain src[0],
-                         * then we have to move it in.
-                         */
-                        if (qinst->src[0].file != QFILE_NULL &&
-                            (src[0].mux != dst.mux || src[0].addr != dst.addr)) {
-                                /* Don't overwrite src1 while setting up
-                                 * the dst!
-                                 */
-                                if (dst.mux == src[1].mux &&
-                                    dst.addr == src[1].addr) {
-                                        queue(c, qpu_m_MOV(qpu_rb(31), src[1]));
-                                        src[1] = qpu_rb(31);
-                                }
-
-                                queue(c, qpu_m_MOV(dst, src[0]));
-                        }
-
-                        queue(c, qpu_m_MOV(dst, src[1]));
-                        *last_inst(c) |= QPU_PM;
-                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8A +
-                                                       qinst->op - QOP_PACK_8A_F,
-                                                       QPU_PACK);
+                        queue(c,
+                              qpu_m_MOV(dst, src[0]) |
+                              QPU_PM |
+                              QPU_SET_FIELD(QPU_PACK_MUL_8A +
+                                            qinst->op - QOP_PACK_8A_F,
+                                            QPU_PACK));
                         break;
 
                 case QOP_FRAG_X:
@@ -419,24 +402,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
                         break;
 
-                case QOP_PACK_SCALED: {
-                        uint64_t a = (qpu_a_MOV(dst, src[0]) |
-                                      QPU_SET_FIELD(QPU_PACK_A_16A,
-                                                    QPU_PACK));
-                        uint64_t b = (qpu_a_MOV(dst, src[1]) |
-                                      QPU_SET_FIELD(QPU_PACK_A_16B,
-                                                    QPU_PACK));
-
-                        if (dst.mux == src[1].mux && dst.addr == src[1].addr) {
-                                queue(c, b);
-                                queue(c, a);
-                        } else {
-                                queue(c, a);
-                                queue(c, b);
-                        }
-                        break;
-                }
-
                 case QOP_TEX_S:
                 case QOP_TEX_T:
                 case QOP_TEX_R:
@@ -529,14 +494,24 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         fixup_raddr_conflict(c, dst, &src[0], &src[1]);
 
-                        if (translate[qinst->op].is_mul) {
+                        if (qir_is_mul(qinst)) {
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]));
+                                if (qinst->dst.pack) {
+                                        *last_inst(c) |= QPU_PM;
+                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
+                                                                       QPU_PACK);
+                                }
                         } else {
                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                     dst,
                                                     src[0], src[1]));
+                                if (qinst->dst.pack) {
+                                        assert(dst.mux == QPU_MUX_A);
+                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
+                                                                       QPU_PACK);
+                                }
                         }
 
                         break;
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index a29db1f3abe..3ced50f3a44 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -113,9 +113,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
         if (vc4->regs)
                 return;
 
-        vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs));
+        vc4->regs = ra_alloc_reg_set(vc4, ARRAY_SIZE(vc4_regs), true);
 
         vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
+        vc4->reg_class_a_or_b_or_acc = ra_alloc_reg_class(vc4->regs);
         vc4->reg_class_r4_or_a = ra_alloc_reg_class(vc4->regs);
         vc4->reg_class_a = ra_alloc_reg_class(vc4->regs);
         for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
@@ -130,10 +131,12 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
                  */
                 if (vc4_regs[i].mux == QPU_MUX_R4) {
                         ra_class_add_reg(vc4->regs, vc4->reg_class_r4_or_a, i);
+                        ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
                         continue;
                 }
 
                 ra_class_add_reg(vc4->regs, vc4->reg_class_any, i);
+                ra_class_add_reg(vc4->regs, vc4->reg_class_a_or_b_or_acc, i);
         }
 
         for (uint32_t i = AB_INDEX; i < AB_INDEX + 64; i += 2) {
@@ -177,7 +180,8 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         uint8_t class_bits[c->num_temps];
         struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                 sizeof(*temp_registers));
-        memset(def, 0, sizeof(def));
+        for (int i = 0; i < ARRAY_SIZE(def); i++)
+                def[i] = ~0;
         memset(use, 0, sizeof(use));
 
         /* If things aren't ever written (undefined values), just read from
@@ -196,7 +200,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t ip = 0;
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 if (inst->dst.file == QFILE_TEMP) {
-                        def[inst->dst.index] = ip;
+                        def[inst->dst.index] = MIN2(ip, def[inst->dst.index]);
                         use[inst->dst.index] = ip;
                 }
 
@@ -267,17 +271,33 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                                         AB_INDEX + QPU_R_FRAG_PAYLOAD_ZW * 2);
                         break;
 
-                case QOP_PACK_SCALED:
-                        /* The pack flags require an A-file dst register. */
-                        class_bits[inst->dst.index] &= CLASS_BIT_A;
-                        break;
-
                 default:
                         break;
                 }
 
+                if (inst->dst.pack && !qir_is_mul(inst)) {
+                        /* The non-MUL pack flags require an A-file dst
+                         * register.
+                         */
+                        class_bits[inst->dst.index] &= CLASS_BIT_A;
+                }
+
                 if (qir_src_needs_a_file(inst)) {
-                        class_bits[inst->src[0].index] &= CLASS_BIT_A;
+                        switch (inst->op) {
+                        case QOP_UNPACK_8A_F:
+                        case QOP_UNPACK_8B_F:
+                        case QOP_UNPACK_8C_F:
+                        case QOP_UNPACK_8D_F:
+                                /* Special case: these can be done as R4
+                                 * unpacks, as well.
+                                 */
+                                class_bits[inst->src[0].index] &= (CLASS_BIT_A |
+                                                                   CLASS_BIT_R4);
+                                break;
+                        default:
+                                class_bits[inst->src[0].index] &= CLASS_BIT_A;
+                                break;
+                        }
                 }
                 ip++;
         }
@@ -287,9 +307,11 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
 
                 switch (class_bits[i]) {
                 case CLASS_BIT_A | CLASS_BIT_B_OR_ACC | CLASS_BIT_R4:
-                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
                         ra_set_node_class(g, node, vc4->reg_class_any);
                         break;
+                case CLASS_BIT_A | CLASS_BIT_B_OR_ACC:
+                        ra_set_node_class(g, node, vc4->reg_class_a_or_b_or_acc);
+                        break;
                 case CLASS_BIT_A | CLASS_BIT_R4:
                         ra_set_node_class(g, node, vc4->reg_class_r4_or_a);
                         break;
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 1e493f47ccf..266ebbafe36 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -663,6 +663,7 @@ struct pipe_blit_info
 
    boolean render_condition_enable; /**< whether the blit should honor the
                                     current render condition */
+   boolean alpha_blend; /* dst.rgb = src.rgb * src.a + dst.rgb * (1 - src.a) */
 };
 
 
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 63c3f8ee49b..7c23a27150b 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -346,6 +346,15 @@ namespace {
 
    // Kernel metadata
 
+   struct kernel_arg_md {
+      llvm::StringRef type_name;
+      llvm::StringRef access_qual;
+      kernel_arg_md(llvm::StringRef type_name_, llvm::StringRef access_qual_):
+         type_name(type_name_), access_qual(access_qual_) {}
+   };
+
+#if HAVE_LLVM >= 0x0306
+
    const llvm::MDNode *
    get_kernel_metadata(const llvm::Function *kernel_func) {
       auto mod = kernel_func->getParent();
@@ -356,12 +365,8 @@ namespace {
 
       const llvm::MDNode *kernel_node = nullptr;
       for (unsigned i = 0; i < kernels_node->getNumOperands(); ++i) {
-#if HAVE_LLVM >= 0x0306
          auto func = llvm::mdconst::dyn_extract<llvm::Function>(
-#else
-         auto func = llvm::dyn_cast<llvm::Function>(
-#endif
-                                    kernels_node->getOperand(i)->getOperand(0));
+               kernels_node->getOperand(i)->getOperand(0));
          if (func == kernel_func) {
             kernel_node = kernels_node->getOperand(i);
             break;
@@ -387,13 +392,6 @@ namespace {
       return node;
    }
 
-   struct kernel_arg_md {
-      llvm::StringRef type_name;
-      llvm::StringRef access_qual;
-      kernel_arg_md(llvm::StringRef type_name_, llvm::StringRef access_qual_):
-         type_name(type_name_), access_qual(access_qual_) {}
-   };
-
    std::vector<kernel_arg_md>
    get_kernel_arg_md(const llvm::Function *kernel_func) {
       auto num_args = kernel_func->getArgumentList().size();
@@ -415,6 +413,17 @@ namespace {
       return res;
    }
 
+#else
+
+   std::vector<kernel_arg_md>
+   get_kernel_arg_md(const llvm::Function *kernel_func) {
+      return std::vector<kernel_arg_md>(
+            kernel_func->getArgumentList().size(),
+            kernel_arg_md("", ""));
+   }
+
+#endif // HAVE_LLVM >= 0x0306
+
    std::vector<module::argument>
    get_kernel_args(const llvm::Module *mod, const std::string &kernel_name,
                    const clang::LangAS::Map &address_spaces) {
diff --git a/src/gallium/state_trackers/nine/adapter9.c b/src/gallium/state_trackers/nine/adapter9.c
index c5ffcb15a18..69e0fa25961 100644
--- a/src/gallium/state_trackers/nine/adapter9.c
+++ b/src/gallium/state_trackers/nine/adapter9.c
@@ -545,7 +545,7 @@ NineAdapter9_GetDeviceCaps( struct NineAdapter9 *This,
                      /*D3DDEVCAPS_RTPATCHES |*/
                      /*D3DDEVCAPS_RTPATCHHANDLEZERO |*/
                      /*D3DDEVCAPS_SEPARATETEXTUREMEMORIES |*/
-                     /*D3DDEVCAPS_TEXTURENONLOCALVIDMEM |*/
+                     D3DDEVCAPS_TEXTURENONLOCALVIDMEM |
                      /* D3DDEVCAPS_TEXTURESYSTEMMEMORY |*/
                      D3DDEVCAPS_TEXTUREVIDEOMEMORY |
                      D3DDEVCAPS_TLVERTEXSYSTEMMEMORY |
@@ -561,32 +561,32 @@ NineAdapter9_GetDeviceCaps( struct NineAdapter9 *This,
                                D3DPMISCCAPS_TSSARGTEMP |
                                D3DPMISCCAPS_BLENDOP |
                                D3DPIPECAP(INDEP_BLEND_ENABLE, D3DPMISCCAPS_INDEPENDENTWRITEMASKS) |
-                               /*D3DPMISCCAPS_PERSTAGECONSTANT |*/
+                               /*D3DPMISCCAPS_PERSTAGECONSTANT |*/ /* TODO */
                                /*D3DPMISCCAPS_POSTBLENDSRGBCONVERT |*/ /* TODO */
                                D3DPMISCCAPS_FOGANDSPECULARALPHA |
                                D3DPIPECAP(BLEND_EQUATION_SEPARATE, D3DPMISCCAPS_SEPARATEALPHABLEND) |
                                D3DPIPECAP(MIXED_COLORBUFFER_FORMATS, D3DPMISCCAPS_MRTINDEPENDENTBITDEPTHS) |
                                D3DPMISCCAPS_MRTPOSTPIXELSHADERBLENDING |
-                               /*D3DPMISCCAPS_FOGVERTEXCLAMPED*/0;
+                               D3DPMISCCAPS_FOGVERTEXCLAMPED;
     if (!screen->get_param(screen, PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION))
         pCaps->PrimitiveMiscCaps |= D3DPMISCCAPS_CLIPTLVERTS;
 
     pCaps->RasterCaps =
         D3DPIPECAP(ANISOTROPIC_FILTER, D3DPRASTERCAPS_ANISOTROPY) |
-        /*D3DPRASTERCAPS_COLORPERSPECTIVE |*/
+        D3DPRASTERCAPS_COLORPERSPECTIVE |
         D3DPRASTERCAPS_DITHER |
         D3DPRASTERCAPS_DEPTHBIAS |
-        /*D3DPRASTERCAPS_FOGRANGE |*/
-        /*D3DPRASTERCAPS_FOGTABLE |*/
-        /*D3DPRASTERCAPS_FOGVERTEX |*/
+        D3DPRASTERCAPS_FOGRANGE |
+        D3DPRASTERCAPS_FOGTABLE |
+        D3DPRASTERCAPS_FOGVERTEX |
         D3DPRASTERCAPS_MIPMAPLODBIAS |
         D3DPRASTERCAPS_MULTISAMPLE_TOGGLE |
         D3DPRASTERCAPS_SCISSORTEST |
         D3DPRASTERCAPS_SLOPESCALEDEPTHBIAS |
         /*D3DPRASTERCAPS_WBUFFER |*/
-        /*D3DPRASTERCAPS_WFOG |*/
+        D3DPRASTERCAPS_WFOG |
         /*D3DPRASTERCAPS_ZBUFFERLESSHSR |*/
-        /*D3DPRASTERCAPS_ZFOG |*/
+        D3DPRASTERCAPS_ZFOG |
         D3DPRASTERCAPS_ZTEST;
 
     pCaps->ZCmpCaps = D3DPCMPCAPS_NEVER |
@@ -697,15 +697,12 @@ NineAdapter9_GetDeviceCaps( struct NineAdapter9 *This,
     pCaps->MaxAnisotropy =
         (DWORD)screen->get_paramf(screen, PIPE_CAPF_MAX_TEXTURE_ANISOTROPY);
 
-    pCaps->MaxVertexW = 1.0f; /* XXX */
-    pCaps->GuardBandLeft = screen->get_paramf(screen,
-                                              PIPE_CAPF_GUARD_BAND_LEFT);
-    pCaps->GuardBandTop = screen->get_paramf(screen,
-                                             PIPE_CAPF_GUARD_BAND_TOP);
-    pCaps->GuardBandRight = screen->get_paramf(screen,
-                                               PIPE_CAPF_GUARD_BAND_RIGHT);
-    pCaps->GuardBandBottom = screen->get_paramf(screen,
-                                                PIPE_CAPF_GUARD_BAND_BOTTOM);
+    /* Values for GeForce 9600 GT */
+    pCaps->MaxVertexW = 1e10f;
+    pCaps->GuardBandLeft = -1e9f;
+    pCaps->GuardBandTop = -1e9f;
+    pCaps->GuardBandRight = 1e9f;
+    pCaps->GuardBandBottom = 1e9f;
     pCaps->ExtentsAdjust = 0.0f;
 
     pCaps->StencilCaps =
@@ -724,8 +721,6 @@ NineAdapter9_GetDeviceCaps( struct NineAdapter9 *This,
         /*D3DFVFCAPS_DONOTSTRIPELEMENTS |*/
         D3DFVFCAPS_PSIZE;
 
-    /* XXX: Some of these are probably not in SM2.0 so cap them when I figure
-     * them out. For now leave them all enabled. */
     pCaps->TextureOpCaps = D3DTEXOPCAPS_DISABLE |
                            D3DTEXOPCAPS_SELECTARG1 |
                            D3DTEXOPCAPS_SELECTARG2 |
@@ -796,7 +791,8 @@ NineAdapter9_GetDeviceCaps( struct NineAdapter9 *This,
     pCaps->MaxVertexShaderConst = NINE_MAX_CONST_F;
 
     pCaps->PixelShaderVersion = D3DPS_VERSION(3,0);
-    pCaps->PixelShader1xMaxValue = 8.0f; /* XXX: wine */
+    /* Value for GeForce 9600 GT */
+    pCaps->PixelShader1xMaxValue = 65504.f;
 
     pCaps->DevCaps2 = D3DDEVCAPS2_STREAMOFFSET |
                       D3DDEVCAPS2_VERTEXELEMENTSCANSHARESTREAMOFFSET |
diff --git a/src/gallium/state_trackers/nine/basetexture9.c b/src/gallium/state_trackers/nine/basetexture9.c
index 17a8f448ee6..d13138b7d5c 100644
--- a/src/gallium/state_trackers/nine/basetexture9.c
+++ b/src/gallium/state_trackers/nine/basetexture9.c
@@ -57,7 +57,8 @@ NineBaseTexture9_ctor( struct NineBaseTexture9 *This,
     user_assert(!(Usage & (D3DUSAGE_RENDERTARGET | D3DUSAGE_DEPTHSTENCIL)) ||
                 Pool == D3DPOOL_DEFAULT, D3DERR_INVALIDCALL);
     user_assert(!(Usage & D3DUSAGE_DYNAMIC) ||
-                Pool != D3DPOOL_MANAGED, D3DERR_INVALIDCALL);
+                !(Pool == D3DPOOL_MANAGED ||
+                  Pool == D3DPOOL_SCRATCH), D3DERR_INVALIDCALL);
 
     hr = NineResource9_ctor(&This->base, pParams, initResource, alloc, Type, Pool, Usage);
     if (FAILED(hr))
@@ -85,6 +86,9 @@ NineBaseTexture9_ctor( struct NineBaseTexture9 *This,
                    util_format_has_depth(util_format_description(This->base.info.format));
 
     list_inithead(&This->list);
+    list_inithead(&This->list2);
+    if (Pool == D3DPOOL_MANAGED)
+        list_add(&This->list2, &This->base.base.device->managed_textures);
 
     return D3D_OK;
 }
@@ -98,7 +102,9 @@ NineBaseTexture9_dtor( struct NineBaseTexture9 *This )
     pipe_sampler_view_reference(&This->view[1], NULL);
 
     if (This->list.prev != NULL && This->list.next != NULL)
-        list_del(&This->list),
+        list_del(&This->list);
+    if (This->list2.prev != NULL && This->list2.next != NULL)
+        list_del(&This->list2);
 
     NineResource9_dtor(&This->base);
 }
@@ -153,6 +159,8 @@ NineBaseTexture9_SetAutoGenFilterType( struct NineBaseTexture9 *This,
     user_assert(FilterType != D3DTEXF_NONE, D3DERR_INVALIDCALL);
 
     This->mipfilter = FilterType;
+    This->dirty_mip = TRUE;
+    NineBaseTexture9_GenerateMipSubLevels(This);
 
     return D3D_OK;
 }
@@ -310,14 +318,12 @@ NineBaseTexture9_UploadSelf( struct NineBaseTexture9 *This )
                 tex->dirty_box.width, tex->dirty_box.height, tex->dirty_box.depth);
 
             if (tex->dirty_box.width) {
-                for (l = 0; l <= last_level; ++l) {
+                for (l = min_level_dirty; l <= last_level; ++l) {
                     u_box_minify_2d(&box, &tex->dirty_box, l);
-                    NineVolume9_AddDirtyRegion(tex->volumes[l], &tex->dirty_box);
+                    NineVolume9_UploadSelf(tex->volumes[l], &box);
                 }
                 memset(&tex->dirty_box, 0, sizeof(tex->dirty_box));
             }
-            for (l = min_level_dirty; l <= last_level; ++l)
-                NineVolume9_UploadSelf(tex->volumes[l]);
         } else {
             assert(!"invalid texture type");
         }
@@ -361,8 +367,7 @@ NineBaseTexture9_UploadSelf( struct NineBaseTexture9 *This )
                 box.width = u_minify(This->base.info.width0, l);
                 box.height = u_minify(This->base.info.height0, l);
                 box.depth = u_minify(This->base.info.depth0, l);
-                NineVolume9_AddDirtyRegion(tex->volumes[l], &box);
-                NineVolume9_UploadSelf(tex->volumes[l]);
+                NineVolume9_UploadSelf(tex->volumes[l], &box);
             }
         } else {
             assert(!"invalid texture type");
@@ -381,8 +386,7 @@ NineBaseTexture9_UploadSelf( struct NineBaseTexture9 *This )
 void WINAPI
 NineBaseTexture9_GenerateMipSubLevels( struct NineBaseTexture9 *This )
 {
-    struct pipe_resource *resource = This->base.resource;
-
+    struct pipe_resource *resource;
     unsigned base_level = 0;
     unsigned last_level = This->base.info.last_level - This->managed.lod;
     unsigned first_layer = 0;
@@ -405,6 +409,8 @@ NineBaseTexture9_GenerateMipSubLevels( struct NineBaseTexture9 *This )
 
     last_layer = util_max_layer(This->view[0]->texture, base_level);
 
+    resource = This->base.resource;
+
     util_gen_mipmap(This->pipe, resource,
                     resource->format, base_level, last_level,
                     first_layer, last_layer, filter);
@@ -530,6 +536,11 @@ NineBaseTexture9_UpdateSamplerView( struct NineBaseTexture9 *This,
             swizzle[2] = PIPE_SWIZZLE_RED;
             swizzle[3] = PIPE_SWIZZLE_RED;
         }
+    } else if (resource->format == PIPE_FORMAT_RGTC2_UNORM) {
+        swizzle[0] = PIPE_SWIZZLE_GREEN;
+        swizzle[1] = PIPE_SWIZZLE_RED;
+        swizzle[2] = PIPE_SWIZZLE_ONE;
+        swizzle[3] = PIPE_SWIZZLE_ONE;
     } else if (resource->format != PIPE_FORMAT_A8_UNORM &&
                resource->format != PIPE_FORMAT_RGTC1_UNORM) {
         /* exceptions:
@@ -578,6 +589,21 @@ NineBaseTexture9_PreLoad( struct NineBaseTexture9 *This )
         NineBaseTexture9_UploadSelf(This);
 }
 
+void
+NineBaseTexture9_UnLoad( struct NineBaseTexture9 *This )
+{
+    if (This->base.pool != D3DPOOL_MANAGED ||
+        This->managed.lod_resident == -1)
+        return;
+
+    pipe_resource_reference(&This->base.resource, NULL);
+    This->managed.lod_resident = -1;
+    This->managed.dirty = TRUE;
+
+    /* If the texture is bound, we have to re-upload it */
+    BASETEX_REGISTER_UPDATE(This);
+}
+
 #ifdef DEBUG
 void
 NineBaseTexture9_Dump( struct NineBaseTexture9 *This )
diff --git a/src/gallium/state_trackers/nine/basetexture9.h b/src/gallium/state_trackers/nine/basetexture9.h
index 9d6fb0c002a..b19a62195fc 100644
--- a/src/gallium/state_trackers/nine/basetexture9.h
+++ b/src/gallium/state_trackers/nine/basetexture9.h
@@ -30,7 +30,8 @@
 struct NineBaseTexture9
 {
     struct NineResource9 base;
-    struct list_head list;
+    struct list_head list; /* for update_textures */
+    struct list_head list2; /* for managed_textures */
 
     /* g3d */
     struct pipe_context *pipe;
@@ -94,6 +95,9 @@ NineBaseTexture9_GenerateMipSubLevels( struct NineBaseTexture9 *This );
 void WINAPI
 NineBaseTexture9_PreLoad( struct NineBaseTexture9 *This );
 
+void
+NineBaseTexture9_UnLoad( struct NineBaseTexture9 *This );
+
 /* For D3DPOOL_MANAGED only (after SetLOD change): */
 HRESULT
 NineBaseTexture9_CreatePipeResource( struct NineBaseTexture9 *This,
diff --git a/src/gallium/state_trackers/nine/cubetexture9.c b/src/gallium/state_trackers/nine/cubetexture9.c
index edea1f28a8d..abba2637946 100644
--- a/src/gallium/state_trackers/nine/cubetexture9.c
+++ b/src/gallium/state_trackers/nine/cubetexture9.c
@@ -43,7 +43,7 @@ NineCubeTexture9_ctor( struct NineCubeTexture9 *This,
     struct pipe_screen *screen = pParams->device->screen;
     enum pipe_format pf;
     unsigned i, l, f, offset, face_size = 0;
-    unsigned *level_offsets;
+    unsigned *level_offsets = NULL;
     D3DSURFACE_DESC sfdesc;
     void *p;
     HRESULT hr;
@@ -70,6 +70,13 @@ NineCubeTexture9_ctor( struct NineCubeTexture9 *This,
     if (Format == D3DFMT_ATI1 || Format == D3DFMT_ATI2)
         return D3DERR_INVALIDCALL;
 
+    if (compressed_format(Format)) {
+        const unsigned w = util_format_get_blockwidth(pf);
+        const unsigned h = util_format_get_blockheight(pf);
+
+        user_assert(!(EdgeLength % w) && !(EdgeLength % h), D3DERR_INVALIDCALL);
+    }
+
     info->screen = pParams->device->screen;
     info->target = PIPE_TEXTURE_CUBE;
     info->format = pf;
@@ -106,7 +113,7 @@ NineCubeTexture9_ctor( struct NineCubeTexture9 *This,
         face_size = nine_format_get_size_and_offsets(pf, level_offsets,
                                                      EdgeLength, EdgeLength,
                                                      info->last_level);
-        This->managed_buffer = MALLOC(6 * face_size);
+        This->managed_buffer = align_malloc(6 * face_size, 32);
         if (!This->managed_buffer)
             return E_OUTOFMEMORY;
     }
@@ -150,8 +157,12 @@ NineCubeTexture9_ctor( struct NineCubeTexture9 *This,
         }
     }
 
-    for (i = 0; i < 6; ++i) /* width = 0 means empty, depth stays 1 */
+    for (i = 0; i < 6; ++i) {
+        /* Textures start initially dirty */
+        This->dirty_rect[i].width = EdgeLength;
+        This->dirty_rect[i].height = EdgeLength;
         This->dirty_rect[i].depth = 1;
+    }
 
     return D3D_OK;
 }
@@ -259,13 +270,17 @@ NineCubeTexture9_AddDirtyRect( struct NineCubeTexture9 *This,
     user_assert(FaceType < 6, D3DERR_INVALIDCALL);
 
     if (This->base.base.pool != D3DPOOL_MANAGED) {
-        if (This->base.base.usage & D3DUSAGE_AUTOGENMIPMAP)
+        if (This->base.base.usage & D3DUSAGE_AUTOGENMIPMAP) {
             This->base.dirty_mip = TRUE;
+            BASETEX_REGISTER_UPDATE(&This->base);
+        }
         return D3D_OK;
     }
-    This->base.managed.dirty = TRUE;
 
-    BASETEX_REGISTER_UPDATE(&This->base);
+    if (This->base.base.pool == D3DPOOL_MANAGED) {
+        This->base.managed.dirty = TRUE;
+        BASETEX_REGISTER_UPDATE(&This->base);
+    }
 
     if (!pDirtyRect) {
         u_box_origin_2d(This->base.base.info.width0,
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index 55948cbb67f..99197a4361b 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -119,48 +119,6 @@ NineDevice9_SetDefaultState( struct NineDevice9 *This, boolean is_reset )
             This, (IDirect3DSurface9 *)This->swapchains[0]->zsbuf);
 }
 
-void
-NineDevice9_RestoreNonCSOState( struct NineDevice9 *This, unsigned mask )
-{
-    struct pipe_context *pipe = This->pipe;
-
-    DBG("This=%p mask=%u\n", This, mask);
-
-    if (mask & 0x1) {
-        struct pipe_constant_buffer cb;
-        cb.buffer_offset = 0;
-
-        if (This->prefer_user_constbuf) {
-            cb.buffer = NULL;
-            cb.user_buffer = This->state.vs_const_f;
-        } else {
-            cb.buffer = This->constbuf_vs;
-            cb.user_buffer = NULL;
-        }
-        cb.buffer_size = This->vs_const_size;
-        pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &cb);
-
-        if (This->prefer_user_constbuf) {
-            cb.user_buffer = This->state.ps_const_f;
-        } else {
-            cb.buffer = This->constbuf_ps;
-        }
-        cb.buffer_size = This->ps_const_size;
-        pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &cb);
-    }
-
-    if (mask & 0x2) {
-        struct pipe_poly_stipple stipple;
-        memset(&stipple, ~0, sizeof(stipple));
-        pipe->set_polygon_stipple(pipe, &stipple);
-    }
-
-    This->state.changed.group = NINE_STATE_ALL;
-    This->state.changed.vtxbuf = (1ULL << This->caps.MaxStreams) - 1;
-    This->state.changed.ucp = (1 << PIPE_MAX_CLIP_PLANES) - 1;
-    This->state.changed.texture = NINE_PS_SAMPLERS_MASK | NINE_VS_SAMPLERS_MASK;
-}
-
 #define GET_PCAP(n) pScreen->get_param(pScreen, PIPE_CAP_##n)
 HRESULT
 NineDevice9_ctor( struct NineDevice9 *This,
@@ -186,6 +144,7 @@ NineDevice9_ctor( struct NineDevice9 *This,
     if (FAILED(hr)) { return hr; }
 
     list_inithead(&This->update_textures);
+    list_inithead(&This->managed_textures);
 
     This->screen = pScreen;
     This->caps = *pCaps;
@@ -341,16 +300,19 @@ NineDevice9_ctor( struct NineDevice9 *This,
         This->state.vs_const_f = CALLOC(This->vs_const_size, 1);
         This->state.ps_const_f = CALLOC(This->ps_const_size, 1);
         This->state.vs_lconstf_temp = CALLOC(This->vs_const_size,1);
+        This->state.ps_lconstf_temp = CALLOC(This->ps_const_size,1);
         if (!This->state.vs_const_f || !This->state.ps_const_f ||
-            !This->state.vs_lconstf_temp)
+            !This->state.vs_lconstf_temp || !This->state.ps_lconstf_temp)
             return E_OUTOFMEMORY;
 
         if (strstr(pScreen->get_name(pScreen), "AMD") ||
             strstr(pScreen->get_name(pScreen), "ATI")) {
-            This->prefer_user_constbuf = TRUE;
             This->driver_bugs.buggy_barycentrics = TRUE;
         }
 
+        /* Disable NV path for now, needs some fixes */
+        This->prefer_user_constbuf = TRUE;
+
         tmpl.target = PIPE_BUFFER;
         tmpl.format = PIPE_FORMAT_R8_UNORM;
         tmpl.height0 = 1;
@@ -376,6 +338,8 @@ NineDevice9_ctor( struct NineDevice9 *This,
     {
         struct pipe_resource tmplt;
         struct pipe_sampler_view templ;
+        struct pipe_sampler_state samp;
+        memset(&samp, 0, sizeof(samp));
 
         tmplt.target = PIPE_TEXTURE_2D;
         tmplt.width0 = 1;
@@ -404,22 +368,39 @@ NineDevice9_ctor( struct NineDevice9 *This,
         templ.swizzle_a = PIPE_SWIZZLE_ONE;
         templ.target = This->dummy_texture->target;
 
-        This->dummy_sampler = This->pipe->create_sampler_view(This->pipe, This->dummy_texture, &templ);
-        if (!This->dummy_sampler)
+        This->dummy_sampler_view = This->pipe->create_sampler_view(This->pipe, This->dummy_texture, &templ);
+        if (!This->dummy_sampler_view)
             return D3DERR_DRIVERINTERNALERROR;
+
+        samp.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+        samp.max_lod = 15.0f;
+        samp.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        samp.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        samp.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+        samp.min_img_filter = PIPE_TEX_FILTER_NEAREST;
+        samp.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+        samp.compare_mode = PIPE_TEX_COMPARE_NONE;
+        samp.compare_func = PIPE_FUNC_LEQUAL;
+        samp.normalized_coords = 1;
+        samp.seamless_cube_map = 1;
+        This->dummy_sampler_state = samp;
     }
 
     /* Allocate upload helper for drivers that suck (from st pov ;). */
-    {
-        unsigned bind = 0;
 
-        This->driver_caps.user_vbufs = GET_PCAP(USER_VERTEX_BUFFERS);
-        This->driver_caps.user_ibufs = GET_PCAP(USER_INDEX_BUFFERS);
+    This->driver_caps.user_vbufs = GET_PCAP(USER_VERTEX_BUFFERS);
+    This->driver_caps.user_ibufs = GET_PCAP(USER_INDEX_BUFFERS);
+    This->driver_caps.user_cbufs = GET_PCAP(USER_CONSTANT_BUFFERS);
+
+    if (!This->driver_caps.user_vbufs)
+        This->vertex_uploader = u_upload_create(This->pipe, 65536, 4, PIPE_BIND_VERTEX_BUFFER);
+    if (!This->driver_caps.user_ibufs)
+        This->index_uploader = u_upload_create(This->pipe, 128 * 1024, 4, PIPE_BIND_INDEX_BUFFER);
+    if (!This->driver_caps.user_cbufs) {
+        unsigned alignment = GET_PCAP(CONSTANT_BUFFER_OFFSET_ALIGNMENT);
 
-        if (!This->driver_caps.user_vbufs) bind |= PIPE_BIND_VERTEX_BUFFER;
-        if (!This->driver_caps.user_ibufs) bind |= PIPE_BIND_INDEX_BUFFER;
-        if (bind)
-            This->upload = u_upload_create(This->pipe, 1 << 20, 4, bind);
+        This->constbuf_uploader = u_upload_create(This->pipe, This->vs_const_size,
+                                                  alignment, PIPE_BIND_CONSTANT_BUFFER);
     }
 
     This->driver_caps.window_space_position_support = GET_PCAP(TGSI_VS_WINDOW_SPACE_POSITION);
@@ -429,10 +410,15 @@ NineDevice9_ctor( struct NineDevice9 *This,
     nine_ff_init(This); /* initialize fixed function code */
 
     NineDevice9_SetDefaultState(This, FALSE);
-    NineDevice9_RestoreNonCSOState(This, ~0);
+
+    {
+        struct pipe_poly_stipple stipple;
+        memset(&stipple, ~0, sizeof(stipple));
+        This->pipe->set_polygon_stipple(This->pipe, &stipple);
+    }
 
     This->update = &This->state;
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     ID3DPresentGroup_Release(This->present);
 
@@ -452,12 +438,16 @@ NineDevice9_dtor( struct NineDevice9 *This )
     nine_ff_fini(This);
     nine_state_clear(&This->state, TRUE);
 
-    if (This->upload)
-        u_upload_destroy(This->upload);
+    if (This->vertex_uploader)
+        u_upload_destroy(This->vertex_uploader);
+    if (This->index_uploader)
+        u_upload_destroy(This->index_uploader);
+    if (This->constbuf_uploader)
+        u_upload_destroy(This->constbuf_uploader);
 
     nine_bind(&This->record, NULL);
 
-    pipe_sampler_view_reference(&This->dummy_sampler, NULL);
+    pipe_sampler_view_reference(&This->dummy_sampler_view, NULL);
     pipe_resource_reference(&This->dummy_texture, NULL);
     pipe_resource_reference(&This->constbuf_vs, NULL);
     pipe_resource_reference(&This->constbuf_ps, NULL);
@@ -465,6 +455,7 @@ NineDevice9_dtor( struct NineDevice9 *This )
     FREE(This->state.vs_const_f);
     FREE(This->state.ps_const_f);
     FREE(This->state.vs_lconstf_temp);
+    FREE(This->state.ps_lconstf_temp);
 
     if (This->swapchains) {
         for (i = 0; i < This->nswapchains; ++i)
@@ -547,10 +538,14 @@ NineDevice9_GetAvailableTextureMem( struct NineDevice9 *This )
 HRESULT WINAPI
 NineDevice9_EvictManagedResources( struct NineDevice9 *This )
 {
-    /* We don't really need to do anything here, but might want to free up
-     * the GPU virtual address space by killing pipe_resources.
-     */
-    STUB(D3D_OK);
+    struct NineBaseTexture9 *tex;
+
+    DBG("This=%p\n", This);
+    LIST_FOR_EACH_ENTRY(tex, &This->managed_textures, list2) {
+        NineBaseTexture9_UnLoad(tex);
+    }
+
+    return D3D_OK;
 }
 
 HRESULT WINAPI
@@ -599,11 +594,11 @@ NineDevice9_SetCursorProperties( struct NineDevice9 *This,
                                  UINT YHotSpot,
                                  IDirect3DSurface9 *pCursorBitmap )
 {
-    /* TODO: hardware cursor */
     struct NineSurface9 *surf = NineSurface9(pCursorBitmap);
     struct pipe_context *pipe = This->pipe;
     struct pipe_box box;
     struct pipe_transfer *transfer;
+    BOOL hw_cursor;
     void *ptr;
 
     DBG_FLAG(DBG_SWAPCHAIN, "This=%p XHotSpot=%u YHotSpot=%u "
@@ -611,8 +606,15 @@ NineDevice9_SetCursorProperties( struct NineDevice9 *This,
 
     user_assert(pCursorBitmap, D3DERR_INVALIDCALL);
 
-    This->cursor.w = MIN2(surf->desc.Width, This->cursor.image->width0);
-    This->cursor.h = MIN2(surf->desc.Height, This->cursor.image->height0);
+    if (This->swapchains[0]->params.Windowed) {
+        This->cursor.w = MIN2(surf->desc.Width, 32);
+        This->cursor.h = MIN2(surf->desc.Height, 32);
+        hw_cursor = 1; /* always use hw cursor for windowed mode */
+    } else {
+        This->cursor.w = MIN2(surf->desc.Width, This->cursor.image->width0);
+        This->cursor.h = MIN2(surf->desc.Height, This->cursor.image->height0);
+        hw_cursor = This->cursor.w == 32 && This->cursor.h == 32;
+    }
 
     u_box_origin_2d(This->cursor.w, This->cursor.h, &box);
 
@@ -643,16 +645,21 @@ NineDevice9_SetCursorProperties( struct NineDevice9 *This,
                                  lock.pBits, lock.Pitch,
                                  This->cursor.w, This->cursor.h);
 
-        if (!This->cursor.software &&
-            This->cursor.w == 32 && This->cursor.h == 32)
-            ID3DPresent_SetCursor(This->swapchains[0]->present,
-                                  lock.pBits, &This->cursor.hotspot,
-                                  This->cursor.visible);
+        if (hw_cursor)
+            hw_cursor = ID3DPresent_SetCursor(This->swapchains[0]->present,
+                                              lock.pBits,
+                                              &This->cursor.hotspot,
+                                              This->cursor.visible) == D3D_OK;
 
         NineSurface9_UnlockRect(surf);
     }
     pipe->transfer_unmap(pipe, transfer);
 
+    /* hide cursor if we emulate it */
+    if (!hw_cursor)
+        ID3DPresent_SetCursor(This->swapchains[0]->present, NULL, NULL, FALSE);
+    This->cursor.software = !hw_cursor;
+
     return D3D_OK;
 }
 
@@ -670,7 +677,7 @@ NineDevice9_SetCursorPosition( struct NineDevice9 *This,
     This->cursor.pos.y = Y;
 
     if (!This->cursor.software)
-        ID3DPresent_SetCursorPos(swap->present, &This->cursor.pos);
+        This->cursor.software = ID3DPresent_SetCursorPos(swap->present, &This->cursor.pos) != D3D_OK;
 }
 
 BOOL WINAPI
@@ -683,7 +690,7 @@ NineDevice9_ShowCursor( struct NineDevice9 *This,
 
     This->cursor.visible = bShow && (This->cursor.hotspot.x != -1);
     if (!This->cursor.software)
-        ID3DPresent_SetCursor(This->swapchains[0]->present, NULL, NULL, bShow);
+        This->cursor.software = ID3DPresent_SetCursor(This->swapchains[0]->present, NULL, NULL, bShow) != D3D_OK;
 
     return old;
 }
@@ -752,8 +759,8 @@ NineDevice9_Reset( struct NineDevice9 *This,
     for (i = 0; i < This->nswapchains; ++i) {
         D3DPRESENT_PARAMETERS *params = &pPresentationParameters[i];
         hr = NineSwapChain9_Resize(This->swapchains[i], params, NULL);
-        if (FAILED(hr))
-            return (hr == D3DERR_OUTOFVIDEOMEMORY) ? hr : D3DERR_DEVICELOST;
+        if (hr != D3D_OK)
+            return hr;
     }
 
     nine_pipe_context_clear(This);
@@ -1108,6 +1115,13 @@ create_zs_or_rt_surface(struct NineDevice9 *This,
     default: break;
     }
 
+    if (compressed_format(Format)) {
+        const unsigned w = util_format_get_blockwidth(templ.format);
+        const unsigned h = util_format_get_blockheight(templ.format);
+
+        user_assert(!(Width % w) && !(Height % h), D3DERR_INVALIDCALL);
+    }
+
     if (Pool == D3DPOOL_DEFAULT && Format != D3DFMT_NULL) {
         /* resource_create doesn't return an error code, so check format here */
         user_assert(templ.format != PIPE_FORMAT_NONE, D3DERR_INVALIDCALL);
@@ -1173,6 +1187,8 @@ NineDevice9_UpdateSurface( struct NineDevice9 *This,
 {
     struct NineSurface9 *dst = NineSurface9(pDestinationSurface);
     struct NineSurface9 *src = NineSurface9(pSourceSurface);
+    int copy_width, copy_height;
+    RECT destRect;
 
     DBG("This=%p pSourceSurface=%p pDestinationSurface=%p "
         "pSourceRect=%p pDestPoint=%p\n", This,
@@ -1184,13 +1200,75 @@ NineDevice9_UpdateSurface( struct NineDevice9 *This,
     if (pDestPoint)
         DBG("pDestPoint = (%u,%u)\n", pDestPoint->x, pDestPoint->y);
 
+    user_assert(dst && src, D3DERR_INVALIDCALL);
+
     user_assert(dst->base.pool == D3DPOOL_DEFAULT, D3DERR_INVALIDCALL);
     user_assert(src->base.pool == D3DPOOL_SYSTEMMEM, D3DERR_INVALIDCALL);
 
     user_assert(dst->desc.MultiSampleType == D3DMULTISAMPLE_NONE, D3DERR_INVALIDCALL);
     user_assert(src->desc.MultiSampleType == D3DMULTISAMPLE_NONE, D3DERR_INVALIDCALL);
 
-    return NineSurface9_CopySurface(dst, src, pDestPoint, pSourceRect);
+    user_assert(!src->lock_count, D3DERR_INVALIDCALL);
+    user_assert(!dst->lock_count, D3DERR_INVALIDCALL);
+
+    user_assert(dst->desc.Format == src->desc.Format, D3DERR_INVALIDCALL);
+    user_assert(!depth_stencil_format(dst->desc.Format), D3DERR_INVALIDCALL);
+
+    if (pSourceRect) {
+        copy_width = pSourceRect->right - pSourceRect->left;
+        copy_height = pSourceRect->bottom - pSourceRect->top;
+
+        user_assert(pSourceRect->left >= 0 &&
+                    copy_width > 0 &&
+                    pSourceRect->right <= src->desc.Width &&
+                    pSourceRect->top >= 0 &&
+                    copy_height > 0 &&
+                    pSourceRect->bottom <= src->desc.Height,
+                    D3DERR_INVALIDCALL);
+    } else {
+        copy_width = src->desc.Width;
+        copy_height = src->desc.Height;
+    }
+
+    destRect.right = copy_width;
+    destRect.bottom = copy_height;
+
+    if (pDestPoint) {
+        user_assert(pDestPoint->x >= 0 && pDestPoint->y >= 0,
+                    D3DERR_INVALIDCALL);
+        destRect.right += pDestPoint->x;
+        destRect.bottom += pDestPoint->y;
+    }
+
+    user_assert(destRect.right <= dst->desc.Width &&
+                destRect.bottom <= dst->desc.Height,
+                D3DERR_INVALIDCALL);
+
+    if (compressed_format(dst->desc.Format)) {
+        const unsigned w = util_format_get_blockwidth(dst->base.info.format);
+        const unsigned h = util_format_get_blockheight(dst->base.info.format);
+
+        if (pDestPoint) {
+            user_assert(!(pDestPoint->x % w) && !(pDestPoint->y % h),
+                        D3DERR_INVALIDCALL);
+        }
+
+        if (pSourceRect) {
+            user_assert(!(pSourceRect->left % w) && !(pSourceRect->top % h),
+                        D3DERR_INVALIDCALL);
+        }
+        if (!(copy_width == src->desc.Width &&
+              copy_width == dst->desc.Width &&
+              copy_height == src->desc.Height &&
+              copy_height == dst->desc.Height)) {
+            user_assert(!(copy_width  % w) && !(copy_height % h),
+                        D3DERR_INVALIDCALL);
+        }
+    }
+
+    NineSurface9_CopyMemToDefault(dst, src, pDestPoint, pSourceRect);
+
+    return D3D_OK;
 }
 
 HRESULT WINAPI
@@ -1202,6 +1280,7 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
     struct NineBaseTexture9 *srcb = NineBaseTexture9(pSourceTexture);
     unsigned l, m;
     unsigned last_level = dstb->base.info.last_level;
+    RECT rect;
 
     DBG("This=%p pSourceTexture=%p pDestinationTexture=%p\n", This,
         pSourceTexture, pDestinationTexture);
@@ -1227,10 +1306,6 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
 
     user_assert(dstb->base.type == srcb->base.type, D3DERR_INVALIDCALL);
 
-    /* TODO: We can restrict the update to the dirty portions of the source.
-     * Yes, this seems silly, but it's what MSDN says ...
-     */
-
     /* Find src level that matches dst level 0: */
     user_assert(srcb->base.info.width0 >= dstb->base.info.width0 &&
                 srcb->base.info.height0 >= dstb->base.info.height0 &&
@@ -1254,9 +1329,25 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
         struct NineTexture9 *dst = NineTexture9(dstb);
         struct NineTexture9 *src = NineTexture9(srcb);
 
-        for (l = 0; l <= last_level; ++l, ++m)
-            NineSurface9_CopySurface(dst->surfaces[l],
-                                     src->surfaces[m], NULL, NULL);
+        if (src->dirty_rect.width == 0)
+            return D3D_OK;
+
+        pipe_box_to_rect(&rect, &src->dirty_rect);
+        for (l = 0; l < m; ++l)
+            rect_minify_inclusive(&rect);
+
+        for (l = 0; l <= last_level; ++l, ++m) {
+            fit_rect_format_inclusive(dst->base.base.info.format,
+                                      &rect,
+                                      dst->surfaces[l]->desc.Width,
+                                      dst->surfaces[l]->desc.Height);
+            NineSurface9_CopyMemToDefault(dst->surfaces[l],
+                                          src->surfaces[m],
+                                          (POINT *)&rect,
+                                          &rect);
+            rect_minify_inclusive(&rect);
+        }
+        u_box_origin_2d(0, 0, &src->dirty_rect);
     } else
     if (dstb->base.type == D3DRTYPE_CUBETEXTURE) {
         struct NineCubeTexture9 *dst = NineCubeTexture9(dstb);
@@ -1265,10 +1356,25 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
 
         /* GPUs usually have them stored as arrays of mip-mapped 2D textures. */
         for (z = 0; z < 6; ++z) {
+            if (src->dirty_rect[z].width == 0)
+                continue;
+
+            pipe_box_to_rect(&rect, &src->dirty_rect[z]);
+            for (l = 0; l < m; ++l)
+                rect_minify_inclusive(&rect);
+
             for (l = 0; l <= last_level; ++l, ++m) {
-                NineSurface9_CopySurface(dst->surfaces[l * 6 + z],
-                                         src->surfaces[m * 6 + z], NULL, NULL);
+                fit_rect_format_inclusive(dst->base.base.info.format,
+                                          &rect,
+                                          dst->surfaces[l * 6 + z]->desc.Width,
+                                          dst->surfaces[l * 6 + z]->desc.Height);
+                NineSurface9_CopyMemToDefault(dst->surfaces[l * 6 + z],
+                                              src->surfaces[m * 6 + z],
+                                              (POINT *)&rect,
+                                              &rect);
+                rect_minify_inclusive(&rect);
             }
+            u_box_origin_2d(0, 0, &src->dirty_rect[z]);
             m -= l;
         }
     } else
@@ -1276,9 +1382,12 @@ NineDevice9_UpdateTexture( struct NineDevice9 *This,
         struct NineVolumeTexture9 *dst = NineVolumeTexture9(dstb);
         struct NineVolumeTexture9 *src = NineVolumeTexture9(srcb);
 
+        if (src->dirty_box.width == 0)
+            return D3D_OK;
         for (l = 0; l <= last_level; ++l, ++m)
-            NineVolume9_CopyVolume(dst->volumes[l],
-                                   src->volumes[m], 0, 0, 0, NULL);
+            NineVolume9_CopyMemToDefault(dst->volumes[l],
+                                         src->volumes[m], 0, 0, 0, NULL);
+        u_box_3d(0, 0, 0, 0, 0, 0, &src->dirty_box);
     } else{
         assert(!"invalid texture type");
     }
@@ -1308,7 +1417,12 @@ NineDevice9_GetRenderTargetData( struct NineDevice9 *This,
     user_assert(dst->desc.MultiSampleType < 2, D3DERR_INVALIDCALL);
     user_assert(src->desc.MultiSampleType < 2, D3DERR_INVALIDCALL);
 
-    return NineSurface9_CopySurface(dst, src, NULL, NULL);
+    user_assert(src->desc.Width == dst->desc.Width, D3DERR_INVALIDCALL);
+    user_assert(src->desc.Height == dst->desc.Height, D3DERR_INVALIDCALL);
+
+    NineSurface9_CopyDefaultToMem(dst, src);
+
+    return D3D_OK;
 }
 
 HRESULT WINAPI
@@ -1448,6 +1562,7 @@ NineDevice9_StretchRect( struct NineDevice9 *This,
     blit.filter = Filter == D3DTEXF_LINEAR ?
        PIPE_TEX_FILTER_LINEAR : PIPE_TEX_FILTER_NEAREST;
     blit.scissor_enable = FALSE;
+    blit.alpha_blend = FALSE;
 
     /* If both of a src and dst dimension are negative, flip them. */
     if (blit.dst.box.width < 0 && blit.src.box.width < 0) {
@@ -1464,8 +1579,12 @@ NineDevice9_StretchRect( struct NineDevice9 *This,
 
     user_assert(!scaled || dst != src, D3DERR_INVALIDCALL);
     user_assert(!scaled ||
-                !NineSurface9_IsOffscreenPlain(dst) ||
+                !NineSurface9_IsOffscreenPlain(dst), D3DERR_INVALIDCALL);
+    user_assert(!NineSurface9_IsOffscreenPlain(dst) ||
                 NineSurface9_IsOffscreenPlain(src), D3DERR_INVALIDCALL);
+    user_assert(NineSurface9_IsOffscreenPlain(dst) ||
+                dst->desc.Usage & (D3DUSAGE_RENDERTARGET | D3DUSAGE_DEPTHSTENCIL),
+                D3DERR_INVALIDCALL);
     user_assert(!scaled ||
                 (!util_format_is_compressed(dst->base.info.format) &&
                  !util_format_is_compressed(src->base.info.format)),
@@ -1561,11 +1680,8 @@ NineDevice9_ColorFill( struct NineDevice9 *This,
     }
     d3dcolor_to_pipe_color_union(&rgba, color);
 
-    fallback =
-        !This->screen->is_format_supported(This->screen, surf->base.info.format,
-                                           surf->base.info.target,
-                                           surf->base.info.nr_samples,
-                                           PIPE_BIND_RENDER_TARGET);
+    fallback = !(surf->base.info.bind & PIPE_BIND_RENDER_TARGET);
+
     if (!fallback) {
         psurf = NineSurface9_GetSurface(surf, 0);
         if (!psurf)
@@ -1774,7 +1890,7 @@ NineDevice9_Clear( struct NineDevice9 *This,
         return D3D_OK;
     d3dcolor_to_pipe_color_union(&rgba, Color);
 
-    nine_update_state(This, NINE_STATE_FB);
+    nine_update_state_framebuffer(This);
 
     rect.x1 = This->state.viewport.X;
     rect.y1 = This->state.viewport.Y;
@@ -2012,8 +2128,10 @@ NineDevice9_SetLight( struct NineDevice9 *This,
             return E_OUTOFMEMORY;
         state->ff.num_lights = N;
 
-        for (; n < Index; ++n)
+        for (; n < Index; ++n) {
+            memset(&state->ff.light[n], 0, sizeof(D3DLIGHT9));
             state->ff.light[n].Type = (D3DLIGHTTYPE)NINED3DLIGHT_INVALID;
+        }
     }
     state->ff.light[Index] = *pLight;
 
@@ -2508,6 +2626,7 @@ NineDevice9_SetTextureStageState( struct NineDevice9 *This,
                                   DWORD Value )
 {
     struct nine_state *state = This->update;
+    int bumpmap_index = -1;
 
     DBG("Stage=%u Type=%u Value=%08x\n", Stage, Type, Value);
     nine_dump_D3DTSS_value(DBG_FF, Type, Value);
@@ -2516,6 +2635,36 @@ NineDevice9_SetTextureStageState( struct NineDevice9 *This,
     user_assert(Type < Elements(state->ff.tex_stage[0]), D3DERR_INVALIDCALL);
 
     state->ff.tex_stage[Stage][Type] = Value;
+    switch (Type) {
+    case D3DTSS_BUMPENVMAT00:
+        bumpmap_index = 4 * Stage;
+        break;
+    case D3DTSS_BUMPENVMAT10:
+        bumpmap_index = 4 * Stage + 1;
+        break;
+    case D3DTSS_BUMPENVMAT01:
+        bumpmap_index = 4 * Stage + 2;
+        break;
+    case D3DTSS_BUMPENVMAT11:
+        bumpmap_index = 4 * Stage + 3;
+        break;
+    case D3DTSS_BUMPENVLSCALE:
+        bumpmap_index = 4 * 8 + 2 * Stage;
+        break;
+    case D3DTSS_BUMPENVLOFFSET:
+        bumpmap_index = 4 * 8 + 2 * Stage + 1;
+        break;
+    case D3DTSS_TEXTURETRANSFORMFLAGS:
+        state->changed.group |= NINE_STATE_PS1X_SHADER;
+        break;
+    default:
+        break;
+    }
+
+    if (bumpmap_index >= 0) {
+        state->bumpmap_vars[bumpmap_index] = Value;
+        state->changed.group |= NINE_STATE_PS_CONST;
+    }
 
     state->changed.group |= NINE_STATE_FF_PSSTAGES;
     state->ff.changed.tex_stage[Stage][Type / 32] |= 1 << (Type % 32);
@@ -2560,12 +2709,11 @@ NineDevice9_SetSamplerState( struct NineDevice9 *This,
     if (Sampler >= D3DDMAPSAMPLER)
         Sampler = Sampler - D3DDMAPSAMPLER + NINE_MAX_SAMPLERS_PS;
 
-    state->samp[Sampler][Type] = Value;
-    state->changed.group |= NINE_STATE_SAMPLER;
-    state->changed.sampler[Sampler] |= 1 << Type;
-
-    if (Type == D3DSAMP_SRGBTEXTURE)
-        state->changed.srgb = TRUE;
+    if (state->samp[Sampler][Type] != Value || unlikely(This->is_recording)) {
+        state->samp[Sampler][Type] = Value;
+        state->changed.group |= NINE_STATE_SAMPLER;
+        state->changed.sampler[Sampler] |= 1 << Type;
+    }
 
     return D3D_OK;
 }
@@ -2724,7 +2872,7 @@ NineDevice9_DrawPrimitive( struct NineDevice9 *This,
     DBG("iface %p, PrimitiveType %u, StartVertex %u, PrimitiveCount %u\n",
         This, PrimitiveType, StartVertex, PrimitiveCount);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     init_draw_info(&info, This, PrimitiveType, PrimitiveCount);
     info.indexed = FALSE;
@@ -2757,7 +2905,7 @@ NineDevice9_DrawIndexedPrimitive( struct NineDevice9 *This,
     user_assert(This->state.idxbuf, D3DERR_INVALIDCALL);
     user_assert(This->state.vdecl, D3DERR_INVALIDCALL);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     init_draw_info(&info, This, PrimitiveType, PrimitiveCount);
     info.indexed = TRUE;
@@ -2789,7 +2937,7 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
     user_assert(pVertexStreamZeroData && VertexStreamZeroStride,
                 D3DERR_INVALIDCALL);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     init_draw_info(&info, This, PrimitiveType, PrimitiveCount);
     info.indexed = FALSE;
@@ -2803,13 +2951,16 @@ NineDevice9_DrawPrimitiveUP( struct NineDevice9 *This,
     vtxbuf.buffer = NULL;
     vtxbuf.user_buffer = pVertexStreamZeroData;
 
-    if (!This->driver_caps.user_vbufs)
-        u_upload_data(This->upload,
+    if (!This->driver_caps.user_vbufs) {
+        u_upload_data(This->vertex_uploader,
                       0,
                       (info.max_index + 1) * VertexStreamZeroStride, /* XXX */
                       vtxbuf.user_buffer,
                       &vtxbuf.buffer_offset,
                       &vtxbuf.buffer);
+        u_upload_unmap(This->vertex_uploader);
+        vtxbuf.user_buffer = NULL;
+    }
 
     This->pipe->set_vertex_buffers(This->pipe, 0, 1, &vtxbuf);
 
@@ -2851,7 +3002,7 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
     user_assert(IndexDataFormat == D3DFMT_INDEX16 ||
                 IndexDataFormat == D3DFMT_INDEX32, D3DERR_INVALIDCALL);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     init_draw_info(&info, This, PrimitiveType, PrimitiveCount);
     info.indexed = TRUE;
@@ -2872,23 +3023,28 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
 
     if (!This->driver_caps.user_vbufs) {
         const unsigned base = info.min_index * VertexStreamZeroStride;
-        u_upload_data(This->upload,
+        u_upload_data(This->vertex_uploader,
                       base,
                       (info.max_index -
                        info.min_index + 1) * VertexStreamZeroStride, /* XXX */
                       (const uint8_t *)vbuf.user_buffer + base,
                       &vbuf.buffer_offset,
                       &vbuf.buffer);
+        u_upload_unmap(This->vertex_uploader);
         /* Won't be used: */
         vbuf.buffer_offset -= base;
+        vbuf.user_buffer = NULL;
     }
-    if (!This->driver_caps.user_ibufs)
-        u_upload_data(This->upload,
+    if (!This->driver_caps.user_ibufs) {
+        u_upload_data(This->index_uploader,
                       0,
                       info.count * ibuf.index_size,
                       ibuf.user_buffer,
                       &ibuf.offset,
                       &ibuf.buffer);
+        u_upload_unmap(This->index_uploader);
+        ibuf.user_buffer = NULL;
+    }
 
     This->pipe->set_vertex_buffers(This->pipe, 0, 1, &vbuf);
     This->pipe->set_index_buffer(This->pipe, &ibuf);
@@ -2935,7 +3091,7 @@ NineDevice9_ProcessVertices( struct NineDevice9 *This,
     if (!screen->get_param(screen, PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS))
         STUB(D3DERR_INVALIDCALL);
 
-    nine_update_state(This, ~0);
+    nine_update_state(This);
 
     /* TODO: Create shader with stream output. */
     STUB(D3DERR_INVALIDCALL);
@@ -3105,6 +3261,13 @@ NineDevice9_SetVertexShader( struct NineDevice9 *This,
 
     DBG("This=%p pShader=%p\n", This, pShader);
 
+    if (!This->is_recording && state->vs == (struct NineVertexShader9*)pShader)
+      return D3D_OK;
+
+    /* ff -> non-ff: commit back non-ff constants */
+    if (!state->vs && pShader)
+        state->commit |= NINE_STATE_COMMIT_CONST_VS;
+
     nine_bind(&state->vs, pShader);
 
     state->changed.group |= NINE_STATE_VS;
@@ -3139,6 +3302,12 @@ NineDevice9_SetVertexShaderConstantF( struct NineDevice9 *This,
        return D3D_OK;
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
+    if (!This->is_recording) {
+        if (!memcmp(&state->vs_const_f[StartRegister * 4], pConstantData,
+                    Vector4fCount * 4 * sizeof(state->vs_const_f[0])))
+            return D3D_OK;
+    }
+
     memcpy(&state->vs_const_f[StartRegister * 4],
            pConstantData,
            Vector4fCount * 4 * sizeof(state->vs_const_f[0]));
@@ -3188,6 +3357,11 @@ NineDevice9_SetVertexShaderConstantI( struct NineDevice9 *This,
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     if (This->driver_caps.vs_integer) {
+        if (!This->is_recording) {
+            if (!memcmp(&state->vs_const_i[StartRegister][0], pConstantData,
+                        Vector4iCount * sizeof(state->vs_const_i[0])))
+                return D3D_OK;
+        }
         memcpy(&state->vs_const_i[StartRegister][0],
                pConstantData,
                Vector4iCount * sizeof(state->vs_const_i[0]));
@@ -3252,6 +3426,16 @@ NineDevice9_SetVertexShaderConstantB( struct NineDevice9 *This,
     user_assert(StartRegister + BoolCount <= NINE_MAX_CONST_B, D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
+    if (!This->is_recording) {
+        bool noChange = true;
+        for (i = 0; i < BoolCount; i++) {
+            if (!!state->vs_const_b[StartRegister + i] != !!pConstantData[i])
+              noChange = false;
+        }
+        if (noChange)
+            return D3D_OK;
+    }
+
     for (i = 0; i < BoolCount; i++)
         state->vs_const_b[StartRegister + i] = pConstantData[i] ? bool_true : 0;
 
@@ -3433,6 +3617,13 @@ NineDevice9_SetPixelShader( struct NineDevice9 *This,
 
     DBG("This=%p pShader=%p\n", This, pShader);
 
+    if (!This->is_recording && state->ps == (struct NinePixelShader9*)pShader)
+      return D3D_OK;
+
+    /* ff -> non-ff: commit back non-ff constants */
+    if (!state->ps && pShader)
+        state->commit |= NINE_STATE_COMMIT_CONST_PS;
+
     nine_bind(&state->ps, pShader);
 
     state->changed.group |= NINE_STATE_PS;
@@ -3473,6 +3664,12 @@ NineDevice9_SetPixelShaderConstantF( struct NineDevice9 *This,
        return D3D_OK;
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
+    if (!This->is_recording) {
+        if (!memcmp(&state->ps_const_f[StartRegister * 4], pConstantData,
+                    Vector4fCount * 4 * sizeof(state->ps_const_f[0])))
+            return D3D_OK;
+    }
+
     memcpy(&state->ps_const_f[StartRegister * 4],
            pConstantData,
            Vector4fCount * 4 * sizeof(state->ps_const_f[0]));
@@ -3522,6 +3719,11 @@ NineDevice9_SetPixelShaderConstantI( struct NineDevice9 *This,
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
     if (This->driver_caps.ps_integer) {
+        if (!This->is_recording) {
+            if (!memcmp(&state->ps_const_i[StartRegister][0], pConstantData,
+                        Vector4iCount * sizeof(state->ps_const_i[0])))
+                return D3D_OK;
+        }
         memcpy(&state->ps_const_i[StartRegister][0],
                pConstantData,
                Vector4iCount * sizeof(state->ps_const_i[0]));
@@ -3585,6 +3787,16 @@ NineDevice9_SetPixelShaderConstantB( struct NineDevice9 *This,
     user_assert(StartRegister + BoolCount <= NINE_MAX_CONST_B, D3DERR_INVALIDCALL);
     user_assert(pConstantData, D3DERR_INVALIDCALL);
 
+    if (!This->is_recording) {
+        bool noChange = true;
+        for (i = 0; i < BoolCount; i++) {
+            if (!!state->ps_const_b[StartRegister + i] != !!pConstantData[i])
+              noChange = false;
+        }
+        if (noChange)
+            return D3D_OK;
+    }
+
     for (i = 0; i < BoolCount; i++)
         state->ps_const_b[StartRegister + i] = pConstantData[i] ? bool_true : 0;
 
diff --git a/src/gallium/state_trackers/nine/device9.h b/src/gallium/state_trackers/nine/device9.h
index 74607451c5f..98d9c4df06a 100644
--- a/src/gallium/state_trackers/nine/device9.h
+++ b/src/gallium/state_trackers/nine/device9.h
@@ -69,6 +69,7 @@ struct NineDevice9
     struct nine_state state;   /* device state */
 
     struct list_head update_textures;
+    struct list_head managed_textures;
 
     boolean is_recording;
     boolean in_scene;
@@ -83,7 +84,8 @@ struct NineDevice9
     uint16_t max_ps_const_f;
 
     struct pipe_resource *dummy_texture;
-    struct pipe_sampler_view *dummy_sampler;
+    struct pipe_sampler_view *dummy_sampler_view;
+    struct pipe_sampler_state dummy_sampler_state;
 
     struct gen_mipmap_state *gen_mipmap;
 
@@ -113,6 +115,7 @@ struct NineDevice9
     struct {
         boolean user_vbufs;
         boolean user_ibufs;
+        boolean user_cbufs;
         boolean window_space_position_support;
         boolean vs_integer;
         boolean ps_integer;
@@ -122,7 +125,9 @@ struct NineDevice9
         boolean buggy_barycentrics;
     } driver_bugs;
 
-    struct u_upload_mgr *upload;
+    struct u_upload_mgr *vertex_uploader;
+    struct u_upload_mgr *index_uploader;
+    struct u_upload_mgr *constbuf_uploader;
 
     struct nine_range_pool range_pool;
 
@@ -180,10 +185,6 @@ NineDevice9_GetCSO( struct NineDevice9 *This );
 const D3DCAPS9 *
 NineDevice9_GetCaps( struct NineDevice9 *This );
 
-/* Mask: 0x1 = constant buffers, 0x2 = stipple */
-void
-NineDevice9_RestoreNonCSOState( struct NineDevice9 *This, unsigned mask );
-
 /*** Direct3D public ***/
 
 HRESULT WINAPI
diff --git a/src/gallium/state_trackers/nine/nine_ff.c b/src/gallium/state_trackers/nine/nine_ff.c
index 8a53f0d9038..fe8933be69a 100644
--- a/src/gallium/state_trackers/nine/nine_ff.c
+++ b/src/gallium/state_trackers/nine/nine_ff.c
@@ -22,6 +22,7 @@
 #include "tgsi/tgsi_dump.h"
 #include "util/u_box.h"
 #include "util/u_hash_table.h"
+#include "util/u_upload_mgr.h"
 
 #define NINE_TGSI_LAZY_DEVS 1
 
@@ -30,13 +31,6 @@
 #define NINE_FF_NUM_VS_CONST 256
 #define NINE_FF_NUM_PS_CONST 24
 
-#define NINED3DTSS_TCI_DISABLE                       0
-#define NINED3DTSS_TCI_PASSTHRU                      1
-#define NINED3DTSS_TCI_CAMERASPACENORMAL             2
-#define NINED3DTSS_TCI_CAMERASPACEPOSITION           3
-#define NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR   4
-#define NINED3DTSS_TCI_SPHEREMAP                     5
-
 struct fvec4
 {
     float x, y, z, w;
@@ -63,16 +57,20 @@ struct nine_ff_vs_key
             uint32_t fog_range : 1;
             uint32_t color0in_one : 1;
             uint32_t color1in_one : 1;
-            uint32_t pad1 : 8;
-            uint32_t tc_gen : 24; /* 8 * 3 bits */
-            uint32_t pad2 : 8;
-            uint32_t tc_idx : 24;
+            uint32_t fog : 1;
+            uint32_t pad1 : 7;
+            uint32_t tc_dim_input: 16; /* 8 * 2 bits */
+            uint32_t pad2 : 16;
+            uint32_t tc_dim_output: 24; /* 8 * 3 bits */
             uint32_t pad3 : 8;
-            uint32_t tc_dim : 24; /* 8 * 3 bits */
+            uint32_t tc_gen : 24; /* 8 * 3 bits */
             uint32_t pad4 : 8;
+            uint32_t tc_idx : 24;
+            uint32_t pad5 : 8;
+            uint32_t passthrough;
         };
-        uint64_t value64[2]; /* don't forget to resize VertexShader9.ff_key */
-        uint32_t value32[4];
+        uint64_t value64[3]; /* don't forget to resize VertexShader9.ff_key */
+        uint32_t value32[6];
     };
 };
 
@@ -106,15 +104,18 @@ struct nine_ff_ps_key
                 uint32_t alphaarg2 : 3;
                 uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
                 uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
-                uint32_t projected : 1;
+                uint32_t pad       : 1;
                 /* that's 32 bit exactly */
             } ts[8];
-            uint32_t fog : 1; /* for vFog with programmable VS */
+            uint32_t projected : 16;
+            uint32_t fog : 1; /* for vFog coming from VS */
             uint32_t fog_mode : 2;
-            uint32_t specular : 1; /* 9 32-bit words with this */
+            uint32_t specular : 1;
+            uint32_t pad1 : 12; /* 9 32-bit words with this */
             uint8_t colorarg_b4[3];
             uint8_t colorarg_b5[3];
             uint8_t alphaarg_b4[3]; /* 11 32-bit words plus a byte */
+            uint8_t pad2[3];
         };
         uint64_t value64[6]; /* don't forget to resize PixelShader9.ff_key */
         uint32_t value32[12];
@@ -222,7 +223,6 @@ static void nine_ureg_tgsi_dump(struct ureg_program *ureg, boolean override)
  * CONST[28].x___ RS.FogEnd
  * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
  * CONST[28].__z_ RS.FogDensity
- * CONST[29]      RS.FogColor
 
  * CONST[30].x___ TWEENFACTOR
  *
@@ -334,16 +334,15 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
 {
     const struct nine_ff_vs_key *key = vs->key;
     struct ureg_program *ureg = ureg_create(TGSI_PROCESSOR_VERTEX);
-    struct ureg_dst oPos, oCol[2], oTex[8], oPsz, oFog;
-    struct ureg_dst rCol[2]; /* oCol if no fog, TEMP otherwise */
+    struct ureg_dst oPos, oCol[2], oPsz, oFog;
     struct ureg_dst rVtx, rNrm;
     struct ureg_dst r[8];
     struct ureg_dst AR;
-    struct ureg_dst tmp, tmp_x, tmp_z;
+    struct ureg_dst tmp, tmp_x, tmp_y, tmp_z;
     unsigned i, c;
     unsigned label[32], l = 0;
     unsigned num_r = 8;
-    boolean need_rNrm = key->lighting || key->pointscale;
+    boolean need_rNrm = key->lighting || key->pointscale || key->passthrough & (1 << NINE_DECLUSAGE_NORMAL);
     boolean need_rVtx = key->lighting || key->fog_mode;
     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
 
@@ -406,9 +405,9 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
     if (key->vertexpointsize)
         vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);
 
-    if (key->vertexblend_indexed)
+    if (key->vertexblend_indexed || key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES))
         vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
-    if (key->vertexblend)
+    if (key->vertexblend || key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT))
         vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
     if (key->vertextween) {
         vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
@@ -420,19 +419,16 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
     oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
     oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
     oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
+    if (key->fog || key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
+        oFog = ureg_DECL_output(ureg, TGSI_SEMANTIC_FOG, 0);
+        oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
+    }
 
     if (key->vertexpointsize || key->pointscale) {
         oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0,
                                        TGSI_WRITEMASK_X, 0, 1);
         oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
     }
-    if (key->fog_mode) {
-        /* We apply fog to the vertex colors, oFog is for programmable shaders only ?
-         */
-        oFog = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_FOG, 0,
-                                       TGSI_WRITEMASK_X, 0, 1);
-        oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
-    }
 
     /* Declare TEMPs:
      */
@@ -440,18 +436,11 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
         r[i] = ureg_DECL_local_temporary(ureg);
     tmp = r[0];
     tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
+    tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
     tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
     if (key->lighting || key->vertexblend)
         AR = ureg_DECL_address(ureg);
 
-    if (key->fog_mode) {
-        rCol[0] = r[2];
-        rCol[1] = r[3];
-    } else {
-        rCol[0] = oCol[0];
-        rCol[1] = oCol[1];
-    }
-
     rVtx = ureg_writemask(r[1], TGSI_WRITEMASK_XYZ);
     rNrm = ureg_writemask(r[2], TGSI_WRITEMASK_XYZ);
 
@@ -560,8 +549,6 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
         ureg_CLAMP(ureg, oPsz, vs->aPsz, _XXXX(cPsz1), _YYYY(cPsz1));
 #endif
     } else if (key->pointscale) {
-        struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
-        struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
         struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
         struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);
 
@@ -582,72 +569,85 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
 #endif
     }
 
-    /* Texture coordinate generation:
-     * XXX: D3DTTFF_PROJECTED, transform matrix
-     */
     for (i = 0; i < 8; ++i) {
-        struct ureg_dst dst[5];
-        struct ureg_src src;
-        unsigned c;
+        struct ureg_dst oTex, input_coord, transformed, t;
+        unsigned c, writemask;
         const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
         const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
-        const unsigned dim = (key->tc_dim >> (i * 3)) & 0x7;
+        unsigned dim_input = 1 + ((key->tc_dim_input >> (i * 2)) & 0x3);
+        const unsigned dim_output = (key->tc_dim_output >> (i * 3)) & 0x7;
 
+        /* No texture output of index s */
         if (tci == NINED3DTSS_TCI_DISABLE)
             continue;
-        oTex[i] = ureg_DECL_output(ureg, texcoord_sn, i);
-
-        if (tci == NINED3DTSS_TCI_PASSTHRU)
-            vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
-
-        if (!dim) {
-            dst[c = 4] = oTex[i];
-        } else {
-            dst[4] = r[5];
-            src = ureg_src(dst[4]);
-            for (c = 0; c < (dim - 1); ++c)
-                dst[c] = ureg_writemask(tmp, (1 << dim) - 1);
-            dst[c] = ureg_writemask(oTex[i], (1 << dim) - 1);
-        }
+        oTex = ureg_DECL_output(ureg, texcoord_sn, i);
+        input_coord = r[5];
+        transformed = r[6];
 
+        /* Get the coordinate */
         switch (tci) {
         case NINED3DTSS_TCI_PASSTHRU:
-            ureg_MOV(ureg, dst[4], vs->aTex[idx]);
+            /* NINED3DTSS_TCI_PASSTHRU => Use texcoord coming from index idx *
+             * Else the idx is used only to determine wrapping mode. */
+            vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
+            ureg_MOV(ureg, input_coord, vs->aTex[idx]);
             break;
         case NINED3DTSS_TCI_CAMERASPACENORMAL:
-            assert(dim <= 3);
-            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_XYZ), ureg_src(rNrm));
-            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
+            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(rNrm));
+            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
+            dim_input = 4;
             break;
         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
-            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_XYZ), ureg_src(rVtx));
-            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
+            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(rVtx));
+            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
+            dim_input = 4;
             break;
         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
             ureg_DP3(ureg, tmp_x, ureg_src(rVtx), ureg_src(rNrm));
             ureg_MUL(ureg, tmp, ureg_src(rNrm), _X(tmp));
             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
-            ureg_SUB(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_XYZ), ureg_src(rVtx), ureg_src(tmp));
-            ureg_MOV(ureg, ureg_writemask(dst[4], TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
+            ureg_SUB(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(rVtx), ureg_src(tmp));
+            ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
+            dim_input = 4;
             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
             break;
         case NINED3DTSS_TCI_SPHEREMAP:
             assert(!"TODO");
             break;
         default:
+            assert(0);
             break;
         }
-        if (!dim)
-            continue;
-        dst[c].WriteMask = ~dst[c].WriteMask;
-        if (dst[c].WriteMask)
-            ureg_MOV(ureg, dst[c], src); /* store untransformed components */
-        dst[c].WriteMask = ~dst[c].WriteMask;
-        if (dim > 0) ureg_MUL(ureg, dst[0], _XXXX(src), _CONST(128 + i * 4));
-        if (dim > 1) ureg_MAD(ureg, dst[1], _YYYY(src), _CONST(129 + i * 4), ureg_src(tmp));
-        if (dim > 2) ureg_MAD(ureg, dst[2], _ZZZZ(src), _CONST(130 + i * 4), ureg_src(tmp));
-        if (dim > 3) ureg_MAD(ureg, dst[3], _WWWW(src), _CONST(131 + i * 4), ureg_src(tmp));
+
+        /* Apply the transformation */
+        /* dim_output == 0 => do not transform the components.
+         * XYZRHW also disables transformation */
+        if (!dim_output || key->position_t) {
+            transformed = input_coord;
+            writemask = TGSI_WRITEMASK_XYZW;
+        } else {
+            for (c = 0; c < dim_output; c++) {
+                t = ureg_writemask(transformed, 1 << c);
+                switch (dim_input) {
+                /* dim_input = 1 2 3: -> we add trailing 1 to input*/
+                case 1: ureg_MAD(ureg, t, _X(input_coord), _XXXX(_CONST(128 + i * 4 + c)), _YYYY(_CONST(128 + i * 4 + c)));
+                        break;
+                case 2: ureg_DP2(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
+                        ureg_ADD(ureg, t, ureg_src(transformed), _ZZZZ(_CONST(128 + i * 4 + c)));
+                        break;
+                case 3: ureg_DP3(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
+                        ureg_ADD(ureg, t, ureg_src(transformed), _WWWW(_CONST(128 + i * 4 + c)));
+                        break;
+                case 4: ureg_DP4(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c)); break;
+                default:
+                    assert(0);
+                }
+            }
+            writemask = (1 << dim_output) - 1;
+        }
+
+        ureg_MOV(ureg, ureg_writemask(oTex, writemask), ureg_src(transformed));
     }
 
     /* === Lighting:
@@ -692,8 +692,6 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
      * specular += light.specular * atten * powFact;
      */
     if (key->lighting) {
-        struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
-
         struct ureg_dst rAtt = ureg_writemask(r[1], TGSI_WRITEMASK_W);
         struct ureg_dst rHit = ureg_writemask(r[3], TGSI_WRITEMASK_XYZ);
         struct ureg_dst rMid = ureg_writemask(r[4], TGSI_WRITEMASK_XYZ);
@@ -851,22 +849,22 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W  ), vs->mtlA, vs->mtlE);
         }
-        ureg_MAD(ureg, rCol[0], ureg_src(rD), vs->mtlD, ureg_src(tmp));
-        ureg_MUL(ureg, rCol[1], ureg_src(rS), vs->mtlS);
+        ureg_MAD(ureg, oCol[0], ureg_src(rD), vs->mtlD, ureg_src(tmp));
+        ureg_MUL(ureg, oCol[1], ureg_src(rS), vs->mtlS);
     } else
     /* COLOR */
     if (key->darkness) {
         if (key->mtl_emissive == 0 && key->mtl_ambient == 0) {
-            ureg_MAD(ureg, rCol[0], vs->mtlD, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f), _CONST(19));
+            ureg_MAD(ureg, oCol[0], vs->mtlD, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f), _CONST(19));
         } else {
-            ureg_MAD(ureg, ureg_writemask(rCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
+            ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
             ureg_ADD(ureg, ureg_writemask(tmp,     TGSI_WRITEMASK_W), vs->mtlA, vs->mtlE);
-            ureg_ADD(ureg, ureg_writemask(rCol[0], TGSI_WRITEMASK_W), vs->mtlD, _W(tmp));
+            ureg_ADD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD, _W(tmp));
         }
-        ureg_MUL(ureg, rCol[1], ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f), vs->mtlS);
+        ureg_MUL(ureg, oCol[1], ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f), vs->mtlS);
     } else {
-        ureg_MOV(ureg, rCol[0], vs->aCol[0]);
-        ureg_MOV(ureg, rCol[1], vs->aCol[1]);
+        ureg_MOV(ureg, oCol[0], vs->aCol[0]);
+        ureg_MOV(ureg, oCol[1], vs->aCol[1]);
     }
 
     /* === Process fog.
@@ -874,10 +872,6 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
      * exp(x) = ex2(log2(e) * x)
      */
     if (key->fog_mode) {
-        /* Fog doesn't affect alpha, TODO: combine with light code output */
-        ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), _W(rCol[0]));
-        ureg_MOV(ureg, ureg_writemask(oCol[1], TGSI_WRITEMASK_W), _W(rCol[1]));
-
         if (key->position_t) {
             ureg_MOV(ureg, ureg_saturate(tmp_x), ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
         } else
@@ -905,10 +899,58 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
             ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
         }
         ureg_MOV(ureg, oFog, _X(tmp));
-        ureg_LRP(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _X(tmp), ureg_src(rCol[0]), _CONST(29));
-        ureg_LRP(ureg, ureg_writemask(oCol[1], TGSI_WRITEMASK_XYZ), _X(tmp), ureg_src(rCol[1]), _CONST(29));
+    } else if (key->fog && !(key->passthrough & (1 << NINE_DECLUSAGE_FOG))) {
+        ureg_MOV(ureg, oFog, ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
     }
 
+    if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT)) {
+        struct ureg_src input;
+        struct ureg_dst output;
+        input = vs->aWgt;
+        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 18);
+        ureg_MOV(ureg, output, input);
+    }
+    if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES)) {
+        struct ureg_src input;
+        struct ureg_dst output;
+        input = vs->aInd;
+        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 19);
+        ureg_MOV(ureg, output, input);
+    }
+    if (key->passthrough & (1 << NINE_DECLUSAGE_NORMAL)) {
+        struct ureg_src input;
+        struct ureg_dst output;
+        input = vs->aNrm;
+        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 20);
+        ureg_MOV(ureg, output, input);
+    }
+    if (key->passthrough & (1 << NINE_DECLUSAGE_TANGENT)) {
+        struct ureg_src input;
+        struct ureg_dst output;
+        input = build_vs_add_input(vs, NINE_DECLUSAGE_TANGENT);
+        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 21);
+        ureg_MOV(ureg, output, input);
+    }
+    if (key->passthrough & (1 << NINE_DECLUSAGE_BINORMAL)) {
+        struct ureg_src input;
+        struct ureg_dst output;
+        input = build_vs_add_input(vs, NINE_DECLUSAGE_BINORMAL);
+        output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 22);
+        ureg_MOV(ureg, output, input);
+    }
+    if (key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
+        struct ureg_src input;
+        struct ureg_dst output;
+        input = build_vs_add_input(vs, NINE_DECLUSAGE_FOG);
+        input = ureg_scalar(input, TGSI_SWIZZLE_X);
+        output = oFog;
+        ureg_MOV(ureg, output, input);
+    }
+    if (key->passthrough & (1 << NINE_DECLUSAGE_DEPTH)) {
+        (void) 0; /* TODO: replace z of position output ? */
+    }
+
+
     if (key->position_t && device->driver_caps.window_space_position_support)
         ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
 
@@ -1270,10 +1312,18 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
             if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
                 key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
             }
-            if (key->ts[s].projected)
-                ureg_TXP(ureg, ps.rTex, target, ps.vT[s], ps.s[s]);
-            else
+            if (key->projected & (3 << (s *2))) {
+                unsigned dim = 1 + ((key->projected >> (2 * s)) & 3);
+                if (dim == 4)
+                    ureg_TXP(ureg, ps.rTex, target, ps.vT[s], ps.s[s]);
+                else {
+                    ureg_RCP(ureg, ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X), ureg_scalar(ps.vT[s], dim-1));
+                    ureg_MUL(ureg, ps.rTmp, _XXXX(ps.rTmpSrc), ps.vT[s]);
+                    ureg_TEX(ureg, ps.rTex, target, ps.rTmpSrc, ps.s[s]);
+                }
+            } else {
                 ureg_TEX(ureg, ps.rTex, target, ps.vT[s], ps.s[s]);
+            }
         }
 
         if (s == 0 &&
@@ -1316,6 +1366,10 @@ nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
             colorarg[2] != alphaarg[2])
             dst.WriteMask = TGSI_WRITEMASK_XYZ;
 
+        /* Special DOTPRODUCT behaviour (see wine tests) */
+        if (key->ts[s].colorop == D3DTOP_DOTPRODUCT3)
+            dst.WriteMask = TGSI_WRITEMASK_XYZW;
+
         if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
         if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
         if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
@@ -1406,12 +1460,18 @@ nine_ff_get_vs(struct NineDevice9 *device)
             else if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
                 s = usage / NINE_DECLUSAGE_COUNT;
                 if (s < 8)
-                    input_texture_coord[s] = 1;
+                    input_texture_coord[s] = nine_decltype_get_dim(state->vdecl->decls[i].Type);
                 else
                     DBG("FF given texture coordinate >= 8. Ignoring\n");
-            }
+            } else if (usage < NINE_DECLUSAGE_NONE)
+                key.passthrough |= 1 << usage;
         }
     }
+    /* ff vs + ps 3.0: some elements are passed to the ps (wine test).
+     * We do restrict to indices 0 */
+    key.passthrough &= ~((1 << NINE_DECLUSAGE_POSITION) | (1 << NINE_DECLUSAGE_PSIZE) |
+                         (1 << NINE_DECLUSAGE_TEXCOORD) | (1 << NINE_DECLUSAGE_POSITIONT) |
+                         (1 << NINE_DECLUSAGE_TESSFACTOR) | (1 << NINE_DECLUSAGE_SAMPLE));
     if (!key.vertexpointsize)
         key.pointscale = !!state->rs[D3DRS_POINTSCALEENABLE];
 
@@ -1427,6 +1487,7 @@ nine_ff_get_vs(struct NineDevice9 *device)
         key.mtl_specular = state->rs[D3DRS_SPECULARMATERIALSOURCE];
         key.mtl_emissive = state->rs[D3DRS_EMISSIVEMATERIALSOURCE];
     }
+    key.fog = !!state->rs[D3DRS_FOGENABLE];
     key.fog_mode = state->rs[D3DRS_FOGENABLE] ? state->rs[D3DRS_FOGVERTEXMODE] : 0;
     if (key.fog_mode)
         key.fog_range = !key.position_t && state->rs[D3DRS_RANGEFOGENABLE];
@@ -1448,7 +1509,7 @@ nine_ff_get_vs(struct NineDevice9 *device)
 
     for (s = 0; s < 8; ++s) {
         unsigned gen = (state->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
-        unsigned dim = MIN2(state->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7, 4);
+        unsigned dim;
 
         if (key.position_t && gen > NINED3DTSS_TCI_PASSTHRU)
             gen = NINED3DTSS_TCI_PASSTHRU;
@@ -1458,7 +1519,14 @@ nine_ff_get_vs(struct NineDevice9 *device)
 
         key.tc_gen |= gen << (s * 3);
         key.tc_idx |= (state->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7) << (s * 3);
-        key.tc_dim |= dim << (s * 3);
+        key.tc_dim_input |= ((input_texture_coord[s]-1) & 0x3) << (s * 2);
+
+        dim = state->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7;
+        if (dim > 4)
+            dim = input_texture_coord[s];
+        if (dim == 1) /* NV behaviour */
+            dim = 0;
+        key.tc_dim_output |= dim << (s * 3);
     }
 
     vs = util_hash_table_get(device->ff.ht_vs, &key);
@@ -1473,6 +1541,7 @@ nine_ff_get_vs(struct NineDevice9 *device)
         memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));
 
         err = util_hash_table_set(device->ff.ht_vs, &vs->ff_key, vs);
+        (void)err;
         assert(err == PIPE_OK);
         device->ff.num_vs++;
         NineUnknown_ConvertRefToBind(NineUnknown(vs));
@@ -1543,8 +1612,6 @@ nine_ff_get_ps(struct NineDevice9 *device)
         }
         key.ts[s].resultarg = state->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;
 
-        key.ts[s].projected = !!(state->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & D3DTTFF_PROJECTED);
-
         if (state->texture[s]) {
             switch (state->texture[s]->base.type) {
             case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
@@ -1558,10 +1625,14 @@ nine_ff_get_ps(struct NineDevice9 *device)
             key.ts[s].textarget = 1;
         }
     }
+
+    key.projected = nine_ff_get_projected_key(state);
+
     for (; s < 8; ++s)
         key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
     if (state->rs[D3DRS_FOGENABLE])
         key.fog_mode = state->rs[D3DRS_FOGTABLEMODE];
+    key.fog = !!state->rs[D3DRS_FOGENABLE];
 
     ps = util_hash_table_get(device->ff.ht_ps, &key);
     if (ps)
@@ -1573,6 +1644,7 @@ nine_ff_get_ps(struct NineDevice9 *device)
         memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));
 
         err = util_hash_table_set(device->ff.ht_ps, &ps->ff_key, ps);
+        (void)err;
         assert(err == PIPE_OK);
         device->ff.num_ps++;
         NineUnknown_ConvertRefToBind(NineUnknown(ps));
@@ -1689,7 +1761,6 @@ nine_ff_load_point_and_fog_params(struct NineDevice9 *device)
     if (isinf(dst[28].y))
         dst[28].y = 0.0f;
     dst[28].z = asfloat(state->rs[D3DRS_FOGDENSITY]);
-    d3dcolor_to_rgba(&dst[29].x, state->rs[D3DRS_FOGCOLOR]);
 }
 
 static void
@@ -1703,7 +1774,7 @@ nine_ff_load_tex_matrices(struct NineDevice9 *device)
         return;
     for (s = 0; s < 8; ++s) {
         if (IS_D3DTS_DIRTY(state, TEXTURE0 + s))
-            M[32 + s] = *nine_state_access_transform(state, D3DTS_TEXTURE0 + s, FALSE);
+            nine_d3d_matrix_transpose(&M[32 + s], nine_state_access_transform(state, D3DTS_TEXTURE0 + s, FALSE));
     }
 }
 
@@ -1762,28 +1833,22 @@ nine_ff_load_viewport_info(struct NineDevice9 *device)
 void
 nine_ff_update(struct NineDevice9 *device)
 {
-    struct pipe_context *pipe = device->pipe;
     struct nine_state *state = &device->state;
+    struct pipe_constant_buffer cb;
 
     DBG("vs=%p ps=%p\n", device->state.vs, device->state.ps);
 
     /* NOTE: the only reference belongs to the hash table */
-    if (!device->state.vs)
+    if (!device->state.vs) {
         device->ff.vs = nine_ff_get_vs(device);
-    if (!device->state.ps)
+        device->state.changed.group |= NINE_STATE_VS;
+    }
+    if (!device->state.ps) {
         device->ff.ps = nine_ff_get_ps(device);
+        device->state.changed.group |= NINE_STATE_PS;
+    }
 
     if (!device->state.vs) {
-        if (device->state.ff.clobber.vs_const) {
-            device->state.ff.clobber.vs_const = FALSE;
-            device->state.changed.group |=
-                NINE_STATE_FF_VSTRANSF |
-                NINE_STATE_FF_MATERIAL |
-                NINE_STATE_FF_LIGHTING |
-                NINE_STATE_FF_OTHER;
-            device->state.ff.changed.transform[0] |= 0xff000c;
-            device->state.ff.changed.transform[8] |= 0xff;
-        }
         nine_ff_load_vs_transforms(device);
         nine_ff_load_tex_matrices(device);
         nine_ff_load_lights(device);
@@ -1792,57 +1857,45 @@ nine_ff_update(struct NineDevice9 *device)
 
         memset(state->ff.changed.transform, 0, sizeof(state->ff.changed.transform));
 
-        device->state.changed.group |= NINE_STATE_VS;
-        device->state.changed.group |= NINE_STATE_VS_CONST;
-
-        if (device->prefer_user_constbuf) {
-            struct pipe_context *pipe = device->pipe;
-            struct pipe_constant_buffer cb;
-            cb.buffer_offset = 0;
-            cb.buffer = NULL;
-            cb.user_buffer = device->ff.vs_const;
-            cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
-            pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &cb);
-        } else {
-            struct pipe_box box;
-            u_box_1d(0, NINE_FF_NUM_VS_CONST * 4 * sizeof(float), &box);
-            pipe->transfer_inline_write(pipe, device->constbuf_vs, 0,
-                                        0, &box,
-                                        device->ff.vs_const, 0, 0);
-            nine_ranges_insert(&device->state.changed.vs_const_f, 0, NINE_FF_NUM_VS_CONST,
-                               &device->range_pool);
+        cb.buffer_offset = 0;
+        cb.buffer = NULL;
+        cb.user_buffer = device->ff.vs_const;
+        cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
+
+        if (!device->driver_caps.user_cbufs) {
+            u_upload_data(device->constbuf_uploader,
+                          0,
+                          cb.buffer_size,
+                          cb.user_buffer,
+                          &cb.buffer_offset,
+                          &cb.buffer);
+            u_upload_unmap(device->constbuf_uploader);
+            cb.user_buffer = NULL;
         }
+        state->pipe.cb_vs_ff = cb;
+        state->commit |= NINE_STATE_COMMIT_CONST_VS;
     }
 
     if (!device->state.ps) {
-        if (device->state.ff.clobber.ps_const) {
-            device->state.ff.clobber.ps_const = FALSE;
-            device->state.changed.group |=
-                NINE_STATE_FF_PSSTAGES |
-                NINE_STATE_FF_OTHER;
-        }
         nine_ff_load_ps_params(device);
 
-        device->state.changed.group |= NINE_STATE_PS;
-        device->state.changed.group |= NINE_STATE_PS_CONST;
-
-        if (device->prefer_user_constbuf) {
-            struct pipe_context *pipe = device->pipe;
-            struct pipe_constant_buffer cb;
-            cb.buffer_offset = 0;
-            cb.buffer = NULL;
-            cb.user_buffer = device->ff.ps_const;
-            cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
-            pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &cb);
-        } else {
-            struct pipe_box box;
-            u_box_1d(0, NINE_FF_NUM_PS_CONST * 4 * sizeof(float), &box);
-            pipe->transfer_inline_write(pipe, device->constbuf_ps, 0,
-                                        0, &box,
-                                        device->ff.ps_const, 0, 0);
-            nine_ranges_insert(&device->state.changed.ps_const_f, 0, NINE_FF_NUM_PS_CONST,
-                               &device->range_pool);
+        cb.buffer_offset = 0;
+        cb.buffer = NULL;
+        cb.user_buffer = device->ff.ps_const;
+        cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
+
+        if (!device->driver_caps.user_cbufs) {
+            u_upload_data(device->constbuf_uploader,
+                          0,
+                          cb.buffer_size,
+                          cb.user_buffer,
+                          &cb.buffer_offset,
+                          &cb.buffer);
+            u_upload_unmap(device->constbuf_uploader);
+            cb.user_buffer = NULL;
         }
+        state->pipe.cb_ps_ff = cb;
+        state->commit |= NINE_STATE_COMMIT_CONST_PS;
     }
 
     device->state.changed.group &= ~NINE_STATE_FF;
diff --git a/src/gallium/state_trackers/nine/nine_ff.h b/src/gallium/state_trackers/nine/nine_ff.h
index 7cefa65b1c4..9c33c76370d 100644
--- a/src/gallium/state_trackers/nine/nine_ff.h
+++ b/src/gallium/state_trackers/nine/nine_ff.h
@@ -3,6 +3,7 @@
 #define _NINE_FF_H_
 
 #include "device9.h"
+#include "vertexdeclaration9.h"
 
 boolean nine_ff_init(struct NineDevice9 *);
 void    nine_ff_fini(struct NineDevice9 *);
@@ -29,4 +30,84 @@ nine_d3d_matrix_inverse_3x3(D3DMATRIX *, const D3DMATRIX *);
 void
 nine_d3d_matrix_transpose(D3DMATRIX *, const D3DMATRIX *);
 
+#define NINED3DTSS_TCI_DISABLE                       0
+#define NINED3DTSS_TCI_PASSTHRU                      1
+#define NINED3DTSS_TCI_CAMERASPACENORMAL             2
+#define NINED3DTSS_TCI_CAMERASPACEPOSITION           3
+#define NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR   4
+#define NINED3DTSS_TCI_SPHEREMAP                     5
+
+static inline unsigned
+nine_decltype_get_dim(BYTE type)
+{
+    switch (type) {
+    case D3DDECLTYPE_FLOAT1: return 1;
+    case D3DDECLTYPE_FLOAT2: return 2;
+    case D3DDECLTYPE_FLOAT3: return 3;
+    case D3DDECLTYPE_FLOAT4: return 4;
+    case D3DDECLTYPE_D3DCOLOR: return 1;
+    case D3DDECLTYPE_UBYTE4: return 4;
+    case D3DDECLTYPE_SHORT2: return 2;
+    case D3DDECLTYPE_SHORT4: return 4;
+    case D3DDECLTYPE_UBYTE4N: return 4;
+    case D3DDECLTYPE_SHORT2N: return 2;
+    case D3DDECLTYPE_SHORT4N: return 4;
+    case D3DDECLTYPE_USHORT2N: return 2;
+    case D3DDECLTYPE_USHORT4N: return 4;
+    case D3DDECLTYPE_UDEC3: return 3;
+    case D3DDECLTYPE_DEC3N: return 3;
+    case D3DDECLTYPE_FLOAT16_2: return 2;
+    case D3DDECLTYPE_FLOAT16_4: return 4;
+    default:
+        assert(!"Implementation error !");
+    }
+    return 0;
+}
+
+static inline uint16_t
+nine_ff_get_projected_key(struct nine_state *state)
+{
+    unsigned s, i;
+    uint16_t projected = 0;
+    char input_texture_coord[8];
+    memset(&input_texture_coord, 0, sizeof(input_texture_coord));
+
+    if (state->vdecl) {
+        for (i = 0; i < state->vdecl->nelems; i++) {
+            uint16_t usage = state->vdecl->usage_map[i];
+            if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
+                s = usage / NINE_DECLUSAGE_COUNT;
+                if (s < 8)
+                    input_texture_coord[s] = nine_decltype_get_dim(state->vdecl->decls[i].Type);
+            }
+        }
+    }
+
+    for (s = 0; s < 8; ++s) {
+        unsigned gen = (state->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
+        unsigned dim = state->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7;
+        unsigned proj = !!(state->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & D3DTTFF_PROJECTED);
+
+        if (!state->vs) {
+            if (dim > 4)
+                dim = input_texture_coord[s];
+
+            if (!dim && gen == NINED3DTSS_TCI_PASSTHRU)
+                dim = input_texture_coord[s];
+            else if (!dim)
+                dim = 4;
+
+            if (dim == 1) /* NV behaviour */
+                proj = 0;
+            if (dim > input_texture_coord[s] && gen == NINED3DTSS_TCI_PASSTHRU)
+                proj = 0;
+        } else {
+            dim = 4;
+        }
+        if (proj)
+            projected |= (dim-1) << (2 * s);
+    }
+    return projected;
+}
+
 #endif /* _NINE_FF_H_ */
diff --git a/src/gallium/state_trackers/nine/nine_pipe.c b/src/gallium/state_trackers/nine/nine_pipe.c
index 4cf37b9f59c..2be30f7e097 100644
--- a/src/gallium/state_trackers/nine/nine_pipe.c
+++ b/src/gallium/state_trackers/nine/nine_pipe.c
@@ -27,7 +27,8 @@
 #include "cso_cache/cso_context.h"
 
 void
-nine_convert_dsa_state(struct cso_context *ctx, const DWORD *rs)
+nine_convert_dsa_state(struct pipe_depth_stencil_alpha_state *dsa_state,
+                       const DWORD *rs)
 {
     struct pipe_depth_stencil_alpha_state dsa;
 
@@ -65,16 +66,15 @@ nine_convert_dsa_state(struct cso_context *ctx, const DWORD *rs)
         dsa.alpha.ref_value = (float)rs[D3DRS_ALPHAREF] / 255.0f;
     }
 
-    cso_set_depth_stencil_alpha(ctx, &dsa);
+    *dsa_state = dsa;
 }
 
-/* TODO: Keep a static copy in device so we don't have to memset every time ? */
 void
-nine_convert_rasterizer_state(struct cso_context *ctx, const DWORD *rs)
+nine_convert_rasterizer_state(struct pipe_rasterizer_state *rast_state, const DWORD *rs)
 {
     struct pipe_rasterizer_state rast;
 
-    memset(&rast, 0, sizeof(rast)); /* memcmp safety */
+    memset(&rast, 0, sizeof(rast));
 
     rast.flatshade = rs[D3DRS_SHADEMODE] == D3DSHADE_FLAT;
  /* rast.light_twoside = 0; */
@@ -92,7 +92,7 @@ nine_convert_rasterizer_state(struct cso_context *ctx, const DWORD *rs)
  /* rast.poly_stipple_enable = 0; */
  /* rast.point_smooth = 0; */
     rast.sprite_coord_mode = PIPE_SPRITE_COORD_UPPER_LEFT;
-    rast.point_quad_rasterization = !!rs[D3DRS_POINTSPRITEENABLE];
+    rast.point_quad_rasterization = 1;
     rast.point_size_per_vertex = rs[NINED3DRS_VSPOINTSIZE];
     rast.multisample = !!rs[D3DRS_MULTISAMPLEANTIALIAS];
     rast.line_smooth = !!rs[D3DRS_ANTIALIASEDLINEENABLE];
@@ -110,12 +110,28 @@ nine_convert_rasterizer_state(struct cso_context *ctx, const DWORD *rs)
  /* rast.line_stipple_pattern = 0; */
     rast.sprite_coord_enable = rs[D3DRS_POINTSPRITEENABLE] ? 0xff : 0x00;
     rast.line_width = 1.0f;
-    rast.point_size = rs[NINED3DRS_VSPOINTSIZE] ? 1.0f : asfloat(rs[D3DRS_POINTSIZE]); /* XXX: D3DRS_POINTSIZE_MIN/MAX */
-    rast.offset_units = asfloat(rs[D3DRS_DEPTHBIAS]) * asfloat(rs[NINED3DRS_ZBIASSCALE]);
+    if (rs[NINED3DRS_VSPOINTSIZE]) {
+        rast.point_size = 1.0f;
+    } else {
+        rast.point_size = CLAMP(asfloat(rs[D3DRS_POINTSIZE]),
+                asfloat(rs[D3DRS_POINTSIZE_MIN]),
+                asfloat(rs[D3DRS_POINTSIZE_MAX]));
+    }
+    /* offset_units has the ogl/d3d11 meaning.
+     * d3d9: offset = scale * dz + bias
+     * ogl/d3d11: offset = scale * dz + r * bias
+     * with r implementation dependant and is supposed to be
+     * the smallest value the depth buffer format can hold.
+     * In practice on current and past hw it seems to be 2^-23
+     * for all formats except float formats where it varies depending
+     * on the content.
+     * For now use 1 << 23, but in the future perhaps add a way in gallium
+     * to get r for the format or get the gallium behaviour */
+    rast.offset_units = asfloat(rs[D3DRS_DEPTHBIAS]) * (float)(1 << 23);
     rast.offset_scale = asfloat(rs[D3DRS_SLOPESCALEDEPTHBIAS]);
  /* rast.offset_clamp = 0.0f; */
 
-    cso_set_rasterizer(ctx, &rast);
+    *rast_state = rast;
 }
 
 static inline void
@@ -137,7 +153,7 @@ nine_convert_blend_state_fixup(struct pipe_blend_state *blend, const DWORD *rs)
 }
 
 void
-nine_convert_blend_state(struct cso_context *ctx, const DWORD *rs)
+nine_convert_blend_state(struct pipe_blend_state *blend_state, const DWORD *rs)
 {
     struct pipe_blend_state blend;
 
@@ -181,7 +197,7 @@ nine_convert_blend_state(struct cso_context *ctx, const DWORD *rs)
 
     /* blend.force_srgb = !!rs[D3DRS_SRGBWRITEENABLE]; */
 
-    cso_set_blend(ctx, &blend);
+    *blend_state = blend;
 }
 
 void
@@ -239,8 +255,8 @@ nine_pipe_context_clear(struct NineDevice9 *This)
     cso_set_samplers(cso, PIPE_SHADER_VERTEX, 0, NULL);
     cso_set_samplers(cso, PIPE_SHADER_FRAGMENT, 0, NULL);
 
-    pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, 0, NULL);
-    pipe->set_sampler_views(pipe, PIPE_SHADER_VERTEX, 0, 0, NULL);
+    cso_set_sampler_views(cso, PIPE_SHADER_VERTEX, 0, NULL);
+    cso_set_sampler_views(cso, PIPE_SHADER_FRAGMENT, 0, NULL);
 
     pipe->set_vertex_buffers(pipe, 0, This->caps.MaxStreams, NULL);
     pipe->set_index_buffer(pipe, NULL);
diff --git a/src/gallium/state_trackers/nine/nine_pipe.h b/src/gallium/state_trackers/nine/nine_pipe.h
index 43a7737cdf9..86117866ed5 100644
--- a/src/gallium/state_trackers/nine/nine_pipe.h
+++ b/src/gallium/state_trackers/nine/nine_pipe.h
@@ -27,6 +27,7 @@
 #include "pipe/p_format.h"
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h" /* pipe_box */
+#include "util/macros.h"
 #include "util/u_rect.h"
 #include "util/u_format.h"
 #include "nine_helpers.h"
@@ -36,9 +37,9 @@ struct cso_context;
 extern const enum pipe_format nine_d3d9_to_pipe_format_map[120];
 extern const D3DFORMAT nine_pipe_to_d3d9_format_map[PIPE_FORMAT_COUNT];
 
-void nine_convert_dsa_state(struct cso_context *, const DWORD *);
-void nine_convert_rasterizer_state(struct cso_context *, const DWORD *);
-void nine_convert_blend_state(struct cso_context *, const DWORD *);
+void nine_convert_dsa_state(struct pipe_depth_stencil_alpha_state *, const DWORD *);
+void nine_convert_rasterizer_state(struct pipe_rasterizer_state *, const DWORD *);
+void nine_convert_blend_state(struct pipe_blend_state *, const DWORD *);
 void nine_convert_sampler_state(struct cso_context *, int idx, const DWORD *);
 
 void nine_pipe_context_clear(struct NineDevice9 *);
@@ -81,6 +82,49 @@ rect_to_pipe_box(struct pipe_box *dst, const RECT *src)
     dst->depth = 1;
 }
 
+static inline void
+pipe_box_to_rect(RECT *dst, const struct pipe_box *src)
+{
+    dst->left = src->x;
+    dst->right = src->x + src->width;
+    dst->top = src->y;
+    dst->bottom = src->y + src->height;
+}
+
+static inline void
+rect_minify_inclusive(RECT *rect)
+{
+    rect->left = rect->left >> 2;
+    rect->top = rect->top >> 2;
+    rect->right = DIV_ROUND_UP(rect->right, 2);
+    rect->bottom = DIV_ROUND_UP(rect->bottom, 2);
+}
+
+/* We suppose:
+ * 0 <= rect->left < rect->right
+ * 0 <= rect->top < rect->bottom
+ */
+static inline void
+fit_rect_format_inclusive(enum pipe_format format, RECT *rect, int width, int height)
+{
+    const unsigned w = util_format_get_blockwidth(format);
+    const unsigned h = util_format_get_blockheight(format);
+
+    if (util_format_is_compressed(format)) {
+        rect->left = rect->left - rect->left % w;
+        rect->top = rect->top - rect->top % h;
+        rect->right = (rect->right % w) == 0 ?
+            rect->right :
+            rect->right - (rect->right % w) + w;
+        rect->bottom = (rect->bottom % h) == 0 ?
+            rect->bottom :
+            rect->bottom - (rect->bottom % h) + h;
+    }
+
+    rect->right = MIN2(rect->right, width);
+    rect->bottom = MIN2(rect->bottom, height);
+}
+
 static inline boolean
 rect_to_pipe_box_clamp(struct pipe_box *dst, const RECT *src)
 {
@@ -164,6 +208,23 @@ pipe_to_d3d9_format(enum pipe_format format)
     return nine_pipe_to_d3d9_format_map[format];
 }
 
+/* ATI1 and ATI2 are not officially compressed in d3d9 */
+static inline boolean
+compressed_format( D3DFORMAT fmt )
+{
+    switch (fmt) {
+    case D3DFMT_DXT1:
+    case D3DFMT_DXT2:
+    case D3DFMT_DXT3:
+    case D3DFMT_DXT4:
+    case D3DFMT_DXT5:
+        return TRUE;
+    default:
+        break;
+    }
+    return FALSE;
+}
+
 static inline boolean
 depth_stencil_format( D3DFORMAT fmt )
 {
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index 754f5af6b8e..28f27870dc8 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -89,6 +89,15 @@ static inline const char *d3dsio_to_string(unsigned opcode);
 #define NINE_SWIZZLE4(x,y,z,w) \
    TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w
 
+#define NINE_CONSTANT_SRC(index) \
+   ureg_src_register(TGSI_FILE_CONSTANT, index)
+
+#define NINE_APPLY_SWIZZLE(src, s) \
+   ureg_swizzle(src, NINE_SWIZZLE4(s, s, s, s))
+
+#define NINE_CONSTANT_SRC_SWIZZLE(index, s) \
+   NINE_APPLY_SWIZZLE(NINE_CONSTANT_SRC(index), s)
+
 #define NINED3DSPDM_SATURATE (D3DSPDM_SATURATE >> D3DSP_DSTMOD_SHIFT)
 #define NINED3DSPDM_PARTIALP (D3DSPDM_PARTIALPRECISION >> D3DSP_DSTMOD_SHIFT)
 #define NINED3DSPDM_CENTROID (D3DSPDM_MSAMPCENTROID >> D3DSP_DSTMOD_SHIFT)
@@ -444,6 +453,9 @@ struct shader_translator
         BYTE minor;
     } version;
     unsigned processor; /* TGSI_PROCESSOR_VERTEX/FRAMGENT */
+    unsigned num_constf_allowed;
+    unsigned num_consti_allowed;
+    unsigned num_constb_allowed;
 
     boolean native_integers;
     boolean inline_subroutines;
@@ -505,7 +517,6 @@ struct shader_translator
 
 #define IS_VS (tx->processor == TGSI_PROCESSOR_VERTEX)
 #define IS_PS (tx->processor == TGSI_PROCESSOR_FRAGMENT)
-#define NINE_MAX_CONST_F_SHADER (tx->processor == TGSI_PROCESSOR_VERTEX ? NINE_MAX_CONST_F : NINE_MAX_CONST_F_PS3)
 
 #define FAILURE_VOID(cond) if ((cond)) {tx->failure=1;return;}
 
@@ -528,7 +539,7 @@ static boolean
 tx_lconstf(struct shader_translator *tx, struct ureg_src *src, INT index)
 {
    INT i;
-   if (index < 0 || index >= NINE_MAX_CONST_F_SHADER) {
+   if (index < 0 || index >= tx->num_constf_allowed) {
        tx->failure = TRUE;
        return FALSE;
    }
@@ -543,7 +554,7 @@ tx_lconstf(struct shader_translator *tx, struct ureg_src *src, INT index)
 static boolean
 tx_lconsti(struct shader_translator *tx, struct ureg_src *src, INT index)
 {
-   if (index < 0 || index >= NINE_MAX_CONST_I) {
+   if (index < 0 || index >= tx->num_consti_allowed) {
        tx->failure = TRUE;
        return FALSE;
    }
@@ -554,7 +565,7 @@ tx_lconsti(struct shader_translator *tx, struct ureg_src *src, INT index)
 static boolean
 tx_lconstb(struct shader_translator *tx, struct ureg_src *src, INT index)
 {
-   if (index < 0 || index >= NINE_MAX_CONST_B) {
+   if (index < 0 || index >= tx->num_constb_allowed) {
        tx->failure = TRUE;
        return FALSE;
    }
@@ -568,9 +579,7 @@ tx_set_lconstf(struct shader_translator *tx, INT index, float f[4])
 {
     unsigned n;
 
-    FAILURE_VOID(index < 0 || index >= NINE_MAX_CONST_F_SHADER)
-    if (IS_VS && index >= NINE_MAX_CONST_F_SHADER)
-        WARN("lconstf index %i too high, indirect access won't work\n", index);
+    FAILURE_VOID(index < 0 || index >= tx->num_constf_allowed)
 
     for (n = 0; n < tx->num_lconstf; ++n)
         if (tx->lconstf[n].idx == index)
@@ -592,7 +601,7 @@ tx_set_lconstf(struct shader_translator *tx, INT index, float f[4])
 static void
 tx_set_lconsti(struct shader_translator *tx, INT index, int i[4])
 {
-    FAILURE_VOID(index < 0 || index >= NINE_MAX_CONST_I)
+    FAILURE_VOID(index < 0 || index >= tx->num_consti_allowed)
     tx->lconsti[index].idx = index;
     tx->lconsti[index].reg = tx->native_integers ?
        ureg_imm4i(tx->ureg, i[0], i[1], i[2], i[3]) :
@@ -601,7 +610,7 @@ tx_set_lconsti(struct shader_translator *tx, INT index, int i[4])
 static void
 tx_set_lconstb(struct shader_translator *tx, INT index, BOOL b)
 {
-    FAILURE_VOID(index < 0 || index >= NINE_MAX_CONST_B)
+    FAILURE_VOID(index < 0 || index >= tx->num_constb_allowed)
     tx->lconstb[index].idx = index;
     tx->lconstb[index].reg = tx->native_integers ?
        ureg_imm1u(tx->ureg, b ? 0xffffffff : 0) :
@@ -672,6 +681,54 @@ tx_pred_alloc(struct shader_translator *tx, INT idx)
         tx->regs.p = ureg_DECL_predicate(tx->ureg);
 }
 
+/* NOTE: It's not very clear on which ps1.1-ps1.3 instructions
+ * the projection should be applied on the texture. It doesn't
+ * apply on texkill.
+ * The doc is very imprecise here (it says the projection is done
+ * before rasterization, thus in vs, which seems wrong since ps instructions
+ * are affected differently)
+ * For now we only apply to the ps TEX instruction and TEXBEM.
+ * Perhaps some other instructions would need it */
+static inline void
+apply_ps1x_projection(struct shader_translator *tx, struct ureg_dst dst,
+                      struct ureg_src src, INT idx)
+{
+    struct ureg_dst tmp;
+    unsigned dim = 1 + ((tx->info->projected >> (2 * idx)) & 3);
+
+    /* no projection */
+    if (dim == 1) {
+        ureg_MOV(tx->ureg, dst, src);
+    } else {
+        tmp = tx_scratch_scalar(tx);
+        ureg_RCP(tx->ureg, tmp, ureg_scalar(src, dim-1));
+        ureg_MUL(tx->ureg, dst, tx_src_scalar(tmp), src);
+    }
+}
+
+static inline void
+TEX_with_ps1x_projection(struct shader_translator *tx, struct ureg_dst dst,
+                         unsigned target, struct ureg_src src0,
+                         struct ureg_src src1, INT idx)
+{
+    unsigned dim = 1 + ((tx->info->projected >> (2 * idx)) & 3);
+    struct ureg_dst tmp;
+
+    /* dim == 1: no projection
+     * Looks like must be disabled when it makes no
+     * sense according the texture dimensions
+     */
+    if (dim == 1 || dim <= target) {
+        ureg_TEX(tx->ureg, dst, target, src0, src1);
+    } else if (dim == 4) {
+        ureg_TXP(tx->ureg, dst, target, src0, src1);
+    } else {
+        tmp = tx_scratch(tx);
+        apply_ps1x_projection(tx, tmp, src0, idx);
+        ureg_TEX(tx->ureg, dst, target, ureg_src(tmp), src1);
+    }
+}
+
 static inline void
 tx_texcoord_alloc(struct shader_translator *tx, INT idx)
 {
@@ -1086,9 +1143,18 @@ _tx_dst_param(struct shader_translator *tx, const struct sm1_dst_param *param)
         assert(param->idx >= 0 && param->idx < 4);
         assert(!param->rel);
         tx->info->rt_mask |= 1 << param->idx;
-        if (ureg_dst_is_undef(tx->regs.oCol[param->idx]))
-            tx->regs.oCol[param->idx] =
-               ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_COLOR, param->idx);
+        if (ureg_dst_is_undef(tx->regs.oCol[param->idx])) {
+            /* ps < 3: oCol[0] will have fog blending afterward
+             * vs < 3: oD1.w (D3DPMISCCAPS_FOGANDSPECULARALPHA) set to 0 even if set */
+            if (!IS_VS && tx->version.major < 3 && param->idx == 0) {
+                tx->regs.oCol[0] = ureg_DECL_temporary(tx->ureg);
+            } else if (IS_VS && tx->version.major < 3 && param->idx == 1) {
+                tx->regs.oCol[1] = ureg_DECL_temporary(tx->ureg);
+            } else {
+                tx->regs.oCol[param->idx] =
+                    ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_COLOR, param->idx);
+            }
+        }
         dst = tx->regs.oCol[param->idx];
         if (IS_VS && tx->version.major < 3)
             dst = ureg_saturate(dst);
@@ -1824,7 +1890,7 @@ sm1_declusage_to_tgsi(struct tgsi_declaration_semantic *sem,
         sem->Index = 0;
         break;
     default:
-        assert(!"Invalid DECLUSAGE.");
+        unreachable(!"Invalid DECLUSAGE.");
         break;
     }
 }
@@ -2135,12 +2201,79 @@ DECL_SPECIAL(TEXKILL)
 
 DECL_SPECIAL(TEXBEM)
 {
-    STUB(D3DERR_INVALIDCALL);
-}
+    struct ureg_program *ureg = tx->ureg;
+    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
+    struct ureg_dst tmp, tmp2, texcoord;
+    struct ureg_src sample, m00, m01, m10, m11;
+    struct ureg_src bumpenvlscale, bumpenvloffset;
+    const int m = tx->insn.dst[0].idx;
+    const int n = tx->insn.src[0].idx;
 
-DECL_SPECIAL(TEXBEML)
-{
-    STUB(D3DERR_INVALIDCALL);
+    assert(tx->version.major == 1);
+
+    sample = ureg_DECL_sampler(ureg, m);
+    tx->info->sampler_mask |= 1 << m;
+
+    tx_texcoord_alloc(tx, m);
+
+    tmp = tx_scratch(tx);
+    tmp2 = tx_scratch(tx);
+    texcoord = tx_scratch(tx);
+    /*
+     * Bump-env-matrix:
+     * 00 is X
+     * 01 is Y
+     * 10 is Z
+     * 11 is W
+     */
+    nine_info_mark_const_f_used(tx->info, 8 + 8 + m/2);
+    m00 = NINE_CONSTANT_SRC_SWIZZLE(8 + m, X);
+    m01 = NINE_CONSTANT_SRC_SWIZZLE(8 + m, Y);
+    m10 = NINE_CONSTANT_SRC_SWIZZLE(8 + m, Z);
+    m11 = NINE_CONSTANT_SRC_SWIZZLE(8 + m, W);
+
+    /* These two attributes are packed as X=scale0 Y=offset0 Z=scale1 W=offset1 etc */
+    if (m % 2 == 0) {
+        bumpenvlscale = NINE_CONSTANT_SRC_SWIZZLE(8 + 8 + m / 2, X);
+        bumpenvloffset = NINE_CONSTANT_SRC_SWIZZLE(8 + 8 + m / 2, Y);
+    } else {
+        bumpenvlscale = NINE_CONSTANT_SRC_SWIZZLE(8 + 8 + m / 2, Z);
+        bumpenvloffset = NINE_CONSTANT_SRC_SWIZZLE(8 + 8 + m / 2, W);
+    }
+
+    apply_ps1x_projection(tx, texcoord, tx->regs.vT[m], m);
+
+    /* u' = TextureCoordinates(stage m)u + D3DTSS_BUMPENVMAT00(stage m)*t(n)R  */
+    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m00,
+             NINE_APPLY_SWIZZLE(ureg_src(tx->regs.tS[n]), X), ureg_src(texcoord));
+    /* u' = u' + D3DTSS_BUMPENVMAT10(stage m)*t(n)G */
+    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m10,
+             NINE_APPLY_SWIZZLE(ureg_src(tx->regs.tS[n]), Y),
+             NINE_APPLY_SWIZZLE(ureg_src(tmp), X));
+
+    /* v' = TextureCoordinates(stage m)v + D3DTSS_BUMPENVMAT01(stage m)*t(n)R */
+    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m01,
+             NINE_APPLY_SWIZZLE(ureg_src(tx->regs.tS[n]), X), ureg_src(texcoord));
+    /* v' = v' + D3DTSS_BUMPENVMAT11(stage m)*t(n)G*/
+    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m11,
+             NINE_APPLY_SWIZZLE(ureg_src(tx->regs.tS[n]), Y),
+             NINE_APPLY_SWIZZLE(ureg_src(tmp), Y));
+
+    /* Now the texture coordinates are in tmp.xy */
+
+    if (tx->insn.opcode == D3DSIO_TEXBEM) {
+        ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), ureg_src(tmp), sample);
+    } else if (tx->insn.opcode == D3DSIO_TEXBEML) {
+        /* t(m)RGBA = t(m)RGBA * [(t(n)B * D3DTSS_BUMPENVLSCALE(stage m)) + D3DTSS_BUMPENVLOFFSET(stage m)] */
+        ureg_TEX(ureg, tmp, ps1x_sampler_type(tx->info, m), ureg_src(tmp), sample);
+        ureg_MAD(ureg, tmp2, NINE_APPLY_SWIZZLE(ureg_src(tx->regs.tS[n]), Z),
+                 bumpenvlscale, bumpenvloffset);
+        ureg_MUL(ureg, dst, ureg_src(tmp), ureg_src(tmp2));
+    }
+
+    tx->info->bumpenvmat_needed = 1;
+
+    return D3D_OK;
 }
 
 DECL_SPECIAL(TEXREG2AR)
@@ -2421,7 +2554,43 @@ DECL_SPECIAL(TEXDEPTH)
 
 DECL_SPECIAL(BEM)
 {
-    STUB(D3DERR_INVALIDCALL);
+    struct ureg_program *ureg = tx->ureg;
+    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
+    struct ureg_src src0 = tx_src_param(tx, &tx->insn.src[0]);
+    struct ureg_src src1 = tx_src_param(tx, &tx->insn.src[1]);
+    struct ureg_src m00, m01, m10, m11;
+    const int m = tx->insn.dst[0].idx;
+    struct ureg_dst tmp;
+    /*
+     * Bump-env-matrix:
+     * 00 is X
+     * 01 is Y
+     * 10 is Z
+     * 11 is W
+     */
+    nine_info_mark_const_f_used(tx->info, 8 + m);
+    m00 = NINE_CONSTANT_SRC_SWIZZLE(8 + m, X);
+    m01 = NINE_CONSTANT_SRC_SWIZZLE(8 + m, Y);
+    m10 = NINE_CONSTANT_SRC_SWIZZLE(8 + m, Z);
+    m11 = NINE_CONSTANT_SRC_SWIZZLE(8 + m, W);
+    /* dest.r = src0.r + D3DTSS_BUMPENVMAT00(stage n) * src1.r  */
+    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m00,
+             NINE_APPLY_SWIZZLE(src1, X), NINE_APPLY_SWIZZLE(src0, X));
+    /* dest.r = dest.r + D3DTSS_BUMPENVMAT10(stage n) * src1.g; */
+    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m10,
+             NINE_APPLY_SWIZZLE(src1, Y), NINE_APPLY_SWIZZLE(ureg_src(tmp), X));
+
+    /* dest.g = src0.g + D3DTSS_BUMPENVMAT01(stage n) * src1.r */
+    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m01,
+             NINE_APPLY_SWIZZLE(src1, X), src0);
+    /* dest.g = dest.g + D3DTSS_BUMPENVMAT11(stage n) * src1.g */
+    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m11,
+             NINE_APPLY_SWIZZLE(src1, Y), NINE_APPLY_SWIZZLE(ureg_src(tmp), Y));
+    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XY), ureg_src(tmp));
+
+    tx->info->bumpenvmat_needed = 1;
+
+    return D3D_OK;
 }
 
 DECL_SPECIAL(TEXLD)
@@ -2482,7 +2651,7 @@ DECL_SPECIAL(TEX)
     src[1] = ureg_DECL_sampler(ureg, s);
     tx->info->sampler_mask |= 1 << s;
 
-    ureg_TEX(ureg, dst, t, src[0], src[1]);
+    TEX_with_ps1x_projection(tx, dst, t, src[0], src[1], s);
 
     return D3D_OK;
 }
@@ -2616,7 +2785,7 @@ struct sm1_op_info inst_table[] =
     _OPI(TEX,          TEX, V(0,0), V(0,0), V(1,4), V(1,4), 1, 1, SPECIAL(TEXLD_14)),
     _OPI(TEX,          TEX, V(0,0), V(0,0), V(2,0), V(3,0), 1, 2, SPECIAL(TEXLD)),
     _OPI(TEXBEM,       TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXBEM)),
-    _OPI(TEXBEML,      TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXBEML)),
+    _OPI(TEXBEML,      TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXBEM)),
     _OPI(TEXREG2AR,    TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXREG2AR)),
     _OPI(TEXREG2GB,    TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXREG2GB)),
     _OPI(TEXM3x2PAD,   TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x2PAD)),
@@ -3023,6 +3192,8 @@ tx_ctor(struct shader_translator *tx, struct nine_shader_info *info)
     info->lconstf.data = NULL;
     info->lconstf.ranges = NULL;
 
+    info->bumpenvmat_needed = 0;
+
     for (i = 0; i < Elements(tx->regs.rL); ++i) {
         tx->regs.rL[i] = ureg_dst_undef();
     }
@@ -3074,6 +3245,57 @@ tgsi_processor_from_type(unsigned shader_type)
     }
 }
 
+static void
+shader_add_ps_fog_stage(struct shader_translator *tx, struct ureg_src src_col)
+{
+    struct ureg_program *ureg = tx->ureg;
+    struct ureg_dst oCol0 = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
+    struct ureg_src fog_end, fog_coeff, fog_density;
+    struct ureg_src fog_vs, depth, fog_color;
+    struct ureg_dst fog_factor;
+
+    if (!tx->info->fog_enable) {
+        ureg_MOV(ureg, oCol0, src_col);
+        return;
+    }
+
+    if (tx->info->fog_mode != D3DFOG_NONE)
+        depth = ureg_scalar(ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                              TGSI_INTERPOLATE_LINEAR),
+                                              TGSI_SWIZZLE_Z);
+
+    nine_info_mark_const_f_used(tx->info, 33);
+    fog_color = NINE_CONSTANT_SRC(32);
+    fog_factor = tx_scratch_scalar(tx);
+
+    if (tx->info->fog_mode == D3DFOG_LINEAR) {
+        fog_end = NINE_CONSTANT_SRC_SWIZZLE(33, X);
+        fog_coeff = NINE_CONSTANT_SRC_SWIZZLE(33, Y);
+        ureg_SUB(ureg, fog_factor, fog_end, depth);
+        ureg_MUL(ureg, ureg_saturate(fog_factor), tx_src_scalar(fog_factor), fog_coeff);
+    } else if (tx->info->fog_mode == D3DFOG_EXP) {
+        fog_density = NINE_CONSTANT_SRC_SWIZZLE(33, X);
+        ureg_MUL(ureg, fog_factor, depth, fog_density);
+        ureg_MUL(ureg, fog_factor, tx_src_scalar(fog_factor), ureg_imm1f(ureg, -1.442695f));
+        ureg_EX2(ureg, fog_factor, tx_src_scalar(fog_factor));
+    } else if (tx->info->fog_mode == D3DFOG_EXP2) {
+        fog_density = NINE_CONSTANT_SRC_SWIZZLE(33, X);
+        ureg_MUL(ureg, fog_factor, depth, fog_density);
+        ureg_MUL(ureg, fog_factor, tx_src_scalar(fog_factor), tx_src_scalar(fog_factor));
+        ureg_MUL(ureg, fog_factor, tx_src_scalar(fog_factor), ureg_imm1f(ureg, -1.442695f));
+        ureg_EX2(ureg, fog_factor, tx_src_scalar(fog_factor));
+    } else {
+        fog_vs = ureg_scalar(ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_FOG, 0,
+                                            TGSI_INTERPOLATE_PERSPECTIVE),
+                                            TGSI_SWIZZLE_X);
+        ureg_MOV(ureg, fog_factor, fog_vs);
+    }
+
+    ureg_LRP(ureg, ureg_writemask(oCol0, TGSI_WRITEMASK_XYZ),
+             tx_src_scalar(fog_factor), src_col, fog_color);
+    ureg_MOV(ureg, ureg_writemask(oCol0, TGSI_WRITEMASK_W), src_col);
+}
+
 #define GET_CAP(n) device->screen->get_param( \
       device->screen, PIPE_CAP_##n)
 #define GET_SHADER_CAP(n) device->screen->get_shader_param( \
@@ -3123,6 +3345,24 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
     tx->texcoord_sn = tx->want_texcoord ?
         TGSI_SEMANTIC_TEXCOORD : TGSI_SEMANTIC_GENERIC;
 
+    if (IS_VS) {
+        tx->num_constf_allowed = NINE_MAX_CONST_F;
+    } else if (tx->version.major < 2) {/* IS_PS v1 */
+        tx->num_constf_allowed = 8;
+    } else if (tx->version.major == 2) {/* IS_PS v2 */
+        tx->num_constf_allowed = 32;
+    } else {/* IS_PS v3 */
+        tx->num_constf_allowed = NINE_MAX_CONST_F_PS3;
+    }
+
+    if (tx->version.major < 2) {
+        tx->num_consti_allowed = 0;
+        tx->num_constb_allowed = 0;
+    } else {
+        tx->num_consti_allowed = NINE_MAX_CONST_I;
+        tx->num_constb_allowed = NINE_MAX_CONST_B;
+    }
+
     /* VS must always write position. Declare it here to make it the 1st output.
      * (Some drivers like nv50 are buggy and rely on that.)
      */
@@ -3145,10 +3385,26 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
         goto out;
     }
 
-    if (IS_PS && (tx->version.major < 2) && tx->num_temp) {
-        ureg_MOV(tx->ureg, ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_COLOR, 0),
-                 ureg_src(tx->regs.r[0]));
-        info->rt_mask |= 0x1;
+    if (IS_PS && tx->version.major < 3) {
+        if (tx->version.major < 2) {
+            assert(tx->num_temp); /* there must be color output */
+            info->rt_mask |= 0x1;
+            shader_add_ps_fog_stage(tx, ureg_src(tx->regs.r[0]));
+        } else {
+            shader_add_ps_fog_stage(tx, ureg_src(tx->regs.oCol[0]));
+        }
+    }
+
+    if (IS_VS && tx->version.major < 3 && ureg_dst_is_undef(tx->regs.oFog) && info->fog_enable) {
+        tx->regs.oFog = ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_FOG, 0);
+        ureg_MOV(tx->ureg, ureg_writemask(tx->regs.oFog, TGSI_WRITEMASK_X), ureg_imm1f(tx->ureg, 0.0f));
+    }
+
+    /* vs < 3: oD1.w (D3DPMISCCAPS_FOGANDSPECULARALPHA) set to 0 even if set */
+    if (IS_VS && tx->version.major < 3 && !ureg_dst_is_undef(tx->regs.oCol[1])) {
+        struct ureg_dst dst = ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_COLOR, 1);
+        ureg_MOV(tx->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYZ), ureg_src(tx->regs.oCol[1]));
+        ureg_MOV(tx->ureg, ureg_writemask(dst, TGSI_WRITEMASK_W), ureg_imm1f(tx->ureg, 0.0f));
     }
 
     if (info->position_t)
@@ -3233,6 +3489,7 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
                        info->const_int_slots > 0 ?
                            max_const_f + info->const_int_slots :
                                info->const_float_slots;
+
     info->const_used_size = sizeof(float[4]) * slot_max; /* slots start from 1 */
 
     for (s = 0; s < slot_max; s++)
diff --git a/src/gallium/state_trackers/nine/nine_shader.h b/src/gallium/state_trackers/nine/nine_shader.h
index ec256c153a9..41577ac572b 100644
--- a/src/gallium/state_trackers/nine/nine_shader.h
+++ b/src/gallium/state_trackers/nine/nine_shader.h
@@ -59,6 +59,10 @@ struct nine_shader_info
     uint16_t sampler_mask_shadow; /* in, which samplers use depth compare */
     uint8_t rt_mask; /* out, which render targets are being written */
 
+    uint8_t fog_enable;
+    uint8_t fog_mode;
+    uint16_t projected; /* ps 1.1 to 1.3 */
+
     unsigned const_i_base; /* in vec4 (16 byte) units */
     unsigned const_b_base; /* in vec4 (16 byte) units */
     unsigned const_used_size;
@@ -68,6 +72,7 @@ struct nine_shader_info
     unsigned const_bool_slots;
 
     struct nine_lconstf lconstf; /* out, NOTE: members to be free'd by user */
+    uint8_t bumpenvmat_needed;
 };
 
 static inline void
@@ -137,4 +142,48 @@ nine_shader_variants_free(struct nine_shader_variant *list)
     }
 }
 
+struct nine_shader_variant64
+{
+    struct nine_shader_variant64 *next;
+    void *cso;
+    uint64_t key;
+};
+
+static inline void *
+nine_shader_variant_get64(struct nine_shader_variant64 *list, uint64_t key)
+{
+    while (list->key != key && list->next)
+        list = list->next;
+    if (list->key == key)
+        return list->cso;
+    return NULL;
+}
+
+static inline boolean
+nine_shader_variant_add64(struct nine_shader_variant64 *list,
+                          uint64_t key, void *cso)
+{
+    while (list->next) {
+        assert(list->key != key);
+        list = list->next;
+    }
+    list->next = MALLOC_STRUCT(nine_shader_variant64);
+    if (!list->next)
+        return FALSE;
+    list->next->next = NULL;
+    list->next->key = key;
+    list->next->cso = cso;
+    return TRUE;
+}
+
+static inline void
+nine_shader_variants_free64(struct nine_shader_variant64 *list)
+{
+    while (list->next) {
+        struct nine_shader_variant64 *ptr = list->next;
+        list->next = ptr->next;
+        FREE(ptr);
+    }
+}
+
 #endif /* _NINE_SHADER_H_ */
diff --git a/src/gallium/state_trackers/nine/nine_state.c b/src/gallium/state_trackers/nine/nine_state.c
index 6c835858d18..558d07a2bd0 100644
--- a/src/gallium/state_trackers/nine/nine_state.c
+++ b/src/gallium/state_trackers/nine/nine_state.c
@@ -33,352 +33,36 @@
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
 #include "cso_cache/cso_context.h"
+#include "util/u_upload_mgr.h"
 #include "util/u_math.h"
 
 #define DBG_CHANNEL DBG_DEVICE
 
-static uint32_t
-update_framebuffer(struct NineDevice9 *device)
-{
-    struct pipe_context *pipe = device->pipe;
-    struct nine_state *state = &device->state;
-    struct pipe_framebuffer_state *fb = &device->state.fb;
-    unsigned i;
-    struct NineSurface9 *rt0 = state->rt[0];
-    unsigned w = rt0->desc.Width;
-    unsigned h = rt0->desc.Height;
-    D3DMULTISAMPLE_TYPE nr_samples = rt0->desc.MultiSampleType;
-    unsigned mask = state->ps ? state->ps->rt_mask : 1;
-    const int sRGB = state->rs[D3DRS_SRGBWRITEENABLE] ? 1 : 0;
-
-    DBG("\n");
-
-    state->rt_mask = 0x0;
-    fb->nr_cbufs = 0;
-
-    /* all render targets must have the same size and the depth buffer must be
-     * bigger. Multisample has to match, according to spec. But some apps do
-     * things wrong there, and no error is returned. The behaviour they get
-     * apparently is that depth buffer is disabled if it doesn't match.
-     * Surely the same for render targets. */
-
-    /* Special case: D3DFMT_NULL is used to bound no real render target,
-     * but render to depth buffer. We have to not take into account the render
-     * target info. TODO: know what should happen when there are several render targers
-     * and the first one is D3DFMT_NULL */
-    if (rt0->desc.Format == D3DFMT_NULL && state->ds) {
-        w = state->ds->desc.Width;
-        h = state->ds->desc.Height;
-        nr_samples = state->ds->desc.MultiSampleType;
-    }
-
-    for (i = 0; i < device->caps.NumSimultaneousRTs; ++i) {
-        struct NineSurface9 *rt = state->rt[i];
-
-        if (rt && rt->desc.Format != D3DFMT_NULL && (mask & (1 << i)) &&
-            rt->desc.Width == w && rt->desc.Height == h &&
-            rt->desc.MultiSampleType == nr_samples) {
-            fb->cbufs[i] = NineSurface9_GetSurface(rt, sRGB);
-            state->rt_mask |= 1 << i;
-            fb->nr_cbufs = i + 1;
-
-            if (unlikely(rt->desc.Usage & D3DUSAGE_AUTOGENMIPMAP)) {
-                assert(rt->texture == D3DRTYPE_TEXTURE ||
-                       rt->texture == D3DRTYPE_CUBETEXTURE);
-                NineBaseTexture9(rt->base.base.container)->dirty_mip = TRUE;
-            }
-        } else {
-            /* Color outputs must match RT slot,
-             * drivers will have to handle NULL entries for GL, too.
-             */
-            fb->cbufs[i] = NULL;
-        }
-    }
-
-    if (state->ds && state->ds->desc.Width >= w &&
-        state->ds->desc.Height >= h &&
-        state->ds->desc.MultiSampleType == nr_samples) {
-        fb->zsbuf = NineSurface9_GetSurface(state->ds, 0);
-    } else {
-        fb->zsbuf = NULL;
-    }
-
-    fb->width = w;
-    fb->height = h;
-
-    pipe->set_framebuffer_state(pipe, fb); /* XXX: cso ? */
-
-    if (fb->zsbuf) {
-        DWORD scale;
-        switch (fb->zsbuf->format) {
-        case PIPE_FORMAT_Z32_FLOAT:
-        case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-            scale = fui(1.0f);
-            break;
-        case PIPE_FORMAT_Z16_UNORM:
-            scale = fui((float)(1 << 16));
-            break;
-        default:
-            scale = fui((float)(1 << 24));
-            break;
-        }
-        if (state->rs[NINED3DRS_ZBIASSCALE] != scale) {
-            state->rs[NINED3DRS_ZBIASSCALE] = scale;
-            state->changed.group |= NINE_STATE_RASTERIZER;
-        }
-    }
-
-    return state->changed.group;
-}
-
-static void
-update_viewport(struct NineDevice9 *device)
-{
-    struct pipe_context *pipe = device->pipe;
-    const D3DVIEWPORT9 *vport = &device->state.viewport;
-    struct pipe_viewport_state pvport;
-
-    /* D3D coordinates are:
-     * -1 .. +1 for X,Y and
-     *  0 .. +1 for Z (we use pipe_rasterizer_state.clip_halfz)
-     */
-    pvport.scale[0] = (float)vport->Width * 0.5f;
-    pvport.scale[1] = (float)vport->Height * -0.5f;
-    pvport.scale[2] = vport->MaxZ - vport->MinZ;
-    pvport.translate[0] = (float)vport->Width * 0.5f + (float)vport->X;
-    pvport.translate[1] = (float)vport->Height * 0.5f + (float)vport->Y;
-    pvport.translate[2] = vport->MinZ;
-
-    /* We found R600 and SI cards have some imprecision
-     * on the barycentric coordinates used for interpolation.
-     * Some shaders rely on having something precise.
-     * We found that the proprietary driver has the imprecision issue,
-     * except when the render target width and height are powers of two.
-     * It is using some sort of workaround for these cases
-     * which covers likely all the cases the applications rely
-     * on something precise.
-     * We haven't found the workaround, but it seems like it's better
-     * for applications if the imprecision is biased towards infinity
-     * instead of -infinity (which is what measured). So shift slightly
-     * the viewport: not enough to change rasterization result (in particular
-     * for multisampling), but enough to make the imprecision biased
-     * towards infinity. We do this shift only if render target width and
-     * height are powers of two.
-     * Solves 'red shadows' bug on UE3 games.
-     */
-    if (device->driver_bugs.buggy_barycentrics &&
-        ((vport->Width & (vport->Width-1)) == 0) &&
-        ((vport->Height & (vport->Height-1)) == 0)) {
-        pvport.translate[0] -= 1.0f / 128.0f;
-        pvport.translate[1] -= 1.0f / 128.0f;
-    }
-
-    pipe->set_viewport_states(pipe, 0, 1, &pvport);
-}
-
-static inline void
-update_scissor(struct NineDevice9 *device)
-{
-    struct pipe_context *pipe = device->pipe;
-
-    pipe->set_scissor_states(pipe, 0, 1, &device->state.scissor);
-}
+/* State preparation only */
 
 static inline void
-update_blend(struct NineDevice9 *device)
+prepare_blend(struct NineDevice9 *device)
 {
-    nine_convert_blend_state(device->cso, device->state.rs);
+    nine_convert_blend_state(&device->state.pipe.blend, device->state.rs);
+    device->state.commit |= NINE_STATE_COMMIT_BLEND;
 }
 
 static inline void
-update_dsa(struct NineDevice9 *device)
+prepare_dsa(struct NineDevice9 *device)
 {
-    nine_convert_dsa_state(device->cso, device->state.rs);
+    nine_convert_dsa_state(&device->state.pipe.dsa, device->state.rs);
+    device->state.commit |= NINE_STATE_COMMIT_DSA;
 }
 
 static inline void
-update_rasterizer(struct NineDevice9 *device)
+prepare_rasterizer(struct NineDevice9 *device)
 {
-    nine_convert_rasterizer_state(device->cso, device->state.rs);
+    nine_convert_rasterizer_state(&device->state.pipe.rast, device->state.rs);
+    device->state.commit |= NINE_STATE_COMMIT_RASTERIZER;
 }
 
-/* Loop through VS inputs and pick the vertex elements with the declared
- * usage from the vertex declaration, then insert the instance divisor from
- * the stream source frequency setting.
- */
 static void
-update_vertex_elements(struct NineDevice9 *device)
-{
-    struct nine_state *state = &device->state;
-    const struct NineVertexDeclaration9 *vdecl = device->state.vdecl;
-    const struct NineVertexShader9 *vs;
-    unsigned n, b, i;
-    int index;
-    char vdecl_index_map[16]; /* vs->num_inputs <= 16 */
-    char used_streams[device->caps.MaxStreams];
-    int dummy_vbo_stream = -1;
-    BOOL need_dummy_vbo = FALSE;
-    struct pipe_vertex_element ve[PIPE_MAX_ATTRIBS];
-
-    state->stream_usage_mask = 0;
-    memset(vdecl_index_map, -1, 16);
-    memset(used_streams, 0, device->caps.MaxStreams);
-    vs = device->state.vs ? device->state.vs : device->ff.vs;
-
-    if (vdecl) {
-        for (n = 0; n < vs->num_inputs; ++n) {
-            DBG("looking up input %u (usage %u) from vdecl(%p)\n",
-                n, vs->input_map[n].ndecl, vdecl);
-
-            for (i = 0; i < vdecl->nelems; i++) {
-                if (vdecl->usage_map[i] == vs->input_map[n].ndecl) {
-                    vdecl_index_map[n] = i;
-                    used_streams[vdecl->elems[i].vertex_buffer_index] = 1;
-                    break;
-                }
-            }
-            if (vdecl_index_map[n] < 0)
-                need_dummy_vbo = TRUE;
-        }
-    } else {
-        /* No vertex declaration. Likely will never happen in practice,
-         * but we need not crash on this */
-        need_dummy_vbo = TRUE;
-    }
-
-    if (need_dummy_vbo) {
-        for (i = 0; i < device->caps.MaxStreams; i++ ) {
-            if (!used_streams[i]) {
-                dummy_vbo_stream = i;
-                break;
-            }
-        }
-    }
-    /* there are less vertex shader inputs than stream slots,
-     * so if we need a slot for the dummy vbo, we should have found one */
-    assert (!need_dummy_vbo || dummy_vbo_stream != -1);
-
-    for (n = 0; n < vs->num_inputs; ++n) {
-        index = vdecl_index_map[n];
-        if (index >= 0) {
-            ve[n] = vdecl->elems[index];
-            b = ve[n].vertex_buffer_index;
-            state->stream_usage_mask |= 1 << b;
-            /* XXX wine just uses 1 here: */
-            if (state->stream_freq[b] & D3DSTREAMSOURCE_INSTANCEDATA)
-                ve[n].instance_divisor = state->stream_freq[b] & 0x7FFFFF;
-        } else {
-            /* if the vertex declaration is incomplete compared to what the
-             * vertex shader needs, we bind a dummy vbo with 0 0 0 0.
-             * This is not precised by the spec, but is the behaviour
-             * tested on win */
-            ve[n].vertex_buffer_index = dummy_vbo_stream;
-            ve[n].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-            ve[n].src_offset = 0;
-            ve[n].instance_divisor = 0;
-        }
-    }
-
-    if (state->dummy_vbo_bound_at != dummy_vbo_stream) {
-        if (state->dummy_vbo_bound_at >= 0)
-            state->changed.vtxbuf |= 1 << state->dummy_vbo_bound_at;
-        if (dummy_vbo_stream >= 0) {
-            state->changed.vtxbuf |= 1 << dummy_vbo_stream;
-            state->vbo_bound_done = FALSE;
-        }
-        state->dummy_vbo_bound_at = dummy_vbo_stream;
-    }
-
-    cso_set_vertex_elements(device->cso, vs->num_inputs, ve);
-
-    state->changed.stream_freq = 0;
-}
-
-static inline uint32_t
-update_shader_variant_keys(struct NineDevice9 *device)
-{
-    struct nine_state *state = &device->state;
-    uint32_t mask = 0;
-    uint32_t vs_key = state->samplers_shadow;
-    uint32_t ps_key = state->samplers_shadow;
-
-    vs_key = (vs_key & NINE_VS_SAMPLERS_MASK) >> NINE_SAMPLER_VS(0);
-    ps_key = (ps_key & NINE_PS_SAMPLERS_MASK) >> NINE_SAMPLER_PS(0);
-
-    if (state->vs) vs_key &= state->vs->sampler_mask;
-    if (state->ps) {
-        if (unlikely(state->ps->byte_code.version < 0x20)) {
-            /* no depth textures, but variable targets */
-            uint32_t m = state->ps->sampler_mask;
-            ps_key = 0;
-            while (m) {
-                int s = ffs(m) - 1;
-                m &= ~(1 << s);
-                ps_key |= (state->texture[s] ? state->texture[s]->pstype : 1) << (s * 2);
-            }
-        } else {
-            ps_key &= state->ps->sampler_mask;
-        }
-    }
-
-    if (state->vs && state->vs_key != vs_key) {
-        state->vs_key = vs_key;
-        mask |= NINE_STATE_VS;
-    }
-    if (state->ps && state->ps_key != ps_key) {
-        state->ps_key = ps_key;
-        mask |= NINE_STATE_PS;
-    }
-    return mask;
-}
-
-static inline uint32_t
-update_vs(struct NineDevice9 *device)
-{
-    struct nine_state *state = &device->state;
-    struct NineVertexShader9 *vs = state->vs;
-    uint32_t changed_group = 0;
-
-    /* likely because we dislike FF */
-    if (likely(vs)) {
-        state->cso.vs = NineVertexShader9_GetVariant(vs, state->vs_key);
-    } else {
-        vs = device->ff.vs;
-        state->cso.vs = vs->variant.cso;
-    }
-    device->pipe->bind_vs_state(device->pipe, state->cso.vs);
-
-    if (state->rs[NINED3DRS_VSPOINTSIZE] != vs->point_size) {
-        state->rs[NINED3DRS_VSPOINTSIZE] = vs->point_size;
-        changed_group |= NINE_STATE_RASTERIZER;
-    }
-
-    if ((state->bound_samplers_mask_vs & vs->sampler_mask) != vs->sampler_mask)
-        /* Bound dummy sampler. */
-        changed_group |= NINE_STATE_SAMPLER;
-    return changed_group;
-}
-
-static inline uint32_t
-update_ps(struct NineDevice9 *device)
-{
-    struct nine_state *state = &device->state;
-    struct NinePixelShader9 *ps = state->ps;
-    uint32_t changed_group = 0;
-
-    if (likely(ps)) {
-        state->cso.ps = NinePixelShader9_GetVariant(ps, state->ps_key);
-    } else {
-        ps = device->ff.ps;
-        state->cso.ps = ps->variant.cso;
-    }
-    device->pipe->bind_fs_state(device->pipe, state->cso.ps);
-
-    if ((state->bound_samplers_mask_ps & ps->sampler_mask) != ps->sampler_mask)
-        /* Bound dummy sampler. */
-        changed_group |= NINE_STATE_SAMPLER;
-    return changed_group;
-}
+prepare_ps_constants_userbuf(struct NineDevice9 *device);
 
 #define DO_UPLOAD_CONST_F(buf,p,c,d) \
     do { \
@@ -391,7 +75,7 @@ update_ps(struct NineDevice9 *device)
 
 /* OK, this is a bit ugly ... */
 static void
-update_constants(struct NineDevice9 *device, unsigned shader_type)
+upload_constants(struct NineDevice9 *device, unsigned shader_type)
 {
     struct pipe_context *pipe = device->pipe;
     struct pipe_resource *buf;
@@ -438,10 +122,17 @@ update_constants(struct NineDevice9 *device, unsigned shader_type)
         lconstf_ranges = device->state.vs->lconstf.ranges;
         lconstf_data = device->state.vs->lconstf.data;
 
-        device->state.ff.clobber.vs_const = TRUE;
         device->state.changed.group &= ~NINE_STATE_VS_CONST;
     } else {
         DBG("PS\n");
+        /* features only implemented on the userbuf path */
+        if (device->state.ps->bumpenvmat_needed || (
+            device->state.ps->byte_code.version < 0x30 &&
+            device->state.rs[D3DRS_FOGENABLE])) {
+            device->prefer_user_constbuf = TRUE;
+            prepare_ps_constants_userbuf(device);
+            return;
+        }
         buf = device->constbuf_ps;
 
         const_f = device->state.ps_const_f;
@@ -464,7 +155,6 @@ update_constants(struct NineDevice9 *device, unsigned shader_type)
         lconstf_ranges = NULL;
         lconstf_data = NULL;
 
-        device->state.ff.clobber.ps_const = TRUE;
         device->state.changed.group &= ~NINE_STATE_PS_CONST;
     }
 
@@ -524,10 +214,9 @@ update_constants(struct NineDevice9 *device, unsigned shader_type)
 }
 
 static void
-update_vs_constants_userbuf(struct NineDevice9 *device)
+prepare_vs_constants_userbuf(struct NineDevice9 *device)
 {
     struct nine_state *state = &device->state;
-    struct pipe_context *pipe = device->pipe;
     struct pipe_constant_buffer cb;
     cb.buffer = NULL;
     cb.buffer_offset = 0;
@@ -567,7 +256,18 @@ update_vs_constants_userbuf(struct NineDevice9 *device)
         cb.user_buffer = dst;
     }
 
-    pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &cb);
+    if (!device->driver_caps.user_cbufs) {
+        u_upload_data(device->constbuf_uploader,
+                      0,
+                      cb.buffer_size,
+                      cb.user_buffer,
+                      &cb.buffer_offset,
+                      &cb.buffer);
+        u_upload_unmap(device->constbuf_uploader);
+        cb.user_buffer = NULL;
+    }
+
+    state->pipe.cb_vs = cb;
 
     if (device->state.changed.vs_const_f) {
         struct nine_range *r = device->state.changed.vs_const_f;
@@ -578,22 +278,19 @@ update_vs_constants_userbuf(struct NineDevice9 *device)
         device->state.changed.vs_const_f = NULL;
     }
     state->changed.group &= ~NINE_STATE_VS_CONST;
+    state->commit |= NINE_STATE_COMMIT_CONST_VS;
 }
 
 static void
-update_ps_constants_userbuf(struct NineDevice9 *device)
+prepare_ps_constants_userbuf(struct NineDevice9 *device)
 {
     struct nine_state *state = &device->state;
-    struct pipe_context *pipe = device->pipe;
     struct pipe_constant_buffer cb;
     cb.buffer = NULL;
     cb.buffer_offset = 0;
     cb.buffer_size = device->state.ps->const_used_size;
     cb.user_buffer = device->state.ps_const_f;
 
-    if (!cb.buffer_size)
-        return;
-
     if (state->changed.ps_const_i) {
         int *idst = (int *)&state->ps_const_f[4 * device->max_ps_const_f];
         memcpy(idst, state->ps_const_i, sizeof(state->ps_const_i));
@@ -606,7 +303,47 @@ update_ps_constants_userbuf(struct NineDevice9 *device)
         state->changed.ps_const_b = 0;
     }
 
-    pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &cb);
+    /* Upload special constants needed to implement PS1.x instructions like TEXBEM,TEXBEML and BEM */
+    if (device->state.ps->bumpenvmat_needed) {
+        memcpy(device->state.ps_lconstf_temp, cb.user_buffer, cb.buffer_size);
+        memcpy(&device->state.ps_lconstf_temp[4 * 8], &device->state.bumpmap_vars, sizeof(device->state.bumpmap_vars));
+
+        cb.user_buffer = device->state.ps_lconstf_temp;
+    }
+
+    if (state->ps->byte_code.version < 0x30 &&
+        state->rs[D3DRS_FOGENABLE]) {
+        float *dst = &state->ps_lconstf_temp[4 * 32];
+        if (cb.user_buffer != state->ps_lconstf_temp) {
+            memcpy(state->ps_lconstf_temp, cb.user_buffer, cb.buffer_size);
+            cb.user_buffer = state->ps_lconstf_temp;
+        }
+
+        d3dcolor_to_rgba(dst, state->rs[D3DRS_FOGCOLOR]);
+        if (state->rs[D3DRS_FOGTABLEMODE] == D3DFOG_LINEAR) {
+            dst[4] = asfloat(state->rs[D3DRS_FOGEND]);
+            dst[5] = 1.0f / (asfloat(state->rs[D3DRS_FOGEND]) - asfloat(state->rs[D3DRS_FOGSTART]));
+        } else if (state->rs[D3DRS_FOGTABLEMODE] != D3DFOG_NONE) {
+            dst[4] = asfloat(state->rs[D3DRS_FOGDENSITY]);
+        }
+        cb.buffer_size = 4 * 4 * 34;
+    }
+
+    if (!cb.buffer_size)
+        return;
+
+    if (!device->driver_caps.user_cbufs) {
+        u_upload_data(device->constbuf_uploader,
+                      0,
+                      cb.buffer_size,
+                      cb.user_buffer,
+                      &cb.buffer_offset,
+                      &cb.buffer);
+        u_upload_unmap(device->constbuf_uploader);
+        cb.user_buffer = NULL;
+    }
+
+    state->pipe.cb_ps = cb;
 
     if (device->state.changed.ps_const_f) {
         struct nine_range *r = device->state.changed.ps_const_f;
@@ -617,6 +354,286 @@ update_ps_constants_userbuf(struct NineDevice9 *device)
         device->state.changed.ps_const_f = NULL;
     }
     state->changed.group &= ~NINE_STATE_PS_CONST;
+    state->commit |= NINE_STATE_COMMIT_CONST_PS;
+}
+
+static inline uint32_t
+prepare_vs(struct NineDevice9 *device, uint8_t shader_changed)
+{
+    struct nine_state *state = &device->state;
+    struct NineVertexShader9 *vs = state->vs;
+    uint32_t changed_group = 0;
+    int has_key_changed = 0;
+
+    if (likely(vs))
+        has_key_changed = NineVertexShader9_UpdateKey(vs, state);
+
+    if (!shader_changed && !has_key_changed)
+        return 0;
+
+    /* likely because we dislike FF */
+    if (likely(vs)) {
+        state->cso.vs = NineVertexShader9_GetVariant(vs);
+    } else {
+        vs = device->ff.vs;
+        state->cso.vs = vs->ff_cso;
+    }
+
+    if (state->rs[NINED3DRS_VSPOINTSIZE] != vs->point_size) {
+        state->rs[NINED3DRS_VSPOINTSIZE] = vs->point_size;
+        changed_group |= NINE_STATE_RASTERIZER;
+    }
+
+    if ((state->bound_samplers_mask_vs & vs->sampler_mask) != vs->sampler_mask)
+        /* Bound dummy sampler. */
+        changed_group |= NINE_STATE_SAMPLER;
+
+    state->commit |= NINE_STATE_COMMIT_VS;
+    return changed_group;
+}
+
+static inline uint32_t
+prepare_ps(struct NineDevice9 *device, uint8_t shader_changed)
+{
+    struct nine_state *state = &device->state;
+    struct NinePixelShader9 *ps = state->ps;
+    uint32_t changed_group = 0;
+    int has_key_changed = 0;
+
+    if (likely(ps))
+        has_key_changed = NinePixelShader9_UpdateKey(ps, state);
+
+    if (!shader_changed && !has_key_changed)
+        return 0;
+
+    if (likely(ps)) {
+        state->cso.ps = NinePixelShader9_GetVariant(ps);
+    } else {
+        ps = device->ff.ps;
+        state->cso.ps = ps->ff_cso;
+    }
+
+    if ((state->bound_samplers_mask_ps & ps->sampler_mask) != ps->sampler_mask)
+        /* Bound dummy sampler. */
+        changed_group |= NINE_STATE_SAMPLER;
+
+    state->commit |= NINE_STATE_COMMIT_PS;
+    return changed_group;
+}
+
+/* State preparation incremental */
+
+/* State preparation + State commit */
+
+static uint32_t
+update_framebuffer(struct NineDevice9 *device)
+{
+    struct pipe_context *pipe = device->pipe;
+    struct nine_state *state = &device->state;
+    struct pipe_framebuffer_state *fb = &device->state.fb;
+    unsigned i;
+    struct NineSurface9 *rt0 = state->rt[0];
+    unsigned w = rt0->desc.Width;
+    unsigned h = rt0->desc.Height;
+    D3DMULTISAMPLE_TYPE nr_samples = rt0->desc.MultiSampleType;
+    unsigned mask = state->ps ? state->ps->rt_mask : 1;
+    const int sRGB = state->rs[D3DRS_SRGBWRITEENABLE] ? 1 : 0;
+
+    DBG("\n");
+
+    state->rt_mask = 0x0;
+    fb->nr_cbufs = 0;
+
+    /* all render targets must have the same size and the depth buffer must be
+     * bigger. Multisample has to match, according to spec. But some apps do
+     * things wrong there, and no error is returned. The behaviour they get
+     * apparently is that depth buffer is disabled if it doesn't match.
+     * Surely the same for render targets. */
+
+    /* Special case: D3DFMT_NULL is used to bound no real render target,
+     * but render to depth buffer. We have to not take into account the render
+     * target info. TODO: know what should happen when there are several render targers
+     * and the first one is D3DFMT_NULL */
+    if (rt0->desc.Format == D3DFMT_NULL && state->ds) {
+        w = state->ds->desc.Width;
+        h = state->ds->desc.Height;
+        nr_samples = state->ds->desc.MultiSampleType;
+    }
+
+    for (i = 0; i < device->caps.NumSimultaneousRTs; ++i) {
+        struct NineSurface9 *rt = state->rt[i];
+
+        if (rt && rt->desc.Format != D3DFMT_NULL && (mask & (1 << i)) &&
+            rt->desc.Width == w && rt->desc.Height == h &&
+            rt->desc.MultiSampleType == nr_samples) {
+            fb->cbufs[i] = NineSurface9_GetSurface(rt, sRGB);
+            state->rt_mask |= 1 << i;
+            fb->nr_cbufs = i + 1;
+
+            if (unlikely(rt->desc.Usage & D3DUSAGE_AUTOGENMIPMAP)) {
+                assert(rt->texture == D3DRTYPE_TEXTURE ||
+                       rt->texture == D3DRTYPE_CUBETEXTURE);
+                NineBaseTexture9(rt->base.base.container)->dirty_mip = TRUE;
+            }
+        } else {
+            /* Color outputs must match RT slot,
+             * drivers will have to handle NULL entries for GL, too.
+             */
+            fb->cbufs[i] = NULL;
+        }
+    }
+
+    if (state->ds && state->ds->desc.Width >= w &&
+        state->ds->desc.Height >= h &&
+        state->ds->desc.MultiSampleType == nr_samples) {
+        fb->zsbuf = NineSurface9_GetSurface(state->ds, 0);
+    } else {
+        fb->zsbuf = NULL;
+    }
+
+    fb->width = w;
+    fb->height = h;
+
+    pipe->set_framebuffer_state(pipe, fb); /* XXX: cso ? */
+
+    return state->changed.group;
+}
+
+static void
+update_viewport(struct NineDevice9 *device)
+{
+    struct pipe_context *pipe = device->pipe;
+    const D3DVIEWPORT9 *vport = &device->state.viewport;
+    struct pipe_viewport_state pvport;
+
+    /* D3D coordinates are:
+     * -1 .. +1 for X,Y and
+     *  0 .. +1 for Z (we use pipe_rasterizer_state.clip_halfz)
+     */
+    pvport.scale[0] = (float)vport->Width * 0.5f;
+    pvport.scale[1] = (float)vport->Height * -0.5f;
+    pvport.scale[2] = vport->MaxZ - vport->MinZ;
+    pvport.translate[0] = (float)vport->Width * 0.5f + (float)vport->X;
+    pvport.translate[1] = (float)vport->Height * 0.5f + (float)vport->Y;
+    pvport.translate[2] = vport->MinZ;
+
+    /* We found R600 and SI cards have some imprecision
+     * on the barycentric coordinates used for interpolation.
+     * Some shaders rely on having something precise.
+     * We found that the proprietary driver has the imprecision issue,
+     * except when the render target width and height are powers of two.
+     * It is using some sort of workaround for these cases
+     * which covers likely all the cases the applications rely
+     * on something precise.
+     * We haven't found the workaround, but it seems like it's better
+     * for applications if the imprecision is biased towards infinity
+     * instead of -infinity (which is what measured). So shift slightly
+     * the viewport: not enough to change rasterization result (in particular
+     * for multisampling), but enough to make the imprecision biased
+     * towards infinity. We do this shift only if render target width and
+     * height are powers of two.
+     * Solves 'red shadows' bug on UE3 games.
+     */
+    if (device->driver_bugs.buggy_barycentrics &&
+        ((vport->Width & (vport->Width-1)) == 0) &&
+        ((vport->Height & (vport->Height-1)) == 0)) {
+        pvport.translate[0] -= 1.0f / 128.0f;
+        pvport.translate[1] -= 1.0f / 128.0f;
+    }
+
+    pipe->set_viewport_states(pipe, 0, 1, &pvport);
+}
+
+/* Loop through VS inputs and pick the vertex elements with the declared
+ * usage from the vertex declaration, then insert the instance divisor from
+ * the stream source frequency setting.
+ */
+static void
+update_vertex_elements(struct NineDevice9 *device)
+{
+    struct nine_state *state = &device->state;
+    const struct NineVertexDeclaration9 *vdecl = device->state.vdecl;
+    const struct NineVertexShader9 *vs;
+    unsigned n, b, i;
+    int index;
+    char vdecl_index_map[16]; /* vs->num_inputs <= 16 */
+    char used_streams[device->caps.MaxStreams];
+    int dummy_vbo_stream = -1;
+    BOOL need_dummy_vbo = FALSE;
+    struct pipe_vertex_element ve[PIPE_MAX_ATTRIBS];
+
+    state->stream_usage_mask = 0;
+    memset(vdecl_index_map, -1, 16);
+    memset(used_streams, 0, device->caps.MaxStreams);
+    vs = device->state.vs ? device->state.vs : device->ff.vs;
+
+    if (vdecl) {
+        for (n = 0; n < vs->num_inputs; ++n) {
+            DBG("looking up input %u (usage %u) from vdecl(%p)\n",
+                n, vs->input_map[n].ndecl, vdecl);
+
+            for (i = 0; i < vdecl->nelems; i++) {
+                if (vdecl->usage_map[i] == vs->input_map[n].ndecl) {
+                    vdecl_index_map[n] = i;
+                    used_streams[vdecl->elems[i].vertex_buffer_index] = 1;
+                    break;
+                }
+            }
+            if (vdecl_index_map[n] < 0)
+                need_dummy_vbo = TRUE;
+        }
+    } else {
+        /* No vertex declaration. Likely will never happen in practice,
+         * but we need not crash on this */
+        need_dummy_vbo = TRUE;
+    }
+
+    if (need_dummy_vbo) {
+        for (i = 0; i < device->caps.MaxStreams; i++ ) {
+            if (!used_streams[i]) {
+                dummy_vbo_stream = i;
+                break;
+            }
+        }
+    }
+    /* there are less vertex shader inputs than stream slots,
+     * so if we need a slot for the dummy vbo, we should have found one */
+    assert (!need_dummy_vbo || dummy_vbo_stream != -1);
+
+    for (n = 0; n < vs->num_inputs; ++n) {
+        index = vdecl_index_map[n];
+        if (index >= 0) {
+            ve[n] = vdecl->elems[index];
+            b = ve[n].vertex_buffer_index;
+            state->stream_usage_mask |= 1 << b;
+            /* XXX wine just uses 1 here: */
+            if (state->stream_freq[b] & D3DSTREAMSOURCE_INSTANCEDATA)
+                ve[n].instance_divisor = state->stream_freq[b] & 0x7FFFFF;
+        } else {
+            /* if the vertex declaration is incomplete compared to what the
+             * vertex shader needs, we bind a dummy vbo with 0 0 0 0.
+             * This is not precised by the spec, but is the behaviour
+             * tested on win */
+            ve[n].vertex_buffer_index = dummy_vbo_stream;
+            ve[n].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+            ve[n].src_offset = 0;
+            ve[n].instance_divisor = 0;
+        }
+    }
+
+    if (state->dummy_vbo_bound_at != dummy_vbo_stream) {
+        if (state->dummy_vbo_bound_at >= 0)
+            state->changed.vtxbuf |= 1 << state->dummy_vbo_bound_at;
+        if (dummy_vbo_stream >= 0) {
+            state->changed.vtxbuf |= 1 << dummy_vbo_stream;
+            state->vbo_bound_done = FALSE;
+        }
+        state->dummy_vbo_bound_at = dummy_vbo_stream;
+    }
+
+    cso_set_vertex_elements(device->cso, vs->num_inputs, ve);
+
+    state->changed.stream_freq = 0;
 }
 
 static void
@@ -627,7 +644,6 @@ update_vertex_buffers(struct NineDevice9 *device)
     struct pipe_vertex_buffer dummy_vtxbuf;
     uint32_t mask = state->changed.vtxbuf;
     unsigned i;
-    unsigned start;
 
     DBG("mask=%x\n", mask);
 
@@ -656,27 +672,6 @@ update_vertex_buffers(struct NineDevice9 *device)
     state->changed.vtxbuf = 0;
 }
 
-static inline void
-update_index_buffer(struct NineDevice9 *device)
-{
-    struct pipe_context *pipe = device->pipe;
-    if (device->state.idxbuf)
-        pipe->set_index_buffer(pipe, &device->state.idxbuf->buffer);
-    else
-        pipe->set_index_buffer(pipe, NULL);
-}
-
-/* TODO: only go through dirty textures */
-static void
-validate_textures(struct NineDevice9 *device)
-{
-    struct NineBaseTexture9 *tex, *ptr;
-    LIST_FOR_EACH_ENTRY_SAFE(tex, ptr, &device->update_textures, list) {
-        list_delinit(&tex->list);
-        NineBaseTexture9_Validate(tex);
-    }
-}
-
 static inline boolean
 update_sampler_derived(struct nine_state *state, unsigned s)
 {
@@ -706,20 +701,16 @@ update_sampler_derived(struct nine_state *state, unsigned s)
 static void
 update_textures_and_samplers(struct NineDevice9 *device)
 {
-    struct pipe_context *pipe = device->pipe;
     struct nine_state *state = &device->state;
     struct pipe_sampler_view *view[NINE_MAX_SAMPLERS];
-    struct pipe_sampler_state samp;
     unsigned num_textures;
     unsigned i;
-    boolean commit_views;
     boolean commit_samplers;
     uint16_t sampler_mask = state->ps ? state->ps->sampler_mask :
                             device->ff.ps->sampler_mask;
 
     /* TODO: Can we reduce iterations here ? */
 
-    commit_views = FALSE;
     commit_samplers = FALSE;
     state->bound_samplers_mask_ps = 0;
     for (num_textures = 0, i = 0; i < NINE_MAX_SAMPLERS_PS; ++i) {
@@ -749,26 +740,12 @@ update_textures_and_samplers(struct NineDevice9 *device)
              * unbind dummy sampler directly when they are not needed
              * anymore, but they're going to be removed as long as texture
              * or sampler states are changed. */
-            view[i] = device->dummy_sampler;
+            view[i] = device->dummy_sampler_view;
             num_textures = i + 1;
 
-            memset(&samp, 0, sizeof(samp));
-            samp.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-            samp.max_lod = 15.0f;
-            samp.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-            samp.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-            samp.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-            samp.min_img_filter = PIPE_TEX_FILTER_NEAREST;
-            samp.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
-            samp.compare_mode = PIPE_TEX_COMPARE_NONE;
-            samp.compare_func = PIPE_FUNC_LEQUAL;
-            samp.normalized_coords = 1;
-            samp.seamless_cube_map = 1;
-
             cso_single_sampler(device->cso, PIPE_SHADER_FRAGMENT,
-                               s - NINE_SAMPLER_PS(0), &samp);
+                               s - NINE_SAMPLER_PS(0), &device->dummy_sampler_state);
 
-            commit_views = TRUE;
             commit_samplers = TRUE;
             state->changed.sampler[s] = ~0;
         }
@@ -776,16 +753,11 @@ update_textures_and_samplers(struct NineDevice9 *device)
         state->bound_samplers_mask_ps |= (1 << s);
     }
 
-    commit_views |= (state->changed.texture & NINE_PS_SAMPLERS_MASK) != 0;
-    commit_views |= state->changed.srgb;
-    if (commit_views)
-        pipe->set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0,
-                                num_textures, view);
+    cso_set_sampler_views(device->cso, PIPE_SHADER_FRAGMENT, num_textures, view);
 
     if (commit_samplers)
         cso_single_sampler_done(device->cso, PIPE_SHADER_FRAGMENT);
 
-    commit_views = FALSE;
     commit_samplers = FALSE;
     sampler_mask = state->vs ? state->vs->sampler_mask : 0;
     state->bound_samplers_mask_vs = 0;
@@ -816,76 +788,170 @@ update_textures_and_samplers(struct NineDevice9 *device)
              * unbind dummy sampler directly when they are not needed
              * anymore, but they're going to be removed as long as texture
              * or sampler states are changed. */
-            view[i] = device->dummy_sampler;
+            view[i] = device->dummy_sampler_view;
             num_textures = i + 1;
 
-            memset(&samp, 0, sizeof(samp));
-            samp.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
-            samp.max_lod = 15.0f;
-            samp.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-            samp.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-            samp.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
-            samp.min_img_filter = PIPE_TEX_FILTER_NEAREST;
-            samp.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
-            samp.compare_mode = PIPE_TEX_COMPARE_NONE;
-            samp.compare_func = PIPE_FUNC_LEQUAL;
-            samp.normalized_coords = 1;
-            samp.seamless_cube_map = 1;
-
             cso_single_sampler(device->cso, PIPE_SHADER_VERTEX,
-                               s - NINE_SAMPLER_VS(0), &samp);
+                               s - NINE_SAMPLER_VS(0), &device->dummy_sampler_state);
 
-            commit_views = TRUE;
             commit_samplers = TRUE;
             state->changed.sampler[s] = ~0;
         }
 
         state->bound_samplers_mask_vs |= (1 << s);
     }
-    commit_views |= (state->changed.texture & NINE_VS_SAMPLERS_MASK) != 0;
-    commit_views |= state->changed.srgb;
-    if (commit_views)
-        pipe->set_sampler_views(pipe, PIPE_SHADER_VERTEX, 0,
-                                num_textures, view);
+
+    cso_set_sampler_views(device->cso, PIPE_SHADER_VERTEX, num_textures, view);
 
     if (commit_samplers)
         cso_single_sampler_done(device->cso, PIPE_SHADER_VERTEX);
 
-    state->changed.srgb = FALSE;
     state->changed.texture = 0;
 }
 
+/* State commit only */
+
+static inline void
+commit_blend(struct NineDevice9 *device)
+{
+    cso_set_blend(device->cso, &device->state.pipe.blend);
+}
+
+static inline void
+commit_dsa(struct NineDevice9 *device)
+{
+    cso_set_depth_stencil_alpha(device->cso, &device->state.pipe.dsa);
+}
+
+static inline void
+commit_scissor(struct NineDevice9 *device)
+{
+    struct pipe_context *pipe = device->pipe;
+
+    pipe->set_scissor_states(pipe, 0, 1, &device->state.scissor);
+}
+
+static inline void
+commit_rasterizer(struct NineDevice9 *device)
+{
+    cso_set_rasterizer(device->cso, &device->state.pipe.rast);
+}
 
-#define NINE_STATE_FREQ_GROUP_0 \
-   (NINE_STATE_FB |             \
-    NINE_STATE_VIEWPORT |       \
-    NINE_STATE_SCISSOR |        \
-    NINE_STATE_BLEND |          \
-    NINE_STATE_DSA |            \
-    NINE_STATE_RASTERIZER |     \
-    NINE_STATE_VS |             \
-    NINE_STATE_PS |             \
-    NINE_STATE_BLEND_COLOR |    \
-    NINE_STATE_STENCIL_REF |    \
+static inline void
+commit_index_buffer(struct NineDevice9 *device)
+{
+    struct pipe_context *pipe = device->pipe;
+    if (device->state.idxbuf)
+        pipe->set_index_buffer(pipe, &device->state.idxbuf->buffer);
+    else
+        pipe->set_index_buffer(pipe, NULL);
+}
+
+static inline void
+commit_vs_constants(struct NineDevice9 *device)
+{
+    struct pipe_context *pipe = device->pipe;
+
+    if (unlikely(!device->state.vs))
+        pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->state.pipe.cb_vs_ff);
+    else
+        pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &device->state.pipe.cb_vs);
+}
+
+static inline void
+commit_ps_constants(struct NineDevice9 *device)
+{
+    struct pipe_context *pipe = device->pipe;
+
+    if (unlikely(!device->state.ps))
+        pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &device->state.pipe.cb_ps_ff);
+    else
+        pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &device->state.pipe.cb_ps);
+}
+
+static inline void
+commit_vs(struct NineDevice9 *device)
+{
+    struct nine_state *state = &device->state;
+
+    device->pipe->bind_vs_state(device->pipe, state->cso.vs);
+}
+
+
+static inline void
+commit_ps(struct NineDevice9 *device)
+{
+    struct nine_state *state = &device->state;
+
+    device->pipe->bind_fs_state(device->pipe, state->cso.ps);
+}
+/* State Update */
+
+#define NINE_STATE_SHADER_CHANGE_VS \
+   (NINE_STATE_VS |         \
+    NINE_STATE_TEXTURE |    \
+    NINE_STATE_FOG_SHADER)
+
+#define NINE_STATE_SHADER_CHANGE_PS \
+   (NINE_STATE_PS |         \
+    NINE_STATE_TEXTURE |    \
+    NINE_STATE_FOG_SHADER | \
+    NINE_STATE_PS1X_SHADER)
+
+#define NINE_STATE_FREQUENT \
+   (NINE_STATE_RASTERIZER | \
+    NINE_STATE_TEXTURE |    \
+    NINE_STATE_SAMPLER |    \
+    NINE_STATE_VS_CONST |   \
+    NINE_STATE_PS_CONST)
+
+#define NINE_STATE_COMMON \
+   (NINE_STATE_FB |       \
+    NINE_STATE_BLEND |    \
+    NINE_STATE_DSA |      \
+    NINE_STATE_VIEWPORT | \
+    NINE_STATE_VDECL |    \
+    NINE_STATE_IDXBUF)
+
+#define NINE_STATE_RARE      \
+   (NINE_STATE_SCISSOR |     \
+    NINE_STATE_BLEND_COLOR | \
+    NINE_STATE_STENCIL_REF | \
     NINE_STATE_SAMPLE_MASK)
 
-#define NINE_STATE_FREQ_GROUP_1 ~NINE_STATE_FREQ_GROUP_0
 
-#define NINE_STATE_SHADER_VARIANT_GROUP \
-    (NINE_STATE_TEXTURE | \
-     NINE_STATE_VS | \
-     NINE_STATE_PS)
+/* TODO: only go through dirty textures */
+static void
+validate_textures(struct NineDevice9 *device)
+{
+    struct NineBaseTexture9 *tex, *ptr;
+    LIST_FOR_EACH_ENTRY_SAFE(tex, ptr, &device->update_textures, list) {
+        list_delinit(&tex->list);
+        NineBaseTexture9_Validate(tex);
+    }
+}
+
+void
+nine_update_state_framebuffer(struct NineDevice9 *device)
+{
+    struct nine_state *state = &device->state;
+
+    validate_textures(device);
+
+    if (state->changed.group & NINE_STATE_FB)
+        update_framebuffer(device);
+
+    state->changed.group &= ~NINE_STATE_FB;
+}
 
 boolean
-nine_update_state(struct NineDevice9 *device, uint32_t mask)
+nine_update_state(struct NineDevice9 *device)
 {
     struct pipe_context *pipe = device->pipe;
     struct nine_state *state = &device->state;
     uint32_t group;
 
-    DBG("changed state groups: %x | %x\n",
-        state->changed.group & NINE_STATE_FREQ_GROUP_0,
-        state->changed.group & NINE_STATE_FREQ_GROUP_1);
+    DBG("changed state groups: %x\n", state->changed.group);
 
     /* NOTE: We may want to use the cso cache for everything, or let
      * NineDevice9.RestoreNonCSOState actually set the states, then we wouldn't
@@ -896,35 +962,79 @@ nine_update_state(struct NineDevice9 *device, uint32_t mask)
     validate_textures(device); /* may clobber state */
 
     /* ff_update may change VS/PS dirty bits */
-    if ((mask & NINE_STATE_FF) && unlikely(!state->vs || !state->ps))
+    if (unlikely(!state->vs || !state->ps))
         nine_ff_update(device);
-    group = state->changed.group & mask;
+    group = state->changed.group;
 
-    if (group & NINE_STATE_SHADER_VARIANT_GROUP)
-        group |= update_shader_variant_keys(device);
+    if (group & (NINE_STATE_SHADER_CHANGE_VS | NINE_STATE_SHADER_CHANGE_PS)) {
+        if (group & NINE_STATE_SHADER_CHANGE_VS)
+            group |= prepare_vs(device, (group & NINE_STATE_VS) != 0); /* may set NINE_STATE_RASTERIZER and NINE_STATE_SAMPLER*/
+        if (group & NINE_STATE_SHADER_CHANGE_PS)
+            group |= prepare_ps(device, (group & NINE_STATE_PS) != 0);
+    }
 
-    if (group & NINE_STATE_FREQ_GROUP_0) {
+    if (group & (NINE_STATE_COMMON | NINE_STATE_VS)) {
         if (group & NINE_STATE_FB)
-            group = update_framebuffer(device) & mask;
+            group |= update_framebuffer(device); /* may set NINE_STATE_RASTERIZER */
+        if (group & NINE_STATE_BLEND)
+            prepare_blend(device);
+        if (group & NINE_STATE_DSA)
+            prepare_dsa(device);
         if (group & NINE_STATE_VIEWPORT)
             update_viewport(device);
-        if (group & NINE_STATE_SCISSOR)
-            update_scissor(device);
-
-        if (group & NINE_STATE_DSA)
-            update_dsa(device);
-        if (group & NINE_STATE_BLEND)
-            update_blend(device);
-
-        if (group & NINE_STATE_VS)
-            group |= update_vs(device);
+        if ((group & (NINE_STATE_VDECL | NINE_STATE_VS)) ||
+            state->changed.stream_freq & ~1)
+            update_vertex_elements(device);
+        if (group & NINE_STATE_IDXBUF)
+            commit_index_buffer(device);
+    }
 
+    if (likely(group & (NINE_STATE_FREQUENT | NINE_STATE_VS | NINE_STATE_PS))) {
         if (group & NINE_STATE_RASTERIZER)
-            update_rasterizer(device);
+            prepare_rasterizer(device);
+        if (group & (NINE_STATE_TEXTURE | NINE_STATE_SAMPLER))
+            update_textures_and_samplers(device);
+        if (device->prefer_user_constbuf) {
+            if ((group & (NINE_STATE_VS_CONST | NINE_STATE_VS)) && state->vs)
+                prepare_vs_constants_userbuf(device);
+            if ((group & (NINE_STATE_PS_CONST | NINE_STATE_PS)) && state->ps)
+                prepare_ps_constants_userbuf(device);
+        } else {
+            if ((group & NINE_STATE_VS_CONST) && state->vs)
+                upload_constants(device, PIPE_SHADER_VERTEX);
+            if ((group & NINE_STATE_PS_CONST) && state->ps)
+                upload_constants(device, PIPE_SHADER_FRAGMENT);
+        }
+    }
 
-        if (group & NINE_STATE_PS)
-            group |= update_ps(device);
+    if (state->changed.vtxbuf)
+        update_vertex_buffers(device);
+
+    if (state->commit & NINE_STATE_COMMIT_BLEND)
+        commit_blend(device);
+    if (state->commit & NINE_STATE_COMMIT_DSA)
+        commit_dsa(device);
+    if (state->commit & NINE_STATE_COMMIT_RASTERIZER)
+        commit_rasterizer(device);
+    if (state->commit & NINE_STATE_COMMIT_CONST_VS)
+        commit_vs_constants(device);
+    if (state->commit & NINE_STATE_COMMIT_CONST_PS)
+        commit_ps_constants(device);
+    if (state->commit & NINE_STATE_COMMIT_VS)
+        commit_vs(device);
+    if (state->commit & NINE_STATE_COMMIT_PS)
+        commit_ps(device);
+
+    state->commit = 0;
+
+    if (unlikely(state->changed.ucp)) {
+        pipe->set_clip_state(pipe, &state->clip);
+        state->changed.ucp = 0;
+    }
 
+    if (unlikely(group & NINE_STATE_RARE)) {
+        if (group & NINE_STATE_SCISSOR)
+            commit_scissor(device);
         if (group & NINE_STATE_BLEND_COLOR) {
             struct pipe_blend_color color;
             d3dcolor_to_rgba(&color.color[0], state->rs[D3DRS_BLENDFACTOR]);
@@ -941,38 +1051,7 @@ nine_update_state(struct NineDevice9 *device, uint32_t mask)
         }
     }
 
-    if (state->changed.ucp) {
-        pipe->set_clip_state(pipe, &state->clip);
-        state->changed.ucp = 0;
-    }
-
-    if (group & (NINE_STATE_FREQ_GROUP_1 | NINE_STATE_VS)) {
-        if (group & (NINE_STATE_TEXTURE | NINE_STATE_SAMPLER))
-            update_textures_and_samplers(device);
-
-        if (group & NINE_STATE_IDXBUF)
-            update_index_buffer(device);
-
-        if ((group & (NINE_STATE_VDECL | NINE_STATE_VS)) ||
-            state->changed.stream_freq & ~1)
-            update_vertex_elements(device);
-
-        if (device->prefer_user_constbuf) {
-            if ((group & (NINE_STATE_VS_CONST | NINE_STATE_VS)) && state->vs)
-                update_vs_constants_userbuf(device);
-            if ((group & (NINE_STATE_PS_CONST | NINE_STATE_PS)) && state->ps)
-                update_ps_constants_userbuf(device);
-        } else {
-            if ((group & NINE_STATE_VS_CONST) && state->vs)
-                update_constants(device, PIPE_SHADER_VERTEX);
-            if ((group & NINE_STATE_PS_CONST) && state->ps)
-                update_constants(device, PIPE_SHADER_FRAGMENT);
-        }
-    }
-    if (state->changed.vtxbuf)
-        update_vertex_buffers(device);
-
-    device->state.changed.group &= ~mask |
+    device->state.changed.group &=
         (NINE_STATE_FF | NINE_STATE_VS_CONST | NINE_STATE_PS_CONST);
 
     DBG("finished\n");
@@ -980,6 +1059,7 @@ nine_update_state(struct NineDevice9 *device, uint32_t mask)
     return TRUE;
 }
 
+/* State defaults */
 
 static const DWORD nine_render_state_defaults[NINED3DRS_LAST + 1] =
 {
@@ -1134,6 +1214,18 @@ static const DWORD nine_samp_state_defaults[NINED3DSAMP_LAST + 1] =
     [NINED3DSAMP_MINLOD] = 0,
     [NINED3DSAMP_SHADOW] = 0
 };
+
+void nine_state_restore_non_cso(struct NineDevice9 *device)
+{
+    struct nine_state *state = &device->state;
+
+    state->changed.group = NINE_STATE_ALL;
+    state->changed.vtxbuf = (1ULL << device->caps.MaxStreams) - 1;
+    state->changed.ucp = (1 << PIPE_MAX_CLIP_PLANES) - 1;
+    state->changed.texture = NINE_PS_SAMPLERS_MASK | NINE_VS_SAMPLERS_MASK;
+    state->commit |= NINE_STATE_COMMIT_CONST_VS | NINE_STATE_COMMIT_CONST_PS;
+}
+
 void
 nine_state_set_defaults(struct NineDevice9 *device, const D3DCAPS9 *caps,
                         boolean is_reset)
@@ -1152,6 +1244,7 @@ nine_state_set_defaults(struct NineDevice9 *device, const D3DCAPS9 *caps,
     }
     state->ff.tex_stage[0][D3DTSS_COLOROP] = D3DTOP_MODULATE;
     state->ff.tex_stage[0][D3DTSS_ALPHAOP] = D3DTOP_SELECTARG1;
+    memset(&state->bumpmap_vars, 0, sizeof(state->bumpmap_vars));
 
     for (s = 0; s < Elements(state->samp); ++s) {
         memcpy(&state->samp[s], nine_samp_state_defaults,
@@ -1170,6 +1263,9 @@ nine_state_set_defaults(struct NineDevice9 *device, const D3DCAPS9 *caps,
     /* Set changed flags to initialize driver.
      */
     state->changed.group = NINE_STATE_ALL;
+    state->changed.vtxbuf = (1ULL << device->caps.MaxStreams) - 1;
+    state->changed.ucp = (1 << PIPE_MAX_CLIP_PLANES) - 1;
+    state->changed.texture = NINE_PS_SAMPLERS_MASK | NINE_VS_SAMPLERS_MASK;
 
     state->ff.changed.transform[0] = ~0;
     state->ff.changed.transform[D3DTS_WORLD / 32] |= 1 << (D3DTS_WORLD % 32);
@@ -1186,6 +1282,23 @@ nine_state_set_defaults(struct NineDevice9 *device, const D3DCAPS9 *caps,
         state->dummy_vbo_bound_at = -1;
         state->vbo_bound_done = FALSE;
     }
+
+    if (!device->prefer_user_constbuf) {
+        /* fill cb_vs and cb_ps for the non user constbuf path */
+        struct pipe_constant_buffer cb;
+
+        cb.buffer_offset = 0;
+        cb.buffer_size = device->vs_const_size;
+        cb.buffer = device->constbuf_vs;
+        cb.user_buffer = NULL;
+        state->pipe.cb_vs = cb;
+
+        cb.buffer_size = device->ps_const_size;
+        cb.buffer = device->constbuf_ps;
+        state->pipe.cb_ps = cb;
+
+        state->commit |= NINE_STATE_COMMIT_CONST_VS | NINE_STATE_COMMIT_CONST_PS;
+    }
 }
 
 void
@@ -1353,15 +1466,15 @@ const uint32_t nine_render_state_group[NINED3DRS_LAST + 1] =
     [D3DRS_ZFUNC] = NINE_STATE_DSA,
     [D3DRS_ALPHAREF] = NINE_STATE_DSA,
     [D3DRS_ALPHAFUNC] = NINE_STATE_DSA,
-    [D3DRS_DITHERENABLE] = NINE_STATE_RASTERIZER,
+    [D3DRS_DITHERENABLE] = NINE_STATE_BLEND,
     [D3DRS_ALPHABLENDENABLE] = NINE_STATE_BLEND,
-    [D3DRS_FOGENABLE] = NINE_STATE_FF_OTHER,
+    [D3DRS_FOGENABLE] = NINE_STATE_FF_OTHER | NINE_STATE_FOG_SHADER | NINE_STATE_PS_CONST,
     [D3DRS_SPECULARENABLE] = NINE_STATE_FF_LIGHTING,
-    [D3DRS_FOGCOLOR] = NINE_STATE_FF_OTHER,
-    [D3DRS_FOGTABLEMODE] = NINE_STATE_FF_OTHER,
-    [D3DRS_FOGSTART] = NINE_STATE_FF_OTHER,
-    [D3DRS_FOGEND] = NINE_STATE_FF_OTHER,
-    [D3DRS_FOGDENSITY] = NINE_STATE_FF_OTHER,
+    [D3DRS_FOGCOLOR] = NINE_STATE_FF_OTHER | NINE_STATE_PS_CONST,
+    [D3DRS_FOGTABLEMODE] = NINE_STATE_FF_OTHER | NINE_STATE_FOG_SHADER | NINE_STATE_PS_CONST,
+    [D3DRS_FOGSTART] = NINE_STATE_FF_OTHER | NINE_STATE_PS_CONST,
+    [D3DRS_FOGEND] = NINE_STATE_FF_OTHER | NINE_STATE_PS_CONST,
+    [D3DRS_FOGDENSITY] = NINE_STATE_FF_OTHER | NINE_STATE_PS_CONST,
     [D3DRS_RANGEFOGENABLE] = NINE_STATE_FF_OTHER,
     [D3DRS_STENCILENABLE] = NINE_STATE_DSA,
     [D3DRS_STENCILFAIL] = NINE_STATE_DSA,
@@ -1394,7 +1507,7 @@ const uint32_t nine_render_state_group[NINED3DRS_LAST + 1] =
     [D3DRS_VERTEXBLEND] = NINE_STATE_FF_OTHER,
     [D3DRS_CLIPPLANEENABLE] = NINE_STATE_RASTERIZER,
     [D3DRS_POINTSIZE] = NINE_STATE_RASTERIZER,
-    [D3DRS_POINTSIZE_MIN] = NINE_STATE_MISC_CONST,
+    [D3DRS_POINTSIZE_MIN] = NINE_STATE_RASTERIZER,
     [D3DRS_POINTSPRITEENABLE] = NINE_STATE_RASTERIZER,
     [D3DRS_POINTSCALEENABLE] = NINE_STATE_FF_OTHER,
     [D3DRS_POINTSCALE_A] = NINE_STATE_FF_OTHER,
@@ -1404,7 +1517,7 @@ const uint32_t nine_render_state_group[NINED3DRS_LAST + 1] =
     [D3DRS_MULTISAMPLEMASK] = NINE_STATE_SAMPLE_MASK,
     [D3DRS_PATCHEDGESTYLE] = NINE_STATE_UNHANDLED,
     [D3DRS_DEBUGMONITORTOKEN] = NINE_STATE_UNHANDLED,
-    [D3DRS_POINTSIZE_MAX] = NINE_STATE_MISC_CONST,
+    [D3DRS_POINTSIZE_MAX] = NINE_STATE_RASTERIZER,
     [D3DRS_INDEXEDVERTEXBLENDENABLE] = NINE_STATE_FF_OTHER,
     [D3DRS_COLORWRITEENABLE] = NINE_STATE_BLEND,
     [D3DRS_TWEENFACTOR] = NINE_STATE_FF_OTHER,
@@ -1446,6 +1559,8 @@ const uint32_t nine_render_state_group[NINED3DRS_LAST + 1] =
     [D3DRS_BLENDOPALPHA] = NINE_STATE_BLEND
 };
 
+/* Misc */
+
 D3DMATRIX *
 nine_state_access_transform(struct nine_state *state, D3DTRANSFORMSTATETYPE t,
                             boolean alloc)
@@ -1601,4 +1716,3 @@ const char *nine_d3drs_to_string(DWORD State)
         return "(invalid)";
     }
 }
-
diff --git a/src/gallium/state_trackers/nine/nine_state.h b/src/gallium/state_trackers/nine/nine_state.h
index 2bf3f637f93..b34da70ef48 100644
--- a/src/gallium/state_trackers/nine/nine_state.h
+++ b/src/gallium/state_trackers/nine/nine_state.h
@@ -33,8 +33,7 @@
 
 #define NINED3DRS_VSPOINTSIZE (D3DRS_BLENDOPALPHA + 1)
 #define NINED3DRS_RTMASK      (D3DRS_BLENDOPALPHA + 2)
-#define NINED3DRS_ZBIASSCALE  (D3DRS_BLENDOPALPHA + 3)
-#define NINED3DRS_ALPHACOVERAGE  (D3DRS_BLENDOPALPHA + 4)
+#define NINED3DRS_ALPHACOVERAGE  (D3DRS_BLENDOPALPHA + 3)
 
 #define D3DRS_LAST       D3DRS_BLENDOPALPHA
 #define NINED3DRS_LAST   NINED3DRS_ALPHACOVERAGE /* 213 */
@@ -67,17 +66,26 @@
 #define NINE_STATE_BLEND_COLOR (1 << 16)
 #define NINE_STATE_STENCIL_REF (1 << 17)
 #define NINE_STATE_SAMPLE_MASK (1 << 18)
-#define NINE_STATE_MISC_CONST  (1 << 19)
-#define NINE_STATE_FF          (0x1f << 20)
-#define NINE_STATE_FF_VS       (0x17 << 20)
-#define NINE_STATE_FF_PS       (0x18 << 20)
-#define NINE_STATE_FF_LIGHTING (1 << 20)
-#define NINE_STATE_FF_MATERIAL (1 << 21)
-#define NINE_STATE_FF_VSTRANSF (1 << 22)
-#define NINE_STATE_FF_PSSTAGES (1 << 23)
-#define NINE_STATE_FF_OTHER    (1 << 24)
-#define NINE_STATE_ALL          0x1ffffff
-#define NINE_STATE_UNHANDLED   (1 << 25)
+#define NINE_STATE_FF          (0x1f << 19)
+#define NINE_STATE_FF_VS       (0x17 << 19)
+#define NINE_STATE_FF_PS       (0x18 << 19)
+#define NINE_STATE_FF_LIGHTING (1 << 19)
+#define NINE_STATE_FF_MATERIAL (1 << 20)
+#define NINE_STATE_FF_VSTRANSF (1 << 21)
+#define NINE_STATE_FF_PSSTAGES (1 << 22)
+#define NINE_STATE_FF_OTHER    (1 << 23)
+#define NINE_STATE_FOG_SHADER  (1 << 24)
+#define NINE_STATE_PS1X_SHADER (1 << 25)
+#define NINE_STATE_ALL          0x3ffffff
+#define NINE_STATE_UNHANDLED   (1 << 26)
+
+#define NINE_STATE_COMMIT_DSA  (1 << 0)
+#define NINE_STATE_COMMIT_RASTERIZER (1 << 1)
+#define NINE_STATE_COMMIT_BLEND (1 << 2)
+#define NINE_STATE_COMMIT_CONST_VS (1 << 3)
+#define NINE_STATE_COMMIT_CONST_PS (1 << 4)
+#define NINE_STATE_COMMIT_VS (1 << 5)
+#define NINE_STATE_COMMIT_PS (1 << 6)
 
 
 #define NINE_MAX_SIMULTANEOUS_RENDERTARGETS 4
@@ -94,6 +102,8 @@
      NINE_MAX_CONST_I * 4 * sizeof(int))
 
 
+#define NINE_MAX_TEXTURE_STAGES 8
+
 #define NINE_MAX_LIGHTS        65536
 #define NINE_MAX_LIGHTS_ACTIVE 8
 
@@ -124,7 +134,6 @@ struct nine_state
         uint16_t vs_const_b; /* NINE_MAX_CONST_B == 16 */
         uint16_t ps_const_b;
         uint8_t ucp;
-        boolean srgb;
     } changed;
 
     struct NineSurface9 *rt[NINE_MAX_SIMULTANEOUS_RENDERTARGETS];
@@ -143,13 +152,13 @@ struct nine_state
     int    vs_const_i[NINE_MAX_CONST_I][4];
     BOOL   vs_const_b[NINE_MAX_CONST_B];
     float *vs_lconstf_temp;
-    uint32_t vs_key;
 
     struct NinePixelShader9 *ps;
     float *ps_const_f;
     int    ps_const_i[NINE_MAX_CONST_I][4];
     BOOL   ps_const_b[NINE_MAX_CONST_B];
-    uint32_t ps_key;
+    float *ps_lconstf_temp;
+    uint32_t bumpmap_vars[6 * NINE_MAX_TEXTURE_STAGES];
 
     struct {
         void *vs;
@@ -184,13 +193,9 @@ struct nine_state
     struct {
         struct {
             uint32_t group;
-            uint32_t tex_stage[NINE_MAX_SAMPLERS][(NINED3DTSS_COUNT + 31) / 32];
+            uint32_t tex_stage[NINE_MAX_TEXTURE_STAGES][(NINED3DTSS_COUNT + 31) / 32];
             uint32_t transform[(NINED3DTS_COUNT + 31) / 32];
         } changed;
-        struct {
-            boolean vs_const;
-            boolean ps_const;
-        } clobber;
 
         D3DMATRIX *transform; /* access only via nine_state_access_transform */
         unsigned num_transforms;
@@ -205,8 +210,19 @@ struct nine_state
 
         D3DMATERIAL9 material;
 
-        DWORD tex_stage[NINE_MAX_SAMPLERS][NINED3DTSS_COUNT];
+        DWORD tex_stage[NINE_MAX_TEXTURE_STAGES][NINED3DTSS_COUNT];
     } ff;
+
+    uint32_t commit;
+    struct {
+        struct pipe_depth_stencil_alpha_state dsa;
+        struct pipe_rasterizer_state rast;
+        struct pipe_blend_state blend;
+        struct pipe_constant_buffer cb_vs;
+        struct pipe_constant_buffer cb_ps;
+        struct pipe_constant_buffer cb_vs_ff;
+        struct pipe_constant_buffer cb_ps_ff;
+    } pipe;
 };
 
 /* map D3DRS -> NINE_STATE_x
@@ -220,8 +236,10 @@ extern const uint32_t nine_render_states_vertex[(NINED3DRS_COUNT + 31) / 32];
 
 struct NineDevice9;
 
-boolean nine_update_state(struct NineDevice9 *, uint32_t group_mask);
+void nine_update_state_framebuffer(struct NineDevice9 *);
+boolean nine_update_state(struct NineDevice9 *);
 
+void nine_state_restore_non_cso(struct NineDevice9 *device);
 void nine_state_set_defaults(struct NineDevice9 *, const D3DCAPS9 *,
                              boolean is_reset);
 void nine_state_clear(struct nine_state *, const boolean device);
diff --git a/src/gallium/state_trackers/nine/pixelshader9.c b/src/gallium/state_trackers/nine/pixelshader9.c
index 3f176a312bf..42bc349c2cc 100644
--- a/src/gallium/state_trackers/nine/pixelshader9.c
+++ b/src/gallium/state_trackers/nine/pixelshader9.c
@@ -46,7 +46,7 @@ NinePixelShader9_ctor( struct NinePixelShader9 *This,
         return hr;
 
     if (cso) {
-        This->variant.cso = cso;
+        This->ff_cso = cso;
         return D3D_OK;
     }
     device = This->base.device;
@@ -57,6 +57,8 @@ NinePixelShader9_ctor( struct NinePixelShader9 *This,
     info.const_b_base = NINE_CONST_B_BASE(device->max_ps_const_f) / 16;
     info.sampler_mask_shadow = 0x0;
     info.sampler_ps1xtypes = 0x0;
+    info.fog_enable = 0;
+    info.projected = 0;
 
     hr = nine_translate_shader(device, &info);
     if (FAILED(hr))
@@ -69,9 +71,13 @@ NinePixelShader9_ctor( struct NinePixelShader9 *This,
     This->byte_code.size = info.byte_size;
 
     This->variant.cso = info.cso;
+    This->last_cso = info.cso;
+    This->last_key = 0;
+
     This->sampler_mask = info.sampler_mask;
     This->rt_mask = info.rt_mask;
     This->const_used_size = info.const_used_size;
+    This->bumpenvmat_needed = info.bumpenvmat_needed;
     /* no constant relative addressing for ps */
     assert(info.lconstf.data == NULL);
     assert(info.lconstf.ranges == NULL);
@@ -82,11 +88,12 @@ NinePixelShader9_ctor( struct NinePixelShader9 *This,
 void
 NinePixelShader9_dtor( struct NinePixelShader9 *This )
 {
-    DBG("This=%p cso=%p\n", This, This->variant.cso);
+    DBG("This=%p\n", This);
 
     if (This->base.device) {
         struct pipe_context *pipe = This->base.device->pipe;
-        struct nine_shader_variant *var = &This->variant;
+        struct nine_shader_variant64 *var = &This->variant;
+
         do {
             if (var->cso) {
                 if (This->base.device->state.cso.ps == var->cso)
@@ -95,8 +102,14 @@ NinePixelShader9_dtor( struct NinePixelShader9 *This )
             }
             var = var->next;
         } while (var);
+
+        if (This->ff_cso) {
+            if (This->ff_cso == This->base.device->state.cso.ps)
+                pipe->bind_fs_state(pipe, NULL);
+            pipe->delete_fs_state(pipe, This->ff_cso);
+        }
     }
-    nine_shader_variants_free(&This->variant);
+    nine_shader_variants_free64(&This->variant);
 
     FREE((void *)This->byte_code.tokens); /* const_cast */
 
@@ -124,10 +137,16 @@ NinePixelShader9_GetFunction( struct NinePixelShader9 *This,
 }
 
 void *
-NinePixelShader9_GetVariant( struct NinePixelShader9 *This,
-                             uint32_t key )
+NinePixelShader9_GetVariant( struct NinePixelShader9 *This )
 {
-    void *cso = nine_shader_variant_get(&This->variant, key);
+    void *cso;
+    uint64_t key;
+
+    key = This->next_key;
+    if (key == This->last_key)
+        return This->last_cso;
+
+    cso = nine_shader_variant_get64(&This->variant, key);
     if (!cso) {
         struct NineDevice9 *device = This->base.device;
         struct nine_shader_info info;
@@ -139,13 +158,20 @@ NinePixelShader9_GetVariant( struct NinePixelShader9 *This,
         info.byte_code = This->byte_code.tokens;
         info.sampler_mask_shadow = key & 0xffff;
         info.sampler_ps1xtypes = key;
+        info.fog_enable = device->state.rs[D3DRS_FOGENABLE];
+        info.fog_mode = device->state.rs[D3DRS_FOGTABLEMODE];
+        info.projected = (key >> 48) & 0xffff;
 
         hr = nine_translate_shader(This->base.device, &info);
         if (FAILED(hr))
             return NULL;
-        nine_shader_variant_add(&This->variant, key, info.cso);
+        nine_shader_variant_add64(&This->variant, key, info.cso);
         cso = info.cso;
     }
+
+    This->last_key = key;
+    This->last_cso = cso;
+
     return cso;
 }
 
diff --git a/src/gallium/state_trackers/nine/pixelshader9.h b/src/gallium/state_trackers/nine/pixelshader9.h
index 6dad1d1ee76..e09009f6621 100644
--- a/src/gallium/state_trackers/nine/pixelshader9.h
+++ b/src/gallium/state_trackers/nine/pixelshader9.h
@@ -25,13 +25,16 @@
 
 #include "iunknown.h"
 #include "nine_shader.h"
+#include "nine_state.h"
+#include "basetexture9.h"
+#include "nine_ff.h"
 
 struct nine_lconstf;
 
 struct NinePixelShader9
 {
     struct NineUnknown base;
-    struct nine_shader_variant variant;
+    struct nine_shader_variant64 variant;
 
     struct {
         const DWORD *tokens;
@@ -41,11 +44,17 @@ struct NinePixelShader9
 
     unsigned const_used_size; /* in bytes */
 
+    uint8_t bumpenvmat_needed;
     uint16_t sampler_mask;
-    uint16_t sampler_mask_shadow;
     uint8_t rt_mask;
 
     uint64_t ff_key[6];
+    void *ff_cso;
+
+    uint64_t last_key;
+    void *last_cso;
+
+    uint64_t next_key;
 };
 static inline struct NinePixelShader9 *
 NinePixelShader9( void *data )
@@ -53,9 +62,49 @@ NinePixelShader9( void *data )
     return (struct NinePixelShader9 *)data;
 }
 
+static inline BOOL
+NinePixelShader9_UpdateKey( struct NinePixelShader9 *ps,
+                            struct nine_state *state )
+{
+    uint16_t samplers_shadow;
+    uint32_t samplers_ps1_types;
+    uint16_t projected;
+    uint64_t key;
+    BOOL res;
+
+    if (unlikely(ps->byte_code.version < 0x20)) {
+        /* no depth textures, but variable targets */
+        uint32_t m = ps->sampler_mask;
+        samplers_ps1_types = 0;
+        while (m) {
+            int s = ffs(m) - 1;
+            m &= ~(1 << s);
+            samplers_ps1_types |= (state->texture[s] ? state->texture[s]->pstype : 1) << (s * 2);
+        }
+        key = samplers_ps1_types;
+    } else {
+        samplers_shadow = (uint16_t)((state->samplers_shadow & NINE_PS_SAMPLERS_MASK) >> NINE_SAMPLER_PS(0));
+        key = samplers_shadow & ps->sampler_mask;
+    }
+
+    if (ps->byte_code.version < 0x30) {
+        key |= ((uint64_t)state->rs[D3DRS_FOGENABLE]) << 32;
+        key |= ((uint64_t)state->rs[D3DRS_FOGTABLEMODE]) << 33;
+    }
+
+    if (unlikely(ps->byte_code.version < 0x14)) {
+        projected = nine_ff_get_projected_key(state);
+        key |= ((uint64_t) projected) << 48;
+    }
+
+    res = ps->last_key != key;
+    if (res)
+        ps->next_key = key;
+    return res;
+}
+
 void *
-NinePixelShader9_GetVariant( struct NinePixelShader9 *vs,
-                             uint32_t key );
+NinePixelShader9_GetVariant( struct NinePixelShader9 *ps );
 
 /*** public ***/
 
diff --git a/src/gallium/state_trackers/nine/resource9.c b/src/gallium/state_trackers/nine/resource9.c
index bbc8320071b..6d915338b24 100644
--- a/src/gallium/state_trackers/nine/resource9.c
+++ b/src/gallium/state_trackers/nine/resource9.c
@@ -161,20 +161,22 @@ NineResource9_GetPrivateData( struct NineResource9 *This,
                               DWORD *pSizeOfData )
 {
     struct pheader *header;
+    DWORD sizeofdata;
 
     DBG("This=%p refguid=%p pData=%p pSizeOfData=%p\n",
         This, refguid, pData, pSizeOfData);
 
-    user_assert(pSizeOfData, E_POINTER);
-
     header = util_hash_table_get(This->pdata, refguid);
     if (!header) { return D3DERR_NOTFOUND; }
 
+    user_assert(pSizeOfData, E_POINTER);
+    sizeofdata = *pSizeOfData;
+    *pSizeOfData = header->size;
+
     if (!pData) {
-        *pSizeOfData = header->size;
         return D3D_OK;
     }
-    if (*pSizeOfData < header->size) {
+    if (sizeofdata < header->size) {
         return D3DERR_MOREDATA;
     }
 
@@ -206,10 +208,13 @@ DWORD WINAPI
 NineResource9_SetPriority( struct NineResource9 *This,
                            DWORD PriorityNew )
 {
-    DWORD prev = This->priority;
-
+    DWORD prev;
     DBG("This=%p, PriorityNew=%d\n", This, PriorityNew);
 
+    if (This->pool != D3DPOOL_MANAGED || This->type == D3DRTYPE_SURFACE)
+        return 0;
+
+    prev = This->priority;
     This->priority = PriorityNew;
     return prev;
 }
@@ -217,6 +222,9 @@ NineResource9_SetPriority( struct NineResource9 *This,
 DWORD WINAPI
 NineResource9_GetPriority( struct NineResource9 *This )
 {
+    if (This->pool != D3DPOOL_MANAGED || This->type == D3DRTYPE_SURFACE)
+        return 0;
+
     return This->priority;
 }
 
diff --git a/src/gallium/state_trackers/nine/stateblock9.c b/src/gallium/state_trackers/nine/stateblock9.c
index 032b9ffcbf0..6d6e1be0b7f 100644
--- a/src/gallium/state_trackers/nine/stateblock9.c
+++ b/src/gallium/state_trackers/nine/stateblock9.c
@@ -251,7 +251,7 @@ nine_state_copy_common(struct nine_state *dst,
         dst->ff.material = src->ff.material;
 
     if (mask->changed.group & NINE_STATE_FF_PSSTAGES) {
-        for (s = 0; s < NINE_MAX_SAMPLERS; ++s) {
+        for (s = 0; s < NINE_MAX_TEXTURE_STAGES; ++s) {
             for (i = 0; i < NINED3DTSS_COUNT; ++i)
                 if (mask->ff.changed.tex_stage[s][i / 32] & (1 << (i % 32)))
                     dst->ff.tex_stage[s][i] = src->ff.tex_stage[s][i];
diff --git a/src/gallium/state_trackers/nine/surface9.c b/src/gallium/state_trackers/nine/surface9.c
index 7533cb3a454..14c1ce927ad 100644
--- a/src/gallium/state_trackers/nine/surface9.c
+++ b/src/gallium/state_trackers/nine/surface9.c
@@ -104,11 +104,11 @@ NineSurface9_ctor( struct NineSurface9 *This,
     /* Ram buffer with no parent. Has to allocate the resource itself */
     if (!pResource && !pContainer) {
         assert(!user_buffer);
-        This->data = MALLOC(
+        This->data = align_malloc(
             nine_format_get_level_alloc_size(This->base.info.format,
                                              pDesc->Width,
                                              pDesc->Height,
-                                             0));
+                                             0), 32);
         if (!This->data)
             return E_OUTOFMEMORY;
     }
@@ -273,7 +273,7 @@ NineSurface9_AddDirtyRect( struct NineSurface9 *This,
             This->texture == D3DRTYPE_CUBETEXTURE ||
             This->texture == D3DRTYPE_TEXTURE);
 
-    if (This->base.pool != D3DPOOL_MANAGED)
+    if (This->base.pool == D3DPOOL_DEFAULT)
         return;
 
     /* Add a dirty rect to level 0 of the parent texture */
@@ -287,7 +287,7 @@ NineSurface9_AddDirtyRect( struct NineSurface9 *This,
             NineTexture9(This->base.base.container);
 
         NineTexture9_AddDirtyRect(tex, &dirty_rect);
-    } else { /* This->texture == D3DRTYPE_CUBETEXTURE */
+    } else if (This->texture == D3DRTYPE_CUBETEXTURE) {
         struct NineCubeTexture9 *ctex =
             NineCubeTexture9(This->base.base.container);
 
@@ -323,6 +323,13 @@ NineSurface9_LockRect( struct NineSurface9 *This,
         nine_D3DLOCK_to_str(Flags));
     NineSurface9_Dump(This);
 
+    /* check if it's already locked */
+    user_assert(This->lock_count == 0, D3DERR_INVALIDCALL);
+
+    /* set pBits to NULL after lock_count check */
+    user_assert(pLockedRect, E_POINTER);
+    pLockedRect->pBits = NULL;
+
 #ifdef NINE_STRICT
     user_assert(This->base.pool != D3DPOOL_DEFAULT ||
                 (resource && (resource->flags & NINE_RESOURCE_FLAG_LOCKABLE)),
@@ -337,19 +344,17 @@ NineSurface9_LockRect( struct NineSurface9 *This,
     user_assert(!((Flags & D3DLOCK_DISCARD) && (Flags & D3DLOCK_READONLY)),
                 D3DERR_INVALIDCALL);
 
-    /* check if it's already locked */
-    user_assert(This->lock_count == 0, D3DERR_INVALIDCALL);
-    user_assert(pLockedRect, E_POINTER);
-
     user_assert(This->desc.MultiSampleType == D3DMULTISAMPLE_NONE,
                 D3DERR_INVALIDCALL);
 
-    if (pRect && This->base.pool == D3DPOOL_DEFAULT &&
-        util_format_is_compressed(This->base.info.format)) {
+    if (pRect && This->desc.Pool == D3DPOOL_DEFAULT &&
+        compressed_format (This->desc.Format)) {
         const unsigned w = util_format_get_blockwidth(This->base.info.format);
         const unsigned h = util_format_get_blockheight(This->base.info.format);
-        user_assert(!(pRect->left % w) && !(pRect->right % w) &&
-                    !(pRect->top % h) && !(pRect->bottom % h),
+        user_assert((pRect->left == 0 && pRect->right == This->desc.Width &&
+                     pRect->top == 0 && pRect->bottom == This->desc.Height) ||
+                    (!(pRect->left % w) && !(pRect->right % w) &&
+                    !(pRect->top % h) && !(pRect->bottom % h)),
                     D3DERR_INVALIDCALL);
     }
 
@@ -363,13 +368,9 @@ NineSurface9_LockRect( struct NineSurface9 *This,
         usage |= PIPE_TRANSFER_DONTBLOCK;
 
     if (pRect) {
+        /* Windows XP accepts invalid locking rectangles, Windows 7 rejects
+         * them. Use Windows XP behaviour for now. */
         rect_to_pipe_box(&box, pRect);
-        if (u_box_clip_2d(&box, &box, This->desc.Width,
-                          This->desc.Height) < 0) {
-            DBG("pRect clipped by Width=%u Height=%u\n",
-                This->desc.Width, This->desc.Height);
-            return D3DERR_INVALIDCALL;
-        }
     } else {
         u_box_origin_2d(This->desc.Width, This->desc.Height, &box);
     }
@@ -463,140 +464,92 @@ IDirect3DSurface9Vtbl NineSurface9_vtable = {
     (void *)NineSurface9_ReleaseDC
 };
 
-HRESULT
-NineSurface9_CopySurface( struct NineSurface9 *This,
-                          struct NineSurface9 *From,
-                          const POINT *pDestPoint,
-                          const RECT *pSourceRect )
+/* When this function is called, we have already checked
+ * The copy regions fit the surfaces */
+void
+NineSurface9_CopyMemToDefault( struct NineSurface9 *This,
+                               struct NineSurface9 *From,
+                               const POINT *pDestPoint,
+                               const RECT *pSourceRect )
 {
     struct pipe_context *pipe = This->pipe;
     struct pipe_resource *r_dst = This->base.resource;
-    struct pipe_resource *r_src = From->base.resource;
-    struct pipe_transfer *transfer;
-    struct pipe_box src_box;
     struct pipe_box dst_box;
-    uint8_t *p_dst;
     const uint8_t *p_src;
+    int src_x, src_y, dst_x, dst_y, copy_width, copy_height;
 
-    DBG("This=%p From=%p pDestPoint=%p pSourceRect=%p\n",
-        This, From, pDestPoint, pSourceRect);
-
-    assert(This->base.pool != D3DPOOL_MANAGED &&
-           From->base.pool != D3DPOOL_MANAGED);
+    assert(This->base.pool == D3DPOOL_DEFAULT &&
+           From->base.pool == D3DPOOL_SYSTEMMEM);
 
-    user_assert(This->desc.Format == From->desc.Format, D3DERR_INVALIDCALL);
+    if (pDestPoint) {
+        dst_x = pDestPoint->x;
+        dst_y = pDestPoint->y;
+    } else {
+        dst_x = 0;
+        dst_y = 0;
+    }
 
-    dst_box.x = pDestPoint ? pDestPoint->x : 0;
-    dst_box.y = pDestPoint ? pDestPoint->y : 0;
+    if (pSourceRect) {
+        src_x = pSourceRect->left;
+        src_y = pSourceRect->top;
+        copy_width = pSourceRect->right - pSourceRect->left;
+        copy_height = pSourceRect->bottom - pSourceRect->top;
+    } else {
+        src_x = 0;
+        src_y = 0;
+        copy_width = From->desc.Width;
+        copy_height = From->desc.Height;
+    }
 
-    user_assert(dst_box.x >= 0 &&
-                dst_box.y >= 0, D3DERR_INVALIDCALL);
+    u_box_2d_zslice(dst_x, dst_y, This->layer,
+                    copy_width, copy_height, &dst_box);
 
-    dst_box.z = This->layer;
-    src_box.z = From->layer;
+    p_src = NineSurface9_GetSystemMemPointer(From, src_x, src_y);
 
-    dst_box.depth = 1;
-    src_box.depth = 1;
+    pipe->transfer_inline_write(pipe, r_dst, This->level,
+                                0, /* WRITE|DISCARD are implicit */
+                                &dst_box, p_src, From->stride, 0);
 
-    if (pSourceRect) {
-        /* make sure it doesn't range outside the source surface */
-        user_assert(pSourceRect->left >= 0 &&
-                    pSourceRect->right <= From->desc.Width &&
-                    pSourceRect->top >= 0 &&
-                    pSourceRect->bottom <= From->desc.Height,
-                    D3DERR_INVALIDCALL);
-        if (rect_to_pipe_box_xy_only_clamp(&src_box, pSourceRect))
-            return D3D_OK;
-    } else {
-        src_box.x = 0;
-        src_box.y = 0;
-        src_box.width = From->desc.Width;
-        src_box.height = From->desc.Height;
-    }
+    NineSurface9_MarkContainerDirty(This);
+}
 
-    /* limits */
-    dst_box.width = This->desc.Width - dst_box.x;
-    dst_box.height = This->desc.Height - dst_box.y;
+void
+NineSurface9_CopyDefaultToMem( struct NineSurface9 *This,
+                               struct NineSurface9 *From )
+{
+    struct pipe_context *pipe = This->pipe;
+    struct pipe_resource *r_src = From->base.resource;
+    struct pipe_transfer *transfer;
+    struct pipe_box src_box;
+    uint8_t *p_dst;
+    const uint8_t *p_src;
 
-    user_assert(src_box.width <= dst_box.width &&
-                src_box.height <= dst_box.height, D3DERR_INVALIDCALL);
+    assert(This->base.pool == D3DPOOL_SYSTEMMEM &&
+           From->base.pool == D3DPOOL_DEFAULT);
 
-    dst_box.width = src_box.width;
-    dst_box.height = src_box.height;
+    assert(This->desc.Width == From->desc.Width);
+    assert(This->desc.Height == From->desc.Height);
 
-    /* check source block align for compressed textures */
-    if (util_format_is_compressed(From->base.info.format) &&
-        ((src_box.width != From->desc.Width) ||
-         (src_box.height != From->desc.Height))) {
-        const unsigned w = util_format_get_blockwidth(From->base.info.format);
-        const unsigned h = util_format_get_blockheight(From->base.info.format);
-        user_assert(!(src_box.width % w) &&
-                    !(src_box.height % h),
-                    D3DERR_INVALIDCALL);
-    }
+    u_box_origin_2d(This->desc.Width, This->desc.Height, &src_box);
+    src_box.z = From->layer;
 
-    /* check destination block align for compressed textures */
-    if (util_format_is_compressed(This->base.info.format) &&
-        ((dst_box.width != This->desc.Width) ||
-         (dst_box.height != This->desc.Height) ||
-         dst_box.x != 0 ||
-         dst_box.y != 0)) {
-        const unsigned w = util_format_get_blockwidth(This->base.info.format);
-        const unsigned h = util_format_get_blockheight(This->base.info.format);
-        user_assert(!(dst_box.x % w) && !(dst_box.width % w) &&
-                    !(dst_box.y % h) && !(dst_box.height % h),
-                    D3DERR_INVALIDCALL);
-    }
+    p_src = pipe->transfer_map(pipe, r_src, From->level,
+                               PIPE_TRANSFER_READ,
+                               &src_box, &transfer);
+    p_dst = NineSurface9_GetSystemMemPointer(This, 0, 0);
 
-    if (r_dst && r_src) {
-        pipe->resource_copy_region(pipe,
-                                   r_dst, This->level,
-                                   dst_box.x, dst_box.y, dst_box.z,
-                                   r_src, From->level,
-                                   &src_box);
-    } else
-    if (r_dst) {
-        p_src = NineSurface9_GetSystemMemPointer(From, src_box.x, src_box.y);
-
-        pipe->transfer_inline_write(pipe, r_dst, This->level,
-                                    0, /* WRITE|DISCARD are implicit */
-                                    &dst_box, p_src, From->stride, 0);
-    } else
-    if (r_src) {
-        p_dst = NineSurface9_GetSystemMemPointer(This, 0, 0);
-
-        p_src = pipe->transfer_map(pipe, r_src, From->level,
-                                   PIPE_TRANSFER_READ,
-                                   &src_box, &transfer);
-        if (!p_src)
-            return D3DERR_DRIVERINTERNALERROR;
-
-        util_copy_rect(p_dst, This->base.info.format,
-                       This->stride, dst_box.x, dst_box.y,
-                       dst_box.width, dst_box.height,
-                       p_src,
-                       transfer->stride, src_box.x, src_box.y);
-
-        pipe->transfer_unmap(pipe, transfer);
-    } else {
-        p_dst = NineSurface9_GetSystemMemPointer(This, 0, 0);
-        p_src = NineSurface9_GetSystemMemPointer(From, 0, 0);
-
-        util_copy_rect(p_dst, This->base.info.format,
-                       This->stride, dst_box.x, dst_box.y,
-                       dst_box.width, dst_box.height,
-                       p_src,
-                       From->stride, src_box.x, src_box.y);
-    }
+    assert (p_src && p_dst);
 
-    if (This->base.pool == D3DPOOL_DEFAULT)
-        NineSurface9_MarkContainerDirty(This);
-    if (!r_dst && This->base.resource)
-        NineSurface9_AddDirtyRect(This, &dst_box);
+    util_copy_rect(p_dst, This->base.info.format,
+                   This->stride, 0, 0,
+                   This->desc.Width, This->desc.Height,
+                   p_src,
+                   transfer->stride, 0, 0);
 
-    return D3D_OK;
+    pipe->transfer_unmap(pipe, transfer);
 }
 
+
 /* Gladly, rendering to a MANAGED surface is not permitted, so we will
  * never have to do the reverse, i.e. download the surface.
  */
diff --git a/src/gallium/state_trackers/nine/surface9.h b/src/gallium/state_trackers/nine/surface9.h
index 73092ab8cf5..76156ae699c 100644
--- a/src/gallium/state_trackers/nine/surface9.h
+++ b/src/gallium/state_trackers/nine/surface9.h
@@ -125,11 +125,15 @@ HRESULT
 NineSurface9_UploadSelf( struct NineSurface9 *This,
                          const struct pipe_box *damaged );
 
-HRESULT
-NineSurface9_CopySurface( struct NineSurface9 *This,
-                          struct NineSurface9 *From,
-                          const POINT *pDestPoint,
-                          const RECT *pSourceRect );
+void
+NineSurface9_CopyMemToDefault( struct NineSurface9 *This,
+                               struct NineSurface9 *From,
+                               const POINT *pDestPoint,
+                               const RECT *pSourceRect );
+
+void
+NineSurface9_CopyDefaultToMem( struct NineSurface9 *This,
+                               struct NineSurface9 *From );
 
 static inline boolean
 NineSurface9_IsOffscreenPlain (struct NineSurface9 *This )
diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c
index a62e6ad99d8..3f5be26fed7 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -184,7 +184,9 @@ NineSwapChain9_Resize( struct NineSwapChain9 *This,
 
     /* Note: It is the role of the backend to fill if necessary
      * BackBufferWidth and BackBufferHeight */
-    ID3DPresent_SetPresentParameters(This->present, pParams, This->mode);
+    hr = ID3DPresent_SetPresentParameters(This->present, pParams, This->mode);
+    if (hr != D3D_OK)
+        return hr;
 
     /* When we have flip behaviour, d3d9 expects we get back the screen buffer when we flip.
      * Here we don't get back the initial content of the screen. To emulate the behaviour
@@ -575,9 +577,10 @@ handle_draw_cursor_and_hud( struct NineSwapChain9 *This, struct pipe_resource *r
         blit.filter = PIPE_TEX_FILTER_NEAREST;
         blit.scissor_enable = FALSE;
 
-        ID3DPresent_GetCursorPos(This->present, &device->cursor.pos);
-
-        /* NOTE: blit messes up when box.x + box.width < 0, fix driver */
+        /* NOTE: blit messes up when box.x + box.width < 0, fix driver
+         * NOTE2: device->cursor.pos contains coordinates relative to the screen.
+         * This happens to be also the position of the cursor when we are fullscreen.
+         * We don't use sw cursor for Windowed mode */
         blit.dst.box.x = MAX2(device->cursor.pos.x, 0) - device->cursor.hotspot.x;
         blit.dst.box.y = MAX2(device->cursor.pos.y, 0) - device->cursor.hotspot.y;
         blit.dst.box.width = blit.src.box.width;
@@ -587,13 +590,14 @@ handle_draw_cursor_and_hud( struct NineSwapChain9 *This, struct pipe_resource *r
             blit.src.box.width, blit.src.box.height,
             blit.dst.box.x, blit.dst.box.y);
 
+        blit.alpha_blend = TRUE;
         This->pipe->blit(This->pipe, &blit);
     }
 
     if (device->hud && resource) {
         hud_draw(device->hud, resource); /* XXX: no offset */
         /* HUD doesn't clobber stipple */
-        NineDevice9_RestoreNonCSOState(device, ~0x2);
+        nine_state_restore_non_cso(device);
     }
 }
 
@@ -704,6 +708,7 @@ present( struct NineSwapChain9 *This,
         blit.mask = PIPE_MASK_RGBA;
         blit.filter = PIPE_TEX_FILTER_NEAREST;
         blit.scissor_enable = FALSE;
+        blit.alpha_blend = FALSE;
 
         This->pipe->blit(This->pipe, &blit);
     }
@@ -835,7 +840,7 @@ NineSwapChain9_Present( struct NineSwapChain9 *This,
     ID3DPresent_WaitBufferReleased(This->present, This->present_handles[0]);
 
     This->base.device->state.changed.group |= NINE_STATE_FB;
-    nine_update_state(This->base.device, NINE_STATE_FB);
+    nine_update_state_framebuffer(This->base.device);
 
     return hr;
 }
@@ -856,6 +861,8 @@ NineSwapChain9_GetFrontBufferData( struct NineSwapChain9 *This,
     DBG("GetFrontBufferData: This=%p pDestSurface=%p\n",
         This, pDestSurface);
 
+    user_assert(dest_surface->base.pool == D3DPOOL_SYSTEMMEM, D3DERR_INVALIDCALL);
+
     width = dest_surface->desc.Width;
     height = dest_surface->desc.Height;
 
@@ -870,7 +877,7 @@ NineSwapChain9_GetFrontBufferData( struct NineSwapChain9 *This,
     desc.MultiSampleQuality = 0;
     desc.Width = width;
     desc.Height = height;
-    /* NineSurface9_CopySurface needs same format. */
+    /* NineSurface9_CopyDefaultToMem needs same format. */
     desc.Format = dest_surface->desc.Format;
     desc.Usage = D3DUSAGE_RENDERTARGET;
     hr = NineSurface9_new(pDevice, NineUnknown(This), temp_resource, NULL, 0,
@@ -883,7 +890,7 @@ NineSwapChain9_GetFrontBufferData( struct NineSwapChain9 *This,
 
     ID3DPresent_FrontBufferCopy(This->present, temp_handle);
 
-    NineSurface9_CopySurface(dest_surface, temp_surface, NULL, NULL);
+    NineSurface9_CopyDefaultToMem(dest_surface, temp_surface);
 
     ID3DPresent_DestroyD3DWindowBuffer(This->present, temp_handle);
     NineUnknown_Destroy(NineUnknown(temp_surface));
diff --git a/src/gallium/state_trackers/nine/texture9.c b/src/gallium/state_trackers/nine/texture9.c
index 5900e76e52c..bc325c1335e 100644
--- a/src/gallium/state_trackers/nine/texture9.c
+++ b/src/gallium/state_trackers/nine/texture9.c
@@ -101,6 +101,13 @@ NineTexture9_ctor( struct NineTexture9 *This,
     if (Format != D3DFMT_NULL && pf == PIPE_FORMAT_NONE)
         return D3DERR_INVALIDCALL;
 
+    if (compressed_format(Format)) {
+        const unsigned w = util_format_get_blockwidth(pf);
+        const unsigned h = util_format_get_blockheight(pf);
+
+        user_assert(!(Width % w) && !(Height % h), D3DERR_INVALIDCALL);
+    }
+
     info->screen = screen;
     info->target = PIPE_TEXTURE_2D;
     info->format = pf;
@@ -152,10 +159,10 @@ NineTexture9_ctor( struct NineTexture9 *This,
          * apps access sublevels of texture even if they locked only first
          * level) */
         level_offsets = alloca(sizeof(unsigned) * (info->last_level + 1));
-        user_buffer = MALLOC(
+        user_buffer = align_malloc(
             nine_format_get_size_and_offsets(pf, level_offsets,
                                              Width, Height,
-                                             info->last_level));
+                                             info->last_level), 32);
         This->managed_buffer = user_buffer;
         if (!This->managed_buffer)
             return E_OUTOFMEMORY;
@@ -202,6 +209,9 @@ NineTexture9_ctor( struct NineTexture9 *This,
             return hr;
     }
 
+    /* Textures start initially dirty */
+    This->dirty_rect.width = Width;
+    This->dirty_rect.height = Height;
     This->dirty_rect.depth = 1; /* widht == 0 means empty, depth stays 1 */
 
     if (pSharedHandle && !*pSharedHandle) {/* Pool == D3DPOOL_SYSTEMMEM */
@@ -219,7 +229,8 @@ NineTexture9_dtor( struct NineTexture9 *This )
     if (This->surfaces) {
         /* The surfaces should have 0 references and be unbound now. */
         for (l = 0; l <= This->base.base.info.last_level; ++l)
-            NineUnknown_Destroy(&This->surfaces[l]->base.base);
+            if (This->surfaces[l])
+                NineUnknown_Destroy(&This->surfaces[l]->base.base);
         FREE(This->surfaces);
     }
 
@@ -295,18 +306,22 @@ NineTexture9_AddDirtyRect( struct NineTexture9 *This,
         pDirtyRect ? pDirtyRect->left : 0, pDirtyRect ? pDirtyRect->top : 0,
         pDirtyRect ? pDirtyRect->right : 0, pDirtyRect ? pDirtyRect->bottom : 0);
 
-    /* Tracking dirty regions on DEFAULT or SYSTEMMEM resources is pointless,
+    /* Tracking dirty regions on DEFAULT resources is pointless,
      * because we always write to the final storage. Just marked it dirty in
      * case we need to generate mip maps.
      */
-    if (This->base.base.pool != D3DPOOL_MANAGED) {
-        if (This->base.base.usage & D3DUSAGE_AUTOGENMIPMAP)
+    if (This->base.base.pool == D3DPOOL_DEFAULT) {
+        if (This->base.base.usage & D3DUSAGE_AUTOGENMIPMAP) {
             This->base.dirty_mip = TRUE;
+            BASETEX_REGISTER_UPDATE(&This->base);
+        }
         return D3D_OK;
     }
-    This->base.managed.dirty = TRUE;
 
-    BASETEX_REGISTER_UPDATE(&This->base);
+    if (This->base.base.pool == D3DPOOL_MANAGED) {
+        This->base.managed.dirty = TRUE;
+        BASETEX_REGISTER_UPDATE(&This->base);
+    }
 
     if (!pDirtyRect) {
         u_box_origin_2d(This->base.base.info.width0,
diff --git a/src/gallium/state_trackers/nine/vertexshader9.c b/src/gallium/state_trackers/nine/vertexshader9.c
index bbd5ce99d9a..fdfb79a138e 100644
--- a/src/gallium/state_trackers/nine/vertexshader9.c
+++ b/src/gallium/state_trackers/nine/vertexshader9.c
@@ -48,9 +48,10 @@ NineVertexShader9_ctor( struct NineVertexShader9 *This,
         return hr;
 
     if (cso) {
-        This->variant.cso = cso;
+        This->ff_cso = cso;
         return D3D_OK;
     }
+
     device = This->base.device;
 
     info.type = PIPE_SHADER_VERTEX;
@@ -59,6 +60,7 @@ NineVertexShader9_ctor( struct NineVertexShader9 *This,
     info.const_b_base = NINE_CONST_B_BASE(device->max_vs_const_f) / 16;
     info.sampler_mask_shadow = 0x0;
     info.sampler_ps1xtypes = 0x0;
+    info.fog_enable = 0;
 
     hr = nine_translate_shader(device, &info);
     if (FAILED(hr))
@@ -71,6 +73,9 @@ NineVertexShader9_ctor( struct NineVertexShader9 *This,
     This->byte_code.size = info.byte_size;
 
     This->variant.cso = info.cso;
+    This->last_cso = info.cso;
+    This->last_key = 0;
+
     This->const_used_size = info.const_used_size;
     This->lconstf = info.lconstf;
     This->sampler_mask = info.sampler_mask;
@@ -87,11 +92,12 @@ NineVertexShader9_ctor( struct NineVertexShader9 *This,
 void
 NineVertexShader9_dtor( struct NineVertexShader9 *This )
 {
-    DBG("This=%p cso=%p\n", This, This->variant.cso);
+    DBG("This=%p\n", This);
 
     if (This->base.device) {
         struct pipe_context *pipe = This->base.device->pipe;
         struct nine_shader_variant *var = &This->variant;
+
         do {
             if (var->cso) {
                 if (This->base.device->state.cso.vs == var->cso)
@@ -100,6 +106,12 @@ NineVertexShader9_dtor( struct NineVertexShader9 *This )
             }
             var = var->next;
         } while (var);
+
+        if (This->ff_cso) {
+            if (This->ff_cso == This->base.device->state.cso.vs)
+                pipe->bind_vs_state(pipe, NULL);
+            pipe->delete_vs_state(pipe, This->ff_cso);
+        }
     }
     nine_shader_variants_free(&This->variant);
 
@@ -130,10 +142,16 @@ NineVertexShader9_GetFunction( struct NineVertexShader9 *This,
 }
 
 void *
-NineVertexShader9_GetVariant( struct NineVertexShader9 *This,
-                              uint32_t key )
+NineVertexShader9_GetVariant( struct NineVertexShader9 *This )
 {
-    void *cso = nine_shader_variant_get(&This->variant, key);
+    void *cso;
+    uint32_t key;
+
+    key = This->next_key;
+    if (key == This->last_key)
+        return This->last_cso;
+
+    cso = nine_shader_variant_get(&This->variant, key);
     if (!cso) {
         struct NineDevice9 *device = This->base.device;
         struct nine_shader_info info;
@@ -144,6 +162,7 @@ NineVertexShader9_GetVariant( struct NineVertexShader9 *This,
         info.const_b_base = NINE_CONST_B_BASE(device->max_vs_const_f) / 16;
         info.byte_code = This->byte_code.tokens;
         info.sampler_mask_shadow = key & 0xf;
+        info.fog_enable = device->state.rs[D3DRS_FOGENABLE];
 
         hr = nine_translate_shader(This->base.device, &info);
         if (FAILED(hr))
@@ -151,6 +170,10 @@ NineVertexShader9_GetVariant( struct NineVertexShader9 *This,
         nine_shader_variant_add(&This->variant, key, info.cso);
         cso = info.cso;
     }
+
+    This->last_key = key;
+    This->last_cso = cso;
+
     return cso;
 }
 
diff --git a/src/gallium/state_trackers/nine/vertexshader9.h b/src/gallium/state_trackers/nine/vertexshader9.h
index 66c602c7b3c..15c3f4ff041 100644
--- a/src/gallium/state_trackers/nine/vertexshader9.h
+++ b/src/gallium/state_trackers/nine/vertexshader9.h
@@ -25,6 +25,7 @@
 
 #include "iunknown.h"
 #include "nine_shader.h"
+#include "nine_state.h"
 
 struct NineVertexShader9
 {
@@ -43,7 +44,6 @@ struct NineVertexShader9
     } byte_code;
 
     uint8_t sampler_mask;
-    uint8_t sampler_mask_shadow;
 
     boolean position_t; /* if true, disable vport transform */
     boolean point_size; /* if true, set rasterizer.point_size_per_vertex to 1 */
@@ -54,7 +54,13 @@ struct NineVertexShader9
 
     const struct pipe_stream_output_info *so;
 
-    uint64_t ff_key[2];
+    uint64_t ff_key[3];
+    void *ff_cso;
+
+    uint32_t last_key;
+    void *last_cso;
+
+    uint32_t next_key;
 };
 static inline struct NineVertexShader9 *
 NineVertexShader9( void *data )
@@ -62,9 +68,29 @@ NineVertexShader9( void *data )
     return (struct NineVertexShader9 *)data;
 }
 
+static inline BOOL
+NineVertexShader9_UpdateKey( struct NineVertexShader9 *vs,
+                             struct nine_state *state )
+{
+    uint8_t samplers_shadow;
+    uint32_t key;
+    BOOL res;
+
+    samplers_shadow = (uint8_t)((state->samplers_shadow & NINE_VS_SAMPLERS_MASK) >> NINE_SAMPLER_VS(0));
+    samplers_shadow &= vs->sampler_mask;
+    key = samplers_shadow;
+
+    if (vs->byte_code.version < 0x30)
+        key |= state->rs[D3DRS_FOGENABLE] << 8;
+
+    res = vs->last_key != key;
+    if (res)
+        vs->next_key = key;
+    return res;
+}
+
 void *
-NineVertexShader9_GetVariant( struct NineVertexShader9 *vs,
-                              uint32_t key );
+NineVertexShader9_GetVariant( struct NineVertexShader9 *vs );
 
 /*** public ***/
 
diff --git a/src/gallium/state_trackers/nine/volume9.c b/src/gallium/state_trackers/nine/volume9.c
index 4dfc5599a8e..0b9005685a9 100644
--- a/src/gallium/state_trackers/nine/volume9.c
+++ b/src/gallium/state_trackers/nine/volume9.c
@@ -23,6 +23,7 @@
 #include "device9.h"
 #include "volume9.h"
 #include "basetexture9.h" /* for marking dirty */
+#include "volumetexture9.h"
 #include "nine_helpers.h"
 #include "nine_pipe.h"
 #include "nine_dump.h"
@@ -43,7 +44,7 @@ NineVolume9_AllocateData( struct NineVolume9 *This )
     DBG("(%p(This=%p),level=%u) Allocating 0x%x bytes of system memory.\n",
         This->base.container, This, This->level, size);
 
-    This->data = (uint8_t *)MALLOC(size);
+    This->data = (uint8_t *)align_malloc(size, 32);
     if (!This->data)
         return E_OUTOFMEMORY;
     return D3D_OK;
@@ -182,47 +183,23 @@ NineVolume9_GetDesc( struct NineVolume9 *This,
     return D3D_OK;
 }
 
-static inline boolean
-NineVolume9_IsDirty(struct NineVolume9 *This)
-{
-    return This->dirty_box[0].width != 0;
-}
-
 inline void
 NineVolume9_AddDirtyRegion( struct NineVolume9 *This,
                             const struct pipe_box *box )
 {
-    struct pipe_box cover_a, cover_b;
-    float vol[2];
+    D3DBOX dirty_region;
+    struct NineVolumeTexture9 *tex = NineVolumeTexture9(This->base.container);
 
     if (!box) {
-        u_box_3d(0, 0, 0, This->desc.Width, This->desc.Height,
-                 This->desc.Depth, &This->dirty_box[0]);
-        memset(&This->dirty_box[1], 0, sizeof(This->dirty_box[1]));
-        return;
-    }
-    if (!This->dirty_box[0].width) {
-        This->dirty_box[0] = *box;
-        return;
-    }
-
-    u_box_union_3d(&cover_a, &This->dirty_box[0], box);
-    vol[0] = u_box_volume_3d(&cover_a);
-
-    if (This->dirty_box[1].width == 0) {
-        vol[1] = u_box_volume_3d(&This->dirty_box[0]);
-        if (vol[0] > (vol[1] * 1.5f))
-            This->dirty_box[1] = *box;
-        else
-            This->dirty_box[0] = cover_a;
+        NineVolumeTexture9_AddDirtyBox(tex, NULL);
     } else {
-        u_box_union_3d(&cover_b, &This->dirty_box[1], box);
-        vol[1] = u_box_volume_3d(&cover_b);
-
-        if (vol[0] > vol[1])
-            This->dirty_box[1] = cover_b;
-        else
-            This->dirty_box[0] = cover_a;
+        dirty_region.Left = box->x << This->level_actual;
+        dirty_region.Top = box->y << This->level_actual;
+        dirty_region.Front = box->z << This->level_actual;
+        dirty_region.Right = dirty_region.Left + (box->width << This->level_actual);
+        dirty_region.Bottom = dirty_region.Top + (box->height << This->level_actual);
+        dirty_region.Back = dirty_region.Front + (box->depth << This->level_actual);
+        NineVolumeTexture9_AddDirtyBox(tex, &dirty_region);
     }
 }
 
@@ -254,21 +231,26 @@ NineVolume9_LockBox( struct NineVolume9 *This,
         pBox ? pBox->Front : 0, pBox ? pBox->Back : 0,
         nine_D3DLOCK_to_str(Flags));
 
+    /* check if it's already locked */
+    user_assert(This->lock_count == 0, D3DERR_INVALIDCALL);
+
+    /* set pBits to NULL after lock_count check */
+    user_assert(pLockedVolume, E_POINTER);
+    pLockedVolume->pBits = NULL;
+
     user_assert(This->desc.Pool != D3DPOOL_DEFAULT ||
                 (This->desc.Usage & D3DUSAGE_DYNAMIC), D3DERR_INVALIDCALL);
 
     user_assert(!((Flags & D3DLOCK_DISCARD) && (Flags & D3DLOCK_READONLY)),
                 D3DERR_INVALIDCALL);
 
-    user_assert(This->lock_count == 0, D3DERR_INVALIDCALL);
-    user_assert(pLockedVolume, E_POINTER);
-
-    if (pBox && This->desc.Pool == D3DPOOL_DEFAULT &&
-        util_format_is_compressed(This->info.format)) {
+    if (pBox && compressed_format (This->desc.Format)) { /* For volume all pools are checked */
         const unsigned w = util_format_get_blockwidth(This->info.format);
         const unsigned h = util_format_get_blockheight(This->info.format);
-        user_assert(!(pBox->Left % w) && !(pBox->Right % w) &&
-                    !(pBox->Top % h) && !(pBox->Bottom % h),
+        user_assert((pBox->Left == 0 && pBox->Right == This->desc.Width &&
+                     pBox->Top == 0 && pBox->Bottom == This->desc.Height) ||
+                    (!(pBox->Left % w) && !(pBox->Right % w) &&
+                     !(pBox->Top % h) && !(pBox->Bottom % h)),
                     D3DERR_INVALIDCALL);
     }
 
@@ -312,8 +294,7 @@ NineVolume9_LockBox( struct NineVolume9 *This,
 
     if (!(Flags & (D3DLOCK_NO_DIRTY_UPDATE | D3DLOCK_READONLY))) {
         NineVolume9_MarkContainerDirty(This);
-        if (This->desc.Pool == D3DPOOL_MANAGED)
-            NineVolume9_AddDirtyRegion(This, &box);
+        NineVolume9_AddDirtyRegion(This, &box);
     }
 
     ++This->lock_count;
@@ -333,42 +314,31 @@ NineVolume9_UnlockBox( struct NineVolume9 *This )
     return D3D_OK;
 }
 
-
+/* When this function is called, we have already checked
+ * The copy regions fit the volumes */
 HRESULT
-NineVolume9_CopyVolume( struct NineVolume9 *This,
-                        struct NineVolume9 *From,
-                        unsigned dstx, unsigned dsty, unsigned dstz,
-                        struct pipe_box *pSrcBox )
+NineVolume9_CopyMemToDefault( struct NineVolume9 *This,
+                              struct NineVolume9 *From,
+                              unsigned dstx, unsigned dsty, unsigned dstz,
+                              struct pipe_box *pSrcBox )
 {
     struct pipe_context *pipe = This->pipe;
     struct pipe_resource *r_dst = This->resource;
-    struct pipe_resource *r_src = From->resource;
-    struct pipe_transfer *transfer;
     struct pipe_box src_box;
     struct pipe_box dst_box;
-    uint8_t *p_dst;
     const uint8_t *p_src;
 
     DBG("This=%p From=%p dstx=%u dsty=%u dstz=%u pSrcBox=%p\n",
         This, From, dstx, dsty, dstz, pSrcBox);
 
-    assert(This->desc.Pool != D3DPOOL_MANAGED &&
-           From->desc.Pool != D3DPOOL_MANAGED);
-    user_assert(This->desc.Format == From->desc.Format, D3DERR_INVALIDCALL);
+    assert(This->desc.Pool == D3DPOOL_DEFAULT &&
+           From->desc.Pool == D3DPOOL_SYSTEMMEM);
 
     dst_box.x = dstx;
     dst_box.y = dsty;
     dst_box.z = dstz;
 
     if (pSrcBox) {
-        /* make sure it doesn't range outside the source volume */
-        user_assert(pSrcBox->x >= 0 &&
-                    (pSrcBox->width - pSrcBox->x) <= From->desc.Width &&
-                    pSrcBox->y >= 0 &&
-                    (pSrcBox->height - pSrcBox->y) <= From->desc.Height &&
-                    pSrcBox->z >= 0 &&
-                    (pSrcBox->depth - pSrcBox->z) <= From->desc.Depth,
-                    D3DERR_INVALIDCALL);
         src_box = *pSrcBox;
     } else {
         src_box.x = 0;
@@ -378,101 +348,54 @@ NineVolume9_CopyVolume( struct NineVolume9 *This,
         src_box.height = From->desc.Height;
         src_box.depth = From->desc.Depth;
     }
-    /* limits */
-    dst_box.width = This->desc.Width - dst_box.x;
-    dst_box.height = This->desc.Height - dst_box.y;
-    dst_box.depth = This->desc.Depth - dst_box.z;
-
-    user_assert(src_box.width <= dst_box.width &&
-                src_box.height <= dst_box.height &&
-                src_box.depth <= dst_box.depth, D3DERR_INVALIDCALL);
 
     dst_box.width = src_box.width;
     dst_box.height = src_box.height;
     dst_box.depth = src_box.depth;
 
-    if (r_dst && r_src) {
-        pipe->resource_copy_region(pipe,
-                                   r_dst, This->level,
-                                   dst_box.x, dst_box.y, dst_box.z,
-                                   r_src, From->level,
-                                   &src_box);
-    } else
-    if (r_dst) {
-        p_src = NineVolume9_GetSystemMemPointer(From,
-            src_box.x, src_box.y, src_box.z);
-
-        pipe->transfer_inline_write(pipe, r_dst, This->level,
-                                    0, /* WRITE|DISCARD are implicit */
-                                    &dst_box, p_src,
-                                    From->stride, From->layer_stride);
-    } else
-    if (r_src) {
-        p_dst = NineVolume9_GetSystemMemPointer(This, 0, 0, 0);
-        p_src = pipe->transfer_map(pipe, r_src, From->level,
-                                   PIPE_TRANSFER_READ,
-                                   &src_box, &transfer);
-        if (!p_src)
-            return D3DERR_DRIVERINTERNALERROR;
-
-        util_copy_box(p_dst, This->info.format,
-                      This->stride, This->layer_stride,
-                      dst_box.x, dst_box.y, dst_box.z,
-                      dst_box.width, dst_box.height, dst_box.depth,
-                      p_src,
-                      transfer->stride, transfer->layer_stride,
-                      src_box.x, src_box.y, src_box.z);
+    p_src = NineVolume9_GetSystemMemPointer(From,
+         src_box.x, src_box.y, src_box.z);
 
-        pipe->transfer_unmap(pipe, transfer);
-    } else {
-        p_dst = NineVolume9_GetSystemMemPointer(This, 0, 0, 0);
-        p_src = NineVolume9_GetSystemMemPointer(From, 0, 0, 0);
-
-        util_copy_box(p_dst, This->info.format,
-                      This->stride, This->layer_stride,
-                      dst_box.x, dst_box.y, dst_box.z,
-                      dst_box.width, dst_box.height, dst_box.depth,
-                      p_src,
-                      From->stride, From->layer_stride,
-                      src_box.x, src_box.y, src_box.z);
-    }
+    pipe->transfer_inline_write(pipe, r_dst, This->level,
+                                0, /* WRITE|DISCARD are implicit */
+                                &dst_box, p_src,
+                                From->stride, From->layer_stride);
 
-    if (This->desc.Pool == D3DPOOL_DEFAULT)
-        NineVolume9_MarkContainerDirty(This);
-    if (!r_dst && This->resource)
-        NineVolume9_AddDirtyRegion(This, &dst_box);
+    NineVolume9_MarkContainerDirty(This);
 
     return D3D_OK;
 }
 
 HRESULT
-NineVolume9_UploadSelf( struct NineVolume9 *This )
+NineVolume9_UploadSelf( struct NineVolume9 *This,
+                        const struct pipe_box *damaged )
 {
     struct pipe_context *pipe = This->pipe;
     struct pipe_resource *res = This->resource;
+    struct pipe_box box;
     uint8_t *ptr;
-    unsigned i;
 
-    DBG("This=%p dirty=%i data=%p res=%p\n", This, NineVolume9_IsDirty(This),
+    DBG("This=%p damaged=%p data=%p res=%p\n", This, damaged,
         This->data, res);
 
     assert(This->desc.Pool == D3DPOOL_MANAGED);
-
-    if (!NineVolume9_IsDirty(This))
-        return D3D_OK;
     assert(res);
 
-    for (i = 0; i < Elements(This->dirty_box); ++i) {
-        const struct pipe_box *box = &This->dirty_box[i];
-        if (box->width == 0)
-            break;
-        ptr = NineVolume9_GetSystemMemPointer(This, box->x, box->y, box->z);
-
-        pipe->transfer_inline_write(pipe, res, This->level,
-                                    0,
-                                    box, ptr, This->stride, This->layer_stride);
+    if (damaged) {
+        box = *damaged;
+    } else {
+        box.x = 0;
+        box.y = 0;
+        box.z = 0;
+        box.width = This->desc.Width;
+        box.height = This->desc.Height;
+        box.depth = This->desc.Depth;
     }
-    NineVolume9_ClearDirtyRegion(This);
+
+    ptr = NineVolume9_GetSystemMemPointer(This, box.x, box.y, box.z);
+
+    pipe->transfer_inline_write(pipe, res, This->level, 0, &box,
+                                ptr, This->stride, This->layer_stride);
 
     return D3D_OK;
 }
diff --git a/src/gallium/state_trackers/nine/volume9.h b/src/gallium/state_trackers/nine/volume9.h
index fae24310a50..26ca8a32605 100644
--- a/src/gallium/state_trackers/nine/volume9.h
+++ b/src/gallium/state_trackers/nine/volume9.h
@@ -50,8 +50,6 @@ struct NineVolume9
     struct pipe_transfer *transfer;
     unsigned lock_count;
 
-    struct pipe_box dirty_box[2];
-
     struct pipe_context *pipe;
 
     /* for [GS]etPrivateData/FreePrivateData */
@@ -85,20 +83,15 @@ void
 NineVolume9_AddDirtyRegion( struct NineVolume9 *This,
                             const struct pipe_box *box );
 
-static inline void
-NineVolume9_ClearDirtyRegion( struct NineVolume9 *This )
-{
-    memset(&This->dirty_box, 0, sizeof(This->dirty_box));
-}
-
 HRESULT
-NineVolume9_CopyVolume( struct NineVolume9 *This,
-                        struct NineVolume9 *From,
-                        unsigned dstx, unsigned dsty, unsigned dstz,
-                        struct pipe_box *pSrcBox );
+NineVolume9_CopyMemToDefault( struct NineVolume9 *This,
+                              struct NineVolume9 *From,
+                              unsigned dstx, unsigned dsty, unsigned dstz,
+                              struct pipe_box *pSrcBox );
 
 HRESULT
-NineVolume9_UploadSelf( struct NineVolume9 *This );
+NineVolume9_UploadSelf( struct NineVolume9 *This,
+                        const struct pipe_box *damaged );
 
 
 /*** Direct3D public ***/
diff --git a/src/gallium/state_trackers/nine/volumetexture9.c b/src/gallium/state_trackers/nine/volumetexture9.c
index 1193e12f34c..e5b2b53148d 100644
--- a/src/gallium/state_trackers/nine/volumetexture9.c
+++ b/src/gallium/state_trackers/nine/volumetexture9.c
@@ -64,6 +64,13 @@ NineVolumeTexture9_ctor( struct NineVolumeTexture9 *This,
     if (Format == D3DFMT_ATI1 || Format == D3DFMT_ATI2)
         return D3DERR_INVALIDCALL;
 
+    if (compressed_format(Format)) {
+        const unsigned w = util_format_get_blockwidth(pf);
+        const unsigned h = util_format_get_blockheight(pf);
+        /* Compressed formats are not compressed on depth component */
+        user_assert(!(Width % w) && !(Height % h), D3DERR_INVALIDCALL);
+    }
+
     info->screen = pParams->device->screen;
     info->target = PIPE_TEXTURE_3D;
     info->format = pf;
@@ -116,6 +123,9 @@ NineVolumeTexture9_ctor( struct NineVolumeTexture9 *This,
             return hr;
     }
 
+    /* Textures start initially dirty */
+    NineVolumeTexture9_AddDirtyBox(This, NULL);
+
     return D3D_OK;
 }
 
@@ -193,12 +203,14 @@ NineVolumeTexture9_AddDirtyBox( struct NineVolumeTexture9 *This,
 {
     DBG("This=%p pDirtybox=%p\n", This, pDirtyBox);
 
-    if (This->base.base.pool != D3DPOOL_MANAGED) {
+    if (This->base.base.pool == D3DPOOL_DEFAULT) {
         return D3D_OK;
     }
-    This->base.managed.dirty = TRUE;
 
-    BASETEX_REGISTER_UPDATE(&This->base);
+    if (This->base.base.pool == D3DPOOL_MANAGED) {
+        This->base.managed.dirty = TRUE;
+        BASETEX_REGISTER_UPDATE(&This->base);
+    }
 
     if (!pDirtyBox) {
         This->dirty_box.x = 0;
diff --git a/src/gallium/targets/d3dadapter9/Makefile.am b/src/gallium/targets/d3dadapter9/Makefile.am
index fe5b0b11679..e26ca33a521 100644
--- a/src/gallium/targets/d3dadapter9/Makefile.am
+++ b/src/gallium/targets/d3dadapter9/Makefile.am
@@ -54,6 +54,7 @@ pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = d3d.pc
 
 d3dadapter9_la_SOURCES = \
+	description.c \
 	getproc.c \
 	drm.c
 
diff --git a/src/gallium/targets/d3dadapter9/description.c b/src/gallium/targets/d3dadapter9/description.c
new file mode 100644
index 00000000000..c0a86782f8e
--- /dev/null
+++ b/src/gallium/targets/d3dadapter9/description.c
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2015 Patrick Rudolph <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include <string.h>
+#include "adapter9.h"
+
+#define DBG_CHANNEL DBG_ADAPTER
+
+/* prototypes */
+void
+d3d_match_vendor_id( D3DADAPTER_IDENTIFIER9* drvid,
+        unsigned fallback_ven,
+        unsigned fallback_dev,
+        const char* fallback_name );
+void d3d_fill_driver_version(D3DADAPTER_IDENTIFIER9* drvid);
+void d3d_fill_cardname(D3DADAPTER_IDENTIFIER9* drvid);
+
+enum d3d_vendor_id
+{
+    HW_VENDOR_SOFTWARE              = 0x0000,
+    HW_VENDOR_AMD                   = 0x1002,
+    HW_VENDOR_NVIDIA                = 0x10de,
+    HW_VENDOR_VMWARE                = 0x15ad,
+    HW_VENDOR_INTEL                 = 0x8086,
+};
+
+struct card_lookup_table {
+    const char *mesaname;
+    const char *d3d9name;
+}
+cards_amd[] = {
+    {"HAWAII",                      "AMD Radeon R9 290"},
+    {"KAVERI",                      "AMD Radeon(TM) R7 Graphics"},
+    {"KABINI",                      "AMD Radeon HD 8400 / R3 Series"},
+    {"BONAIRE",                     "AMD Radeon HD 8770"},
+    {"OLAND",                       "AMD Radeon HD 8670"},
+    {"HAINAN",                      "AMD Radeon HD 8600M Series"},
+    {"TAHITI",                      "AMD Radeon HD 7900 Series"},
+    {"PITCAIRN",                    "AMD Radeon HD 7800 Series"},
+    {"CAPE VERDE",                  "AMD Radeon HD 7700 Series"},
+    {"ARUBA",                       "AMD Radeon HD 7660D"},
+    {"CAYMAN",                      "AMD Radeon HD 6900 Series"},
+    {"BARTS",                       "AMD Radeon HD 6800 Series"},
+    {"TURKS",                       "AMD Radeon HD 6600 Series"},
+    {"SUMO2",                       "AMD Radeon HD 6410D"},
+    {"SUMO",                        "AMD Radeon HD 6550D"},
+    {"CAICOS",                      "AMD Radeon HD 6400 Series"},
+    {"PALM",                        "AMD Radeon HD 6300 series Graphics"},
+    {"HEMLOCK",                     "ATI Radeon HD 5900 Series"},
+    {"CYPRESS",                     "ATI Radeon HD 5800 Series"},
+    {"JUNIPER",                     "ATI Radeon HD 5700 Series"},
+    {"REDWOOD",                     "ATI Radeon HD 5600 Series"},
+    {"CEDAR",                       "ATI Radeon HD 5500 Series"},
+    {"R700",                        "ATI Radeon HD 4800 Series"},
+    {"RV790",                       "ATI Radeon HD 4800 Series"},
+    {"RV770",                       "ATI Radeon HD 4800 Series"},
+    {"RV740",                       "ATI Radeon HD 4700 Series"},
+    {"RV730",                       "ATI Radeon HD 4600 Series"},
+    {"RV710",                       "ATI Radeon HD 4350"},
+    {"RS880",                       "ATI Mobility Radeon HD 4200"},
+    {"RS780",                       "ATI Radeon HD 3200 Graphics"},
+    {"R680",                        "ATI Radeon HD 2900 XT"},
+    {"R600",                        "ATI Radeon HD 2900 XT"},
+    {"RV670",                       "ATI Radeon HD 2900 XT"},
+    {"RV635",                       "ATI Mobility Radeon HD 2600"},
+    {"RV630",                       "ATI Mobility Radeon HD 2600"},
+    {"RV620",                       "ATI Mobility Radeon HD 2350"},
+    {"RV610",                       "ATI Mobility Radeon HD 2350"},
+    {"R580",                        "ATI Radeon X1600 Series"},
+    {"R520",                        "ATI Radeon X1600 Series"},
+    {"RV570",                       "ATI Radeon X1600 Series"},
+    {"RV560",                       "ATI Radeon X1600 Series"},
+    {"RV535",                       "ATI Radeon X1600 Series"},
+    {"RV530",                       "ATI Radeon X1600 Series"},
+    {"RV516",                       "ATI Radeon X700 SE"},
+    {"RV515",                       "ATI Radeon X700 SE"},
+    {"R481",                        "ATI Radeon X700 SE"},
+    {"R480",                        "ATI Radeon X700 SE"},
+    {"R430",                        "ATI Radeon X700 SE"},
+    {"R423",                        "ATI Radeon X700 SE"},
+    {"R420",                        "ATI Radeon X700 SE"},
+    {"R410",                        "ATI Radeon X700 SE"},
+    {"RV410",                       "ATI Radeon X700 SE"},
+    {"RS740",                       "ATI RADEON XPRESS 200M Series"},
+    {"RS690",                       "ATI RADEON XPRESS 200M Series"},
+    {"RS600",                       "ATI RADEON XPRESS 200M Series"},
+    {"RS485",                       "ATI RADEON XPRESS 200M Series"},
+    {"RS482",                       "ATI RADEON XPRESS 200M Series"},
+    {"RS480",                       "ATI RADEON XPRESS 200M Series"},
+    {"RS400",                       "ATI RADEON XPRESS 200M Series"},
+    {"R360",                        "ATI Radeon 9500"},
+    {"R350",                        "ATI Radeon 9500"},
+    {"R300",                        "ATI Radeon 9500"},
+    {"RV370",                       "ATI Radeon 9500"},
+    {"RV360",                       "ATI Radeon 9500"},
+    {"RV351",                       "ATI Radeon 9500"},
+    {"RV350",                       "ATI Radeon 9500"},
+},
+cards_nvidia[] =
+{
+    {"NV124",                       "NVIDIA GeForce GTX 970"},
+    {"NV117",                       "NVIDIA GeForce GTX 750"},
+    {"NVF1",                        "NVIDIA GeForce GTX 780 Ti"},
+    {"NVF0",                        "NVIDIA GeForce GTX 780"},
+    {"NVE6",                        "NVIDIA GeForce GTX 770M"},
+    {"NVE4",                        "NVIDIA GeForce GTX 680"},
+    {"NVD9",                        "NVIDIA GeForce GT 520"},
+    {"NVCF",                        "NVIDIA GeForce GTX 550 Ti"},
+    {"NVCE",                        "NVIDIA GeForce GTX 560"},
+    {"NVC8",                        "NVIDIA GeForce GTX 570"},
+    {"NVC4",                        "NVIDIA GeForce GTX 460"},
+    {"NVC3",                        "NVIDIA GeForce GT 440"},
+    {"NVC1",                        "NVIDIA GeForce GT 420"},
+    {"NVC0",                        "NVIDIA GeForce GTX 480"},
+    {"NVAF",                        "NVIDIA GeForce GT 320M"},
+    {"NVAC",                        "NVIDIA GeForce 8200"},
+    {"NVAA",                        "NVIDIA GeForce 8200"},
+    {"NVA8",                        "NVIDIA GeForce 210"},
+    {"NVA5",                        "NVIDIA GeForce GT 220"},
+    {"NVA3",                        "NVIDIA GeForce GT 240"},
+    {"NVA0",                        "NVIDIA GeForce GTX 280"},
+    {"NV98",                        "NVIDIA GeForce 9200"},
+    {"NV96",                        "NVIDIA GeForce 9400 GT"},
+    {"NV94",                        "NVIDIA GeForce 9600 GT"},
+    {"NV92",                        "NVIDIA GeForce 9800 GT"},
+    {"NV86",                        "NVIDIA GeForce 8500 GT"},
+    {"NV84",                        "NVIDIA GeForce 8600 GT"},
+    {"NV50",                        "NVIDIA GeForce 8800 GTX"},
+    {"NV68",                        "NVIDIA GeForce 6200"},
+    {"NV67",                        "NVIDIA GeForce 6200"},
+    {"NV63",                        "NVIDIA GeForce 6200"},
+    {"NV4E",                        "NVIDIA GeForce 6200"},
+    {"NV4C",                        "NVIDIA GeForce 6200"},
+    {"NV4B",                        "NVIDIA GeForce 7600 GT"},
+    {"NV4A",                        "NVIDIA GeForce 6200"},
+    {"NV49",                        "NVIDIA GeForce 7800 GT"},
+    {"NV47",                        "NVIDIA GeForce 7800 GT"},
+    {"NV46",                        "NVIDIA GeForce Go 7400",},
+    {"NV45",                        "NVIDIA GeForce 6800"},
+    {"NV44",                        "NVIDIA GeForce 6200"},
+    {"NV43",                        "NVIDIA GeForce 6600 GT"},
+    {"NV42",                        "NVIDIA GeForce 6800"},
+    {"NV41",                        "NVIDIA GeForce 6800"},
+    {"NV40",                        "NVIDIA GeForce 6800"},
+    {"NV38",                        "NVIDIA GeForce FX 5800"},
+    {"NV36",                        "NVIDIA GeForce FX 5800"},
+    {"NV35",                        "NVIDIA GeForce FX 5800"},
+    {"NV34",                        "NVIDIA GeForce FX 5200"},
+    {"NV31",                        "NVIDIA GeForce FX 5600"},
+    {"NV30",                        "NVIDIA GeForce FX 5800"},
+    {"nv28",                        "NVIDIA GeForce4 Ti 4200"},
+    {"nv25",                        "NVIDIA GeForce4 Ti 4200"},
+    {"nv20",                        "NVIDIA GeForce3"},
+    {"nv1F",                        "NVIDIA GeForce4 MX 460"},
+    {"nv1A",                        "NVIDIA GeForce2 GTS/GeForce2 Pro"},
+    {"nv18",                        "NVIDIA GeForce4 MX 460"},
+    {"nv17",                        "NVIDIA GeForce4 MX 460"},
+    {"nv16",                        "NVIDIA GeForce2 GTS/GeForce2 Pro"},
+    {"nv15",                        "NVIDIA GeForce2 GTS/GeForce2 Pro"},
+    {"nv11",                        "NVIDIA GeForce2 MX/MX 400"},
+    {"nv10",                        "NVIDIA GeForce 256"},
+},
+cards_vmware[] =
+{
+    {"SVGA3D",                      "VMware SVGA 3D (Microsoft Corporation - WDDM)"},
+},
+cards_intel[] =
+{
+    {"Haswell Mobile",              "Intel(R) Haswell Mobile"},
+    {"Ivybridge Server",            "Intel(R) Ivybridge Server"},
+    {"Ivybridge Mobile",            "Intel(R) Ivybridge Mobile"},
+    {"Ivybridge Desktop",           "Intel(R) Ivybridge Desktop"},
+    {"Sandybridge Server",          "Intel(R) Sandybridge Server"},
+    {"Sandybridge Mobile",          "Intel(R) Sandybridge Mobile"},
+    {"Sandybridge Desktop",         "Intel(R) Sandybridge Desktop"},
+    {"Ironlake Mobile",             "Intel(R) Ironlake Mobile"},
+    {"Ironlake Desktop",            "Intel(R) Ironlake Desktop"},
+    {"B43",                         "Intel(R) B43"},
+    {"G41",                         "Intel(R) G41"},
+    {"G45",                         "Intel(R) G45/G43"},
+    {"Q45",                         "Intel(R) Q45/Q43"},
+    {"Integrated Graphics Device",  "Intel(R) Integrated Graphics Device"},
+    {"GM45",                        "Mobile Intel(R) GM45 Express Chipset Family"},
+    {"965GME",                      "Intel(R) 965GME"},
+    {"965GM",                       "Mobile Intel(R) 965 Express Chipset Family"},
+    {"946GZ",                       "Intel(R) 946GZ"},
+    {"965G",                        "Intel(R) 965G"},
+    {"965Q",                        "Intel(R) 965Q"},
+    {"Pineview M",                  "Intel(R) IGD"},
+    {"Pineview G",                  "Intel(R) IGD"},
+    {"IGD",                         "Intel(R) IGD"},
+    {"Q33",                         "Intel(R) Q33"},
+    {"G33",                         "Intel(R) G33"},
+    {"Q35",                         "Intel(R) Q35"},
+    {"945GME",                      "Intel(R) 945GME"},
+    {"945GM",                       "Mobile Intel(R) 945GM Express Chipset Family"},
+    {"945G",                        "Intel(R) 945G"},
+    {"915GM",                       "Mobile Intel(R) 915GM/GMS,910GML Express Chipset Family"},
+    {"E7221G",                      "Intel(R) E7221G"},
+    {"915G",                        "Intel(R) 82915G/GV/910GL Express Chipset Family"},
+    {"865G",                        "Intel(R) 82865G Graphics Controller"},
+    {"845G",                        "Intel(R) 845G"},
+    {"855GM",                       "Intel(R) 82852/82855 GM/GME Graphics Controller"},
+    {"830M",                        "Intel(R) 82830M Graphics Controller"},
+};
+
+/* override VendorId, DeviceId and Description for unknown vendors */
+void
+d3d_match_vendor_id( D3DADAPTER_IDENTIFIER9* drvid,
+        unsigned fallback_ven,
+        unsigned fallback_dev,
+        const char* fallback_name )
+{
+    if (drvid->VendorId == HW_VENDOR_INTEL ||
+        drvid->VendorId == HW_VENDOR_VMWARE ||
+        drvid->VendorId == HW_VENDOR_AMD ||
+        drvid->VendorId == HW_VENDOR_NVIDIA)
+        return;
+
+    DBG("unknown vendor 0x4%x, emulating 0x4%x\n", drvid->VendorId, fallback_ven);
+    drvid->VendorId = fallback_ven;
+    drvid->DeviceId = fallback_dev;
+    strncpy(drvid->Description, fallback_name, sizeof(drvid->Description));
+}
+
+/* fill in driver name and version */
+void d3d_fill_driver_version(D3DADAPTER_IDENTIFIER9* drvid) {
+    switch (drvid->VendorId) {
+    case HW_VENDOR_INTEL:
+        drvid->DriverVersionLowPart = 0x000A0682;
+        drvid->DriverVersionHighPart = 0x0006000F;
+        strncpy(drvid->Driver, "igdumd32.dll", sizeof(drvid->Driver));
+        break;
+    case HW_VENDOR_VMWARE:
+        drvid->DriverVersionLowPart = 0x0001046E;
+        drvid->DriverVersionHighPart = 0x0006000E;
+        strncpy(drvid->Driver, "vm3dum.dll", sizeof(drvid->Driver));
+        break;
+    case HW_VENDOR_AMD:
+        drvid->DriverVersionLowPart = 0x000A0500;
+        drvid->DriverVersionHighPart = 0x00060011;
+        strncpy(drvid->Driver, "atiumdag.dll", sizeof(drvid->Driver));
+        break;
+    case HW_VENDOR_NVIDIA:
+        drvid->DriverVersionLowPart = 0x000D0FD4;
+        drvid->DriverVersionHighPart = 0x00060012;
+        strncpy(drvid->Driver, "nvd3dum.dll", sizeof(drvid->Driver));
+        break;
+    default:
+        break;
+    }
+}
+
+/* try to match the device name and override it with Windows like device names */
+void d3d_fill_cardname(D3DADAPTER_IDENTIFIER9* drvid) {
+    unsigned i;
+    switch (drvid->VendorId) {
+    case HW_VENDOR_INTEL:
+        for (i = 0; i < sizeof(cards_intel) / sizeof(cards_intel[0]); i++) {
+            if (strstr(drvid->Description, cards_intel[i].mesaname)) {
+                strncpy(drvid->Description, cards_intel[i].d3d9name, sizeof(drvid->Description));
+                return;
+            }
+        }
+        /* use a fall-back if nothing matches */
+        DBG("Unknown card name %s!\n", drvid->DeviceName);
+        strncpy(drvid->Description, cards_intel[0].d3d9name, sizeof(drvid->Description));
+        break;
+    case HW_VENDOR_VMWARE:
+        for (i = 0; i < sizeof(cards_vmware) / sizeof(cards_vmware[0]); i++) {
+            if (strstr(drvid->Description, cards_vmware[i].mesaname)) {
+                strncpy(drvid->Description, cards_vmware[i].d3d9name, sizeof(drvid->Description));
+                return;
+            }
+        }
+        /* use a fall-back if nothing matches */
+        DBG("Unknown card name %s!\n", drvid->DeviceName);
+        strncpy(drvid->Description, cards_vmware[0].d3d9name, sizeof(drvid->Description));
+        break;
+    case HW_VENDOR_AMD:
+        for (i = 0; i < sizeof(cards_amd) / sizeof(cards_amd[0]); i++) {
+            if (strstr(drvid->Description, cards_amd[i].mesaname)) {
+                strncpy(drvid->Description, cards_amd[i].d3d9name, sizeof(drvid->Description));
+                return;
+            }
+        }
+        /* use a fall-back if nothing matches */
+        DBG("Unknown card name %s!\n", drvid->DeviceName);
+        strncpy(drvid->Description, cards_amd[0].d3d9name, sizeof(drvid->Description));
+        break;
+    case HW_VENDOR_NVIDIA:
+        for (i = 0; i < sizeof(cards_nvidia) / sizeof(cards_nvidia[0]); i++) {
+            if (strstr(drvid->Description, cards_nvidia[i].mesaname)) {
+                strncpy(drvid->Description, cards_nvidia[i].d3d9name, sizeof(drvid->Description));
+                return;
+            }
+        }
+        /* use a fall-back if nothing matches */
+        DBG("Unknown card name %s!\n", drvid->DeviceName);
+        strncpy(drvid->Description, cards_nvidia[0].d3d9name, sizeof(drvid->Description));
+        break;
+    default:
+        break;
+    }
+}
diff --git a/src/gallium/targets/d3dadapter9/drm.c b/src/gallium/targets/d3dadapter9/drm.c
index 680f5164e60..fabc820f268 100644
--- a/src/gallium/targets/d3dadapter9/drm.c
+++ b/src/gallium/targets/d3dadapter9/drm.c
@@ -46,12 +46,6 @@
 
 #define DBG_CHANNEL DBG_ADAPTER
 
-#define VERSION_DWORD(hi, lo) \
-    ((DWORD)( \
-        ((DWORD)((hi) & 0xFFFF) << 16) | \
-         (DWORD)((lo) & 0xFFFF) \
-    ))
-
 const char __driConfigOptionsNine[] =
 DRI_CONF_BEGIN
     DRI_CONF_SECTION_PERFORMANCE
@@ -63,12 +57,21 @@ DRI_CONF_BEGIN
     DRI_CONF_SECTION_END
 DRI_CONF_END;
 
-/* Regarding os versions, we should not define our own as that would simply be
- * weird. Defaulting to Win2k/XP seems sane considering the origin of D3D9. The
- * driver also defaults to being a generic D3D9 driver, which of course only
- * matters if you're actually using the DDI. */
-#define VERSION_HIGH    VERSION_DWORD(0x0006, 0x000E) /* winxp, d3d9 */
-#define VERSION_LOW     VERSION_DWORD(0x0000, 0x0001) /* version, build */
+/* define fallback value here: NVIDIA GeForce GTX 970 */
+#define FALLBACK_NAME "NV124"
+#define FALLBACK_DEVID 0x13C2
+#define FALLBACK_VENID 0x10de
+
+/* prototypes */
+void
+d3d_match_vendor_id( D3DADAPTER_IDENTIFIER9* drvid,
+		unsigned fallback_ven,
+		unsigned fallback_dev,
+		const char* fallback_name );
+
+void d3d_fill_driver_version(D3DADAPTER_IDENTIFIER9* drvid);
+
+void d3d_fill_cardname(D3DADAPTER_IDENTIFIER9* drvid);
 
 struct d3dadapter9drm_context
 {
@@ -152,9 +155,9 @@ get_bus_info( int fd,
         *subsysid = 0;
         *revision = 0;
     } else {
-        DBG("Unable to detect card. Fake GTX 680.\n");
-        *vendorid = 0x10de; /* NV GTX 680 */
-        *deviceid = 0x1180;
+        DBG("Unable to detect card. Faking %s\n", FALLBACK_NAME);
+        *vendorid = FALLBACK_VENID;
+        *deviceid = FALLBACK_DEVID;
         *subsysid = 0;
         *revision = 0;
     }
@@ -169,33 +172,23 @@ read_descriptor( struct d3dadapter9_context *ctx,
     memset(drvid, 0, sizeof(*drvid));
     get_bus_info(fd, &drvid->VendorId, &drvid->DeviceId,
                  &drvid->SubSysId, &drvid->Revision);
+    snprintf(drvid->DeviceName, sizeof(drvid->DeviceName),
+                 "Gallium 0.4 with %s", ctx->hal->get_vendor(ctx->hal));
+    strncpy(drvid->Description, ctx->hal->get_name(ctx->hal),
+                 sizeof(drvid->Description));
+
+    /* choose fall-back vendor if necessary to allow
+     * the following functions to return sane results */
+    d3d_match_vendor_id(drvid, FALLBACK_VENID, FALLBACK_DEVID, FALLBACK_NAME);
+    /* fill in driver name and version info */
+    d3d_fill_driver_version(drvid);
+    /* override Description field with Windows like names */
+    d3d_fill_cardname(drvid);
+
+    /* this driver isn't WHQL certified */
+    drvid->WHQLLevel = 0;
 
-    strncpy(drvid->Driver, "libd3dadapter9.so", sizeof(drvid->Driver));
-    strncpy(drvid->DeviceName, ctx->hal->get_name(ctx->hal), 32);
-    snprintf(drvid->Description, sizeof(drvid->Description),
-             "Gallium 0.4 with %s", ctx->hal->get_vendor(ctx->hal));
-
-    drvid->DriverVersionLowPart = VERSION_LOW;
-    drvid->DriverVersionHighPart = VERSION_HIGH;
-
-    /* To make a pseudo-real GUID we use the PCI bus data and some string */
-    drvid->DeviceIdentifier.Data1 = drvid->VendorId;
-    drvid->DeviceIdentifier.Data2 = drvid->DeviceId;
-    drvid->DeviceIdentifier.Data3 = drvid->SubSysId;
-    memcpy(drvid->DeviceIdentifier.Data4, "Gallium3D", 8);
-
-    drvid->WHQLLevel = 1; /* This fakes WHQL validaion */
-
-    /* XXX Fake NVIDIA binary driver on Windows.
-     *
-     * OS version: 4=95/98/NT4, 5=2000, 6=2000/XP, 7=Vista, 8=Win7
-     */
-    strncpy(drvid->Driver, "nvd3dum.dll", sizeof(drvid->Driver));
-    strncpy(drvid->Description, "NVIDIA GeForce GTX 680", sizeof(drvid->Description));
-    drvid->DriverVersionLowPart = VERSION_DWORD(12, 6658); /* minor, build */
-    drvid->DriverVersionHighPart = VERSION_DWORD(6, 15); /* OS, major */
-    drvid->SubSysId = 0;
-    drvid->Revision = 0;
+    /* this value is fixed */
     drvid->DeviceIdentifier.Data1 = 0xaeb2cdd4;
     drvid->DeviceIdentifier.Data2 = 0x6e41;
     drvid->DeviceIdentifier.Data3 = 0x43ea;
@@ -207,7 +200,6 @@ read_descriptor( struct d3dadapter9_context *ctx,
     drvid->DeviceIdentifier.Data4[5] = 0x76;
     drvid->DeviceIdentifier.Data4[6] = 0x07;
     drvid->DeviceIdentifier.Data4[7] = 0x81;
-    drvid->WHQLLevel = 0;
 }
 
 static HRESULT WINAPI
diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk
index 7168e1dbfb3..a33d7f83671 100644
--- a/src/gallium/targets/dri/Android.mk
+++ b/src/gallium/targets/dri/Android.mk
@@ -77,8 +77,8 @@ gallium_DRIVERS += libmesa_pipe_r600
 LOCAL_CFLAGS += -DGALLIUM_R600
 endif
 ifneq ($(filter radeonsi,$(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_radeonsi
-LOCAL_SHARED_LIBRARIES += libLLVM
+gallium_DRIVERS += libmesa_pipe_radeonsi libmesa_winsys_amdgpu
+LOCAL_SHARED_LIBRARIES += libLLVM libdrm_amdgpu
 LOCAL_CFLAGS += -DGALLIUM_RADEONSI
 endif
 gallium_DRIVERS += libmesa_winsys_radeon libmesa_pipe_radeon
diff --git a/src/gallium/winsys/amdgpu/drm/Android.mk b/src/gallium/winsys/amdgpu/drm/Android.mk
index 7d507aa79c6..57732347eea 100644
--- a/src/gallium/winsys/amdgpu/drm/Android.mk
+++ b/src/gallium/winsys/amdgpu/drm/Android.mk
@@ -30,6 +30,16 @@ include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES := $(C_SOURCES)
 
+LOCAL_CFLAGS := \
+	$(AMDGPU_CFLAGS) \
+	-DBRAHMA_BUILD=1
+
+LOCAL_C_INCLUDES := \
+	$(LOCAL_PATH)/addrlib \
+	$(LOCAL_PATH)/addrlib/core \
+	$(LOCAL_PATH)/addrlib/inc/chip/r800 \
+	$(LOCAL_PATH)/addrlib/r800/chip
+
 LOCAL_SHARED_LIBRARIES := libdrm libdrm_amdgpu
 LOCAL_MODULE := libmesa_winsys_amdgpu
 
diff --git a/src/gallium/winsys/amdgpu/drm/Makefile.sources b/src/gallium/winsys/amdgpu/drm/Makefile.sources
index 6b33841b204..23630044ab3 100644
--- a/src/gallium/winsys/amdgpu/drm/Makefile.sources
+++ b/src/gallium/winsys/amdgpu/drm/Makefile.sources
@@ -11,9 +11,7 @@ C_SOURCES := \
 	addrlib/core/addrobject.h \
 	addrlib/inc/chip/r800/si_gb_reg.h \
 	addrlib/inc/lnx_common_defs.h \
-	addrlib/r800/chip/si_ci_merged_enum.h \
 	addrlib/r800/chip/si_ci_vi_merged_enum.h \
-	addrlib/r800/chip/si_enum.h \
 	addrlib/r800/ciaddrlib.cpp \
 	addrlib/r800/ciaddrlib.h \
 	addrlib/r800/egbaddrlib.cpp \
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 0842259044b..12c6b624b03 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -77,8 +77,8 @@ struct amdgpu_cs {
 
    int                         buffer_indices_hashlist[512];
 
-   unsigned                    used_vram;
-   unsigned                    used_gart;
+   uint64_t                    used_vram;
+   uint64_t                    used_gart;
 
    unsigned                    max_dependencies;
 };
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 7a267f9acbf..f04a696988a 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -97,22 +97,17 @@ static boolean radeon_init_cs_context(struct radeon_cs_context *csc,
 {
     int i;
 
-    csc->buf = MALLOC(ws->ib_max_size);
-    if (!csc->buf)
-        return FALSE;
     csc->fd = ws->fd;
     csc->nrelocs = 512;
     csc->relocs_bo = (struct radeon_bo**)
                      CALLOC(1, csc->nrelocs * sizeof(struct radeon_bo*));
     if (!csc->relocs_bo) {
-        FREE(csc->buf);
         return FALSE;
     }
 
     csc->relocs = (struct drm_radeon_cs_reloc*)
                   CALLOC(1, csc->nrelocs * sizeof(struct drm_radeon_cs_reloc));
     if (!csc->relocs) {
-        FREE(csc->buf);
         FREE(csc->relocs_bo);
         return FALSE;
     }
@@ -165,7 +160,6 @@ static void radeon_destroy_cs_context(struct radeon_cs_context *csc)
     radeon_cs_context_cleanup(csc);
     FREE(csc->relocs_bo);
     FREE(csc->relocs);
-    FREE(csc->buf);
 }
 
 
@@ -206,7 +200,7 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
     cs->cst = &cs->csc2;
     cs->base.buf = cs->csc->buf;
     cs->base.ring_type = ring_type;
-    cs->base.max_dw = ws->ib_max_size / 4;
+    cs->base.max_dw = ARRAY_SIZE(cs->csc->buf);
 
     p_atomic_inc(&ws->num_cs);
     return &cs->base;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index ab154945880..6ceb8e98ee7 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -30,7 +30,7 @@
 #include "radeon_drm_bo.h"
 
 struct radeon_cs_context {
-    uint32_t                    *buf;
+    uint32_t                    buf[16 * 1024];
 
     int                         fd;
     struct drm_radeon_cs        cs;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index b70bbaa54a3..f7784fb795e 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -395,20 +395,16 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
         }
 
         ws->info.r600_virtual_address = FALSE;
-        ws->ib_max_size = 64 * 1024;
-
         if (ws->info.drm_minor >= 13) {
+            uint32_t ib_vm_max_size;
+
             ws->info.r600_virtual_address = TRUE;
             if (!radeon_get_drm_value(ws->fd, RADEON_INFO_VA_START, NULL,
                                       &ws->va_start))
                 ws->info.r600_virtual_address = FALSE;
-
-            if (radeon_get_drm_value(ws->fd, RADEON_INFO_IB_VM_MAX_SIZE, NULL,
-                                     &ws->ib_max_size))
-                ws->ib_max_size *= 4; /* the kernel returns the size in dwords */
-            else
+            if (!radeon_get_drm_value(ws->fd, RADEON_INFO_IB_VM_MAX_SIZE, NULL,
+                                      &ib_vm_max_size))
                 ws->info.r600_virtual_address = FALSE;
-
             radeon_get_drm_value(ws->fd, RADEON_INFO_VA_UNMAP_WORKING, NULL,
                                  &ws->va_unmap_working);
         }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index c1a8d6ae564..308b5bd976d 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -73,7 +73,6 @@ struct radeon_drm_winsys {
 
     enum radeon_generation gen;
     struct radeon_info info;
-    uint32_t ib_max_size;
     uint32_t va_start;
     uint32_t va_unmap_working;
     uint32_t accel_working2;
diff --git a/src/gallium/winsys/sw/kms-dri/Makefile.am b/src/gallium/winsys/sw/kms-dri/Makefile.am
index 7f26b1b2f19..8162553289d 100644
--- a/src/gallium/winsys/sw/kms-dri/Makefile.am
+++ b/src/gallium/winsys/sw/kms-dri/Makefile.am
@@ -31,5 +31,3 @@ AM_CFLAGS = \
 noinst_LTLIBRARIES = libswkmsdri.la
 
 libswkmsdri_la_SOURCES = $(C_SOURCES)
-
-EXTRA_DIST = SConscript
diff --git a/src/gbm/main/backend.c b/src/gbm/main/backend.c
index 4929d732b44..37ec9c16f40 100644
--- a/src/gbm/main/backend.c
+++ b/src/gbm/main/backend.c
@@ -65,7 +65,7 @@ static const struct backend_desc *
 find_backend(const char *name)
 {
    const struct backend_desc *backend = NULL;
-   int i;
+   unsigned i;
 
    for (i = 0; i < ARRAY_SIZE(backends); ++i) {
       if (strcmp(backends[i].name, name) == 0) {
@@ -82,7 +82,7 @@ _gbm_create_device(int fd)
 {
    const struct gbm_backend *backend = NULL;
    struct gbm_device *dev = NULL;
-   int i;
+   unsigned i;
    const char *b;
 
    b = getenv("GBM_BACKEND");
diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources
index b491ad4d36f..da38e3576bd 100644
--- a/src/glsl/Makefile.sources
+++ b/src/glsl/Makefile.sources
@@ -25,6 +25,9 @@ NIR_FILES = \
 	nir/nir_array.h \
 	nir/nir_builder.h \
 	nir/nir_constant_expressions.h \
+	nir/nir_control_flow.c \
+	nir/nir_control_flow.h \
+	nir/nir_control_flow_private.h \
 	nir/nir_dominance.c \
 	nir/nir_from_ssa.c \
 	nir/nir_intrinsics.c \
diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp
index 27e84d101ec..ae399f03a9b 100644
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -290,6 +290,21 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
                                   "1.30 and later");
          }
       }
+
+      /* From page 27 of the GLSL ES 3.1 specification:
+       *
+       * "When aggregated into arrays within a shader, images can only be
+       *  indexed with a constant integral expression."
+       *
+       * On the other hand the desktop GL specification extension allows
+       * non-constant indexing of image arrays, but behavior is left undefined
+       * in cases where the indexing expression is not dynamically uniform.
+       */
+      if (state->es_shader && array->type->without_array()->is_image()) {
+         _mesa_glsl_error(&loc, state,
+                          "image arrays indexed with non-constant "
+                          "expressions are forbidden in GLSL ES.");
+      }
    }
 
    /* After performing all of the error checking, generate the IR for the
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index fa2c09d2697..981438de597 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2099,10 +2099,10 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
 static bool
 validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
                            YYLTYPE *loc,
-                           ir_variable *var,
+                           const glsl_type *type,
                            const ast_type_qualifier *qual)
 {
-   if (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage) {
+   if (!qual->flags.q.uniform && !qual->flags.q.buffer) {
       _mesa_glsl_error(loc, state,
                        "the \"binding\" qualifier only applies to uniforms and "
                        "shader storage buffer objects");
@@ -2115,10 +2115,11 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
    }
 
    const struct gl_context *const ctx = state->ctx;
-   unsigned elements = var->type->is_array() ? var->type->length : 1;
+   unsigned elements = type->is_array() ? type->length : 1;
    unsigned max_index = qual->binding + elements - 1;
+   const glsl_type *base_type = type->without_array();
 
-   if (var->type->is_interface()) {
+   if (base_type->is_interface()) {
       /* UBOs.  From page 60 of the GLSL 4.20 specification:
        * "If the binding point for any uniform block instance is less than zero,
        *  or greater than or equal to the implementation-dependent maximum
@@ -2129,7 +2130,7 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
        *
        * The implementation-dependent maximum is GL_MAX_UNIFORM_BUFFER_BINDINGS.
        */
-      if (var->data.mode == ir_var_uniform &&
+      if (qual->flags.q.uniform &&
          max_index >= ctx->Const.MaxUniformBufferBindings) {
          _mesa_glsl_error(loc, state, "layout(binding = %d) for %d UBOs exceeds "
                           "the maximum number of UBO binding points (%d)",
@@ -2137,6 +2138,7 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
                           ctx->Const.MaxUniformBufferBindings);
          return false;
       }
+
       /* SSBOs. From page 67 of the GLSL 4.30 specification:
        * "If the binding point for any uniform or shader storage block instance
        *  is less than zero, or greater than or equal to the
@@ -2146,7 +2148,7 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
        *  N, all elements of the array from binding through binding + N – 1 must
        *  be within this range."
        */
-      if (var->data.mode == ir_var_shader_storage &&
+      if (qual->flags.q.buffer &&
          max_index >= ctx->Const.MaxShaderStorageBufferBindings) {
          _mesa_glsl_error(loc, state, "layout(binding = %d) for %d SSBOs exceeds "
                           "the maximum number of SSBO binding points (%d)",
@@ -2154,8 +2156,7 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
                           ctx->Const.MaxShaderStorageBufferBindings);
          return false;
       }
-   } else if (var->type->is_sampler() ||
-              (var->type->is_array() && var->type->fields.array->is_sampler())) {
+   } else if (base_type->is_sampler()) {
       /* Samplers.  From page 63 of the GLSL 4.20 specification:
        * "If the binding is less than zero, or greater than or equal to the
        *  implementation-dependent maximum supported number of units, a
@@ -2172,7 +2173,7 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
 
          return false;
       }
-   } else if (var->type->contains_atomic()) {
+   } else if (base_type->contains_atomic()) {
       assert(ctx->Const.MaxAtomicBufferBindings <= MAX_COMBINED_ATOMIC_BUFFERS);
       if (unsigned(qual->binding) >= ctx->Const.MaxAtomicBufferBindings) {
          _mesa_glsl_error(loc, state, "layout(binding = %d) exceeds the "
@@ -2182,10 +2183,19 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
 
          return false;
       }
+   } else if (state->is_version(420, 310) && base_type->is_image()) {
+      assert(ctx->Const.MaxImageUnits <= MAX_IMAGE_UNITS);
+      if (max_index >= ctx->Const.MaxImageUnits) {
+         _mesa_glsl_error(loc, state, "Image binding %d exceeds the "
+                          " maximum number of image units (%d)", max_index,
+                          ctx->Const.MaxImageUnits);
+         return false;
+      }
+
    } else {
       _mesa_glsl_error(loc, state,
                        "the \"binding\" qualifier only applies to uniform "
-                       "blocks, samplers, atomic counters, or arrays thereof");
+                       "blocks, opaque variables, or arrays thereof");
       return false;
    }
 
@@ -2446,14 +2456,38 @@ apply_image_qualifier_to_variable(const struct ast_type_qualifier *qual,
 
          var->data.image_format = qual->image_format;
       } else {
-         if (var->data.mode == ir_var_uniform && !qual->flags.q.write_only) {
-            _mesa_glsl_error(loc, state, "uniforms not qualified with "
-                             "`writeonly' must have a format layout "
-                             "qualifier");
+         if (var->data.mode == ir_var_uniform) {
+            if (state->es_shader) {
+               _mesa_glsl_error(loc, state, "all image uniforms "
+                                "must have a format layout qualifier");
+
+            } else if (!qual->flags.q.write_only) {
+               _mesa_glsl_error(loc, state, "image uniforms not qualified with "
+                                "`writeonly' must have a format layout "
+                                "qualifier");
+            }
          }
 
          var->data.image_format = GL_NONE;
       }
+
+      /* From page 70 of the GLSL ES 3.1 specification:
+       *
+       * "Except for image variables qualified with the format qualifiers
+       *  r32f, r32i, and r32ui, image variables must specify either memory
+       *  qualifier readonly or the memory qualifier writeonly."
+       */
+      if (state->es_shader &&
+          var->data.image_format != GL_R32F &&
+          var->data.image_format != GL_R32I &&
+          var->data.image_format != GL_R32UI &&
+          !var->data.image_read_only &&
+          !var->data.image_write_only) {
+         _mesa_glsl_error(loc, state, "image variables of format other than "
+                          "r32f, r32i or r32ui must be qualified `readonly' or "
+                          "`writeonly'");
+      }
+
    } else if (qual->flags.q.read_only ||
               qual->flags.q.write_only ||
               qual->flags.q.coherent ||
@@ -2759,7 +2793,7 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
    }
 
    if (qual->flags.q.explicit_binding &&
-       validate_binding_qualifier(state, loc, var, qual)) {
+       validate_binding_qualifier(state, loc, var->type, qual)) {
       var->data.explicit_binding = true;
       var->data.binding = qual->binding;
    }
@@ -3293,7 +3327,7 @@ handle_tess_ctrl_shader_output_decl(struct _mesa_glsl_parse_state *state,
 
    validate_layout_qualifier_vertex_count(state, loc, var, num_vertices,
                                           &state->tcs_output_size,
-                                          "geometry shader input");
+                                          "tessellation control shader output");
 }
 
 /**
@@ -3390,7 +3424,7 @@ validate_identifier(const char *identifier, YYLTYPE loc,
 static bool
 precision_qualifier_allowed(const glsl_type *type)
 {
-   /* Precision qualifiers apply to floating point, integer and sampler
+   /* Precision qualifiers apply to floating point, integer and opaque
     * types.
     *
     * Section 4.5.2 (Precision Qualifiers) of the GLSL 1.30 spec says:
@@ -3420,7 +3454,7 @@ precision_qualifier_allowed(const glsl_type *type)
    return type->is_float()
        || type->is_integer()
        || type->is_record()
-       || type->is_sampler();
+       || type->contains_opaque();
 }
 
 ir_rvalue *
@@ -4131,7 +4165,7 @@ ast_declarator_list::hir(exec_list *instructions,
 
          _mesa_glsl_error(&loc, state,
                           "precision qualifiers apply only to floating point"
-                          ", integer and sampler types");
+                          ", integer and opaque types");
       }
 
       /* From section 4.1.7 of the GLSL 4.40 spec:
@@ -5439,6 +5473,8 @@ is_valid_default_precision_type(const struct glsl_type *const type)
       /* "int" and "float" are valid, but vectors and matrices are not. */
       return type->vector_elements == 1 && type->matrix_columns == 1;
    case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_ATOMIC_UINT:
       return true;
    default:
       return false;
@@ -5487,7 +5523,7 @@ ast_type_specifier::hir(exec_list *instructions,
       if (!is_valid_default_precision_type(type)) {
          _mesa_glsl_error(&loc, state,
                           "default precision statements apply only to "
-                          "float, int, and sampler types");
+                          "float, int, and opaque types");
          return NULL;
       }
 
@@ -6067,6 +6103,8 @@ ast_interface_block::hir(exec_list *instructions,
                                         num_variables,
                                         packing,
                                         this->block_name);
+   if (this->layout.flags.q.explicit_binding)
+      validate_binding_qualifier(state, &loc, block_type, &this->layout);
 
    if (!state->symbols->add_interface(block_type->name, block_type, var_mode)) {
       YYLTYPE loc = this->get_location();
@@ -6197,6 +6235,10 @@ ast_interface_block::hir(exec_list *instructions,
                              "not allowed");
          }
 
+         if (this->layout.flags.q.explicit_binding)
+            validate_binding_qualifier(state, &loc, block_array_type,
+                                       &this->layout);
+
          var = new(state) ir_variable(block_array_type,
                                       this->instance_name,
                                       var_mode);
diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index 2175c66cbd7..1bc3de4aec5 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -136,6 +136,13 @@ v140(const _mesa_glsl_parse_state *state)
 }
 
 static bool
+v400_fs_only(const _mesa_glsl_parse_state *state)
+{
+   return state->is_version(400, 0) &&
+          state->stage == MESA_SHADER_FRAGMENT;
+}
+
+static bool
 es31(const _mesa_glsl_parse_state *state)
 {
    return state->is_version(0, 310);
@@ -270,6 +277,13 @@ texture_array(const _mesa_glsl_parse_state *state)
 static bool
 texture_multisample(const _mesa_glsl_parse_state *state)
 {
+   return state->is_version(150, 310) ||
+          state->ARB_texture_multisample_enable;
+}
+
+static bool
+texture_multisample_array(const _mesa_glsl_parse_state *state)
+{
    return state->is_version(150, 0) ||
           state->ARB_texture_multisample_enable;
 }
@@ -394,11 +408,25 @@ shader_trinary_minmax(const _mesa_glsl_parse_state *state)
 static bool
 shader_image_load_store(const _mesa_glsl_parse_state *state)
 {
+   return (state->is_version(420, 310) ||
+           state->ARB_shader_image_load_store_enable);
+}
+
+static bool
+shader_image_atomic(const _mesa_glsl_parse_state *state)
+{
    return (state->is_version(420, 0) ||
            state->ARB_shader_image_load_store_enable);
 }
 
 static bool
+shader_image_size(const _mesa_glsl_parse_state *state)
+{
+   return state->is_version(430, 310) ||
+           state->ARB_shader_image_size_enable;
+}
+
+static bool
 gs_streams(const _mesa_glsl_parse_state *state)
 {
    return gpu_shader5(state) && gs_only(state);
@@ -492,13 +520,19 @@ private:
    /** Create a new function and add the given signatures. */
    void add_function(const char *name, ...);
 
+   typedef ir_function_signature *(builtin_builder::*image_prototype_ctr)(const glsl_type *image_type,
+                                                                          const char *intrinsic_name,
+                                                                          unsigned num_arguments,
+                                                                          unsigned flags);
+
    enum image_function_flags {
       IMAGE_FUNCTION_EMIT_STUB = (1 << 0),
       IMAGE_FUNCTION_RETURNS_VOID = (1 << 1),
       IMAGE_FUNCTION_HAS_VECTOR_DATA_TYPE = (1 << 2),
       IMAGE_FUNCTION_SUPPORTS_FLOAT_DATA_TYPE = (1 << 3),
       IMAGE_FUNCTION_READ_ONLY = (1 << 4),
-      IMAGE_FUNCTION_WRITE_ONLY = (1 << 5)
+      IMAGE_FUNCTION_WRITE_ONLY = (1 << 5),
+      IMAGE_FUNCTION_AVAIL_ATOMIC = (1 << 6)
    };
 
    /**
@@ -507,6 +541,7 @@ private:
     */
    void add_image_function(const char *name,
                            const char *intrinsic_name,
+                           image_prototype_ctr prototype,
                            unsigned num_arguments,
                            unsigned flags);
 
@@ -663,7 +698,7 @@ private:
                                               const glsl_type *stream_type);
    B0(barrier)
 
-   B2(textureQueryLod);
+   BA2(textureQueryLod);
    B1(textureQueryLevels);
    B1(dFdx);
    B1(dFdy);
@@ -708,7 +743,12 @@ private:
                                            const char *intrinsic_name,
                                            unsigned num_arguments,
                                            unsigned flags);
-   ir_function_signature *_image(const glsl_type *image_type,
+   ir_function_signature *_image_size_prototype(const glsl_type *image_type,
+                                                const char *intrinsic_name,
+                                                unsigned num_arguments,
+                                                unsigned flags);
+   ir_function_signature *_image(image_prototype_ctr prototype,
+                                 const glsl_type *image_type,
                                  const char *intrinsic_name,
                                  unsigned num_arguments,
                                  unsigned flags);
@@ -1367,9 +1407,9 @@ builtin_builder::create_builtins()
                 _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::isampler2DMS_type),
                 _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::usampler2DMS_type),
 
-                _textureSize(texture_multisample, glsl_type::ivec3_type, glsl_type::sampler2DMSArray_type),
-                _textureSize(texture_multisample, glsl_type::ivec3_type, glsl_type::isampler2DMSArray_type),
-                _textureSize(texture_multisample, glsl_type::ivec3_type, glsl_type::usampler2DMSArray_type),
+                _textureSize(texture_multisample_array, glsl_type::ivec3_type, glsl_type::sampler2DMSArray_type),
+                _textureSize(texture_multisample_array, glsl_type::ivec3_type, glsl_type::isampler2DMSArray_type),
+                _textureSize(texture_multisample_array, glsl_type::ivec3_type, glsl_type::usampler2DMSArray_type),
                 NULL);
 
    add_function("texture",
@@ -1632,9 +1672,9 @@ builtin_builder::create_builtins()
                 _texelFetch(texture_multisample, glsl_type::ivec4_type, glsl_type::isampler2DMS_type, glsl_type::ivec2_type),
                 _texelFetch(texture_multisample, glsl_type::uvec4_type, glsl_type::usampler2DMS_type, glsl_type::ivec2_type),
 
-                _texelFetch(texture_multisample, glsl_type::vec4_type,  glsl_type::sampler2DMSArray_type,  glsl_type::ivec3_type),
-                _texelFetch(texture_multisample, glsl_type::ivec4_type, glsl_type::isampler2DMSArray_type, glsl_type::ivec3_type),
-                _texelFetch(texture_multisample, glsl_type::uvec4_type, glsl_type::usampler2DMSArray_type, glsl_type::ivec3_type),
+                _texelFetch(texture_multisample_array, glsl_type::vec4_type,  glsl_type::sampler2DMSArray_type,  glsl_type::ivec3_type),
+                _texelFetch(texture_multisample_array, glsl_type::ivec4_type, glsl_type::isampler2DMSArray_type, glsl_type::ivec3_type),
+                _texelFetch(texture_multisample_array, glsl_type::uvec4_type, glsl_type::usampler2DMSArray_type, glsl_type::ivec3_type),
                 NULL);
 
    add_function("texelFetchOffset",
@@ -1944,40 +1984,77 @@ builtin_builder::create_builtins()
    add_function("barrier", _barrier(), NULL);
 
    add_function("textureQueryLOD",
-                _textureQueryLod(glsl_type::sampler1D_type,  glsl_type::float_type),
-                _textureQueryLod(glsl_type::isampler1D_type, glsl_type::float_type),
-                _textureQueryLod(glsl_type::usampler1D_type, glsl_type::float_type),
-
-                _textureQueryLod(glsl_type::sampler2D_type,  glsl_type::vec2_type),
-                _textureQueryLod(glsl_type::isampler2D_type, glsl_type::vec2_type),
-                _textureQueryLod(glsl_type::usampler2D_type, glsl_type::vec2_type),
-
-                _textureQueryLod(glsl_type::sampler3D_type,  glsl_type::vec3_type),
-                _textureQueryLod(glsl_type::isampler3D_type, glsl_type::vec3_type),
-                _textureQueryLod(glsl_type::usampler3D_type, glsl_type::vec3_type),
-
-                _textureQueryLod(glsl_type::samplerCube_type,  glsl_type::vec3_type),
-                _textureQueryLod(glsl_type::isamplerCube_type, glsl_type::vec3_type),
-                _textureQueryLod(glsl_type::usamplerCube_type, glsl_type::vec3_type),
-
-                _textureQueryLod(glsl_type::sampler1DArray_type,  glsl_type::float_type),
-                _textureQueryLod(glsl_type::isampler1DArray_type, glsl_type::float_type),
-                _textureQueryLod(glsl_type::usampler1DArray_type, glsl_type::float_type),
-
-                _textureQueryLod(glsl_type::sampler2DArray_type,  glsl_type::vec2_type),
-                _textureQueryLod(glsl_type::isampler2DArray_type, glsl_type::vec2_type),
-                _textureQueryLod(glsl_type::usampler2DArray_type, glsl_type::vec2_type),
-
-                _textureQueryLod(glsl_type::samplerCubeArray_type,  glsl_type::vec3_type),
-                _textureQueryLod(glsl_type::isamplerCubeArray_type, glsl_type::vec3_type),
-                _textureQueryLod(glsl_type::usamplerCubeArray_type, glsl_type::vec3_type),
-
-                _textureQueryLod(glsl_type::sampler1DShadow_type, glsl_type::float_type),
-                _textureQueryLod(glsl_type::sampler2DShadow_type, glsl_type::vec2_type),
-                _textureQueryLod(glsl_type::samplerCubeShadow_type, glsl_type::vec3_type),
-                _textureQueryLod(glsl_type::sampler1DArrayShadow_type, glsl_type::float_type),
-                _textureQueryLod(glsl_type::sampler2DArrayShadow_type, glsl_type::vec2_type),
-                _textureQueryLod(glsl_type::samplerCubeArrayShadow_type, glsl_type::vec3_type),
+                _textureQueryLod(texture_query_lod, glsl_type::sampler1D_type,  glsl_type::float_type),
+                _textureQueryLod(texture_query_lod, glsl_type::isampler1D_type, glsl_type::float_type),
+                _textureQueryLod(texture_query_lod, glsl_type::usampler1D_type, glsl_type::float_type),
+
+                _textureQueryLod(texture_query_lod, glsl_type::sampler2D_type,  glsl_type::vec2_type),
+                _textureQueryLod(texture_query_lod, glsl_type::isampler2D_type, glsl_type::vec2_type),
+                _textureQueryLod(texture_query_lod, glsl_type::usampler2D_type, glsl_type::vec2_type),
+
+                _textureQueryLod(texture_query_lod, glsl_type::sampler3D_type,  glsl_type::vec3_type),
+                _textureQueryLod(texture_query_lod, glsl_type::isampler3D_type, glsl_type::vec3_type),
+                _textureQueryLod(texture_query_lod, glsl_type::usampler3D_type, glsl_type::vec3_type),
+
+                _textureQueryLod(texture_query_lod, glsl_type::samplerCube_type,  glsl_type::vec3_type),
+                _textureQueryLod(texture_query_lod, glsl_type::isamplerCube_type, glsl_type::vec3_type),
+                _textureQueryLod(texture_query_lod, glsl_type::usamplerCube_type, glsl_type::vec3_type),
+
+                _textureQueryLod(texture_query_lod, glsl_type::sampler1DArray_type,  glsl_type::float_type),
+                _textureQueryLod(texture_query_lod, glsl_type::isampler1DArray_type, glsl_type::float_type),
+                _textureQueryLod(texture_query_lod, glsl_type::usampler1DArray_type, glsl_type::float_type),
+
+                _textureQueryLod(texture_query_lod, glsl_type::sampler2DArray_type,  glsl_type::vec2_type),
+                _textureQueryLod(texture_query_lod, glsl_type::isampler2DArray_type, glsl_type::vec2_type),
+                _textureQueryLod(texture_query_lod, glsl_type::usampler2DArray_type, glsl_type::vec2_type),
+
+                _textureQueryLod(texture_query_lod, glsl_type::samplerCubeArray_type,  glsl_type::vec3_type),
+                _textureQueryLod(texture_query_lod, glsl_type::isamplerCubeArray_type, glsl_type::vec3_type),
+                _textureQueryLod(texture_query_lod, glsl_type::usamplerCubeArray_type, glsl_type::vec3_type),
+
+                _textureQueryLod(texture_query_lod, glsl_type::sampler1DShadow_type, glsl_type::float_type),
+                _textureQueryLod(texture_query_lod, glsl_type::sampler2DShadow_type, glsl_type::vec2_type),
+                _textureQueryLod(texture_query_lod, glsl_type::samplerCubeShadow_type, glsl_type::vec3_type),
+                _textureQueryLod(texture_query_lod, glsl_type::sampler1DArrayShadow_type, glsl_type::float_type),
+                _textureQueryLod(texture_query_lod, glsl_type::sampler2DArrayShadow_type, glsl_type::vec2_type),
+                _textureQueryLod(texture_query_lod, glsl_type::samplerCubeArrayShadow_type, glsl_type::vec3_type),
+                NULL);
+
+   add_function("textureQueryLod",
+                _textureQueryLod(v400_fs_only, glsl_type::sampler1D_type,  glsl_type::float_type),
+                _textureQueryLod(v400_fs_only, glsl_type::isampler1D_type, glsl_type::float_type),
+                _textureQueryLod(v400_fs_only, glsl_type::usampler1D_type, glsl_type::float_type),
+
+                _textureQueryLod(v400_fs_only, glsl_type::sampler2D_type,  glsl_type::vec2_type),
+                _textureQueryLod(v400_fs_only, glsl_type::isampler2D_type, glsl_type::vec2_type),
+                _textureQueryLod(v400_fs_only, glsl_type::usampler2D_type, glsl_type::vec2_type),
+
+                _textureQueryLod(v400_fs_only, glsl_type::sampler3D_type,  glsl_type::vec3_type),
+                _textureQueryLod(v400_fs_only, glsl_type::isampler3D_type, glsl_type::vec3_type),
+                _textureQueryLod(v400_fs_only, glsl_type::usampler3D_type, glsl_type::vec3_type),
+
+                _textureQueryLod(v400_fs_only, glsl_type::samplerCube_type,  glsl_type::vec3_type),
+                _textureQueryLod(v400_fs_only, glsl_type::isamplerCube_type, glsl_type::vec3_type),
+                _textureQueryLod(v400_fs_only, glsl_type::usamplerCube_type, glsl_type::vec3_type),
+
+                _textureQueryLod(v400_fs_only, glsl_type::sampler1DArray_type,  glsl_type::float_type),
+                _textureQueryLod(v400_fs_only, glsl_type::isampler1DArray_type, glsl_type::float_type),
+                _textureQueryLod(v400_fs_only, glsl_type::usampler1DArray_type, glsl_type::float_type),
+
+                _textureQueryLod(v400_fs_only, glsl_type::sampler2DArray_type,  glsl_type::vec2_type),
+                _textureQueryLod(v400_fs_only, glsl_type::isampler2DArray_type, glsl_type::vec2_type),
+                _textureQueryLod(v400_fs_only, glsl_type::usampler2DArray_type, glsl_type::vec2_type),
+
+                _textureQueryLod(v400_fs_only, glsl_type::samplerCubeArray_type,  glsl_type::vec3_type),
+                _textureQueryLod(v400_fs_only, glsl_type::isamplerCubeArray_type, glsl_type::vec3_type),
+                _textureQueryLod(v400_fs_only, glsl_type::usamplerCubeArray_type, glsl_type::vec3_type),
+
+                _textureQueryLod(v400_fs_only, glsl_type::sampler1DShadow_type, glsl_type::float_type),
+                _textureQueryLod(v400_fs_only, glsl_type::sampler2DShadow_type, glsl_type::vec2_type),
+                _textureQueryLod(v400_fs_only, glsl_type::samplerCubeShadow_type, glsl_type::vec3_type),
+                _textureQueryLod(v400_fs_only, glsl_type::sampler1DArrayShadow_type, glsl_type::float_type),
+                _textureQueryLod(v400_fs_only, glsl_type::sampler2DArrayShadow_type, glsl_type::vec2_type),
+                _textureQueryLod(v400_fs_only, glsl_type::samplerCubeArrayShadow_type, glsl_type::vec3_type),
                 NULL);
 
    add_function("textureQueryLevels",
@@ -2552,6 +2629,7 @@ builtin_builder::add_function(const char *name, ...)
 void
 builtin_builder::add_image_function(const char *name,
                                     const char *intrinsic_name,
+                                    image_prototype_ctr prototype,
                                     unsigned num_arguments,
                                     unsigned flags)
 {
@@ -2590,12 +2668,13 @@ builtin_builder::add_image_function(const char *name,
       glsl_type::uimage2DMS_type,
       glsl_type::uimage2DMSArray_type
    };
+
    ir_function *f = new(mem_ctx) ir_function(name);
 
    for (unsigned i = 0; i < ARRAY_SIZE(types); ++i) {
       if (types[i]->sampler_type != GLSL_TYPE_FLOAT ||
           (flags & IMAGE_FUNCTION_SUPPORTS_FLOAT_DATA_TYPE))
-         f->add_signature(_image(types[i], intrinsic_name,
+         f->add_signature(_image(prototype, types[i], intrinsic_name,
                                  num_arguments, flags));
    }
 
@@ -2608,43 +2687,60 @@ builtin_builder::add_image_functions(bool glsl)
    const unsigned flags = (glsl ? IMAGE_FUNCTION_EMIT_STUB : 0);
 
    add_image_function(glsl ? "imageLoad" : "__intrinsic_image_load",
-                      "__intrinsic_image_load", 0,
-                      (flags | IMAGE_FUNCTION_HAS_VECTOR_DATA_TYPE |
+                       "__intrinsic_image_load",
+                       &builtin_builder::_image_prototype, 0,
+                       (flags | IMAGE_FUNCTION_HAS_VECTOR_DATA_TYPE |
                        IMAGE_FUNCTION_SUPPORTS_FLOAT_DATA_TYPE |
                        IMAGE_FUNCTION_READ_ONLY));
 
    add_image_function(glsl ? "imageStore" : "__intrinsic_image_store",
-                      "__intrinsic_image_store", 1,
+                      "__intrinsic_image_store",
+                      &builtin_builder::_image_prototype, 1,
                       (flags | IMAGE_FUNCTION_RETURNS_VOID |
                        IMAGE_FUNCTION_HAS_VECTOR_DATA_TYPE |
                        IMAGE_FUNCTION_SUPPORTS_FLOAT_DATA_TYPE |
                        IMAGE_FUNCTION_WRITE_ONLY));
 
+   const unsigned atom_flags = flags | IMAGE_FUNCTION_AVAIL_ATOMIC;
+
    add_image_function(glsl ? "imageAtomicAdd" : "__intrinsic_image_atomic_add",
-                      "__intrinsic_image_atomic_add", 1, flags);
+                      "__intrinsic_image_atomic_add",
+                      &builtin_builder::_image_prototype, 1, atom_flags);
 
    add_image_function(glsl ? "imageAtomicMin" : "__intrinsic_image_atomic_min",
-                      "__intrinsic_image_atomic_min", 1, flags);
+                      "__intrinsic_image_atomic_min",
+                      &builtin_builder::_image_prototype, 1, atom_flags);
 
    add_image_function(glsl ? "imageAtomicMax" : "__intrinsic_image_atomic_max",
-                      "__intrinsic_image_atomic_max", 1, flags);
+                      "__intrinsic_image_atomic_max",
+                      &builtin_builder::_image_prototype, 1, atom_flags);
 
    add_image_function(glsl ? "imageAtomicAnd" : "__intrinsic_image_atomic_and",
-                      "__intrinsic_image_atomic_and", 1, flags);
+                      "__intrinsic_image_atomic_and",
+                      &builtin_builder::_image_prototype, 1, atom_flags);
 
    add_image_function(glsl ? "imageAtomicOr" : "__intrinsic_image_atomic_or",
-                      "__intrinsic_image_atomic_or", 1, flags);
+                      "__intrinsic_image_atomic_or",
+                      &builtin_builder::_image_prototype, 1, atom_flags);
 
    add_image_function(glsl ? "imageAtomicXor" : "__intrinsic_image_atomic_xor",
-                      "__intrinsic_image_atomic_xor", 1, flags);
+                      "__intrinsic_image_atomic_xor",
+                      &builtin_builder::_image_prototype, 1, atom_flags);
 
    add_image_function((glsl ? "imageAtomicExchange" :
                        "__intrinsic_image_atomic_exchange"),
-                      "__intrinsic_image_atomic_exchange", 1, flags);
+                      "__intrinsic_image_atomic_exchange",
+                      &builtin_builder::_image_prototype, 1, atom_flags);
 
    add_image_function((glsl ? "imageAtomicCompSwap" :
                        "__intrinsic_image_atomic_comp_swap"),
-                      "__intrinsic_image_atomic_comp_swap", 2, flags);
+                      "__intrinsic_image_atomic_comp_swap",
+                      &builtin_builder::_image_prototype, 2, atom_flags);
+
+   add_image_function(glsl ? "imageSize" : "__intrinsic_image_size",
+                      "__intrinsic_image_size",
+                      &builtin_builder::_image_size_prototype, 1,
+                      flags | IMAGE_FUNCTION_SUPPORTS_FLOAT_DATA_TYPE);
 }
 
 ir_variable *
@@ -4314,13 +4410,14 @@ builtin_builder::_barrier()
 }
 
 ir_function_signature *
-builtin_builder::_textureQueryLod(const glsl_type *sampler_type,
+builtin_builder::_textureQueryLod(builtin_available_predicate avail,
+                                  const glsl_type *sampler_type,
                                   const glsl_type *coord_type)
 {
    ir_variable *s = in_var(sampler_type, "sampler");
    ir_variable *coord = in_var(coord_type, "coord");
    /* The sampler and coordinate always exist; add optional parameters later. */
-   MAKE_SIG(glsl_type::vec2_type, texture_query_lod, 2, s, coord);
+   MAKE_SIG(glsl_type::vec2_type, avail, 2, s, coord);
 
    ir_texture *tex = new(mem_ctx) ir_texture(ir_lod);
    tex->coordinate = var_ref(coord);
@@ -4787,8 +4884,10 @@ builtin_builder::_image_prototype(const glsl_type *image_type,
    ir_variable *coord = in_var(
       glsl_type::ivec(image_type->coordinate_components()), "coord");
 
-   ir_function_signature *sig = new_sig(
-      ret_type, shader_image_load_store, 2, image, coord);
+   const builtin_available_predicate avail =
+      (flags & IMAGE_FUNCTION_AVAIL_ATOMIC ? shader_image_atomic :
+       shader_image_load_store);
+   ir_function_signature *sig = new_sig(ret_type, avail, 2, image, coord);
 
    /* Sample index for multisample images. */
    if (image_type->sampler_dimensionality == GLSL_SAMPLER_DIM_MS)
@@ -4818,13 +4917,55 @@ builtin_builder::_image_prototype(const glsl_type *image_type,
 }
 
 ir_function_signature *
-builtin_builder::_image(const glsl_type *image_type,
+builtin_builder::_image_size_prototype(const glsl_type *image_type,
+                                       const char *intrinsic_name,
+                                       unsigned num_arguments,
+                                       unsigned flags)
+{
+   const glsl_type *ret_type;
+   unsigned num_components = image_type->coordinate_components();
+
+   /* From the ARB_shader_image_size extension:
+    * "Cube images return the dimensions of one face."
+    */
+   if (image_type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
+       !image_type->sampler_array) {
+      num_components = 2;
+   }
+
+   /* FIXME: Add the highp precision qualifier for GLES 3.10 when it is
+    * supported by mesa.
+    */
+   ret_type = glsl_type::get_instance(GLSL_TYPE_INT, num_components, 1);
+
+   ir_variable *image = in_var(image_type, "image");
+   ir_function_signature *sig = new_sig(ret_type, shader_image_size, 1, image);
+
+   /* Set the maximal set of qualifiers allowed for this image
+    * built-in.  Function calls with arguments having fewer
+    * qualifiers than present in the prototype are allowed by the
+    * spec, but not with more, i.e. this will make the compiler
+    * accept everything that needs to be accepted, and reject cases
+    * like loads from write-only or stores to read-only images.
+    */
+   image->data.image_read_only = true;
+   image->data.image_write_only = true;
+   image->data.image_coherent = true;
+   image->data.image_volatile = true;
+   image->data.image_restrict = true;
+
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_image(image_prototype_ctr prototype,
+                        const glsl_type *image_type,
                         const char *intrinsic_name,
                         unsigned num_arguments,
                         unsigned flags)
 {
-   ir_function_signature *sig = _image_prototype(image_type, intrinsic_name,
-                                                 num_arguments, flags);
+   ir_function_signature *sig = (this->*prototype)(image_type, intrinsic_name,
+                                                   num_arguments, flags);
 
    if (flags & IMAGE_FUNCTION_EMIT_STUB) {
       ir_factory body(&sig->body, mem_ctx);
diff --git a/src/glsl/builtin_types.cpp b/src/glsl/builtin_types.cpp
index ffbc5e6fdbc..9cf198fd127 100644
--- a/src/glsl/builtin_types.cpp
+++ b/src/glsl/builtin_types.cpp
@@ -182,7 +182,7 @@ const static struct builtin_type_versions {
    T(samplerCubeArray,                400, 999)
    T(sampler2DRect,                   140, 999)
    T(samplerBuffer,                   140, 999)
-   T(sampler2DMS,                     150, 999)
+   T(sampler2DMS,                     150, 310)
    T(sampler2DMSArray,                150, 999)
 
    T(isampler1D,                      130, 999)
@@ -194,7 +194,7 @@ const static struct builtin_type_versions {
    T(isamplerCubeArray,               400, 999)
    T(isampler2DRect,                  140, 999)
    T(isamplerBuffer,                  140, 999)
-   T(isampler2DMS,                    150, 999)
+   T(isampler2DMS,                    150, 310)
    T(isampler2DMSArray,               150, 999)
 
    T(usampler1D,                      130, 999)
@@ -206,7 +206,7 @@ const static struct builtin_type_versions {
    T(usamplerCubeArray,               400, 999)
    T(usampler2DRect,                  140, 999)
    T(usamplerBuffer,                  140, 999)
-   T(usampler2DMS,                    150, 999)
+   T(usampler2DMS,                    150, 310)
    T(usampler2DMSArray,               150, 999)
 
    T(sampler1DShadow,                 110, 999)
@@ -220,40 +220,40 @@ const static struct builtin_type_versions {
    T(struct_gl_DepthRangeParameters,  110, 100)
 
    T(image1D,                         420, 999)
-   T(image2D,                         420, 999)
-   T(image3D,                         420, 999)
+   T(image2D,                         420, 310)
+   T(image3D,                         420, 310)
    T(image2DRect,                     420, 999)
-   T(imageCube,                       420, 999)
+   T(imageCube,                       420, 310)
    T(imageBuffer,                     420, 999)
    T(image1DArray,                    420, 999)
-   T(image2DArray,                    420, 999)
+   T(image2DArray,                    420, 310)
    T(imageCubeArray,                  420, 999)
    T(image2DMS,                       420, 999)
    T(image2DMSArray,                  420, 999)
    T(iimage1D,                        420, 999)
-   T(iimage2D,                        420, 999)
-   T(iimage3D,                        420, 999)
+   T(iimage2D,                        420, 310)
+   T(iimage3D,                        420, 310)
    T(iimage2DRect,                    420, 999)
-   T(iimageCube,                      420, 999)
+   T(iimageCube,                      420, 310)
    T(iimageBuffer,                    420, 999)
    T(iimage1DArray,                   420, 999)
-   T(iimage2DArray,                   420, 999)
+   T(iimage2DArray,                   420, 310)
    T(iimageCubeArray,                 420, 999)
    T(iimage2DMS,                      420, 999)
    T(iimage2DMSArray,                 420, 999)
    T(uimage1D,                        420, 999)
-   T(uimage2D,                        420, 999)
-   T(uimage3D,                        420, 999)
+   T(uimage2D,                        420, 310)
+   T(uimage3D,                        420, 310)
    T(uimage2DRect,                    420, 999)
-   T(uimageCube,                      420, 999)
+   T(uimageCube,                      420, 310)
    T(uimageBuffer,                    420, 999)
    T(uimage1DArray,                   420, 999)
-   T(uimage2DArray,                   420, 999)
+   T(uimage2DArray,                   420, 310)
    T(uimageCubeArray,                 420, 999)
    T(uimage2DMS,                      420, 999)
    T(uimage2DMSArray,                 420, 999)
 
-   T(atomic_uint,                     420, 999)
+   T(atomic_uint,                     420, 310)
 };
 
 static const glsl_type *const deprecated_types[] = {
diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index 53d3500b1f4..dd7804f7b8c 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -744,25 +744,31 @@ builtin_variable_generator::generate_constants()
        */
    }
 
-   if (state->is_version(420, 0) ||
+   if (state->is_version(420, 310) ||
        state->ARB_shader_image_load_store_enable) {
       add_const("gl_MaxImageUnits",
                 state->Const.MaxImageUnits);
-      add_const("gl_MaxCombinedImageUnitsAndFragmentOutputs",
-                state->Const.MaxCombinedImageUnitsAndFragmentOutputs);
-      add_const("gl_MaxImageSamples",
-                state->Const.MaxImageSamples);
       add_const("gl_MaxVertexImageUniforms",
                 state->Const.MaxVertexImageUniforms);
-      add_const("gl_MaxTessControlImageUniforms", 0);
-      add_const("gl_MaxTessEvaluationImageUniforms", 0);
-      add_const("gl_MaxGeometryImageUniforms",
-                state->Const.MaxGeometryImageUniforms);
       add_const("gl_MaxFragmentImageUniforms",
                 state->Const.MaxFragmentImageUniforms);
       add_const("gl_MaxCombinedImageUniforms",
                 state->Const.MaxCombinedImageUniforms);
 
+      if (!state->es_shader) {
+         add_const("gl_MaxCombinedImageUnitsAndFragmentOutputs",
+                   state->Const.MaxCombinedShaderOutputResources);
+         add_const("gl_MaxImageSamples",
+                   state->Const.MaxImageSamples);
+         add_const("gl_MaxGeometryImageUniforms",
+                   state->Const.MaxGeometryImageUniforms);
+      }
+
+      if (state->is_version(450, 310)) {
+         add_const("gl_MaxCombinedShaderOutputResources",
+                   state->Const.MaxCombinedShaderOutputResources);
+      }
+
       if (state->is_version(400, 0) ||
           state->ARB_tessellation_shader_enable) {
          add_const("gl_MaxTessControlImageUniforms",
diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index dd5ec2a30b5..18e50afe476 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2478,6 +2478,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	      if (extensions->ARB_shader_image_load_store)
 	         add_builtin_define(parser, "GL_ARB_shader_image_load_store", 1);
 
+              if (extensions->ARB_shader_image_size)
+                 add_builtin_define(parser, "GL_ARB_shader_image_size", 1);
+
               if (extensions->ARB_derivative_control)
                  add_builtin_define(parser, "GL_ARB_derivative_control", 1);
 
diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll
index efa0bb68099..24998c19467 100644
--- a/src/glsl/glsl_lexer.ll
+++ b/src/glsl/glsl_lexer.ll
@@ -343,9 +343,10 @@ usampler2DArray		KEYWORD(130, 300, 130, 300, USAMPLER2DARRAY);
 
    /* additional keywords in ARB_texture_multisample, included in GLSL 1.50 */
    /* these are reserved but not defined in GLSL 3.00 */
-sampler2DMS        KEYWORD_WITH_ALT(150, 300, 150, 0, yyextra->ARB_texture_multisample_enable, SAMPLER2DMS);
-isampler2DMS       KEYWORD_WITH_ALT(150, 300, 150, 0, yyextra->ARB_texture_multisample_enable, ISAMPLER2DMS);
-usampler2DMS       KEYWORD_WITH_ALT(150, 300, 150, 0, yyextra->ARB_texture_multisample_enable, USAMPLER2DMS);
+   /* [iu]sampler2DMS are defined in GLSL ES 3.10 */
+sampler2DMS        KEYWORD_WITH_ALT(150, 300, 150, 310, yyextra->ARB_texture_multisample_enable, SAMPLER2DMS);
+isampler2DMS       KEYWORD_WITH_ALT(150, 300, 150, 310, yyextra->ARB_texture_multisample_enable, ISAMPLER2DMS);
+usampler2DMS       KEYWORD_WITH_ALT(150, 300, 150, 310, yyextra->ARB_texture_multisample_enable, USAMPLER2DMS);
 sampler2DMSArray   KEYWORD_WITH_ALT(150, 300, 150, 0, yyextra->ARB_texture_multisample_enable, SAMPLER2DMSARRAY);
 isampler2DMSArray  KEYWORD_WITH_ALT(150, 300, 150, 0, yyextra->ARB_texture_multisample_enable, ISAMPLER2DMSARRAY);
 usampler2DMSArray  KEYWORD_WITH_ALT(150, 300, 150, 0, yyextra->ARB_texture_multisample_enable, USAMPLER2DMSARRAY);
@@ -368,35 +369,35 @@ precise		KEYWORD_WITH_ALT(400, 0, 400, 0, yyextra->ARB_gpu_shader5_enable, PRECI
 
    /* keywords available with ARB_shader_image_load_store */
 image1D         KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE1D);
-image2D         KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2D);
-image3D         KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE3D);
+image2D         KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE2D);
+image3D         KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE3D);
 image2DRect     KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2DRECT);
-imageCube       KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGECUBE);
+imageCube       KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGECUBE);
 imageBuffer     KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGEBUFFER);
 image1DArray    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE1DARRAY);
-image2DArray    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2DARRAY);
+image2DArray    KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE2DARRAY);
 imageCubeArray  KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGECUBEARRAY);
 image2DMS       KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2DMS);
 image2DMSArray  KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2DMSARRAY);
 iimage1D        KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE1D);
-iimage2D        KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2D);
-iimage3D        KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE3D);
+iimage2D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE2D);
+iimage3D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE3D);
 iimage2DRect    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DRECT);
-iimageCube      KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBE);
+iimageCube      KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBE);
 iimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGEBUFFER);
 iimage1DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE1DARRAY);
-iimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DARRAY);
+iimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DARRAY);
 iimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBEARRAY);
 iimage2DMS      KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DMS);
 iimage2DMSArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DMSARRAY);
 uimage1D        KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE1D);
-uimage2D        KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2D);
-uimage3D        KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE3D);
+uimage2D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE2D);
+uimage3D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE3D);
 uimage2DRect    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DRECT);
-uimageCube      KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBE);
+uimageCube      KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBE);
 uimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGEBUFFER);
 uimage1DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE1DARRAY);
-uimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DARRAY);
+uimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DARRAY);
 uimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBEARRAY);
 uimage2DMS      KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DMS);
 uimage2DMSArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DMSARRAY);
@@ -405,11 +406,11 @@ image2DShadow           KEYWORD(130, 300, 0, 0, IMAGE2DSHADOW);
 image1DArrayShadow      KEYWORD(130, 300, 0, 0, IMAGE1DARRAYSHADOW);
 image2DArrayShadow      KEYWORD(130, 300, 0, 0, IMAGE2DARRAYSHADOW);
 
-coherent	KEYWORD_WITH_ALT(420, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, COHERENT);
-volatile	KEYWORD_WITH_ALT(110, 100, 420, 0, yyextra->ARB_shader_image_load_store_enable, VOLATILE);
-restrict	KEYWORD_WITH_ALT(420, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, RESTRICT);
-readonly	KEYWORD_WITH_ALT(420, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, READONLY);
-writeonly	KEYWORD_WITH_ALT(420, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, WRITEONLY);
+coherent       KEYWORD_WITH_ALT(420, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, COHERENT);
+volatile       KEYWORD_WITH_ALT(110, 100, 420, 310, yyextra->ARB_shader_image_load_store_enable, VOLATILE);
+restrict       KEYWORD_WITH_ALT(420, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, RESTRICT);
+readonly       KEYWORD_WITH_ALT(420, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, READONLY);
+writeonly      KEYWORD_WITH_ALT(420, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, WRITEONLY);
 
 atomic_uint     KEYWORD_WITH_ALT(420, 300, 420, 310, yyextra->ARB_shader_atomic_counters_enable, ATOMIC_UINT);
 
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 97648c15ccc..e1b390844d3 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -1258,56 +1258,65 @@ layout_qualifier_id:
 
       /* Layout qualifiers for ARB_shader_image_load_store. */
       if (state->ARB_shader_image_load_store_enable ||
-          state->is_version(420, 0)) {
+          state->is_version(420, 310)) {
          if (!$$.flags.i) {
             static const struct {
                const char *name;
                GLenum format;
                glsl_base_type base_type;
+               /** Minimum desktop GLSL version required for the image
+                * format.  Use 130 if already present in the original
+                * ARB extension.
+                */
+               unsigned required_glsl;
+               /** Minimum GLSL ES version required for the image format. */
+               unsigned required_essl;
             } map[] = {
-               { "rgba32f", GL_RGBA32F, GLSL_TYPE_FLOAT },
-               { "rgba16f", GL_RGBA16F, GLSL_TYPE_FLOAT },
-               { "rg32f", GL_RG32F, GLSL_TYPE_FLOAT },
-               { "rg16f", GL_RG16F, GLSL_TYPE_FLOAT },
-               { "r11f_g11f_b10f", GL_R11F_G11F_B10F, GLSL_TYPE_FLOAT },
-               { "r32f", GL_R32F, GLSL_TYPE_FLOAT },
-               { "r16f", GL_R16F, GLSL_TYPE_FLOAT },
-               { "rgba32ui", GL_RGBA32UI, GLSL_TYPE_UINT },
-               { "rgba16ui", GL_RGBA16UI, GLSL_TYPE_UINT },
-               { "rgb10_a2ui", GL_RGB10_A2UI, GLSL_TYPE_UINT },
-               { "rgba8ui", GL_RGBA8UI, GLSL_TYPE_UINT },
-               { "rg32ui", GL_RG32UI, GLSL_TYPE_UINT },
-               { "rg16ui", GL_RG16UI, GLSL_TYPE_UINT },
-               { "rg8ui", GL_RG8UI, GLSL_TYPE_UINT },
-               { "r32ui", GL_R32UI, GLSL_TYPE_UINT },
-               { "r16ui", GL_R16UI, GLSL_TYPE_UINT },
-               { "r8ui", GL_R8UI, GLSL_TYPE_UINT },
-               { "rgba32i", GL_RGBA32I, GLSL_TYPE_INT },
-               { "rgba16i", GL_RGBA16I, GLSL_TYPE_INT },
-               { "rgba8i", GL_RGBA8I, GLSL_TYPE_INT },
-               { "rg32i", GL_RG32I, GLSL_TYPE_INT },
-               { "rg16i", GL_RG16I, GLSL_TYPE_INT },
-               { "rg8i", GL_RG8I, GLSL_TYPE_INT },
-               { "r32i", GL_R32I, GLSL_TYPE_INT },
-               { "r16i", GL_R16I, GLSL_TYPE_INT },
-               { "r8i", GL_R8I, GLSL_TYPE_INT },
-               { "rgba16", GL_RGBA16, GLSL_TYPE_FLOAT },
-               { "rgb10_a2", GL_RGB10_A2, GLSL_TYPE_FLOAT },
-               { "rgba8", GL_RGBA8, GLSL_TYPE_FLOAT },
-               { "rg16", GL_RG16, GLSL_TYPE_FLOAT },
-               { "rg8", GL_RG8, GLSL_TYPE_FLOAT },
-               { "r16", GL_R16, GLSL_TYPE_FLOAT },
-               { "r8", GL_R8, GLSL_TYPE_FLOAT },
-               { "rgba16_snorm", GL_RGBA16_SNORM, GLSL_TYPE_FLOAT },
-               { "rgba8_snorm", GL_RGBA8_SNORM, GLSL_TYPE_FLOAT },
-               { "rg16_snorm", GL_RG16_SNORM, GLSL_TYPE_FLOAT },
-               { "rg8_snorm", GL_RG8_SNORM, GLSL_TYPE_FLOAT },
-               { "r16_snorm", GL_R16_SNORM, GLSL_TYPE_FLOAT },
-               { "r8_snorm", GL_R8_SNORM, GLSL_TYPE_FLOAT }
+               { "rgba32f", GL_RGBA32F, GLSL_TYPE_FLOAT, 130, 310 },
+               { "rgba16f", GL_RGBA16F, GLSL_TYPE_FLOAT, 130, 310 },
+               { "rg32f", GL_RG32F, GLSL_TYPE_FLOAT, 130, 0 },
+               { "rg16f", GL_RG16F, GLSL_TYPE_FLOAT, 130, 0 },
+               { "r11f_g11f_b10f", GL_R11F_G11F_B10F, GLSL_TYPE_FLOAT, 130, 0 },
+               { "r32f", GL_R32F, GLSL_TYPE_FLOAT, 130, 310 },
+               { "r16f", GL_R16F, GLSL_TYPE_FLOAT, 130, 0 },
+               { "rgba32ui", GL_RGBA32UI, GLSL_TYPE_UINT, 130, 310 },
+               { "rgba16ui", GL_RGBA16UI, GLSL_TYPE_UINT, 130, 310 },
+               { "rgb10_a2ui", GL_RGB10_A2UI, GLSL_TYPE_UINT, 130, 0 },
+               { "rgba8ui", GL_RGBA8UI, GLSL_TYPE_UINT, 130, 310 },
+               { "rg32ui", GL_RG32UI, GLSL_TYPE_UINT, 130, 0 },
+               { "rg16ui", GL_RG16UI, GLSL_TYPE_UINT, 130, 0 },
+               { "rg8ui", GL_RG8UI, GLSL_TYPE_UINT, 130, 0 },
+               { "r32ui", GL_R32UI, GLSL_TYPE_UINT, 130, 310 },
+               { "r16ui", GL_R16UI, GLSL_TYPE_UINT, 130, 0 },
+               { "r8ui", GL_R8UI, GLSL_TYPE_UINT, 130, 0 },
+               { "rgba32i", GL_RGBA32I, GLSL_TYPE_INT, 130, 310 },
+               { "rgba16i", GL_RGBA16I, GLSL_TYPE_INT, 130, 310 },
+               { "rgba8i", GL_RGBA8I, GLSL_TYPE_INT, 130, 310 },
+               { "rg32i", GL_RG32I, GLSL_TYPE_INT, 130, 0 },
+               { "rg16i", GL_RG16I, GLSL_TYPE_INT, 130, 0 },
+               { "rg8i", GL_RG8I, GLSL_TYPE_INT, 130, 0 },
+               { "r32i", GL_R32I, GLSL_TYPE_INT, 130, 310 },
+               { "r16i", GL_R16I, GLSL_TYPE_INT, 130, 0 },
+               { "r8i", GL_R8I, GLSL_TYPE_INT, 130, 0 },
+               { "rgba16", GL_RGBA16, GLSL_TYPE_FLOAT, 130, 0 },
+               { "rgb10_a2", GL_RGB10_A2, GLSL_TYPE_FLOAT, 130, 0 },
+               { "rgba8", GL_RGBA8, GLSL_TYPE_FLOAT, 130, 310 },
+               { "rg16", GL_RG16, GLSL_TYPE_FLOAT, 130, 0 },
+               { "rg8", GL_RG8, GLSL_TYPE_FLOAT, 130, 0 },
+               { "r16", GL_R16, GLSL_TYPE_FLOAT, 130, 0 },
+               { "r8", GL_R8, GLSL_TYPE_FLOAT, 130, 0 },
+               { "rgba16_snorm", GL_RGBA16_SNORM, GLSL_TYPE_FLOAT, 130, 0 },
+               { "rgba8_snorm", GL_RGBA8_SNORM, GLSL_TYPE_FLOAT, 130, 310 },
+               { "rg16_snorm", GL_RG16_SNORM, GLSL_TYPE_FLOAT, 130, 0 },
+               { "rg8_snorm", GL_RG8_SNORM, GLSL_TYPE_FLOAT, 130, 0 },
+               { "r16_snorm", GL_R16_SNORM, GLSL_TYPE_FLOAT, 130, 0 },
+               { "r8_snorm", GL_R8_SNORM, GLSL_TYPE_FLOAT, 130, 0 }
             };
 
             for (unsigned i = 0; i < ARRAY_SIZE(map); i++) {
-               if (match_layout_qualifier($1, map[i].name, state) == 0) {
+               if (state->is_version(map[i].required_glsl,
+                                     map[i].required_essl) &&
+                   match_layout_qualifier($1, map[i].name, state) == 0) {
                   $$.flags.q.explicit_image_format = 1;
                   $$.image_format = map[i].format;
                   $$.image_base_type = map[i].base_type;
@@ -1521,11 +1530,10 @@ layout_qualifier_id:
                                 "invalid %s of %d specified",
                                 local_size_qualifiers[i], $3);
                YYERROR;
-            } else if (!state->is_version(430, 0) &&
-                       !state->ARB_compute_shader_enable) {
+            } else if (!state->has_compute_shader()) {
                _mesa_glsl_error(& @3, state,
                                 "%s qualifier requires GLSL 4.30 or "
-                                "ARB_compute_shader",
+                                "GLSL ES 3.10 or ARB_compute_shader",
                                 local_size_qualifiers[i]);
                YYERROR;
             } else {
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index ae2f35697fb..ca772e8ab33 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -143,7 +143,7 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
       this->Const.MaxComputeWorkGroupSize[i] = ctx->Const.MaxComputeWorkGroupSize[i];
 
    this->Const.MaxImageUnits = ctx->Const.MaxImageUnits;
-   this->Const.MaxCombinedImageUnitsAndFragmentOutputs = ctx->Const.MaxCombinedImageUnitsAndFragmentOutputs;
+   this->Const.MaxCombinedShaderOutputResources = ctx->Const.MaxCombinedShaderOutputResources;
    this->Const.MaxImageSamples = ctx->Const.MaxImageSamples;
    this->Const.MaxVertexImageUniforms = ctx->Const.Program[MESA_SHADER_VERTEX].MaxImageUniforms;
    this->Const.MaxTessControlImageUniforms = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxImageUniforms;
@@ -601,6 +601,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(ARB_shader_atomic_counters,       true,  false,     ARB_shader_atomic_counters),
    EXT(ARB_shader_bit_encoding,          true,  false,     ARB_shader_bit_encoding),
    EXT(ARB_shader_image_load_store,      true,  false,     ARB_shader_image_load_store),
+   EXT(ARB_shader_image_size,            true,  false,     ARB_shader_image_size),
    EXT(ARB_shader_precision,             true,  false,     ARB_shader_precision),
    EXT(ARB_shader_stencil_export,        true,  false,     ARB_shader_stencil_export),
    EXT(ARB_shader_storage_buffer_object, true,  false,     ARB_shader_storage_buffer_object),
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index eb325f04eed..e2145bea5fa 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -236,6 +236,11 @@ struct _mesa_glsl_parse_state {
       return ARB_shading_language_420pack_enable || is_version(420, 0);
    }
 
+   bool has_compute_shader() const
+   {
+      return ARB_compute_shader_enable || is_version(430, 310);
+   }
+
    void process_version_directive(YYLTYPE *locp, int version,
                                   const char *ident);
 
@@ -390,7 +395,7 @@ struct _mesa_glsl_parse_state {
 
       /* ARB_shader_image_load_store */
       unsigned MaxImageUnits;
-      unsigned MaxCombinedImageUnitsAndFragmentOutputs;
+      unsigned MaxCombinedShaderOutputResources;
       unsigned MaxImageSamples;
       unsigned MaxVertexImageUniforms;
       unsigned MaxTessControlImageUniforms;
@@ -495,6 +500,8 @@ struct _mesa_glsl_parse_state {
    bool ARB_shader_bit_encoding_warn;
    bool ARB_shader_image_load_store_enable;
    bool ARB_shader_image_load_store_warn;
+   bool ARB_shader_image_size_enable;
+   bool ARB_shader_image_size_warn;
    bool ARB_shader_precision_enable;
    bool ARB_shader_precision_warn;
    bool ARB_shader_stencil_export_enable;
diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp
index c482fbfdfb2..f238513f174 100644
--- a/src/glsl/link_uniform_initializers.cpp
+++ b/src/glsl/link_uniform_initializers.cpp
@@ -101,8 +101,13 @@ copy_constant_to_storage(union gl_constant_value *storage,
    }
 }
 
+/**
+ * Initialize an opaque uniform from the value of an explicit binding
+ * qualifier specified in the shader.  Atomic counters are different because
+ * they have no storage and should be handled elsewhere.
+ */
 void
-set_sampler_binding(gl_shader_program *prog, const char *name, int binding)
+set_opaque_binding(gl_shader_program *prog, const char *name, int binding)
 {
    struct gl_uniform_storage *const storage =
       get_storage(prog->UniformStorage, prog->NumUniformStorage, name);
@@ -128,11 +133,20 @@ set_sampler_binding(gl_shader_program *prog, const char *name, int binding)
    for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
       gl_shader *shader = prog->_LinkedShaders[sh];
 
-      if (shader && storage->sampler[sh].active) {
-         for (unsigned i = 0; i < elements; i++) {
-            unsigned index = storage->sampler[sh].index + i;
+      if (shader) {
+         if (storage->type->base_type == GLSL_TYPE_SAMPLER &&
+             storage->sampler[sh].active) {
+            for (unsigned i = 0; i < elements; i++) {
+               const unsigned index = storage->sampler[sh].index + i;
+               shader->SamplerUnits[index] = storage->storage[i].i;
+            }
 
-            shader->SamplerUnits[index] = storage->storage[i].i;
+         } else if (storage->type->base_type == GLSL_TYPE_IMAGE &&
+                    storage->image[sh].active) {
+            for (unsigned i = 0; i < elements; i++) {
+               const unsigned index = storage->image[sh].index + i;
+               shader->ImageUnits[index] = storage->storage[i].i;
+            }
          }
       }
    }
@@ -268,8 +282,9 @@ link_set_uniform_initializers(struct gl_shader_program *prog,
          if (var->data.explicit_binding) {
             const glsl_type *const type = var->type;
 
-            if (type->without_array()->is_sampler()) {
-               linker::set_sampler_binding(prog, var->name, var->data.binding);
+            if (type->without_array()->is_sampler() ||
+                type->without_array()->is_image()) {
+               linker::set_opaque_binding(prog, var->name, var->data.binding);
             } else if (var->is_in_buffer_block()) {
                const glsl_type *const iface_type = var->get_interface_type();
 
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index a7cd82049bd..47f7d2574dd 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2900,7 +2900,7 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
       linker_error(prog, "Too many combined image uniforms\n");
 
    if (total_image_units + fragment_outputs >
-       ctx->Const.MaxCombinedImageUnitsAndFragmentOutputs)
+       ctx->Const.MaxCombinedShaderOutputResources)
       linker_error(prog, "Too many combined image uniforms and fragment outputs\n");
 }
 
diff --git a/src/glsl/lower_subroutine.cpp b/src/glsl/lower_subroutine.cpp
index b29912ad150..c1aed61a36a 100644
--- a/src/glsl/lower_subroutine.cpp
+++ b/src/glsl/lower_subroutine.cpp
@@ -98,7 +98,7 @@ lower_subroutine_visitor::visit_leave(ir_call *ir)
       else
          last_branch = if_tree(equal(subr_to_int(var), lc), new_call, last_branch);
 
-      if (s > 0)
+      if (return_deref && s > 0)
         return_deref = return_deref->clone(mem_ctx, NULL);
    }
    if (last_branch)
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 27dabd3b8f2..af97da9cc21 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -26,6 +26,7 @@
  */
 
 #include "glsl_to_nir.h"
+#include "nir_control_flow.h"
 #include "ir_visitor.h"
 #include "ir_hierarchical_visitor.h"
 #include "ir.h"
@@ -43,7 +44,7 @@ namespace {
 class nir_visitor : public ir_visitor
 {
 public:
-   nir_visitor(nir_shader *shader, struct gl_shader *sh, gl_shader_stage stage);
+   nir_visitor(nir_shader *shader, gl_shader *sh);
    ~nir_visitor();
 
    virtual void visit(ir_variable *);
@@ -86,7 +87,6 @@ private:
    struct gl_shader *sh;
 
    nir_shader *shader;
-   gl_shader_stage stage;
    nir_function_impl *impl;
    exec_list *cf_node_list;
    nir_instr *result; /* result of the expression tree last visited */
@@ -133,9 +133,9 @@ private:
 nir_shader *
 glsl_to_nir(struct gl_shader *sh, const nir_shader_compiler_options *options)
 {
-   nir_shader *shader = nir_shader_create(NULL, options);
+   nir_shader *shader = nir_shader_create(NULL, sh->Stage, options);
 
-   nir_visitor v1(shader, sh, sh->Stage);
+   nir_visitor v1(shader, sh);
    nir_function_visitor v2(&v1);
    v2.run(sh->ir);
    visit_exec_list(sh->ir, &v1);
@@ -143,13 +143,11 @@ glsl_to_nir(struct gl_shader *sh, const nir_shader_compiler_options *options)
    return shader;
 }
 
-nir_visitor::nir_visitor(nir_shader *shader, struct gl_shader *sh,
-                         gl_shader_stage stage)
+nir_visitor::nir_visitor(nir_shader *shader, gl_shader *sh)
 {
    this->supports_ints = shader->options->native_integers;
    this->shader = shader;
    this->sh = sh;
-   this->stage = stage;
    this->is_global = true;
    this->var_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                              _mesa_key_pointer_equal);
@@ -266,7 +264,7 @@ nir_visitor::visit(ir_variable *ir)
       break;
 
    case ir_var_shader_in:
-      if (stage == MESA_SHADER_FRAGMENT &&
+      if (shader->stage == MESA_SHADER_FRAGMENT &&
           ir->data.location == VARYING_SLOT_FACE) {
          /* For whatever reason, GLSL IR makes gl_FrontFacing an input */
          var->data.location = SYSTEM_VALUE_FRONT_FACE;
@@ -646,6 +644,8 @@ nir_visitor::visit(ir_call *ir)
          op = nir_intrinsic_image_atomic_comp_swap;
       } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier") == 0) {
          op = nir_intrinsic_memory_barrier;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_size") == 0) {
+         op = nir_intrinsic_image_size;
       } else {
          unreachable("not reached");
       }
@@ -671,7 +671,8 @@ nir_visitor::visit(ir_call *ir)
       case nir_intrinsic_image_atomic_or:
       case nir_intrinsic_image_atomic_xor:
       case nir_intrinsic_image_atomic_exchange:
-      case nir_intrinsic_image_atomic_comp_swap: {
+      case nir_intrinsic_image_atomic_comp_swap:
+      case nir_intrinsic_image_size: {
          nir_ssa_undef_instr *instr_undef =
             nir_ssa_undef_instr_create(shader, 1);
          nir_instr_insert_after_cf_list(this->cf_node_list,
@@ -686,6 +687,17 @@ nir_visitor::visit(ir_call *ir)
          instr->variables[0] = evaluate_deref(&instr->instr, image);
          param = param->get_next();
 
+         /* Set the intrinsic destination. */
+         if (ir->return_deref) {
+            const nir_intrinsic_info *info =
+                    &nir_intrinsic_infos[instr->intrinsic];
+            nir_ssa_dest_init(&instr->instr, &instr->dest,
+                              info->dest_components, NULL);
+         }
+
+         if (op == nir_intrinsic_image_size)
+            break;
+
          /* Set the address argument, extending the coordinate vector to four
           * components.
           */
@@ -726,11 +738,6 @@ nir_visitor::visit(ir_call *ir)
             instr->src[3] = evaluate_rvalue((ir_dereference *)param);
             param = param->get_next();
          }
-
-         /* Set the intrinsic destination. */
-         if (ir->return_deref)
-            nir_ssa_dest_init(&instr->instr, &instr->dest,
-                              ir->return_deref->type->vector_elements, NULL);
          break;
       }
       case nir_intrinsic_memory_barrier:
diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index 2f7cbae42be..77cc4f078a3 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -26,10 +26,13 @@
  */
 
 #include "nir.h"
+#include "nir_control_flow_private.h"
 #include <assert.h>
 
 nir_shader *
-nir_shader_create(void *mem_ctx, const nir_shader_compiler_options *options)
+nir_shader_create(void *mem_ctx,
+                  gl_shader_stage stage,
+                  const nir_shader_compiler_options *options)
 {
    nir_shader *shader = ralloc(mem_ctx, nir_shader);
 
@@ -49,6 +52,8 @@ nir_shader_create(void *mem_ctx, const nir_shader_compiler_options *options)
    shader->num_outputs = 0;
    shader->num_uniforms = 0;
 
+   shader->stage = stage;
+
    return shader;
 }
 
@@ -180,11 +185,6 @@ nir_alu_dest_copy(nir_alu_dest *dest, const nir_alu_dest *src, void *mem_ctx)
    dest->saturate = src->saturate;
 }
 
-static inline void
-block_add_pred(nir_block *block, nir_block *pred)
-{
-   _mesa_set_add(block->predecessors, pred);
-}
 
 static void
 cf_init(nir_cf_node *node, nir_cf_node_type type)
@@ -194,45 +194,6 @@ cf_init(nir_cf_node *node, nir_cf_node_type type)
    node->type = type;
 }
 
-static void
-link_blocks(nir_block *pred, nir_block *succ1, nir_block *succ2)
-{
-   pred->successors[0] = succ1;
-   block_add_pred(succ1, pred);
-
-   pred->successors[1] = succ2;
-   if (succ2 != NULL)
-      block_add_pred(succ2, pred);
-}
-
-static void
-unlink_blocks(nir_block *pred, nir_block *succ)
-{
-   if (pred->successors[0] == succ) {
-      pred->successors[0] = pred->successors[1];
-      pred->successors[1] = NULL;
-   } else {
-      assert(pred->successors[1] == succ);
-      pred->successors[1] = NULL;
-   }
-
-   struct set_entry *entry = _mesa_set_search(succ->predecessors, pred);
-
-   assert(entry);
-
-   _mesa_set_remove(succ->predecessors, entry);
-}
-
-static void
-unlink_block_successors(nir_block *block)
-{
-   if (block->successors[0] != NULL)
-      unlink_blocks(block, block->successors[0]);
-   if (block->successors[1] != NULL)
-      unlink_blocks(block, block->successors[1]);
-}
-
-
 nir_function_impl *
 nir_function_impl_create(nir_function_overload *overload)
 {
@@ -262,14 +223,12 @@ nir_function_impl_create(nir_function_overload *overload)
    nir_block *end_block = nir_block_create(mem_ctx);
    start_block->cf_node.parent = &impl->cf_node;
    end_block->cf_node.parent = &impl->cf_node;
-   impl->start_block = start_block;
    impl->end_block = end_block;
 
    exec_list_push_tail(&impl->body, &start_block->cf_node.node);
 
    start_block->successors[0] = end_block;
-   block_add_pred(end_block, start_block);
-
+   _mesa_set_add(end_block->predecessors, start_block);
    return impl;
 }
 
@@ -335,7 +294,7 @@ nir_loop_create(void *mem_ctx)
    body->cf_node.parent = &loop->cf_node;
 
    body->successors[0] = body;
-   block_add_pred(body, body);
+   _mesa_set_add(body->predecessors, body);
 
    return loop;
 }
@@ -646,250 +605,6 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref)
    return load;
 }
 
-/**
- * \name Control flow modification
- *
- * These functions modify the control flow tree while keeping the control flow
- * graph up-to-date. The invariants respected are:
- * 1. Each then statement, else statement, or loop body must have at least one
- *    control flow node.
- * 2. Each if-statement and loop must have one basic block before it and one
- *    after.
- * 3. Two basic blocks cannot be directly next to each other.
- * 4. If a basic block has a jump instruction, there must be only one and it
- *    must be at the end of the block.
- * 5. The CFG must always be connected - this means that we must insert a fake
- *    CFG edge for loops with no break statement.
- *
- * The purpose of the second one is so that we have places to insert code during
- * GCM, as well as eliminating the possibility of critical edges.
- */
-/*@{*/
-
-static void
-link_non_block_to_block(nir_cf_node *node, nir_block *block)
-{
-   if (node->type == nir_cf_node_if) {
-      /*
-       * We're trying to link an if to a block after it; this just means linking
-       * the last block of the then and else branches.
-       */
-
-      nir_if *if_stmt = nir_cf_node_as_if(node);
-
-      nir_cf_node *last_then = nir_if_last_then_node(if_stmt);
-      assert(last_then->type == nir_cf_node_block);
-      nir_block *last_then_block = nir_cf_node_as_block(last_then);
-
-      nir_cf_node *last_else = nir_if_last_else_node(if_stmt);
-      assert(last_else->type == nir_cf_node_block);
-      nir_block *last_else_block = nir_cf_node_as_block(last_else);
-
-      if (exec_list_is_empty(&last_then_block->instr_list) ||
-          nir_block_last_instr(last_then_block)->type != nir_instr_type_jump) {
-         unlink_block_successors(last_then_block);
-         link_blocks(last_then_block, block, NULL);
-      }
-
-      if (exec_list_is_empty(&last_else_block->instr_list) ||
-          nir_block_last_instr(last_else_block)->type != nir_instr_type_jump) {
-         unlink_block_successors(last_else_block);
-         link_blocks(last_else_block, block, NULL);
-      }
-   } else {
-      assert(node->type == nir_cf_node_loop);
-
-      /*
-       * We can only get to this codepath if we're inserting a new loop, or
-       * at least a loop with no break statements; we can't insert break
-       * statements into a loop when we haven't inserted it into the CFG
-       * because we wouldn't know which block comes after the loop
-       * and therefore, which block should be the successor of the block with
-       * the break). Therefore, we need to insert a fake edge (see invariant
-       * #5).
-       */
-
-      nir_loop *loop = nir_cf_node_as_loop(node);
-
-      nir_cf_node *last = nir_loop_last_cf_node(loop);
-      assert(last->type == nir_cf_node_block);
-      nir_block *last_block =  nir_cf_node_as_block(last);
-
-      last_block->successors[1] = block;
-      block_add_pred(block, last_block);
-   }
-}
-
-static void
-link_block_to_non_block(nir_block *block, nir_cf_node *node)
-{
-   if (node->type == nir_cf_node_if) {
-      /*
-       * We're trying to link a block to an if after it; this just means linking
-       * the block to the first block of the then and else branches.
-       */
-
-      nir_if *if_stmt = nir_cf_node_as_if(node);
-
-      nir_cf_node *first_then = nir_if_first_then_node(if_stmt);
-      assert(first_then->type == nir_cf_node_block);
-      nir_block *first_then_block = nir_cf_node_as_block(first_then);
-
-      nir_cf_node *first_else = nir_if_first_else_node(if_stmt);
-      assert(first_else->type == nir_cf_node_block);
-      nir_block *first_else_block = nir_cf_node_as_block(first_else);
-
-      unlink_block_successors(block);
-      link_blocks(block, first_then_block, first_else_block);
-   } else {
-      /*
-       * For similar reasons as the corresponding case in
-       * link_non_block_to_block(), don't worry about if the loop header has
-       * any predecessors that need to be unlinked.
-       */
-
-      assert(node->type == nir_cf_node_loop);
-
-      nir_loop *loop = nir_cf_node_as_loop(node);
-
-      nir_cf_node *loop_header = nir_loop_first_cf_node(loop);
-      assert(loop_header->type == nir_cf_node_block);
-      nir_block *loop_header_block = nir_cf_node_as_block(loop_header);
-
-      unlink_block_successors(block);
-      link_blocks(block, loop_header_block, NULL);
-   }
-
-}
-
-/**
- * Takes a basic block and inserts a new empty basic block before it, making its
- * predecessors point to the new block. This essentially splits the block into
- * an empty header and a body so that another non-block CF node can be inserted
- * between the two. Note that this does *not* link the two basic blocks, so
- * some kind of cleanup *must* be performed after this call.
- */
-
-static nir_block *
-split_block_beginning(nir_block *block)
-{
-   nir_block *new_block = nir_block_create(ralloc_parent(block));
-   new_block->cf_node.parent = block->cf_node.parent;
-   exec_node_insert_node_before(&block->cf_node.node, &new_block->cf_node.node);
-
-   struct set_entry *entry;
-   set_foreach(block->predecessors, entry) {
-      nir_block *pred = (nir_block *) entry->key;
-
-      unlink_blocks(pred, block);
-      link_blocks(pred, new_block, NULL);
-   }
-
-   return new_block;
-}
-
-static void
-rewrite_phi_preds(nir_block *block, nir_block *old_pred, nir_block *new_pred)
-{
-   nir_foreach_instr_safe(block, instr) {
-      if (instr->type != nir_instr_type_phi)
-         break;
-
-      nir_phi_instr *phi = nir_instr_as_phi(instr);
-      nir_foreach_phi_src(phi, src) {
-         if (src->pred == old_pred) {
-            src->pred = new_pred;
-            break;
-         }
-      }
-   }
-}
-
-/**
- * Moves the successors of source to the successors of dest, leaving both
- * successors of source NULL.
- */
-
-static void
-move_successors(nir_block *source, nir_block *dest)
-{
-   nir_block *succ1 = source->successors[0];
-   nir_block *succ2 = source->successors[1];
-
-   if (succ1) {
-      unlink_blocks(source, succ1);
-      rewrite_phi_preds(succ1, source, dest);
-   }
-
-   if (succ2) {
-      unlink_blocks(source, succ2);
-      rewrite_phi_preds(succ2, source, dest);
-   }
-
-   unlink_block_successors(dest);
-   link_blocks(dest, succ1, succ2);
-}
-
-static nir_block *
-split_block_end(nir_block *block)
-{
-   nir_block *new_block = nir_block_create(ralloc_parent(block));
-   new_block->cf_node.parent = block->cf_node.parent;
-   exec_node_insert_after(&block->cf_node.node, &new_block->cf_node.node);
-
-   move_successors(block, new_block);
-
-   return new_block;
-}
-
-/**
- * Inserts a non-basic block between two basic blocks and links them together.
- */
-
-static void
-insert_non_block(nir_block *before, nir_cf_node *node, nir_block *after)
-{
-   node->parent = before->cf_node.parent;
-   exec_node_insert_after(&before->cf_node.node, &node->node);
-   link_block_to_non_block(before, node);
-   link_non_block_to_block(node, after);
-}
-
-/**
- * Inserts a non-basic block before a basic block.
- */
-
-static void
-insert_non_block_before_block(nir_cf_node *node, nir_block *block)
-{
-   /* split off the beginning of block into new_block */
-   nir_block *new_block = split_block_beginning(block);
-
-   /* insert our node in between new_block and block */
-   insert_non_block(new_block, node, block);
-}
-
-static void
-insert_non_block_after_block(nir_block *block, nir_cf_node *node)
-{
-   /* split off the end of block into new_block */
-   nir_block *new_block = split_block_end(block);
-
-   /* insert our node in between block and new_block */
-   insert_non_block(block, node, new_block);
-}
-
-/* walk up the control flow tree to find the innermost enclosed loop */
-static nir_loop *
-nearest_loop(nir_cf_node *node)
-{
-   while (node->type != nir_cf_node_loop) {
-      node = node->parent;
-   }
-
-   return nir_cf_node_as_loop(node);
-}
-
 nir_function_impl *
 nir_cf_node_get_function(nir_cf_node *node)
 {
@@ -900,384 +615,6 @@ nir_cf_node_get_function(nir_cf_node *node)
    return nir_cf_node_as_function(node);
 }
 
-/*
- * update the CFG after a jump instruction has been added to the end of a block
- */
-
-static void
-handle_jump(nir_block *block)
-{
-   nir_instr *instr = nir_block_last_instr(block);
-   nir_jump_instr *jump_instr = nir_instr_as_jump(instr);
-
-   unlink_block_successors(block);
-
-   nir_function_impl *impl = nir_cf_node_get_function(&block->cf_node);
-   nir_metadata_preserve(impl, nir_metadata_none);
-
-   if (jump_instr->type == nir_jump_break ||
-       jump_instr->type == nir_jump_continue) {
-      nir_loop *loop = nearest_loop(&block->cf_node);
-
-      if (jump_instr->type == nir_jump_continue) {
-         nir_cf_node *first_node = nir_loop_first_cf_node(loop);
-         assert(first_node->type == nir_cf_node_block);
-         nir_block *first_block = nir_cf_node_as_block(first_node);
-         link_blocks(block, first_block, NULL);
-      } else {
-         nir_cf_node *after = nir_cf_node_next(&loop->cf_node);
-         assert(after->type == nir_cf_node_block);
-         nir_block *after_block = nir_cf_node_as_block(after);
-         link_blocks(block, after_block, NULL);
-
-         /* If we inserted a fake link, remove it */
-         nir_cf_node *last = nir_loop_last_cf_node(loop);
-         assert(last->type == nir_cf_node_block);
-         nir_block *last_block =  nir_cf_node_as_block(last);
-         if (last_block->successors[1] != NULL)
-            unlink_blocks(last_block, after_block);
-      }
-   } else {
-      assert(jump_instr->type == nir_jump_return);
-      link_blocks(block, impl->end_block, NULL);
-   }
-}
-
-static void
-handle_remove_jump(nir_block *block, nir_jump_type type)
-{
-   unlink_block_successors(block);
-
-   if (exec_node_is_tail_sentinel(block->cf_node.node.next)) {
-      nir_cf_node *parent = block->cf_node.parent;
-      if (parent->type == nir_cf_node_if) {
-         nir_cf_node *next = nir_cf_node_next(parent);
-         assert(next->type == nir_cf_node_block);
-         nir_block *next_block = nir_cf_node_as_block(next);
-
-         link_blocks(block, next_block, NULL);
-      } else {
-         assert(parent->type == nir_cf_node_loop);
-         nir_loop *loop = nir_cf_node_as_loop(parent);
-
-         nir_cf_node *head = nir_loop_first_cf_node(loop);
-         assert(head->type == nir_cf_node_block);
-         nir_block *head_block = nir_cf_node_as_block(head);
-
-         link_blocks(block, head_block, NULL);
-      }
-   } else {
-      nir_cf_node *next = nir_cf_node_next(&block->cf_node);
-      if (next->type == nir_cf_node_if) {
-         nir_if *next_if = nir_cf_node_as_if(next);
-
-         nir_cf_node *first_then = nir_if_first_then_node(next_if);
-         assert(first_then->type == nir_cf_node_block);
-         nir_block *first_then_block = nir_cf_node_as_block(first_then);
-
-         nir_cf_node *first_else = nir_if_first_else_node(next_if);
-         assert(first_else->type == nir_cf_node_block);
-         nir_block *first_else_block = nir_cf_node_as_block(first_else);
-
-         link_blocks(block, first_then_block, first_else_block);
-      } else {
-         assert(next->type == nir_cf_node_loop);
-         nir_loop *next_loop = nir_cf_node_as_loop(next);
-
-         nir_cf_node *first = nir_loop_first_cf_node(next_loop);
-         assert(first->type == nir_cf_node_block);
-         nir_block *first_block = nir_cf_node_as_block(first);
-
-         link_blocks(block, first_block, NULL);
-      }
-   }
-
-   if (type == nir_jump_break) {
-      nir_loop *loop = nearest_loop(&block->cf_node);
-
-      nir_cf_node *next = nir_cf_node_next(&loop->cf_node);
-      assert(next->type == nir_cf_node_block);
-      nir_block *next_block = nir_cf_node_as_block(next);
-
-      if (next_block->predecessors->entries == 0) {
-         /* insert fake link */
-         nir_cf_node *last = nir_loop_last_cf_node(loop);
-         assert(last->type == nir_cf_node_block);
-         nir_block *last_block = nir_cf_node_as_block(last);
-
-         last_block->successors[1] = next_block;
-         block_add_pred(next_block, last_block);
-      }
-   }
-
-   nir_function_impl *impl = nir_cf_node_get_function(&block->cf_node);
-   nir_metadata_preserve(impl, nir_metadata_none);
-}
-
-/**
- * Inserts a basic block before another by merging the instructions.
- *
- * @param block the target of the insertion
- * @param before the block to be inserted - must not have been inserted before
- * @param has_jump whether \before has a jump instruction at the end
- */
-
-static void
-insert_block_before_block(nir_block *block, nir_block *before, bool has_jump)
-{
-   assert(!has_jump || exec_list_is_empty(&block->instr_list));
-
-   foreach_list_typed(nir_instr, instr, node, &before->instr_list) {
-      instr->block = block;
-   }
-
-   exec_list_prepend(&block->instr_list, &before->instr_list);
-
-   if (has_jump)
-      handle_jump(block);
-}
-
-/**
- * Inserts a basic block after another by merging the instructions.
- *
- * @param block the target of the insertion
- * @param after the block to be inserted - must not have been inserted before
- * @param has_jump whether \after has a jump instruction at the end
- */
-
-static void
-insert_block_after_block(nir_block *block, nir_block *after, bool has_jump)
-{
-   foreach_list_typed(nir_instr, instr, node, &after->instr_list) {
-      instr->block = block;
-   }
-
-   exec_list_append(&block->instr_list, &after->instr_list);
-
-   if (has_jump)
-      handle_jump(block);
-}
-
-static void
-update_if_uses(nir_cf_node *node)
-{
-   if (node->type != nir_cf_node_if)
-      return;
-
-   nir_if *if_stmt = nir_cf_node_as_if(node);
-
-   if_stmt->condition.parent_if = if_stmt;
-   if (if_stmt->condition.is_ssa) {
-      list_addtail(&if_stmt->condition.use_link,
-                   &if_stmt->condition.ssa->if_uses);
-   } else {
-      list_addtail(&if_stmt->condition.use_link,
-                   &if_stmt->condition.reg.reg->if_uses);
-   }
-}
-
-void
-nir_cf_node_insert_after(nir_cf_node *node, nir_cf_node *after)
-{
-   update_if_uses(after);
-
-   if (after->type == nir_cf_node_block) {
-      /*
-       * either node or the one after it must be a basic block, by invariant #2;
-       * in either case, just merge the blocks together.
-       */
-      nir_block *after_block = nir_cf_node_as_block(after);
-
-      bool has_jump = !exec_list_is_empty(&after_block->instr_list) &&
-         nir_block_last_instr(after_block)->type == nir_instr_type_jump;
-
-      if (node->type == nir_cf_node_block) {
-         insert_block_after_block(nir_cf_node_as_block(node), after_block,
-                                  has_jump);
-      } else {
-         nir_cf_node *next = nir_cf_node_next(node);
-         assert(next->type == nir_cf_node_block);
-         nir_block *next_block = nir_cf_node_as_block(next);
-
-         insert_block_before_block(next_block, after_block, has_jump);
-      }
-   } else {
-      if (node->type == nir_cf_node_block) {
-         insert_non_block_after_block(nir_cf_node_as_block(node), after);
-      } else {
-         /*
-          * We have to insert a non-basic block after a non-basic block. Since
-          * every non-basic block has a basic block after it, this is equivalent
-          * to inserting a non-basic block before a basic block.
-          */
-
-         nir_cf_node *next = nir_cf_node_next(node);
-         assert(next->type == nir_cf_node_block);
-         nir_block *next_block = nir_cf_node_as_block(next);
-
-         insert_non_block_before_block(after, next_block);
-      }
-   }
-
-   nir_function_impl *impl = nir_cf_node_get_function(node);
-   nir_metadata_preserve(impl, nir_metadata_none);
-}
-
-void
-nir_cf_node_insert_before(nir_cf_node *node, nir_cf_node *before)
-{
-   update_if_uses(before);
-
-   if (before->type == nir_cf_node_block) {
-      nir_block *before_block = nir_cf_node_as_block(before);
-
-      bool has_jump = !exec_list_is_empty(&before_block->instr_list) &&
-         nir_block_last_instr(before_block)->type == nir_instr_type_jump;
-
-      if (node->type == nir_cf_node_block) {
-         insert_block_before_block(nir_cf_node_as_block(node), before_block,
-                                   has_jump);
-      } else {
-         nir_cf_node *prev = nir_cf_node_prev(node);
-         assert(prev->type == nir_cf_node_block);
-         nir_block *prev_block = nir_cf_node_as_block(prev);
-
-         insert_block_after_block(prev_block, before_block, has_jump);
-      }
-   } else {
-      if (node->type == nir_cf_node_block) {
-         insert_non_block_before_block(before, nir_cf_node_as_block(node));
-      } else {
-         /*
-          * We have to insert a non-basic block before a non-basic block. This
-          * is equivalent to inserting a non-basic block after a basic block.
-          */
-
-         nir_cf_node *prev_node = nir_cf_node_prev(node);
-         assert(prev_node->type == nir_cf_node_block);
-         nir_block *prev_block = nir_cf_node_as_block(prev_node);
-
-         insert_non_block_after_block(prev_block, before);
-      }
-   }
-
-   nir_function_impl *impl = nir_cf_node_get_function(node);
-   nir_metadata_preserve(impl, nir_metadata_none);
-}
-
-void
-nir_cf_node_insert_begin(struct exec_list *list, nir_cf_node *node)
-{
-   nir_cf_node *begin = exec_node_data(nir_cf_node, list->head, node);
-   nir_cf_node_insert_before(begin, node);
-}
-
-void
-nir_cf_node_insert_end(struct exec_list *list, nir_cf_node *node)
-{
-   nir_cf_node *end = exec_node_data(nir_cf_node, list->tail_pred, node);
-   nir_cf_node_insert_after(end, node);
-}
-
-/**
- * Stitch two basic blocks together into one. The aggregate must have the same
- * predecessors as the first and the same successors as the second.
- */
-
-static void
-stitch_blocks(nir_block *before, nir_block *after)
-{
-   /*
-    * We move after into before, so we have to deal with up to 2 successors vs.
-    * possibly a large number of predecessors.
-    *
-    * TODO: special case when before is empty and after isn't?
-    */
-
-   move_successors(after, before);
-
-   foreach_list_typed(nir_instr, instr, node, &after->instr_list) {
-      instr->block = before;
-   }
-
-   exec_list_append(&before->instr_list, &after->instr_list);
-   exec_node_remove(&after->cf_node.node);
-}
-
-static void
-remove_defs_uses(nir_instr *instr);
-
-static void
-cleanup_cf_node(nir_cf_node *node)
-{
-   switch (node->type) {
-   case nir_cf_node_block: {
-      nir_block *block = nir_cf_node_as_block(node);
-      /* We need to walk the instructions and clean up defs/uses */
-      nir_foreach_instr(block, instr)
-         remove_defs_uses(instr);
-      break;
-   }
-
-   case nir_cf_node_if: {
-      nir_if *if_stmt = nir_cf_node_as_if(node);
-      foreach_list_typed(nir_cf_node, child, node, &if_stmt->then_list)
-         cleanup_cf_node(child);
-      foreach_list_typed(nir_cf_node, child, node, &if_stmt->else_list)
-         cleanup_cf_node(child);
-
-      list_del(&if_stmt->condition.use_link);
-      break;
-   }
-
-   case nir_cf_node_loop: {
-      nir_loop *loop = nir_cf_node_as_loop(node);
-      foreach_list_typed(nir_cf_node, child, node, &loop->body)
-         cleanup_cf_node(child);
-      break;
-   }
-   case nir_cf_node_function: {
-      nir_function_impl *impl = nir_cf_node_as_function(node);
-      foreach_list_typed(nir_cf_node, child, node, &impl->body)
-         cleanup_cf_node(child);
-      break;
-   }
-   default:
-      unreachable("Invalid CF node type");
-   }
-}
-
-void
-nir_cf_node_remove(nir_cf_node *node)
-{
-   nir_function_impl *impl = nir_cf_node_get_function(node);
-   nir_metadata_preserve(impl, nir_metadata_none);
-
-   if (node->type == nir_cf_node_block) {
-      /*
-       * Basic blocks can't really be removed by themselves, since they act as
-       * padding between the non-basic blocks. So all we do here is empty the
-       * block of instructions.
-       *
-       * TODO: could we assert here?
-       */
-      exec_list_make_empty(&nir_cf_node_as_block(node)->instr_list);
-   } else {
-      nir_cf_node *before = nir_cf_node_prev(node);
-      assert(before->type == nir_cf_node_block);
-      nir_block *before_block = nir_cf_node_as_block(before);
-
-      nir_cf_node *after = nir_cf_node_next(node);
-      assert(after->type == nir_cf_node_block);
-      nir_block *after_block = nir_cf_node_as_block(after);
-
-      exec_node_remove(&node->node);
-      stitch_blocks(before_block, after_block);
-   }
-
-   cleanup_cf_node(node);
-}
-
 static bool
 add_use_cb(nir_src *src, void *state)
 {
@@ -1348,7 +685,7 @@ nir_instr_insert_after(nir_instr *instr, nir_instr *after)
    exec_node_insert_after(&instr->node, &after->node);
 
    if (after->type == nir_instr_type_jump)
-      handle_jump(after->block);
+      nir_handle_add_jump(after->block);
 }
 
 void
@@ -1362,7 +699,7 @@ nir_instr_insert_before_block(nir_block *block, nir_instr *before)
    exec_list_push_head(&block->instr_list, &before->node);
 
    if (before->type == nir_instr_type_jump)
-      handle_jump(block);
+      nir_handle_add_jump(block);
 }
 
 void
@@ -1378,7 +715,7 @@ nir_instr_insert_after_block(nir_block *block, nir_instr *after)
    exec_list_push_tail(&block->instr_list, &after->node);
 
    if (after->type == nir_instr_type_jump)
-      handle_jump(block);
+      nir_handle_add_jump(block);
 }
 
 void
@@ -1456,7 +793,7 @@ void nir_instr_remove(nir_instr *instr)
 
    if (instr->type == nir_instr_type_jump) {
       nir_jump_instr *jump_instr = nir_instr_as_jump(instr);
-      handle_remove_jump(instr->block, jump_instr->type);
+      nir_handle_remove_jump(instr->block, jump_instr->type);
    }
 }
 
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 70af06e6971..f78596d5cc0 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1122,6 +1122,8 @@ typedef struct {
 
 #define nir_foreach_phi_src(phi, entry) \
    foreach_list_typed(nir_phi_src, entry, node, &(phi)->srcs)
+#define nir_foreach_phi_src_safe(phi, entry) \
+   foreach_list_typed_safe(nir_phi_src, entry, node, &(phi)->srcs)
 
 typedef struct {
    nir_instr instr;
@@ -1330,7 +1332,7 @@ typedef struct {
 
    struct exec_list body; /** < list of nir_cf_node */
 
-   nir_block *start_block, *end_block;
+   nir_block *end_block;
 
    /** list for all local variables in the function */
    struct exec_list locals;
@@ -1357,6 +1359,12 @@ typedef struct {
    nir_metadata valid_metadata;
 } nir_function_impl;
 
+static inline nir_block *
+nir_start_block(nir_function_impl *impl)
+{
+   return (nir_block *) exec_list_get_head(&impl->body);
+}
+
 static inline nir_cf_node *
 nir_cf_node_next(nir_cf_node *node)
 {
@@ -1488,8 +1496,8 @@ typedef struct nir_shader {
     */
    unsigned num_inputs, num_uniforms, num_outputs;
 
-   /** the number of uniforms that are only accessed directly */
-   unsigned num_direct_uniforms;
+   /** The shader stage, such as MESA_SHADER_VERTEX. */
+   gl_shader_stage stage;
 } nir_shader;
 
 #define nir_foreach_overload(shader, overload)                        \
@@ -1498,6 +1506,7 @@ typedef struct nir_shader {
                          &(func)->overload_list)
 
 nir_shader *nir_shader_create(void *mem_ctx,
+                              gl_shader_stage stage,
                               const nir_shader_compiler_options *options);
 
 /** creates a register, including assigning it an index and adding it to the list */
@@ -1521,21 +1530,6 @@ nir_loop *nir_loop_create(void *mem_ctx);
 
 nir_function_impl *nir_cf_node_get_function(nir_cf_node *node);
 
-/** puts a control flow node immediately after another control flow node */
-void nir_cf_node_insert_after(nir_cf_node *node, nir_cf_node *after);
-
-/** puts a control flow node immediately before another control flow node */
-void nir_cf_node_insert_before(nir_cf_node *node, nir_cf_node *before);
-
-/** puts a control flow node at the beginning of a list from an if, loop, or function */
-void nir_cf_node_insert_begin(struct exec_list *list, nir_cf_node *node);
-
-/** puts a control flow node at the end of a list from an if, loop, or function */
-void nir_cf_node_insert_end(struct exec_list *list, nir_cf_node *node);
-
-/** removes a control flow node, doing any cleanup necessary */
-void nir_cf_node_remove(nir_cf_node *node);
-
 /** requests that the given pieces of metadata be generated */
 void nir_metadata_require(nir_function_impl *impl, nir_metadata required);
 /** dirties all but the preserved metadata */
@@ -1660,15 +1654,10 @@ void nir_lower_locals_to_regs(nir_shader *shader);
 
 void nir_assign_var_locations(struct exec_list *var_list,
                               unsigned *size,
-                              bool is_scalar);
-void nir_assign_var_locations_direct_first(nir_shader *shader,
-                                           struct exec_list *var_list,
-                                           unsigned *direct_size,
-                                           unsigned *size,
-                                           bool is_scalar);
-
-void nir_lower_io(nir_shader *shader, bool is_scalar);
+                              int (*type_size)(const struct glsl_type *));
 
+void nir_lower_io(nir_shader *shader,
+                  int (*type_size)(const struct glsl_type *));
 void nir_lower_vars_to_ssa(nir_shader *shader);
 
 void nir_remove_dead_variables(nir_shader *shader);
@@ -1680,8 +1669,7 @@ void nir_lower_load_const_to_scalar(nir_shader *shader);
 void nir_lower_phis_to_scalar(nir_shader *shader);
 
 void nir_lower_samplers(nir_shader *shader,
-                        const struct gl_shader_program *shader_program,
-                        gl_shader_stage stage);
+                        const struct gl_shader_program *shader_program);
 void nir_lower_samplers_for_vk(nir_shader *shader);
 
 void nir_lower_system_values(nir_shader *shader);
diff --git a/src/glsl/nir/nir_control_flow.c b/src/glsl/nir/nir_control_flow.c
new file mode 100644
index 00000000000..5c03375ac77
--- /dev/null
+++ b/src/glsl/nir/nir_control_flow.c
@@ -0,0 +1,769 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Connor Abbott ([email protected])
+ *
+ */
+
+#include "nir_control_flow_private.h"
+
+/**
+ * \name Control flow modification
+ *
+ * These functions modify the control flow tree while keeping the control flow
+ * graph up-to-date. The invariants respected are:
+ * 1. Each then statement, else statement, or loop body must have at least one
+ *    control flow node.
+ * 2. Each if-statement and loop must have one basic block before it and one
+ *    after.
+ * 3. Two basic blocks cannot be directly next to each other.
+ * 4. If a basic block has a jump instruction, there must be only one and it
+ *    must be at the end of the block.
+ * 5. The CFG must always be connected - this means that we must insert a fake
+ *    CFG edge for loops with no break statement.
+ *
+ * The purpose of the second one is so that we have places to insert code during
+ * GCM, as well as eliminating the possibility of critical edges.
+ */
+/*@{*/
+
+static bool
+block_ends_in_jump(nir_block *block)
+{
+   return !exec_list_is_empty(&block->instr_list) &&
+          nir_block_last_instr(block)->type == nir_instr_type_jump;
+}
+
+static inline void
+block_add_pred(nir_block *block, nir_block *pred)
+{
+   _mesa_set_add(block->predecessors, pred);
+}
+
+static void
+link_blocks(nir_block *pred, nir_block *succ1, nir_block *succ2)
+{
+   pred->successors[0] = succ1;
+   if (succ1 != NULL)
+      block_add_pred(succ1, pred);
+
+   pred->successors[1] = succ2;
+   if (succ2 != NULL)
+      block_add_pred(succ2, pred);
+}
+
+static void
+unlink_blocks(nir_block *pred, nir_block *succ)
+{
+   if (pred->successors[0] == succ) {
+      pred->successors[0] = pred->successors[1];
+      pred->successors[1] = NULL;
+   } else {
+      assert(pred->successors[1] == succ);
+      pred->successors[1] = NULL;
+   }
+
+   struct set_entry *entry = _mesa_set_search(succ->predecessors, pred);
+
+   assert(entry);
+
+   _mesa_set_remove(succ->predecessors, entry);
+}
+
+static void
+unlink_block_successors(nir_block *block)
+{
+   if (block->successors[0] != NULL)
+      unlink_blocks(block, block->successors[0]);
+   if (block->successors[1] != NULL)
+      unlink_blocks(block, block->successors[1]);
+}
+
+static void
+link_non_block_to_block(nir_cf_node *node, nir_block *block)
+{
+   if (node->type == nir_cf_node_if) {
+      /*
+       * We're trying to link an if to a block after it; this just means linking
+       * the last block of the then and else branches.
+       */
+
+      nir_if *if_stmt = nir_cf_node_as_if(node);
+
+      nir_cf_node *last_then = nir_if_last_then_node(if_stmt);
+      assert(last_then->type == nir_cf_node_block);
+      nir_block *last_then_block = nir_cf_node_as_block(last_then);
+
+      nir_cf_node *last_else = nir_if_last_else_node(if_stmt);
+      assert(last_else->type == nir_cf_node_block);
+      nir_block *last_else_block = nir_cf_node_as_block(last_else);
+
+      if (!block_ends_in_jump(last_then_block)) {
+         unlink_block_successors(last_then_block);
+         link_blocks(last_then_block, block, NULL);
+      }
+
+      if (!block_ends_in_jump(last_else_block)) {
+         unlink_block_successors(last_else_block);
+         link_blocks(last_else_block, block, NULL);
+      }
+   } else {
+      assert(node->type == nir_cf_node_loop);
+
+      /*
+       * We can only get to this codepath if we're inserting a new loop, or
+       * at least a loop with no break statements; we can't insert break
+       * statements into a loop when we haven't inserted it into the CFG
+       * because we wouldn't know which block comes after the loop
+       * and therefore, which block should be the successor of the block with
+       * the break). Therefore, we need to insert a fake edge (see invariant
+       * #5).
+       */
+
+      nir_loop *loop = nir_cf_node_as_loop(node);
+
+      nir_cf_node *last = nir_loop_last_cf_node(loop);
+      assert(last->type == nir_cf_node_block);
+      nir_block *last_block =  nir_cf_node_as_block(last);
+
+      last_block->successors[1] = block;
+      block_add_pred(block, last_block);
+   }
+}
+
+static void
+link_block_to_non_block(nir_block *block, nir_cf_node *node)
+{
+   if (node->type == nir_cf_node_if) {
+      /*
+       * We're trying to link a block to an if after it; this just means linking
+       * the block to the first block of the then and else branches.
+       */
+
+      nir_if *if_stmt = nir_cf_node_as_if(node);
+
+      nir_cf_node *first_then = nir_if_first_then_node(if_stmt);
+      assert(first_then->type == nir_cf_node_block);
+      nir_block *first_then_block = nir_cf_node_as_block(first_then);
+
+      nir_cf_node *first_else = nir_if_first_else_node(if_stmt);
+      assert(first_else->type == nir_cf_node_block);
+      nir_block *first_else_block = nir_cf_node_as_block(first_else);
+
+      unlink_block_successors(block);
+      link_blocks(block, first_then_block, first_else_block);
+   } else {
+      /*
+       * For similar reasons as the corresponding case in
+       * link_non_block_to_block(), don't worry about if the loop header has
+       * any predecessors that need to be unlinked.
+       */
+
+      assert(node->type == nir_cf_node_loop);
+
+      nir_loop *loop = nir_cf_node_as_loop(node);
+
+      nir_cf_node *loop_header = nir_loop_first_cf_node(loop);
+      assert(loop_header->type == nir_cf_node_block);
+      nir_block *loop_header_block = nir_cf_node_as_block(loop_header);
+
+      unlink_block_successors(block);
+      link_blocks(block, loop_header_block, NULL);
+   }
+
+}
+
+/**
+ * Takes a basic block and inserts a new empty basic block before it, making its
+ * predecessors point to the new block. This essentially splits the block into
+ * an empty header and a body so that another non-block CF node can be inserted
+ * between the two. Note that this does *not* link the two basic blocks, so
+ * some kind of cleanup *must* be performed after this call.
+ */
+
+static nir_block *
+split_block_beginning(nir_block *block)
+{
+   nir_block *new_block = nir_block_create(ralloc_parent(block));
+   new_block->cf_node.parent = block->cf_node.parent;
+   exec_node_insert_node_before(&block->cf_node.node, &new_block->cf_node.node);
+
+   struct set_entry *entry;
+   set_foreach(block->predecessors, entry) {
+      nir_block *pred = (nir_block *) entry->key;
+
+      unlink_blocks(pred, block);
+      link_blocks(pred, new_block, NULL);
+   }
+
+   /* Any phi nodes must stay part of the new block, or else their
+    * sourcse will be messed up. This will reverse the order of the phi's, but
+    * order shouldn't matter.
+    */
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_phi)
+         break;
+
+      exec_node_remove(&instr->node);
+      instr->block = new_block;
+      exec_list_push_head(&new_block->instr_list, &instr->node);
+   }
+
+   return new_block;
+}
+
+static void
+rewrite_phi_preds(nir_block *block, nir_block *old_pred, nir_block *new_pred)
+{
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_phi)
+         break;
+
+      nir_phi_instr *phi = nir_instr_as_phi(instr);
+      nir_foreach_phi_src(phi, src) {
+         if (src->pred == old_pred) {
+            src->pred = new_pred;
+            break;
+         }
+      }
+   }
+}
+
+static void
+insert_phi_undef(nir_block *block, nir_block *pred)
+{
+   nir_function_impl *impl = nir_cf_node_get_function(&block->cf_node);
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_phi)
+         break;
+
+      nir_phi_instr *phi = nir_instr_as_phi(instr);
+      nir_ssa_undef_instr *undef =
+         nir_ssa_undef_instr_create(ralloc_parent(phi),
+                                    phi->dest.ssa.num_components);
+      nir_instr_insert_before_cf_list(&impl->body, &undef->instr);
+      nir_phi_src *src = ralloc(phi, nir_phi_src);
+      src->pred = pred;
+      src->src.parent_instr = &phi->instr;
+      src->src.is_ssa = true;
+      src->src.ssa = &undef->def;
+
+      list_addtail(&src->src.use_link, &undef->def.uses);
+
+      exec_list_push_tail(&phi->srcs, &src->node);
+   }
+}
+
+/**
+ * Moves the successors of source to the successors of dest, leaving both
+ * successors of source NULL.
+ */
+
+static void
+move_successors(nir_block *source, nir_block *dest)
+{
+   nir_block *succ1 = source->successors[0];
+   nir_block *succ2 = source->successors[1];
+
+   if (succ1) {
+      unlink_blocks(source, succ1);
+      rewrite_phi_preds(succ1, source, dest);
+   }
+
+   if (succ2) {
+      unlink_blocks(source, succ2);
+      rewrite_phi_preds(succ2, source, dest);
+   }
+
+   unlink_block_successors(dest);
+   link_blocks(dest, succ1, succ2);
+}
+
+/* Given a basic block with no successors that has been inserted into the
+ * control flow tree, gives it the successors it would normally have assuming
+ * it doesn't end in a jump instruction. Also inserts phi sources with undefs
+ * if necessary.
+ */
+static void
+block_add_normal_succs(nir_block *block)
+{
+   if (exec_node_is_tail_sentinel(block->cf_node.node.next)) {
+      nir_cf_node *parent = block->cf_node.parent;
+      if (parent->type == nir_cf_node_if) {
+         nir_cf_node *next = nir_cf_node_next(parent);
+         assert(next->type == nir_cf_node_block);
+         nir_block *next_block = nir_cf_node_as_block(next);
+
+         link_blocks(block, next_block, NULL);
+      } else {
+         assert(parent->type == nir_cf_node_loop);
+         nir_loop *loop = nir_cf_node_as_loop(parent);
+
+         nir_cf_node *head = nir_loop_first_cf_node(loop);
+         assert(head->type == nir_cf_node_block);
+         nir_block *head_block = nir_cf_node_as_block(head);
+
+         link_blocks(block, head_block, NULL);
+         insert_phi_undef(head_block, block);
+      }
+   } else {
+      nir_cf_node *next = nir_cf_node_next(&block->cf_node);
+      if (next->type == nir_cf_node_if) {
+         nir_if *next_if = nir_cf_node_as_if(next);
+
+         nir_cf_node *first_then = nir_if_first_then_node(next_if);
+         assert(first_then->type == nir_cf_node_block);
+         nir_block *first_then_block = nir_cf_node_as_block(first_then);
+
+         nir_cf_node *first_else = nir_if_first_else_node(next_if);
+         assert(first_else->type == nir_cf_node_block);
+         nir_block *first_else_block = nir_cf_node_as_block(first_else);
+
+         link_blocks(block, first_then_block, first_else_block);
+      } else {
+         assert(next->type == nir_cf_node_loop);
+         nir_loop *next_loop = nir_cf_node_as_loop(next);
+
+         nir_cf_node *first = nir_loop_first_cf_node(next_loop);
+         assert(first->type == nir_cf_node_block);
+         nir_block *first_block = nir_cf_node_as_block(first);
+
+         link_blocks(block, first_block, NULL);
+         insert_phi_undef(first_block, block);
+      }
+   }
+}
+
+static nir_block *
+split_block_end(nir_block *block)
+{
+   nir_block *new_block = nir_block_create(ralloc_parent(block));
+   new_block->cf_node.parent = block->cf_node.parent;
+   exec_node_insert_after(&block->cf_node.node, &new_block->cf_node.node);
+
+   if (block_ends_in_jump(block)) {
+      /* Figure out what successor block would've had if it didn't have a jump
+       * instruction, and make new_block have that successor.
+       */
+      block_add_normal_succs(new_block);
+   } else {
+      move_successors(block, new_block);
+   }
+
+   return new_block;
+}
+
+static nir_block *
+split_block_before_instr(nir_instr *instr)
+{
+   assert(instr->type != nir_instr_type_phi);
+   nir_block *new_block = split_block_beginning(instr->block);
+
+   nir_foreach_instr_safe(instr->block, cur_instr) {
+      if (cur_instr == instr)
+         break;
+
+      exec_node_remove(&cur_instr->node);
+      cur_instr->block = new_block;
+      exec_list_push_tail(&new_block->instr_list, &cur_instr->node);
+   }
+
+   return new_block;
+}
+
+/* Splits a basic block at the point specified by the cursor. The "before" and
+ * "after" arguments are filled out with the blocks resulting from the split
+ * if non-NULL. Note that the "beginning" of the block is actually interpreted
+ * as before the first non-phi instruction, and it's illegal to split a block
+ * before a phi instruction.
+ */
+
+static void
+split_block_cursor(nir_cursor cursor,
+                   nir_block **_before, nir_block **_after)
+{
+   nir_block *before, *after;
+   switch (cursor.option) {
+   case nir_cursor_before_block:
+      after = cursor.block;
+      before = split_block_beginning(cursor.block);
+      break;
+
+   case nir_cursor_after_block:
+      before = cursor.block;
+      after = split_block_end(cursor.block);
+      break;
+
+   case nir_cursor_before_instr:
+      after = cursor.instr->block;
+      before = split_block_before_instr(cursor.instr);
+      break;
+
+   case nir_cursor_after_instr:
+      /* We lower this to split_block_before_instr() so that we can keep the
+       * after-a-jump-instr case contained to split_block_end().
+       */
+      if (nir_instr_is_last(cursor.instr)) {
+         before = cursor.instr->block;
+         after = split_block_end(cursor.instr->block);
+      } else {
+         after = cursor.instr->block;
+         before = split_block_before_instr(nir_instr_next(cursor.instr));
+      }
+      break;
+   }
+
+   if (_before)
+      *_before = before;
+   if (_after)
+      *_after = after;
+}
+
+/**
+ * Inserts a non-basic block between two basic blocks and links them together.
+ */
+
+static void
+insert_non_block(nir_block *before, nir_cf_node *node, nir_block *after)
+{
+   node->parent = before->cf_node.parent;
+   exec_node_insert_after(&before->cf_node.node, &node->node);
+   link_block_to_non_block(before, node);
+   link_non_block_to_block(node, after);
+}
+
+/* walk up the control flow tree to find the innermost enclosed loop */
+static nir_loop *
+nearest_loop(nir_cf_node *node)
+{
+   while (node->type != nir_cf_node_loop) {
+      node = node->parent;
+   }
+
+   return nir_cf_node_as_loop(node);
+}
+
+/*
+ * update the CFG after a jump instruction has been added to the end of a block
+ */
+
+void
+nir_handle_add_jump(nir_block *block)
+{
+   nir_instr *instr = nir_block_last_instr(block);
+   nir_jump_instr *jump_instr = nir_instr_as_jump(instr);
+
+   unlink_block_successors(block);
+
+   nir_function_impl *impl = nir_cf_node_get_function(&block->cf_node);
+   nir_metadata_preserve(impl, nir_metadata_none);
+
+   if (jump_instr->type == nir_jump_break ||
+       jump_instr->type == nir_jump_continue) {
+      nir_loop *loop = nearest_loop(&block->cf_node);
+
+      if (jump_instr->type == nir_jump_continue) {
+         nir_cf_node *first_node = nir_loop_first_cf_node(loop);
+         assert(first_node->type == nir_cf_node_block);
+         nir_block *first_block = nir_cf_node_as_block(first_node);
+         link_blocks(block, first_block, NULL);
+      } else {
+         nir_cf_node *after = nir_cf_node_next(&loop->cf_node);
+         assert(after->type == nir_cf_node_block);
+         nir_block *after_block = nir_cf_node_as_block(after);
+         link_blocks(block, after_block, NULL);
+
+         /* If we inserted a fake link, remove it */
+         nir_cf_node *last = nir_loop_last_cf_node(loop);
+         assert(last->type == nir_cf_node_block);
+         nir_block *last_block =  nir_cf_node_as_block(last);
+         if (last_block->successors[1] != NULL)
+            unlink_blocks(last_block, after_block);
+      }
+   } else {
+      assert(jump_instr->type == nir_jump_return);
+      link_blocks(block, impl->end_block, NULL);
+   }
+}
+
+static void
+remove_phi_src(nir_block *block, nir_block *pred)
+{
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_phi)
+         break;
+
+      nir_phi_instr *phi = nir_instr_as_phi(instr);
+      nir_foreach_phi_src_safe(phi, src) {
+         if (src->pred == pred) {
+            list_del(&src->src.use_link);
+            exec_node_remove(&src->node);
+         }
+      }
+   }
+}
+
+/* Removes the successor of a block with a jump, and inserts a fake edge for
+ * infinite loops. Note that the jump to be eliminated may be free-floating.
+ */
+
+static
+void unlink_jump(nir_block *block, nir_jump_type type)
+{
+   if (block->successors[0])
+      remove_phi_src(block->successors[0], block);
+   if (block->successors[1])
+      remove_phi_src(block->successors[1], block);
+
+   if (type == nir_jump_break) {
+      nir_block *next = block->successors[0];
+
+      if (next->predecessors->entries == 1) {
+         nir_loop *loop =
+            nir_cf_node_as_loop(nir_cf_node_prev(&next->cf_node));
+
+         /* insert fake link */
+         nir_cf_node *last = nir_loop_last_cf_node(loop);
+         assert(last->type == nir_cf_node_block);
+         nir_block *last_block = nir_cf_node_as_block(last);
+
+         last_block->successors[1] = next;
+         block_add_pred(next, last_block);
+      }
+   }
+
+   unlink_block_successors(block);
+}
+
+void
+nir_handle_remove_jump(nir_block *block, nir_jump_type type)
+{
+   unlink_jump(block, type);
+
+   block_add_normal_succs(block);
+
+   nir_function_impl *impl = nir_cf_node_get_function(&block->cf_node);
+   nir_metadata_preserve(impl, nir_metadata_none);
+}
+
+static void
+update_if_uses(nir_cf_node *node)
+{
+   if (node->type != nir_cf_node_if)
+      return;
+
+   nir_if *if_stmt = nir_cf_node_as_if(node);
+
+   if_stmt->condition.parent_if = if_stmt;
+   if (if_stmt->condition.is_ssa) {
+      list_addtail(&if_stmt->condition.use_link,
+                   &if_stmt->condition.ssa->if_uses);
+   } else {
+      list_addtail(&if_stmt->condition.use_link,
+                   &if_stmt->condition.reg.reg->if_uses);
+   }
+}
+
+/**
+ * Stitch two basic blocks together into one. The aggregate must have the same
+ * predecessors as the first and the same successors as the second.
+ */
+
+static void
+stitch_blocks(nir_block *before, nir_block *after)
+{
+   /*
+    * We move after into before, so we have to deal with up to 2 successors vs.
+    * possibly a large number of predecessors.
+    *
+    * TODO: special case when before is empty and after isn't?
+    */
+
+   if (block_ends_in_jump(before)) {
+      assert(exec_list_is_empty(&after->instr_list));
+      if (after->successors[0])
+         remove_phi_src(after->successors[0], after);
+      if (after->successors[1])
+         remove_phi_src(after->successors[1], after);
+      unlink_block_successors(after);
+      exec_node_remove(&after->cf_node.node);
+   } else {
+      move_successors(after, before);
+
+      foreach_list_typed(nir_instr, instr, node, &after->instr_list) {
+         instr->block = before;
+      }
+
+      exec_list_append(&before->instr_list, &after->instr_list);
+      exec_node_remove(&after->cf_node.node);
+   }
+}
+
+void
+nir_cf_node_insert(nir_cursor cursor, nir_cf_node *node)
+{
+   nir_block *before, *after;
+
+   split_block_cursor(cursor, &before, &after);
+
+   if (node->type == nir_cf_node_block) {
+      nir_block *block = nir_cf_node_as_block(node);
+      exec_node_insert_after(&before->cf_node.node, &block->cf_node.node);
+      block->cf_node.parent = before->cf_node.parent;
+      /* stitch_blocks() assumes that any block that ends with a jump has
+       * already been setup with the correct successors, so we need to set
+       * up jumps here as the block is being inserted.
+       */
+      if (block_ends_in_jump(block))
+         nir_handle_add_jump(block);
+
+      stitch_blocks(block, after);
+      stitch_blocks(before, block);
+   } else {
+      update_if_uses(node);
+      insert_non_block(before, node, after);
+   }
+}
+
+static bool
+replace_ssa_def_uses(nir_ssa_def *def, void *void_impl)
+{
+   nir_function_impl *impl = void_impl;
+   void *mem_ctx = ralloc_parent(impl);
+
+   nir_ssa_undef_instr *undef =
+      nir_ssa_undef_instr_create(mem_ctx, def->num_components);
+   nir_instr_insert_before_cf_list(&impl->body, &undef->instr);
+   nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(&undef->def), mem_ctx);
+   return true;
+}
+
+static void
+cleanup_cf_node(nir_cf_node *node, nir_function_impl *impl)
+{
+   switch (node->type) {
+   case nir_cf_node_block: {
+      nir_block *block = nir_cf_node_as_block(node);
+      /* We need to walk the instructions and clean up defs/uses */
+      nir_foreach_instr_safe(block, instr) {
+         if (instr->type == nir_instr_type_jump) {
+            nir_jump_type jump_type = nir_instr_as_jump(instr)->type;
+            unlink_jump(block, jump_type);
+         } else {
+            nir_foreach_ssa_def(instr, replace_ssa_def_uses, impl);
+            nir_instr_remove(instr);
+         }
+      }
+      break;
+   }
+
+   case nir_cf_node_if: {
+      nir_if *if_stmt = nir_cf_node_as_if(node);
+      foreach_list_typed(nir_cf_node, child, node, &if_stmt->then_list)
+         cleanup_cf_node(child, impl);
+      foreach_list_typed(nir_cf_node, child, node, &if_stmt->else_list)
+         cleanup_cf_node(child, impl);
+
+      list_del(&if_stmt->condition.use_link);
+      break;
+   }
+
+   case nir_cf_node_loop: {
+      nir_loop *loop = nir_cf_node_as_loop(node);
+      foreach_list_typed(nir_cf_node, child, node, &loop->body)
+         cleanup_cf_node(child, impl);
+      break;
+   }
+   case nir_cf_node_function: {
+      nir_function_impl *impl = nir_cf_node_as_function(node);
+      foreach_list_typed(nir_cf_node, child, node, &impl->body)
+         cleanup_cf_node(child, impl);
+      break;
+   }
+   default:
+      unreachable("Invalid CF node type");
+   }
+}
+
+void
+nir_cf_extract(nir_cf_list *extracted, nir_cursor begin, nir_cursor end)
+{
+   nir_block *block_begin, *block_end, *block_before, *block_after;
+
+   /* In the case where begin points to an instruction in some basic block and
+    * end points to the end of the same basic block, we rely on the fact that
+    * splitting on an instruction moves earlier instructions into a new basic
+    * block. If the later instructions were moved instead, then the end cursor
+    * would be pointing to the same place that begin used to point to, which
+    * is obviously not what we want.
+    */
+   split_block_cursor(begin, &block_before, &block_begin);
+   split_block_cursor(end, &block_end, &block_after);
+
+   extracted->impl = nir_cf_node_get_function(&block_begin->cf_node);
+   exec_list_make_empty(&extracted->list);
+
+   nir_cf_node *cf_node = &block_begin->cf_node;
+   nir_cf_node *cf_node_end = &block_end->cf_node;
+   while (true) {
+      nir_cf_node *next = nir_cf_node_next(cf_node);
+
+      exec_node_remove(&cf_node->node);
+      cf_node->parent = NULL;
+      exec_list_push_tail(&extracted->list, &cf_node->node);
+
+      if (cf_node == cf_node_end)
+         break;
+
+      cf_node = next;
+   }
+
+   stitch_blocks(block_before, block_after);
+}
+
+void
+nir_cf_reinsert(nir_cf_list *cf_list, nir_cursor cursor)
+{
+   nir_block *before, *after;
+
+   split_block_cursor(cursor, &before, &after);
+
+   foreach_list_typed_safe(nir_cf_node, node, node, &cf_list->list) {
+      exec_node_remove(&node->node);
+      node->parent = before->cf_node.parent;
+      exec_node_insert_node_before(&after->cf_node.node, &node->node);
+   }
+
+   stitch_blocks(before,
+                 nir_cf_node_as_block(nir_cf_node_next(&before->cf_node)));
+   stitch_blocks(nir_cf_node_as_block(nir_cf_node_prev(&after->cf_node)),
+                 after);
+}
+
+void
+nir_cf_delete(nir_cf_list *cf_list)
+{
+   foreach_list_typed(nir_cf_node, node, node, &cf_list->list) {
+      cleanup_cf_node(node, cf_list->impl);
+   }
+}
diff --git a/src/glsl/nir/nir_control_flow.h b/src/glsl/nir/nir_control_flow.h
new file mode 100644
index 00000000000..5efd41caadf
--- /dev/null
+++ b/src/glsl/nir/nir_control_flow.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Connor Abbott ([email protected])
+ *
+ */
+
+#include "nir.h"
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** NIR Control Flow Modification
+ *
+ * This file contains various API's that make modifying control flow in NIR,
+ * while maintaining the invariants checked by the validator, much easier.
+ * There are two parts to this:
+ *
+ * 1. Inserting control flow (if's and loops) in various places, for creating
+ *    IR either from scratch or as part of some lowering pass.
+ * 2. Taking existing pieces of the IR and either moving them around or
+ *    deleting them.
+ */
+
+/* Helper struct for representing a point to extract/insert. Helps reduce the
+ * combinatorial explosion of possible points to extract.
+ */
+
+typedef enum {
+   nir_cursor_before_block,
+   nir_cursor_after_block,
+   nir_cursor_before_instr,
+   nir_cursor_after_instr,
+} nir_cursor_option;
+
+typedef struct {
+   nir_cursor_option option;
+   union {
+      nir_block *block;
+      nir_instr *instr;
+   };
+} nir_cursor;
+
+static inline nir_cursor
+nir_before_block(nir_block *block)
+{
+   nir_cursor cursor;
+   cursor.option = nir_cursor_before_block;
+   cursor.block = block;
+   return cursor;
+}
+
+static inline nir_cursor
+nir_after_block(nir_block *block)
+{
+   nir_cursor cursor;
+   cursor.option = nir_cursor_after_block;
+   cursor.block = block;
+   return cursor;
+}
+
+static inline nir_cursor
+nir_before_instr(nir_instr *instr)
+{
+   nir_cursor cursor;
+   cursor.option = nir_cursor_before_instr;
+   cursor.instr = instr;
+   return cursor;
+}
+
+static inline nir_cursor
+nir_after_instr(nir_instr *instr)
+{
+   nir_cursor cursor;
+   cursor.option = nir_cursor_after_instr;
+   cursor.instr = instr;
+   return cursor;
+}
+
+static inline nir_cursor
+nir_before_cf_node(nir_cf_node *node)
+{
+   if (node->type == nir_cf_node_block)
+      return nir_before_block(nir_cf_node_as_block(node));
+
+   return nir_after_block(nir_cf_node_as_block(nir_cf_node_prev(node)));
+}
+
+static inline nir_cursor
+nir_after_cf_node(nir_cf_node *node)
+{
+   if (node->type == nir_cf_node_block)
+      return nir_after_block(nir_cf_node_as_block(node));
+
+   return nir_before_block(nir_cf_node_as_block(nir_cf_node_next(node)));
+}
+
+static inline nir_cursor
+nir_before_cf_list(struct exec_list *cf_list)
+{
+   nir_cf_node *first_node = exec_node_data(nir_cf_node,
+                                            exec_list_get_head(cf_list), node);
+   return nir_before_cf_node(first_node);
+}
+
+static inline nir_cursor
+nir_after_cf_list(struct exec_list *cf_list)
+{
+   nir_cf_node *last_node = exec_node_data(nir_cf_node,
+                                           exec_list_get_tail(cf_list), node);
+   return nir_after_cf_node(last_node);
+}
+
+/** Control flow insertion. */
+
+/** puts a control flow node where the cursor is */
+void nir_cf_node_insert(nir_cursor cursor, nir_cf_node *node);
+
+/** puts a control flow node immediately after another control flow node */
+static inline void
+nir_cf_node_insert_after(nir_cf_node *node, nir_cf_node *after)
+{
+   nir_cf_node_insert(nir_after_cf_node(node), after);
+}
+
+/** puts a control flow node immediately before another control flow node */
+static inline void
+nir_cf_node_insert_before(nir_cf_node *node, nir_cf_node *before)
+{
+   nir_cf_node_insert(nir_before_cf_node(node), before);
+}
+
+/** puts a control flow node at the beginning of a list from an if, loop, or function */
+static inline void
+nir_cf_node_insert_begin(struct exec_list *list, nir_cf_node *node)
+{
+   nir_cf_node_insert(nir_before_cf_list(list), node);
+}
+
+/** puts a control flow node at the end of a list from an if, loop, or function */
+static inline void
+nir_cf_node_insert_end(struct exec_list *list, nir_cf_node *node)
+{
+   nir_cf_node_insert(nir_after_cf_list(list), node);
+}
+
+
+/** Control flow motion.
+ *
+ * These functions let you take a part of a control flow list (basically
+ * equivalent to a series of statement in GLSL) and "extract" it from the IR,
+ * so that it's a free-floating piece of IR that can be either re-inserted
+ * somewhere else or deleted entirely. A few notes on using it:
+ *
+ * 1. Phi nodes are considered attached to the piece of control flow that
+ *    their sources come from. There are three places where phi nodes can
+ *    occur, which are the three places where a block can have multiple
+ *    predecessors:
+ *
+ *    1) After an if statement, if neither branch ends in a jump.
+ *    2) After a loop, if there are multiple break's.
+ *    3) At the beginning of a loop.
+ *
+ *    For #1, the phi node is considered to be part of the if, and for #2 and
+ *    #3 the phi node is considered to be part of the loop. This allows us to
+ *    keep phi's intact, but it means that phi nodes cannot be separated from
+ *    the control flow they come from. For example, extracting an if without
+ *    extracting all the phi nodes after it is not allowed, and neither is
+ *    extracting only some of the phi nodes at the beginning of a block. It
+ *    also means that extracting from the beginning of a basic block actually
+ *    means extracting from the first non-phi instruction, since there's no
+ *    situation where extracting phi nodes without extracting what comes
+ *    before them makes any sense.
+ *
+ * 2. Phi node sources are guaranteed to remain valid, meaning that they still
+ *    correspond one-to-one with the predecessors of the basic block they're
+ *    part of. In addition, the original sources will be preserved unless they
+ *    correspond to a break or continue that was deleted. However, no attempt
+ *    is made to ensure that SSA form is maintained. In particular, it is
+ *    *not* guaranteed that definitions of SSA values will dominate all their
+ *    uses after all is said and done. Either the caller must ensure that this
+ *    is the case, or it must insert extra phi nodes to restore SSA.
+ *
+ * 3. It is invalid to move a piece of IR with a break/continue outside of the
+ *    loop it references. Doing this will result in invalid
+ *    successors/predecessors and phi node sources.
+ *
+ * 4. It is invalid to move a piece of IR from one function implementation to
+ *    another.
+ *
+ * 5. Extracting a control flow list will leave lots of dangling references to
+ *    and from other pieces of the IR. It also leaves things in a not 100%
+ *    consistent state. This means that some things (e.g. inserting
+ *    instructions) might not work reliably on the extracted control flow. It
+ *    also means that extracting control flow without re-inserting it or
+ *    deleting it is a Bad Thing (tm).
+ */
+
+typedef struct {
+   struct exec_list list;
+   nir_function_impl *impl; /* for cleaning up if the list is deleted */
+} nir_cf_list;
+
+void nir_cf_extract(nir_cf_list *extracted, nir_cursor begin, nir_cursor end);
+
+void nir_cf_reinsert(nir_cf_list *cf_list, nir_cursor cursor);
+
+void nir_cf_delete(nir_cf_list *cf_list);
+
+static inline void
+nir_cf_list_extract(nir_cf_list *extracted, struct exec_list *cf_list)
+{
+   nir_cf_extract(extracted, nir_before_cf_list(cf_list),
+                  nir_after_cf_list(cf_list));
+}
+
+/** removes a control flow node, doing any cleanup necessary */
+static inline void
+nir_cf_node_remove(nir_cf_node *node)
+{
+   nir_cf_list list;
+   nir_cf_extract(&list, nir_before_cf_node(node), nir_after_cf_node(node));
+   nir_cf_delete(&list);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/glsl/nir/nir_control_flow_private.h b/src/glsl/nir/nir_control_flow_private.h
new file mode 100644
index 00000000000..f32b57a8cef
--- /dev/null
+++ b/src/glsl/nir/nir_control_flow_private.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Connor Abbott ([email protected])
+ *
+ */
+
+#include "nir_control_flow.h"
+
+#pragma once
+
+/* Internal control-flow modification functions used when inserting/removing
+ * instructions.
+ */
+
+void nir_handle_add_jump(nir_block *block);
+void nir_handle_remove_jump(nir_block *block, nir_jump_type type);
diff --git a/src/glsl/nir/nir_dominance.c b/src/glsl/nir/nir_dominance.c
index 2f50db1c1f6..af4caae0055 100644
--- a/src/glsl/nir/nir_dominance.c
+++ b/src/glsl/nir/nir_dominance.c
@@ -42,7 +42,7 @@ static bool
 init_block_cb(nir_block *block, void *_state)
 {
    dom_state *state = (dom_state *) _state;
-   if (block == state->impl->start_block)
+   if (block == nir_start_block(state->impl))
       block->imm_dom = block;
    else
       block->imm_dom = NULL;
@@ -78,7 +78,7 @@ static bool
 calc_dominance_cb(nir_block *block, void *_state)
 {
    dom_state *state = (dom_state *) _state;
-   if (block == state->impl->start_block)
+   if (block == nir_start_block(state->impl))
       return true;
 
    nir_block *new_idom = NULL;
@@ -209,12 +209,13 @@ nir_calc_dominance_impl(nir_function_impl *impl)
 
    nir_foreach_block(impl, calc_dom_frontier_cb, &state);
 
-   impl->start_block->imm_dom = NULL;
+   nir_block *start_block = nir_start_block(impl);
+   start_block->imm_dom = NULL;
 
    calc_dom_children(impl);
 
    unsigned dfs_index = 0;
-   calc_dfs_indicies(impl->start_block, &dfs_index);
+   calc_dfs_indicies(start_block, &dfs_index);
 }
 
 void
diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h
index 64861300b55..1f24f9f677d 100644
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -123,6 +123,8 @@ INTRINSIC(image_atomic_or, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
 INTRINSIC(image_atomic_xor, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
 INTRINSIC(image_atomic_exchange, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
 INTRINSIC(image_atomic_comp_swap, 4, ARR(4, 1, 1, 1), true, 1, 1, 0, 0)
+INTRINSIC(image_size, 0, ARR(), true, 4, 1, 0,
+          NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 
 #define SYSTEM_VALUE(name, components) \
    INTRINSIC(load_##name, 0, ARR(), true, components, 0, 0, \
@@ -139,12 +141,18 @@ SYSTEM_VALUE(sample_mask_in, 1)
 SYSTEM_VALUE(invocation_id, 1)
 
 /*
- * The last index is the base address to load from.  Indirect loads have an
- * additional register input, which is added to the constant address to
- * compute the final address to load from.  For UBO's (and SSBO's), the first
- * source is the (possibly constant) UBO buffer index and the indirect (if it
- * exists) is the second source, and the first index is the descriptor set
- * index.
+ * The format of the indices depends on the type of the load.  For uniforms,
+ * the first index is the base address and the second index is an offset that
+ * should be added to the base address.  (This way you can determine in the
+ * back-end which variable is being accessed even in an array.)  For inputs,
+ * the one and only index corresponds to the attribute slot.  UBO loads
+ * have two indices the first of which is the descriptor set and the second
+ * is the base address to load from.
+ *
+ * UBO loads have a (possibly constant) source which is the UBO buffer index.
+ * For each type of load, the _indirect variant has one additional source
+ * (the second in the case of UBO's) that is the is an indirect to be added to
+ * the constant address or base offset to compute the final offset.
  *
  * For vector backends, the address is in terms of one vec4, and so each array
  * element is +4 scalar components from the previous array element. For scalar
@@ -152,14 +160,14 @@ SYSTEM_VALUE(invocation_id, 1)
  * elements begin immediately after the previous array element.
  */
 
-#define LOAD(name, extra_srcs, extra_indices, flags) \
-   INTRINSIC(load_##name, extra_srcs, ARR(1), true, 0, 0, 1 + extra_indices, flags) \
+#define LOAD(name, extra_srcs, indices, flags) \
+   INTRINSIC(load_##name, extra_srcs, ARR(1), true, 0, 0, indices, flags) \
    INTRINSIC(load_##name##_indirect, extra_srcs + 1, ARR(1, 1), \
-             true, 0, 0, 1 + extra_indices, flags)
+             true, 0, 0, indices, flags)
 
-LOAD(uniform, 0, 0, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
-LOAD(ubo, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
-LOAD(input, 0, 0, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+LOAD(uniform, 0, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+LOAD(ubo, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
+LOAD(input, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 /* LOAD(ssbo, 1, 0) */
 
 /*
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index 6a4494d5fd2..c9697e7845e 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -32,103 +32,17 @@
  */
 
 #include "nir.h"
+#include "nir_builder.h"
 
 struct lower_io_state {
+   nir_builder builder;
    void *mem_ctx;
-   bool is_scalar;
+   int (*type_size)(const struct glsl_type *type);
 };
 
-static int
-type_size_vec4(const struct glsl_type *type)
-{
-   unsigned int i;
-   int size;
-
-   switch (glsl_get_base_type(type)) {
-   case GLSL_TYPE_UINT:
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_FLOAT:
-   case GLSL_TYPE_BOOL:
-      if (glsl_type_is_matrix(type)) {
-         return glsl_get_matrix_columns(type);
-      } else {
-         return 1;
-      }
-   case GLSL_TYPE_ARRAY:
-      return type_size_vec4(glsl_get_array_element(type)) * glsl_get_length(type);
-   case GLSL_TYPE_STRUCT:
-      size = 0;
-      for (i = 0; i <  glsl_get_length(type); i++) {
-         size += type_size_vec4(glsl_get_struct_field(type, i));
-      }
-      return size;
-   case GLSL_TYPE_SUBROUTINE:
-      return 1;
-   case GLSL_TYPE_SAMPLER:
-      return 0;
-   case GLSL_TYPE_ATOMIC_UINT:
-      return 0;
-   case GLSL_TYPE_IMAGE:
-   case GLSL_TYPE_VOID:
-   case GLSL_TYPE_DOUBLE:
-   case GLSL_TYPE_ERROR:
-   case GLSL_TYPE_INTERFACE:
-      unreachable("not reached");
-   }
-
-   return 0;
-}
-
-static unsigned
-type_size_scalar(const struct glsl_type *type)
-{
-   unsigned int size, i;
-
-   switch (glsl_get_base_type(type)) {
-   case GLSL_TYPE_UINT:
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_FLOAT:
-   case GLSL_TYPE_BOOL:
-      return glsl_get_components(type);
-   case GLSL_TYPE_ARRAY:
-      return type_size_scalar(glsl_get_array_element(type)) * glsl_get_length(type);
-   case GLSL_TYPE_STRUCT:
-      size = 0;
-      for (i = 0; i < glsl_get_length(type); i++) {
-         size += type_size_scalar(glsl_get_struct_field(type, i));
-      }
-      return size;
-   case GLSL_TYPE_SUBROUTINE:
-      return 1;
-   case GLSL_TYPE_SAMPLER:
-      return 0;
-   case GLSL_TYPE_ATOMIC_UINT:
-      return 0;
-   case GLSL_TYPE_INTERFACE:
-      return 0;
-   case GLSL_TYPE_IMAGE:
-      return 0;
-   case GLSL_TYPE_FUNCTION:
-   case GLSL_TYPE_VOID:
-   case GLSL_TYPE_ERROR:
-   case GLSL_TYPE_DOUBLE:
-      unreachable("not reached");
-   }
-
-   return 0;
-}
-
-static unsigned
-type_size(const struct glsl_type *type, bool is_scalar)
-{
-   if (is_scalar)
-      return type_size_scalar(type);
-   else
-      return type_size_vec4(type);
-}
-
 void
-nir_assign_var_locations(struct exec_list *var_list, unsigned *size, bool is_scalar)
+nir_assign_var_locations(struct exec_list *var_list, unsigned *size,
+                         int (*type_size)(const struct glsl_type *))
 {
    unsigned location = 0;
 
@@ -142,7 +56,7 @@ nir_assign_var_locations(struct exec_list *var_list, unsigned *size, bool is_sca
          continue;
 
       var->data.driver_location = location;
-      location += type_size(var->type, is_scalar);
+      location += type_size(var->type);
    }
 
    *size = location;
@@ -162,80 +76,6 @@ deref_has_indirect(nir_deref_var *deref)
    return false;
 }
 
-static bool
-mark_indirect_uses_block(nir_block *block, void *void_state)
-{
-   struct set *indirect_set = void_state;
-
-   nir_foreach_instr(block, instr) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-      for (unsigned i = 0;
-           i < nir_intrinsic_infos[intrin->intrinsic].num_variables; i++) {
-         if (deref_has_indirect(intrin->variables[i]))
-            _mesa_set_add(indirect_set, intrin->variables[i]->var);
-      }
-   }
-
-   return true;
-}
-
-/* Identical to nir_assign_var_locations_packed except that it assigns
- * locations to the variables that are used 100% directly first and then
- * assigns locations to variables that are used indirectly.
- */
-void
-nir_assign_var_locations_direct_first(nir_shader *shader,
-                                      struct exec_list *var_list,
-                                      unsigned *direct_size,
-                                      unsigned *size,
-                                      bool is_scalar)
-{
-   struct set *indirect_set = _mesa_set_create(NULL, _mesa_hash_pointer,
-                                               _mesa_key_pointer_equal);
-
-   nir_foreach_overload(shader, overload) {
-      if (overload->impl)
-         nir_foreach_block(overload->impl, mark_indirect_uses_block,
-                           indirect_set);
-   }
-
-   unsigned location = 0;
-
-   foreach_list_typed(nir_variable, var, node, var_list) {
-      if ((var->data.mode == nir_var_uniform || var->data.mode == nir_var_shader_storage) &&
-          var->interface_type != NULL)
-         continue;
-
-      if (_mesa_set_search(indirect_set, var))
-         continue;
-
-      var->data.driver_location = location;
-      location += type_size(var->type, is_scalar);
-   }
-
-   *direct_size = location;
-
-   foreach_list_typed(nir_variable, var, node, var_list) {
-      if ((var->data.mode == nir_var_uniform || var->data.mode == nir_var_shader_storage) &&
-          var->interface_type != NULL)
-         continue;
-
-      if (!_mesa_set_search(indirect_set, var))
-         continue;
-
-      var->data.driver_location = location;
-      location += type_size(var->type, is_scalar);
-   }
-
-   *size = location;
-
-   _mesa_set_destroy(indirect_set, NULL);
-}
-
 static unsigned
 get_io_offset(nir_deref_var *deref, nir_instr *instr, nir_src *indirect,
               struct lower_io_state *state)
@@ -243,6 +83,9 @@ get_io_offset(nir_deref_var *deref, nir_instr *instr, nir_src *indirect,
    bool found_indirect = false;
    unsigned base_offset = 0;
 
+   nir_builder *b = &state->builder;
+   nir_builder_insert_before_instr(b, instr);
+
    nir_deref *tail = &deref->deref;
    while (tail->child != NULL) {
       const struct glsl_type *parent_type = tail->type;
@@ -250,56 +93,56 @@ get_io_offset(nir_deref_var *deref, nir_instr *instr, nir_src *indirect,
 
       if (tail->deref_type == nir_deref_type_array) {
          nir_deref_array *deref_array = nir_deref_as_array(tail);
-         unsigned size = type_size(tail->type, state->is_scalar);
+         unsigned size = state->type_size(tail->type);
 
          base_offset += size * deref_array->base_offset;
 
          if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-            nir_load_const_instr *load_const =
-               nir_load_const_instr_create(state->mem_ctx, 1);
-            load_const->value.u[0] = size;
-            nir_instr_insert_before(instr, &load_const->instr);
-
-            nir_alu_instr *mul = nir_alu_instr_create(state->mem_ctx,
-                                                      nir_op_imul);
-            mul->src[0].src.is_ssa = true;
-            mul->src[0].src.ssa = &load_const->def;
-            nir_src_copy(&mul->src[1].src, &deref_array->indirect,
-                         state->mem_ctx);
-            mul->dest.write_mask = 1;
-            nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL);
-            nir_instr_insert_before(instr, &mul->instr);
+            nir_ssa_def *mul =
+               nir_imul(b, nir_imm_int(b, size),
+                        nir_ssa_for_src(b, deref_array->indirect, 1));
 
             if (found_indirect) {
-               nir_alu_instr *add = nir_alu_instr_create(state->mem_ctx,
-                                                         nir_op_iadd);
-               add->src[0].src = *indirect;
-               add->src[1].src.is_ssa = true;
-               add->src[1].src.ssa = &mul->dest.dest.ssa;
-               add->dest.write_mask = 1;
-               nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL);
-               nir_instr_insert_before(instr, &add->instr);
-
-               indirect->is_ssa = true;
-               indirect->ssa = &add->dest.dest.ssa;
+               indirect->ssa =
+                  nir_iadd(b, nir_ssa_for_src(b, *indirect, 1), mul);
             } else {
-               indirect->is_ssa = true;
-               indirect->ssa = &mul->dest.dest.ssa;
-               found_indirect = true;
+               indirect->ssa = mul;
             }
+            indirect->is_ssa = true;
+            found_indirect = true;
          }
       } else if (tail->deref_type == nir_deref_type_struct) {
          nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
 
-         for (unsigned i = 0; i < deref_struct->index; i++)
-            base_offset += type_size(glsl_get_struct_field(parent_type, i),
-                                     state->is_scalar);
+         for (unsigned i = 0; i < deref_struct->index; i++) {
+            base_offset +=
+               state->type_size(glsl_get_struct_field(parent_type, i));
+         }
       }
    }
 
    return base_offset;
 }
 
+static nir_intrinsic_op
+load_op(nir_variable_mode mode, bool has_indirect)
+{
+   nir_intrinsic_op op;
+   switch (mode) {
+   case nir_var_shader_in:
+      op = has_indirect ? nir_intrinsic_load_input_indirect :
+                          nir_intrinsic_load_input;
+      break;
+   case nir_var_uniform:
+      op = has_indirect ? nir_intrinsic_load_uniform_indirect :
+                          nir_intrinsic_load_uniform;
+      break;
+   default:
+      unreachable("Unknown variable mode");
+   }
+   return op;
+}
+
 static bool
 nir_lower_io_block(nir_block *block, void *void_state)
 {
@@ -319,31 +162,22 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
          bool has_indirect = deref_has_indirect(intrin->variables[0]);
 
-         /* Figure out the opcode */
-         nir_intrinsic_op load_op;
-         switch (mode) {
-         case nir_var_shader_in:
-            load_op = has_indirect ? nir_intrinsic_load_input_indirect :
-                                     nir_intrinsic_load_input;
-            break;
-         case nir_var_uniform:
-            load_op = has_indirect ? nir_intrinsic_load_uniform_indirect :
-                                     nir_intrinsic_load_uniform;
-            break;
-         default:
-            unreachable("Unknown variable mode");
-         }
-
-         nir_intrinsic_instr *load = nir_intrinsic_instr_create(state->mem_ctx,
-                                                                load_op);
+         nir_intrinsic_instr *load =
+            nir_intrinsic_instr_create(state->mem_ctx,
+                                       load_op(mode, has_indirect));
          load->num_components = intrin->num_components;
 
          nir_src indirect;
          unsigned offset = get_io_offset(intrin->variables[0],
                                          &intrin->instr, &indirect, state);
-         offset += intrin->variables[0]->var->data.driver_location;
 
-         load->const_index[0] = offset;
+         unsigned location = intrin->variables[0]->var->data.driver_location;
+         if (mode == nir_var_uniform) {
+            load->const_index[0] = location;
+            load->const_index[1] = offset;
+         } else {
+            load->const_index[0] = location + offset;
+         }
 
          if (has_indirect)
             load->src[0] = indirect;
@@ -406,12 +240,13 @@ nir_lower_io_block(nir_block *block, void *void_state)
 }
 
 static void
-nir_lower_io_impl(nir_function_impl *impl, bool is_scalar)
+nir_lower_io_impl(nir_function_impl *impl, int(*type_size)(const struct glsl_type *))
 {
    struct lower_io_state state;
 
+   nir_builder_init(&state.builder, impl);
    state.mem_ctx = ralloc_parent(impl);
-   state.is_scalar = is_scalar;
+   state.type_size = type_size;
 
    nir_foreach_block(impl, nir_lower_io_block, &state);
 
@@ -420,10 +255,10 @@ nir_lower_io_impl(nir_function_impl *impl, bool is_scalar)
 }
 
 void
-nir_lower_io(nir_shader *shader, bool is_scalar)
+nir_lower_io(nir_shader *shader, int(*type_size)(const struct glsl_type *))
 {
    nir_foreach_overload(shader, overload) {
       if (overload->impl)
-         nir_lower_io_impl(overload->impl, is_scalar);
+         nir_lower_io_impl(overload->impl, type_size);
    }
 }
diff --git a/src/glsl/nir/nir_lower_samplers.cpp b/src/glsl/nir/nir_lower_samplers.cpp
index 9a9cdd16a9a..438caacfd4d 100644
--- a/src/glsl/nir/nir_lower_samplers.cpp
+++ b/src/glsl/nir/nir_lower_samplers.cpp
@@ -192,12 +192,12 @@ lower_impl(nir_function_impl *impl, const struct gl_shader_program *shader_progr
 }
 
 extern "C" void
-nir_lower_samplers(nir_shader *shader, const struct gl_shader_program *shader_program,
-                   gl_shader_stage stage)
+nir_lower_samplers(nir_shader *shader,
+                   const struct gl_shader_program *shader_program)
 {
    nir_foreach_overload(shader, overload) {
       if (overload->impl)
-         lower_impl(overload->impl, shader_program, stage);
+         lower_impl(overload->impl, shader_program, shader->stage);
    }
 }
 
diff --git a/src/glsl/nir/nir_lower_vars_to_ssa.c b/src/glsl/nir/nir_lower_vars_to_ssa.c
index ccb8f99dfba..4ff21663e57 100644
--- a/src/glsl/nir/nir_lower_vars_to_ssa.c
+++ b/src/glsl/nir/nir_lower_vars_to_ssa.c
@@ -935,7 +935,7 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
    nir_foreach_block(impl, register_variable_uses_block, &state);
 
    insert_phi_nodes(&state);
-   rename_variables_block(impl->start_block, &state);
+   rename_variables_block(nir_start_block(impl), &state);
 
    nir_metadata_preserve(impl, nir_metadata_block_index |
                                nir_metadata_dominance);
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index d7c17403f9f..226e0a8d85c 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -113,6 +113,8 @@ optimizations = [
    (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
    (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
    (('sne', a, b), ('b2f', ('fne', a, b)), 'options->lower_scmp'),
+   (('fne', ('fneg', a), a), ('fne', a, 0.0)),
+   (('feq', ('fneg', a), a), ('feq', a, 0.0)),
    # Emulating booleans
    (('imul', ('b2i', a), ('b2i', b)), ('b2i', ('iand', a, b))),
    (('fmul', ('b2f', a), ('b2f', b)), ('b2f', ('iand', a, b))),
diff --git a/src/glsl/nir/nir_opt_gcm.c b/src/glsl/nir/nir_opt_gcm.c
index 44068bf37b8..5b412eebc32 100644
--- a/src/glsl/nir/nir_opt_gcm.c
+++ b/src/glsl/nir/nir_opt_gcm.c
@@ -256,7 +256,7 @@ gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state)
    /* Start with the instruction at the top.  As we iterate over the
     * sources, it will get moved down as needed.
     */
-   instr->block = state->impl->start_block;
+   instr->block = nir_start_block(state->impl);
    state->instr = instr;
 
    nir_foreach_src(instr, gcm_schedule_early_src, state);
diff --git a/src/glsl/nir/nir_opt_peephole_select.c b/src/glsl/nir/nir_opt_peephole_select.c
index 6620e5dc81f..26ec4ed92d3 100644
--- a/src/glsl/nir/nir_opt_peephole_select.c
+++ b/src/glsl/nir/nir_opt_peephole_select.c
@@ -26,6 +26,7 @@
  */
 
 #include "nir.h"
+#include "nir_control_flow.h"
 
 /*
  * Implements a small peephole optimization that looks for
diff --git a/src/glsl/nir/nir_spirv.h b/src/glsl/nir/nir_spirv.h
index 3254f10a88d..1f09174ad7f 100644
--- a/src/glsl/nir/nir_spirv.h
+++ b/src/glsl/nir/nir_spirv.h
@@ -37,6 +37,7 @@ extern "C" {
 #endif
 
 nir_shader *spirv_to_nir(const uint32_t *words, size_t word_count,
+                         gl_shader_stage stage,
                          const nir_shader_compiler_options *options);
 
 #ifdef __cplusplus
diff --git a/src/glsl/nir/nir_to_ssa.c b/src/glsl/nir/nir_to_ssa.c
index a3c35fa0493..b089df79fcf 100644
--- a/src/glsl/nir/nir_to_ssa.c
+++ b/src/glsl/nir/nir_to_ssa.c
@@ -516,7 +516,7 @@ nir_convert_to_ssa_impl(nir_function_impl *impl)
    rewrite_state state;
    init_rewrite_state(impl, &state);
 
-   rewrite_block(impl->start_block, &state);
+   rewrite_block(nir_start_block(impl), &state);
 
    remove_unused_regs(impl, &state);
 
diff --git a/src/glsl/nir/nir_validate.c b/src/glsl/nir/nir_validate.c
index dc799414d24..9938c0ef8b1 100644
--- a/src/glsl/nir/nir_validate.c
+++ b/src/glsl/nir/nir_validate.c
@@ -75,6 +75,9 @@ typedef struct {
    /* the current if statement being validated */
    nir_if *if_stmt;
 
+   /* the current loop being visited */
+   nir_loop *loop;
+
    /* the parent of the current cf node being visited */
    nir_cf_node *parent_node;
 
@@ -594,9 +597,85 @@ validate_block(nir_block *block, validate_state *state)
       }
    }
 
+   struct set_entry *entry;
+   set_foreach(block->predecessors, entry) {
+      const nir_block *pred = entry->key;
+      assert(pred->successors[0] == block ||
+             pred->successors[1] == block);
+   }
+
    if (!exec_list_is_empty(&block->instr_list) &&
-       nir_block_last_instr(block)->type == nir_instr_type_jump)
+       nir_block_last_instr(block)->type == nir_instr_type_jump) {
       assert(block->successors[1] == NULL);
+      nir_jump_instr *jump = nir_instr_as_jump(nir_block_last_instr(block));
+      switch (jump->type) {
+      case nir_jump_break: {
+         nir_block *after =
+            nir_cf_node_as_block(nir_cf_node_next(&state->loop->cf_node));
+         assert(block->successors[0] == after);
+         break;
+      }
+
+      case nir_jump_continue: {
+         nir_block *first =
+            nir_cf_node_as_block(nir_loop_first_cf_node(state->loop));
+         assert(block->successors[0] == first);
+         break;
+      }
+
+      case nir_jump_return:
+         assert(block->successors[0] == state->impl->end_block);
+         break;
+
+      default:
+         unreachable("bad jump type");
+      }
+   } else {
+      nir_cf_node *next = nir_cf_node_next(&block->cf_node);
+      if (next == NULL) {
+         switch (state->parent_node->type) {
+         case nir_cf_node_loop: {
+            nir_block *first =
+               nir_cf_node_as_block(nir_loop_first_cf_node(state->loop));
+            assert(block->successors[0] == first);
+            /* due to the hack for infinite loops, block->successors[1] may
+             * point to the block after the loop.
+             */
+            break;
+         }
+
+         case nir_cf_node_if: {
+            nir_block *after =
+               nir_cf_node_as_block(nir_cf_node_next(state->parent_node));
+            assert(block->successors[0] == after);
+            assert(block->successors[1] == NULL);
+            break;
+         }
+
+         case nir_cf_node_function:
+            assert(block->successors[0] == state->impl->end_block);
+            assert(block->successors[1] == NULL);
+            break;
+
+         default:
+            unreachable("unknown control flow node type");
+         }
+      } else {
+         if (next->type == nir_cf_node_if) {
+            nir_if *if_stmt = nir_cf_node_as_if(next);
+            assert(&block->successors[0]->cf_node ==
+                   nir_if_first_then_node(if_stmt));
+            assert(&block->successors[1]->cf_node ==
+                   nir_if_first_else_node(if_stmt));
+         } else {
+            assert(next->type == nir_cf_node_loop);
+            nir_loop *loop = nir_cf_node_as_loop(next);
+            assert(&block->successors[0]->cf_node ==
+                   nir_loop_first_cf_node(loop));
+            assert(block->successors[1] == NULL);
+         }
+      }
+   }
 }
 
 static void
@@ -608,12 +687,6 @@ validate_if(nir_if *if_stmt, validate_state *state)
    nir_cf_node *prev_node = nir_cf_node_prev(&if_stmt->cf_node);
    assert(prev_node->type == nir_cf_node_block);
 
-   nir_block *prev_block = nir_cf_node_as_block(prev_node);
-   assert(&prev_block->successors[0]->cf_node ==
-          nir_if_first_then_node(if_stmt));
-   assert(&prev_block->successors[1]->cf_node ==
-          nir_if_first_else_node(if_stmt));
-
    assert(!exec_node_is_tail_sentinel(if_stmt->cf_node.node.next));
    nir_cf_node *next_node = nir_cf_node_next(&if_stmt->cf_node);
    assert(next_node->type == nir_cf_node_block);
@@ -647,10 +720,6 @@ validate_loop(nir_loop *loop, validate_state *state)
    nir_cf_node *prev_node = nir_cf_node_prev(&loop->cf_node);
    assert(prev_node->type == nir_cf_node_block);
 
-   nir_block *prev_block = nir_cf_node_as_block(prev_node);
-   assert(&prev_block->successors[0]->cf_node == nir_loop_first_cf_node(loop));
-   assert(prev_block->successors[1] == NULL);
-
    assert(!exec_node_is_tail_sentinel(loop->cf_node.node.next));
    nir_cf_node *next_node = nir_cf_node_next(&loop->cf_node);
    assert(next_node->type == nir_cf_node_block);
@@ -659,6 +728,8 @@ validate_loop(nir_loop *loop, validate_state *state)
 
    nir_cf_node *old_parent = state->parent_node;
    state->parent_node = &loop->cf_node;
+   nir_loop *old_loop = state->loop;
+   state->loop = loop;
 
    exec_list_validate(&loop->body);
    foreach_list_typed(nir_cf_node, cf_node, node, &loop->body) {
@@ -666,6 +737,7 @@ validate_loop(nir_loop *loop, validate_state *state)
    }
 
    state->parent_node = old_parent;
+   state->loop = old_loop;
 }
 
 static void
@@ -921,6 +993,7 @@ init_validate_state(validate_state *state)
    state->regs_found = NULL;
    state->var_defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                              _mesa_key_pointer_equal);
+   state->loop = NULL;
 }
 
 static void
diff --git a/src/glsl/nir/spirv2nir.c b/src/glsl/nir/spirv2nir.c
index 0eed23fbc3f..a7b8c0f4b15 100644
--- a/src/glsl/nir/spirv2nir.c
+++ b/src/glsl/nir/spirv2nir.c
@@ -49,6 +49,6 @@ int main(int argc, char **argv)
    const void *map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
    assert(map != NULL);
 
-   nir_shader *shader = spirv_to_nir(map, word_count, NULL);
+   nir_shader *shader = spirv_to_nir(map, word_count, MESA_SHADER_FRAGMENT, NULL);
    nir_print_shader(shader, stderr);
 }
diff --git a/src/glsl/nir/spirv_to_nir.c b/src/glsl/nir/spirv_to_nir.c
index 65a995c29de..771637e8912 100644
--- a/src/glsl/nir/spirv_to_nir.c
+++ b/src/glsl/nir/spirv_to_nir.c
@@ -27,6 +27,7 @@
 
 #include "spirv_to_nir_private.h"
 #include "nir_vla.h"
+#include "nir_control_flow.h"
 
 static struct vtn_ssa_value *
 vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
@@ -2927,6 +2928,7 @@ vtn_walk_blocks(struct vtn_builder *b, struct vtn_block *start,
 
 nir_shader *
 spirv_to_nir(const uint32_t *words, size_t word_count,
+             gl_shader_stage stage,
              const nir_shader_compiler_options *options)
 {
    const uint32_t *word_end = words + word_count;
@@ -2942,7 +2944,7 @@ spirv_to_nir(const uint32_t *words, size_t word_count,
 
    words+= 5;
 
-   nir_shader *shader = nir_shader_create(NULL, options);
+   nir_shader *shader = nir_shader_create(NULL, stage, options);
 
    /* Initialize the stn_builder object */
    struct vtn_builder *b = rzalloc(NULL, struct vtn_builder);
diff --git a/src/glsl/opt_constant_propagation.cpp b/src/glsl/opt_constant_propagation.cpp
index 10be8e800ff..5221417aca0 100644
--- a/src/glsl/opt_constant_propagation.cpp
+++ b/src/glsl/opt_constant_propagation.cpp
@@ -110,6 +110,8 @@ public:
    virtual ir_visitor_status visit_enter(class ir_if *);
 
    void add_constant(ir_assignment *ir);
+   void constant_folding(ir_rvalue **rvalue);
+   void constant_propagation(ir_rvalue **rvalue);
    void kill(ir_variable *ir, unsigned write_mask);
    void handle_if_block(exec_list *instructions);
    void handle_rvalue(ir_rvalue **rvalue);
@@ -132,8 +134,38 @@ public:
 
 
 void
-ir_constant_propagation_visitor::handle_rvalue(ir_rvalue **rvalue)
-{
+ir_constant_propagation_visitor::constant_folding(ir_rvalue **rvalue) {
+
+   if (*rvalue == NULL || (*rvalue)->ir_type == ir_type_constant)
+      return;
+
+   /* Note that we visit rvalues one leaving.  So if an expression has a
+    * non-constant operand, no need to go looking down it to find if it's
+    * constant.  This cuts the time of this pass down drastically.
+    */
+   ir_expression *expr = (*rvalue)->as_expression();
+   if (expr) {
+      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
+	 if (!expr->operands[i]->as_constant())
+	    return;
+      }
+   }
+
+   /* Ditto for swizzles. */
+   ir_swizzle *swiz = (*rvalue)->as_swizzle();
+   if (swiz && !swiz->val->as_constant())
+      return;
+
+   ir_constant *constant = (*rvalue)->constant_expression_value();
+   if (constant) {
+      *rvalue = constant;
+      this->progress = true;
+   }
+}
+
+void
+ir_constant_propagation_visitor::constant_propagation(ir_rvalue **rvalue) {
+
    if (this->in_assignee || !*rvalue)
       return;
 
@@ -216,6 +248,13 @@ ir_constant_propagation_visitor::handle_rvalue(ir_rvalue **rvalue)
    this->progress = true;
 }
 
+void
+ir_constant_propagation_visitor::handle_rvalue(ir_rvalue **rvalue)
+{
+   constant_propagation(rvalue);
+   constant_folding(rvalue);
+}
+
 ir_visitor_status
 ir_constant_propagation_visitor::visit_enter(ir_function_signature *ir)
 {
@@ -243,6 +282,8 @@ ir_constant_propagation_visitor::visit_enter(ir_function_signature *ir)
 ir_visitor_status
 ir_constant_propagation_visitor::visit_leave(ir_assignment *ir)
 {
+  constant_folding(&ir->rhs);
+
    if (this->in_assignee)
       return visit_continue;
 
diff --git a/src/glsl/shader_enums.h b/src/glsl/shader_enums.h
index 3fef7c48d6f..0c64af05fad 100644
--- a/src/glsl/shader_enums.h
+++ b/src/glsl/shader_enums.h
@@ -48,158 +48,6 @@ typedef enum
 #define MESA_SHADER_STAGES (MESA_SHADER_COMPUTE + 1)
 
 /**
- * Bitflags for system values.
- */
-#define SYSTEM_BIT_SAMPLE_ID ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_ID)
-#define SYSTEM_BIT_SAMPLE_POS ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_POS)
-#define SYSTEM_BIT_SAMPLE_MASK_IN ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_MASK_IN)
-/**
- * If the gl_register_file is PROGRAM_SYSTEM_VALUE, the register index will be
- * one of these values.  If a NIR variable's mode is nir_var_system_value, it
- * will be one of these values.
- */
-typedef enum
-{
-   /**
-    * \name Vertex shader system values
-    */
-   /*@{*/
-   /**
-    * OpenGL-style vertex ID.
-    *
-    * Section 2.11.7 (Shader Execution), subsection Shader Inputs, of the
-    * OpenGL 3.3 core profile spec says:
-    *
-    *     "gl_VertexID holds the integer index i implicitly passed by
-    *     DrawArrays or one of the other drawing commands defined in section
-    *     2.8.3."
-    *
-    * Section 2.8.3 (Drawing Commands) of the same spec says:
-    *
-    *     "The commands....are equivalent to the commands with the same base
-    *     name (without the BaseVertex suffix), except that the ith element
-    *     transferred by the corresponding draw call will be taken from
-    *     element indices[i] + basevertex of each enabled array."
-    *
-    * Additionally, the overview in the GL_ARB_shader_draw_parameters spec
-    * says:
-    *
-    *     "In unextended GL, vertex shaders have inputs named gl_VertexID and
-    *     gl_InstanceID, which contain, respectively the index of the vertex
-    *     and instance. The value of gl_VertexID is the implicitly passed
-    *     index of the vertex being processed, which includes the value of
-    *     baseVertex, for those commands that accept it."
-    *
-    * gl_VertexID gets basevertex added in.  This differs from DirectX where
-    * SV_VertexID does \b not get basevertex added in.
-    *
-    * \note
-    * If all system values are available, \c SYSTEM_VALUE_VERTEX_ID will be
-    * equal to \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus
-    * \c SYSTEM_VALUE_BASE_VERTEX.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_BASE_VERTEX
-    */
-   SYSTEM_VALUE_VERTEX_ID,
-
-   /**
-    * Instanced ID as supplied to gl_InstanceID
-    *
-    * Values assigned to gl_InstanceID always begin with zero, regardless of
-    * the value of baseinstance.
-    *
-    * Section 11.1.3.9 (Shader Inputs) of the OpenGL 4.4 core profile spec
-    * says:
-    *
-    *     "gl_InstanceID holds the integer instance number of the current
-    *     primitive in an instanced draw call (see section 10.5)."
-    *
-    * Through a big chain of pseudocode, section 10.5 describes that
-    * baseinstance is not counted by gl_InstanceID.  In that section, notice
-    *
-    *     "If an enabled vertex attribute array is instanced (it has a
-    *     non-zero divisor as specified by VertexAttribDivisor), the element
-    *     index that is transferred to the GL, for all vertices, is given by
-    *
-    *         floor(instance/divisor) + baseinstance
-    *
-    *     If an array corresponding to an attribute required by a vertex
-    *     shader is not enabled, then the corresponding element is taken from
-    *     the current attribute state (see section 10.2)."
-    *
-    * Note that baseinstance is \b not included in the value of instance.
-    */
-   SYSTEM_VALUE_INSTANCE_ID,
-
-   /**
-    * DirectX-style vertex ID.
-    *
-    * Unlike \c SYSTEM_VALUE_VERTEX_ID, this system value does \b not include
-    * the value of basevertex.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_BASE_VERTEX
-    */
-   SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
-
-   /**
-    * Value of \c basevertex passed to \c glDrawElementsBaseVertex and similar
-    * functions.
-    *
-    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
-    */
-   SYSTEM_VALUE_BASE_VERTEX,
-   /*@}*/
-
-   /**
-    * \name Geometry shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_INVOCATION_ID,  /**< (Also in Tessellation Control shader) */
-   /*@}*/
-
-   /**
-    * \name Fragment shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_FRONT_FACE,     /**< (not done yet) */
-   SYSTEM_VALUE_SAMPLE_ID,
-   SYSTEM_VALUE_SAMPLE_POS,
-   SYSTEM_VALUE_SAMPLE_MASK_IN,
-   /*@}*/
-
-   /**
-    * \name Tessellation Evaluation shader system values
-    */
-   /*@{*/
-   SYSTEM_VALUE_TESS_COORD,
-   SYSTEM_VALUE_VERTICES_IN,    /**< Tessellation vertices in input patch */
-   SYSTEM_VALUE_PRIMITIVE_ID,   /**< (currently not used by GS) */
-   SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
-   SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
-   /*@}*/
-
-   SYSTEM_VALUE_MAX             /**< Number of values */
-} gl_system_value;
-
-
-/**
- * The possible interpolation qualifiers that can be applied to a fragment
- * shader input in GLSL.
- *
- * Note: INTERP_QUALIFIER_NONE must be 0 so that memsetting the
- * gl_fragment_program data structure to 0 causes the default behavior.
- */
-enum glsl_interp_qualifier
-{
-   INTERP_QUALIFIER_NONE = 0,
-   INTERP_QUALIFIER_SMOOTH,
-   INTERP_QUALIFIER_FLAT,
-   INTERP_QUALIFIER_NOPERSPECTIVE,
-   INTERP_QUALIFIER_COUNT /**< Number of interpolation qualifiers */
-};
-
-
-/**
  * Indexes for vertex program attributes.
  * GL_NV_vertex_program aliases generic attributes over the conventional
  * attributes.  In GL_ARB_vertex_program shader the aliasing is optional.
@@ -305,7 +153,6 @@ typedef enum
    BITFIELD64_RANGE(VERT_ATTRIB_GENERIC(0), VERT_ATTRIB_GENERIC_MAX)
 /*@}*/
 
-
 /**
  * Indexes for vertex shader outputs, geometry shader inputs/outputs, and
  * fragment shader inputs.
@@ -346,9 +193,6 @@ typedef enum
    VARYING_SLOT_TESS_LEVEL_OUTER, /* Only appears as TCS output. */
    VARYING_SLOT_TESS_LEVEL_INNER, /* Only appears as TCS output. */
    VARYING_SLOT_VAR0, /* First generic varying slot */
-   VARYING_SLOT_MAX = VARYING_SLOT_VAR0 + MAX_VARYING,
-   VARYING_SLOT_PATCH0 = VARYING_SLOT_MAX,
-   VARYING_SLOT_TESS_MAX = VARYING_SLOT_PATCH0 + MAX_VARYING
 } gl_varying_slot;
 
 
@@ -383,9 +227,161 @@ typedef enum
 #define VARYING_BIT_VIEWPORT BITFIELD64_BIT(VARYING_SLOT_VIEWPORT)
 #define VARYING_BIT_FACE BITFIELD64_BIT(VARYING_SLOT_FACE)
 #define VARYING_BIT_PNTC BITFIELD64_BIT(VARYING_SLOT_PNTC)
+#define VARYING_BIT_TESS_LEVEL_OUTER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_OUTER)
+#define VARYING_BIT_TESS_LEVEL_INNER BITFIELD64_BIT(VARYING_SLOT_TESS_LEVEL_INNER)
 #define VARYING_BIT_VAR(V) BITFIELD64_BIT(VARYING_SLOT_VAR0 + (V))
 /*@}*/
 
+/**
+ * Bitflags for system values.
+ */
+#define SYSTEM_BIT_SAMPLE_ID ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_ID)
+#define SYSTEM_BIT_SAMPLE_POS ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_POS)
+#define SYSTEM_BIT_SAMPLE_MASK_IN ((uint64_t)1 << SYSTEM_VALUE_SAMPLE_MASK_IN)
+/**
+ * If the gl_register_file is PROGRAM_SYSTEM_VALUE, the register index will be
+ * one of these values.  If a NIR variable's mode is nir_var_system_value, it
+ * will be one of these values.
+ */
+typedef enum
+{
+   /**
+    * \name Vertex shader system values
+    */
+   /*@{*/
+   /**
+    * OpenGL-style vertex ID.
+    *
+    * Section 2.11.7 (Shader Execution), subsection Shader Inputs, of the
+    * OpenGL 3.3 core profile spec says:
+    *
+    *     "gl_VertexID holds the integer index i implicitly passed by
+    *     DrawArrays or one of the other drawing commands defined in section
+    *     2.8.3."
+    *
+    * Section 2.8.3 (Drawing Commands) of the same spec says:
+    *
+    *     "The commands....are equivalent to the commands with the same base
+    *     name (without the BaseVertex suffix), except that the ith element
+    *     transferred by the corresponding draw call will be taken from
+    *     element indices[i] + basevertex of each enabled array."
+    *
+    * Additionally, the overview in the GL_ARB_shader_draw_parameters spec
+    * says:
+    *
+    *     "In unextended GL, vertex shaders have inputs named gl_VertexID and
+    *     gl_InstanceID, which contain, respectively the index of the vertex
+    *     and instance. The value of gl_VertexID is the implicitly passed
+    *     index of the vertex being processed, which includes the value of
+    *     baseVertex, for those commands that accept it."
+    *
+    * gl_VertexID gets basevertex added in.  This differs from DirectX where
+    * SV_VertexID does \b not get basevertex added in.
+    *
+    * \note
+    * If all system values are available, \c SYSTEM_VALUE_VERTEX_ID will be
+    * equal to \c SYSTEM_VALUE_VERTEX_ID_ZERO_BASE plus
+    * \c SYSTEM_VALUE_BASE_VERTEX.
+    *
+    * \sa SYSTEM_VALUE_VERTEX_ID_ZERO_BASE, SYSTEM_VALUE_BASE_VERTEX
+    */
+   SYSTEM_VALUE_VERTEX_ID,
+
+   /**
+    * Instanced ID as supplied to gl_InstanceID
+    *
+    * Values assigned to gl_InstanceID always begin with zero, regardless of
+    * the value of baseinstance.
+    *
+    * Section 11.1.3.9 (Shader Inputs) of the OpenGL 4.4 core profile spec
+    * says:
+    *
+    *     "gl_InstanceID holds the integer instance number of the current
+    *     primitive in an instanced draw call (see section 10.5)."
+    *
+    * Through a big chain of pseudocode, section 10.5 describes that
+    * baseinstance is not counted by gl_InstanceID.  In that section, notice
+    *
+    *     "If an enabled vertex attribute array is instanced (it has a
+    *     non-zero divisor as specified by VertexAttribDivisor), the element
+    *     index that is transferred to the GL, for all vertices, is given by
+    *
+    *         floor(instance/divisor) + baseinstance
+    *
+    *     If an array corresponding to an attribute required by a vertex
+    *     shader is not enabled, then the corresponding element is taken from
+    *     the current attribute state (see section 10.2)."
+    *
+    * Note that baseinstance is \b not included in the value of instance.
+    */
+   SYSTEM_VALUE_INSTANCE_ID,
+
+   /**
+    * DirectX-style vertex ID.
+    *
+    * Unlike \c SYSTEM_VALUE_VERTEX_ID, this system value does \b not include
+    * the value of basevertex.
+    *
+    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_BASE_VERTEX
+    */
+   SYSTEM_VALUE_VERTEX_ID_ZERO_BASE,
+
+   /**
+    * Value of \c basevertex passed to \c glDrawElementsBaseVertex and similar
+    * functions.
+    *
+    * \sa SYSTEM_VALUE_VERTEX_ID, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE
+    */
+   SYSTEM_VALUE_BASE_VERTEX,
+   /*@}*/
+
+   /**
+    * \name Geometry shader system values
+    */
+   /*@{*/
+   SYSTEM_VALUE_INVOCATION_ID,  /**< (Also in Tessellation Control shader) */
+   /*@}*/
+
+   /**
+    * \name Fragment shader system values
+    */
+   /*@{*/
+   SYSTEM_VALUE_FRONT_FACE,     /**< (not done yet) */
+   SYSTEM_VALUE_SAMPLE_ID,
+   SYSTEM_VALUE_SAMPLE_POS,
+   SYSTEM_VALUE_SAMPLE_MASK_IN,
+   /*@}*/
+
+   /**
+    * \name Tessellation Evaluation shader system values
+    */
+   /*@{*/
+   SYSTEM_VALUE_TESS_COORD,
+   SYSTEM_VALUE_VERTICES_IN,    /**< Tessellation vertices in input patch */
+   SYSTEM_VALUE_PRIMITIVE_ID,   /**< (currently not used by GS) */
+   SYSTEM_VALUE_TESS_LEVEL_OUTER, /**< TES input */
+   SYSTEM_VALUE_TESS_LEVEL_INNER, /**< TES input */
+   /*@}*/
+
+   SYSTEM_VALUE_MAX             /**< Number of values */
+} gl_system_value;
+
+
+/**
+ * The possible interpolation qualifiers that can be applied to a fragment
+ * shader input in GLSL.
+ *
+ * Note: INTERP_QUALIFIER_NONE must be 0 so that memsetting the
+ * gl_fragment_program data structure to 0 causes the default behavior.
+ */
+enum glsl_interp_qualifier
+{
+   INTERP_QUALIFIER_NONE = 0,
+   INTERP_QUALIFIER_SMOOTH,
+   INTERP_QUALIFIER_FLAT,
+   INTERP_QUALIFIER_NOPERSPECTIVE,
+   INTERP_QUALIFIER_COUNT /**< Number of interpolation qualifiers */
+};
 
 /**
  * Fragment program results
@@ -405,8 +401,6 @@ typedef enum
     * any are written, FRAG_RESULT_COLOR will not be written.
     */
    FRAG_RESULT_DATA0 = 4,
-   FRAG_RESULT_MAX = (FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
 } gl_frag_result;
 
-
 #endif /* SHADER_ENUMS_H */
diff --git a/src/glx/dri_common.c b/src/glx/dri_common.c
index 63c8de38c7c..eedcd46a15a 100644
--- a/src/glx/dri_common.c
+++ b/src/glx/dri_common.c
@@ -253,8 +253,7 @@ __ATTRIB(__DRI_ATTRIB_BIND_TO_TEXTURE_RGB, bindToTextureRgb),
 static int
 scalarEqual(struct glx_config *mode, unsigned int attrib, unsigned int value)
 {
-   unsigned int glxValue;
-   int i;
+   unsigned glxValue, i;
 
    for (i = 0; i < ARRAY_SIZE(attribMap); i++)
       if (attribMap[i].attrib == attrib) {
diff --git a/src/glx/dri_common_query_renderer.c b/src/glx/dri_common_query_renderer.c
index b3e107dc612..ebeedc9c037 100644
--- a/src/glx/dri_common_query_renderer.c
+++ b/src/glx/dri_common_query_renderer.c
@@ -56,7 +56,7 @@ static const struct {
 static int
 dri2_convert_glx_query_renderer_attribs(int attribute)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < ARRAY_SIZE(query_renderer_map); i++)
       if (query_renderer_map[i].glx_attrib == attribute)
diff --git a/src/glx/glxext.c b/src/glx/glxext.c
index fdc24d439c1..dc87fb9e16c 100644
--- a/src/glx/glxext.c
+++ b/src/glx/glxext.c
@@ -138,6 +138,9 @@ __glXWireToEvent(Display *dpy, XEvent *event, xEvent *wire)
       if (!glxDraw)
 	 return False;
 
+      aevent->serial = _XSetLastRequestRead(dpy, (xGenericReply *) wire);
+      aevent->send_event = (awire->type & 0x80) != 0;
+      aevent->display = dpy;
       aevent->event_type = awire->event_type;
       aevent->drawable = glxDraw->xDrawable;
       aevent->ust = ((CARD64)awire->ust_hi << 32) | awire->ust_lo;
diff --git a/src/mapi/glapi/gen/GL4x.xml b/src/mapi/glapi/gen/GL4x.xml
index 94ddfb72960..dee50275a57 100644
--- a/src/mapi/glapi/gen/GL4x.xml
+++ b/src/mapi/glapi/gen/GL4x.xml
@@ -44,4 +44,10 @@
   <enum name="DEPTH_STENCIL_TEXTURE_MODE"              value="0x90EA"/>
 </category>
 
+<category name="4.5">
+  <function name="MemoryBarrierByRegion" es2="3.1">
+    <param name="barriers" type="GLbitfield"/>
+  </function>
+</category>
+
 </OpenGLAPI>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 7d9d1a61215..86a92437f16 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -154,6 +154,7 @@ API_XML = \
 	ARB_shader_image_load_store.xml \
 	ARB_shader_subroutine.xml \
 	ARB_sync.xml \
+	ARB_tessellation_shader.xml \
 	ARB_texture_barrier.xml \
 	ARB_texture_buffer_object.xml \
 	ARB_texture_buffer_range.xml \
diff --git a/src/mesa/drivers/dri/common/drirc b/src/mesa/drivers/dri/common/drirc
index 97d961b6597..bb840eaba94 100644
--- a/src/mesa/drivers/dri/common/drirc
+++ b/src/mesa/drivers/dri/common/drirc
@@ -53,10 +53,12 @@ TODO: document the other workarounds.
 
         <application name="Unigine OilRush (32-bit)" executable="OilRush_x86">
             <option name="disable_blend_func_extended" value="true" />
+            <option name="allow_glsl_extension_directive_midshader" value="true" />
 	</application>
 
         <application name="Unigine OilRush (64-bit)" executable="OilRush_x64">
             <option name="disable_blend_func_extended" value="true" />
+            <option name="allow_glsl_extension_directive_midshader" value="true" />
 	</application>
 
         <application name="Savage 2" executable="savage2.bin">
diff --git a/src/mesa/drivers/dri/common/utils.c b/src/mesa/drivers/dri/common/utils.c
index b51b263fe46..43d90d90599 100644
--- a/src/mesa/drivers/dri/common/utils.c
+++ b/src/mesa/drivers/dri/common/utils.c
@@ -452,7 +452,7 @@ int
 driGetConfigAttrib(const __DRIconfig *config,
 		   unsigned int attrib, unsigned int *value)
 {
-    int i;
+    unsigned i;
 
     for (i = 0; i < ARRAY_SIZE(attribMap); i++)
 	if (attribMap[i].attrib == attrib)
diff --git a/src/mesa/drivers/dri/i915/intel_pixel_read.c b/src/mesa/drivers/dri/i915/intel_pixel_read.c
index 149e921218f..e6fa8f2dce8 100644
--- a/src/mesa/drivers/dri/i915/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i915/intel_pixel_read.c
@@ -91,7 +91,7 @@ do_blit_readpixels(struct gl_context * ctx,
 
    if (ctx->_ImageTransferState ||
        !_mesa_format_matches_format_and_type(irb->mt->format, format, type,
-                                             false)) {
+                                             false, NULL)) {
       DBG("%s - bad format for blit\n", __func__);
       return false;
    }
diff --git a/src/mesa/drivers/dri/i915/intel_tex_image.c b/src/mesa/drivers/dri/i915/intel_tex_image.c
index 0a213e9f614..5ab60d16173 100644
--- a/src/mesa/drivers/dri/i915/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i915/intel_tex_image.c
@@ -134,7 +134,7 @@ try_pbo_upload(struct gl_context *ctx,
    }
 
    if (!_mesa_format_matches_format_and_type(intelImage->mt->format,
-                                             format, type, false)) {
+                                             format, type, false, NULL)) {
       DBG("%s: format mismatch (upload to %s with format 0x%x, type 0x%x)\n",
 	  __func__, _mesa_get_format_name(intelImage->mt->format),
 	  format, type);
diff --git a/src/mesa/drivers/dri/i965/brw_conditional_render.c b/src/mesa/drivers/dri/i965/brw_conditional_render.c
index 6d37c3b6928..122a4ecc0f6 100644
--- a/src/mesa/drivers/dri/i965/brw_conditional_render.c
+++ b/src/mesa/drivers/dri/i965/brw_conditional_render.c
@@ -56,6 +56,12 @@ set_predicate_for_result(struct brw_context *brw,
 
    assert(query->bo != NULL);
 
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+    * command when loading the values into the predicate source registers for
+    * conditional rendering.
+    */
+   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_FLUSH_ENABLE);
+
    brw_load_register_mem64(brw,
                            MI_PREDICATE_SRC0,
                            query->bo,
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 328662da82e..0ee5ab2bdee 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -514,7 +514,7 @@ brw_initialize_context_constants(struct brw_context *brw)
       ctx->Const.Program[MESA_SHADER_COMPUTE].MaxImageUniforms =
          BRW_MAX_IMAGES;
       ctx->Const.MaxImageUnits = MAX_IMAGE_UNITS;
-      ctx->Const.MaxCombinedImageUnitsAndFragmentOutputs =
+      ctx->Const.MaxCombinedShaderOutputResources =
          MAX_IMAGE_UNITS + BRW_MAX_DRAW_BUFFERS;
       ctx->Const.MaxImageSamples = 0;
       ctx->Const.MaxCombinedImageUniforms = 3 * BRW_MAX_IMAGES;
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index e092ef4a7c6..e5de4202c01 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -383,7 +383,7 @@ brw_postdraw_set_buffers_need_resolve(struct brw_context *brw)
       brw_render_cache_set_add_bo(brw, stencil_irb->mt->bo);
    }
 
-   for (int i = 0; i < fb->_NumColorDrawBuffers; i++) {
+   for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
       struct intel_renderbuffer *irb =
          intel_renderbuffer(fb->_ColorDrawBuffers[i]);
 
@@ -626,7 +626,7 @@ brw_draw_init(struct brw_context *brw)
 void
 brw_draw_destroy(struct brw_context *brw)
 {
-   int i;
+   unsigned i;
 
    for (i = 0; i < brw->vb.nr_buffers; i++) {
       drm_intel_bo_unreference(brw->vb.buffers[i].bo);
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index cbfd5855410..21d8f1e6994 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -395,7 +395,8 @@ brw_prepare_vertices(struct brw_context *brw)
    GLuint interleaved = 0;
    unsigned int min_index = brw->vb.min_index + brw->basevertex;
    unsigned int max_index = brw->vb.max_index + brw->basevertex;
-   int delta, i, j;
+   unsigned i;
+   int delta, j;
 
    struct brw_vertex_element *upload[VERT_ATTRIB_MAX];
    GLuint nr_uploads = 0;
@@ -418,10 +419,10 @@ brw_prepare_vertices(struct brw_context *brw)
    /* Accumulate the list of enabled arrays. */
    brw->vb.nr_enabled = 0;
    while (vs_inputs) {
-      GLuint i = ffsll(vs_inputs) - 1;
-      struct brw_vertex_element *input = &brw->vb.inputs[i];
+      GLuint index = ffsll(vs_inputs) - 1;
+      struct brw_vertex_element *input = &brw->vb.inputs[index];
 
-      vs_inputs &= ~BITFIELD64_BIT(i);
+      vs_inputs &= ~BITFIELD64_BIT(index);
       brw->vb.enabled[brw->vb.nr_enabled++] = input;
    }
 
@@ -438,7 +439,7 @@ brw_prepare_vertices(struct brw_context *brw)
       if (_mesa_is_bufferobj(glarray->BufferObj)) {
 	 struct intel_buffer_object *intel_buffer =
 	    intel_buffer_object(glarray->BufferObj);
-	 int k;
+	 unsigned k;
 
 	 /* If we have a VB set to be uploaded for this buffer object
 	  * already, reuse that VB state so that we emit fewer
@@ -792,21 +793,6 @@ brw_emit_vertices(struct brw_context *brw)
                     ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
    }
 
-   if (brw->gen >= 6 && gen6_edgeflag_input) {
-      uint32_t format =
-         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
-
-      OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) |
-                GEN6_VE0_VALID |
-                GEN6_VE0_EDGE_FLAG_ENABLE |
-                (format << BRW_VE0_FORMAT_SHIFT) |
-                (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT));
-      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
-   }
-
    if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid) {
       uint32_t dw0 = 0, dw1 = 0;
       uint32_t comp0 = BRW_VE1_COMPONENT_STORE_0;
@@ -847,6 +833,21 @@ brw_emit_vertices(struct brw_context *brw)
       OUT_BATCH(dw1);
    }
 
+   if (brw->gen >= 6 && gen6_edgeflag_input) {
+      uint32_t format =
+         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
+
+      OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) |
+                GEN6_VE0_VALID |
+                GEN6_VE0_EDGE_FLAG_ENABLE |
+                (format << BRW_VE0_FORMAT_SHIFT) |
+                (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT));
+      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
+                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
+                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
+                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
+   }
+
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 0e091ddc227..159f7161e11 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -455,8 +455,8 @@ fs_reg::component_size(unsigned width) const
    return MAX2(width * stride, 1) * type_sz(type);
 }
 
-int
-fs_visitor::type_size(const struct glsl_type *type)
+extern "C" int
+type_size_scalar(const struct glsl_type *type)
 {
    unsigned int size, i;
 
@@ -467,11 +467,11 @@ fs_visitor::type_size(const struct glsl_type *type)
    case GLSL_TYPE_BOOL:
       return type->components();
    case GLSL_TYPE_ARRAY:
-      return type_size(type->fields.array) * type->length;
+      return type_size_scalar(type->fields.array) * type->length;
    case GLSL_TYPE_STRUCT:
       size = 0;
       for (i = 0; i < type->length; i++) {
-	 size += type_size(type->fields.structure[i].type);
+	 size += type_size_scalar(type->fields.structure[i].type);
       }
       return size;
    case GLSL_TYPE_SAMPLER:
@@ -907,7 +907,7 @@ fs_reg
 fs_visitor::vgrf(const glsl_type *const type)
 {
    int reg_width = dispatch_width / 8;
-   return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
+   return fs_reg(GRF, alloc.allocate(type_size_scalar(type) * reg_width),
                  brw_type_for_base_type(type));
 }
 
@@ -944,15 +944,17 @@ fs_visitor::import_uniforms(fs_visitor *v)
 }
 
 void
-fs_visitor::setup_vector_uniform_values(const gl_constant_value *values, unsigned n)
+fs_visitor::setup_vec4_uniform_value(unsigned param_offset,
+                                     const gl_constant_value *values,
+                                     unsigned n)
 {
    static const gl_constant_value zero = { 0 };
 
    for (unsigned i = 0; i < n; ++i)
-      stage_prog_data->param[uniforms++] = &values[i];
+      stage_prog_data->param[param_offset + i] = &values[i];
 
    for (unsigned i = n; i < 4; ++i)
-      stage_prog_data->param[uniforms++] = &zero;
+      stage_prog_data->param[param_offset + i] = &zero;
 }
 
 fs_reg *
@@ -1769,21 +1771,21 @@ fs_visitor::compact_virtual_grfs()
    return progress;
 }
 
-/*
- * Implements array access of uniforms by inserting a
- * PULL_CONSTANT_LOAD instruction.
+/**
+ * Assign UNIFORM file registers to either push constants or pull constants.
  *
- * Unlike temporary GRF array access (where we don't support it due to
- * the difficulty of doing relative addressing on instruction
- * destinations), we could potentially do array access of uniforms
- * that were loaded in GRF space as push constants.  In real-world
- * usage we've seen, though, the arrays being used are always larger
- * than we could load as push constants, so just always move all
- * uniform array access out to a pull constant buffer.
+ * We allow a fragment shader to have more than the specified minimum
+ * maximum number of fragment shader uniform components (64).  If
+ * there are too many of these, they'd fill up all of register space.
+ * So, this will push some of them out to the pull constant buffer and
+ * update the program to load them.  We also use pull constants for all
+ * indirect constant loads because we don't support indirect accesses in
+ * registers yet.
  */
 void
-fs_visitor::move_uniform_array_access_to_pull_constants()
+fs_visitor::assign_constant_locations()
 {
+   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
    if (dispatch_width != 8)
       return;
 
@@ -1820,23 +1822,6 @@ fs_visitor::move_uniform_array_access_to_pull_constants()
          }
       }
    }
-}
-
-/**
- * Assign UNIFORM file registers to either push constants or pull constants.
- *
- * We allow a fragment shader to have more than the specified minimum
- * maximum number of fragment shader uniform components (64).  If
- * there are too many of these, they'd fill up all of register space.
- * So, this will push some of them out to the pull constant buffer and
- * update the program to load them.
- */
-void
-fs_visitor::assign_constant_locations()
-{
-   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
-   if (dispatch_width != 8)
-      return;
 
    /* Find which UNIFORM registers are still in use. */
    bool is_live[uniforms];
@@ -4823,7 +4808,6 @@ fs_visitor::optimize()
 
    split_virtual_grfs();
 
-   move_uniform_array_access_to_pull_constants();
    assign_constant_locations();
    demote_pull_constants();
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 975183e990d..31f39fe0adc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -112,7 +112,6 @@ public:
    void swizzle_result(ir_texture_opcode op, int dest_components,
                        fs_reg orig_val, uint32_t sampler);
 
-   int type_size(const struct glsl_type *type);
    fs_inst *get_instruction_generating_reg(fs_inst *start,
 					   fs_inst *end,
 					   const fs_reg &reg);
@@ -147,7 +146,6 @@ public:
    void spill_reg(int spill_reg);
    void split_virtual_grfs();
    bool compact_virtual_grfs();
-   void move_uniform_array_access_to_pull_constants();
    void assign_constant_locations();
    void demote_pull_constants();
    void invalidate_live_intervals();
@@ -291,8 +289,9 @@ public:
 
    struct brw_reg interp_reg(int location, int channel);
 
-   virtual void setup_vector_uniform_values(const gl_constant_value *values,
-                                            unsigned n);
+   virtual void setup_vec4_uniform_value(unsigned param_offset,
+                                         const gl_constant_value *values,
+                                         unsigned n);
 
    int implied_mrf_writes(fs_inst *inst);
 
@@ -318,9 +317,6 @@ public:
    /** Number of uniform variable components visited. */
    unsigned uniforms;
 
-   /** Total number of direct uniforms we can get from NIR */
-   unsigned num_direct_uniforms;
-
    /** Byte-offset for the next available spot in the scratch space buffer. */
    unsigned last_scratch;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 93a36cc03bf..6272b61a98b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -131,7 +131,7 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
 
       switch (stage) {
       case MESA_SHADER_VERTEX:
-         for (int i = 0; i < ALIGN(type_size(var->type), 4) / 4; i++) {
+         for (int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) {
             int output = var->data.location + i;
             this->outputs[output] = offset(reg, bld, 4 * i);
             this->output_components[output] = vector_elements;
@@ -175,19 +175,9 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
 void
 fs_visitor::nir_setup_uniforms(nir_shader *shader)
 {
-   num_direct_uniforms = shader->num_direct_uniforms;
-
    if (dispatch_width != 8)
       return;
 
-   /* We split the uniform register file in half.  The first half is
-    * entirely direct uniforms.  The second half is indirect.
-    */
-   if (num_direct_uniforms > 0)
-      param_size[0] = num_direct_uniforms;
-   if (shader->num_uniforms > num_direct_uniforms)
-      param_size[num_direct_uniforms] = shader->num_uniforms - num_direct_uniforms;
-
    uniforms = shader->num_uniforms;
 
    if (shader_prog) {
@@ -200,15 +190,19 @@ fs_visitor::nir_setup_uniforms(nir_shader *shader)
             nir_setup_builtin_uniform(var);
          else
             nir_setup_uniform(var);
+
+         param_size[var->data.driver_location] = type_size_scalar(var->type);
       }
    } else {
-      /* prog_to_nir doesn't create uniform variables; set param up directly. */
+      /* prog_to_nir only creates a single giant uniform variable so we can
+       * just set param up directly. */
       for (unsigned p = 0; p < prog->Parameters->NumParameters; p++) {
          for (unsigned int i = 0; i < 4; i++) {
             stage_prog_data->param[4 * p + i] =
                &prog->Parameters->ParameterValues[p][i];
          }
       }
+      param_size[0] = prog->Parameters->NumParameters * 4;
    }
 }
 
@@ -239,15 +233,7 @@ fs_visitor::nir_setup_uniform(nir_variable *var)
       }
 
       if (storage->type->is_image()) {
-         /* Images don't get a valid location assigned by nir_lower_io()
-          * because their size is driver-specific, so we need to allocate
-          * space for them here at the end of the parameter array.
-          */
-         var->data.driver_location = uniforms;
-         param_size[uniforms] =
-            BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1);
-
-         setup_image_uniform_values(storage);
+         setup_image_uniform_values(index, storage);
       } else {
          unsigned slots = storage->type->component_slots();
          if (storage->array_elements)
@@ -1406,6 +1392,51 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_image_size: {
+      /* Get the referenced image variable and type. */
+      const nir_variable *var = instr->variables[0]->var;
+      const glsl_type *type = var->type->without_array();
+
+      /* Get the size of the image. */
+      const fs_reg image = get_nir_image_deref(instr->variables[0]);
+      const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+      /* For 1DArray image types, the array index is stored in the Z component.
+       * Fix this by swizzling the Z component to the Y component.
+       */
+      const bool is_1d_array_image =
+                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
+                  type->sampler_array;
+
+      /* For CubeArray images, we should count the number of cubes instead
+       * of the number of faces. Fix it by dividing the (Z component) by 6.
+       */
+      const bool is_cube_array_image =
+                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
+                  type->sampler_array;
+
+      /* Copy all the components. */
+      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+      for (unsigned c = 0; c < info->dest_components; ++c) {
+         if ((int)c >= type->coordinate_components()) {
+             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                     fs_reg(1));
+         } else if (c == 1 && is_1d_array_image) {
+            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                    offset(size, bld, 2));
+         } else if (c == 2 && is_cube_array_image) {
+            bld.emit(SHADER_OPCODE_INT_QUOTIENT,
+                     offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                     offset(size, bld, c), fs_reg(6));
+         } else {
+            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                    offset(size, bld, c));
+         }
+       }
+
+      break;
+   }
+
    case nir_intrinsic_load_front_face:
       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
               *emit_frontfacing_interpolation());
@@ -1467,21 +1498,13 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       has_indirect = true;
       /* fallthrough */
    case nir_intrinsic_load_uniform: {
-      unsigned index = instr->const_index[0];
-
-      fs_reg uniform_reg;
-      if (index < num_direct_uniforms) {
-         uniform_reg = fs_reg(UNIFORM, 0);
-      } else {
-         uniform_reg = fs_reg(UNIFORM, num_direct_uniforms);
-         index -= num_direct_uniforms;
-      }
+      fs_reg uniform_reg(UNIFORM, instr->const_index[0]);
+      uniform_reg.reg_offset = instr->const_index[1];
 
       for (unsigned j = 0; j < instr->num_components; j++) {
-         fs_reg src = offset(retype(uniform_reg, dest.type), bld, index);
+         fs_reg src = offset(retype(uniform_reg, dest.type), bld, j);
          if (has_indirect)
             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
-         index++;
 
          bld.MOV(dest, src);
          dest = offset(dest, bld, 1);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index b70895ec2ff..6eb988938d4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -156,7 +156,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
    }
 
    uint8_t *ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
-   struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count);
+   struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
    if (devinfo->gen >= 6)
       ra_set_allocate_round_robin(regs);
    int *classes = ralloc_array(compiler, int, class_count);
@@ -232,7 +232,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
             for (int base_reg = j;
                  base_reg < j + (class_sizes[i] + 1) / 2;
                  base_reg++) {
-               ra_add_transitive_reg_conflict(regs, base_reg, reg);
+               ra_add_reg_conflict(regs, base_reg, reg);
             }
 
             reg++;
@@ -246,7 +246,7 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
             for (int base_reg = j;
                  base_reg < j + class_sizes[i];
                  base_reg++) {
-               ra_add_transitive_reg_conflict(regs, base_reg, reg);
+               ra_add_reg_conflict(regs, base_reg, reg);
             }
 
             reg++;
@@ -255,6 +255,12 @@ brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
    }
    assert(reg == ra_reg_count);
 
+   /* Applying transitivity to all of the base registers gives us the
+    * appropreate register conflict relationships everywhere.
+    */
+   for (int reg = 0; reg < base_reg_count; reg++)
+      ra_make_reg_conflicts_transitive(regs, reg);
+
    /* Add a special class for aligned pairs, which we'll put delta_xy
     * in on Gen <= 6 so that we can do PLN.
     */
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index e9d9467d330..27511525bff 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -878,7 +878,8 @@ brw_upload_invariant_state(struct brw_context *brw)
 {
    const bool is_965 = brw->gen == 4 && !brw->is_g4x;
 
-   brw_select_pipeline(brw, BRW_RENDER_PIPELINE);
+   brw_emit_select_pipeline(brw, BRW_RENDER_PIPELINE);
+   brw->last_pipeline = BRW_RENDER_PIPELINE;
 
    if (brw->gen < 6) {
       /* Disable depth offset clamping. */
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 79e31d86759..0276d47c4d4 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -22,6 +22,7 @@
  */
 
 #include "brw_nir.h"
+#include "brw_shader.h"
 #include "glsl/glsl_parser_extras.h"
 #include "glsl/nir/glsl_to_nir.h"
 #include "program/prog_to_nir.h"
@@ -130,22 +131,24 @@ brw_process_nir(nir_shader *nir,
    nir_optimize(nir, is_scalar);
 
    if (is_scalar) {
-      nir_assign_var_locations_direct_first(nir, &nir->uniforms,
-                                            &nir->num_direct_uniforms,
-                                            &nir->num_uniforms,
-                                            is_scalar);
-      nir_assign_var_locations(&nir->outputs, &nir->num_outputs, is_scalar);
+      nir_assign_var_locations(&nir->uniforms,
+                               &nir->num_uniforms,
+                               type_size_scalar);
+      nir_assign_var_locations(&nir->inputs, &nir->num_inputs, type_size_scalar);
+      nir_assign_var_locations(&nir->outputs, &nir->num_outputs, type_size_scalar);
+      nir_lower_io(nir, type_size_scalar);
    } else {
       nir_assign_var_locations(&nir->uniforms,
                                &nir->num_uniforms,
-                               is_scalar);
+                               type_size_vec4);
+
+      nir_assign_var_locations(&nir->inputs, &nir->num_inputs, type_size_vec4);
 
       foreach_list_typed(nir_variable, var, node, &nir->outputs)
          var->data.driver_location = var->data.location;
-   }
-   nir_assign_var_locations(&nir->inputs, &nir->num_inputs, is_scalar);
 
-   nir_lower_io(nir, is_scalar);
+      nir_lower_io(nir, type_size_vec4);
+   }
 
    nir_validate_shader(nir);
 
@@ -153,7 +156,7 @@ brw_process_nir(nir_shader *nir,
    nir_validate_shader(nir);
 
    if (shader_prog) {
-      nir_lower_samplers(nir, shader_prog, stage);
+      nir_lower_samplers(nir, shader_prog);
    } else {
       nir_lower_samplers_for_vk(nir);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_primitive_restart.c b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
index 6ed79d7cb75..c8d90027ea0 100644
--- a/src/mesa/drivers/dri/i965/brw_primitive_restart.c
+++ b/src/mesa/drivers/dri/i965/brw_primitive_restart.c
@@ -91,7 +91,7 @@ can_cut_index_handle_prims(struct gl_context *ctx,
       return false;
    }
 
-   for (int i = 0; i < nr_prims; i++) {
+   for (unsigned i = 0; i < nr_prims; i++) {
       switch (prim[i].mode) {
       case GL_POINTS:
       case GL_LINES:
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index d6b012c392e..a8e5ababba4 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -66,20 +66,11 @@ brw_write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
 void
 brw_write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
 {
-   uint32_t flags;
-
-   flags = (PIPE_CONTROL_WRITE_DEPTH_COUNT |
-            PIPE_CONTROL_DEPTH_STALL);
-
-   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
-    * command when loading the values into the predicate source registers for
-    * conditional rendering.
-    */
-   if (brw->predicate.supported)
-      flags |= PIPE_CONTROL_FLUSH_ENABLE;
-
-   brw_emit_pipe_control_write(brw, flags, query_bo,
-                               idx * sizeof(uint64_t), 0, 0);
+   brw_emit_pipe_control_write(brw,
+                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
+                               PIPE_CONTROL_DEPTH_STALL,
+                               query_bo, idx * sizeof(uint64_t),
+                               0, 0);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 67b8dde7cc8..0007e5c07a5 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -122,7 +122,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
    compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = true;
    compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
 
-   if (compiler->scalar_vs || brw_env_var_as_boolean("INTEL_USE_NIR", false)) {
+   if (compiler->scalar_vs || brw_env_var_as_boolean("INTEL_USE_NIR", true)) {
       if (compiler->scalar_vs) {
          /* If we're using the scalar backend for vertex shaders, we need to
           * configure these accordingly.
@@ -135,7 +135,7 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
       compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions = nir_options;
    }
 
-   if (brw_env_var_as_boolean("INTEL_USE_NIR", false)) {
+   if (brw_env_var_as_boolean("INTEL_USE_NIR", true)) {
       compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].NirOptions = nir_options;
    }
 
@@ -1421,7 +1421,8 @@ backend_shader::assign_common_binding_table_offsets(uint32_t next_binding_table_
 }
 
 void
-backend_shader::setup_image_uniform_values(const gl_uniform_storage *storage)
+backend_shader::setup_image_uniform_values(unsigned param_offset,
+                                           const gl_uniform_storage *storage)
 {
    const unsigned stage = _mesa_program_enum_to_shader_stage(prog->Target);
 
@@ -1432,18 +1433,19 @@ backend_shader::setup_image_uniform_values(const gl_uniform_storage *storage)
       /* Upload the brw_image_param structure.  The order is expected to match
        * the BRW_IMAGE_PARAM_*_OFFSET defines.
        */
-      setup_vector_uniform_values(
+      setup_vec4_uniform_value(param_offset + BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET,
          (const gl_constant_value *)&param->surface_idx, 1);
-      setup_vector_uniform_values(
+      setup_vec4_uniform_value(param_offset + BRW_IMAGE_PARAM_OFFSET_OFFSET,
          (const gl_constant_value *)param->offset, 2);
-      setup_vector_uniform_values(
+      setup_vec4_uniform_value(param_offset + BRW_IMAGE_PARAM_SIZE_OFFSET,
          (const gl_constant_value *)param->size, 3);
-      setup_vector_uniform_values(
+      setup_vec4_uniform_value(param_offset + BRW_IMAGE_PARAM_STRIDE_OFFSET,
          (const gl_constant_value *)param->stride, 4);
-      setup_vector_uniform_values(
+      setup_vec4_uniform_value(param_offset + BRW_IMAGE_PARAM_TILING_OFFSET,
          (const gl_constant_value *)param->tiling, 3);
-      setup_vector_uniform_values(
+      setup_vec4_uniform_value(param_offset + BRW_IMAGE_PARAM_SWIZZLING_OFFSET,
          (const gl_constant_value *)param->swizzling, 2);
+      param_offset += BRW_IMAGE_PARAM_SIZE;
 
       brw_mark_surface_used(
          stage_prog_data,
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 2cc97f24972..ccccf4d6938 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -270,9 +270,11 @@ public:
 
    virtual void invalidate_live_intervals() = 0;
 
-   virtual void setup_vector_uniform_values(const gl_constant_value *values,
-                                            unsigned n) = 0;
-   void setup_image_uniform_values(const gl_uniform_storage *storage);
+   virtual void setup_vec4_uniform_value(unsigned param_offset,
+                                         const gl_constant_value *values,
+                                         unsigned n) = 0;
+   void setup_image_uniform_values(unsigned param_offset,
+                                   const gl_uniform_storage *storage);
 };
 
 uint32_t brw_texture_offset(int *offsets, unsigned num_components);
@@ -307,6 +309,9 @@ bool brw_cs_precompile(struct gl_context *ctx,
                        struct gl_shader_program *shader_prog,
                        struct gl_program *prog);
 
+int type_size_scalar(const struct glsl_type *type);
+int type_size_vec4(const struct glsl_type *type);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 5effb4c8829..e817ecfb8cc 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -208,7 +208,7 @@ brw_lookup_prog(const struct brw_cache *cache,
                 const void *data, unsigned data_size)
 {
    const struct brw_context *brw = cache->brw;
-   int i;
+   unsigned i;
    const struct brw_cache_item *item;
 
    for (i = 0; i < cache->size; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index b8b03932065..e96732a1908 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -555,7 +555,7 @@ brw_miptree_layout_texture_array(struct brw_context *brw,
       if (mt->compressed)
          img_height /= mt->align_h;
 
-      for (int q = 0; q < mt->level[level].depth; q++) {
+      for (unsigned q = 0; q < mt->level[level].depth; q++) {
          if (mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
             intel_miptree_set_image_offset(mt, level, q, 0, q * img_height);
          } else {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 341c516b39a..673a29e4b7f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -177,8 +177,9 @@ public:
    void fail(const char *msg, ...);
 
    void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
-   virtual void setup_vector_uniform_values(const gl_constant_value *values,
-                                            unsigned n);
+   virtual void setup_vec4_uniform_value(unsigned param_offset,
+                                         const gl_constant_value *values,
+                                         unsigned n);
    void setup_uniform_values(ir_variable *ir);
    void setup_builtin_uniform_values(ir_variable *ir);
    int setup_uniforms(int payload_reg);
@@ -409,7 +410,6 @@ public:
 
    void visit_atomic_counter_intrinsic(ir_call *ir);
 
-   int type_size(const struct glsl_type *type);
    bool is_high_sampler(src_reg sampler);
 
    virtual void emit_nir_code();
@@ -447,7 +447,6 @@ public:
    dst_reg *nir_locals;
    dst_reg *nir_ssa_values;
    src_reg *nir_inputs;
-   unsigned *nir_uniform_driver_location;
    dst_reg *nir_system_values;
 
 protected:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
index d85fb6f31ec..8a8dd571e74 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -44,7 +44,7 @@ vec4_gs_visitor::nir_setup_inputs(nir_shader *shader)
           */
          assert(var->type->length > 0);
          int length = var->type->length;
-         int size = type_size(var->type) / length;
+         int size = type_size_vec4(var->type) / length;
          for (int i = 0; i < length; i++) {
             int location = var->data.location + i * BRW_VARYING_SLOT_COUNT;
             for (int j = 0; j < size; j++) {
@@ -55,7 +55,7 @@ vec4_gs_visitor::nir_setup_inputs(nir_shader *shader)
             }
          }
       } else {
-         int size = type_size(var->type);
+         int size = type_size_vec4(var->type);
          for (int i = 0; i < size; i++) {
             src_reg src = src_reg(ATTR, var->data.location + i, var->type);
             src = retype(src, brw_type_for_base_type(var->type));
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index fd3d556bfb6..d5a24d823b5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -119,7 +119,7 @@ vec4_visitor::nir_setup_inputs(nir_shader *shader)
 
    foreach_list_typed(nir_variable, var, node, &shader->inputs) {
       int offset = var->data.driver_location;
-      unsigned size = type_size(var->type);
+      unsigned size = type_size_vec4(var->type);
       for (unsigned i = 0; i < size; i++) {
          src_reg src = src_reg(ATTR, var->data.location + i, var->type);
          nir_inputs[offset + i] = src;
@@ -132,20 +132,17 @@ vec4_visitor::nir_setup_uniforms(nir_shader *shader)
 {
    uniforms = 0;
 
-   nir_uniform_driver_location =
-      rzalloc_array(mem_ctx, unsigned, this->uniform_array_size);
-
    if (shader_prog) {
       foreach_list_typed(nir_variable, var, node, &shader->uniforms) {
          /* UBO's, atomics and samplers don't take up space in the
             uniform file */
          if (var->interface_type != NULL || var->type->contains_atomic() ||
-             type_size(var->type) == 0) {
+             type_size_vec4(var->type) == 0) {
             continue;
          }
 
          assert(uniforms < uniform_array_size);
-         this->uniform_size[uniforms] = type_size(var->type);
+         this->uniform_size[uniforms] = type_size_vec4(var->type);
 
          if (strncmp(var->name, "gl_", 3) == 0)
             nir_setup_builtin_uniform(var);
@@ -161,7 +158,7 @@ vec4_visitor::nir_setup_uniforms(nir_shader *shader)
              strcmp(var->name, "parameters") == 0);
 
       assert(uniforms < uniform_array_size);
-      this->uniform_size[uniforms] = type_size(var->type);
+      this->uniform_size[uniforms] = type_size_vec4(var->type);
 
       struct gl_program_parameter_list *plist = prog->Parameters;
       for (unsigned p = 0; p < plist->NumParameters; p++) {
@@ -182,7 +179,6 @@ vec4_visitor::nir_setup_uniforms(nir_shader *shader)
             stage_prog_data->param[uniforms * 4 + i] = &zero;
          }
 
-         nir_uniform_driver_location[uniforms] = var->data.driver_location;
          uniforms++;
       }
    }
@@ -230,7 +226,6 @@ vec4_visitor::nir_setup_uniform(nir_variable *var)
              stage_prog_data->param[uniforms * 4 + i] = &zero;
           }
 
-          nir_uniform_driver_location[uniforms] = var->data.driver_location;
           uniforms++;
        }
     }
@@ -263,7 +258,6 @@ vec4_visitor::nir_setup_builtin_uniform(nir_variable *var)
          (var->type->is_scalar() || var->type->is_vector() ||
           var->type->is_matrix() ? var->type->vector_elements : 4);
 
-      nir_uniform_driver_location[uniforms] = var->data.driver_location;
       uniforms++;
    }
 }
@@ -458,13 +452,28 @@ vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
    dst_reg reg = dst_reg(GRF, alloc.allocate(1));
    reg.type =  BRW_REGISTER_TYPE_F;
 
+   unsigned remaining = brw_writemask_for_size(instr->def.num_components);
+
    /* @FIXME: consider emitting vector operations to save some MOVs in
     * cases where the components are representable in 8 bits.
-    * By now, we emit a MOV for each component.
+    * For now, we emit a MOV for each distinct value.
     */
-   for (unsigned i = 0; i < instr->def.num_components; ++i) {
-      reg.writemask = 1 << i;
+   for (unsigned i = 0; i < instr->def.num_components; i++) {
+      unsigned writemask = 1 << i;
+
+      if ((remaining & writemask) == 0)
+         continue;
+
+      for (unsigned j = i; j < instr->def.num_components; j++) {
+         if (instr->value.u[i] == instr->value.u[j]) {
+            writemask |= 1 << j;
+         }
+      }
+
+      reg.writemask = writemask;
       emit(MOV(reg, src_reg(instr->value.f[i])));
+
+      remaining &= ~writemask;
    }
 
    /* Set final writemask */
@@ -555,24 +564,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       has_indirect = true;
       /* fallthrough */
    case nir_intrinsic_load_uniform: {
-      int uniform = instr->const_index[0];
-
       dest = get_nir_dest(instr->dest);
 
-      if (has_indirect) {
-         /* Split addressing into uniform and offset */
-         int offset = uniform - nir_uniform_driver_location[uniform];
-         assert(offset >= 0);
+      src = src_reg(dst_reg(UNIFORM, instr->const_index[0]));
+      src.reg_offset = instr->const_index[1];
 
-         uniform -= offset;
-         assert(uniform >= 0);
-
-         src = src_reg(dst_reg(UNIFORM, uniform));
-         src.reg_offset = offset;
+      if (has_indirect) {
          src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1);
          src.reladdr = new(mem_ctx) src_reg(tmp);
-      } else {
-         src = src_reg(dst_reg(UNIFORM, uniform));
       }
 
       emit(MOV(dest, src));
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
index 617c9889cad..62ed7084883 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -115,7 +115,7 @@ brw_vec4_alloc_reg_set(struct brw_compiler *compiler)
    ralloc_free(compiler->vec4_reg_set.ra_reg_to_grf);
    compiler->vec4_reg_set.ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
    ralloc_free(compiler->vec4_reg_set.regs);
-   compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, ra_reg_count);
+   compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
    if (compiler->devinfo->gen >= 6)
       ra_set_allocate_round_robin(compiler->vec4_reg_set.regs);
    ralloc_free(compiler->vec4_reg_set.classes);
@@ -140,7 +140,7 @@ brw_vec4_alloc_reg_set(struct brw_compiler *compiler)
 	 for (int base_reg = j;
 	      base_reg < j + class_sizes[i];
 	      base_reg++) {
-	    ra_add_transitive_reg_conflict(compiler->vec4_reg_set.regs, base_reg, reg);
+	    ra_add_reg_conflict(compiler->vec4_reg_set.regs, base_reg, reg);
 	 }
 
 	 reg++;
@@ -158,6 +158,9 @@ brw_vec4_alloc_reg_set(struct brw_compiler *compiler)
    }
    assert(reg == ra_reg_count);
 
+   for (int reg = 0; reg < base_reg_count; reg++)
+      ra_make_reg_conflicts_transitive(compiler->vec4_reg_set.regs, reg);
+
    ra_set_finalize(compiler->vec4_reg_set.regs, q_values);
 
    for (int i = 0; i < MAX_VGRF_SIZE; i++)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 20b628e9192..499f6288aee 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -597,8 +597,8 @@ vec4_visitor::visit_instructions(const exec_list *list)
  * This method is useful to calculate how much register space is needed to
  * store a particular type.
  */
-int
-vec4_visitor::type_size(const struct glsl_type *type)
+extern "C" int
+type_size_vec4(const struct glsl_type *type)
 {
    unsigned int i;
    int size;
@@ -620,11 +620,11 @@ vec4_visitor::type_size(const struct glsl_type *type)
       }
    case GLSL_TYPE_ARRAY:
       assert(type->length > 0);
-      return type_size(type->fields.array) * type->length;
+      return type_size_vec4(type->fields.array) * type->length;
    case GLSL_TYPE_STRUCT:
       size = 0;
       for (i = 0; i < type->length; i++) {
-	 size += type_size(type->fields.structure[i].type);
+	 size += type_size_vec4(type->fields.structure[i].type);
       }
       return size;
    case GLSL_TYPE_SUBROUTINE:
@@ -655,7 +655,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
    init();
 
    this->file = GRF;
-   this->reg = v->alloc.allocate(v->type_size(type));
+   this->reg = v->alloc.allocate(type_size_vec4(type));
 
    if (type->is_array() || type->is_record()) {
       this->swizzle = BRW_SWIZZLE_NOOP;
@@ -673,7 +673,7 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
    init();
 
    this->file = GRF;
-   this->reg = v->alloc.allocate(v->type_size(type) * size);
+   this->reg = v->alloc.allocate(type_size_vec4(type) * size);
 
    this->swizzle = BRW_SWIZZLE_NOOP;
 
@@ -685,7 +685,7 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
    init();
 
    this->file = GRF;
-   this->reg = v->alloc.allocate(v->type_size(type));
+   this->reg = v->alloc.allocate(type_size_vec4(type));
 
    if (type->is_array() || type->is_record()) {
       this->writemask = WRITEMASK_XYZW;
@@ -697,18 +697,21 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
 }
 
 void
-vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
-                                          unsigned n)
+vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
+                                       const gl_constant_value *values,
+                                       unsigned n)
 {
    static const gl_constant_value zero = { 0 };
 
+   assert(param_offset % 4 == 0);
+
    for (unsigned i = 0; i < n; ++i)
-      stage_prog_data->param[4 * uniforms + i] = &values[i];
+      stage_prog_data->param[param_offset + i] = &values[i];
 
    for (unsigned i = n; i < 4; ++i)
-      stage_prog_data->param[4 * uniforms + i] = &zero;
+      stage_prog_data->param[param_offset + i] = &zero;
 
-   uniform_vector_size[uniforms++] = n;
+   uniform_vector_size[param_offset / 4] = n;
 }
 
 /* Our support for uniforms is piggy-backed on the struct
@@ -744,9 +747,12 @@ vec4_visitor::setup_uniform_values(ir_variable *ir)
                                      storage->type->matrix_columns);
       const unsigned vector_size = storage->type->vector_elements;
 
-      for (unsigned s = 0; s < vector_count; s++)
-         setup_vector_uniform_values(&storage->storage[s * vector_size],
-                                     vector_size);
+      for (unsigned s = 0; s < vector_count; s++) {
+         setup_vec4_uniform_value(uniforms * 4,
+                                  &storage->storage[s * vector_size],
+                                  vector_size);
+         uniforms++;
+      }
    }
 }
 
@@ -1070,7 +1076,7 @@ vec4_visitor::visit(ir_variable *ir)
       assert(ir->data.location != -1);
       reg = new(mem_ctx) dst_reg(this, ir->type);
 
-      for (int i = 0; i < type_size(ir->type); i++) {
+      for (int i = 0; i < type_size_vec4(ir->type); i++) {
 	 output_reg[ir->data.location + i] = *reg;
 	 output_reg[ir->data.location + i].reg_offset = i;
 	 output_reg_annotation[ir->data.location + i] = ir->name;
@@ -1092,14 +1098,14 @@ vec4_visitor::visit(ir_variable *ir)
        * Some uniforms, such as samplers and atomic counters, have no actual
        * storage, so we should ignore them.
        */
-      if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
+      if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
          return;
 
       /* Track how big the whole uniform variable is, in case we need to put a
        * copy of its data into pull constants for array access.
        */
       assert(this->uniforms < uniform_array_size);
-      this->uniform_size[this->uniforms] = type_size(ir->type);
+      this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
 
       if (!strncmp(ir->name, "gl_", 3)) {
 	 setup_builtin_uniform_values(ir);
@@ -2052,7 +2058,7 @@ vec4_visitor::compute_array_stride(ir_dereference_array *ir)
    /* Under normal circumstances array elements are stored consecutively, so
     * the stride is equal to the size of the array element.
     */
-   return type_size(ir->type);
+   return type_size_vec4(ir->type);
 }
 
 
@@ -2121,7 +2127,7 @@ vec4_visitor::visit(ir_dereference_record *ir)
    for (i = 0; i < struct_type->length; i++) {
       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
 	 break;
-      offset += type_size(struct_type->fields.structure[i].type);
+      offset += type_size_vec4(struct_type->fields.structure[i].type);
    }
 
    /* If the type is smaller than a vec4, replicate the last channel out. */
@@ -2330,7 +2336,7 @@ vec4_visitor::visit(ir_assignment *ir)
       emit_bool_to_cond_code(ir->condition, &predicate);
    }
 
-   for (i = 0; i < type_size(ir->lhs->type); i++) {
+   for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
       vec4_instruction *inst = emit(MOV(dst, src));
       inst->predicate = predicate;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index 72e37d4b467..fd7e56e50d5 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -56,7 +56,7 @@ brw_upload_pull_constants(struct brw_context *brw,
                           const struct brw_stage_prog_data *prog_data,
                           bool dword_pitch)
 {
-   int i;
+   unsigned i;
    uint32_t surf_index = prog_data->binding_table.pull_constants_start;
 
    if (!prog_data->nr_pull_params) {
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 0cd439046b3..cd0b56ba60c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -47,7 +47,7 @@ brw_color_buffer_write_enabled(struct brw_context *brw)
    struct gl_context *ctx = &brw->ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
    const struct gl_fragment_program *fp = brw->fragment_program;
-   int i;
+   unsigned i;
 
    /* _NEW_BUFFERS */
    for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index f13a97ce2b0..8213f4ea2fb 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -898,7 +898,7 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
    uint32_t *surf_offsets =
       &stage_state->surf_offset[prog_data->binding_table.ubo_start];
 
-   for (int i = 0; i < shader->NumUniformBlocks; i++) {
+   for (unsigned i = 0; i < shader->NumUniformBlocks; i++) {
       struct gl_uniform_buffer_binding *binding;
       struct intel_buffer_object *intel_bo;
 
@@ -958,7 +958,7 @@ brw_upload_abo_surfaces(struct brw_context *brw,
    uint32_t *surf_offsets =
       &stage_state->surf_offset[prog_data->binding_table.abo_start];
 
-   for (int i = 0; i < prog->NumAtomicBuffers; i++) {
+   for (unsigned i = 0; i < prog->NumAtomicBuffers; i++) {
       struct gl_atomic_buffer_binding *binding =
          &ctx->AtomicBufferBindings[prog->AtomicBuffers[i].Binding];
       struct intel_buffer_object *intel_bo =
@@ -1117,7 +1117,7 @@ update_texture_image_param(struct brw_context *brw,
                      minify(mt->logical_depth0, u->Level) :
                      mt->logical_depth0);
 
-   intel_miptree_get_image_offset(mt, u->Level, u->Layer,
+   intel_miptree_get_image_offset(mt, u->Level, u->_Layer,
                                   &param->offset[0],
                                   &param->offset[1]);
 
@@ -1202,7 +1202,7 @@ update_image_surface(struct brw_context *brw,
                access != GL_READ_ONLY);
 
          } else {
-            const unsigned min_layer = obj->MinLayer + u->Layer;
+            const unsigned min_layer = obj->MinLayer + u->_Layer;
             const unsigned min_level = obj->MinLevel + u->Level;
             const unsigned num_layers = (!u->Layered ? 1 :
                                          obj->Target == GL_TEXTURE_CUBE_MAP ? 6 :
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index 35d10ef8779..6653a6d759b 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -68,7 +68,7 @@ gen6_upload_push_constants(struct brw_context *brw,
       _mesa_load_state_parameters(ctx, prog->Parameters);
 
       gl_constant_value *param;
-      int i;
+      unsigned i;
 
       param = brw_state_batch(brw, type,
 			      prog_data->nr_params * sizeof(gl_constant_value),
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index 41573a80a52..8cd2fc4b48a 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -116,7 +116,7 @@ gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
     * command is feels strange -- each dword pair contains a SO_DECL per stream.
     */
-   for (int i = 0; i < linked_xfb_info->NumOutputs; i++) {
+   for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
       int buffer = linked_xfb_info->Outputs[i].OutputBuffer;
       uint16_t decl = 0;
       int varying = linked_xfb_info->Outputs[i].OutputRegister;
diff --git a/src/mesa/drivers/dri/i965/gen8_draw_upload.c b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
index 1af90ecc6a4..1b48643e577 100644
--- a/src/mesa/drivers/dri/i965/gen8_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
@@ -40,16 +40,25 @@ gen8_emit_vertices(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+   bool uses_edge_flag;
 
    brw_prepare_vertices(brw);
    brw_prepare_shader_draw_parameters(brw);
 
+   uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
+                     ctx->Polygon.BackMode != GL_FILL);
+
    if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid) {
       unsigned vue = brw->vb.nr_enabled;
 
-      WARN_ONCE(brw->vs.prog_data->inputs_read & VERT_BIT_EDGEFLAG,
-                "Using VID/IID with edgeflags, need to reorder the "
-                "vertex attributes");
+      /* The element for the edge flags must always be last, so we have to
+       * insert the SGVS before it in that case.
+       */
+      if (uses_edge_flag) {
+         assert(vue > 0);
+         vue--;
+      }
+
       WARN_ONCE(vue >= 33,
                 "Trying to insert VID/IID past 33rd vertex element, "
                 "need to reorder the vertex attrbutes.");
@@ -74,7 +83,7 @@ gen8_emit_vertices(struct brw_context *brw)
 
       BEGIN_BATCH(3);
       OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
-      OUT_BATCH(brw->vb.nr_buffers | GEN8_VF_INSTANCING_ENABLE);
+      OUT_BATCH(vue | GEN8_VF_INSTANCING_ENABLE);
       OUT_BATCH(0);
       ADVANCE_BATCH();
    } else {
@@ -138,7 +147,18 @@ gen8_emit_vertices(struct brw_context *brw)
       ADVANCE_BATCH();
    }
 
-   unsigned nr_elements = brw->vb.nr_enabled + brw->vs.prog_data->uses_vertexid;
+   /* Normally we don't need an element for the SGVS attribute because the
+    * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an
+    * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if the
+    * vertex ID is used then it needs an element for the base vertex buffer.
+    * Additionally if there is an edge flag element then the SGVS can't be
+    * inserted past that so we need a dummy element to ensure that the edge
+    * flag is the last one.
+    */
+   bool needs_sgvs_element = (brw->vs.prog_data->uses_vertexid ||
+                              (brw->vs.prog_data->uses_instanceid &&
+                               uses_edge_flag));
+   unsigned nr_elements = brw->vb.nr_enabled + needs_sgvs_element;
 
    /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
     * presumably for VertexID/InstanceID.
@@ -192,6 +212,24 @@ gen8_emit_vertices(struct brw_context *brw)
                 (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
    }
 
+   if (needs_sgvs_element) {
+      if (brw->vs.prog_data->uses_vertexid) {
+         OUT_BATCH(GEN6_VE0_VALID |
+                   brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT |
+                   BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
+         OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
+      } else {
+         OUT_BATCH(GEN6_VE0_VALID);
+         OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
+                   (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
+      }
+   }
+
    if (gen6_edgeflag_input) {
       uint32_t format =
          brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
@@ -206,25 +244,26 @@ gen8_emit_vertices(struct brw_context *brw)
                 (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
                 (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
    }
-
-   if (brw->vs.prog_data->uses_vertexid) {
-      OUT_BATCH(GEN6_VE0_VALID |
-                brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT |
-                BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT);
-      OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
-                (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT));
-   }
    ADVANCE_BATCH();
 
-   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
+   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
       const struct brw_vertex_element *input = brw->vb.enabled[i];
       const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
+      unsigned element_index;
+
+      /* The edge flag element is reordered to be the last one in the code
+       * above so we need to compensate for that in the element indices used
+       * below.
+       */
+      if (input == gen6_edgeflag_input)
+         element_index = nr_elements - 1;
+      else
+         element_index = j++;
 
       BEGIN_BATCH(3);
       OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2));
-      OUT_BATCH(i | (buffer->step_rate ? GEN8_VF_INSTANCING_ENABLE : 0));
+      OUT_BATCH(element_index |
+                (buffer->step_rate ? GEN8_VF_INSTANCING_ENABLE : 0));
       OUT_BATCH(buffer->step_rate);
       ADVANCE_BATCH();
    }
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 3bc28a12026..1a246d3ea3a 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -331,6 +331,7 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.ARB_gpu_shader5 = true;
       ctx->Extensions.ARB_shader_atomic_counters = true;
       ctx->Extensions.ARB_shader_image_load_store = true;
+      ctx->Extensions.ARB_shader_image_size = true;
       ctx->Extensions.ARB_texture_compression_bptc = true;
       ctx->Extensions.ARB_texture_view = true;
 
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 72648b01e33..64d57e8bc89 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -662,7 +662,7 @@ intel_validate_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb)
    struct intel_renderbuffer *stencilRb =
       intel_get_renderbuffer(fb, BUFFER_STENCIL);
    struct intel_mipmap_tree *depth_mt = NULL, *stencil_mt = NULL;
-   int i;
+   unsigned i;
 
    DBG("%s() on fb %p (%s)\n", __func__,
        fb, (fb == ctx->DrawBuffer ? "drawbuffer" :
@@ -797,7 +797,7 @@ intel_blit_framebuffer_with_blitter(struct gl_context *ctx,
    intel_prepare_render(brw);
 
    if (mask & GL_COLOR_BUFFER_BIT) {
-      GLint i;
+      unsigned i;
       struct gl_renderbuffer *src_rb = readFb->_ColorReadBuffer;
       struct intel_renderbuffer *src_irb = intel_renderbuffer(src_rb);
 
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index a164c6985dc..5911b444454 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -122,7 +122,7 @@ aub_dump_bmp(struct gl_context *ctx)
 {
    struct gl_framebuffer *fb = ctx->DrawBuffer;
 
-   for (int i = 0; i < fb->_NumColorDrawBuffers; i++) {
+   for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
       struct intel_renderbuffer *irb =
 	 intel_renderbuffer(fb->_ColorDrawBuffers[i]);
 
@@ -1219,7 +1219,7 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
    __DRIconfig **configs = NULL;
 
    /* Generate singlesample configs without accumulation buffer. */
-   for (int i = 0; i < ARRAY_SIZE(formats); i++) {
+   for (unsigned i = 0; i < ARRAY_SIZE(formats); i++) {
       __DRIconfig **new_configs;
       int num_depth_stencil_bits = 2;
 
@@ -1256,7 +1256,7 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
    /* Generate the minimum possible set of configs that include an
     * accumulation buffer.
     */
-   for (int i = 0; i < ARRAY_SIZE(formats); i++) {
+   for (unsigned i = 0; i < ARRAY_SIZE(formats); i++) {
       __DRIconfig **new_configs;
 
       if (formats[i] == MESA_FORMAT_B5G6R5_UNORM) {
@@ -1288,7 +1288,7 @@ intel_screen_make_configs(__DRIscreen *dri_screen)
     * supported.  Singlebuffer configs are not supported because no one wants
     * them.
     */
-   for (int i = 0; i < ARRAY_SIZE(formats); i++) {
+   for (unsigned i = 0; i < ARRAY_SIZE(formats); i++) {
       if (devinfo->gen < 6)
          break;
 
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 1cdea937f91..e17b41ce55c 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -866,8 +866,8 @@ _mesa_init_buffer_objects( struct gl_context *ctx )
       _mesa_reference_buffer_object(ctx,
 				    &ctx->AtomicBufferBindings[i].BufferObject,
 				    ctx->Shared->NullBufferObj);
-      ctx->AtomicBufferBindings[i].Offset = -1;
-      ctx->AtomicBufferBindings[i].Size = -1;
+      ctx->AtomicBufferBindings[i].Offset = 0;
+      ctx->AtomicBufferBindings[i].Size = 0;
    }
 }
 
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 888c461d1c2..be542dd6797 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -402,10 +402,6 @@ one_time_init( struct gl_context *ctx )
 		     PACKAGE_VERSION, __DATE__, __TIME__);
       }
 #endif
-
-#ifdef DEBUG
-      _mesa_test_formats();
-#endif
    }
 
    /* per-API one-time init */
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index d934d19c3e7..4a3c231e36f 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -50,6 +50,7 @@ enum {
    ES1 = 1 << API_OPENGLES,
    ES2 = 1 << API_OPENGLES2,
    ES3 = 1 << (API_OPENGL_LAST + 1),
+   ES31 = 1 << (API_OPENGL_LAST + 2),
 };
 
 /**
@@ -152,6 +153,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_shader_atomic_counters",              o(ARB_shader_atomic_counters),              GL,             2011 },
    { "GL_ARB_shader_bit_encoding",                 o(ARB_shader_bit_encoding),                 GL,             2010 },
    { "GL_ARB_shader_image_load_store",             o(ARB_shader_image_load_store),             GL,             2011 },
+   { "GL_ARB_shader_image_size",                   o(ARB_shader_image_size),                   GL,             2012 },
    { "GL_ARB_shader_objects",                      o(dummy_true),                              GL,             2002 },
    { "GL_ARB_shader_precision",                    o(ARB_shader_precision),                    GL,             2010 },
    { "GL_ARB_shader_stencil_export",               o(ARB_shader_stencil_export),               GL,             2009 },
@@ -773,6 +775,8 @@ _mesa_make_extension_string(struct gl_context *ctx)
    unsigned api_set = (1 << ctx->API);
    if (_mesa_is_gles3(ctx))
       api_set |= ES3;
+   if (_mesa_is_gles31(ctx))
+      api_set |= ES31;
 
    /* Check if the MESA_EXTENSION_MAX_YEAR env var is set */
    {
@@ -854,6 +858,8 @@ _mesa_get_extension_count(struct gl_context *ctx)
    unsigned api_set = (1 << ctx->API);
    if (_mesa_is_gles3(ctx))
       api_set |= ES3;
+   if (_mesa_is_gles31(ctx))
+      api_set |= ES31;
 
    /* only count once */
    if (ctx->Extensions.Count != 0)
@@ -880,6 +886,8 @@ _mesa_get_enabled_extension(struct gl_context *ctx, GLuint index)
    unsigned api_set = (1 << ctx->API);
    if (_mesa_is_gles3(ctx))
       api_set |= ES3;
+   if (_mesa_is_gles31(ctx))
+      api_set |= ES31;
 
    base = (GLboolean*) &ctx->Extensions;
    n = 0;
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 841834030df..07db1950bbb 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -2033,6 +2033,16 @@ renderbuffer_storage(struct gl_context *ctx, struct gl_renderbuffer *rb,
        */
       sample_count_error = _mesa_check_sample_count(ctx, GL_RENDERBUFFER,
             internalFormat, samples);
+
+      /* Section 2.5 (GL Errors) of OpenGL 3.0 specification, page 16:
+       *
+       * "If a negative number is provided where an argument of type sizei or
+       * sizeiptr is specified, the error INVALID VALUE is generated."
+       */
+      if (samples < 0) {
+         sample_count_error = GL_INVALID_VALUE;
+      }
+
       if (sample_count_error != GL_NO_ERROR) {
          _mesa_error(ctx, sample_count_error, "%s(samples)", func);
          return;
diff --git a/src/mesa/main/format_info.py b/src/mesa/main/format_info.py
index 3bae57e54ed..839d4073c61 100644
--- a/src/mesa/main/format_info.py
+++ b/src/mesa/main/format_info.py
@@ -98,14 +98,6 @@ def get_gl_data_type(fmat):
    else:
       assert False
 
-def get_mesa_layout(fmat):
-   if fmat.layout == 'array':
-      return 'MESA_FORMAT_LAYOUT_ARRAY'
-   elif fmat.layout == 'packed':
-      return 'MESA_FORMAT_LAYOUT_PACKED'
-   else:
-      return 'MESA_FORMAT_LAYOUT_OTHER'
-
 def get_channel_bits(fmat, chan_name):
    if fmat.is_compressed():
       # These values are pretty-much bogus, but OpenGL requires that we
@@ -179,7 +171,7 @@ for fmat in formats:
    print '   {'
    print '      {0},'.format(fmat.name)
    print '      "{0}",'.format(fmat.name)
-   print '      {0},'.format(get_mesa_layout(fmat))
+   print '      {0},'.format('MESA_FORMAT_LAYOUT_' + fmat.layout.upper())
    print '      {0},'.format(get_gl_base_format(fmat))
    print '      {0},'.format(get_gl_data_type(fmat))
 
@@ -188,6 +180,8 @@ for fmat in formats:
    bits = [ get_channel_bits(fmat, name) for name in ['l', 'i', 'z', 's']]
    print '      {0},'.format(', '.join(map(str, bits)))
 
+   print '      {0:d},'.format(fmat.colorspace == 'srgb')
+
    print '      {0}, {1}, {2},'.format(fmat.block_width, fmat.block_height,
                                        int(fmat.block_size() / 8))
 
diff --git a/src/mesa/main/format_utils.c b/src/mesa/main/format_utils.c
index 810bb163400..5fdabd5b97f 100644
--- a/src/mesa/main/format_utils.c
+++ b/src/mesa/main/format_utils.c
@@ -602,7 +602,7 @@ _mesa_format_to_array(mesa_format format, GLenum *type, int *num_components,
 
    *normalized = !_mesa_is_format_integer(format);
 
-   _mesa_format_to_type_and_comps(format, type, &format_components);
+   _mesa_uncompressed_format_to_type_and_comps(format, type, &format_components);
 
    switch (_mesa_get_format_layout(format)) {
    case MESA_FORMAT_LAYOUT_ARRAY:
diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index d7b2bae59e7..8dd07d88f40 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -65,6 +65,8 @@ struct gl_format_info
    GLubyte DepthBits;
    GLubyte StencilBits;
 
+   bool IsSRGBFormat;
+
    /**
     * To describe compressed formats.  If not compressed, Width=Height=1.
     */
@@ -81,6 +83,7 @@ static const struct gl_format_info *
 _mesa_get_format_info(mesa_format format)
 {
    const struct gl_format_info *info = &format_info[format];
+   STATIC_ASSERT(ARRAY_SIZE(format_info) == MESA_FORMAT_COUNT);
    assert(info->Name == format);
    return info;
 }
@@ -188,6 +191,12 @@ _mesa_get_format_max_bits(mesa_format format)
  * The return value will be one of:
  *    MESA_FORMAT_LAYOUT_ARRAY
  *    MESA_FORMAT_LAYOUT_PACKED
+ *    MESA_FORMAT_LAYOUT_S3TC
+ *    MESA_FORMAT_LAYOUT_RGTC
+ *    MESA_FORMAT_LAYOUT_FXT1
+ *    MESA_FORMAT_LAYOUT_ETC1
+ *    MESA_FORMAT_LAYOUT_ETC2
+ *    MESA_FORMAT_LAYOUT_BPTC
  *    MESA_FORMAT_LAYOUT_OTHER
  */
 extern enum mesa_format_layout
@@ -562,30 +571,8 @@ _mesa_is_format_color_format(mesa_format format)
 GLenum
 _mesa_get_format_color_encoding(mesa_format format)
 {
-   /* XXX this info should be encoded in gl_format_info */
-   switch (format) {
-   case MESA_FORMAT_BGR_SRGB8:
-   case MESA_FORMAT_A8B8G8R8_SRGB:
-   case MESA_FORMAT_B8G8R8A8_SRGB:
-   case MESA_FORMAT_A8R8G8B8_SRGB:
-   case MESA_FORMAT_R8G8B8A8_SRGB:
-   case MESA_FORMAT_L_SRGB8:
-   case MESA_FORMAT_L8A8_SRGB:
-   case MESA_FORMAT_A8L8_SRGB:
-   case MESA_FORMAT_SRGB_DXT1:
-   case MESA_FORMAT_SRGBA_DXT1:
-   case MESA_FORMAT_SRGBA_DXT3:
-   case MESA_FORMAT_SRGBA_DXT5:
-   case MESA_FORMAT_R8G8B8X8_SRGB:
-   case MESA_FORMAT_ETC2_SRGB8:
-   case MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC:
-   case MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1:
-   case MESA_FORMAT_B8G8R8X8_SRGB:
-   case MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM:
-      return GL_SRGB;
-   default:
-      return GL_LINEAR;
-   }
+   const struct gl_format_info *info = _mesa_get_format_info(format);
+   return info->IsSRGBFormat ? GL_SRGB : GL_LINEAR;
 }
 
 
@@ -878,124 +865,13 @@ _mesa_format_row_stride(mesa_format format, GLsizei width)
 }
 
 
-/**
- * Debug/test: check that all formats are handled in the
- * _mesa_format_to_type_and_comps() function.  When new pixel formats
- * are added to Mesa, that function needs to be updated.
- * This is a no-op after the first call.
- */
-static void
-check_format_to_type_and_comps(void)
-{
-   mesa_format f;
-
-   for (f = MESA_FORMAT_NONE + 1; f < MESA_FORMAT_COUNT; f++) {
-      GLenum datatype = 0;
-      GLuint comps = 0;
-      /* This function will emit a problem/warning if the format is
-       * not handled.
-       */
-      _mesa_format_to_type_and_comps(f, &datatype, &comps);
-   }
-}
 
 /**
- * Do sanity checking of the format info table.
+ * Return datatype and number of components per texel for the given
+ * uncompressed mesa_format. Only used for mipmap generation code.
  */
 void
-_mesa_test_formats(void)
-{
-   GLuint i;
-
-   STATIC_ASSERT(ARRAY_SIZE(format_info) == MESA_FORMAT_COUNT);
-
-   for (i = 0; i < MESA_FORMAT_COUNT; i++) {
-      const struct gl_format_info *info = _mesa_get_format_info(i);
-      assert(info);
-
-      assert(info->Name == i);
-
-      if (info->Name == MESA_FORMAT_NONE)
-         continue;
-
-      if (info->BlockWidth == 1 && info->BlockHeight == 1) {
-         if (info->RedBits > 0) {
-            GLuint t = info->RedBits + info->GreenBits
-               + info->BlueBits + info->AlphaBits;
-            assert(t / 8 <= info->BytesPerBlock);
-            (void) t;
-         }
-      }
-
-      assert(info->DataType == GL_UNSIGNED_NORMALIZED ||
-             info->DataType == GL_SIGNED_NORMALIZED ||
-             info->DataType == GL_UNSIGNED_INT ||
-             info->DataType == GL_INT ||
-             info->DataType == GL_FLOAT ||
-             /* Z32_FLOAT_X24S8 has DataType of GL_NONE */
-             info->DataType == GL_NONE);
-
-      if (info->BaseFormat == GL_RGB) {
-         assert(info->RedBits > 0);
-         assert(info->GreenBits > 0);
-         assert(info->BlueBits > 0);
-         assert(info->AlphaBits == 0);
-         assert(info->LuminanceBits == 0);
-         assert(info->IntensityBits == 0);
-      }
-      else if (info->BaseFormat == GL_RGBA) {
-         assert(info->RedBits > 0);
-         assert(info->GreenBits > 0);
-         assert(info->BlueBits > 0);
-         assert(info->AlphaBits > 0);
-         assert(info->LuminanceBits == 0);
-         assert(info->IntensityBits == 0);
-      }
-      else if (info->BaseFormat == GL_RG) {
-         assert(info->RedBits > 0);
-         assert(info->GreenBits > 0);
-         assert(info->BlueBits == 0);
-         assert(info->AlphaBits == 0);
-         assert(info->LuminanceBits == 0);
-         assert(info->IntensityBits == 0);
-      }
-      else if (info->BaseFormat == GL_RED) {
-         assert(info->RedBits > 0);
-         assert(info->GreenBits == 0);
-         assert(info->BlueBits == 0);
-         assert(info->AlphaBits == 0);
-         assert(info->LuminanceBits == 0);
-         assert(info->IntensityBits == 0);
-      }
-      else if (info->BaseFormat == GL_LUMINANCE) {
-         assert(info->RedBits == 0);
-         assert(info->GreenBits == 0);
-         assert(info->BlueBits == 0);
-         assert(info->AlphaBits == 0);
-         assert(info->LuminanceBits > 0);
-         assert(info->IntensityBits == 0);
-      }
-      else if (info->BaseFormat == GL_INTENSITY) {
-         assert(info->RedBits == 0);
-         assert(info->GreenBits == 0);
-         assert(info->BlueBits == 0);
-         assert(info->AlphaBits == 0);
-         assert(info->LuminanceBits == 0);
-         assert(info->IntensityBits > 0);
-      }
-   }
-
-   check_format_to_type_and_comps();
-}
-
-
-
-/**
- * Return datatype and number of components per texel for the given mesa_format.
- * Only used for mipmap generation code.
- */
-void
-_mesa_format_to_type_and_comps(mesa_format format,
+_mesa_uncompressed_format_to_type_and_comps(mesa_format format,
                                GLenum *datatype, GLuint *comps)
 {
    switch (format) {
@@ -1229,44 +1105,6 @@ _mesa_format_to_type_and_comps(mesa_format format,
       *comps = 2;
       return;
 
-   case MESA_FORMAT_RGB_FXT1:
-   case MESA_FORMAT_RGBA_FXT1:
-   case MESA_FORMAT_RGB_DXT1:
-   case MESA_FORMAT_RGBA_DXT1:
-   case MESA_FORMAT_RGBA_DXT3:
-   case MESA_FORMAT_RGBA_DXT5:
-   case MESA_FORMAT_SRGB_DXT1:
-   case MESA_FORMAT_SRGBA_DXT1:
-   case MESA_FORMAT_SRGBA_DXT3:
-   case MESA_FORMAT_SRGBA_DXT5:
-   case MESA_FORMAT_R_RGTC1_UNORM:
-   case MESA_FORMAT_R_RGTC1_SNORM:
-   case MESA_FORMAT_RG_RGTC2_UNORM:
-   case MESA_FORMAT_RG_RGTC2_SNORM:
-   case MESA_FORMAT_L_LATC1_UNORM:
-   case MESA_FORMAT_L_LATC1_SNORM:
-   case MESA_FORMAT_LA_LATC2_UNORM:
-   case MESA_FORMAT_LA_LATC2_SNORM:
-   case MESA_FORMAT_ETC1_RGB8:
-   case MESA_FORMAT_ETC2_RGB8:
-   case MESA_FORMAT_ETC2_SRGB8:
-   case MESA_FORMAT_ETC2_RGBA8_EAC:
-   case MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC:
-   case MESA_FORMAT_ETC2_R11_EAC:
-   case MESA_FORMAT_ETC2_RG11_EAC:
-   case MESA_FORMAT_ETC2_SIGNED_R11_EAC:
-   case MESA_FORMAT_ETC2_SIGNED_RG11_EAC:
-   case MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1:
-   case MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1:
-   case MESA_FORMAT_BPTC_RGBA_UNORM:
-   case MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM:
-   case MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT:
-   case MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT:
-      /* XXX generate error instead? */
-      *datatype = GL_UNSIGNED_BYTE;
-      *comps = 0;
-      return;
-
    case MESA_FORMAT_RGBA_FLOAT32:
       *datatype = GL_FLOAT;
       *comps = 4;
@@ -1564,14 +1402,12 @@ _mesa_format_to_type_and_comps(mesa_format format,
    case MESA_FORMAT_COUNT:
       assert(0);
       return;
-
-   case MESA_FORMAT_NONE:
-   /* For debug builds, warn if any formats are not handled */
-#ifdef DEBUG
    default:
-#endif
-      _mesa_problem(NULL, "bad format %s in _mesa_format_to_type_and_comps",
+      /* Warn if any formats are not handled */
+      _mesa_problem(NULL, "bad format %s in _mesa_uncompressed_format_to_type_and_comps",
                     _mesa_get_format_name(format));
+      assert(format == MESA_FORMAT_NONE ||
+             _mesa_is_format_compressed(format));
       *datatype = 0;
       *comps = 1;
    }
@@ -1584,20 +1420,26 @@ _mesa_format_to_type_and_comps(mesa_format format,
  * \param format  the user-specified image format
  * \param type  the user-specified image datatype
  * \param swapBytes  typically the current pixel pack/unpack byteswap state
+ * \param[out] error GL_NO_ERROR if format is an expected input.
+ *                   GL_INVALID_ENUM if format is an unexpected input.
  * \return GL_TRUE if the formats match, GL_FALSE otherwise.
  */
 GLboolean
 _mesa_format_matches_format_and_type(mesa_format mesa_format,
 				     GLenum format, GLenum type,
-                                     GLboolean swapBytes)
+				     GLboolean swapBytes, GLenum *error)
 {
    const GLboolean littleEndian = _mesa_little_endian();
+   if (error)
+      *error = GL_NO_ERROR;
 
    /* Note: When reading a GL format/type combination, the format lists channel
     * assignments from most significant channel in the type to least
     * significant.  A type with _REV indicates that the assignments are
     * swapped, so they are listed from least significant to most significant.
     *
+    * Compressed formats will fall through and return GL_FALSE.
+    *
     * For sanity, please keep this switch statement ordered the same as the
     * enums in formats.h.
     */
@@ -1858,26 +1700,6 @@ _mesa_format_matches_format_and_type(mesa_format mesa_format,
    case MESA_FORMAT_S_UINT8:
       return format == GL_STENCIL_INDEX && type == GL_UNSIGNED_BYTE;
 
-   case MESA_FORMAT_SRGB_DXT1:
-   case MESA_FORMAT_SRGBA_DXT1:
-   case MESA_FORMAT_SRGBA_DXT3:
-   case MESA_FORMAT_SRGBA_DXT5:
-      return GL_FALSE;
-
-   case MESA_FORMAT_RGB_FXT1:
-   case MESA_FORMAT_RGBA_FXT1:
-   case MESA_FORMAT_RGB_DXT1:
-   case MESA_FORMAT_RGBA_DXT1:
-   case MESA_FORMAT_RGBA_DXT3:
-   case MESA_FORMAT_RGBA_DXT5:
-      return GL_FALSE;
-
-   case MESA_FORMAT_BPTC_RGBA_UNORM:
-   case MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM:
-   case MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT:
-   case MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT:
-      return GL_FALSE;
-
    case MESA_FORMAT_RGBA_FLOAT32:
       return format == GL_RGBA && type == GL_FLOAT && !swapBytes;
    case MESA_FORMAT_RGBA_FLOAT16:
@@ -2074,31 +1896,6 @@ _mesa_format_matches_format_and_type(mesa_format mesa_format,
       return format == GL_RGBA && type == GL_UNSIGNED_SHORT &&
              !swapBytes;
 
-   case MESA_FORMAT_R_RGTC1_UNORM:
-   case MESA_FORMAT_R_RGTC1_SNORM:
-   case MESA_FORMAT_RG_RGTC2_UNORM:
-   case MESA_FORMAT_RG_RGTC2_SNORM:
-      return GL_FALSE;
-
-   case MESA_FORMAT_L_LATC1_UNORM:
-   case MESA_FORMAT_L_LATC1_SNORM:
-   case MESA_FORMAT_LA_LATC2_UNORM:
-   case MESA_FORMAT_LA_LATC2_SNORM:
-      return GL_FALSE;
-
-   case MESA_FORMAT_ETC1_RGB8:
-   case MESA_FORMAT_ETC2_RGB8:
-   case MESA_FORMAT_ETC2_SRGB8:
-   case MESA_FORMAT_ETC2_RGBA8_EAC:
-   case MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC:
-   case MESA_FORMAT_ETC2_R11_EAC:
-   case MESA_FORMAT_ETC2_RG11_EAC:
-   case MESA_FORMAT_ETC2_SIGNED_R11_EAC:
-   case MESA_FORMAT_ETC2_SIGNED_RG11_EAC:
-   case MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1:
-   case MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1:
-      return GL_FALSE;
-
    case MESA_FORMAT_A_SNORM8:
       return format == GL_ALPHA && type == GL_BYTE;
    case MESA_FORMAT_L_SNORM8:
@@ -2181,8 +1978,11 @@ _mesa_format_matches_format_and_type(mesa_format mesa_format,
    case MESA_FORMAT_B8G8R8X8_SRGB:
    case MESA_FORMAT_X8R8G8B8_SRGB:
       return GL_FALSE;
+   default:
+      assert(_mesa_is_format_compressed(format));
+      if (error)
+         *error = GL_INVALID_ENUM;
    }
-
    return GL_FALSE;
 }
 
diff --git a/src/mesa/main/formats.h b/src/mesa/main/formats.h
index d938e6ad513..4936fa0d482 100644
--- a/src/mesa/main/formats.h
+++ b/src/mesa/main/formats.h
@@ -64,6 +64,12 @@ extern "C" {
 enum mesa_format_layout {
    MESA_FORMAT_LAYOUT_ARRAY,
    MESA_FORMAT_LAYOUT_PACKED,
+   MESA_FORMAT_LAYOUT_S3TC,
+   MESA_FORMAT_LAYOUT_RGTC,
+   MESA_FORMAT_LAYOUT_FXT1,
+   MESA_FORMAT_LAYOUT_ETC1,
+   MESA_FORMAT_LAYOUT_ETC2,
+   MESA_FORMAT_LAYOUT_BPTC,
    MESA_FORMAT_LAYOUT_OTHER,
 };
 
@@ -659,7 +665,7 @@ extern GLint
 _mesa_format_row_stride(mesa_format format, GLsizei width);
 
 extern void
-_mesa_format_to_type_and_comps(mesa_format format,
+_mesa_uncompressed_format_to_type_and_comps(mesa_format format,
                                GLenum *datatype, GLuint *comps);
 
 extern void
@@ -680,7 +686,7 @@ _mesa_format_has_color_component(mesa_format format, int component);
 GLboolean
 _mesa_format_matches_format_and_type(mesa_format mesa_format,
 				     GLenum format, GLenum type,
-                                     GLboolean swapBytes);
+				     GLboolean swapBytes, GLenum *error);
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 7dc92f10100..517c391955d 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -806,7 +806,7 @@ descriptor=[
   [ "MAX_VERTEX_ATTRIB_BINDINGS", "CONTEXT_ENUM(Const.MaxVertexAttribBindings), NO_EXTRA" ],
 
 # GL_ARB_shader_image_load_store
-  [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedImageUnitsAndFragmentOutputs), extra_ARB_shader_image_load_store" ],
+  [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store" ],
   [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store" ],
   [ "MAX_GEOMETRY_IMAGE_UNIFORMS", "CONTEXT_INT(Const.Program[MESA_SHADER_GEOMETRY].MaxImageUniforms), extra_ARB_shader_image_load_store_and_geometry_shader"],
 
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 1e22f930092..2bf5902fba4 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -1886,7 +1886,7 @@ generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
    GLenum datatype;
    GLuint comps;
 
-   _mesa_format_to_type_and_comps(srcImage->TexFormat, &datatype, &comps);
+   _mesa_uncompressed_format_to_type_and_comps(srcImage->TexFormat, &datatype, &comps);
 
    for (level = texObj->BaseLevel; level < maxLevel; level++) {
       /* generate image[level+1] from image[level] */
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 83f3717754d..4883cbc93d5 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -94,7 +94,10 @@ struct vbo_context;
 #define PRIM_OUTSIDE_BEGIN_END   (PRIM_MAX + 1)
 #define PRIM_UNKNOWN             (PRIM_MAX + 2)
 
-
+#define VARYING_SLOT_MAX	(VARYING_SLOT_VAR0 + MAX_VARYING)
+#define VARYING_SLOT_PATCH0	(VARYING_SLOT_MAX)
+#define VARYING_SLOT_TESS_MAX	(VARYING_SLOT_PATCH0 + MAX_VARYING)
+#define FRAG_RESULT_MAX		(FRAG_RESULT_DATA0 + MAX_DRAW_BUFFERS)
 
 /**
  * Determine if the given gl_varying_slot appears in the fragment shader.
@@ -117,7 +120,6 @@ _mesa_varying_slot_in_fs(gl_varying_slot slot)
    }
 }
 
-
 /**
  * Indexes for all renderbuffers
  */
@@ -3571,7 +3573,7 @@ struct gl_constants
 
    /* GL_ARB_shader_image_load_store */
    GLuint MaxImageUnits;
-   GLuint MaxCombinedImageUnitsAndFragmentOutputs;
+   GLuint MaxCombinedShaderOutputResources;
    GLuint MaxImageSamples;
    GLuint MaxCombinedImageUniforms;
 
@@ -3656,6 +3658,7 @@ struct gl_extensions
    GLboolean ARB_shader_atomic_counters;
    GLboolean ARB_shader_bit_encoding;
    GLboolean ARB_shader_image_load_store;
+   GLboolean ARB_shader_image_size;
    GLboolean ARB_shader_precision;
    GLboolean ARB_shader_stencil_export;
    GLboolean ARB_shader_storage_buffer_object;
@@ -4073,10 +4076,16 @@ struct gl_image_unit
    GLboolean _Valid;
 
    /**
+    * Layer of the texture object bound to this unit as specified by the
+    * application.
+    */
+   GLuint Layer;
+
+   /**
     * Layer of the texture object bound to this unit, or zero if the
     * whole level is bound.
     */
-   GLuint Layer;
+   GLuint _Layer;
 
    /**
     * Access allowed to this texture image.  Either \c GL_READ_ONLY,
diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c
index 09e6154f7ec..e7783ea5374 100644
--- a/src/mesa/main/multisample.c
+++ b/src/mesa/main/multisample.c
@@ -150,15 +150,6 @@ GLenum
 _mesa_check_sample_count(struct gl_context *ctx, GLenum target,
                          GLenum internalFormat, GLsizei samples)
 {
-   /* Section 2.5 (GL Errors) of OpenGL 3.0 specification, page 16:
-    *
-    * "If a negative number is provided where an argument of type sizei or
-    * sizeiptr is specified, the error INVALID VALUE is generated."
-    */
-   if (samples < 0) {
-      return GL_INVALID_VALUE;
-   }
-
    /* Section 4.4 (Framebuffer objects), page 198 of the OpenGL ES 3.0.0
     * specification says:
     *
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index d826ecfc3d5..12779446c6d 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -201,7 +201,7 @@ readpixels_can_use_memcpy(const struct gl_context *ctx, GLenum format, GLenum ty
 
    /* The Mesa format must match the input format and type. */
    if (!_mesa_format_matches_format_and_type(rb->Format, format, type,
-                                             packing->SwapBytes)) {
+                                             packing->SwapBytes, NULL)) {
       return GL_FALSE;
    }
 
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index f9a7d130f9c..b227c17548e 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -1995,55 +1995,6 @@ _mesa_use_shader_program(struct gl_context *ctx, GLenum type,
 }
 
 
-static GLuint
-_mesa_create_shader_program(struct gl_context* ctx, GLboolean separate,
-                            GLenum type, GLsizei count,
-                            const GLchar* const *strings)
-{
-   const GLuint shader = create_shader(ctx, type);
-   GLuint program = 0;
-
-   if (shader) {
-      _mesa_ShaderSource(shader, count, strings, NULL);
-
-      compile_shader(ctx, shader);
-
-      program = create_shader_program(ctx);
-      if (program) {
-	 struct gl_shader_program *shProg;
-	 struct gl_shader *sh;
-	 GLint compiled = GL_FALSE;
-
-	 shProg = _mesa_lookup_shader_program(ctx, program);
-	 sh = _mesa_lookup_shader(ctx, shader);
-
-	 shProg->SeparateShader = separate;
-
-	 get_shaderiv(ctx, shader, GL_COMPILE_STATUS, &compiled);
-	 if (compiled) {
-	    attach_shader(ctx, program, shader);
-	    link_program(ctx, program);
-	    detach_shader(ctx, program, shader);
-
-#if 0
-	    /* Possibly... */
-	    if (active-user-defined-varyings-in-linked-program) {
-	       append-error-to-info-log;
-	       shProg->LinkStatus = GL_FALSE;
-	    }
-#endif
-	 }
-         if (sh->InfoLog)
-            ralloc_strcat(&shProg->InfoLog, sh->InfoLog);
-      }
-
-      delete_shader(ctx, shader);
-   }
-
-   return program;
-}
-
-
 /**
  * Copy program-specific data generated by linking from the gl_shader_program
  * object to a specific gl_program object.
@@ -2111,7 +2062,56 @@ _mesa_CreateShaderProgramv(GLenum type, GLsizei count,
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   return _mesa_create_shader_program(ctx, GL_TRUE, type, count, strings);
+   const GLuint shader = create_shader(ctx, type);
+   GLuint program = 0;
+
+   /*
+    * According to OpenGL 4.5 and OpenGL ES 3.1 standards, section 7.3:
+    * GL_INVALID_VALUE should be generated if count < 0
+    */
+   if (count < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "glCreateShaderProgram (count < 0)");
+      return program;
+   }
+
+   if (shader) {
+      _mesa_ShaderSource(shader, count, strings, NULL);
+
+      compile_shader(ctx, shader);
+
+      program = create_shader_program(ctx);
+      if (program) {
+	 struct gl_shader_program *shProg;
+	 struct gl_shader *sh;
+	 GLint compiled = GL_FALSE;
+
+	 shProg = _mesa_lookup_shader_program(ctx, program);
+	 sh = _mesa_lookup_shader(ctx, shader);
+
+	 shProg->SeparateShader = GL_TRUE;
+
+	 get_shaderiv(ctx, shader, GL_COMPILE_STATUS, &compiled);
+	 if (compiled) {
+	    attach_shader(ctx, program, shader);
+	    link_program(ctx, program);
+	    detach_shader(ctx, program, shader);
+
+#if 0
+	    /* Possibly... */
+	    if (active-user-defined-varyings-in-linked-program) {
+	       append-error-to-info-log;
+	       shProg->LinkStatus = GL_FALSE;
+	    }
+#endif
+	 }
+         if (sh->InfoLog)
+            ralloc_strcat(&shProg->InfoLog, sh->InfoLog);
+      }
+
+      delete_shader(ctx, shader);
+   }
+
+   return program;
 }
 
 
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index a348cdb0405..c4bba842ca7 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -331,17 +331,88 @@ get_image_format_class(mesa_format format)
    }
 }
 
+/**
+ * Return whether an image format should be supported based on the current API
+ * version of the context.
+ */
+static bool
+is_image_format_supported(const struct gl_context *ctx, GLenum format)
+{
+   switch (format) {
+   /* Formats supported on both desktop and ES GL, c.f. table 8.27 of the
+    * OpenGL ES 3.1 specification.
+    */
+   case GL_RGBA32F:
+   case GL_RGBA16F:
+   case GL_R32F:
+   case GL_RGBA32UI:
+   case GL_RGBA16UI:
+   case GL_RGBA8UI:
+   case GL_R32UI:
+   case GL_RGBA32I:
+   case GL_RGBA16I:
+   case GL_RGBA8I:
+   case GL_R32I:
+   case GL_RGBA8:
+   case GL_RGBA8_SNORM:
+      return true;
+
+   /* Formats supported on unextended desktop GL and the original
+    * ARB_shader_image_load_store extension, c.f. table 3.21 of the OpenGL 4.2
+    * specification.
+    */
+   case GL_RG32F:
+   case GL_RG16F:
+   case GL_R11F_G11F_B10F:
+   case GL_R16F:
+   case GL_RGB10_A2UI:
+   case GL_RG32UI:
+   case GL_RG16UI:
+   case GL_RG8UI:
+   case GL_R16UI:
+   case GL_R8UI:
+   case GL_RG32I:
+   case GL_RG16I:
+   case GL_RG8I:
+   case GL_R16I:
+   case GL_R8I:
+   case GL_RGBA16:
+   case GL_RGB10_A2:
+   case GL_RG16:
+   case GL_RG8:
+   case GL_R16:
+   case GL_R8:
+   case GL_RGBA16_SNORM:
+   case GL_RG16_SNORM:
+   case GL_RG8_SNORM:
+   case GL_R16_SNORM:
+   case GL_R8_SNORM:
+      return _mesa_is_desktop_gl(ctx);
+
+   default:
+      return false;
+   }
+}
+
+struct gl_image_unit
+_mesa_default_image_unit(struct gl_context *ctx)
+{
+   const GLenum format = _mesa_is_desktop_gl(ctx) ? GL_R8 : GL_R32UI;
+   const struct gl_image_unit u = {
+      .Access = GL_READ_ONLY,
+      .Format = format,
+      ._ActualFormat = _mesa_get_shader_image_format(format)
+   };
+   return u;
+}
+
 void
 _mesa_init_image_units(struct gl_context *ctx)
 {
    unsigned i;
 
-   for (i = 0; i < ARRAY_SIZE(ctx->ImageUnits); ++i) {
-      struct gl_image_unit *u = &ctx->ImageUnits[i];
-      u->Access = GL_READ_ONLY;
-      u->Format = GL_R8;
-      u->_ActualFormat = _mesa_get_shader_image_format(u->Format);
-   }
+   for (i = 0; i < ARRAY_SIZE(ctx->ImageUnits); ++i)
+      ctx->ImageUnits[i] = _mesa_default_image_unit(ctx);
 }
 
 static GLboolean
@@ -362,7 +433,7 @@ validate_image_unit(struct gl_context *ctx, struct gl_image_unit *u)
       return GL_FALSE;
 
    if (_mesa_tex_target_is_layered(t->Target) &&
-       u->Layer >= _mesa_get_texture_layers(t, u->Level))
+       u->_Layer >= _mesa_get_texture_layers(t, u->Level))
       return GL_FALSE;
 
    if (t->Target == GL_TEXTURE_BUFFER) {
@@ -370,7 +441,7 @@ validate_image_unit(struct gl_context *ctx, struct gl_image_unit *u)
 
    } else {
       struct gl_texture_image *img = (t->Target == GL_TEXTURE_CUBE_MAP ?
-                                      t->Image[u->Layer][u->Level] :
+                                      t->Image[u->_Layer][u->Level] :
                                       t->Image[0][u->Level]);
 
       if (!img || img->Border || img->NumSamples > ctx->Const.MaxImageSamples)
@@ -442,7 +513,7 @@ validate_bind_image_texture(struct gl_context *ctx, GLuint unit,
       return GL_FALSE;
    }
 
-   if (!_mesa_get_shader_image_format(format)) {
+   if (!is_image_format_supported(ctx, format)) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glBindImageTexture(format)");
       return GL_FALSE;
    }
@@ -475,6 +546,18 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
          return;
       }
 
+      /* From section 8.22 "Texture Image Loads and Stores" of the OpenGL ES
+       * 3.1 spec:
+       *
+       * "An INVALID_OPERATION error is generated if texture is not the name
+       *  of an immutable texture object."
+       */
+      if (_mesa_is_gles(ctx) && !t->Immutable) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glBindImageTexture(!immutable)");
+         return;
+      }
+
       _mesa_reference_texobj(&u->TexObj, t);
    } else {
       _mesa_reference_texobj(&u->TexObj, NULL);
@@ -488,7 +571,8 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
 
    if (u->TexObj && _mesa_tex_target_is_layered(u->TexObj->Target)) {
       u->Layered = layered;
-      u->Layer = (layered ? 0 : layer);
+      u->Layer = layer;
+      u->_Layer = (u->Layered ? 0 : u->Layer);
    } else {
       u->Layered = GL_FALSE;
       u->Layer = 0;
@@ -599,7 +683,7 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures)
             tex_format = image->InternalFormat;
          }
 
-         if (_mesa_get_shader_image_format(tex_format) == MESA_FORMAT_NONE) {
+         if (!is_image_format_supported(ctx, tex_format)) {
             /* The ARB_multi_bind spec says:
              *
              *   "An INVALID_OPERATION error is generated if the internal
@@ -619,7 +703,7 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures)
          _mesa_reference_texobj(&u->TexObj, texObj);
          u->Level = 0;
          u->Layered = _mesa_tex_target_is_layered(texObj->Target);
-         u->Layer = 0;
+         u->_Layer = u->Layer = 0;
          u->Access = GL_READ_WRITE;
          u->Format = tex_format;
          u->_ActualFormat = _mesa_get_shader_image_format(tex_format);
@@ -629,7 +713,7 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures)
          _mesa_reference_texobj(&u->TexObj, NULL);
          u->Level = 0;
          u->Layered = GL_FALSE;
-         u->Layer = 0;
+         u->_Layer = u->Layer = 0;
          u->Access = GL_READ_ONLY;
          u->Format = GL_R8;
          u->_ActualFormat = MESA_FORMAT_R_UNORM8;
@@ -653,3 +737,43 @@ _mesa_MemoryBarrier(GLbitfield barriers)
    if (ctx->Driver.MemoryBarrier)
       ctx->Driver.MemoryBarrier(ctx, barriers);
 }
+
+void GLAPIENTRY
+_mesa_MemoryBarrierByRegion(GLbitfield barriers)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   GLbitfield all_allowed_bits = GL_ATOMIC_COUNTER_BARRIER_BIT |
+                                 GL_FRAMEBUFFER_BARRIER_BIT |
+                                 GL_SHADER_IMAGE_ACCESS_BARRIER_BIT |
+                                 GL_SHADER_STORAGE_BARRIER_BIT |
+                                 GL_TEXTURE_FETCH_BARRIER_BIT |
+                                 GL_UNIFORM_BARRIER_BIT;
+
+   if (ctx->Driver.MemoryBarrier) {
+      /* From section 7.11.2 of the OpenGL ES 3.1 specification:
+       *
+       *    "When barriers is ALL_BARRIER_BITS, shader memory accesses will be
+       *     synchronized relative to all these barrier bits, but not to other
+       *     barrier bits specific to MemoryBarrier."
+       *
+       * That is, if barriers is the special value GL_ALL_BARRIER_BITS, then all
+       * barriers allowed by glMemoryBarrierByRegion should be activated."
+       */
+      if (barriers == GL_ALL_BARRIER_BITS)
+         return ctx->Driver.MemoryBarrier(ctx, all_allowed_bits);
+
+      /* From section 7.11.2 of the OpenGL ES 3.1 specification:
+       *
+       *    "An INVALID_VALUE error is generated if barriers is not the special
+       *     value ALL_BARRIER_BITS, and has any bits set other than those
+       *     described above."
+       */
+      if ((barriers & ~all_allowed_bits) != 0) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glMemoryBarrierByRegion(unsupported barrier bit");
+      }
+
+      ctx->Driver.MemoryBarrier(ctx, barriers);
+   }
+}
diff --git a/src/mesa/main/shaderimage.h b/src/mesa/main/shaderimage.h
index 33d8a1eff55..bbe088a2459 100644
--- a/src/mesa/main/shaderimage.h
+++ b/src/mesa/main/shaderimage.h
@@ -43,6 +43,12 @@ mesa_format
 _mesa_get_shader_image_format(GLenum format);
 
 /**
+ * Get a single image unit struct with the default state.
+ */
+struct gl_image_unit
+_mesa_default_image_unit(struct gl_context *ctx);
+
+/**
  * Initialize a context's shader image units to the default state.
  */
 void
@@ -68,6 +74,9 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures);
 void GLAPIENTRY
 _mesa_MemoryBarrier(GLbitfield barriers);
 
+void GLAPIENTRY
+_mesa_MemoryBarrierByRegion(GLbitfield barriers);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/tests/Makefile.am b/src/mesa/main/tests/Makefile.am
index 251474d5c25..9467f3ba8c6 100644
--- a/src/mesa/main/tests/Makefile.am
+++ b/src/mesa/main/tests/Makefile.am
@@ -27,6 +27,7 @@ AM_CPPFLAGS += -DHAVE_SHARED_GLAPI
 
 main_test_SOURCES +=			\
 	dispatch_sanity.cpp		\
+	mesa_formats.cpp			\
 	program_state_string.cpp
 
 main_test_LDADD += \
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index af89d2c1cfb..59107eb67b1 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -851,6 +851,9 @@ const struct function common_desktop_functions_possible[] = {
 // { "glTextureStorage2DMultisampleEXT", 43, -1 },      // XXX: Add to xml
 // { "glTextureStorage3DMultisampleEXT", 43, -1 },      // XXX: Add to xml
 
+/* GL 4.5 */
+   { "glMemoryBarrierByRegion", 45, -1 },
+
    /* GL_ARB_internalformat_query */
    { "glGetInternalformativ", 30, -1 },
 
@@ -1739,6 +1742,9 @@ const struct function gl_core_functions_possible[] = {
 // { "glTextureStorage2DMultisampleEXT", 43, -1 },      // XXX: Add to xml
 // { "glTextureStorage3DMultisampleEXT", 43, -1 },      // XXX: Add to xml
 
+/* GL 4.5 */
+   { "glMemoryBarrierByRegion", 45, -1 },
+
    /* GL_ARB_direct_state_access */
    { "glCreateTransformFeedbacks", 45, -1 },
    { "glTransformFeedbackBufferBase", 45, -1 },
@@ -2461,8 +2467,7 @@ const struct function gles31_functions_possible[] = {
    { "glGetBooleani_v", 31, -1 },
    { "glMemoryBarrier", 31, -1 },
 
-   // FINISHME: This function has not been implemented yet.
-   // { "glMemoryBarrierByRegion", 31, -1 },
+   { "glMemoryBarrierByRegion", 31, -1 },
 
    { "glTexStorage2DMultisample", 31, -1 },
    { "glGetMultisamplefv", 31, -1 },
diff --git a/src/mesa/main/tests/mesa_formats.cpp b/src/mesa/main/tests/mesa_formats.cpp
new file mode 100644
index 00000000000..5356cd919a8
--- /dev/null
+++ b/src/mesa/main/tests/mesa_formats.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \name mesa_formats.cpp
+ *
+ * Verify that all mesa formats are handled in certain functions and that
+ * the format info table is sane.
+ *
+ */
+
+#include <gtest/gtest.h>
+
+#include "main/formats.h"
+#include "main/glformats.h"
+
+/**
+ * Debug/test: check that all uncompressed formats are handled in the
+ * _mesa_uncompressed_format_to_type_and_comps() function. When new pixel
+ * formats are added to Mesa, that function needs to be updated.
+ */
+TEST(MesaFormatsTest, FormatTypeAndComps)
+{
+   for (int fi = MESA_FORMAT_NONE + 1; fi < MESA_FORMAT_COUNT; ++fi) {
+      mesa_format f = (mesa_format) fi;
+      SCOPED_TRACE(_mesa_get_format_name(f));
+
+      /* This function will emit a problem/warning if the format is
+       * not handled.
+       */
+      if (!_mesa_is_format_compressed(f)) {
+         GLenum datatype = 0;
+         GLenum error = 0;
+         GLuint comps = 0;
+
+         /* If the datatype is zero, the format was not handled */
+         _mesa_uncompressed_format_to_type_and_comps(f, &datatype, &comps);
+         EXPECT_NE(datatype, (GLenum)0);
+
+         /* If the error isn't NO_ERROR, the format was not handled.
+          * Use an arbitrary GLenum format. */
+         _mesa_format_matches_format_and_type(f, GL_RG, datatype,
+                                              GL_FALSE, &error);
+         EXPECT_EQ((GLenum)GL_NO_ERROR, error);
+      }
+
+   }
+}
+
+/**
+ * Do sanity checking of the format info table.
+ */
+TEST(MesaFormatsTest, FormatSanity)
+{
+   for (int fi = 0; fi < MESA_FORMAT_COUNT; ++fi) {
+      mesa_format f = (mesa_format) fi;
+      SCOPED_TRACE(_mesa_get_format_name(f));
+      GLenum datatype = _mesa_get_format_datatype(f);
+      GLint r = _mesa_get_format_bits(f, GL_RED_BITS);
+      GLint g = _mesa_get_format_bits(f, GL_GREEN_BITS);
+      GLint b = _mesa_get_format_bits(f, GL_BLUE_BITS);
+      GLint a = _mesa_get_format_bits(f, GL_ALPHA_BITS);
+      GLint l = _mesa_get_format_bits(f, GL_TEXTURE_LUMINANCE_SIZE);
+      GLint i = _mesa_get_format_bits(f, GL_TEXTURE_INTENSITY_SIZE);
+
+      /* Note: Z32_FLOAT_X24S8 has datatype of GL_NONE */
+      EXPECT_TRUE(datatype == GL_NONE ||
+                  datatype == GL_UNSIGNED_NORMALIZED ||
+                  datatype == GL_SIGNED_NORMALIZED ||
+                  datatype == GL_UNSIGNED_INT ||
+                  datatype == GL_INT ||
+                  datatype == GL_FLOAT);
+
+      if (r > 0 && !_mesa_is_format_compressed(f)) {
+         GLint bytes = _mesa_get_format_bytes(f);
+         EXPECT_LE((r+g+b+a) / 8, bytes);
+      }
+
+      /* Determines if the base format has a channel [rgba] or property [li].
+      * >  indicates existance
+      * == indicates non-existance
+      */
+      #define HAS_PROP(rop,gop,bop,aop,lop,iop) \
+         do { \
+            EXPECT_TRUE(r rop 0); \
+            EXPECT_TRUE(g gop 0); \
+            EXPECT_TRUE(b bop 0); \
+            EXPECT_TRUE(a aop 0); \
+            EXPECT_TRUE(l lop 0); \
+            EXPECT_TRUE(i iop 0); \
+         } while(0)
+
+      switch (_mesa_get_format_base_format(f)) {
+      case GL_RGBA:
+         HAS_PROP(>,>,>,>,==,==);
+         break;
+      case GL_RGB:
+         HAS_PROP(>,>,>,==,==,==);
+         break;
+      case GL_RG:
+         HAS_PROP(>,>,==,==,==,==);
+         break;
+      case GL_RED:
+         HAS_PROP(>,==,==,==,==,==);
+         break;
+      case GL_LUMINANCE:
+         HAS_PROP(==,==,==,==,>,==);
+         break;
+      case GL_INTENSITY:
+         HAS_PROP(==,==,==,==,==,>);
+         break;
+      default:
+         break;
+      }
+
+      #undef HAS_PROP
+
+   }
+}
diff --git a/src/mesa/main/texcompress.c b/src/mesa/main/texcompress.c
index 0fd1a3683d1..edfb03625c2 100644
--- a/src/mesa/main/texcompress.c
+++ b/src/mesa/main/texcompress.c
@@ -586,34 +586,16 @@ _mesa_compressed_image_address(GLint col, GLint row, GLint img,
 compressed_fetch_func
 _mesa_get_compressed_fetch_func(mesa_format format)
 {
-   switch (format) {
-   case MESA_FORMAT_RGB_DXT1:
-   case MESA_FORMAT_RGBA_DXT1:
-   case MESA_FORMAT_RGBA_DXT3:
-   case MESA_FORMAT_RGBA_DXT5:
-   case MESA_FORMAT_SRGB_DXT1:
-   case MESA_FORMAT_SRGBA_DXT1:
-   case MESA_FORMAT_SRGBA_DXT3:
-   case MESA_FORMAT_SRGBA_DXT5:
+   switch (_mesa_get_format_layout(format)) {
+   case MESA_FORMAT_LAYOUT_S3TC:
       return _mesa_get_dxt_fetch_func(format);
-   case MESA_FORMAT_RGB_FXT1:
-   case MESA_FORMAT_RGBA_FXT1:
+   case MESA_FORMAT_LAYOUT_FXT1:
       return _mesa_get_fxt_fetch_func(format);
-   case MESA_FORMAT_R_RGTC1_UNORM:
-   case MESA_FORMAT_L_LATC1_UNORM:
-   case MESA_FORMAT_R_RGTC1_SNORM:
-   case MESA_FORMAT_L_LATC1_SNORM:
-   case MESA_FORMAT_RG_RGTC2_UNORM:
-   case MESA_FORMAT_LA_LATC2_UNORM:
-   case MESA_FORMAT_RG_RGTC2_SNORM:
-   case MESA_FORMAT_LA_LATC2_SNORM:
+   case MESA_FORMAT_LAYOUT_RGTC:
       return _mesa_get_compressed_rgtc_func(format);
-   case MESA_FORMAT_ETC1_RGB8:
+   case MESA_FORMAT_LAYOUT_ETC1:
       return _mesa_get_etc_fetch_func(format);
-   case MESA_FORMAT_BPTC_RGBA_UNORM:
-   case MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM:
-   case MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT:
-   case MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT:
+   case MESA_FORMAT_LAYOUT_BPTC:
       return _mesa_get_bptc_fetch_func(format);
    default:
       return NULL;
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index c0ccce3d50e..3c1e166ffa1 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -651,7 +651,7 @@ get_tex_memcpy(struct gl_context *ctx,
        texBaseFormat == texImage->_BaseFormat) {
       memCopy = _mesa_format_matches_format_and_type(texImage->TexFormat,
                                                      format, type,
-                                                     ctx->Pack.SwapBytes);
+                                                     ctx->Pack.SwapBytes, NULL);
    }
 
    if (depth > 1) {
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 3a556a6ad6e..274ecad44e9 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1785,18 +1785,36 @@ compressedteximage_only_format(const struct gl_context *ctx, GLenum format)
 }
 
 
+/* Writes to an GL error pointer if non-null and returns whether or not the
+ * error is GL_NO_ERROR */
+static bool
+write_error(GLenum *err_ptr, GLenum error)
+{
+   if (err_ptr)
+      *err_ptr = error;
+
+   return error == GL_NO_ERROR;
+}
+
 /**
  * Helper function to determine whether a target and specific compression
- * format are supported.
+ * format are supported. The error parameter returns GL_NO_ERROR if the
+ * target can be compressed. Otherwise it returns either GL_INVALID_OPERATION
+ * or GL_INVALID_ENUM, whichever is more appropriate.
  */
 GLboolean
 _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
-                               GLenum intFormat)
+                               GLenum intFormat, GLenum *error)
 {
+   GLboolean target_can_be_compresed = GL_FALSE;
+   mesa_format format = _mesa_glenum_to_compressed_format(intFormat);
+   enum mesa_format_layout layout = _mesa_get_format_layout(format);
+
    switch (target) {
    case GL_TEXTURE_2D:
    case GL_PROXY_TEXTURE_2D:
-      return GL_TRUE; /* true for any compressed format so far */
+      target_can_be_compresed = GL_TRUE; /* true for any compressed format so far */
+      break;
    case GL_PROXY_TEXTURE_CUBE_MAP:
    case GL_TEXTURE_CUBE_MAP:
    case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
@@ -1805,26 +1823,46 @@ _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
    case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
    case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
    case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      return ctx->Extensions.ARB_texture_cube_map;
+      target_can_be_compresed = ctx->Extensions.ARB_texture_cube_map;
+      break;
    case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
    case GL_TEXTURE_2D_ARRAY_EXT:
-      return ctx->Extensions.EXT_texture_array;
+      target_can_be_compresed = ctx->Extensions.EXT_texture_array;
+      break;
    case GL_PROXY_TEXTURE_CUBE_MAP_ARRAY:
    case GL_TEXTURE_CUBE_MAP_ARRAY:
-      return ctx->Extensions.ARB_texture_cube_map_array;
+      /* From section 3.8.6, page 146 of OpenGL ES 3.0 spec:
+       *
+       *    "The ETC2/EAC texture compression algorithm supports only
+       *     two-dimensional images. If internalformat is an ETC2/EAC format,
+       *     glCompressedTexImage3D will generate an INVALID_OPERATION error if
+       *     target is not TEXTURE_2D_ARRAY."
+       *
+       * This should also be applicable for glTexStorage3D(). Other available
+       * targets for these functions are: TEXTURE_3D and TEXTURE_CUBE_MAP_ARRAY.
+       */
+      if (layout == MESA_FORMAT_LAYOUT_ETC2 && _mesa_is_gles3(ctx))
+            return write_error(error, GL_INVALID_OPERATION);
+
+      target_can_be_compresed = ctx->Extensions.ARB_texture_cube_map_array;
+      break;
    case GL_TEXTURE_3D:
-      switch (intFormat) {
-      case GL_COMPRESSED_RGBA_BPTC_UNORM:
-      case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
-      case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
-      case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
-         return ctx->Extensions.ARB_texture_compression_bptc;
-      default:
-         return GL_FALSE;
+
+      /* See ETC2/EAC comment in switch case GL_TEXTURE_CUBE_MAP_ARRAY. */
+      if (layout == MESA_FORMAT_LAYOUT_ETC2 && _mesa_is_gles3(ctx))
+         return write_error(error, GL_INVALID_OPERATION);
+
+      if (layout == MESA_FORMAT_LAYOUT_BPTC) {
+         target_can_be_compresed = ctx->Extensions.ARB_texture_compression_bptc;
+         break;
       }
+
+      break;
    default:
-      return GL_FALSE;
+      break;
    }
+   return write_error(error,
+                      target_can_be_compresed ? GL_NO_ERROR : GL_INVALID_ENUM);
 }
 
 
@@ -2284,8 +2322,9 @@ texture_error_check( struct gl_context *ctx,
 
    /* additional checks for compressed textures */
    if (_mesa_is_compressed_format(ctx, internalFormat)) {
-      if (!_mesa_target_can_be_compressed(ctx, target, internalFormat)) {
-         _mesa_error(ctx, GL_INVALID_ENUM,
+      GLenum err;
+      if (!_mesa_target_can_be_compressed(ctx, target, internalFormat, &err)) {
+         _mesa_error(ctx, err,
                      "glTexImage%dD(target can't be compressed)", dimensions);
          return GL_TRUE;
       }
@@ -2340,16 +2379,8 @@ compressed_texture_error_check(struct gl_context *ctx, GLint dimensions,
    GLenum error = GL_NO_ERROR;
    char *reason = ""; /* no error */
 
-   if (!_mesa_target_can_be_compressed(ctx, target, internalFormat)) {
+   if (!_mesa_target_can_be_compressed(ctx, target, internalFormat, &error)) {
       reason = "target";
-      /* From section 3.8.6, page 146 of OpenGL ES 3.0 spec:
-       *
-       *    "The ETC2/EAC texture compression algorithm supports only
-       *     two-dimensional images. If internalformat is an ETC2/EAC format,
-       *     CompressedTexImage3D will generate an INVALID_OPERATION error if
-       *     target is not TEXTURE_2D_ARRAY."
-       */
-      error = _mesa_is_desktop_gl(ctx) ? GL_INVALID_ENUM : GL_INVALID_OPERATION;
       goto error;
    }
 
@@ -2813,9 +2844,10 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
    }
 
    if (_mesa_is_compressed_format(ctx, internalFormat)) {
-      if (!_mesa_target_can_be_compressed(ctx, target, internalFormat)) {
-         _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glCopyTexImage%dD(target)", dimensions);
+      GLenum err;
+      if (!_mesa_target_can_be_compressed(ctx, target, internalFormat, &err)) {
+         _mesa_error(ctx, err,
+                     "glCopyTexImage%dD(target can't be compressed)", dimensions);
          return GL_TRUE;
       }
       if (compressedteximage_only_format(ctx, internalFormat)) {
@@ -5569,10 +5601,13 @@ static GLboolean
 is_renderable_texture_format(struct gl_context *ctx, GLenum internalformat)
 {
    /* Everything that is allowed for renderbuffers,
-    * except for a base format of GL_STENCIL_INDEX.
+    * except for a base format of GL_STENCIL_INDEX, unless supported.
     */
    GLenum baseFormat = _mesa_base_fbo_format(ctx, internalformat);
-   return baseFormat != 0 && baseFormat != GL_STENCIL_INDEX;
+   if (ctx->Extensions.ARB_texture_stencil8)
+      return baseFormat != 0;
+   else
+      return baseFormat != 0 && baseFormat != GL_STENCIL_INDEX;
 }
 
 
@@ -5596,13 +5631,13 @@ check_multisample_target(GLuint dims, GLenum target, bool dsa)
 
 
 static void
-_mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
-                                struct gl_texture_object *texObj,
-                                GLenum target, GLsizei samples,
-                                GLint internalformat, GLsizei width,
-                                GLsizei height, GLsizei depth,
-                                GLboolean fixedsamplelocations,
-                                GLboolean immutable, const char *func)
+texture_image_multisample(struct gl_context *ctx, GLuint dims,
+                          struct gl_texture_object *texObj,
+                          GLenum target, GLsizei samples,
+                          GLint internalformat, GLsizei width,
+                          GLsizei height, GLsizei depth,
+                          GLboolean fixedsamplelocations,
+                          GLboolean immutable, const char *func)
 {
    struct gl_texture_image *texImage;
    GLboolean sizeOK, dimensionsOK, samplesOK;
@@ -5616,6 +5651,11 @@ _mesa_texture_image_multisample(struct gl_context *ctx, GLuint dims,
       return;
    }
 
+   if (samples < 1) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(samples < 1)", func);
+      return;
+   }
+
    if (!check_multisample_target(dims, target, dsa)) {
       if (dsa) {
          _mesa_error(ctx, GL_INVALID_OPERATION, "%s(target)", func);
@@ -5763,10 +5803,10 @@ _mesa_TexImage2DMultisample(GLenum target, GLsizei samples,
    if (!texObj)
       return;
 
-   _mesa_texture_image_multisample(ctx, 2, texObj, target, samples,
-                                   internalformat, width, height, 1,
-                                   fixedsamplelocations, GL_FALSE,
-                                   "glTexImage2DMultisample");
+   texture_image_multisample(ctx, 2, texObj, target, samples,
+                             internalformat, width, height, 1,
+                             fixedsamplelocations, GL_FALSE,
+                             "glTexImage2DMultisample");
 }
 
 
@@ -5783,12 +5823,26 @@ _mesa_TexImage3DMultisample(GLenum target, GLsizei samples,
    if (!texObj)
       return;
 
-   _mesa_texture_image_multisample(ctx, 3, texObj, target, samples,
-                                   internalformat, width, height, depth,
-                                   fixedsamplelocations, GL_FALSE,
-                                   "glTexImage3DMultisample");
+   texture_image_multisample(ctx, 3, texObj, target, samples,
+                             internalformat, width, height, depth,
+                             fixedsamplelocations, GL_FALSE,
+                             "glTexImage3DMultisample");
 }
 
+static bool
+valid_texstorage_ms_parameters(GLsizei width, GLsizei height, GLsizei depth,
+                               GLsizei samples, unsigned dims)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!_mesa_valid_tex_storage_dim(width, height, depth)) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "glTexStorage%uDMultisample(width=%d,height=%d,depth=%d)",
+                  dims, width, height, depth);
+      return false;
+   }
+   return true;
+}
 
 void GLAPIENTRY
 _mesa_TexStorage2DMultisample(GLenum target, GLsizei samples,
@@ -5802,10 +5856,13 @@ _mesa_TexStorage2DMultisample(GLenum target, GLsizei samples,
    if (!texObj)
       return;
 
-   _mesa_texture_image_multisample(ctx, 2, texObj, target, samples,
-                                   internalformat, width, height, 1,
-                                   fixedsamplelocations, GL_TRUE,
-                                   "glTexStorage2DMultisample");
+   if (!valid_texstorage_ms_parameters(width, height, 1, samples, 2))
+      return;
+
+   texture_image_multisample(ctx, 2, texObj, target, samples,
+                             internalformat, width, height, 1,
+                             fixedsamplelocations, GL_TRUE,
+                             "glTexStorage2DMultisample");
 }
 
 void GLAPIENTRY
@@ -5821,10 +5878,13 @@ _mesa_TexStorage3DMultisample(GLenum target, GLsizei samples,
    if (!texObj)
       return;
 
-   _mesa_texture_image_multisample(ctx, 3, texObj, target, samples,
-                                   internalformat, width, height, depth,
-                                   fixedsamplelocations, GL_TRUE,
-                                   "glTexStorage3DMultisample");
+   if (!valid_texstorage_ms_parameters(width, height, depth, samples, 3))
+      return;
+
+   texture_image_multisample(ctx, 3, texObj, target, samples,
+                             internalformat, width, height, depth,
+                             fixedsamplelocations, GL_TRUE,
+                             "glTexStorage3DMultisample");
 }
 
 void GLAPIENTRY
@@ -5841,10 +5901,13 @@ _mesa_TextureStorage2DMultisample(GLuint texture, GLsizei samples,
    if (!texObj)
       return;
 
-   _mesa_texture_image_multisample(ctx, 2, texObj, texObj->Target, samples,
-                                   internalformat, width, height, 1,
-                                   fixedsamplelocations, GL_TRUE,
-                                   "glTextureStorage2DMultisample");
+   if (!valid_texstorage_ms_parameters(width, height, 1, samples, 2))
+      return;
+
+   texture_image_multisample(ctx, 2, texObj, texObj->Target, samples,
+                             internalformat, width, height, 1,
+                             fixedsamplelocations, GL_TRUE,
+                             "glTextureStorage2DMultisample");
 }
 
 void GLAPIENTRY
@@ -5862,8 +5925,11 @@ _mesa_TextureStorage3DMultisample(GLuint texture, GLsizei samples,
    if (!texObj)
       return;
 
-   _mesa_texture_image_multisample(ctx, 3, texObj, texObj->Target, samples,
-                                   internalformat, width, height, depth,
-                                   fixedsamplelocations, GL_TRUE,
-                                   "glTextureStorage3DMultisample");
+   if (!valid_texstorage_ms_parameters(width, height, depth, samples, 3))
+      return;
+
+   texture_image_multisample(ctx, 3, texObj, texObj->Target, samples,
+                             internalformat, width, height, depth,
+                             fixedsamplelocations, GL_TRUE,
+                             "glTextureStorage3DMultisample");
 }
diff --git a/src/mesa/main/teximage.h b/src/mesa/main/teximage.h
index bf729daf534..a4736b5a29f 100644
--- a/src/mesa/main/teximage.h
+++ b/src/mesa/main/teximage.h
@@ -133,7 +133,7 @@ _mesa_test_proxy_teximage(struct gl_context *ctx, GLenum target, GLint level,
 
 extern GLboolean
 _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
-                               GLenum intFormat);
+                               GLenum intFormat, GLenum *error);
 
 extern GLuint
 _mesa_tex_target_to_face(GLenum target);
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index cd7cfd6a4fb..c5d83e145a6 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -37,6 +37,7 @@
 #include "hash.h"
 #include "imports.h"
 #include "macros.h"
+#include "shaderimage.h"
 #include "teximage.h"
 #include "texobj.h"
 #include "texstate.h"
@@ -1411,8 +1412,10 @@ unbind_texobj_from_image_units(struct gl_context *ctx,
    for (i = 0; i < ctx->Const.MaxImageUnits; i++) {
       struct gl_image_unit *unit = &ctx->ImageUnits[i];
 
-      if (texObj == unit->TexObj)
+      if (texObj == unit->TexObj) {
          _mesa_reference_texobj(&unit->TexObj, NULL);
+         *unit = _mesa_default_image_unit(ctx);
+      }
    }
 }
 
@@ -1742,10 +1745,10 @@ _mesa_BindTexture( GLenum target, GLuint texName )
  * texture object will be decremented.  It'll be deleted if the
  * count hits zero.
  */
-void
-_mesa_bind_texture_unit(struct gl_context *ctx,
-                        GLuint unit,
-                        struct gl_texture_object *texObj)
+static void
+bind_texture_unit(struct gl_context *ctx,
+                  GLuint unit,
+                  struct gl_texture_object *texObj)
 {
    struct gl_texture_unit *texUnit;
 
@@ -1834,7 +1837,7 @@ _mesa_BindTextureUnit(GLuint unit, GLuint texture)
    }
    assert(valid_texture_object(texObj));
 
-   _mesa_bind_texture_unit(ctx, unit, texObj);
+   bind_texture_unit(ctx, unit, texObj);
 }
 
 
diff --git a/src/mesa/main/texobj.h b/src/mesa/main/texobj.h
index ec5ccb27601..690878c85fc 100644
--- a/src/mesa/main/texobj.h
+++ b/src/mesa/main/texobj.h
@@ -209,10 +209,6 @@ extern void
 _mesa_delete_nameless_texture(struct gl_context *ctx,
                               struct gl_texture_object *texObj);
 
-extern void
-_mesa_bind_texture_unit(struct gl_context *ctx,
-                        GLuint unit,
-                        struct gl_texture_object *texObj);
 
 /*@}*/
 
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index c0611c3e489..16739f1779b 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1562,6 +1562,19 @@ invalid_pname:
                _mesa_enum_to_string(pname));
 }
 
+static bool
+valid_tex_level_parameteriv_target(struct gl_context *ctx, GLenum target,
+                                   bool dsa)
+{
+   const char *suffix = dsa ? "ture" : "";
+   if (!legal_get_tex_level_parameter_target(ctx, target, dsa)) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glGetTex%sLevelParameter[if]v(target=%s)", suffix,
+                  _mesa_enum_to_string(target));
+      return false;
+   }
+   return true;
+}
 
 /**
  * This isn't exposed to the rest of the driver because it is a part of the
@@ -1585,13 +1598,6 @@ get_tex_level_parameteriv(struct gl_context *ctx,
       return;
    }
 
-   if (!legal_get_tex_level_parameter_target(ctx, target, dsa)) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glGetTex%sLevelParameter[if]v(target=%s)", suffix,
-                  _mesa_enum_to_string(target));
-      return;
-   }
-
    maxLevels = _mesa_max_texture_levels(ctx, target);
    assert(maxLevels != 0);
 
@@ -1619,6 +1625,9 @@ _mesa_GetTexLevelParameterfv( GLenum target, GLint level,
    GLint iparam;
    GET_CURRENT_CONTEXT(ctx);
 
+   if (!valid_tex_level_parameteriv_target(ctx, target, false))
+      return;
+
    texObj = _mesa_get_current_tex_object(ctx, target);
    if (!texObj)
       return;
@@ -1636,6 +1645,9 @@ _mesa_GetTexLevelParameteriv( GLenum target, GLint level,
    struct gl_texture_object *texObj;
    GET_CURRENT_CONTEXT(ctx);
 
+   if (!valid_tex_level_parameteriv_target(ctx, target, false))
+      return;
+
    texObj = _mesa_get_current_tex_object(ctx, target);
    if (!texObj)
       return;
@@ -1657,6 +1669,9 @@ _mesa_GetTextureLevelParameterfv(GLuint texture, GLint level,
    if (!texObj)
       return;
 
+   if (!valid_tex_level_parameteriv_target(ctx, texObj->Target, true))
+      return;
+
    get_tex_level_parameteriv(ctx, texObj, texObj->Target, level,
                              pname, &iparam, true);
 
@@ -1675,6 +1690,9 @@ _mesa_GetTextureLevelParameteriv(GLuint texture, GLint level,
    if (!texObj)
       return;
 
+   if (!valid_tex_level_parameteriv_target(ctx, texObj->Target, true))
+      return;
+
    get_tex_level_parameteriv(ctx, texObj, texObj->Target, level,
                              pname, params, true);
 }
@@ -1890,6 +1908,12 @@ get_tex_parameterfv(struct gl_context *ctx,
          *params = (GLfloat) obj->Sampler.sRGBDecode;
          break;
 
+      case GL_IMAGE_FORMAT_COMPATIBILITY_TYPE:
+         if (!ctx->Extensions.ARB_shader_image_load_store)
+            goto invalid_pname;
+         *params = (GLfloat) obj->ImageFormatCompatibilityType;
+         break;
+
       default:
          goto invalid_pname;
    }
diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c
index 4a2cc6065df..c53bb295c66 100644
--- a/src/mesa/main/texstorage.c
+++ b/src/mesa/main/texstorage.c
@@ -189,6 +189,20 @@ clear_texture_fields(struct gl_context *ctx,
 }
 
 
+/**
+ * Update/re-validate framebuffer object.
+ */
+static void
+update_fbo_texture(struct gl_context *ctx, struct gl_texture_object *texObj)
+{
+   const unsigned numFaces = _mesa_num_tex_faces(texObj->Target);
+   for (int level = 0; level < ARRAY_SIZE(texObj->Image[0]); level++) {
+      for (unsigned face = 0; face < numFaces; face++)
+         _mesa_update_fbo_texture(ctx, texObj, face, level);
+   }
+}
+
+
 GLboolean
 _mesa_is_legal_tex_storage_format(struct gl_context *ctx, GLenum internalformat)
 {
@@ -287,29 +301,21 @@ tex_storage_error_check(struct gl_context *ctx,
     * order to allow meta functions to use legacy formats. */
 
    /* size check */
-   if (width < 1 || height < 1 || depth < 1) {
+   if (!_mesa_valid_tex_storage_dim(width, height, depth)) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glTex%sStorage%uD(width, height or depth < 1)",
                   suffix, dims);
       return GL_TRUE;
    }  
 
-   /* From section 3.8.6, page 146 of OpenGL ES 3.0 spec:
-    *
-    *    "The ETC2/EAC texture compression algorithm supports only
-    *     two-dimensional images. If internalformat is an ETC2/EAC format,
-    *     CompressedTexImage3D will generate an INVALID_OPERATION error if
-    *     target is not TEXTURE_2D_ARRAY."
-    *
-    * This should also be applicable for glTexStorage3D().
-    */
-   if (_mesa_is_compressed_format(ctx, internalformat)
-       && !_mesa_target_can_be_compressed(ctx, target, internalformat)) {
-      _mesa_error(ctx, _mesa_is_desktop_gl(ctx)?
-                  GL_INVALID_ENUM : GL_INVALID_OPERATION,
+   if (_mesa_is_compressed_format(ctx, internalformat)) {
+      GLenum err;
+      if (!_mesa_target_can_be_compressed(ctx, target, internalformat, &err)) {
+         _mesa_error(ctx, err,
                   "glTex%sStorage%dD(internalformat = %s)", suffix, dims,
                   _mesa_enum_to_string(internalformat));
-      return GL_TRUE;
+         return GL_TRUE;
+      }
    }
 
    /* levels check */
@@ -446,6 +452,7 @@ _mesa_texture_storage(struct gl_context *ctx, GLuint dims,
 
       _mesa_set_texture_view_state(ctx, texObj, target, levels);
 
+      update_fbo_texture(ctx, texObj);
    }
 }
 
diff --git a/src/mesa/main/texstorage.h b/src/mesa/main/texstorage.h
index 6f5495f3856..033ecb7edaa 100644
--- a/src/mesa/main/texstorage.h
+++ b/src/mesa/main/texstorage.h
@@ -38,6 +38,27 @@ _mesa_texture_storage(struct gl_context *ctx, GLuint dims,
                       GLenum internalformat, GLsizei width,
                       GLsizei height, GLsizei depth, bool dsa);
 
+/**
+ * Texture width, height and depth check shared with the
+ * multisample variants of TexStorage functions.
+ *
+ * From OpenGL 4.5 Core spec, page 260 (section 8.19)
+ *
+ *     "An INVALID_VALUE error is generated if width, height, depth
+ *     or levels are less than 1, for commands with the corresponding
+ *     parameters."
+ *
+ * (referring to TextureStorage* commands, these also match values
+ * specified for OpenGL ES 3.1.)
+ */
+static inline bool
+_mesa_valid_tex_storage_dim(GLsizei width, GLsizei height, GLsizei depth)
+{
+   if (width < 1 || height < 1 || depth < 1)
+      return false;
+   return true;
+}
+
 /*@}*/
 
 /**
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 37c05690091..fc83310d4e3 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -863,7 +863,7 @@ _mesa_texstore_can_use_memcpy(struct gl_context *ctx,
 
    /* The Mesa format must match the input format and type. */
    if (!_mesa_format_matches_format_and_type(dstFormat, srcFormat, srcType,
-                                             srcPacking->SwapBytes)) {
+                                             srcPacking->SwapBytes, NULL)) {
       return GL_FALSE;
    }
 
diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index 036530e91b6..10266189259 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -319,24 +319,31 @@ _mesa_get_uniform(struct gl_context *ctx, GLuint program, GLint location,
 
       return;
    }
+   if ((uni->type->base_type == GLSL_TYPE_DOUBLE &&
+        returnType != GLSL_TYPE_DOUBLE) ||
+       (uni->type->base_type != GLSL_TYPE_DOUBLE &&
+        returnType == GLSL_TYPE_DOUBLE)) {
+	 _mesa_error( ctx, GL_INVALID_OPERATION,
+	             "glGetnUniform*vARB(incompatible uniform types)");
+	return;
+   }
 
    {
       unsigned elements = (uni->type->is_sampler())
 	 ? 1 : uni->type->components();
+      const int dmul = uni->type->base_type == GLSL_TYPE_DOUBLE ? 2 : 1;
 
       /* Calculate the source base address *BEFORE* modifying elements to
        * account for the size of the user's buffer.
        */
       const union gl_constant_value *const src =
-	 &uni->storage[offset * elements];
+	 &uni->storage[offset * elements * dmul];
 
       assert(returnType == GLSL_TYPE_FLOAT || returnType == GLSL_TYPE_INT ||
-             returnType == GLSL_TYPE_UINT);
-      /* The three (currently) supported types all have the same size,
-       * which is of course the same as their union. That'll change
-       * with glGetUniformdv()...
-       */
-      unsigned bytes = sizeof(src[0]) * elements;
+             returnType == GLSL_TYPE_UINT || returnType == GLSL_TYPE_DOUBLE);
+
+      /* doubles have a different size than the other 3 types */
+      unsigned bytes = sizeof(src[0]) * elements * dmul;
       if (bufSize < 0 || bytes > (unsigned) bufSize) {
 	 _mesa_error( ctx, GL_INVALID_OPERATION,
 	             "glGetnUniform*vARB(out of bounds: bufSize is %d,"
@@ -677,9 +684,11 @@ _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shProg,
       match = (basicType != GLSL_TYPE_DOUBLE);
       break;
    case GLSL_TYPE_SAMPLER:
-   case GLSL_TYPE_IMAGE:
       match = (basicType == GLSL_TYPE_INT);
       break;
+   case GLSL_TYPE_IMAGE:
+      match = (basicType == GLSL_TYPE_INT && _mesa_is_desktop_gl(ctx));
+      break;
    default:
       match = (basicType == uni->type->base_type);
       break;
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index ff1df72e1d6..10819e2e21a 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -888,16 +888,7 @@ _mesa_GetnUniformdvARB(GLuint program, GLint location,
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   (void) program;
-   (void) location;
-   (void) bufSize;
-   (void) params;
-
-   /*
    _mesa_get_uniform(ctx, program, location, bufSize, GLSL_TYPE_DOUBLE, params);
-   */
-   _mesa_error(ctx, GL_INVALID_OPERATION, "glGetUniformdvARB"
-               "(GL_ARB_gpu_shader_fp64 not implemented)");
 }
 
 void GLAPIENTRY
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index d54f934247d..d96b7bc8782 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -33,6 +33,7 @@
 #include "prog_instruction.h"
 #include "prog_parameter.h"
 #include "prog_print.h"
+#include "program.h"
 
 /**
  * \file prog_to_nir.c
@@ -142,7 +143,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
       load->variables[0] = nir_deref_var_create(load, c->input_vars[prog_src->Index]);
 
       nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
-      nir_instr_insert_after_cf_list(b->cf_node_list, &load->instr);
+      nir_builder_instr_insert(b, &load->instr);
 
       src.src = nir_src_for_ssa(&load->dest.ssa);
       break;
@@ -166,6 +167,8 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
          }
          /* FALLTHROUGH */
       case PROGRAM_STATE_VAR: {
+         assert(c->parameters != NULL);
+
          nir_intrinsic_instr *load =
             nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_var);
          nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
@@ -200,7 +203,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
             deref_arr->base_offset = prog_src->Index;
          }
 
-         nir_instr_insert_after_cf_list(b->cf_node_list, &load->instr);
+         nir_builder_instr_insert(b, &load->instr);
 
          src.src = nir_src_for_ssa(&load->dest.ssa);
          break;
@@ -250,7 +253,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
             mov->dest.write_mask = 0x1;
             mov->src[0] = src;
             mov->src[0].swizzle[0] = swizzle;
-            nir_instr_insert_after_cf_list(b->cf_node_list, &mov->instr);
+            nir_builder_instr_insert(b, &mov->instr);
 
             chans[i] = &mov->dest.dest.ssa;
          }
@@ -278,7 +281,7 @@ ptn_alu(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src)
       instr->src[i].src = nir_src_for_ssa(src[i]);
 
    instr->dest = dest;
-   nir_instr_insert_after_cf_list(b->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(b, &instr->instr);
 }
 
 static void
@@ -297,7 +300,7 @@ ptn_move_dest_masked(nir_builder *b, nir_alu_dest dest,
    mov->src[0].src = nir_src_for_ssa(def);
    for (unsigned i = def->num_components; i < 4; i++)
       mov->src[0].swizzle[i] = def->num_components - 1;
-   nir_instr_insert_after_cf_list(b->cf_node_list, &mov->instr);
+   nir_builder_instr_insert(b, &mov->instr);
 }
 
 static void
@@ -558,7 +561,7 @@ ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
    nir_intrinsic_instr *discard =
       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
    discard->src[0] = nir_src_for_ssa(cmp);
-   nir_instr_insert_after_cf_list(b->cf_node_list, &discard->instr);
+   nir_builder_instr_insert(b, &discard->instr);
 }
 
 static void
@@ -685,7 +688,7 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src,
    assert(src_number == num_srcs);
 
    nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL);
-   nir_instr_insert_after_cf_list(b->cf_node_list, &instr->instr);
+   nir_builder_instr_insert(b, &instr->instr);
 
    /* Resolve the writemask on the texture op. */
    ptn_move_dest(b, dest, &instr->dest.ssa);
@@ -941,7 +944,7 @@ ptn_add_output_stores(struct ptn_compile *c)
       } else {
          store->src[0].reg.reg = c->output_regs[var->data.location];
       }
-      nir_instr_insert_after_cf_list(c->build.cf_node_list, &store->instr);
+      nir_builder_instr_insert(b, &store->instr);
    }
 }
 
@@ -985,7 +988,7 @@ setup_registers_and_variables(struct ptn_compile *c)
             load_x->num_components = 1;
             load_x->variables[0] = nir_deref_var_create(load_x, var);
             nir_ssa_dest_init(&load_x->instr, &load_x->dest, 1, NULL);
-            nir_instr_insert_after_cf_list(b->cf_node_list, &load_x->instr);
+            nir_builder_instr_insert(b, &load_x->instr);
 
             nir_ssa_def *f001 = nir_vec4(b, &load_x->dest.ssa, nir_imm_float(b, 0.0),
                                          nir_imm_float(b, 0.0), nir_imm_float(b, 1.0));
@@ -1001,7 +1004,7 @@ setup_registers_and_variables(struct ptn_compile *c)
             store->num_components = 4;
             store->variables[0] = nir_deref_var_create(store, fullvar);
             store->src[0] = nir_src_for_ssa(f001);
-            nir_instr_insert_after_cf_list(b->cf_node_list, &store->instr);
+            nir_builder_instr_insert(b, &store->instr);
 
             /* Insert the real input into the list so the driver has real
              * inputs, but set c->input_vars[i] to the temporary so we use
@@ -1079,22 +1082,25 @@ prog_to_nir(const struct gl_program *prog,
 {
    struct ptn_compile *c;
    struct nir_shader *s;
+   gl_shader_stage stage = _mesa_program_enum_to_shader_stage(prog->Target);
 
    c = rzalloc(NULL, struct ptn_compile);
    if (!c)
       return NULL;
-   s = nir_shader_create(NULL, options);
+   s = nir_shader_create(NULL, stage, options);
    if (!s)
       goto fail;
    c->prog = prog;
 
-   c->parameters = rzalloc(s, nir_variable);
-   c->parameters->type = glsl_array_type(glsl_vec4_type(),
-                                            prog->Parameters->NumParameters);
-   c->parameters->name = "parameters";
-   c->parameters->data.read_only = true;
-   c->parameters->data.mode = nir_var_uniform;
-   exec_list_push_tail(&s->uniforms, &c->parameters->node);
+   if (prog->Parameters->NumParameters > 0) {
+      c->parameters = rzalloc(s, nir_variable);
+      c->parameters->type =
+         glsl_array_type(glsl_vec4_type(), prog->Parameters->NumParameters);
+      c->parameters->name = "parameters";
+      c->parameters->data.read_only = true;
+      c->parameters->data.mode = nir_var_uniform;
+      exec_list_push_tail(&s->uniforms, &c->parameters->node);
+   }
 
    nir_function *func = nir_function_create(s, "main");
    nir_function_overload *overload = nir_function_overload_create(func);
@@ -1102,7 +1108,7 @@ prog_to_nir(const struct gl_program *prog,
 
    c->build.shader = s;
    c->build.impl = impl;
-   c->build.cf_node_list = &impl->body;
+   nir_builder_insert_after_cf_list(&c->build, &impl->body);
 
    setup_registers_and_variables(c);
    if (unlikely(c->error))
diff --git a/src/mesa/state_tracker/st_cb_blit.c b/src/mesa/state_tracker/st_cb_blit.c
index 139690615d6..4fdef7fb5a6 100644
--- a/src/mesa/state_tracker/st_cb_blit.c
+++ b/src/mesa/state_tracker/st_cb_blit.c
@@ -192,6 +192,7 @@ st_BlitFramebuffer(struct gl_context *ctx,
 
    blit.filter = pFilter;
    blit.render_condition_enable = TRUE;
+   blit.alpha_blend = FALSE;
 
    if (mask & GL_COLOR_BUFFER_BIT) {
       struct gl_renderbuffer_attachment *srcAtt =
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 18ea43fa71a..6ff6cf6f6d6 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -139,7 +139,7 @@ st_readpixels(struct gl_context *ctx, GLint x, GLint y,
     * in which case the memcpy-based fast path will likely be used and
     * we don't have to blit. */
    if (_mesa_format_matches_format_and_type(rb->Format, format,
-                                            type, pack->SwapBytes)) {
+                                            type, pack->SwapBytes, NULL)) {
       goto fallback;
    }
 
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 715d69c0c68..93335aefe6c 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -695,7 +695,7 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims,
     * in which case the memcpy-based fast path will likely be used and
     * we don't have to blit. */
    if (_mesa_format_matches_format_and_type(texImage->TexFormat, format,
-                                            type, unpack->SwapBytes)) {
+                                            type, unpack->SwapBytes, NULL)) {
       goto fallback;
    }
 
@@ -963,7 +963,7 @@ st_GetTexSubImage(struct gl_context * ctx,
    /* See if the texture format already matches the format and type,
     * in which case the memcpy-based fast path will be used. */
    if (_mesa_format_matches_format_and_type(texImage->TexFormat, format,
-                                            type, ctx->Pack.SwapBytes)) {
+                                            type, ctx->Pack.SwapBytes, NULL)) {
       goto fallback;
    }
 
@@ -1071,6 +1071,8 @@ st_GetTexSubImage(struct gl_context * ctx,
 
    /* From now on, we need the gallium representation of dimensions. */
    if (gl_target == GL_TEXTURE_1D_ARRAY) {
+      zoffset = yoffset;
+      yoffset = 0;
       depth = height;
       height = 1;
    }
@@ -1114,7 +1116,7 @@ st_GetTexSubImage(struct gl_context * ctx,
 
    /* copy/pack data into user buffer */
    if (_mesa_format_matches_format_and_type(mesa_format, format, type,
-                                            ctx->Pack.SwapBytes)) {
+                                            ctx->Pack.SwapBytes, NULL)) {
       /* memcpy */
       const uint bytesPerRow = width * util_format_get_blocksize(dst_format);
       GLuint row, slice;
@@ -1871,6 +1873,31 @@ st_TextureView(struct gl_context *ctx,
    return GL_TRUE;
 }
 
+/* HACK: this is only enough for the most basic uses of CopyImage. Must fix
+ * before actually exposing the extension.
+ */
+static void
+st_CopyImageSubData(struct gl_context *ctx,
+                    struct gl_texture_image *src_image,
+                    int src_x, int src_y, int src_z,
+                    struct gl_texture_image *dst_image,
+                    int dst_x, int dst_y, int dst_z,
+                    int src_width, int src_height)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+   struct st_texture_image *src = st_texture_image(src_image);
+   struct st_texture_image *dst = st_texture_image(dst_image);
+
+   struct pipe_box box;
+
+   u_box_2d_zslice(src_x, src_y, src_z, src_width, src_height, &box);
+   pipe->resource_copy_region(pipe, dst->pt, dst_image->Level,
+                              dst_x, dst_y, dst_z,
+                              src->pt, src_image->Level,
+                              &box);
+}
+
 
 void
 st_init_texture_functions(struct dd_function_table *functions)
@@ -1903,4 +1930,6 @@ st_init_texture_functions(struct dd_function_table *functions)
 
    functions->AllocTextureStorage = st_AllocTextureStorage;
    functions->TextureView = st_TextureView;
+
+   functions->CopyImageSubData = st_CopyImageSubData;
 }
diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index db7b5b71d66..db74184cff4 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -1917,7 +1917,7 @@ st_choose_matching_format(struct st_context *st, unsigned bind,
       }
 
       if (_mesa_format_matches_format_and_type(mesa_format, format, type,
-                                               swapBytes)) {
+                                               swapBytes, NULL)) {
          enum pipe_format format =
             st_mesa_format_to_pipe_format(st, mesa_format);
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 6f007273c73..cba98819718 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -2816,7 +2816,7 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
        */
       glsl_to_tgsi_instruction *inst, *new_inst;
       inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
-      new_inst = emit_asm(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2]);
+      new_inst = emit_asm(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2], inst->src[3]);
       new_inst->saturate = inst->saturate;
       inst->dead_mask = inst->dst[0].writemask;
    } else {
@@ -4402,12 +4402,12 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
     * new visitor. */
    foreach_in_list(glsl_to_tgsi_instruction, inst, &original->instructions) {
       glsl_to_tgsi_instruction *newinst;
-      st_src_reg src_regs[3];
+      st_src_reg src_regs[4];
 
       if (inst->dst[0].file == PROGRAM_OUTPUT)
          prog->OutputsWritten |= BITFIELD64_BIT(inst->dst[0].index);
 
-      for (int i = 0; i < 3; i++) {
+      for (int i = 0; i < 4; i++) {
          src_regs[i] = inst->src[i];
          if (src_regs[i].file == PROGRAM_INPUT &&
              src_regs[i].index == VARYING_SLOT_COL0) {
@@ -4418,7 +4418,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
             prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2], src_regs[3]);
       newinst->tex_target = inst->tex_target;
       newinst->sampler_array_size = inst->sampler_array_size;
    }
@@ -4487,18 +4487,18 @@ get_bitmap_visitor(struct st_fragment_program *fp,
     * new visitor. */
    foreach_in_list(glsl_to_tgsi_instruction, inst, &original->instructions) {
       glsl_to_tgsi_instruction *newinst;
-      st_src_reg src_regs[3];
+      st_src_reg src_regs[4];
 
       if (inst->dst[0].file == PROGRAM_OUTPUT)
          prog->OutputsWritten |= BITFIELD64_BIT(inst->dst[0].index);
 
-      for (int i = 0; i < 3; i++) {
+      for (int i = 0; i < 4; i++) {
          src_regs[i] = inst->src[i];
          if (src_regs[i].file == PROGRAM_INPUT)
             prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2], src_regs[3]);
       newinst->tex_target = inst->tex_target;
       newinst->sampler_array_size = inst->sampler_array_size;
    }
diff --git a/src/mesa/swrast/s_drawpix.c b/src/mesa/swrast/s_drawpix.c
index dc6827ede9f..5393d50ea02 100644
--- a/src/mesa/swrast/s_drawpix.c
+++ b/src/mesa/swrast/s_drawpix.c
@@ -242,7 +242,7 @@ fast_draw_rgba_pixels(struct gl_context *ctx, GLint x, GLint y,
    }
 
    if (_mesa_format_matches_format_and_type(rb->Format, format, type,
-                                            ctx->Unpack.SwapBytes)) {
+                                            ctx->Unpack.SwapBytes, NULL)) {
       fast_draw_generic_pixels(ctx, rb, x, y, width, height,
                                format, type, &unpack, pixels);
       return GL_TRUE;
diff --git a/src/util/bitset.h b/src/util/bitset.h
index febcddefd97..c452819414f 100644
--- a/src/util/bitset.h
+++ b/src/util/bitset.h
@@ -96,4 +96,40 @@ __bitset_ffs(const BITSET_WORD *x, int n)
 
 #define BITSET_FFS(x) __bitset_ffs(x, ARRAY_SIZE(x))
 
+static inline unsigned
+__bitset_next_set(unsigned i, BITSET_WORD *tmp,
+                  BITSET_WORD *set, unsigned size)
+{
+   unsigned bit, word;
+
+   /* NOTE: The initial conditions for this function are very specific.  At
+    * the start of the loop, the tmp variable must be set to *set and the
+    * initial i value set to 0.  This way, if there is a bit set in the first
+    * word, we ignore the i-value and just grab that bit (so 0 is ok, even
+    * though 0 may be returned).  If the first word is 0, then the value of
+    * `word` will be 0 and we will go on to look at the second word.
+    */
+   word = BITSET_BITWORD(i);
+   while (*tmp == 0) {
+      word++;
+
+      if (word >= BITSET_WORDS(size))
+         return size;
+
+      *tmp = set[word];
+   }
+
+   /* Find the next set bit in the non-zero word */
+   bit = ffs(*tmp) - 1;
+
+   /* Unset the bit */
+   *tmp &= ~(1ull << bit);
+
+   return word * BITSET_WORDBITS + bit;
+}
+
+#define BITSET_FOREACH_SET(__i, __tmp, __set, __size) \
+   for (__tmp = *(__set), __i = 0; \
+        (__i = __bitset_next_set(__i, &__tmp, __set, __size)) < __size;)
+
 #endif
diff --git a/src/util/register_allocate.c b/src/util/register_allocate.c
index 436e008b01a..8af93c0406c 100644
--- a/src/util/register_allocate.c
+++ b/src/util/register_allocate.c
@@ -183,7 +183,7 @@ struct ra_graph {
  * using ralloc_free().
  */
 struct ra_regs *
-ra_alloc_reg_set(void *mem_ctx, unsigned int count)
+ra_alloc_reg_set(void *mem_ctx, unsigned int count, bool need_conflict_lists)
 {
    unsigned int i;
    struct ra_regs *regs;
@@ -197,9 +197,15 @@ ra_alloc_reg_set(void *mem_ctx, unsigned int count)
                                               BITSET_WORDS(count));
       BITSET_SET(regs->regs[i].conflicts, i);
 
-      regs->regs[i].conflict_list = ralloc_array(regs->regs, unsigned int, 4);
-      regs->regs[i].conflict_list_size = 4;
-      regs->regs[i].conflict_list[0] = i;
+      if (need_conflict_lists) {
+         regs->regs[i].conflict_list = ralloc_array(regs->regs,
+                                                    unsigned int, 4);
+         regs->regs[i].conflict_list_size = 4;
+         regs->regs[i].conflict_list[0] = i;
+      } else {
+         regs->regs[i].conflict_list = NULL;
+         regs->regs[i].conflict_list_size = 0;
+      }
       regs->regs[i].num_conflicts = 1;
    }
 
@@ -227,12 +233,14 @@ ra_add_conflict_list(struct ra_regs *regs, unsigned int r1, unsigned int r2)
 {
    struct ra_reg *reg1 = &regs->regs[r1];
 
-   if (reg1->conflict_list_size == reg1->num_conflicts) {
-      reg1->conflict_list_size *= 2;
-      reg1->conflict_list = reralloc(regs->regs, reg1->conflict_list,
-				     unsigned int, reg1->conflict_list_size);
+   if (reg1->conflict_list) {
+      if (reg1->conflict_list_size == reg1->num_conflicts) {
+         reg1->conflict_list_size *= 2;
+         reg1->conflict_list = reralloc(regs->regs, reg1->conflict_list,
+                                        unsigned int, reg1->conflict_list_size);
+      }
+      reg1->conflict_list[reg1->num_conflicts++] = r2;
    }
-   reg1->conflict_list[reg1->num_conflicts++] = r2;
    BITSET_SET(reg1->conflicts, r2);
 }
 
@@ -255,7 +263,7 @@ ra_add_reg_conflict(struct ra_regs *regs, unsigned int r1, unsigned int r2)
  */
 void
 ra_add_transitive_reg_conflict(struct ra_regs *regs,
-			       unsigned int base_reg, unsigned int reg)
+                               unsigned int base_reg, unsigned int reg)
 {
    unsigned int i;
 
@@ -266,13 +274,37 @@ ra_add_transitive_reg_conflict(struct ra_regs *regs,
    }
 }
 
+/**
+ * Makes every conflict on the given register transitive.  In other words,
+ * every register that conflicts with r will now conflict with every other
+ * register conflicting with r.
+ *
+ * This can simplify code for setting up multiple register classes
+ * which are aggregates of some base hardware registers, compared to
+ * explicitly using ra_add_reg_conflict.
+ */
+void
+ra_make_reg_conflicts_transitive(struct ra_regs *regs, unsigned int r)
+{
+   struct ra_reg *reg = &regs->regs[r];
+   BITSET_WORD tmp;
+   int c;
+
+   BITSET_FOREACH_SET(c, tmp, reg->conflicts, regs->count) {
+      struct ra_reg *other = &regs->regs[c];
+      unsigned i;
+      for (i = 0; i < BITSET_WORDS(regs->count); i++)
+         other->conflicts[i] |= reg->conflicts[i];
+   }
+}
+
 unsigned int
 ra_alloc_reg_class(struct ra_regs *regs)
 {
    struct ra_class *class;
 
    regs->classes = reralloc(regs->regs, regs->classes, struct ra_class *,
-			    regs->class_count + 1);
+                            regs->class_count + 1);
 
    class = rzalloc(regs, struct ra_class);
    regs->classes[regs->class_count] = class;
@@ -319,7 +351,7 @@ ra_set_finalize(struct ra_regs *regs, unsigned int **q_values)
       for (b = 0; b < regs->class_count; b++) {
          for (c = 0; c < regs->class_count; c++) {
             regs->classes[b]->q[c] = q_values[b][c];
-	 }
+         }
       }
    } else {
       /* Compute, for each class B and C, how many regs of B an
@@ -410,14 +442,14 @@ ra_alloc_interference_graph(struct ra_regs *regs, unsigned int count)
 
 void
 ra_set_node_class(struct ra_graph *g,
-		  unsigned int n, unsigned int class)
+                  unsigned int n, unsigned int class)
 {
    g->nodes[n].class = class;
 }
 
 void
 ra_add_node_interference(struct ra_graph *g,
-			 unsigned int n1, unsigned int n2)
+                         unsigned int n1, unsigned int n2)
 {
    if (!BITSET_TEST(g->nodes[n1].adjacency, n2)) {
       ra_add_node_adjacency(g, n1, n2);
@@ -445,7 +477,7 @@ decrement_q(struct ra_graph *g, unsigned int n)
 
       if (n != n2 && !g->nodes[n2].in_stack) {
          assert(g->nodes[n2].q_total >= g->regs->classes[n2_class]->q[n_class]);
-	 g->nodes[n2].q_total -= g->regs->classes[n2_class]->q[n_class];
+         g->nodes[n2].q_total -= g->regs->classes[n2_class]->q[n_class];
       }
    }
 }
diff --git a/src/util/register_allocate.h b/src/util/register_allocate.h
index 61f182eff49..628d2bbbced 100644
--- a/src/util/register_allocate.h
+++ b/src/util/register_allocate.h
@@ -44,13 +44,15 @@ struct ra_regs;
  * registers, such as aligned register pairs that conflict with the
  * two real registers from which they are composed.
  */
-struct ra_regs *ra_alloc_reg_set(void *mem_ctx, unsigned int count);
+struct ra_regs *ra_alloc_reg_set(void *mem_ctx, unsigned int count,
+                                 bool need_conflict_lists);
 void ra_set_allocate_round_robin(struct ra_regs *regs);
 unsigned int ra_alloc_reg_class(struct ra_regs *regs);
 void ra_add_reg_conflict(struct ra_regs *regs,
 			 unsigned int r1, unsigned int r2);
 void ra_add_transitive_reg_conflict(struct ra_regs *regs,
 				    unsigned int base_reg, unsigned int reg);
+void ra_make_reg_conflicts_transitive(struct ra_regs *regs, unsigned int reg);
 void ra_class_add_reg(struct ra_regs *regs, unsigned int c, unsigned int reg);
 void ra_set_num_conflicts(struct ra_regs *regs, unsigned int class_a,
                           unsigned int class_b, unsigned int num_conflicts);
diff --git a/src/util/rounding.h b/src/util/rounding.h
index 7b5608b8a78..afb38fbdb56 100644
--- a/src/util/rounding.h
+++ b/src/util/rounding.h
@@ -24,9 +24,8 @@
 #ifndef _ROUNDING_H
 #define _ROUNDING_H
 
-#include "c99_compat.h" // inline
+#include "c99_math.h"
 
-#include <math.h>
 #include <limits.h>
 #include <stdint.h>
 
diff --git a/src/vulkan/anv_compiler.cpp b/src/vulkan/anv_compiler.cpp
index 2dbf59f991e..4cbf98afa1e 100644
--- a/src/vulkan/anv_compiler.cpp
+++ b/src/vulkan/anv_compiler.cpp
@@ -1007,7 +1007,7 @@ anv_compile_shader_spirv(struct anv_compiler *compiler,
 
    mesa_shader->Program->nir =
       spirv_to_nir((uint32_t *)shader->module->data, shader->module->size / 4,
-                   glsl_options->NirOptions);
+                   stage_info[stage].stage, glsl_options->NirOptions);
    nir_validate_shader(mesa_shader->Program->nir);
 
    brw_process_nir(mesa_shader->Program->nir,
author	Jason Ekstrand <[email protected]>	2015-08-25 17:12:03 -0700
committer	Jason Ekstrand <[email protected]>	2015-08-25 18:41:21 -0700
commit	9b387b5d3f4103c51079ea5298d33086af6da433 (patch)
tree	4127f2284b6b4a5746bbc01bbfc6a97305057cb4
parent	5360edcb304e147341b934567f3bbf40e9d5a3b5 (diff)
parent	1d2a844e7d55645ea3d24fb589bec03695b3d2b1 (diff)