340 files changed, 16617 insertions, 3727 deletions
diff --git a/configure.ac b/configure.ac
index 0c88db9f66f..758751c4b94 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,7 +81,7 @@ PRESENTPROTO_REQUIRED=1.0
 LIBUDEV_REQUIRED=151
 GLPROTO_REQUIRED=1.4.14
 LIBOMXIL_BELLAGIO_REQUIRED=0.0
-LIBVA_REQUIRED=0.35.0
+LIBVA_REQUIRED=0.38.0
 VDPAU_REQUIRED=1.1
 WAYLAND_REQUIRED=1.2.0
 XCB_REQUIRED=1.9.3
@@ -867,7 +867,7 @@ GALLIUM_DRIVERS_DEFAULT="r300,r600,svga,swrast"
 AC_ARG_WITH([gallium-drivers],
     [AS_HELP_STRING([--with-gallium-drivers@<:@=DIRS...@:>@],
         [comma delimited Gallium drivers list, e.g.
-        "i915,ilo,nouveau,r300,r600,radeonsi,freedreno,svga,swrast,vc4"
+        "i915,ilo,nouveau,r300,r600,radeonsi,freedreno,svga,swrast,vc4,virgl"
         @<:@default=r300,r600,svga,swrast@:>@])],
     [with_gallium_drivers="$withval"],
     [with_gallium_drivers="$GALLIUM_DRIVERS_DEFAULT"])
@@ -2188,6 +2188,12 @@ if test -n "$with_gallium_drivers"; then
             PKG_CHECK_MODULES([SIMPENROSE], [simpenrose],
                               [USE_VC4_SIMULATOR=yes], [USE_VC4_SIMULATOR=no])
             ;;
+        xvirgl)
+            HAVE_GALLIUM_VIRGL=yes
+            gallium_require_drm "virgl"
+            gallium_require_drm_loader
+            require_egl_drm "virgl"
+            ;;
         *)
             AC_MSG_ERROR([Unknown Gallium driver: $driver])
             ;;
@@ -2259,6 +2265,7 @@ AM_CONDITIONAL(HAVE_GALLIUM_FREEDRENO, test "x$HAVE_GALLIUM_FREEDRENO" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_SOFTPIPE, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_LLVMPIPE, test "x$HAVE_GALLIUM_LLVMPIPE" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes)
+AM_CONDITIONAL(HAVE_GALLIUM_VIRGL, test "x$HAVE_GALLIUM_VIRGL" = xyes)
 
 AM_CONDITIONAL(HAVE_GALLIUM_STATIC_TARGETS, test "x$enable_shared_pipe_drivers" = xno)
 
@@ -2386,6 +2393,7 @@ AC_CONFIG_FILES([Makefile
 		src/gallium/drivers/svga/Makefile
 		src/gallium/drivers/trace/Makefile
 		src/gallium/drivers/vc4/Makefile
+		src/gallium/drivers/virgl/Makefile
 		src/gallium/state_trackers/clover/Makefile
 		src/gallium/state_trackers/dri/Makefile
 		src/gallium/state_trackers/glx/xlib/Makefile
@@ -2426,6 +2434,8 @@ AC_CONFIG_FILES([Makefile
 		src/gallium/winsys/sw/wrapper/Makefile
 		src/gallium/winsys/sw/xlib/Makefile
 		src/gallium/winsys/vc4/drm/Makefile
+		src/gallium/winsys/virgl/drm/Makefile
+		src/gallium/winsys/virgl/vtest/Makefile
 		src/gbm/Makefile
 		src/gbm/main/gbm.pc
 		src/glsl/Makefile
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 167321676df..7f6b8c9ef27 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -153,10 +153,10 @@ GL 4.3, GLSL 4.30:
   GL_ARB_ES3_compatibility                             DONE (all drivers that support GLSL 3.30)
   GL_ARB_clear_buffer_object                           DONE (all drivers)
   GL_ARB_compute_shader                                in progress (jljusten)
-  GL_ARB_copy_image                                    DONE (i965) (gallium - in progress, VMware)
+  GL_ARB_copy_image                                    DONE (i965, nv50, nvc0, radeonsi)
   GL_KHR_debug                                         DONE (all drivers)
   GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_fragment_layer_viewport                       DONE (nv50, nvc0, r600, radeonsi, llvmpipe)
+  GL_ARB_fragment_layer_viewport                       DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
   GL_ARB_framebuffer_no_attachments                    DONE (i965)
   GL_ARB_internalformat_query2                         not started
   GL_ARB_invalidate_subdata                            DONE (all drivers)
@@ -243,7 +243,7 @@ GLES3.2, GLSL ES 3.2
   GL_KHR_texture_compression_astc_ldr                  DONE (i965/gen9+)
   GL_OES_copy_image                                    not started (based on GL_ARB_copy_image, which is done for some drivers)
   GL_OES_draw_buffers_indexed                          not started
-  GL_OES_draw_elements_base_vertex                     not started (based on GL_ARB_draw_elements_base_vertex, which is done for all drivers)
+  GL_OES_draw_elements_base_vertex                     DONE (all drivers)
   GL_OES_geometry_shader                               not started (based on GL_ARB_geometry_shader4, which is done for all drivers)
   GL_OES_gpu_shader5                                   not started (based on parts of GL_ARB_gpu_shader5, which is done for some drivers)
   GL_OES_primitive_bounding box                        not started
diff --git a/docs/index.html b/docs/index.html
index 138447fc500..c8d4a5c2699 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,12 @@
 
 <h1>News</h1>
 
+<h2>October 24, 2015</h2>
+<p>
+<a href="relnotes/11.0.4.html">Mesa 11.0.4</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>October 10, 2015</h2>
 <p>
 <a href="relnotes/11.0.3.html">Mesa 11.0.3</a> is released.
@@ -28,7 +34,7 @@ This is a bug-fix release.
 This is a bug-fix release.
 <br>
 NOTE: It is anticipated that 10.6.9 will be the final release in the 10.6
-series. Users of 10.5 are encouraged to migrate to the 11.0 series in order
+series. Users of 10.6 are encouraged to migrate to the 11.0 series in order
 to obtain future fixes.
 </p>
 
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 074c3b6a612..d1dde4fd726 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,7 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/11.0.4.html">11.0.4 release notes</a>
 <li><a href="relnotes/11.0.3.html">11.0.3 release notes</a>
 <li><a href="relnotes/10.6.9.html">10.6.9 release notes</a>
 <li><a href="relnotes/11.0.2.html">11.0.2 release notes</a>
diff --git a/docs/relnotes/11.0.4.html b/docs/relnotes/11.0.4.html
new file mode 100644
index 00000000000..a777b9de506
--- /dev/null
+++ b/docs/relnotes/11.0.4.html
@@ -0,0 +1,168 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 11.0.4 Release Notes / October 24, 2015</h1>
+
+<p>
+Mesa 11.0.4 is a bug fix release which fixes bugs found since the 11.0.3 release.
+</p>
+<p>
+Mesa 11.0.4 implements the OpenGL 4.1 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.1.  OpenGL
+4.1 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+ed412ca6a46d1bd055120e5c12806c15419ae8c4dd6d3f6ea20a83091d5c78bf  mesa-11.0.4.tar.gz
+40201bf7fc6fa12a6d9edfe870b41eb4dd6669154e3c42c48a96f70805f5483d  mesa-11.0.4.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86281">Bug 86281</a> - brw_meta_fast_clear (brw=brw&#64;entry=0x7fffd4097a08, fb=fb&#64;entry=0x7fffd40fa900, buffers=buffers&#64;entry=2, partial_clear=partial_clear&#64;entry=false)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86720">Bug 86720</a> - [radeon] Europa Universalis 4 freezing during game start (10.3.3+, still broken on 11.0.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91788">Bug 91788</a> - [HSW Regression] Synmark2_v6 Multithread performance case FPS reduced by 36%</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92304">Bug 92304</a> - [cts] cts.shaders.negative conformance tests fail</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alejandro Piñeiro (2):</p>
+<ul>
+  <li>i965/vec4: check writemask when bailing out at register coalesce</li>
+  <li>i965/vec4: fill src_reg type using the constructor type parameter</li>
+</ul>
+
+<p>Brian Paul (2):</p>
+<ul>
+  <li>vbo: fix incorrect switch statement in init_mat_currval()</li>
+  <li>mesa: fix incorrect opcode in save_BlendFunci()</li>
+</ul>
+
+<p>Chih-Wei Huang (3):</p>
+<ul>
+  <li>mesa: android: Fix the incorrect path of sse_minmax.c</li>
+  <li>nv50/ir: use C++11 standard std::unordered_map if possible</li>
+  <li>nv30: include the header of ffs prototype</li>
+</ul>
+
+<p>Chris Wilson (1):</p>
+<ul>
+  <li>i965: Remove early release of DRI2 miptree</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>mesa/uniforms: fix get_uniform for doubles (v2)</li>
+</ul>
+
+<p>Emil Velikov (1):</p>
+<ul>
+  <li>docs: add sha256 checksums for 11.0.3</li>
+</ul>
+
+<p>Francisco Jerez (5):</p>
+<ul>
+  <li>i965: Don't tell the hardware about our UAV access.</li>
+  <li>mesa: Expose function to calculate whether a shader image unit is valid.</li>
+  <li>mesa: Skip redundant texture completeness checking during image validation.</li>
+  <li>i965: Use _mesa_is_image_unit_valid() instead of gl_image_unit::_Valid.</li>
+  <li>mesa: Get rid of texture-dependent image unit derived state.</li>
+</ul>
+
+<p>Ian Romanick (8):</p>
+<ul>
+  <li>glsl: Allow built-in functions as constant expressions in OpenGL ES 1.00</li>
+  <li>ff_fragment_shader: Use binding to set the sampler unit</li>
+  <li>glsl/linker: Use constant_initializer instead of constant_value to initialize uniforms</li>
+  <li>glsl: Use constant_initializer instead of constant_value to determine whether to keep an unused uniform</li>
+  <li>glsl: Only set ir_variable::constant_value for const-decorated variables</li>
+  <li>glsl: Restrict initializers for global variables to constant expression in ES</li>
+  <li>glsl: Add method to determine whether an expression contains the sequence operator</li>
+  <li>glsl: In later GLSL versions, sequence operator is cannot be a constant expression</li>
+</ul>
+
+<p>Ilia Mirkin (1):</p>
+<ul>
+  <li>nouveau: make sure there's always room to emit a fence</li>
+</ul>
+
+<p>Indrajit Das (1):</p>
+<ul>
+  <li>st/va: Used correct parameter to derive the value of the "h" variable in vlVaCreateImage</li>
+</ul>
+
+<p>Jonathan Gray (1):</p>
+<ul>
+  <li>configure.ac: ensure RM is set</li>
+</ul>
+
+<p>Krzysztof Sobiecki (1):</p>
+<ul>
+  <li>st/fbo: use pipe_surface_release instead of pipe_surface_reference</li>
+</ul>
+
+<p>Leo Liu (1):</p>
+<ul>
+  <li>st/omx/dec/h264: fix field picture type 0 poc disorder</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>st/mesa: fix clip state dependencies</li>
+  <li>radeonsi: fix a GS copy shader leak</li>
+  <li>gallium: add PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT</li>
+</ul>
+
+<p>Nicolai Hähnle (1):</p>
+<ul>
+  <li>u_vbuf: fix vb slot assignment for translated buffers</li>
+</ul>
+
+<p>Rob Clark (1):</p>
+<ul>
+  <li>freedreno/a3xx: cache-flush is needed after MEM_WRITE</li>
+</ul>
+
+<p>Tapani Pälli (3):</p>
+<ul>
+  <li>mesa: add GL_UNSIGNED_INT_24_8 to _mesa_pack_depth_span</li>
+  <li>mesa: Set api prefix to version string when overriding version</li>
+  <li>mesa: fix ARRAY_SIZE query for GetProgramResourceiv</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/11.1.0.html b/docs/relnotes/11.1.0.html
index d3dbe9dda13..7160244fcb4 100644
--- a/docs/relnotes/11.1.0.html
+++ b/docs/relnotes/11.1.0.html
@@ -45,15 +45,21 @@ Note: some of the new features are only available with certain drivers.
 
 <ul>
 <li>GL_ARB_blend_func_extended on freedreno (a3xx)</li>
+<li>GL_ARB_copy_image on radeonsi</li>
 <li>GL_ARB_gpu_shader_fp64 on r600 for Cypress/Cayman/Aruba chips</li>
 <li>GL_ARB_gpu_shader5 on r600 for Evergreen and later chips</li>
+<li>GL_ARB_shader_clock on i965 (gen7+)</li>
+<li>GL_ARB_shader_stencil_export on i965 (gen9+)</li>
 <li>GL_ARB_shader_storage_buffer_object on i965</li>
 <li>GL_ARB_shader_texture_image_samples on i965, nv50, nvc0, r600, radeonsi</li>
 <li>GL_ARB_texture_barrier / GL_NV_texture_barrier on i965</li>
 <li>GL_ARB_texture_query_lod on softpipe</li>
 <li>GL_ARB_texture_view on radeonsi</li>
+<li>GL_EXT_draw_elements_base_vertex on all drivers</li>
+<li>GL_OES_draw_elements_base_vertex on all drivers</li>
 <li>EGL_KHR_create_context on softpipe, llvmpipe</li>
 <li>EGL_KHR_gl_colorspace on softpipe, llvmpipe</li>
+<li>new virgl gallium driver for qemu virtio-gpu</li>
 </ul>
 
 <h2>Bug fixes</h2>
diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index a0f155a1f42..6bbd3fa87f5 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -495,7 +495,7 @@ struct __DRIdamageExtensionRec {
  * SWRast Loader extension.
  */
 #define __DRI_SWRAST_LOADER "DRI_SWRastLoader"
-#define __DRI_SWRAST_LOADER_VERSION 2
+#define __DRI_SWRAST_LOADER_VERSION 3
 struct __DRIswrastLoaderExtensionRec {
     __DRIextension base;
 
@@ -528,6 +528,15 @@ struct __DRIswrastLoaderExtensionRec {
     void (*putImage2)(__DRIdrawable *drawable, int op,
                       int x, int y, int width, int height, int stride,
                       char *data, void *loaderPrivate);
+
+   /**
+     * Put image to drawable
+     *
+     * \since 3
+     */
+   void (*getImage2)(__DRIdrawable *readable,
+		     int x, int y, int width, int height, int stride,
+		     char *data, void *loaderPrivate);
 };
 
 /**
diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index 8a425999429..5891ba67ea4 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -109,21 +109,29 @@ CHIPSET(0x162A, bdw_gt3, "Intel(R) Iris Pro P6300 (Broadwell GT3e)")
 CHIPSET(0x162B, bdw_gt3, "Intel(R) Iris 6100 (Broadwell GT3)")
 CHIPSET(0x162D, bdw_gt3, "Intel(R) Broadwell GT3")
 CHIPSET(0x162E, bdw_gt3, "Intel(R) Broadwell GT3")
-CHIPSET(0x1902, skl_gt1, "Intel(R) Skylake DT  GT1")
-CHIPSET(0x1906, skl_gt1, "Intel(R) Skylake ULT GT1")
-CHIPSET(0x190A, skl_gt1, "Intel(R) Skylake SRV GT1")
-CHIPSET(0x190B, skl_gt1, "Intel(R) Skylake Halo GT1")
-CHIPSET(0x190E, skl_gt1, "Intel(R) Skylake ULX GT1")
-CHIPSET(0x1912, skl_gt2, "Intel(R) Skylake DT  GT2")
-CHIPSET(0x1916, skl_gt2, "Intel(R) Skylake ULT GT2")
-CHIPSET(0x191A, skl_gt2, "Intel(R) Skylake SRV GT2")
-CHIPSET(0x191B, skl_gt2, "Intel(R) Skylake Halo GT2")
-CHIPSET(0x191D, skl_gt2, "Intel(R) Skylake WKS GT2")
-CHIPSET(0x191E, skl_gt2, "Intel(R) Skylake ULX GT2")
-CHIPSET(0x1921, skl_gt2, "Intel(R) Skylake ULT GT2F")
-CHIPSET(0x1926, skl_gt3, "Intel(R) Skylake ULT GT3")
-CHIPSET(0x192A, skl_gt3, "Intel(R) Skylake SRV GT3")
-CHIPSET(0x192B, skl_gt3, "Intel(R) Skylake Halo GT3")
+CHIPSET(0x1902, skl_gt1, "Intel(R) HD Graphics 510 (Skylake GT1)")
+CHIPSET(0x1906, skl_gt1, "Intel(R) HD Graphics 510 (Skylake GT1)")
+CHIPSET(0x190A, skl_gt1, "Intel(R) Skylake GT1")
+CHIPSET(0x190E, skl_gt1, "Intel(R) Skylake GT1")
+CHIPSET(0x1912, skl_gt2, "Intel(R) HD Graphics 530 (Skylake GT2)")
+CHIPSET(0x1913, skl_gt2, "Intel(R) Skylake GT2f")
+CHIPSET(0x1915, skl_gt2, "Intel(R) Skylake GT2f")
+CHIPSET(0x1916, skl_gt2, "Intel(R) HD Graphics 520 (Skylake GT2)")
+CHIPSET(0x1917, skl_gt2, "Intel(R) Skylake GT2f")
+CHIPSET(0x191A, skl_gt2, "Intel(R) Skylake GT2")
+CHIPSET(0x191B, skl_gt2, "Intel(R) HD Graphics 530 (Skylake GT2)")
+CHIPSET(0x191D, skl_gt2, "Intel(R) HD Graphics P530 (Skylake GT2)")
+CHIPSET(0x191E, skl_gt2, "Intel(R) HD Graphics 515 (Skylake GT2)")
+CHIPSET(0x1921, skl_gt2, "Intel(R) Skylake GT2")
+CHIPSET(0x1923, skl_gt3, "Intel(R) Iris Graphics 540 (Skylake GT3e)")
+CHIPSET(0x1926, skl_gt3, "Intel(R) HD Graphics 535 (Skylake GT3)")
+CHIPSET(0x1927, skl_gt3, "Intel(R) Iris Graphics 550 (Skylake GT3e)")
+CHIPSET(0x192A, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x192B, skl_gt3, "Intel(R) Iris Graphics (Skylake GT3fe)")
+CHIPSET(0x1932, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x193A, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x193B, skl_gt4, "Intel(R) Skylake GT4")
+CHIPSET(0x193D, skl_gt4, "Intel(R) Skylake GT4")
 CHIPSET(0x22B0, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B1, chv,     "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B2, chv,     "Intel(R) HD Graphics (Cherryview)")
diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h
index 52eada1d3d5..bcf15a186c6 100644
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -181,3 +181,5 @@ CHIPSET(0x9876, CARRIZO_, CARRIZO)
 CHIPSET(0x9877, CARRIZO_, CARRIZO)
 
 CHIPSET(0x7300, FIJI_, FIJI)
+
+CHIPSET(0x98E4, STONEY_, STONEY)
diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am
index a7c3606de0a..611d55fafe2 100644
--- a/src/gallium/Makefile.am
+++ b/src/gallium/Makefile.am
@@ -82,6 +82,11 @@ if HAVE_GALLIUM_VC4
 SUBDIRS += drivers/vc4 winsys/vc4/drm
 endif
 
+## virgl
+if HAVE_GALLIUM_VIRGL
+SUBDIRS += drivers/virgl winsys/virgl/drm winsys/virgl/vtest
+endif
+
 ## the sw winsys'
 SUBDIRS += winsys/sw/null
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 017d0752060..96aba7370c1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -427,6 +427,7 @@ lp_build_init(void)
        */
       util_cpu_caps.has_avx = 0;
       util_cpu_caps.has_avx2 = 0;
+      util_cpu_caps.has_f16c = 0;
    }
 
 #ifdef PIPE_ARCH_PPC_64
@@ -458,7 +459,9 @@ lp_build_init(void)
    util_cpu_caps.has_sse3 = 0;
    util_cpu_caps.has_ssse3 = 0;
    util_cpu_caps.has_sse4_1 = 0;
+   util_cpu_caps.has_sse4_2 = 0;
    util_cpu_caps.has_avx = 0;
+   util_cpu_caps.has_avx2 = 0;
    util_cpu_caps.has_f16c = 0;
 #endif
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 72fab8ccf06..7bda1184ee9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -497,20 +497,48 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
 #endif
    }
 
-   llvm::SmallVector<std::string, 1> MAttrs;
-   if (util_cpu_caps.has_avx) {
-      /*
-       * AVX feature is not automatically detected from CPUID by the X86 target
-       * yet, because the old (yet default) JIT engine is not capable of
-       * emitting the opcodes. On newer llvm versions it is and at least some
-       * versions (tested with 3.3) will emit avx opcodes without this anyway.
-       */
-      MAttrs.push_back("+avx");
-      if (util_cpu_caps.has_f16c) {
-         MAttrs.push_back("+f16c");
-      }
-      builder.setMAttrs(MAttrs);
-   }
+   llvm::SmallVector<std::string, 16> MAttrs;
+
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+   /*
+    * We need to unset attributes because sometimes LLVM mistakenly assumes
+    * certain features are present given the processor name.
+    *
+    * https://bugs.freedesktop.org/show_bug.cgi?id=92214
+    * http://llvm.org/PR25021
+    * http://llvm.org/PR19429
+    * http://llvm.org/PR16721
+    */
+   MAttrs.push_back(util_cpu_caps.has_sse    ? "+sse"    : "-sse"   );
+   MAttrs.push_back(util_cpu_caps.has_sse2   ? "+sse2"   : "-sse2"  );
+   MAttrs.push_back(util_cpu_caps.has_sse3   ? "+sse3"   : "-sse3"  );
+   MAttrs.push_back(util_cpu_caps.has_ssse3  ? "+ssse3"  : "-ssse3" );
+#if HAVE_LLVM >= 0x0304
+   MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse4.1" : "-sse4.1");
+#else
+   MAttrs.push_back(util_cpu_caps.has_sse4_1 ? "+sse41"  : "-sse41" );
+#endif
+#if HAVE_LLVM >= 0x0304
+   MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse4.2" : "-sse4.2");
+#else
+   MAttrs.push_back(util_cpu_caps.has_sse4_2 ? "+sse42"  : "-sse42" );
+#endif
+   /*
+    * AVX feature is not automatically detected from CPUID by the X86 target
+    * yet, because the old (yet default) JIT engine is not capable of
+    * emitting the opcodes. On newer llvm versions it is and at least some
+    * versions (tested with 3.3) will emit avx opcodes without this anyway.
+    */
+   MAttrs.push_back(util_cpu_caps.has_avx  ? "+avx"  : "-avx");
+   MAttrs.push_back(util_cpu_caps.has_f16c ? "+f16c" : "-f16c");
+   MAttrs.push_back(util_cpu_caps.has_avx2 ? "+avx2" : "-avx2");
+#endif
+
+#if defined(PIPE_ARCH_PPC)
+   MAttrs.push_back(util_cpu_caps.has_altivec ? "+altivec" : "-altivec");
+#endif
+
+   builder.setMAttrs(MAttrs);
 
 #if HAVE_LLVM >= 0x0305
    StringRef MCPU = llvm::sys::getHostCPUName();
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index b5c06b69571..26bfa0d2677 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -405,16 +405,17 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
       break;
 
    case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      if (offset) {
+         offset = lp_build_int_to_float(coord_bld, offset);
+         offset = lp_build_div(coord_bld, offset, length_f);
+         coord = lp_build_add(coord_bld, coord, offset);
+      }
       /* compute mirror function */
       coord = lp_build_coord_mirror(bld, coord);
 
       /* scale coord to length */
       coord = lp_build_mul(coord_bld, coord, length_f);
       coord = lp_build_sub(coord_bld, coord, half);
-      if (offset) {
-         offset = lp_build_int_to_float(coord_bld, offset);
-         coord = lp_build_add(coord_bld, coord, offset);
-      }
 
       /* convert to int, compute lerp weight */
       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
@@ -567,12 +568,13 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
          coord = lp_build_mul(coord_bld, coord, length_f);
       }
 
+      if (offset) {
+         offset = lp_build_int_to_float(coord_bld, offset);
+         coord = lp_build_add(coord_bld, coord, offset);
+      }
       /* floor */
       /* use itrunc instead since we clamp to 0 anyway */
       icoord = lp_build_itrunc(coord_bld, coord);
-      if (offset) {
-         icoord = lp_build_add(int_coord_bld, icoord, offset);
-      }
 
       /* clamp to [0, length - 1]. */
       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
@@ -2586,6 +2588,10 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
       derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
       derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
    }
+   /*
+    * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
+    * so AoS path could be used. Not sure it's worth the trouble...
+    */
 
    min_img_filter = derived_sampler_state.min_img_filter;
    mag_img_filter = derived_sampler_state.mag_img_filter;
diff --git a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
index 08271a760f5..6ca4dc8136c 100644
--- a/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_drm_helper.h
@@ -59,6 +59,11 @@
 #include "vc4/drm/vc4_drm_public.h"
 #endif
 
+#if GALLIUM_VIRGL
+#include "virgl/drm/virgl_drm_public.h"
+#include "virgl/virgl_public.h"
+#endif
+
 static char* driver_name = NULL;
 
 /* XXX: We need to teardown the winsys if *screen_create() fails. */
@@ -296,6 +301,33 @@ pipe_freedreno_create_screen(int fd)
 }
 #endif
 
+#if defined(GALLIUM_VIRGL)
+#if defined(DRI_TARGET)
+
+const __DRIextension **__driDriverGetExtensions_virtio_gpu(void);
+
+PUBLIC const __DRIextension **__driDriverGetExtensions_virtio_gpu(void)
+{
+   globalDriverAPI = &galliumdrm_driver_api;
+   return galliumdrm_driver_extensions;
+}
+#endif
+
+static struct pipe_screen *
+pipe_virgl_create_screen(int fd)
+{
+   struct virgl_winsys *vws;
+   struct pipe_screen *screen;
+
+   vws = virgl_drm_winsys_create(fd);
+   if (!vws)
+      return NULL;
+
+   screen = virgl_create_screen(vws);
+   return screen ? debug_screen_wrap(screen) : NULL;
+}
+#endif
+
 #if defined(GALLIUM_VC4)
 #if defined(DRI_TARGET)
 
@@ -385,6 +417,11 @@ dd_create_screen(int fd)
       return pipe_freedreno_create_screen(fd);
    else
 #endif
+#if defined(GALLIUM_VIRGL)
+   if ((strcmp(driver_name, "virtio_gpu") == 0))
+      return pipe_virgl_create_screen(fd);
+   else
+#endif
 #if defined(GALLIUM_VC4)
    if (strcmp(driver_name, "vc4") == 0)
       return pipe_vc4_create_screen(fd);
@@ -474,6 +511,11 @@ dd_configuration(enum drm_conf conf)
       return configuration_query(conf);
    else
 #endif
+#if defined(GALLIUM_VIRGL)
+   if ((strcmp(driver_name, "virtio_gpu") == 0))
+      return configuration_query(conf);
+   else
+#endif
 #if defined(GALLIUM_VC4)
    if (strcmp(driver_name, "vc4") == 0)
       return configuration_query(conf);
diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
index 5f46552f6c3..f3693fb1f39 100644
--- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h
@@ -19,6 +19,10 @@
 #include "llvmpipe/lp_public.h"
 #endif
 
+#ifdef GALLIUM_VIRGL
+#include "virgl/virgl_public.h"
+#include "virgl/vtest/virgl_vtest_public.h"
+#endif
 
 static inline struct pipe_screen *
 sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
@@ -30,6 +34,14 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver)
       screen = llvmpipe_create_screen(winsys);
 #endif
 
+#if defined(GALLIUM_VIRGL)
+   if (screen == NULL && strcmp(driver, "virpipe") == 0) {
+      struct virgl_winsys *vws;
+      vws = virgl_vtest_winsys_wrap(winsys);
+      screen = virgl_create_screen(vws);
+   }
+#endif
+
 #if defined(GALLIUM_SOFTPIPE)
    if (screen == NULL)
       screen = softpipe_create_screen(winsys);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 5d80cca5b0e..e29ffb39894 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -29,6 +29,7 @@
 #include "util/u_string.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_math.h"
 #include "tgsi_dump.h"
 #include "tgsi_info.h"
 #include "tgsi_iterate.h"
@@ -43,6 +44,8 @@ struct dump_ctx
 {
    struct tgsi_iterate_context iter;
 
+   boolean dump_float_as_hex;
+
    uint instno;
    uint immno;
    int indent;
@@ -88,6 +91,7 @@ dump_enum(
 #define SID(I)          ctx->dump_printf( ctx, "%d", I )
 #define FLT(F)          ctx->dump_printf( ctx, "%10.4f", F )
 #define DBL(D)          ctx->dump_printf( ctx, "%10.8f", D )
+#define HFLT(F)         ctx->dump_printf( ctx, "0x%08x", fui((F)) )
 #define ENM(E,ENUMS)    dump_enum( ctx, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
 
 const char *
@@ -251,7 +255,10 @@ dump_imm_data(struct tgsi_iterate_context *iter,
          break;
       }
       case TGSI_IMM_FLOAT32:
-         FLT( data[i].Float );
+         if (ctx->dump_float_as_hex)
+            HFLT( data[i].Float );
+         else
+            FLT( data[i].Float );
          break;
       case TGSI_IMM_UINT32:
          UID(data[i].Uint);
@@ -682,6 +689,11 @@ tgsi_dump_to_file(const struct tgsi_token *tokens, uint flags, FILE *file)
    ctx.indentation = 0;
    ctx.file = file;
 
+   if (flags & TGSI_DUMP_FLOAT_AS_HEX)
+      ctx.dump_float_as_hex = TRUE;
+   else
+      ctx.dump_float_as_hex = FALSE;
+
    tgsi_iterate_shader( tokens, &ctx.iter );
 }
 
@@ -697,6 +709,7 @@ struct str_dump_ctx
    char *str;
    char *ptr;
    int left;
+   bool nospace;
 };
 
 static void
@@ -719,10 +732,11 @@ str_dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...)
          sctx->ptr += written;
          sctx->left -= written;
       }
-   }
+   } else
+      sctx->nospace = true;
 }
 
-void
+bool
 tgsi_dump_str(
    const struct tgsi_token *tokens,
    uint flags,
@@ -749,8 +763,16 @@ tgsi_dump_str(
    ctx.str[0] = 0;
    ctx.ptr = str;
    ctx.left = (int)size;
+   ctx.nospace = false;
+
+   if (flags & TGSI_DUMP_FLOAT_AS_HEX)
+      ctx.base.dump_float_as_hex = TRUE;
+   else
+      ctx.base.dump_float_as_hex = FALSE;
 
    tgsi_iterate_shader( tokens, &ctx.base.iter );
+
+   return !ctx.nospace;
 }
 
 void
@@ -773,6 +795,7 @@ tgsi_dump_instruction_str(
    ctx.str[0] = 0;
    ctx.ptr = str;
    ctx.left = (int)size;
+   ctx.nospace = false;
 
    iter_instruction( &ctx.base.iter, (struct tgsi_full_instruction *)inst );
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h
index 7c8f92ee7bc..c3722d333d7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h
@@ -38,7 +38,9 @@
 extern "C" {
 #endif
 
-void
+#define TGSI_DUMP_FLOAT_AS_HEX (1 << 0)
+
+bool
 tgsi_dump_str(
    const struct tgsi_token *tokens,
    uint flags,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 3e3ed5b19d1..4a82c9b3552 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -195,8 +195,15 @@ static boolean parse_float( const char **pcur, float *val )
    boolean integral_part = FALSE;
    boolean fractional_part = FALSE;
 
-   *val = (float) atof( cur );
+   if (*cur == '0' && *(cur + 1) == 'x') {
+      union fi fi;
+      fi.ui = strtoul(cur, NULL, 16);
+      *val = fi.f;
+      cur += 10;
+      goto out;
+   }
 
+   *val = (float) atof( cur );
    if (*cur == '-' || *cur == '+')
       cur++;
    if (is_digit( cur )) {
@@ -228,6 +235,8 @@ static boolean parse_float( const char **pcur, float *val )
       else
          return FALSE;
    }
+
+out:
    *pcur = cur;
    return TRUE;
 }
diff --git a/src/gallium/auxiliary/util/u_format.c b/src/gallium/auxiliary/util/u_format.c
index c1ce408119f..79630bf6dc3 100644
--- a/src/gallium/auxiliary/util/u_format.c
+++ b/src/gallium/auxiliary/util/u_format.c
@@ -170,6 +170,25 @@ util_format_is_snorm(enum pipe_format format)
 }
 
 boolean
+util_format_is_snorm8(enum pipe_format format)
+{
+   const struct util_format_description *desc = util_format_description(format);
+   int i;
+
+   if (desc->is_mixed)
+      return FALSE;
+
+   i = util_format_get_first_non_void_channel(format);
+   if (i == -1)
+      return FALSE;
+
+   return desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED &&
+          !desc->channel[i].pure_integer &&
+          desc->channel[i].normalized &&
+          desc->channel[i].size == 8;
+}
+
+boolean
 util_format_is_luminance_alpha(enum pipe_format format)
 {
    const struct util_format_description *desc =
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index 42b39ff04fd..a1b1b28fa41 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -686,6 +686,9 @@ util_format_is_pure_uint(enum pipe_format format);
 boolean
 util_format_is_snorm(enum pipe_format format);
 
+boolean
+util_format_is_snorm8(enum pipe_format format);
+
 /**
  * Check if the src format can be blitted to the destination format with
  * a simple memcpy.  For example, blitting from RGBA to RGBx is OK, but not
diff --git a/src/gallium/auxiliary/util/u_tests.c b/src/gallium/auxiliary/util/u_tests.c
index a94e5cc2949..006dfa95af2 100644
--- a/src/gallium/auxiliary/util/u_tests.c
+++ b/src/gallium/auxiliary/util/u_tests.c
@@ -450,6 +450,43 @@ null_constant_buffer(struct pipe_context *ctx)
    util_report_result(pass);
 }
 
+static void
+null_fragment_shader(struct pipe_context *ctx)
+{
+   struct cso_context *cso;
+   struct pipe_resource *cb;
+   void *vs;
+   struct pipe_rasterizer_state rs = {0};
+   struct pipe_query *query;
+   union pipe_query_result qresult;
+
+   cso = cso_create_context(ctx);
+   cb = util_create_texture2d(ctx->screen, 256, 256,
+                              PIPE_FORMAT_R8G8B8A8_UNORM);
+   util_set_common_states_and_clear(cso, ctx, cb);
+
+   /* No rasterization. */
+   rs.rasterizer_discard = 1;
+   cso_set_rasterizer(cso, &rs);
+
+   vs = util_set_passthrough_vertex_shader(cso, ctx, false);
+
+   query = ctx->create_query(ctx, PIPE_QUERY_PRIMITIVES_GENERATED, 0);
+   ctx->begin_query(ctx, query);
+   util_draw_fullscreen_quad(cso);
+   ctx->end_query(ctx, query);
+   ctx->get_query_result(ctx, query, true, &qresult);
+
+   /* Cleanup. */
+   cso_destroy_context(cso);
+   ctx->delete_vs_state(ctx, vs);
+   ctx->destroy_query(ctx, query);
+   pipe_resource_reference(&cb, NULL);
+
+   /* Check PRIMITIVES_GENERATED. */
+   util_report_result(qresult.u64 == 2);
+}
+
 /**
  * Run all tests. This should be run with a clean context after
  * context_create.
@@ -459,6 +496,7 @@ util_run_tests(struct pipe_screen *screen)
 {
    struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
 
+   null_fragment_shader(ctx);
    tgsi_vs_window_space_position(ctx);
    null_sampler_view(ctx, TGSI_TEXTURE_2D);
    null_sampler_view(ctx, TGSI_TEXTURE_BUFFER);
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 151afb2dffe..91fdb43cfbb 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -278,7 +278,9 @@ The integer capabilities:
   in the shader.
 * ``PIPE_CAP_SHAREABLE_SHADERS``: Whether shader CSOs can be used by any
   pipe_context.
-
+* ``PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS``:
+  Whether copying between compressed and plain formats is supported where
+  a compressed block is copied to/from a plain pixel of the same size.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index a9498835011..3906c9b996e 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -81,7 +81,7 @@ draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */
 			info->restart_index : 0xffffffff);
 
-	if (ctx->rasterizer && ctx->rasterizer->point_size_per_vertex &&
+	if (ctx->rasterizer->point_size_per_vertex &&
 			(info->mode == PIPE_PRIM_POINTS))
 		primtype = DI_PT_POINTLIST_PSIZE;
 
@@ -137,7 +137,7 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 		.key = {
 			/* do binning pass first: */
 			.binning_pass = true,
-			.color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
+			.color_two_side = ctx->rasterizer->light_twoside,
 			// TODO set .half_precision based on render target format,
 			// ie. float16 and smaller use half, float32 use full..
 			.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
@@ -149,9 +149,9 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 			.fsaturate_t = fd3_ctx->fsaturate_t,
 			.fsaturate_r = fd3_ctx->fsaturate_r,
 		},
-		.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
-		.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : 0,
-		.sprite_coord_mode = ctx->rasterizer ? ctx->rasterizer->sprite_coord_mode : false,
+		.rasterflat = ctx->rasterizer->flatshade,
+		.sprite_coord_enable = ctx->rasterizer->sprite_coord_enable,
+		.sprite_coord_mode = ctx->rasterizer->sprite_coord_mode,
 	};
 	unsigned dirty;
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 411f5b76329..8f9c8b0623c 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -627,7 +627,7 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		ctx->prog.dirty = 0;
 	}
 
-	if ((dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) && ctx->blend) {
+	if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) {
 		struct fd3_blend_stateobj *blend = fd3_blend_stateobj(ctx->blend);
 		uint32_t i;
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index 025753c037e..7bd5163529a 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -118,12 +118,12 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 		.key = {
 			/* do binning pass first: */
 			.binning_pass = true,
-			.color_two_side = ctx->rasterizer ? ctx->rasterizer->light_twoside : false,
-			.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
+			.color_two_side = ctx->rasterizer->light_twoside,
+			.rasterflat = ctx->rasterizer->flatshade,
 			// TODO set .half_precision based on render target format,
 			// ie. float16 and smaller use half, float32 use full..
 			.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
-			.ucp_enables = ctx->rasterizer ? ctx->rasterizer->clip_plane_enable : 0,
+			.ucp_enables = ctx->rasterizer->clip_plane_enable,
 			.has_per_samp = (fd4_ctx->fsaturate || fd4_ctx->vsaturate),
 			.vsaturate_s = fd4_ctx->vsaturate_s,
 			.vsaturate_t = fd4_ctx->vsaturate_t,
@@ -132,9 +132,9 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 			.fsaturate_t = fd4_ctx->fsaturate_t,
 			.fsaturate_r = fd4_ctx->fsaturate_r,
 		},
-		.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
-		.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : false,
-		.sprite_coord_mode = ctx->rasterizer ? ctx->rasterizer->sprite_coord_mode : false,
+		.rasterflat = ctx->rasterizer->flatshade,
+		.sprite_coord_enable = ctx->rasterizer->sprite_coord_enable,
+		.sprite_coord_mode = ctx->rasterizer->sprite_coord_mode,
 	};
 	unsigned dirty;
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index c7ed1d2e379..cf5dd7b0f17 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -594,7 +594,7 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		ctx->prog.dirty = 0;
 	}
 
-	if ((dirty & FD_DIRTY_BLEND) && ctx->blend) {
+	if ((dirty & FD_DIRTY_BLEND)) {
 		struct fd4_blend_stateobj *blend = fd4_blend_stateobj(ctx->blend);
 		uint32_t i;
 
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 50d140fe903..9f8c33263fb 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -238,6 +238,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TGSI_TXQS:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
+	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 5812af626cb..2d2fd375656 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -252,6 +252,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
index 5efe9da2d22..2e9470e66e9 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
@@ -202,14 +202,16 @@ static inline void
 gen6_3DSTATE_WM(struct ilo_builder *builder,
                 const struct ilo_state_raster *rs,
                 const struct ilo_state_ps *ps,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 9;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 6, 6);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
    dw[1] = kernel_offset;
@@ -221,6 +223,11 @@ gen6_3DSTATE_WM(struct ilo_builder *builder,
    dw[6] = rs->wm[2] | ps->ps[4];
    dw[7] = 0; /* kernel 1 */
    dw[8] = 0; /* kernel 2 */
+
+   if (ilo_state_ps_get_scratch_size(ps)) {
+      ilo_builder_batch_reloc(builder, pos + 2, scratch_bo,
+            ps->ps[0], 0);
+   }
 }
 
 static inline void
@@ -329,14 +336,16 @@ gen8_3DSTATE_WM_CHROMAKEY(struct ilo_builder *builder)
 static inline void
 gen7_3DSTATE_PS(struct ilo_builder *builder,
                 const struct ilo_state_ps *ps,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 8;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
    dw[1] = kernel_offset;
@@ -347,19 +356,26 @@ gen7_3DSTATE_PS(struct ilo_builder *builder,
    dw[5] = ps->ps[5];
    dw[6] = 0; /* kernel 1 */
    dw[7] = 0; /* kernel 2 */
+
+   if (ilo_state_ps_get_scratch_size(ps)) {
+      ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
+            ps->ps[3], 0);
+   }
 }
 
 static inline void
 gen8_3DSTATE_PS(struct ilo_builder *builder,
                 const struct ilo_state_ps *ps,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 12;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
    dw[1] = kernel_offset;
@@ -374,6 +390,11 @@ gen8_3DSTATE_PS(struct ilo_builder *builder,
    dw[9] = 0;
    dw[10] = 0; /* kernel 2 */
    dw[11] = 0;
+
+   if (ilo_state_ps_get_scratch_size(ps)) {
+      ilo_builder_batch_reloc64(builder, pos + 4, scratch_bo,
+            ps->ps[1], 0);
+   }
 }
 
 static inline void
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
index 6e94fb25f1f..3a448719c15 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
@@ -477,14 +477,16 @@ gen8_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
 static inline void
 gen6_3DSTATE_VS(struct ilo_builder *builder,
                 const struct ilo_state_vs *vs,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 6;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 6, 7.5);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
    dw[1] = kernel_offset;
@@ -493,19 +495,26 @@ gen6_3DSTATE_VS(struct ilo_builder *builder,
    dw[3] = vs->vs[1];
    dw[4] = vs->vs[2];
    dw[5] = vs->vs[3];
+
+   if (ilo_state_vs_get_scratch_size(vs)) {
+      ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
+            vs->vs[1], 0);
+   }
 }
 
 static inline void
 gen8_3DSTATE_VS(struct ilo_builder *builder,
                 const struct ilo_state_vs *vs,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 9;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
    dw[1] = kernel_offset;
@@ -517,19 +526,26 @@ gen8_3DSTATE_VS(struct ilo_builder *builder,
    dw[6] = vs->vs[2];
    dw[7] = vs->vs[3];
    dw[8] = vs->vs[4];
+
+   if (ilo_state_vs_get_scratch_size(vs)) {
+      ilo_builder_batch_reloc64(builder, pos + 4, scratch_bo,
+            vs->vs[1], 0);
+   }
 }
 
 static inline void
 gen7_3DSTATE_HS(struct ilo_builder *builder,
                 const struct ilo_state_hs *hs,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 7;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2);
    /* see hs_set_gen7_3DSTATE_HS() */
@@ -539,19 +555,26 @@ gen7_3DSTATE_HS(struct ilo_builder *builder,
    dw[4] = hs->hs[2];
    dw[5] = hs->hs[3];
    dw[6] = 0;
+
+   if (ilo_state_hs_get_scratch_size(hs)) {
+      ilo_builder_batch_reloc(builder, pos + 4, scratch_bo,
+            hs->hs[2], 0);
+   }
 }
 
 static inline void
 gen8_3DSTATE_HS(struct ilo_builder *builder,
                 const struct ilo_state_hs *hs,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 9;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2);
    /* see hs_set_gen7_3DSTATE_HS() */
@@ -563,6 +586,11 @@ gen8_3DSTATE_HS(struct ilo_builder *builder,
    dw[6] = 0;
    dw[7] = hs->hs[3];
    dw[8] = 0;
+
+   if (ilo_state_hs_get_scratch_size(hs)) {
+      ilo_builder_batch_reloc64(builder, pos + 5, scratch_bo,
+            hs->hs[2], 0);
+   }
 }
 
 static inline void
@@ -586,14 +614,16 @@ gen7_3DSTATE_TE(struct ilo_builder *builder,
 static inline void
 gen7_3DSTATE_DS(struct ilo_builder *builder,
                 const struct ilo_state_ds *ds,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 6;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2);
    /* see ds_set_gen7_3DSTATE_DS() */
@@ -602,19 +632,26 @@ gen7_3DSTATE_DS(struct ilo_builder *builder,
    dw[3] = ds->ds[1];
    dw[4] = ds->ds[2];
    dw[5] = ds->ds[3];
+
+   if (ilo_state_ds_get_scratch_size(ds)) {
+      ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
+            ds->ds[1], 0);
+   }
 }
 
 static inline void
 gen8_3DSTATE_DS(struct ilo_builder *builder,
                 const struct ilo_state_ds *ds,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 9;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2);
    /* see ds_set_gen7_3DSTATE_DS() */
@@ -626,19 +663,26 @@ gen8_3DSTATE_DS(struct ilo_builder *builder,
    dw[6] = ds->ds[2];
    dw[7] = ds->ds[3];
    dw[8] = ds->ds[4];
+
+   if (ilo_state_ds_get_scratch_size(ds)) {
+      ilo_builder_batch_reloc64(builder, pos + 4, scratch_bo,
+            ds->ds[1], 0);
+   }
 }
 
 static inline void
 gen6_3DSTATE_GS(struct ilo_builder *builder,
                 const struct ilo_state_gs *gs,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 7;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 6, 6);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
    dw[1] = kernel_offset;
@@ -648,6 +692,11 @@ gen6_3DSTATE_GS(struct ilo_builder *builder,
    dw[4] = gs->gs[2];
    dw[5] = gs->gs[3];
    dw[6] = gs->gs[4];
+
+   if (ilo_state_gs_get_scratch_size(gs)) {
+      ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
+            gs->gs[1], 0);
+   }
 }
 
 static inline void
@@ -677,14 +726,16 @@ gen6_3DSTATE_GS_SVB_INDEX(struct ilo_builder *builder,
 static inline void
 gen7_3DSTATE_GS(struct ilo_builder *builder,
                 const struct ilo_state_gs *gs,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 7;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
    dw[1] = kernel_offset;
@@ -694,19 +745,26 @@ gen7_3DSTATE_GS(struct ilo_builder *builder,
    dw[4] = gs->gs[2];
    dw[5] = gs->gs[3];
    dw[6] = 0;
+
+   if (ilo_state_gs_get_scratch_size(gs)) {
+      ilo_builder_batch_reloc(builder, pos + 3, scratch_bo,
+            gs->gs[1], 0);
+   }
 }
 
 static inline void
 gen8_3DSTATE_GS(struct ilo_builder *builder,
                 const struct ilo_state_gs *gs,
-                uint32_t kernel_offset)
+                uint32_t kernel_offset,
+                struct intel_bo *scratch_bo)
 {
    const uint8_t cmd_len = 10;
    uint32_t *dw;
+   unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
    dw[1] = kernel_offset;
@@ -719,6 +777,11 @@ gen8_3DSTATE_GS(struct ilo_builder *builder,
    dw[7] = gs->gs[3];
    dw[8] = 0;
    dw[9] = gs->gs[4];
+
+   if (ilo_state_gs_get_scratch_size(gs)) {
+      ilo_builder_batch_reloc64(builder, pos + 4, scratch_bo,
+            gs->gs[1], 0);
+   }
 }
 
 static inline void
diff --git a/src/gallium/drivers/ilo/core/ilo_state_compute.c b/src/gallium/drivers/ilo/core/ilo_state_compute.c
index a5fe5e1a6b0..ba3ff9001e1 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_compute.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_compute.c
@@ -158,7 +158,8 @@ compute_interface_get_gen6_read_end(const struct ilo_dev *dev,
     */
    assert(per_thread_read <= 63);
 
-   /* From the Haswell PRM, volume 2d, page 199:
+   /*
+    * From the Haswell PRM, volume 2d, page 199:
     *
     *     "(Cross-Thread Constant Data Read Length) [0,127]"
     */
@@ -210,38 +211,68 @@ compute_validate_gen6(const struct ilo_dev *dev,
    return true;
 }
 
-static uint8_t
-compute_get_gen6_scratch_space(const struct ilo_dev *dev,
-                               const struct ilo_state_compute_info *info)
+static uint32_t
+compute_get_gen6_per_thread_scratch_size(const struct ilo_dev *dev,
+                                         const struct ilo_state_compute_info *info,
+                                         uint8_t *per_thread_space)
 {
-   uint32_t scratch_size = 0;
-   uint8_t i;
+   ILO_DEV_ASSERT(dev, 6, 7);
 
-   ILO_DEV_ASSERT(dev, 6, 8);
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 2, page 30:
+    *
+    *     "(Per Thread Scratch Space)
+    *      Range = [0,11] indicating [1k bytes, 12k bytes] [DevSNB]"
+    */
+   assert(info->per_thread_scratch_size <= 12 * 1024);
 
-   for (i = 0; i < info->interface_count; i++) {
-      if (scratch_size < info->interfaces[i].scratch_size)
-         scratch_size = info->interfaces[i].scratch_size;
+   if (!info->per_thread_scratch_size) {
+      *per_thread_space = 0;
+      return 0;
    }
 
-   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
-      assert(scratch_size <= 2 * 1024 * 1024);
+   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
+      (info->per_thread_scratch_size - 1) / 1024 : 0;
+
+   return 1024 * (1 + *per_thread_space);
+}
 
-      /* next power of two, starting from 1KB */
-      return (scratch_size > 1024) ?
-         (util_last_bit(scratch_size - 1) - 10): 0;
-   } else if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
-      assert(scratch_size <= 2 * 1024 * 1024);
+static uint32_t
+compute_get_gen75_per_thread_scratch_size(const struct ilo_dev *dev,
+                                          const struct ilo_state_compute_info *info,
+                                          uint8_t *per_thread_space)
+{
+   ILO_DEV_ASSERT(dev, 7.5, 8);
 
-      /* next power of two, starting from 2KB */
-      return (scratch_size > 2048) ?
-         (util_last_bit(scratch_size - 1) - 11): 0;
-   } else {
-      assert(scratch_size <= 12 * 1024);
+   /*
+    * From the Haswell PRM, volume 2b, page 407:
+    *
+    *     "(Per Thread Scratch Space)
+    *      [0,10]  Indicating [2k bytes, 2 Mbytes]"
+    *
+    *     "Note: The scratch space should be declared as 2x the desired
+    *      scratch space. The stack will start at the half-way point instead
+    *      of the end. The upper half of scratch space will not be accessed
+    *      and so does not have to be allocated in memory."
+    *
+    * From the Broadwell PRM, volume 2a, page 450:
+    *
+    *     "(Per Thread Scratch Space)
+    *      [0,11]  indicating [1k bytes, 2 Mbytes]"
+    */
+   assert(info->per_thread_scratch_size <=
+         ((ilo_dev_gen(dev) >= ILO_GEN(8)) ? 2 : 1) * 1024 * 1024);
 
-      return (scratch_size > 1024) ?
-         (scratch_size - 1) / 1024 : 0;
+   if (!info->per_thread_scratch_size) {
+      *per_thread_space = 0;
+      return 0;
    }
+
+   /* next power of two, starting from 1KB */
+   *per_thread_space = (info->per_thread_scratch_size > 1024) ?
+      (util_last_bit(info->per_thread_scratch_size - 1) - 10) : 0;
+
+   return 1 << (10 + *per_thread_space);
 }
 
 static bool
@@ -250,7 +281,8 @@ compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
                                  const struct ilo_state_compute_info *info)
 {
    struct compute_urb_configuration urb;
-   uint8_t scratch_space;
+   uint32_t per_thread_size;
+   uint8_t per_thread_space;
 
    uint32_t dw1, dw2, dw4;
 
@@ -260,9 +292,16 @@ compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
        !compute_validate_gen6(dev, info, &urb))
       return false;
 
-   scratch_space = compute_get_gen6_scratch_space(dev, info);
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+      per_thread_size = compute_get_gen75_per_thread_scratch_size(dev,
+            info, &per_thread_space);
+   } else {
+      per_thread_size = compute_get_gen6_per_thread_scratch_size(dev,
+            info, &per_thread_space);
+   }
+
+   dw1 = per_thread_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;
 
-   dw1 = scratch_space << GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;
    dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
          urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
          GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
@@ -281,6 +320,8 @@ compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
    compute->vfe[1] = dw2;
    compute->vfe[2] = dw4;
 
+   compute->scratch_size = per_thread_size * dev->thread_count;
+
    return true;
 }
 
diff --git a/src/gallium/drivers/ilo/core/ilo_state_compute.h b/src/gallium/drivers/ilo/core/ilo_state_compute.h
index 346f7b617f4..bd56bba4369 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_compute.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_compute.h
@@ -45,8 +45,6 @@ struct ilo_state_compute_interface_info {
    /* usually 0 unless there are multiple interfaces */
    uint32_t kernel_offset;
 
-   uint32_t scratch_size;
-
    uint8_t sampler_count;
    uint8_t surface_count;
 
@@ -65,6 +63,8 @@ struct ilo_state_compute_info {
    const struct ilo_state_compute_interface_info *interfaces;
    uint8_t interface_count;
 
+   uint32_t per_thread_scratch_size;
+
    uint32_t cv_urb_alloc_size;
    uint32_t curbe_alloc_size;
 };
@@ -74,6 +74,8 @@ struct ilo_state_compute {
 
    uint32_t (*idrt)[6];
    uint8_t idrt_count;
+
+   uint32_t scratch_size;
 };
 
 static inline size_t
@@ -89,4 +91,10 @@ ilo_state_compute_init(struct ilo_state_compute *compute,
                        const struct ilo_dev *dev,
                        const struct ilo_state_compute_info *info);
 
+static inline uint32_t
+ilo_state_compute_get_scratch_size(const struct ilo_state_compute *compute)
+{
+   return compute->scratch_size;
+}
+
 #endif /* ILO_STATE_COMPUTE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader.c b/src/gallium/drivers/ilo/core/ilo_state_shader.c
index f67326c7f10..aec4fd6d8a6 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_shader.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader.c
@@ -37,7 +37,9 @@ enum vertex_stage {
 
 struct vertex_ff {
    uint8_t grf_start;
-   uint8_t scratch_space;
+
+   uint8_t per_thread_scratch_space;
+   uint32_t per_thread_scratch_size;
 
    uint8_t sampler_count;
    uint8_t surface_count;
@@ -59,13 +61,6 @@ vertex_validate_gen6_kernel(const struct ilo_dev *dev,
     * others.
     */
    const uint8_t max_grf_start = (stage == STAGE_GS) ? 16 : 32;
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 134:
-    *
-    *     "(Per-Thread Scratch Space)
-    *      Range    [0,11] indicating [1K Bytes, 2M Bytes]"
-    */
-   const uint32_t max_scratch_size = 2 * 1024 * 1024;
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
@@ -73,7 +68,6 @@ vertex_validate_gen6_kernel(const struct ilo_dev *dev,
    assert(!kernel->offset);
 
    assert(kernel->grf_start < max_grf_start);
-   assert(kernel->scratch_size <= max_scratch_size);
 
    return true;
 }
@@ -112,18 +106,33 @@ vertex_get_gen6_ff(const struct ilo_dev *dev,
                    const struct ilo_state_shader_kernel_info *kernel,
                    const struct ilo_state_shader_resource_info *resource,
                    const struct ilo_state_shader_urb_info *urb,
+                   uint32_t per_thread_scratch_size,
                    struct vertex_ff *ff)
 {
    ILO_DEV_ASSERT(dev, 6, 8);
 
+   memset(ff, 0, sizeof(*ff));
+
    if (!vertex_validate_gen6_kernel(dev, stage, kernel) ||
        !vertex_validate_gen6_urb(dev, stage, urb))
       return false;
 
    ff->grf_start = kernel->grf_start;
-   /* next power of two, starting from 1KB */
-   ff->scratch_space = (kernel->scratch_size > 1024) ?
-      (util_last_bit(kernel->scratch_size - 1) - 10): 0;
+
+   if (per_thread_scratch_size) {
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 134:
+       *
+       *     "(Per-Thread Scratch Space)
+       *      Range    [0,11] indicating [1K Bytes, 2M Bytes]"
+       */
+      assert(per_thread_scratch_size <= 2 * 1024 * 1024);
+
+      /* next power of two, starting from 1KB */
+      ff->per_thread_scratch_space = (per_thread_scratch_size > 1024) ?
+         (util_last_bit(per_thread_scratch_size - 1) - 10) : 0;
+      ff->per_thread_scratch_size = 1 << (10 + ff->per_thread_scratch_space);
+   }
 
    ff->sampler_count = (resource->sampler_count <= 12) ?
       (resource->sampler_count + 3) / 4 : 4;
@@ -192,8 +201,8 @@ vs_set_gen6_3DSTATE_VS(struct ilo_state_vs *vs,
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (!vertex_get_gen6_ff(dev, STAGE_VS, &info->kernel,
-            &info->resource, &info->urb, &ff))
+   if (!vertex_get_gen6_ff(dev, STAGE_VS, &info->kernel, &info->resource,
+            &info->urb, info->per_thread_scratch_size, &ff))
       return false;
 
    thread_count = vs_get_gen6_thread_count(dev, info);
@@ -207,7 +216,8 @@ vs_set_gen6_3DSTATE_VS(struct ilo_state_vs *vs,
    if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
       dw2 |= GEN75_THREADDISP_ACCESS_UAV;
 
-   dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+   dw3 = ff.per_thread_scratch_space <<
+      GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
 
    dw4 = ff.grf_start << GEN6_VS_DW4_URB_GRF_START__SHIFT |
          ff.vue_read_len << GEN6_VS_DW4_URB_READ_LEN__SHIFT |
@@ -234,6 +244,8 @@ vs_set_gen6_3DSTATE_VS(struct ilo_state_vs *vs,
    if (ilo_dev_gen(dev) >= ILO_GEN(8))
       vs->vs[4] = ff.user_clip_enables << GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT;
 
+   vs->scratch_size = ff.per_thread_scratch_size * thread_count;
+
    return true;
 }
 
@@ -273,8 +285,8 @@ hs_set_gen7_3DSTATE_HS(struct ilo_state_hs *hs,
 
    ILO_DEV_ASSERT(dev, 7, 8);
 
-   if (!vertex_get_gen6_ff(dev, STAGE_HS, &info->kernel,
-            &info->resource, &info->urb, &ff))
+   if (!vertex_get_gen6_ff(dev, STAGE_HS, &info->kernel, &info->resource,
+            &info->urb, info->per_thread_scratch_size, &ff))
       return false;
 
    thread_count = hs_get_gen7_thread_count(dev, info);
@@ -282,19 +294,22 @@ hs_set_gen7_3DSTATE_HS(struct ilo_state_hs *hs,
    dw1 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
          ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
 
-   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+   dw2 = 0 << GEN7_HS_DW2_INSTANCE_COUNT__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))
+      dw2 |= thread_count << GEN8_HS_DW2_MAX_THREADS__SHIFT;
+   else if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
       dw1 |= thread_count << GEN75_HS_DW1_DISPATCH_MAX_THREADS__SHIFT;
    else
       dw1 |= thread_count << GEN7_HS_DW1_DISPATCH_MAX_THREADS__SHIFT;
 
-   dw2 = 0 << GEN7_HS_DW2_INSTANCE_COUNT__SHIFT;
-
    if (info->dispatch_enable)
       dw2 |= GEN7_HS_DW2_HS_ENABLE;
    if (info->stats_enable)
       dw2 |= GEN7_HS_DW2_STATISTICS;
 
-   dw4 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+   dw4 = ff.per_thread_scratch_space <<
+      GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
 
    dw5 = GEN7_HS_DW5_INCLUDE_VERTEX_HANDLES |
          ff.grf_start << GEN7_HS_DW5_URB_GRF_START__SHIFT |
@@ -310,6 +325,8 @@ hs_set_gen7_3DSTATE_HS(struct ilo_state_hs *hs,
    hs->hs[2] = dw4;
    hs->hs[3] = dw5;
 
+   hs->scratch_size = ff.per_thread_scratch_size * thread_count;
+
    return true;
 }
 
@@ -373,8 +390,8 @@ ds_set_gen7_3DSTATE_DS(struct ilo_state_ds *ds,
 
    ILO_DEV_ASSERT(dev, 7, 8);
 
-   if (!vertex_get_gen6_ff(dev, STAGE_DS, &info->kernel,
-            &info->resource, &info->urb, &ff))
+   if (!vertex_get_gen6_ff(dev, STAGE_DS, &info->kernel, &info->resource,
+            &info->urb, info->per_thread_scratch_size, &ff))
       return false;
 
    thread_count = ds_get_gen7_thread_count(dev, info);
@@ -385,7 +402,8 @@ ds_set_gen7_3DSTATE_DS(struct ilo_state_ds *ds,
    if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
       dw2 |= GEN75_THREADDISP_ACCESS_UAV;
 
-   dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+   dw3 = ff.per_thread_scratch_space <<
+      GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
 
    dw4 = ff.grf_start << GEN7_DS_DW4_URB_GRF_START__SHIFT |
          ff.vue_read_len << GEN7_DS_DW4_URB_READ_LEN__SHIFT |
@@ -412,6 +430,8 @@ ds_set_gen7_3DSTATE_DS(struct ilo_state_ds *ds,
    if (ilo_dev_gen(dev) >= ILO_GEN(8))
       ds->ds[4] = ff.user_clip_enables << GEN8_DS_DW8_UCP_CLIP_ENABLES__SHIFT;
 
+   ds->scratch_size = ff.per_thread_scratch_size * thread_count;
+
    return true;
 }
 
@@ -425,8 +445,8 @@ gs_get_gen6_ff(const struct ilo_dev *dev,
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
-   if (!vertex_get_gen6_ff(dev, STAGE_GS, &info->kernel,
-            &info->resource, &info->urb, ff))
+   if (!vertex_get_gen6_ff(dev, STAGE_GS, &info->kernel, &info->resource,
+            &info->urb, info->per_thread_scratch_size, ff))
       return false;
 
    /*
@@ -510,7 +530,8 @@ gs_set_gen6_3DSTATE_GS(struct ilo_state_gs *gs,
          ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
          ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
 
-   dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+   dw3 = ff.per_thread_scratch_space <<
+      GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
 
    dw4 = ff.vue_read_len << GEN6_GS_DW4_URB_READ_LEN__SHIFT |
          ff.vue_read_offset << GEN6_GS_DW4_URB_READ_OFFSET__SHIFT |
@@ -550,6 +571,8 @@ gs_set_gen6_3DSTATE_GS(struct ilo_state_gs *gs,
    gs->gs[3] = dw5;
    gs->gs[4] = dw6;
 
+   gs->scratch_size = ff.per_thread_scratch_size * thread_count;
+
    return true;
 }
 
@@ -588,7 +611,8 @@ gs_set_gen7_3DSTATE_GS(struct ilo_state_gs *gs,
    if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
       dw2 |= GEN75_THREADDISP_ACCESS_UAV;
 
-   dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+   dw3 = ff.per_thread_scratch_space <<
+      GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
 
    dw4 = vertex_size << GEN7_GS_DW4_OUTPUT_SIZE__SHIFT |
          0 << GEN7_GS_DW4_OUTPUT_TOPO__SHIFT |
@@ -618,6 +642,8 @@ gs_set_gen7_3DSTATE_GS(struct ilo_state_gs *gs,
    if (ilo_dev_gen(dev) >= ILO_GEN(8))
       gs->gs[4] = ff.user_clip_enables << GEN8_GS_DW9_UCP_CLIP_ENABLES__SHIFT;
 
+   gs->scratch_size = ff.per_thread_scratch_size * thread_count;
+
    return true;
 }
 
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader.h b/src/gallium/drivers/ilo/core/ilo_state_shader.h
index 44690c5b0bb..35651090d66 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_shader.h
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader.h
@@ -42,8 +42,6 @@ struct ilo_state_shader_kernel_info {
 
    uint8_t grf_start;
    uint8_t pcb_attr_count;
-
-   uint32_t scratch_size;
 };
 
 /**
@@ -77,6 +75,7 @@ struct ilo_state_vs_info {
    struct ilo_state_shader_resource_info resource;
    struct ilo_state_shader_urb_info urb;
 
+   uint32_t per_thread_scratch_size;
    bool dispatch_enable;
    bool stats_enable;
 };
@@ -86,6 +85,7 @@ struct ilo_state_hs_info {
    struct ilo_state_shader_resource_info resource;
    struct ilo_state_shader_urb_info urb;
 
+   uint32_t per_thread_scratch_size;
    bool dispatch_enable;
    bool stats_enable;
 };
@@ -95,6 +95,7 @@ struct ilo_state_ds_info {
    struct ilo_state_shader_resource_info resource;
    struct ilo_state_shader_urb_info urb;
 
+   uint32_t per_thread_scratch_size;
    bool dispatch_enable;
    bool stats_enable;
 };
@@ -119,6 +120,7 @@ struct ilo_state_gs_info {
 
    struct ilo_state_gs_sol_info sol;
 
+   uint32_t per_thread_scratch_size;
    bool dispatch_enable;
    bool stats_enable;
 };
@@ -158,6 +160,8 @@ struct ilo_state_ps_info {
    struct ilo_state_ps_io_info io;
    struct ilo_state_ps_params_info params;
 
+   uint32_t per_thread_scratch_size;
+
    /* bitmask of GEN6_PS_DISPATCH_x */
    uint8_t valid_kernels;
    bool per_sample_dispatch;
@@ -173,23 +177,28 @@ struct ilo_state_ps_info {
 
 struct ilo_state_vs {
    uint32_t vs[5];
+   uint32_t scratch_size;
 };
 
 struct ilo_state_hs {
    uint32_t hs[4];
+   uint32_t scratch_size;
 };
 
 struct ilo_state_ds {
    uint32_t te[3];
    uint32_t ds[5];
+   uint32_t scratch_size;
 };
 
 struct ilo_state_gs {
    uint32_t gs[5];
+   uint32_t scratch_size;
 };
 
 struct ilo_state_ps {
    uint32_t ps[8];
+   uint32_t scratch_size;
 
    struct ilo_state_ps_dispatch_conds {
       bool ps_valid;
@@ -211,6 +220,12 @@ bool
 ilo_state_vs_init_disabled(struct ilo_state_vs *vs,
                            const struct ilo_dev *dev);
 
+static inline uint32_t
+ilo_state_vs_get_scratch_size(const struct ilo_state_vs *vs)
+{
+   return vs->scratch_size;
+}
+
 bool
 ilo_state_hs_init(struct ilo_state_hs *hs,
                   const struct ilo_dev *dev,
@@ -221,6 +236,12 @@ ilo_state_hs_init_disabled(struct ilo_state_hs *hs,
                            const struct ilo_dev *dev);
 
 
+static inline uint32_t
+ilo_state_hs_get_scratch_size(const struct ilo_state_hs *hs)
+{
+   return hs->scratch_size;
+}
+
 bool
 ilo_state_ds_init(struct ilo_state_ds *ds,
                   const struct ilo_dev *dev,
@@ -230,6 +251,12 @@ bool
 ilo_state_ds_init_disabled(struct ilo_state_ds *ds,
                            const struct ilo_dev *dev);
 
+static inline uint32_t
+ilo_state_ds_get_scratch_size(const struct ilo_state_ds *ds)
+{
+   return ds->scratch_size;
+}
+
 bool
 ilo_state_gs_init(struct ilo_state_gs *gs,
                   const struct ilo_dev *dev,
@@ -239,6 +266,12 @@ bool
 ilo_state_gs_init_disabled(struct ilo_state_gs *gs,
                            const struct ilo_dev *dev);
 
+static inline uint32_t
+ilo_state_gs_get_scratch_size(const struct ilo_state_gs *gs)
+{
+   return gs->scratch_size;
+}
+
 bool
 ilo_state_ps_init(struct ilo_state_ps *ps,
                   const struct ilo_dev *dev,
@@ -253,4 +286,10 @@ ilo_state_ps_set_params(struct ilo_state_ps *ps,
                         const struct ilo_dev *dev,
                         const struct ilo_state_ps_params_info *params);
 
+static inline uint32_t
+ilo_state_ps_get_scratch_size(const struct ilo_state_ps *ps)
+{
+   return ps->scratch_size;
+}
+
 #endif /* ILO_STATE_SHADER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
index ceeb68a460e..5c3ca1ebe37 100644
--- a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
@@ -34,7 +34,8 @@ struct pixel_ff {
    uint32_t kernel_offsets[3];
    uint8_t grf_starts[3];
    bool pcb_enable;
-   uint8_t scratch_space;
+   uint8_t per_thread_scratch_space;
+   uint32_t per_thread_scratch_size;
 
    uint8_t sampler_count;
    uint8_t surface_count;
@@ -56,13 +57,6 @@ ps_kernel_validate_gen6(const struct ilo_dev *dev,
 {
    /* "Dispatch GRF Start Register for Constant/Setup Data" is U7 */
    const uint8_t max_grf_start = 128;
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 271:
-    *
-    *     "(Per-Thread Scratch Space)
-    *      Range  [0,11] indicating [1k bytes, 2M bytes] in powers of two"
-    */
-   const uint32_t max_scratch_size = 2 * 1024 * 1024;
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
@@ -70,7 +64,6 @@ ps_kernel_validate_gen6(const struct ilo_dev *dev,
    assert(kernel->offset % 64 == 0);
 
    assert(kernel->grf_start < max_grf_start);
-   assert(kernel->scratch_size <= max_scratch_size);
 
    return true;
 }
@@ -325,7 +318,6 @@ ps_get_gen6_ff_kernels(const struct ilo_dev *dev,
    const struct ilo_state_shader_kernel_info *kernel_8 = &info->kernel_8;
    const struct ilo_state_shader_kernel_info *kernel_16 = &info->kernel_16;
    const struct ilo_state_shader_kernel_info *kernel_32 = &info->kernel_32;
-   uint32_t scratch_size;
 
    ILO_DEV_ASSERT(dev, 6, 8);
 
@@ -363,21 +355,6 @@ ps_get_gen6_ff_kernels(const struct ilo_dev *dev,
                      ((ff->dispatch_modes & GEN6_PS_DISPATCH_32) &&
                       kernel_32->pcb_attr_count));
 
-   scratch_size = 0;
-   if ((ff->dispatch_modes & GEN6_PS_DISPATCH_8) &&
-       scratch_size < kernel_8->scratch_size)
-      scratch_size = kernel_8->scratch_size;
-   if ((ff->dispatch_modes & GEN6_PS_DISPATCH_16) &&
-       scratch_size < kernel_16->scratch_size)
-      scratch_size = kernel_16->scratch_size;
-   if ((ff->dispatch_modes & GEN6_PS_DISPATCH_32) &&
-       scratch_size < kernel_32->scratch_size)
-      scratch_size = kernel_32->scratch_size;
-
-   /* next power of two, starting from 1KB */
-   ff->scratch_space = (scratch_size > 1024) ?
-      (util_last_bit(scratch_size - 1) - 10): 0;
-
    /* GPU hangs on Haswell if none of the dispatch mode bits is set */
    if (ilo_dev_gen(dev) == ILO_GEN(7.5) && !ff->dispatch_modes)
       ff->dispatch_modes |= GEN6_PS_DISPATCH_8;
@@ -401,6 +378,21 @@ ps_get_gen6_ff(const struct ilo_dev *dev,
    if (!ps_validate_gen6(dev, info) || !ps_get_gen6_ff_kernels(dev, info, ff))
       return false;
 
+   if (info->per_thread_scratch_size) {
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 271:
+       *
+       *     "(Per-Thread Scratch Space)
+       *      Range  [0,11] indicating [1k bytes, 2M bytes] in powers of two"
+       */
+      assert(info->per_thread_scratch_size <= 2 * 1024 * 1024);
+
+      /* next power of two, starting from 1KB */
+      ff->per_thread_scratch_space = (info->per_thread_scratch_size > 1024) ?
+         (util_last_bit(info->per_thread_scratch_size - 1) - 10) : 0;
+      ff->per_thread_scratch_size = 1 << (10 + ff->per_thread_scratch_space);
+   }
+
    ff->sampler_count = (resource->sampler_count <= 12) ?
       (resource->sampler_count + 3) / 4 : 4;
    ff->surface_count = resource->surface_count;
@@ -441,7 +433,8 @@ ps_set_gen6_3dstate_wm(struct ilo_state_ps *ps,
    if (false)
       dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
 
-   dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+   dw3 = ff->per_thread_scratch_space <<
+      GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
 
    dw4 = ff->grf_starts[0] << GEN6_WM_DW4_URB_GRF_START0__SHIFT |
          ff->grf_starts[1] << GEN6_WM_DW4_URB_GRF_START1__SHIFT |
@@ -539,7 +532,8 @@ ps_set_gen7_3DSTATE_PS(struct ilo_state_ps *ps,
    if (false)
       dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
 
-   dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+   dw3 = ff->per_thread_scratch_space <<
+      GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
 
    dw4 = io->posoffset << GEN7_PS_DW4_POSOFFSET__SHIFT |
          ff->dispatch_modes << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
@@ -603,7 +597,8 @@ ps_set_gen8_3DSTATE_PS(struct ilo_state_ps *ps,
    if (false)
       dw3 |= GEN6_THREADDISP_FP_MODE_ALT;
 
-   dw4 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+   dw4 = ff->per_thread_scratch_space <<
+      GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
 
    dw6 = ff->thread_count << GEN8_PS_DW6_MAX_THREADS__SHIFT |
          io->posoffset << GEN8_PS_DW6_POSOFFSET__SHIFT |
@@ -705,6 +700,7 @@ ilo_state_ps_init(struct ilo_state_ps *ps,
       ret &= ps_set_gen6_3dstate_wm(ps, dev, info, &ff);
    }
 
+   ps->scratch_size = ff.per_thread_scratch_size * ff.thread_count;
    /* save conditions */
    ps->conds = ff.conds;
 
diff --git a/src/gallium/drivers/ilo/ilo_blit.h b/src/gallium/drivers/ilo/ilo_blit.h
index da0bfe9c4c9..bad4dab8404 100644
--- a/src/gallium/drivers/ilo/ilo_blit.h
+++ b/src/gallium/drivers/ilo/ilo_blit.h
@@ -58,10 +58,12 @@ ilo_blit_resolve_slices(struct ilo_context *ilo,
     * As it is only used to resolve HiZ right now, return early when there is
     * no HiZ.
     */
-   if (!ilo_image_can_enable_aux(&tex->image, level))
+   if (tex->image.aux.type != ILO_IMAGE_AUX_HIZ ||
+       !ilo_image_can_enable_aux(&tex->image, level))
       return;
 
-   if (ilo_image_can_enable_aux(&tex->image, level)) {
+   if (tex->image.aux.type == ILO_IMAGE_AUX_HIZ &&
+       ilo_image_can_enable_aux(&tex->image, level)) {
       ilo_blit_resolve_slices_for_hiz(ilo, res, level,
             first_slice, num_slices, resolve_flags);
    }
diff --git a/src/gallium/drivers/ilo/ilo_draw.c b/src/gallium/drivers/ilo/ilo_draw.c
index 433348d9326..69f36ae5df6 100644
--- a/src/gallium/drivers/ilo/ilo_draw.c
+++ b/src/gallium/drivers/ilo/ilo_draw.c
@@ -547,6 +547,7 @@ static void
 ilo_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
    struct ilo_context *ilo = ilo_context(pipe);
+   int vs_scratch_size, gs_scratch_size, fs_scratch_size;
 
    if (ilo_debug & ILO_DEBUG_DRAW) {
       if (info->indexed) {
@@ -574,8 +575,15 @@ ilo_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 
    ilo_finalize_3d_states(ilo, info);
 
+   /* upload kernels */
    ilo_shader_cache_upload(ilo->shader_cache, &ilo->cp->builder);
 
+   /* prepare scratch spaces */
+   ilo_shader_cache_get_max_scratch_sizes(ilo->shader_cache,
+         &vs_scratch_size, &gs_scratch_size, &fs_scratch_size);
+   ilo_render_prepare_scratch_spaces(ilo->render,
+         vs_scratch_size, gs_scratch_size, fs_scratch_size);
+
    ilo_blit_resolve_framebuffer(ilo);
 
    /* If draw_vbo ever fails, return immediately. */
diff --git a/src/gallium/drivers/ilo/ilo_render.c b/src/gallium/drivers/ilo/ilo_render.c
index 21f75de11a0..8bc04df4fab 100644
--- a/src/gallium/drivers/ilo/ilo_render.c
+++ b/src/gallium/drivers/ilo/ilo_render.c
@@ -67,10 +67,49 @@ ilo_render_create(struct ilo_builder *builder)
 void
 ilo_render_destroy(struct ilo_render *render)
 {
+   intel_bo_unref(render->vs_scratch.bo);
+   intel_bo_unref(render->gs_scratch.bo);
+   intel_bo_unref(render->fs_scratch.bo);
+
    intel_bo_unref(render->workaround_bo);
    FREE(render);
 }
 
+static bool
+resize_scratch_space(struct ilo_render *render,
+                     struct ilo_render_scratch_space *scratch,
+                     const char *name, int new_size)
+{
+   struct intel_bo *bo;
+
+   if (scratch->size >= new_size)
+      return true;
+
+   bo = intel_winsys_alloc_bo(render->builder->winsys, name, new_size, false);
+   if (!bo)
+      return false;
+
+   intel_bo_unref(scratch->bo);
+   scratch->bo = bo;
+   scratch->size = new_size;
+
+   return true;
+}
+
+bool
+ilo_render_prepare_scratch_spaces(struct ilo_render *render,
+                                  int vs_scratch_size,
+                                  int gs_scratch_size,
+                                  int fs_scratch_size)
+{
+   return (resize_scratch_space(render, &render->vs_scratch,
+            "vs scratch", vs_scratch_size) &&
+           resize_scratch_space(render, &render->gs_scratch,
+            "gs scratch", gs_scratch_size) &&
+           resize_scratch_space(render, &render->fs_scratch,
+            "fs scratch", fs_scratch_size));
+}
+
 void
 ilo_render_get_sample_position(const struct ilo_render *render,
                                unsigned sample_count,
diff --git a/src/gallium/drivers/ilo/ilo_render.h b/src/gallium/drivers/ilo/ilo_render.h
index 098af73ec9b..31fd1e6f859 100644
--- a/src/gallium/drivers/ilo/ilo_render.h
+++ b/src/gallium/drivers/ilo/ilo_render.h
@@ -43,6 +43,12 @@ ilo_render_create(struct ilo_builder *builder);
 void
 ilo_render_destroy(struct ilo_render *render);
 
+bool
+ilo_render_prepare_scratch_spaces(struct ilo_render *render,
+                                  int vs_scratch_size,
+                                  int gs_scratch_size,
+                                  int fs_scratch_size);
+
 void
 ilo_render_get_sample_position(const struct ilo_render *render,
                                unsigned sample_count,
diff --git a/src/gallium/drivers/ilo/ilo_render_gen.h b/src/gallium/drivers/ilo/ilo_render_gen.h
index 6b133750043..f227d6bf4da 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen.h
+++ b/src/gallium/drivers/ilo/ilo_render_gen.h
@@ -51,6 +51,11 @@ struct ilo_render {
 
    struct intel_bo *workaround_bo;
 
+   struct ilo_render_scratch_space {
+      struct intel_bo *bo;
+      int size;
+   } vs_scratch, gs_scratch, fs_scratch;
+
    struct ilo_state_sample_pattern sample_pattern;
 
    bool hw_ctx_changed;
diff --git a/src/gallium/drivers/ilo/ilo_render_gen6.c b/src/gallium/drivers/ilo/ilo_render_gen6.c
index c1f759f3043..910e6c0fb7a 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen6.c
+++ b/src/gallium/drivers/ilo/ilo_render_gen6.c
@@ -475,10 +475,13 @@ gen6_draw_vs(struct ilo_render *r,
          gen6_wa_pre_3dstate_vs_toggle(r);
 
       if (ilo_dev_gen(r->dev) == ILO_GEN(6) &&
-          ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO))
-         gen6_3DSTATE_VS(r->builder, &cso->vs_sol.vs, kernel_offset);
-      else
-         gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
+          ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)) {
+         gen6_3DSTATE_VS(r->builder, &cso->vs_sol.vs,
+               kernel_offset, r->vs_scratch.bo);
+      } else {
+         gen6_3DSTATE_VS(r->builder, &cso->vs,
+               kernel_offset, r->vs_scratch.bo);
+      }
    }
 }
 
@@ -501,7 +504,8 @@ gen6_draw_gs(struct ilo_render *r,
          cso = ilo_shader_get_kernel_cso(vec->gs);
          kernel_offset = ilo_shader_get_kernel_offset(vec->gs);
 
-         gen6_3DSTATE_GS(r->builder, &cso->gs, kernel_offset);
+         gen6_3DSTATE_GS(r->builder, &cso->gs,
+               kernel_offset, r->gs_scratch.bo);
       } else if (ilo_dev_gen(r->dev) == ILO_GEN(6) &&
             ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)) {
          const int verts_per_prim =
@@ -524,9 +528,10 @@ gen6_draw_gs(struct ilo_render *r,
          kernel_offset = ilo_shader_get_kernel_offset(vec->vs) +
             ilo_shader_get_kernel_param(vec->vs, param);
 
-         gen6_3DSTATE_GS(r->builder, &cso->vs_sol.sol, kernel_offset);
+         gen6_3DSTATE_GS(r->builder, &cso->vs_sol.sol,
+               kernel_offset, r->gs_scratch.bo);
       } else {
-         gen6_3DSTATE_GS(r->builder, &vec->disabled_gs, 0);
+         gen6_3DSTATE_GS(r->builder, &vec->disabled_gs, 0, NULL);
       }
    }
 }
@@ -672,7 +677,7 @@ gen6_draw_wm(struct ilo_render *r,
          gen6_wa_pre_3dstate_wm_max_threads(r);
 
       gen6_3DSTATE_WM(r->builder, &vec->rasterizer->rs,
-            &cso->ps, kernel_offset);
+            &cso->ps, kernel_offset, r->fs_scratch.bo);
    }
 }
 
@@ -817,10 +822,10 @@ gen6_rectlist_vs_to_sf(struct ilo_render *r,
    gen6_wa_post_3dstate_constant_vs(r);
 
    gen6_wa_pre_3dstate_vs_toggle(r);
-   gen6_3DSTATE_VS(r->builder, &blitter->vs, 0);
+   gen6_3DSTATE_VS(r->builder, &blitter->vs, 0, NULL);
 
    gen6_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0);
-   gen6_3DSTATE_GS(r->builder, &blitter->gs, 0);
+   gen6_3DSTATE_GS(r->builder, &blitter->gs, 0, NULL);
 
    gen6_3DSTATE_CLIP(r->builder, &blitter->fb.rs);
    gen6_3DSTATE_SF(r->builder, &blitter->fb.rs, &blitter->sbe);
@@ -833,7 +838,7 @@ gen6_rectlist_wm(struct ilo_render *r,
    gen6_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0);
 
    gen6_wa_pre_3dstate_wm_max_threads(r);
-   gen6_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps, 0);
+   gen6_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps, 0, NULL);
 }
 
 static void
diff --git a/src/gallium/drivers/ilo/ilo_render_gen7.c b/src/gallium/drivers/ilo/ilo_render_gen7.c
index 6623a8bcb43..330ba6c88d6 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen7.c
+++ b/src/gallium/drivers/ilo/ilo_render_gen7.c
@@ -318,10 +318,13 @@ gen7_draw_vs(struct ilo_render *r,
       const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->vs);
       const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->vs);
 
-      if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
-         gen8_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
-      else
-         gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
+      if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) {
+         gen8_3DSTATE_VS(r->builder, &cso->vs,
+               kernel_offset, r->vs_scratch.bo);
+      } else {
+         gen6_3DSTATE_VS(r->builder, &cso->vs,
+               kernel_offset, r->vs_scratch.bo);
+      }
    }
 }
 
@@ -338,9 +341,9 @@ gen7_draw_hs(struct ilo_render *r,
       gen7_3DSTATE_CONSTANT_HS(r->builder, 0, 0, 0);
 
       if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
-         gen8_3DSTATE_HS(r->builder, hs, kernel_offset);
+         gen8_3DSTATE_HS(r->builder, hs, kernel_offset, NULL);
       else
-         gen7_3DSTATE_HS(r->builder, hs, kernel_offset);
+         gen7_3DSTATE_HS(r->builder, hs, kernel_offset, NULL);
    }
 
    /* 3DSTATE_BINDING_TABLE_POINTERS_HS */
@@ -373,9 +376,9 @@ gen7_draw_ds(struct ilo_render *r,
       gen7_3DSTATE_CONSTANT_DS(r->builder, 0, 0, 0);
 
       if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
-         gen8_3DSTATE_DS(r->builder, ds, kernel_offset);
+         gen8_3DSTATE_DS(r->builder, ds, kernel_offset, NULL);
       else
-         gen7_3DSTATE_DS(r->builder, ds, kernel_offset);
+         gen7_3DSTATE_DS(r->builder, ds, kernel_offset, NULL);
    }
 
    /* 3DSTATE_BINDING_TABLE_POINTERS_DS */
@@ -397,9 +400,9 @@ gen7_draw_gs(struct ilo_render *r,
       gen7_3DSTATE_CONSTANT_GS(r->builder, 0, 0, 0);
 
       if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
-         gen8_3DSTATE_GS(r->builder, gs, kernel_offset);
+         gen8_3DSTATE_GS(r->builder, gs, kernel_offset, NULL);
       else
-         gen7_3DSTATE_GS(r->builder, gs, kernel_offset);
+         gen7_3DSTATE_GS(r->builder, gs, kernel_offset, NULL);
    }
 
    /* 3DSTATE_BINDING_TABLE_POINTERS_GS */
@@ -534,7 +537,7 @@ gen7_draw_wm(struct ilo_render *r,
       if (r->hw_ctx_changed)
          gen7_wa_pre_3dstate_ps_max_threads(r);
 
-      gen7_3DSTATE_PS(r->builder, &cso->ps, kernel_offset);
+      gen7_3DSTATE_PS(r->builder, &cso->ps, kernel_offset, r->fs_scratch.bo);
    }
 
    /* 3DSTATE_SCISSOR_STATE_POINTERS */
@@ -678,18 +681,18 @@ gen7_rectlist_vs_to_sf(struct ilo_render *r,
                        const struct ilo_blitter *blitter)
 {
    gen7_3DSTATE_CONSTANT_VS(r->builder, NULL, NULL, 0);
-   gen6_3DSTATE_VS(r->builder, &blitter->vs, 0);
+   gen6_3DSTATE_VS(r->builder, &blitter->vs, 0, NULL);
 
    gen7_3DSTATE_CONSTANT_HS(r->builder, NULL, NULL, 0);
-   gen7_3DSTATE_HS(r->builder, &blitter->hs, 0);
+   gen7_3DSTATE_HS(r->builder, &blitter->hs, 0, NULL);
 
    gen7_3DSTATE_TE(r->builder, &blitter->ds);
 
    gen7_3DSTATE_CONSTANT_DS(r->builder, NULL, NULL, 0);
-   gen7_3DSTATE_DS(r->builder, &blitter->ds, 0);
+   gen7_3DSTATE_DS(r->builder, &blitter->ds, 0, NULL);
 
    gen7_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0);
-   gen7_3DSTATE_GS(r->builder, &blitter->gs, 0);
+   gen7_3DSTATE_GS(r->builder, &blitter->gs, 0, NULL);
 
    gen7_3DSTATE_STREAMOUT(r->builder, &blitter->sol);
 
@@ -711,7 +714,7 @@ gen7_rectlist_wm(struct ilo_render *r,
    gen7_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0);
 
    gen7_wa_pre_3dstate_ps_max_threads(r);
-   gen7_3DSTATE_PS(r->builder, &blitter->ps, 0);
+   gen7_3DSTATE_PS(r->builder, &blitter->ps, 0, NULL);
 }
 
 static void
diff --git a/src/gallium/drivers/ilo/ilo_render_gen8.c b/src/gallium/drivers/ilo/ilo_render_gen8.c
index 65494b4058a..efe0e0d501b 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen8.c
+++ b/src/gallium/drivers/ilo/ilo_render_gen8.c
@@ -125,7 +125,7 @@ gen8_draw_wm(struct ilo_render *r,
 
    /* 3DSTATE_PS */
    if (DIRTY(FS) || r->instruction_bo_changed)
-      gen8_3DSTATE_PS(r->builder, &cso->ps, kernel_offset);
+      gen8_3DSTATE_PS(r->builder, &cso->ps, kernel_offset, r->fs_scratch.bo);
 
    /* 3DSTATE_PS_EXTRA */
    if (DIRTY(FS))
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index e1a7dc56685..888f7aa6782 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -474,6 +474,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/ilo/ilo_shader.c b/src/gallium/drivers/ilo/ilo_shader.c
index 73b625e9de4..c61716dc791 100644
--- a/src/gallium/drivers/ilo/ilo_shader.c
+++ b/src/gallium/drivers/ilo/ilo_shader.c
@@ -37,6 +37,10 @@
 struct ilo_shader_cache {
    struct list_head shaders;
    struct list_head changed;
+
+   int max_vs_scratch_size;
+   int max_gs_scratch_size;
+   int max_fs_scratch_size;
 };
 
 /**
@@ -121,6 +125,8 @@ ilo_shader_cache_upload(struct ilo_shader_cache *shc,
       struct ilo_shader *sh;
 
       LIST_FOR_EACH_ENTRY(sh, &shader->variants, list) {
+         int scratch_size, *cur_max;
+
          if (sh->uploaded)
             continue;
 
@@ -128,6 +134,29 @@ ilo_shader_cache_upload(struct ilo_shader_cache *shc,
                sh->kernel_size, sh->kernel);
 
          sh->uploaded = true;
+
+         switch (shader->info.type) {
+         case PIPE_SHADER_VERTEX:
+            scratch_size = ilo_state_vs_get_scratch_size(&sh->cso.vs);
+            cur_max = &shc->max_vs_scratch_size;
+            break;
+         case PIPE_SHADER_GEOMETRY:
+            scratch_size = ilo_state_gs_get_scratch_size(&sh->cso.gs);
+            cur_max = &shc->max_gs_scratch_size;
+            break;
+         case PIPE_SHADER_FRAGMENT:
+            scratch_size = ilo_state_ps_get_scratch_size(&sh->cso.ps);
+            cur_max = &shc->max_fs_scratch_size;
+            break;
+         default:
+            assert(!"unknown shader type");
+            scratch_size = 0;
+            cur_max = &shc->max_vs_scratch_size;
+            break;
+         }
+
+         if (*cur_max < scratch_size)
+            *cur_max = scratch_size;
       }
 
       list_del(&shader->list);
@@ -155,6 +184,21 @@ ilo_shader_cache_invalidate(struct ilo_shader_cache *shc)
       LIST_FOR_EACH_ENTRY(sh, &shader->variants, list)
          sh->uploaded = false;
    }
+
+   shc->max_vs_scratch_size = 0;
+   shc->max_gs_scratch_size = 0;
+   shc->max_fs_scratch_size = 0;
+}
+
+void
+ilo_shader_cache_get_max_scratch_sizes(const struct ilo_shader_cache *shc,
+                                       int *vs_scratch_size,
+                                       int *gs_scratch_size,
+                                       int *fs_scratch_size)
+{
+   *vs_scratch_size = shc->max_vs_scratch_size;
+   *gs_scratch_size = shc->max_gs_scratch_size;
+   *fs_scratch_size = shc->max_fs_scratch_size;
 }
 
 /**
@@ -578,7 +622,6 @@ init_shader_kernel(const struct ilo_shader *kernel,
    kern->grf_start = kernel->in.start_grf;
    kern->pcb_attr_count =
       (kernel->pcb.cbuf0_size + kernel->pcb.clip_state_size + 15) / 16;
-   kern->scratch_size = 0;
 }
 
 static void
@@ -602,6 +645,7 @@ init_vs(struct ilo_shader *kernel,
    init_shader_urb(kernel, state, &info.urb);
    init_shader_kernel(kernel, state, &info.kernel);
    init_shader_resource(kernel, state, &info.resource);
+   info.per_thread_scratch_size = kernel->per_thread_scratch_size;
    info.dispatch_enable = true;
    info.stats_enable = true;
 
@@ -640,6 +684,7 @@ init_gs(struct ilo_shader *kernel,
    init_shader_urb(kernel, state, &info.urb);
    init_shader_kernel(kernel, state, &info.kernel);
    init_shader_resource(kernel, state, &info.resource);
+   info.per_thread_scratch_size = kernel->per_thread_scratch_size;
    info.dispatch_enable = true;
    info.stats_enable = true;
 
@@ -664,6 +709,7 @@ init_ps(struct ilo_shader *kernel,
    init_shader_kernel(kernel, state, &info.kernel_8);
    init_shader_resource(kernel, state, &info.resource);
 
+   info.per_thread_scratch_size = kernel->per_thread_scratch_size;
    info.io.has_rt_write = true;
    info.io.posoffset = GEN6_POSOFFSET_NONE;
    info.io.attr_count = kernel->in.count;
diff --git a/src/gallium/drivers/ilo/ilo_shader.h b/src/gallium/drivers/ilo/ilo_shader.h
index 01de54146b1..10dcf739430 100644
--- a/src/gallium/drivers/ilo/ilo_shader.h
+++ b/src/gallium/drivers/ilo/ilo_shader.h
@@ -120,6 +120,12 @@ ilo_shader_cache_upload(struct ilo_shader_cache *shc,
 void
 ilo_shader_cache_invalidate(struct ilo_shader_cache *shc);
 
+void
+ilo_shader_cache_get_max_scratch_sizes(const struct ilo_shader_cache *shc,
+                                       int *vs_scratch_size,
+                                       int *gs_scratch_size,
+                                       int *fs_scratch_size);
+
 struct ilo_shader_state *
 ilo_shader_create_vs(const struct ilo_dev *dev,
                      const struct pipe_shader_state *state,
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_internal.h b/src/gallium/drivers/ilo/shader/ilo_shader_internal.h
index 01c86675202..1f0cda174e8 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_internal.h
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_internal.h
@@ -139,6 +139,7 @@ struct ilo_shader {
 
    void *kernel;
    int kernel_size;
+   int per_thread_scratch_size;
 
    struct ilo_kernel_routing routing;
    struct ilo_state_ps_params_info ps_params;
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index e2ed267da78..d1c50aefc84 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -299,6 +299,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 4c8167a9e7d..1778b13f9dd 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -854,10 +854,10 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
                      jit_tex->img_stride[j] = lp_tex->img_stride[j];
                   }
 
-                  if (view->target == PIPE_TEXTURE_1D_ARRAY ||
-                      view->target == PIPE_TEXTURE_2D_ARRAY ||
-                      view->target == PIPE_TEXTURE_CUBE ||
-                      view->target == PIPE_TEXTURE_CUBE_ARRAY) {
+                  if (res->target == PIPE_TEXTURE_1D_ARRAY ||
+                      res->target == PIPE_TEXTURE_2D_ARRAY ||
+                      res->target == PIPE_TEXTURE_CUBE ||
+                      res->target == PIPE_TEXTURE_CUBE_ARRAY) {
                      /*
                       * For array textures, we don't have first_layer, instead
                       * adjust last_layer (stored as depth) plus the mip level offsets
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index b205f02fdba..1e055878f7c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -275,10 +275,10 @@ prepare_shader_sampling(
                   row_stride[j] = lp_tex->row_stride[j];
                   img_stride[j] = lp_tex->img_stride[j];
                }
-               if (view->target == PIPE_TEXTURE_1D_ARRAY ||
-                   view->target == PIPE_TEXTURE_2D_ARRAY ||
-                   view->target == PIPE_TEXTURE_CUBE ||
-                   view->target == PIPE_TEXTURE_CUBE_ARRAY) {
+               if (tex->target == PIPE_TEXTURE_1D_ARRAY ||
+                   tex->target == PIPE_TEXTURE_2D_ARRAY ||
+                   tex->target == PIPE_TEXTURE_CUBE ||
+                   tex->target == PIPE_TEXTURE_CUBE_ARRAY) {
                   num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1;
                   for (j = first_level; j <= last_level; j++) {
                      mip_offsets[j] += view->u.tex.first_layer *
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index af46342fdf2..7862ac8f217 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -200,7 +200,8 @@ llvmpipe_can_create_resource(struct pipe_screen *screen,
 
 static boolean
 llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
-                              struct llvmpipe_resource *lpr)
+                              struct llvmpipe_resource *lpr,
+                              const void *map_front_private)
 {
    struct sw_winsys *winsys = screen->winsys;
 
@@ -215,12 +216,13 @@ llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
                                           lpr->base.format,
                                           width, height,
                                           64,
+                                          map_front_private,
                                           &lpr->row_stride[0] );
 
    if (lpr->dt == NULL)
       return FALSE;
 
-   {
+   if (!map_front_private) {
       void *map = winsys->displaytarget_map(winsys, lpr->dt,
                                             PIPE_TRANSFER_WRITE);
 
@@ -235,8 +237,9 @@ llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen,
 
 
 static struct pipe_resource *
-llvmpipe_resource_create(struct pipe_screen *_screen,
-                         const struct pipe_resource *templat)
+llvmpipe_resource_create_front(struct pipe_screen *_screen,
+                               const struct pipe_resource *templat,
+                               const void *map_front_private)
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
    struct llvmpipe_resource *lpr = CALLOC_STRUCT(llvmpipe_resource);
@@ -254,7 +257,7 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
                             PIPE_BIND_SCANOUT |
                             PIPE_BIND_SHARED)) {
          /* displayable surface */
-         if (!llvmpipe_displaytarget_layout(screen, lpr))
+         if (!llvmpipe_displaytarget_layout(screen, lpr, map_front_private))
             goto fail;
       }
       else {
@@ -300,7 +303,12 @@ llvmpipe_resource_create(struct pipe_screen *_screen,
    FREE(lpr);
    return NULL;
 }
-
+static struct pipe_resource *
+llvmpipe_resource_create(struct pipe_screen *_screen,
+                         const struct pipe_resource *templat)
+{
+   return llvmpipe_resource_create_front(_screen, templat, NULL);
+}
 
 static void
 llvmpipe_resource_destroy(struct pipe_screen *pscreen,
@@ -797,6 +805,7 @@ llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen)
 #endif
 
    screen->resource_create = llvmpipe_resource_create;
+   screen->resource_create_front = llvmpipe_resource_create_front;
    screen->resource_destroy = llvmpipe_resource_destroy;
    screen->resource_from_handle = llvmpipe_resource_from_handle;
    screen->resource_get_handle = llvmpipe_resource_get_handle;
diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
index c18e9f5b435..83f81135590 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -73,6 +73,9 @@ NV50_C_SOURCES := \
 	nv50/nv50_program.h \
 	nv50/nv50_push.c \
 	nv50/nv50_query.c \
+	nv50/nv50_query.h \
+	nv50/nv50_query_hw.c \
+	nv50/nv50_query_hw.h \
 	nv50/nv50_resource.c \
 	nv50/nv50_resource.h \
 	nv50/nv50_screen.c \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index cce60550ae5..6ad9dd31681 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -1128,7 +1128,6 @@ nv50_ir_init_prog_info(struct nv50_ir_prog_info *info)
       info->prop.gp.instanceCount = 1;
       info->prop.gp.maxVertices = 1;
    }
-   info->io.clipDistance = 0xff;
    info->io.pointSize = 0xff;
    info->io.instanceId = 0xff;
    info->io.vertexId = 0xff;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
index a610c773f55..0d544581697 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
@@ -73,8 +73,8 @@ public:
 
    Instruction *mkCvt(operation, DataType, Value *, DataType, Value *);
    CmpInstruction *mkCmp(operation, CondCode, DataType,
-			 Value *,
-			 DataType, Value *, Value *, Value * = NULL);
+                         Value *,
+                         DataType, Value *, Value *, Value * = NULL);
    TexInstruction *mkTex(operation, TexTarget,
                          uint16_t tic, uint16_t tsc,
                          const std::vector<Value *> &def,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 2b9edcf9172..c0cab3299b5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -99,6 +99,7 @@ struct nv50_ir_prog_info
       uint8_t sourceRep;  /* NV50_PROGRAM_IR */
       const void *source;
       void *relocData;
+      void *interpData;
       struct nv50_ir_prog_symbol *syms;
       uint16_t numSyms;
    } bin;
@@ -143,6 +144,7 @@ struct nv50_ir_prog_info
          bool earlyFragTests;
          bool separateFragData;
          bool usesDiscard;
+         bool sampleInterp;      /* perform sample interp on all fp inputs */
       } fp;
       struct {
          uint32_t inputOffset; /* base address for user args */
@@ -154,9 +156,8 @@ struct nv50_ir_prog_info
    uint8_t numBarriers;
 
    struct {
-      uint8_t clipDistance;      /* index of first clip distance output */
-      uint8_t clipDistanceMask;  /* mask of clip distances defined */
-      uint8_t cullDistanceMask;  /* clip distance mode (1 bit per output) */
+      uint8_t clipDistances;     /* number of clip distance outputs */
+      uint8_t cullDistances;     /* number of cull distance outputs */
       int8_t genUserClip;        /* request user clip planes for ClipVertex */
       uint16_t ucpBase;          /* base address for UCPs */
       uint8_t ucpCBSlot;         /* constant buffer index of UCP data */
@@ -168,7 +169,6 @@ struct nv50_ir_prog_info
       int8_t viewportId;         /* output index of ViewportIndex */
       uint8_t fragDepth;         /* output index of FragDepth */
       uint8_t sampleMask;        /* output index of SampleMask */
-      bool sampleInterp;         /* perform sample interp on all fp inputs */
       uint8_t backFaceColor[2];  /* input/output indices of back face colour */
       uint8_t globalAccess;      /* 1 for read, 2 for wr, 3 for rw */
       bool fp64;                 /* program uses fp64 math */
@@ -198,6 +198,10 @@ extern void nv50_ir_relocate_code(void *relocData, uint32_t *code,
                                   uint32_t libPos,
                                   uint32_t dataPos);
 
+extern void
+nv50_ir_change_interp(void *interpData, uint32_t *code,
+                      bool force_per_sample, bool flatshade);
+
 /* obtain code that will be shared among programs */
 extern void nv50_ir_get_target_library(uint32_t chipset,
                                        const uint32_t **code, uint32_t *size);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 8f1542959c9..d712c9c300a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1437,6 +1437,30 @@ CodeEmitterGK110::emitInterpMode(const Instruction *i)
    code[1] |= (i->ipa & 0xc) << (19 - 2);
 }
 
+static void
+interpApply(const InterpEntry *entry, uint32_t *code,
+      bool force_persample_interp, bool flatshade)
+{
+   int ipa = entry->ipa;
+   int reg = entry->reg;
+   int loc = entry->loc;
+
+   if (flatshade &&
+       (ipa & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_SC) {
+      ipa = NV50_IR_INTERP_FLAT;
+      reg = 0xff;
+   } else if (force_persample_interp &&
+              (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
+              (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
+      ipa |= NV50_IR_INTERP_CENTROID;
+   }
+   code[loc + 1] &= ~(0xf << 19);
+   code[loc + 1] |= (ipa & 0x3) << 21;
+   code[loc + 1] |= (ipa & 0xc) << (19 - 2);
+   code[loc + 0] &= ~(0xff << 23);
+   code[loc + 0] |= reg << 23;
+}
+
 void
 CodeEmitterGK110::emitINTERP(const Instruction *i)
 {
@@ -1448,10 +1472,13 @@ CodeEmitterGK110::emitINTERP(const Instruction *i)
    if (i->saturate)
       code[1] |= 1 << 18;
 
-   if (i->op == OP_PINTERP)
+   if (i->op == OP_PINTERP) {
       srcId(i->src(1), 23);
-   else
+      addInterp(i->ipa, SDATA(i->src(1)).id, interpApply);
+   } else {
       code[0] |= 0xff << 23;
+      addInterp(i->ipa, 0xff, interpApply);
+   }
 
    srcId(i->src(0).getIndirect(0), 10);
    emitInterpMode(i);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 6e22788341f..a327d572470 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -2217,6 +2217,30 @@ CodeEmitterGM107::emitAL2P()
    emitGPR  (0x00, insn->def(0));
 }
 
+static void
+interpApply(const InterpEntry *entry, uint32_t *code,
+      bool force_persample_interp, bool flatshade)
+{
+   int ipa = entry->ipa;
+   int reg = entry->reg;
+   int loc = entry->loc;
+
+   if (flatshade &&
+       (ipa & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_SC) {
+      ipa = NV50_IR_INTERP_FLAT;
+      reg = 0xff;
+   } else if (force_persample_interp &&
+              (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
+              (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
+      ipa |= NV50_IR_INTERP_CENTROID;
+   }
+   code[loc + 1] &= ~(0xf << 0x14);
+   code[loc + 1] |= (ipa & 0x3) << 0x16;
+   code[loc + 1] |= (ipa & 0xc) << (0x14 - 2);
+   code[loc + 0] &= ~(0xff << 0x14);
+   code[loc + 0] |= reg << 0x14;
+}
+
 void
 CodeEmitterGM107::emitIPA()
 {
@@ -2255,10 +2279,12 @@ CodeEmitterGM107::emitIPA()
       emitGPR(0x14, insn->src(1));
       if (insn->getSampleMode() == NV50_IR_INTERP_OFFSET)
          emitGPR(0x27, insn->src(2));
+      addInterp(insn->ipa, insn->getSrc(1)->reg.data.id, interpApply);
    } else {
       if (insn->getSampleMode() == NV50_IR_INTERP_OFFSET)
          emitGPR(0x27, insn->src(1));
       emitGPR(0x14);
+      addInterp(insn->ipa, 0xff, interpApply);
    }
 
    if (insn->getSampleMode() != NV50_IR_INTERP_OFFSET)
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 90147668c91..9f1e4b803d5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -372,7 +372,7 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
          mode |= 3 << (s * 2);
          break;
       default:
-	      ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
+         ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
          assert(0);
          break;
       }
@@ -876,6 +876,30 @@ CodeEmitterNV50::emitPFETCH(const Instruction *i)
    emitFlagsRd(i);
 }
 
+static void
+interpApply(const InterpEntry *entry, uint32_t *code,
+      bool force_persample_interp, bool flatshade)
+{
+   int ipa = entry->ipa;
+   int encSize = entry->reg;
+   int loc = entry->loc;
+
+   if ((ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
+       (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
+      if (force_persample_interp) {
+         if (encSize == 8)
+            code[loc + 1] |= 1 << 16;
+         else
+            code[loc + 0] |= 1 << 24;
+      } else {
+         if (encSize == 8)
+            code[loc + 1] &= ~(1 << 16);
+         else
+            code[loc + 0] &= ~(1 << 24);
+      }
+   }
+}
+
 void
 CodeEmitterNV50::emitINTERP(const Instruction *i)
 {
@@ -904,6 +928,8 @@ CodeEmitterNV50::emitINTERP(const Instruction *i)
       code[0] |= 1;
       emitFlagsRd(i);
    }
+
+   addInterp(i->ipa, i->encSize, interpApply);
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 6bf5219d346..fd103146c72 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1618,6 +1618,29 @@ CodeEmitterNVC0::emitInterpMode(const Instruction *i)
    }
 }
 
+static void
+interpApply(const InterpEntry *entry, uint32_t *code,
+      bool force_persample_interp, bool flatshade)
+{
+   int ipa = entry->ipa;
+   int reg = entry->reg;
+   int loc = entry->loc;
+
+   if (flatshade &&
+       (ipa & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_SC) {
+      ipa = NV50_IR_INTERP_FLAT;
+      reg = 0x3f;
+   } else if (force_persample_interp &&
+              (ipa & NV50_IR_INTERP_SAMPLE_MASK) == NV50_IR_INTERP_DEFAULT &&
+              (ipa & NV50_IR_INTERP_MODE_MASK) != NV50_IR_INTERP_FLAT) {
+      ipa |= NV50_IR_INTERP_CENTROID;
+   }
+   code[loc + 0] &= ~(0xf << 6);
+   code[loc + 0] |= ipa << 6;
+   code[loc + 0] &= ~(0x3f << 26);
+   code[loc + 0] |= reg << 26;
+}
+
 void
 CodeEmitterNVC0::emitINTERP(const Instruction *i)
 {
@@ -1630,10 +1653,13 @@ CodeEmitterNVC0::emitINTERP(const Instruction *i)
       if (i->saturate)
          code[0] |= 1 << 5;
 
-      if (i->op == OP_PINTERP)
+      if (i->op == OP_PINTERP) {
          srcId(i->src(1), 26);
-      else
+         addInterp(i->ipa, SDATA(i->src(1)).id, interpApply);
+      } else {
          code[0] |= 0x3f << 26;
+         addInterp(i->ipa, 0x3f, interpApply);
+      }
 
       srcId(i->src(0).getIndirect(0), 20);
    } else {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index c8efaf5947a..6a7cb4224f4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -910,7 +910,7 @@ bool Source::scanSource()
       info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16;
 
    if (info->io.genUserClip > 0) {
-      info->io.clipDistanceMask = (1 << info->io.genUserClip) - 1;
+      info->io.clipDistances = info->io.genUserClip;
 
       const unsigned int nOut = (info->io.genUserClip + 3) / 4;
 
@@ -919,7 +919,7 @@ bool Source::scanSource()
          info->out[i].id = i;
          info->out[i].sn = TGSI_SEMANTIC_CLIPDIST;
          info->out[i].si = n;
-         info->out[i].mask = info->io.clipDistanceMask >> (n * 4);
+         info->out[i].mask = ((1 << info->io.clipDistances) - 1) >> (n * 4);
       }
    }
 
@@ -969,6 +969,12 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
       else
          info->prop.tp.outputPrim = PIPE_PRIM_TRIANGLES; /* anything but points */
       break;
+   case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
+      info->io.clipDistances = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
+      info->io.cullDistances = prop->u[0].Data;
+      break;
    default:
       INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
       break;
@@ -1054,7 +1060,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
                default:
                   break;
                }
-               if (decl->Interp.Location || info->io.sampleInterp)
+               if (decl->Interp.Location)
                   info->in[i].centroid = 1;
             }
 
@@ -1086,8 +1092,6 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
             clipVertexOutput = i;
             break;
          case TGSI_SEMANTIC_CLIPDIST:
-            info->io.clipDistanceMask |=
-               decl->Declaration.UsageMask << (si * 4);
             info->io.genUserClip = -1;
             break;
          case TGSI_SEMANTIC_SAMPLEMASK:
@@ -1119,6 +1123,10 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
       case TGSI_SEMANTIC_VERTEXID:
          info->io.vertexId = first;
          break;
+      case TGSI_SEMANTIC_SAMPLEID:
+      case TGSI_SEMANTIC_SAMPLEPOS:
+         info->prop.fp.sampleInterp = 1;
+         break;
       default:
          break;
       }
@@ -1338,6 +1346,8 @@ private:
 
    void handleINTERP(Value *dst0[4]);
 
+   uint8_t translateInterpMode(const struct nv50_ir_varying *var,
+                               operation& op);
    Value *interpolate(tgsi::Instruction::SrcRegister, int c, Value *ptr);
 
    void insertConvergenceOps(BasicBlock *conv, BasicBlock *fork);
@@ -1451,8 +1461,8 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
    return sym;
 }
 
-static inline uint8_t
-translateInterpMode(const struct nv50_ir_varying *var, operation& op)
+uint8_t
+Converter::translateInterpMode(const struct nv50_ir_varying *var, operation& op)
 {
    uint8_t mode = NV50_IR_INTERP_PERSPECTIVE;
 
@@ -1468,7 +1478,7 @@ translateInterpMode(const struct nv50_ir_varying *var, operation& op)
    op = (mode == NV50_IR_INTERP_PERSPECTIVE || mode == NV50_IR_INTERP_SC)
       ? OP_PINTERP : OP_LINTERP;
 
-   if (var->centroid)
+   if (var->centroid || info->prop.fp.sampleInterp)
       mode |= NV50_IR_INTERP_CENTROID;
 
    return mode;
@@ -1628,7 +1638,7 @@ Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
          // don't load masked inputs, won't be assigned a slot
          if (!ptr && !(info->in[idx].mask & (1 << swz)))
             return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f);
-	 if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
+         if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
             return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0));
          return interpolate(src, c, shiftAddress(ptr));
       } else
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index fe530c76b62..afc8ff1374f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -166,7 +166,7 @@ void Target::destroy(Target *targ)
    delete targ;
 }
 
-CodeEmitter::CodeEmitter(const Target *target) : targ(target)
+CodeEmitter::CodeEmitter(const Target *target) : targ(target), interpInfo(NULL)
 {
 }
 
@@ -388,6 +388,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
       }
    }
    info->bin.relocData = emit->getRelocInfo();
+   info->bin.interpData = emit->getInterpInfo();
 
    emitSymbolTable(info);
 
@@ -428,6 +429,29 @@ CodeEmitter::addReloc(RelocEntry::Type ty, int w, uint32_t data, uint32_t m,
    return true;
 }
 
+bool
+CodeEmitter::addInterp(int ipa, int reg, InterpApply apply)
+{
+   unsigned int n = interpInfo ? interpInfo->count : 0;
+
+   if (!(n % RELOC_ALLOC_INCREMENT)) {
+      size_t size = sizeof(InterpInfo) + n * sizeof(InterpEntry);
+      interpInfo = reinterpret_cast<InterpInfo *>(
+         REALLOC(interpInfo, n ? size : 0,
+                 size + RELOC_ALLOC_INCREMENT * sizeof(InterpEntry)));
+      if (!interpInfo)
+         return false;
+      if (n == 0)
+         memset(interpInfo, 0, sizeof(InterpInfo));
+   }
+   ++interpInfo->count;
+
+   interpInfo->entry[n] = InterpEntry(ipa, reg, codeSize >> 2);
+   interpInfo->apply = apply;
+
+   return true;
+}
+
 void
 RelocEntry::apply(uint32_t *binary, const RelocInfo *info) const
 {
@@ -472,6 +496,19 @@ nv50_ir_relocate_code(void *relocData, uint32_t *code,
 }
 
 void
+nv50_ir_change_interp(void *interpData, uint32_t *code,
+                      bool force_persample_interp, bool flatshade)
+{
+   nv50_ir::InterpInfo *info = reinterpret_cast<nv50_ir::InterpInfo *>(
+      interpData);
+
+   // force_persample_interp: all non-flat -> per-sample
+   // flatshade: all color -> flat
+   for (unsigned i = 0; i < info->count; ++i)
+      info->apply(&info->entry[i], code, force_persample_interp, flatshade);
+}
+
+void
 nv50_ir_get_target_library(uint32_t chipset,
                            const uint32_t **code, uint32_t *size)
 {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
index 591916eb412..4e33997e1c1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
@@ -58,6 +58,23 @@ struct RelocInfo
    RelocEntry entry[0];
 };
 
+struct InterpEntry
+{
+   InterpEntry(int ipa, int reg, int loc) : ipa(ipa), reg(reg), loc(loc) {}
+   uint32_t ipa:4; // SC mode used to identify colors
+   uint32_t reg:8; // The reg used for perspective division
+   uint32_t loc:20; // Let's hope we don't have more than 1M-sized shaders
+};
+
+typedef void (*InterpApply)(const InterpEntry*, uint32_t*, bool, bool);
+
+struct InterpInfo
+{
+   uint32_t count;
+   InterpApply apply;
+   InterpEntry entry[0];
+};
+
 class CodeEmitter
 {
 public:
@@ -78,6 +95,9 @@ public:
 
    inline void *getRelocInfo() const { return relocInfo; }
 
+   bool addInterp(int ipa, int reg, InterpApply apply);
+   inline void *getInterpInfo() const { return interpInfo; }
+
    virtual void prepareEmission(Program *);
    virtual void prepareEmission(Function *);
    virtual void prepareEmission(BasicBlock *);
@@ -92,6 +112,7 @@ protected:
    uint32_t codeSizeLimit;
 
    RelocInfo *relocInfo;
+   InterpInfo *interpInfo;
 };
 
 
diff --git a/src/gallium/drivers/nouveau/nouveau_heap.c b/src/gallium/drivers/nouveau/nouveau_heap.c
index f4aa5081dfe..3d415a5f30e 100644
--- a/src/gallium/drivers/nouveau/nouveau_heap.c
+++ b/src/gallium/drivers/nouveau/nouveau_heap.c
@@ -29,95 +29,95 @@ int
 nouveau_heap_init(struct nouveau_heap **heap,
                   unsigned start, unsigned size)
 {
-	struct nouveau_heap *r;
+   struct nouveau_heap *r;
 
-	r = calloc(1, sizeof(struct nouveau_heap));
-	if (!r)
-		return 1;
+   r = calloc(1, sizeof(struct nouveau_heap));
+   if (!r)
+      return 1;
 
-	r->start = start;
-	r->size  = size;
-	*heap = r;
-	return 0;
+   r->start = start;
+   r->size  = size;
+   *heap = r;
+   return 0;
 }
 
 void
 nouveau_heap_destroy(struct nouveau_heap **heap)
 {
-	if (!*heap)
-		return;
-	free(*heap);
-	*heap = NULL;
+   if (!*heap)
+      return;
+   free(*heap);
+   *heap = NULL;
 }
 
 int
 nouveau_heap_alloc(struct nouveau_heap *heap, unsigned size, void *priv,
                    struct nouveau_heap **res)
 {
-	struct nouveau_heap *r;
+   struct nouveau_heap *r;
 
-	if (!heap || !size || !res || *res)
-		return 1;
+   if (!heap || !size || !res || *res)
+      return 1;
 
-	while (heap) {
-		if (!heap->in_use && heap->size >= size) {
-			r = calloc(1, sizeof(struct nouveau_heap));
-			if (!r)
-				return 1;
+   while (heap) {
+      if (!heap->in_use && heap->size >= size) {
+         r = calloc(1, sizeof(struct nouveau_heap));
+         if (!r)
+            return 1;
 
-			r->start  = (heap->start + heap->size) - size;
-			r->size   = size;
-			r->in_use = 1;
-			r->priv   = priv;
+         r->start  = (heap->start + heap->size) - size;
+         r->size   = size;
+         r->in_use = 1;
+         r->priv   = priv;
 
-			heap->size -= size;
+         heap->size -= size;
 
-			r->next = heap->next;
-			if (heap->next)
-				heap->next->prev = r;
-			r->prev = heap;
-			heap->next = r;
+         r->next = heap->next;
+         if (heap->next)
+            heap->next->prev = r;
+         r->prev = heap;
+         heap->next = r;
 
-			*res = r;
-			return 0;
-		}
+         *res = r;
+         return 0;
+      }
 
-		heap = heap->next;
-	}
+      heap = heap->next;
+   }
 
-	return 1;
+   return 1;
 }
 
 void
 nouveau_heap_free(struct nouveau_heap **res)
 {
-	struct nouveau_heap *r;
-
-	if (!res || !*res)
-		return;
-	r = *res;
-	*res = NULL;
-
-	r->in_use = 0;
-
-	if (r->next && !r->next->in_use) {
-		struct nouveau_heap *new = r->next;
-
-		new->prev = r->prev;
-		if (r->prev)
-			r->prev->next = new;
-		new->size += r->size;
-		new->start = r->start;
-
-		free(r);
-		r = new;
-	}
-
-	if (r->prev && !r->prev->in_use) {
-		r->prev->next = r->next;
-		if (r->next)
-			r->next->prev = r->prev;
-		r->prev->size += r->size;
-		free(r);
-	}
+   struct nouveau_heap *r;
+
+   if (!res || !*res)
+      return;
+   r = *res;
+   *res = NULL;
+
+   r->in_use = 0;
+
+   if (r->next && !r->next->in_use) {
+      struct nouveau_heap *new = r->next;
+
+      new->prev = r->prev;
+      if (r->prev)
+         r->prev->next = new;
+      new->size += r->size;
+      new->start = r->start;
+
+      free(r);
+      r = new;
+   }
+
+   if (r->prev && !r->prev->in_use) {
+      r->prev->next = r->next;
+      if (r->next)
+         r->next->prev = r->prev;
+      r->prev->size += r->size;
+      free(r);
+   }
 }
diff --git a/src/gallium/drivers/nouveau/nouveau_heap.h b/src/gallium/drivers/nouveau/nouveau_heap.h
index a3d64a65623..99f610ed4c8 100644
--- a/src/gallium/drivers/nouveau/nouveau_heap.h
+++ b/src/gallium/drivers/nouveau/nouveau_heap.h
@@ -44,15 +44,15 @@
  * full size of the heap.
  */
 struct nouveau_heap {
-	struct nouveau_heap *prev;
-	struct nouveau_heap *next;
+   struct nouveau_heap *prev;
+   struct nouveau_heap *next;
 
-	void *priv;
+   void *priv;
 
-	unsigned start;
-	unsigned size;
+   unsigned start;
+   unsigned size;
 
-	int in_use;
+   int in_use;
 };
 
 int
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index b2290e7e784..47603b0b7fd 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -30,211 +30,211 @@ int nouveau_mesa_debug = 0;
 static const char *
 nouveau_screen_get_name(struct pipe_screen *pscreen)
 {
-	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
-	static char buffer[128];
+   struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+   static char buffer[128];
 
-	util_snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
-	return buffer;
+   util_snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset);
+   return buffer;
 }
 
 static const char *
 nouveau_screen_get_vendor(struct pipe_screen *pscreen)
 {
-	return "nouveau";
+   return "nouveau";
 }
 
 static const char *
 nouveau_screen_get_device_vendor(struct pipe_screen *pscreen)
 {
-	return "NVIDIA";
+   return "NVIDIA";
 }
 
 static uint64_t
 nouveau_screen_get_timestamp(struct pipe_screen *pscreen)
 {
-	int64_t cpu_time = os_time_get() * 1000;
+   int64_t cpu_time = os_time_get() * 1000;
 
-        /* getparam of PTIMER_TIME takes about x10 as long (several usecs) */
+   /* getparam of PTIMER_TIME takes about x10 as long (several usecs) */
 
-	return cpu_time + nouveau_screen(pscreen)->cpu_gpu_time_delta;
+   return cpu_time + nouveau_screen(pscreen)->cpu_gpu_time_delta;
 }
 
 static void
 nouveau_screen_fence_ref(struct pipe_screen *pscreen,
-			 struct pipe_fence_handle **ptr,
-			 struct pipe_fence_handle *pfence)
+                         struct pipe_fence_handle **ptr,
+                         struct pipe_fence_handle *pfence)
 {
-	nouveau_fence_ref(nouveau_fence(pfence), (struct nouveau_fence **)ptr);
+   nouveau_fence_ref(nouveau_fence(pfence), (struct nouveau_fence **)ptr);
 }
 
 static boolean
 nouveau_screen_fence_finish(struct pipe_screen *screen,
-			    struct pipe_fence_handle *pfence,
+                            struct pipe_fence_handle *pfence,
                             uint64_t timeout)
 {
-	if (!timeout)
-		return nouveau_fence_signalled(nouveau_fence(pfence));
+   if (!timeout)
+      return nouveau_fence_signalled(nouveau_fence(pfence));
 
-	return nouveau_fence_wait(nouveau_fence(pfence));
+   return nouveau_fence_wait(nouveau_fence(pfence));
 }
 
 
 struct nouveau_bo *
 nouveau_screen_bo_from_handle(struct pipe_screen *pscreen,
-			      struct winsys_handle *whandle,
-			      unsigned *out_stride)
+                              struct winsys_handle *whandle,
+                              unsigned *out_stride)
 {
-	struct nouveau_device *dev = nouveau_screen(pscreen)->device;
-	struct nouveau_bo *bo = 0;
-	int ret;
-
-	if (whandle->type != DRM_API_HANDLE_TYPE_SHARED &&
-	    whandle->type != DRM_API_HANDLE_TYPE_FD) {
-		debug_printf("%s: attempt to import unsupported handle type %d\n",
-			     __FUNCTION__, whandle->type);
-		return NULL;
-	}
-
-	if (whandle->type == DRM_API_HANDLE_TYPE_SHARED)
-		ret = nouveau_bo_name_ref(dev, whandle->handle, &bo);
-	else
-		ret = nouveau_bo_prime_handle_ref(dev, whandle->handle, &bo);
-
-	if (ret) {
-		debug_printf("%s: ref name 0x%08x failed with %d\n",
-			     __FUNCTION__, whandle->handle, ret);
-		return NULL;
-	}
-
-	*out_stride = whandle->stride;
-	return bo;
+   struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+   struct nouveau_bo *bo = 0;
+   int ret;
+
+   if (whandle->type != DRM_API_HANDLE_TYPE_SHARED &&
+       whandle->type != DRM_API_HANDLE_TYPE_FD) {
+      debug_printf("%s: attempt to import unsupported handle type %d\n",
+                   __FUNCTION__, whandle->type);
+      return NULL;
+   }
+
+   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED)
+      ret = nouveau_bo_name_ref(dev, whandle->handle, &bo);
+   else
+      ret = nouveau_bo_prime_handle_ref(dev, whandle->handle, &bo);
+
+   if (ret) {
+      debug_printf("%s: ref name 0x%08x failed with %d\n",
+                   __FUNCTION__, whandle->handle, ret);
+      return NULL;
+   }
+
+   *out_stride = whandle->stride;
+   return bo;
 }
 
 
 bool
 nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
-			     struct nouveau_bo *bo,
-			     unsigned stride,
-			     struct winsys_handle *whandle)
+                             struct nouveau_bo *bo,
+                             unsigned stride,
+                             struct winsys_handle *whandle)
 {
-	whandle->stride = stride;
-
-	if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
-		return nouveau_bo_name_get(bo, &whandle->handle) == 0;
-	} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
-		whandle->handle = bo->handle;
-		return true;
-	} else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
-		return nouveau_bo_set_prime(bo, (int *)&whandle->handle) == 0;
-	} else {
-		return false;
-	}
+   whandle->stride = stride;
+
+   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+      return nouveau_bo_name_get(bo, &whandle->handle) == 0;
+   } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+      whandle->handle = bo->handle;
+      return true;
+   } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+      return nouveau_bo_set_prime(bo, (int *)&whandle->handle) == 0;
+   } else {
+      return false;
+   }
 }
 
 int
 nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 {
-	struct pipe_screen *pscreen = &screen->base;
-	struct nv04_fifo nv04_data = { .vram = 0xbeef0201, .gart = 0xbeef0202 };
-	struct nvc0_fifo nvc0_data = { };
-	uint64_t time;
-	int size, ret;
-	void *data;
-	union nouveau_bo_config mm_config;
-
-	char *nv_dbg = getenv("NOUVEAU_MESA_DEBUG");
-	if (nv_dbg)
-	   nouveau_mesa_debug = atoi(nv_dbg);
-
-	/*
-	 * this is initialized to 1 in nouveau_drm_screen_create after screen
-	 * is fully constructed and added to the global screen list.
-	 */
-	screen->refcount = -1;
-
-	if (dev->chipset < 0xc0) {
-		data = &nv04_data;
-		size = sizeof(nv04_data);
-	} else {
-		data = &nvc0_data;
-		size = sizeof(nvc0_data);
-	}
-
-	/*
-	 * Set default VRAM domain if not overridden
-	 */
-	if (!screen->vram_domain) {
-		if (dev->vram_size > 0)
-			screen->vram_domain = NOUVEAU_BO_VRAM;
-		else
-			screen->vram_domain = NOUVEAU_BO_GART;
-	}
-
-	ret = nouveau_object_new(&dev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS,
-				 data, size, &screen->channel);
-	if (ret)
-		return ret;
-	screen->device = dev;
-
-	ret = nouveau_client_new(screen->device, &screen->client);
-	if (ret)
-		return ret;
-	ret = nouveau_pushbuf_new(screen->client, screen->channel,
-				  4, 512 * 1024, 1,
-				  &screen->pushbuf);
-	if (ret)
-		return ret;
-
-        /* getting CPU time first appears to be more accurate */
-        screen->cpu_gpu_time_delta = os_time_get();
-
-        ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_PTIMER_TIME, &time);
-        if (!ret)
-           screen->cpu_gpu_time_delta = time - screen->cpu_gpu_time_delta * 1000;
-
-	pscreen->get_name = nouveau_screen_get_name;
-	pscreen->get_vendor = nouveau_screen_get_vendor;
-	pscreen->get_device_vendor = nouveau_screen_get_device_vendor;
-
-	pscreen->get_timestamp = nouveau_screen_get_timestamp;
-
-	pscreen->fence_reference = nouveau_screen_fence_ref;
-	pscreen->fence_finish = nouveau_screen_fence_finish;
-
-	util_format_s3tc_init();
-
-	screen->lowmem_bindings = PIPE_BIND_GLOBAL; /* gallium limit */
-	screen->vidmem_bindings =
-		PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL |
-		PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
-		PIPE_BIND_CURSOR |
-		PIPE_BIND_SAMPLER_VIEW |
-		PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE |
-                PIPE_BIND_COMPUTE_RESOURCE |
-		PIPE_BIND_GLOBAL;
-	screen->sysmem_bindings =
-		PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT |
-		PIPE_BIND_COMMAND_ARGS_BUFFER;
-
-	memset(&mm_config, 0, sizeof(mm_config));
-
-	screen->mm_GART = nouveau_mm_create(dev,
-					    NOUVEAU_BO_GART | NOUVEAU_BO_MAP,
-					    &mm_config);
-	screen->mm_VRAM = nouveau_mm_create(dev, NOUVEAU_BO_VRAM, &mm_config);
-	return 0;
+   struct pipe_screen *pscreen = &screen->base;
+   struct nv04_fifo nv04_data = { .vram = 0xbeef0201, .gart = 0xbeef0202 };
+   struct nvc0_fifo nvc0_data = { };
+   uint64_t time;
+   int size, ret;
+   void *data;
+   union nouveau_bo_config mm_config;
+
+   char *nv_dbg = getenv("NOUVEAU_MESA_DEBUG");
+   if (nv_dbg)
+      nouveau_mesa_debug = atoi(nv_dbg);
+
+   /*
+    * this is initialized to 1 in nouveau_drm_screen_create after screen
+    * is fully constructed and added to the global screen list.
+    */
+   screen->refcount = -1;
+
+   if (dev->chipset < 0xc0) {
+      data = &nv04_data;
+      size = sizeof(nv04_data);
+   } else {
+      data = &nvc0_data;
+      size = sizeof(nvc0_data);
+   }
+
+   /*
+    * Set default VRAM domain if not overridden
+    */
+   if (!screen->vram_domain) {
+      if (dev->vram_size > 0)
+         screen->vram_domain = NOUVEAU_BO_VRAM;
+      else
+         screen->vram_domain = NOUVEAU_BO_GART;
+   }
+
+   ret = nouveau_object_new(&dev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS,
+                            data, size, &screen->channel);
+   if (ret)
+      return ret;
+   screen->device = dev;
+
+   ret = nouveau_client_new(screen->device, &screen->client);
+   if (ret)
+      return ret;
+   ret = nouveau_pushbuf_new(screen->client, screen->channel,
+                             4, 512 * 1024, 1,
+                             &screen->pushbuf);
+   if (ret)
+      return ret;
+
+   /* getting CPU time first appears to be more accurate */
+   screen->cpu_gpu_time_delta = os_time_get();
+
+   ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_PTIMER_TIME, &time);
+   if (!ret)
+      screen->cpu_gpu_time_delta = time - screen->cpu_gpu_time_delta * 1000;
+
+   pscreen->get_name = nouveau_screen_get_name;
+   pscreen->get_vendor = nouveau_screen_get_vendor;
+   pscreen->get_device_vendor = nouveau_screen_get_device_vendor;
+
+   pscreen->get_timestamp = nouveau_screen_get_timestamp;
+
+   pscreen->fence_reference = nouveau_screen_fence_ref;
+   pscreen->fence_finish = nouveau_screen_fence_finish;
+
+   util_format_s3tc_init();
+
+   screen->lowmem_bindings = PIPE_BIND_GLOBAL; /* gallium limit */
+   screen->vidmem_bindings =
+      PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL |
+      PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
+      PIPE_BIND_CURSOR |
+      PIPE_BIND_SAMPLER_VIEW |
+      PIPE_BIND_SHADER_BUFFER | PIPE_BIND_SHADER_IMAGE |
+      PIPE_BIND_COMPUTE_RESOURCE |
+      PIPE_BIND_GLOBAL;
+   screen->sysmem_bindings =
+      PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_STREAM_OUTPUT |
+      PIPE_BIND_COMMAND_ARGS_BUFFER;
+
+   memset(&mm_config, 0, sizeof(mm_config));
+
+   screen->mm_GART = nouveau_mm_create(dev,
+                                       NOUVEAU_BO_GART | NOUVEAU_BO_MAP,
+                                       &mm_config);
+   screen->mm_VRAM = nouveau_mm_create(dev, NOUVEAU_BO_VRAM, &mm_config);
+   return 0;
 }
 
 void
 nouveau_screen_fini(struct nouveau_screen *screen)
 {
-	nouveau_mm_destroy(screen->mm_GART);
-	nouveau_mm_destroy(screen->mm_VRAM);
+   nouveau_mm_destroy(screen->mm_GART);
+   nouveau_mm_destroy(screen->mm_VRAM);
 
-	nouveau_pushbuf_del(&screen->pushbuf);
+   nouveau_pushbuf_del(&screen->pushbuf);
 
-	nouveau_client_del(&screen->client);
-	nouveau_object_del(&screen->channel);
+   nouveau_client_del(&screen->client);
+   nouveau_object_del(&screen->channel);
 
-	nouveau_device_del(&screen->device);
+   nouveau_device_del(&screen->device);
 }
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index 4fdde9fbf3d..328646fe3ce 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -16,47 +16,47 @@ extern int nouveau_mesa_debug;
 struct nouveau_bo;
 
 struct nouveau_screen {
-	struct pipe_screen base;
-	struct nouveau_device *device;
-	struct nouveau_object *channel;
-	struct nouveau_client *client;
-	struct nouveau_pushbuf *pushbuf;
+   struct pipe_screen base;
+   struct nouveau_device *device;
+   struct nouveau_object *channel;
+   struct nouveau_client *client;
+   struct nouveau_pushbuf *pushbuf;
 
-	int refcount;
+   int refcount;
 
-	unsigned vidmem_bindings; /* PIPE_BIND_* where VRAM placement is desired */
-	unsigned sysmem_bindings; /* PIPE_BIND_* where GART placement is desired */
-	unsigned lowmem_bindings; /* PIPE_BIND_* that require an address < 4 GiB */
-	/*
-	 * For bindings with (vidmem & sysmem) bits set, PIPE_USAGE_* decides
-	 * placement.
-	 */
+   unsigned vidmem_bindings; /* PIPE_BIND_* where VRAM placement is desired */
+   unsigned sysmem_bindings; /* PIPE_BIND_* where GART placement is desired */
+   unsigned lowmem_bindings; /* PIPE_BIND_* that require an address < 4 GiB */
+   /*
+    * For bindings with (vidmem & sysmem) bits set, PIPE_USAGE_* decides
+    * placement.
+    */
 
-	uint16_t class_3d;
+   uint16_t class_3d;
 
-	struct {
-		struct nouveau_fence *head;
-		struct nouveau_fence *tail;
-		struct nouveau_fence *current;
-		u32 sequence;
-		u32 sequence_ack;
-		void (*emit)(struct pipe_screen *, u32 *sequence);
-		u32  (*update)(struct pipe_screen *);
-	} fence;
+   struct {
+      struct nouveau_fence *head;
+      struct nouveau_fence *tail;
+      struct nouveau_fence *current;
+      u32 sequence;
+      u32 sequence_ack;
+      void (*emit)(struct pipe_screen *, u32 *sequence);
+      u32  (*update)(struct pipe_screen *);
+   } fence;
 
-	struct nouveau_mman *mm_VRAM;
-	struct nouveau_mman *mm_GART;
+   struct nouveau_mman *mm_VRAM;
+   struct nouveau_mman *mm_GART;
 
-	int64_t cpu_gpu_time_delta;
+   int64_t cpu_gpu_time_delta;
 
-	bool hint_buf_keep_sysmem_copy;
+   bool hint_buf_keep_sysmem_copy;
 
-	unsigned vram_domain;
+   unsigned vram_domain;
 
-	struct {
-		unsigned profiles_checked;
-		unsigned profiles_present;
-	} firmware_info;
+   struct {
+      unsigned profiles_checked;
+      unsigned profiles_present;
+   } firmware_info;
 
 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
    union {
@@ -100,10 +100,10 @@ struct nouveau_screen {
 
 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
 # define NOUVEAU_DRV_STAT(s, n, v) do {         \
-      (s)->stats.named.n += (v);               \
+      (s)->stats.named.n += (v);                \
    } while(0)
-# define NOUVEAU_DRV_STAT_RES(r, n, v) do {                       \
-      nouveau_screen((r)->base.screen)->stats.named.n += (v);    \
+# define NOUVEAU_DRV_STAT_RES(r, n, v) do {                     \
+      nouveau_screen((r)->base.screen)->stats.named.n += (v);   \
    } while(0)
 # define NOUVEAU_DRV_STAT_IFD(x) x
 #else
@@ -115,20 +115,20 @@ struct nouveau_screen {
 static inline struct nouveau_screen *
 nouveau_screen(struct pipe_screen *pscreen)
 {
-	return (struct nouveau_screen *)pscreen;
+   return (struct nouveau_screen *)pscreen;
 }
 
 bool nouveau_drm_screen_unref(struct nouveau_screen *screen);
 
 bool
 nouveau_screen_bo_get_handle(struct pipe_screen *pscreen,
-			     struct nouveau_bo *bo,
-			     unsigned stride,
-			     struct winsys_handle *whandle);
+                             struct nouveau_bo *bo,
+                             unsigned stride,
+                             struct winsys_handle *whandle);
 struct nouveau_bo *
 nouveau_screen_bo_from_handle(struct pipe_screen *pscreen,
-			      struct winsys_handle *whandle,
-			      unsigned *out_stride);
+                              struct winsys_handle *whandle,
+                              unsigned *out_stride);
 
 
 int nouveau_screen_init(struct nouveau_screen *, struct nouveau_device *);
diff --git a/src/gallium/drivers/nouveau/nouveau_statebuf.h b/src/gallium/drivers/nouveau/nouveau_statebuf.h
index f38014091ba..da5d7972d9c 100644
--- a/src/gallium/drivers/nouveau/nouveau_statebuf.h
+++ b/src/gallium/drivers/nouveau/nouveau_statebuf.h
@@ -6,9 +6,9 @@
 
 struct nouveau_statebuf_builder
 {
-	uint32_t* p;
+   uint32_t* p;
 #ifdef DEBUG
-	uint32_t* pend;
+   uint32_t* pend;
 #endif
 };
 
@@ -22,7 +22,7 @@ struct nouveau_statebuf_builder
 
 static inline uint32_t sb_header(unsigned subc, unsigned mthd, unsigned size)
 {
-	return (size << 18) | (subc << 13) | mthd;
+   return (size << 18) | (subc << 13) | mthd;
 }
 
 #define sb_method(sb, v, n)  sb_data(sb, sb_header(SUBC_3D(v), n));
diff --git a/src/gallium/drivers/nouveau/nouveau_video.c b/src/gallium/drivers/nouveau/nouveau_video.c
index e414a534418..8bb12b22ac1 100644
--- a/src/gallium/drivers/nouveau/nouveau_video.c
+++ b/src/gallium/drivers/nouveau/nouveau_video.c
@@ -831,7 +831,7 @@ error:
 static int
 nouveau_screen_get_video_param(struct pipe_screen *pscreen,
                                enum pipe_video_profile profile,
-			       enum pipe_video_entrypoint entrypoint,
+                               enum pipe_video_entrypoint entrypoint,
                                enum pipe_video_cap param)
 {
    switch (param) {
diff --git a/src/gallium/drivers/nouveau/nouveau_video.h b/src/gallium/drivers/nouveau/nouveau_video.h
index fd1bd527deb..3ef6f89ce28 100644
--- a/src/gallium/drivers/nouveau/nouveau_video.h
+++ b/src/gallium/drivers/nouveau/nouveau_video.h
@@ -83,7 +83,7 @@ BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, unsigned size)
 static inline void
 PUSH_MTHDl(struct nouveau_pushbuf *push, int subc, int mthd,
            struct nouveau_bo *bo, uint32_t offset,
-	   struct nouveau_bufctx *ctx, int bin, uint32_t rw)
+           struct nouveau_bufctx *ctx, int bin, uint32_t rw)
 {
    nouveau_bufctx_mthd(ctx, bin, NV04_FIFO_PKHDR(subc, mthd, 1),
                        bo, offset,
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.h b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
index 33e3bef3df3..58df5ee847f 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video.h
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.h
@@ -117,22 +117,22 @@ struct nouveau_vp3_decoder {
 };
 
 struct comm {
-	uint32_t bsp_cur_index; // 000
-	uint32_t byte_ofs; // 004
-	uint32_t status[0x10]; // 008
-	uint32_t pos[0x10]; // 048
-	uint8_t pad[0x100 - 0x88]; // 0a0 bool comm_encrypted
-
-	uint32_t pvp_cur_index; // 100
-	uint32_t acked_byte_ofs; // 104
-	uint32_t status_vp[0x10]; // 108
-	uint16_t mb_y[0x10]; //148
-	uint32_t pvp_stage; // 168 0xeeXX
-	uint16_t parse_endpos_index; // 16c
-	uint16_t irq_index; // 16e
-	uint8_t  irq_470[0x10]; // 170
-	uint32_t irq_pos[0x10]; // 180
-	uint32_t parse_endpos[0x10]; // 1c0
+   uint32_t bsp_cur_index; // 000
+   uint32_t byte_ofs; // 004
+   uint32_t status[0x10]; // 008
+   uint32_t pos[0x10]; // 048
+   uint8_t pad[0x100 - 0x88]; // 0a0 bool comm_encrypted
+
+   uint32_t pvp_cur_index; // 100
+   uint32_t acked_byte_ofs; // 104
+   uint32_t status_vp[0x10]; // 108
+   uint16_t mb_y[0x10]; //148
+   uint32_t pvp_stage; // 168 0xeeXX
+   uint16_t parse_endpos_index; // 16c
+   uint16_t irq_index; // 16e
+   uint8_t  irq_470[0x10]; // 170
+   uint32_t irq_pos[0x10]; // 180
+   uint32_t parse_endpos[0x10]; // 1c0
 };
 
 static inline uint32_t nouveau_vp3_video_align(uint32_t h)
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
index 6d968c18399..692772e49d1 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video_bsp.c
@@ -23,90 +23,90 @@
 #include "nouveau_vp3_video.h"
 
 struct strparm_bsp {
-	uint32_t w0[4]; // bits 0-23 length, bits 24-31 addr_hi
-	uint32_t w1[4]; // bit 8-24 addr_lo
-	uint32_t unk20; // should be idx * 0x8000000, bitstream offset
-	uint32_t do_crypto_crap; // set to 0
+   uint32_t w0[4]; // bits 0-23 length, bits 24-31 addr_hi
+   uint32_t w1[4]; // bit 8-24 addr_lo
+   uint32_t unk20; // should be idx * 0x8000000, bitstream offset
+   uint32_t do_crypto_crap; // set to 0
 };
 
 struct mpeg12_picparm_bsp {
-	uint16_t width;
-	uint16_t height;
-	uint8_t picture_structure;
-	uint8_t picture_coding_type;
-	uint8_t intra_dc_precision;
-	uint8_t frame_pred_frame_dct;
-	uint8_t concealment_motion_vectors;
-	uint8_t intra_vlc_format;
-	uint16_t pad;
-	uint8_t f_code[2][2];
+   uint16_t width;
+   uint16_t height;
+   uint8_t picture_structure;
+   uint8_t picture_coding_type;
+   uint8_t intra_dc_precision;
+   uint8_t frame_pred_frame_dct;
+   uint8_t concealment_motion_vectors;
+   uint8_t intra_vlc_format;
+   uint16_t pad;
+   uint8_t f_code[2][2];
 };
 
 struct mpeg4_picparm_bsp {
-	uint16_t width;
-	uint16_t height;
-	uint8_t vop_time_increment_size;
-	uint8_t interlaced;
-	uint8_t resync_marker_disable;
+   uint16_t width;
+   uint16_t height;
+   uint8_t vop_time_increment_size;
+   uint8_t interlaced;
+   uint8_t resync_marker_disable;
 };
 
 struct vc1_picparm_bsp {
-	uint16_t width;
-	uint16_t height;
-	uint8_t profile; // 04 0 simple, 1 main, 2 advanced
-	uint8_t postprocflag; // 05
-	uint8_t pulldown; // 06
-	uint8_t interlaced; // 07
-	uint8_t tfcntrflag; // 08
-	uint8_t finterpflag; // 09
-	uint8_t psf; // 0a
-	uint8_t pad; // 0b
-	uint8_t multires; // 0c
-	uint8_t syncmarker; // 0d
-	uint8_t rangered; // 0e
-	uint8_t maxbframes; // 0f
-	uint8_t dquant; // 10
-	uint8_t panscan_flag; // 11
-	uint8_t refdist_flag; // 12
-	uint8_t quantizer; // 13
-	uint8_t extended_mv; // 14
-	uint8_t extended_dmv; // 15
-	uint8_t overlap; // 16
-	uint8_t vstransform; // 17
+   uint16_t width;
+   uint16_t height;
+   uint8_t profile; // 04 0 simple, 1 main, 2 advanced
+   uint8_t postprocflag; // 05
+   uint8_t pulldown; // 06
+   uint8_t interlaced; // 07
+   uint8_t tfcntrflag; // 08
+   uint8_t finterpflag; // 09
+   uint8_t psf; // 0a
+   uint8_t pad; // 0b
+   uint8_t multires; // 0c
+   uint8_t syncmarker; // 0d
+   uint8_t rangered; // 0e
+   uint8_t maxbframes; // 0f
+   uint8_t dquant; // 10
+   uint8_t panscan_flag; // 11
+   uint8_t refdist_flag; // 12
+   uint8_t quantizer; // 13
+   uint8_t extended_mv; // 14
+   uint8_t extended_dmv; // 15
+   uint8_t overlap; // 16
+   uint8_t vstransform; // 17
 };
 
 struct h264_picparm_bsp {
-	// 00
-	uint32_t unk00;
-	// 04
-	uint32_t log2_max_frame_num_minus4; // 04 checked
-	uint32_t pic_order_cnt_type; // 08 checked
-	uint32_t log2_max_pic_order_cnt_lsb_minus4; // 0c checked
-	uint32_t delta_pic_order_always_zero_flag; // 10, or unknown
+   // 00
+   uint32_t unk00;
+   // 04
+   uint32_t log2_max_frame_num_minus4; // 04 checked
+   uint32_t pic_order_cnt_type; // 08 checked
+   uint32_t log2_max_pic_order_cnt_lsb_minus4; // 0c checked
+   uint32_t delta_pic_order_always_zero_flag; // 10, or unknown
 
-	uint32_t frame_mbs_only_flag; // 14, always 1?
-	uint32_t direct_8x8_inference_flag; // 18, always 1?
-	uint32_t width_mb; // 1c checked
-	uint32_t height_mb; // 20 checked
-	// 24
-	//struct picparm2
-		uint32_t entropy_coding_mode_flag; // 00, checked
-		uint32_t pic_order_present_flag; // 04 checked
-		uint32_t unk; // 08 seems to be 0?
-		uint32_t pad1; // 0c seems to be 0?
-		uint32_t pad2; // 10 always 0 ?
-		uint32_t num_ref_idx_l0_active_minus1; // 14 always 0?
-		uint32_t num_ref_idx_l1_active_minus1; // 18 always 0?
-		uint32_t weighted_pred_flag; // 1c checked
-		uint32_t weighted_bipred_idc; // 20 checked
-		uint32_t pic_init_qp_minus26; // 24 checked
-		uint32_t deblocking_filter_control_present_flag; // 28 always 1?
-		uint32_t redundant_pic_cnt_present_flag; // 2c always 0?
-		uint32_t transform_8x8_mode_flag; // 30 checked
-		uint32_t mb_adaptive_frame_field_flag; // 34 checked-ish
-		uint8_t field_pic_flag; // 38 checked
-		uint8_t bottom_field_flag; // 39 checked
-		uint8_t real_pad[0x1b]; // XX why?
+   uint32_t frame_mbs_only_flag; // 14, always 1?
+   uint32_t direct_8x8_inference_flag; // 18, always 1?
+   uint32_t width_mb; // 1c checked
+   uint32_t height_mb; // 20 checked
+   // 24
+   //struct picparm2
+   uint32_t entropy_coding_mode_flag; // 00, checked
+   uint32_t pic_order_present_flag; // 04 checked
+   uint32_t unk; // 08 seems to be 0?
+   uint32_t pad1; // 0c seems to be 0?
+   uint32_t pad2; // 10 always 0 ?
+   uint32_t num_ref_idx_l0_active_minus1; // 14 always 0?
+   uint32_t num_ref_idx_l1_active_minus1; // 18 always 0?
+   uint32_t weighted_pred_flag; // 1c checked
+   uint32_t weighted_bipred_idc; // 20 checked
+   uint32_t pic_init_qp_minus26; // 24 checked
+   uint32_t deblocking_filter_control_present_flag; // 28 always 1?
+   uint32_t redundant_pic_cnt_present_flag; // 2c always 0?
+   uint32_t transform_8x8_mode_flag; // 30 checked
+   uint32_t mb_adaptive_frame_field_flag; // 34 checked-ish
+   uint8_t field_pic_flag; // 38 checked
+   uint8_t bottom_field_flag; // 39 checked
+   uint8_t real_pad[0x1b]; // XX why?
 };
 
 static uint32_t
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video_vp.c b/src/gallium/drivers/nouveau/nouveau_vp3_video_vp.c
index 25283b79952..53f5db0003d 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video_vp.c
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video_vp.c
@@ -23,147 +23,147 @@
 #include "nouveau_vp3_video.h"
 
 struct mpeg12_picparm_vp {
-	uint16_t width; // 00 in mb units
-	uint16_t height; // 02 in mb units
-
-	uint32_t unk04; // 04 stride for Y?
-	uint32_t unk08; // 08 stride for CbCr?
-
-	uint32_t ofs[6]; // 1c..20 ofs
-	uint32_t bucket_size; // 24
-	uint32_t inter_ring_data_size; // 28
-	uint16_t unk2c; // 2c
-	uint16_t alternate_scan; // 2e
-	uint16_t unk30; // 30 not seen set yet
-	uint16_t picture_structure; // 32
-	uint16_t pad2[3];
-	uint16_t unk3a; // 3a set on I frame?
-
-	uint32_t f_code[4]; // 3c
-	uint32_t picture_coding_type; // 4c
-	uint32_t intra_dc_precision; // 50
-	uint32_t q_scale_type; // 54
-	uint32_t top_field_first; // 58
-	uint32_t full_pel_forward_vector; // 5c
-	uint32_t full_pel_backward_vector; // 60
-	uint8_t intra_quantizer_matrix[0x40]; // 64
-	uint8_t non_intra_quantizer_matrix[0x40]; // a4
+   uint16_t width; // 00 in mb units
+   uint16_t height; // 02 in mb units
+
+   uint32_t unk04; // 04 stride for Y?
+   uint32_t unk08; // 08 stride for CbCr?
+
+   uint32_t ofs[6]; // 1c..20 ofs
+   uint32_t bucket_size; // 24
+   uint32_t inter_ring_data_size; // 28
+   uint16_t unk2c; // 2c
+   uint16_t alternate_scan; // 2e
+   uint16_t unk30; // 30 not seen set yet
+   uint16_t picture_structure; // 32
+   uint16_t pad2[3];
+   uint16_t unk3a; // 3a set on I frame?
+
+   uint32_t f_code[4]; // 3c
+   uint32_t picture_coding_type; // 4c
+   uint32_t intra_dc_precision; // 50
+   uint32_t q_scale_type; // 54
+   uint32_t top_field_first; // 58
+   uint32_t full_pel_forward_vector; // 5c
+   uint32_t full_pel_backward_vector; // 60
+   uint8_t intra_quantizer_matrix[0x40]; // 64
+   uint8_t non_intra_quantizer_matrix[0x40]; // a4
 };
 
 struct mpeg4_picparm_vp {
-	uint32_t width; // 00 in normal units
-	uint32_t height; // 04 in normal units
-	uint32_t unk08; // stride 1
-	uint32_t unk0c; // stride 2
-	uint32_t ofs[6]; // 10..24 ofs
-	uint32_t bucket_size; // 28
-	uint32_t pad1; // 2c, pad
-	uint32_t pad2; // 30
-	uint32_t inter_ring_data_size; // 34
-
-	uint32_t trd[2]; // 38, 3c
-	uint32_t trb[2]; // 40, 44
-	uint32_t u48; // XXX codec selection? Should test with different values of VdpDecoderProfile
-	uint16_t f_code_fw; // 4c
-	uint16_t f_code_bw; // 4e
-	uint8_t interlaced; // 50
-
-	uint8_t quant_type; // bool, written to 528
-	uint8_t quarter_sample; // bool, written to 548
-	uint8_t short_video_header; // bool, negated written to 528 shifted by 1
-	uint8_t u54; // bool, written to 0x740
-	uint8_t vop_coding_type; // 55
-	uint8_t rounding_control; // 56
-	uint8_t alternate_vertical_scan_flag; // 57 bool
-	uint8_t top_field_first; // bool, written to vuc
-
-	uint8_t pad4[3]; // 59, 5a, 5b, contains garbage on blob
-
-	uint32_t intra[0x10]; // 5c
-	uint32_t non_intra[0x10]; // 9c
-	uint32_t pad5[0x10]; // bc what does this do?
-	// udc..uff pad?
+   uint32_t width; // 00 in normal units
+   uint32_t height; // 04 in normal units
+   uint32_t unk08; // stride 1
+   uint32_t unk0c; // stride 2
+   uint32_t ofs[6]; // 10..24 ofs
+   uint32_t bucket_size; // 28
+   uint32_t pad1; // 2c, pad
+   uint32_t pad2; // 30
+   uint32_t inter_ring_data_size; // 34
+
+   uint32_t trd[2]; // 38, 3c
+   uint32_t trb[2]; // 40, 44
+   uint32_t u48; // XXX codec selection? Should test with different values of VdpDecoderProfile
+   uint16_t f_code_fw; // 4c
+   uint16_t f_code_bw; // 4e
+   uint8_t interlaced; // 50
+
+   uint8_t quant_type; // bool, written to 528
+   uint8_t quarter_sample; // bool, written to 548
+   uint8_t short_video_header; // bool, negated written to 528 shifted by 1
+   uint8_t u54; // bool, written to 0x740
+   uint8_t vop_coding_type; // 55
+   uint8_t rounding_control; // 56
+   uint8_t alternate_vertical_scan_flag; // 57 bool
+   uint8_t top_field_first; // bool, written to vuc
+
+   uint8_t pad4[3]; // 59, 5a, 5b, contains garbage on blob
+
+   uint32_t intra[0x10]; // 5c
+   uint32_t non_intra[0x10]; // 9c
+   uint32_t pad5[0x10]; // bc what does this do?
+   // udc..uff pad?
 };
 
 // Full version, with data pumped from BSP
 struct vc1_picparm_vp {
-	uint32_t bucket_size; // 00
-	uint32_t pad; // 04
-
-	uint32_t inter_ring_data_size; // 08
-	uint32_t unk0c; // stride 1
-	uint32_t unk10; // stride 2
-	uint32_t ofs[6]; // 14..28 ofs
-
-	uint16_t width; // 2c
-	uint16_t height; // 2e
-
-	uint8_t profile; // 30 0 = simple, 1 = main, 2 = advanced
-	uint8_t loopfilter; // 31 written into vuc
-	uint8_t fastuvmc; // 32, written into vuc
-	uint8_t dquant; // 33
-
-	uint8_t overlap; // 34
-	uint8_t quantizer; // 35
-	uint8_t u36; // 36, bool
-	uint8_t pad2; // 37, to align to 0x38
+   uint32_t bucket_size; // 00
+   uint32_t pad; // 04
+
+   uint32_t inter_ring_data_size; // 08
+   uint32_t unk0c; // stride 1
+   uint32_t unk10; // stride 2
+   uint32_t ofs[6]; // 14..28 ofs
+
+   uint16_t width; // 2c
+   uint16_t height; // 2e
+
+   uint8_t profile; // 30 0 = simple, 1 = main, 2 = advanced
+   uint8_t loopfilter; // 31 written into vuc
+   uint8_t fastuvmc; // 32, written into vuc
+   uint8_t dquant; // 33
+
+   uint8_t overlap; // 34
+   uint8_t quantizer; // 35
+   uint8_t u36; // 36, bool
+   uint8_t pad2; // 37, to align to 0x38
 };
 
 struct h264_picparm_vp { // 700..a00
-	uint16_t width, height;
-	uint32_t stride1, stride2; // 04 08
-	uint32_t ofs[6]; // 0c..24 in-image offset
-
-	uint32_t tmp_stride;
-	uint32_t bucket_size; // 28 bucket size
-	uint32_t inter_ring_data_size; // 2c
-
-	unsigned mb_adaptive_frame_field_flag : 1; // 0
-	unsigned direct_8x8_inference_flag : 1; // 1 0x02: into vuc ofs 56
-	unsigned weighted_pred_flag : 1; // 2 0x04
-	unsigned constrained_intra_pred_flag : 1; // 3 0x08: into vuc ofs 68
-	unsigned is_reference : 1; // 4
-	unsigned interlace : 1; // 5 field_pic_flag
-	unsigned bottom_field_flag : 1; // 6
-	unsigned second_field : 1; // 7 0x80: nfi yet
-
-	signed log2_max_frame_num_minus4 : 4; // 31 0..3
-	unsigned chroma_format_idc : 2; // 31 4..5
-	unsigned pic_order_cnt_type : 2; // 31 6..7
-	signed pic_init_qp_minus26 : 6; // 32 0..5
-	signed chroma_qp_index_offset : 5; // 32 6..10
-	signed second_chroma_qp_index_offset : 5; // 32 11..15
-
-	unsigned weighted_bipred_idc : 2; // 34 0..1
-	unsigned fifo_dec_index : 7; // 34 2..8
-	unsigned tmp_idx : 5; // 34 9..13
-	unsigned frame_number : 16; // 34 14..29
-	unsigned u34_3030 : 1; // 34 30..30 pp.u34[30:30]
-	unsigned u34_3131 : 1; // 34 31..31 pad?
-
-	uint32_t field_order_cnt[2]; // 38, 3c
-
-	struct { // 40
-		unsigned fifo_idx : 7; // 00 0..6
-		unsigned tmp_idx : 5; // 00 7..11
-		unsigned top_is_reference : 1; // 00 12
-		unsigned bottom_is_reference : 1; // 00 13
-		unsigned is_long_term : 1; // 00 14
-		unsigned notseenyet : 1; // 00 15 pad?
-		unsigned field_pic_flag : 1; // 00 16
-		unsigned top_field_marking : 4; // 00 17..20
-		unsigned bottom_field_marking : 4; // 00 21..24
-		unsigned pad : 7; // 00 d25..31
-
-		uint32_t field_order_cnt[2]; // 04,08
-		uint32_t frame_idx; // 0c
-	} refs[0x10];
-
-	uint8_t m4x4[6][16]; // 140
-	uint8_t m8x8[2][64]; // 1a0
-	uint32_t u220; // 220 number of extra reorder_list to append?
-	uint8_t u224[0x20]; // 224..244 reorder_list append ?
-	uint8_t nfi244[0xb0]; // add some pad to make sure nulls are read
+   uint16_t width, height;
+   uint32_t stride1, stride2; // 04 08
+   uint32_t ofs[6]; // 0c..24 in-image offset
+
+   uint32_t tmp_stride;
+   uint32_t bucket_size; // 28 bucket size
+   uint32_t inter_ring_data_size; // 2c
+
+   unsigned mb_adaptive_frame_field_flag : 1; // 0
+   unsigned direct_8x8_inference_flag : 1; // 1 0x02: into vuc ofs 56
+   unsigned weighted_pred_flag : 1; // 2 0x04
+   unsigned constrained_intra_pred_flag : 1; // 3 0x08: into vuc ofs 68
+   unsigned is_reference : 1; // 4
+   unsigned interlace : 1; // 5 field_pic_flag
+   unsigned bottom_field_flag : 1; // 6
+   unsigned second_field : 1; // 7 0x80: nfi yet
+
+   signed log2_max_frame_num_minus4 : 4; // 31 0..3
+   unsigned chroma_format_idc : 2; // 31 4..5
+   unsigned pic_order_cnt_type : 2; // 31 6..7
+   signed pic_init_qp_minus26 : 6; // 32 0..5
+   signed chroma_qp_index_offset : 5; // 32 6..10
+   signed second_chroma_qp_index_offset : 5; // 32 11..15
+
+   unsigned weighted_bipred_idc : 2; // 34 0..1
+   unsigned fifo_dec_index : 7; // 34 2..8
+   unsigned tmp_idx : 5; // 34 9..13
+   unsigned frame_number : 16; // 34 14..29
+   unsigned u34_3030 : 1; // 34 30..30 pp.u34[30:30]
+   unsigned u34_3131 : 1; // 34 31..31 pad?
+
+   uint32_t field_order_cnt[2]; // 38, 3c
+
+   struct { // 40
+      unsigned fifo_idx : 7; // 00 0..6
+      unsigned tmp_idx : 5; // 00 7..11
+      unsigned top_is_reference : 1; // 00 12
+      unsigned bottom_is_reference : 1; // 00 13
+      unsigned is_long_term : 1; // 00 14
+      unsigned notseenyet : 1; // 00 15 pad?
+      unsigned field_pic_flag : 1; // 00 16
+      unsigned top_field_marking : 4; // 00 17..20
+      unsigned bottom_field_marking : 4; // 00 21..24
+      unsigned pad : 7; // 00 d25..31
+
+      uint32_t field_order_cnt[2]; // 04,08
+      uint32_t frame_idx; // 0c
+   } refs[0x10];
+
+   uint8_t m4x4[6][16]; // 140
+   uint8_t m8x8[2][64]; // 1a0
+   uint32_t u220; // 220 number of extra reorder_list to append?
+   uint8_t u224[0x20]; // 224..244 reorder_list append ?
+   uint8_t nfi244[0xb0]; // add some pad to make sure nulls are read
 };
 
 static void
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index a44fd3efcf7..1319c3290cf 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -65,18 +65,18 @@ PUSH_KICK(struct nouveau_pushbuf *push)
 static inline uint32_t
 nouveau_screen_transfer_flags(unsigned pipe)
 {
-	uint32_t flags = 0;
-
-	if (!(pipe & PIPE_TRANSFER_UNSYNCHRONIZED)) {
-		if (pipe & PIPE_TRANSFER_READ)
-			flags |= NOUVEAU_BO_RD;
-		if (pipe & PIPE_TRANSFER_WRITE)
-			flags |= NOUVEAU_BO_WR;
-		if (pipe & PIPE_TRANSFER_DONTBLOCK)
-			flags |= NOUVEAU_BO_NOBLOCK;
-	}
-
-	return flags;
+   uint32_t flags = 0;
+
+   if (!(pipe & PIPE_TRANSFER_UNSYNCHRONIZED)) {
+      if (pipe & PIPE_TRANSFER_READ)
+         flags |= NOUVEAU_BO_RD;
+      if (pipe & PIPE_TRANSFER_WRITE)
+         flags |= NOUVEAU_BO_WR;
+      if (pipe & PIPE_TRANSFER_DONTBLOCK)
+         flags |= NOUVEAU_BO_NOBLOCK;
+   }
+
+   return flags;
 }
 
 extern struct pipe_screen *
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 03301649e38..bdecb0a32b3 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -172,6 +172,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 69c121274a9..fb74a9748a3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -16,6 +16,7 @@
 #include "nv50/nv50_program.h"
 #include "nv50/nv50_resource.h"
 #include "nv50/nv50_transfer.h"
+#include "nv50/nv50_query.h"
 
 #include "nouveau_context.h"
 #include "nouveau_debug.h"
@@ -195,17 +196,6 @@ void nv50_default_kick_notify(struct nouveau_pushbuf *);
 /* nv50_draw.c */
 extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
 
-/* nv50_query.c */
-void nv50_init_query_functions(struct nv50_context *);
-void nv50_query_pushbuf_submit(struct nouveau_pushbuf *, uint16_t method,
-                               struct pipe_query *, unsigned result_offset);
-void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
-void nva0_so_target_save_offset(struct pipe_context *,
-                                struct pipe_stream_output_target *,
-                                unsigned index, bool seralize);
-
-#define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
-
 /* nv50_shader_state.c */
 void nv50_vertprog_validate(struct nv50_context *);
 void nv50_gmtyprog_validate(struct nv50_context *);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index eff4477472c..299629b6438 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -336,7 +336,6 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
    info->io.ucpCBSlot = 15;
    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
    info->io.genUserClip = prog->vp.clpd_nr;
-   info->io.sampleInterp = prog->fp.sample_interp;
 
    info->io.resInfoCBSlot = 15;
    info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
@@ -374,6 +373,7 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
    prog->code = info->bin.code;
    prog->code_size = info->bin.codeSize;
    prog->fixups = info->bin.relocData;
+   prog->interps = info->bin.interpData;
    prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
    prog->tls_space = info->bin.tlsSpace;
 
@@ -420,8 +420,8 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
 
    switch (prog->type) {
    case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
-   case PIPE_SHADER_GEOMETRY: heap = nv50->screen->fp_code_heap; break;
-   case PIPE_SHADER_FRAGMENT: heap = nv50->screen->gp_code_heap; break;
+   case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
+   case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
    default:
       assert(!"invalid program type");
       return false;
@@ -456,6 +456,10 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
 
    if (prog->fixups)
       nv50_ir_relocate_code(prog->fixups, prog->code, prog->code_base, 0, 0);
+   if (prog->interps)
+      nv50_ir_change_interp(prog->interps, prog->code,
+                            prog->fp.force_persample_interp,
+                            false /* flatshade */);
 
    nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
                        (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
index f4e8e9402ca..24cc96567d7 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -86,7 +86,7 @@ struct nv50_program {
       uint32_t interp; /* 0x1988 */
       uint32_t colors; /* 0x1904 */
       uint8_t has_samplemask;
-      uint8_t sample_interp;
+      uint8_t force_persample_interp;
    } fp;
 
    struct {
@@ -99,6 +99,7 @@ struct nv50_program {
    } gp;
 
    void *fixups; /* relocation records */
+   void *interps; /* interpolation records */
 
    struct nouveau_heap *mem;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index 5368ee73750..dd9b85b7208 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -25,356 +25,46 @@
 #define NV50_PUSH_EXPLICIT_SPACE_CHECKING
 
 #include "nv50/nv50_context.h"
-#include "nv_object.xml.h"
-
-#define NV50_QUERY_STATE_READY   0
-#define NV50_QUERY_STATE_ACTIVE  1
-#define NV50_QUERY_STATE_ENDED   2
-#define NV50_QUERY_STATE_FLUSHED 3
-
-/* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
- * (since we use only a single GPU channel per screen) will not work properly.
- *
- * The first is not that big of an issue because OpenGL does not allow nested
- * queries anyway.
- */
-
-struct nv50_query {
-   uint32_t *data;
-   uint16_t type;
-   uint16_t index;
-   uint32_t sequence;
-   struct nouveau_bo *bo;
-   uint32_t base;
-   uint32_t offset; /* base + i * 32 */
-   uint8_t state;
-   bool is64bit;
-   int nesting; /* only used for occlusion queries */
-   struct nouveau_mm_allocation *mm;
-   struct nouveau_fence *fence;
-};
-
-#define NV50_QUERY_ALLOC_SPACE 256
-
-static inline struct nv50_query *
-nv50_query(struct pipe_query *pipe)
-{
-   return (struct nv50_query *)pipe;
-}
-
-static bool
-nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
-{
-   struct nv50_screen *screen = nv50->screen;
-   int ret;
-
-   if (q->bo) {
-      nouveau_bo_ref(NULL, &q->bo);
-      if (q->mm) {
-         if (q->state == NV50_QUERY_STATE_READY)
-            nouveau_mm_free(q->mm);
-         else
-            nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work,
-                               q->mm);
-      }
-   }
-   if (size) {
-      q->mm = nouveau_mm_allocate(screen->base.mm_GART, size, &q->bo, &q->base);
-      if (!q->bo)
-         return false;
-      q->offset = q->base;
-
-      ret = nouveau_bo_map(q->bo, 0, screen->base.client);
-      if (ret) {
-         nv50_query_allocate(nv50, q, 0);
-         return false;
-      }
-      q->data = (uint32_t *)((uint8_t *)q->bo->map + q->base);
-   }
-   return true;
-}
-
-static void
-nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
-{
-   nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0);
-   nouveau_fence_ref(NULL, &nv50_query(pq)->fence);
-   FREE(nv50_query(pq));
-}
+#include "nv50/nv50_query.h"
+#include "nv50/nv50_query_hw.h"
 
 static struct pipe_query *
-nv50_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
+nv50_create_query(struct pipe_context *pipe, unsigned type, unsigned index)
 {
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nv50_query *q;
 
-   q = CALLOC_STRUCT(nv50_query);
-   if (!q)
-      return NULL;
-
-   if (!nv50_query_allocate(nv50, q, NV50_QUERY_ALLOC_SPACE)) {
-      FREE(q);
-      return NULL;
-   }
-
-   q->is64bit = (type == PIPE_QUERY_PRIMITIVES_GENERATED ||
-                 type == PIPE_QUERY_PRIMITIVES_EMITTED ||
-                 type == PIPE_QUERY_SO_STATISTICS ||
-                 type == PIPE_QUERY_PIPELINE_STATISTICS);
-   q->type = type;
-
-   if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
-      q->offset -= 32;
-      q->data -= 32 / sizeof(*q->data); /* we advance before query_begin ! */
-   }
-
+   q = nv50_hw_create_query(nv50, type, index);
    return (struct pipe_query *)q;
 }
 
 static void
-nv50_query_get(struct nouveau_pushbuf *push, struct nv50_query *q,
-               unsigned offset, uint32_t get)
+nv50_destroy_query(struct pipe_context *pipe, struct pipe_query *pq)
 {
-   offset += q->offset;
-
-   PUSH_SPACE(push, 5);
-   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
-   BEGIN_NV04(push, NV50_3D(QUERY_ADDRESS_HIGH), 4);
-   PUSH_DATAh(push, q->bo->offset + offset);
-   PUSH_DATA (push, q->bo->offset + offset);
-   PUSH_DATA (push, q->sequence);
-   PUSH_DATA (push, get);
+   struct nv50_query *q = nv50_query(pq);
+   q->funcs->destroy_query(nv50_context(pipe), q);
 }
 
 static boolean
-nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+nv50_begin_query(struct pipe_context *pipe, struct pipe_query *pq)
 {
-   struct nv50_context *nv50 = nv50_context(pipe);
-   struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nv50_query *q = nv50_query(pq);
-
-   /* For occlusion queries we have to change the storage, because a previous
-    * query might set the initial render conition to false even *after* we re-
-    * initialized it to true.
-    */
-   if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
-      q->offset += 32;
-      q->data += 32 / sizeof(*q->data);
-      if (q->offset - q->base == NV50_QUERY_ALLOC_SPACE)
-         nv50_query_allocate(nv50, q, NV50_QUERY_ALLOC_SPACE);
-
-      /* XXX: can we do this with the GPU, and sync with respect to a previous
-       *  query ?
-       */
-      q->data[0] = q->sequence; /* initialize sequence */
-      q->data[1] = 1; /* initial render condition = true */
-      q->data[4] = q->sequence + 1; /* for comparison COND_MODE */
-      q->data[5] = 0;
-   }
-   if (!q->is64bit)
-      q->data[0] = q->sequence++; /* the previously used one */
-
-   switch (q->type) {
-   case PIPE_QUERY_OCCLUSION_COUNTER:
-      q->nesting = nv50->screen->num_occlusion_queries_active++;
-      if (q->nesting) {
-         nv50_query_get(push, q, 0x10, 0x0100f002);
-      } else {
-         PUSH_SPACE(push, 4);
-         BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
-         PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT);
-         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
-         PUSH_DATA (push, 1);
-      }
-      break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED:
-      nv50_query_get(push, q, 0x10, 0x06805002);
-      break;
-   case PIPE_QUERY_PRIMITIVES_EMITTED:
-      nv50_query_get(push, q, 0x10, 0x05805002);
-      break;
-   case PIPE_QUERY_SO_STATISTICS:
-      nv50_query_get(push, q, 0x20, 0x05805002);
-      nv50_query_get(push, q, 0x30, 0x06805002);
-      break;
-   case PIPE_QUERY_PIPELINE_STATISTICS:
-      nv50_query_get(push, q, 0x80, 0x00801002); /* VFETCH, VERTICES */
-      nv50_query_get(push, q, 0x90, 0x01801002); /* VFETCH, PRIMS */
-      nv50_query_get(push, q, 0xa0, 0x02802002); /* VP, LAUNCHES */
-      nv50_query_get(push, q, 0xb0, 0x03806002); /* GP, LAUNCHES */
-      nv50_query_get(push, q, 0xc0, 0x04806002); /* GP, PRIMS_OUT */
-      nv50_query_get(push, q, 0xd0, 0x07804002); /* RAST, PRIMS_IN */
-      nv50_query_get(push, q, 0xe0, 0x08804002); /* RAST, PRIMS_OUT */
-      nv50_query_get(push, q, 0xf0, 0x0980a002); /* ROP, PIXELS */
-      break;
-   case PIPE_QUERY_TIME_ELAPSED:
-      nv50_query_get(push, q, 0x10, 0x00005002);
-      break;
-   default:
-      break;
-   }
-   q->state = NV50_QUERY_STATE_ACTIVE;
-   return true;
+   return q->funcs->begin_query(nv50_context(pipe), q);
 }
 
 static void
-nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+nv50_end_query(struct pipe_context *pipe, struct pipe_query *pq)
 {
-   struct nv50_context *nv50 = nv50_context(pipe);
-   struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nv50_query *q = nv50_query(pq);
-
-   q->state = NV50_QUERY_STATE_ENDED;
-
-   switch (q->type) {
-   case PIPE_QUERY_OCCLUSION_COUNTER:
-      nv50_query_get(push, q, 0, 0x0100f002);
-      if (--nv50->screen->num_occlusion_queries_active == 0) {
-         PUSH_SPACE(push, 2);
-         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
-         PUSH_DATA (push, 0);
-      }
-      break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED:
-      nv50_query_get(push, q, 0, 0x06805002);
-      break;
-   case PIPE_QUERY_PRIMITIVES_EMITTED:
-      nv50_query_get(push, q, 0, 0x05805002);
-      break;
-   case PIPE_QUERY_SO_STATISTICS:
-      nv50_query_get(push, q, 0x00, 0x05805002);
-      nv50_query_get(push, q, 0x10, 0x06805002);
-      break;
-   case PIPE_QUERY_PIPELINE_STATISTICS:
-      nv50_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
-      nv50_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
-      nv50_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
-      nv50_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
-      nv50_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
-      nv50_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
-      nv50_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
-      nv50_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
-      break;
-   case PIPE_QUERY_TIMESTAMP:
-      q->sequence++;
-      /* fall through */
-   case PIPE_QUERY_TIME_ELAPSED:
-      nv50_query_get(push, q, 0, 0x00005002);
-      break;
-   case PIPE_QUERY_GPU_FINISHED:
-      q->sequence++;
-      nv50_query_get(push, q, 0, 0x1000f010);
-      break;
-   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
-      q->sequence++;
-      nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
-      break;
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      /* This query is not issued on GPU because disjoint is forced to false */
-      q->state = NV50_QUERY_STATE_READY;
-      break;
-   default:
-      assert(0);
-      break;
-   }
-
-   if (q->is64bit)
-      nouveau_fence_ref(nv50->screen->base.fence.current, &q->fence);
-}
-
-static inline void
-nv50_query_update(struct nv50_query *q)
-{
-   if (q->is64bit) {
-      if (nouveau_fence_signalled(q->fence))
-         q->state = NV50_QUERY_STATE_READY;
-   } else {
-      if (q->data[0] == q->sequence)
-         q->state = NV50_QUERY_STATE_READY;
-   }
+   q->funcs->end_query(nv50_context(pipe), q);
 }
 
 static boolean
-nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
-                  boolean wait, union pipe_query_result *result)
+nv50_get_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+                      boolean wait, union pipe_query_result *result)
 {
-   struct nv50_context *nv50 = nv50_context(pipe);
    struct nv50_query *q = nv50_query(pq);
-   uint64_t *res64 = (uint64_t *)result;
-   uint32_t *res32 = (uint32_t *)result;
-   uint8_t *res8 = (uint8_t *)result;
-   uint64_t *data64 = (uint64_t *)q->data;
-   int i;
-
-   if (q->state != NV50_QUERY_STATE_READY)
-      nv50_query_update(q);
-
-   if (q->state != NV50_QUERY_STATE_READY) {
-      if (!wait) {
-         /* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
-         if (q->state != NV50_QUERY_STATE_FLUSHED) {
-            q->state = NV50_QUERY_STATE_FLUSHED;
-            PUSH_KICK(nv50->base.pushbuf);
-         }
-         return false;
-      }
-      if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client))
-         return false;
-   }
-   q->state = NV50_QUERY_STATE_READY;
-
-   switch (q->type) {
-   case PIPE_QUERY_GPU_FINISHED:
-      res8[0] = true;
-      break;
-   case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
-      res64[0] = q->data[1] - q->data[5];
-      break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
-   case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
-      res64[0] = data64[0] - data64[2];
-      break;
-   case PIPE_QUERY_SO_STATISTICS:
-      res64[0] = data64[0] - data64[4];
-      res64[1] = data64[2] - data64[6];
-      break;
-   case PIPE_QUERY_PIPELINE_STATISTICS:
-      for (i = 0; i < 8; ++i)
-         res64[i] = data64[i * 2] - data64[16 + i * 2];
-      break;
-   case PIPE_QUERY_TIMESTAMP:
-      res64[0] = data64[1];
-      break;
-   case PIPE_QUERY_TIMESTAMP_DISJOINT:
-      res64[0] = 1000000000;
-      res8[8] = false;
-      break;
-   case PIPE_QUERY_TIME_ELAPSED:
-      res64[0] = data64[1] - data64[3];
-      break;
-   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
-      res32[0] = q->data[1];
-      break;
-   default:
-      return false;
-   }
-
-   return true;
-}
-
-void
-nv84_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
-{
-   struct nv50_query *q = nv50_query(pq);
-   unsigned offset = q->offset;
-
-   PUSH_SPACE(push, 5);
-   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
-   BEGIN_NV04(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
-   PUSH_DATAh(push, q->bo->offset + offset);
-   PUSH_DATA (push, q->bo->offset + offset);
-   PUSH_DATA (push, q->sequence);
-   PUSH_DATA (push, NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+   return q->funcs->get_query_result(nv50_context(pipe), q, wait, result);
 }
 
 static void
@@ -384,7 +74,8 @@ nv50_render_condition(struct pipe_context *pipe,
 {
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
-   struct nv50_query *q;
+   struct nv50_query *q = nv50_query(pq);
+   struct nv50_hw_query *hq = nv50_hw_query(q);
    uint32_t cond;
    bool wait =
       mode != PIPE_RENDER_COND_NO_WAIT &&
@@ -394,7 +85,6 @@ nv50_render_condition(struct pipe_context *pipe,
       cond = NV50_3D_COND_MODE_ALWAYS;
    }
    else {
-      q = nv50_query(pq);
       /* NOTE: comparison of 2 queries only works if both have completed */
       switch (q->type) {
       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
@@ -405,7 +95,7 @@ nv50_render_condition(struct pipe_context *pipe,
       case PIPE_QUERY_OCCLUSION_COUNTER:
       case PIPE_QUERY_OCCLUSION_PREDICATE:
          if (likely(!condition)) {
-            if (unlikely(q->nesting))
+            if (unlikely(hq->nesting))
                cond = wait ? NV50_3D_COND_MODE_NOT_EQUAL :
                              NV50_3D_COND_MODE_ALWAYS;
             else
@@ -440,48 +130,15 @@ nv50_render_condition(struct pipe_context *pipe,
       PUSH_DATA (push, 0);
    }
 
-   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
    BEGIN_NV04(push, NV50_3D(COND_ADDRESS_HIGH), 3);
-   PUSH_DATAh(push, q->bo->offset + q->offset);
-   PUSH_DATA (push, q->bo->offset + q->offset);
+   PUSH_DATAh(push, hq->bo->offset + hq->offset);
+   PUSH_DATA (push, hq->bo->offset + hq->offset);
    PUSH_DATA (push, cond);
 
    BEGIN_NV04(push, NV50_2D(COND_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, q->bo->offset + q->offset);
-   PUSH_DATA (push, q->bo->offset + q->offset);
-}
-
-void
-nv50_query_pushbuf_submit(struct nouveau_pushbuf *push, uint16_t method,
-                          struct pipe_query *pq, unsigned result_offset)
-{
-   struct nv50_query *q = nv50_query(pq);
-
-   nv50_query_update(q);
-   if (q->state != NV50_QUERY_STATE_READY)
-      nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, push->client);
-   q->state = NV50_QUERY_STATE_READY;
-
-   BEGIN_NV04(push, SUBC_3D(method), 1);
-   PUSH_DATA (push, q->data[result_offset / 4]);
-}
-
-void
-nva0_so_target_save_offset(struct pipe_context *pipe,
-                           struct pipe_stream_output_target *ptarg,
-                           unsigned index, bool serialize)
-{
-   struct nv50_so_target *targ = nv50_so_target(ptarg);
-
-   if (serialize) {
-      struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
-      PUSH_SPACE(push, 2);
-      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
-      PUSH_DATA (push, 0);
-   }
-
-   nv50_query(targ->pq)->index = index;
-   nv50_query_end(pipe, targ->pq);
+   PUSH_DATAh(push, hq->bo->offset + hq->offset);
+   PUSH_DATA (push, hq->bo->offset + hq->offset);
 }
 
 void
@@ -489,10 +146,10 @@ nv50_init_query_functions(struct nv50_context *nv50)
 {
    struct pipe_context *pipe = &nv50->base.pipe;
 
-   pipe->create_query = nv50_query_create;
-   pipe->destroy_query = nv50_query_destroy;
-   pipe->begin_query = nv50_query_begin;
-   pipe->end_query = nv50_query_end;
-   pipe->get_query_result = nv50_query_result;
+   pipe->create_query = nv50_create_query;
+   pipe->destroy_query = nv50_destroy_query;
+   pipe->begin_query = nv50_begin_query;
+   pipe->end_query = nv50_end_query;
+   pipe->get_query_result = nv50_get_query_result;
    pipe->render_condition = nv50_render_condition;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.h b/src/gallium/drivers/nouveau/nv50/nv50_query.h
new file mode 100644
index 00000000000..d990285c857
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.h
@@ -0,0 +1,33 @@
+#ifndef __NV50_QUERY_H__
+#define __NV50_QUERY_H__
+
+#include "pipe/p_context.h"
+
+#include "nouveau_context.h"
+
+struct nv50_context;
+struct nv50_query;
+
+struct nv50_query_funcs {
+   void (*destroy_query)(struct nv50_context *, struct nv50_query *);
+   boolean (*begin_query)(struct nv50_context *, struct nv50_query *);
+   void (*end_query)(struct nv50_context *, struct nv50_query *);
+   boolean (*get_query_result)(struct nv50_context *, struct nv50_query *,
+                               boolean, union pipe_query_result *);
+};
+
+struct nv50_query {
+   const struct nv50_query_funcs *funcs;
+   uint16_t type;
+   uint16_t index;
+};
+
+static inline struct nv50_query *
+nv50_query(struct pipe_query *pipe)
+{
+   return (struct nv50_query *)pipe;
+}
+
+void nv50_init_query_functions(struct nv50_context *);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
new file mode 100644
index 00000000000..945ce7abe50
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define NV50_PUSH_EXPLICIT_SPACE_CHECKING
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_query_hw.h"
+#include "nv_object.xml.h"
+
+#define NV50_HW_QUERY_STATE_READY   0
+#define NV50_HW_QUERY_STATE_ACTIVE  1
+#define NV50_HW_QUERY_STATE_ENDED   2
+#define NV50_HW_QUERY_STATE_FLUSHED 3
+
+/* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
+ * (since we use only a single GPU channel per screen) will not work properly.
+ *
+ * The first is not that big of an issue because OpenGL does not allow nested
+ * queries anyway.
+ */
+
+#define NV50_HW_QUERY_ALLOC_SPACE 256
+
+static bool
+nv50_hw_query_allocate(struct nv50_context *nv50, struct nv50_query *q,
+                       int size)
+{
+   struct nv50_screen *screen = nv50->screen;
+   struct nv50_hw_query *hq = nv50_hw_query(q);
+   int ret;
+
+   if (hq->bo) {
+      nouveau_bo_ref(NULL, &hq->bo);
+      if (hq->mm) {
+         if (hq->state == NV50_HW_QUERY_STATE_READY)
+            nouveau_mm_free(hq->mm);
+         else
+            nouveau_fence_work(screen->base.fence.current,
+                               nouveau_mm_free_work, hq->mm);
+      }
+   }
+   if (size) {
+      hq->mm = nouveau_mm_allocate(screen->base.mm_GART, size,
+                                   &hq->bo, &hq->base_offset);
+      if (!hq->bo)
+         return false;
+      hq->offset = hq->base_offset;
+
+      ret = nouveau_bo_map(hq->bo, 0, screen->base.client);
+      if (ret) {
+         nv50_hw_query_allocate(nv50, q, 0);
+         return false;
+      }
+      hq->data = (uint32_t *)((uint8_t *)hq->bo->map + hq->base_offset);
+   }
+   return true;
+}
+
+static void
+nv50_hw_query_get(struct nouveau_pushbuf *push, struct nv50_query *q,
+               unsigned offset, uint32_t get)
+{
+   struct nv50_hw_query *hq = nv50_hw_query(q);
+
+   offset += hq->offset;
+
+   PUSH_SPACE(push, 5);
+   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+   BEGIN_NV04(push, NV50_3D(QUERY_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, hq->bo->offset + offset);
+   PUSH_DATA (push, hq->bo->offset + offset);
+   PUSH_DATA (push, hq->sequence);
+   PUSH_DATA (push, get);
+}
+
+static inline void
+nv50_hw_query_update(struct nv50_query *q)
+{
+   struct nv50_hw_query *hq = nv50_hw_query(q);
+
+   if (hq->is64bit) {
+      if (nouveau_fence_signalled(hq->fence))
+         hq->state = NV50_HW_QUERY_STATE_READY;
+   } else {
+      if (hq->data[0] == hq->sequence)
+         hq->state = NV50_HW_QUERY_STATE_READY;
+   }
+}
+
+static void
+nv50_hw_destroy_query(struct nv50_context *nv50, struct nv50_query *q)
+{
+   struct nv50_hw_query *hq = nv50_hw_query(q);
+   nv50_hw_query_allocate(nv50, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
+}
+
+static boolean
+nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_hw_query *hq = nv50_hw_query(q);
+
+   /* For occlusion queries we have to change the storage, because a previous
+    * query might set the initial render condition to false even *after* we re-
+    * initialized it to true.
+    */
+   if (hq->rotate) {
+      hq->offset += hq->rotate;
+      hq->data += hq->rotate / sizeof(*hq->data);
+      if (hq->offset - hq->base_offset == NV50_HW_QUERY_ALLOC_SPACE)
+         nv50_hw_query_allocate(nv50, q, NV50_HW_QUERY_ALLOC_SPACE);
+
+      /* XXX: can we do this with the GPU, and sync with respect to a previous
+       *  query ?
+       */
+      hq->data[0] = hq->sequence; /* initialize sequence */
+      hq->data[1] = 1; /* initial render condition = true */
+      hq->data[4] = hq->sequence + 1; /* for comparison COND_MODE */
+      hq->data[5] = 0;
+   }
+   if (!hq->is64bit)
+      hq->data[0] = hq->sequence++; /* the previously used one */
+
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+      hq->nesting = nv50->screen->num_occlusion_queries_active++;
+      if (hq->nesting) {
+         nv50_hw_query_get(push, q, 0x10, 0x0100f002);
+      } else {
+         PUSH_SPACE(push, 4);
+         BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
+         PUSH_DATA (push, NV50_3D_COUNTER_RESET_SAMPLECNT);
+         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+         PUSH_DATA (push, 1);
+      }
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      nv50_hw_query_get(push, q, 0x10, 0x06805002);
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      nv50_hw_query_get(push, q, 0x10, 0x05805002);
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      nv50_hw_query_get(push, q, 0x20, 0x05805002);
+      nv50_hw_query_get(push, q, 0x30, 0x06805002);
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      nv50_hw_query_get(push, q, 0x80, 0x00801002); /* VFETCH, VERTICES */
+      nv50_hw_query_get(push, q, 0x90, 0x01801002); /* VFETCH, PRIMS */
+      nv50_hw_query_get(push, q, 0xa0, 0x02802002); /* VP, LAUNCHES */
+      nv50_hw_query_get(push, q, 0xb0, 0x03806002); /* GP, LAUNCHES */
+      nv50_hw_query_get(push, q, 0xc0, 0x04806002); /* GP, PRIMS_OUT */
+      nv50_hw_query_get(push, q, 0xd0, 0x07804002); /* RAST, PRIMS_IN */
+      nv50_hw_query_get(push, q, 0xe0, 0x08804002); /* RAST, PRIMS_OUT */
+      nv50_hw_query_get(push, q, 0xf0, 0x0980a002); /* ROP, PIXELS */
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      nv50_hw_query_get(push, q, 0x10, 0x00005002);
+      break;
+   default:
+      assert(0);
+      return false;
+   }
+   hq->state = NV50_HW_QUERY_STATE_ACTIVE;
+   return true;
+}
+
+static void
+nv50_hw_end_query(struct nv50_context *nv50, struct nv50_query *q)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_hw_query *hq = nv50_hw_query(q);
+
+   hq->state = NV50_HW_QUERY_STATE_ENDED;
+
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+      nv50_hw_query_get(push, q, 0, 0x0100f002);
+      if (--nv50->screen->num_occlusion_queries_active == 0) {
+         PUSH_SPACE(push, 2);
+         BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
+         PUSH_DATA (push, 0);
+      }
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      nv50_hw_query_get(push, q, 0, 0x06805002);
+      break;
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+      nv50_hw_query_get(push, q, 0, 0x05805002);
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      nv50_hw_query_get(push, q, 0x00, 0x05805002);
+      nv50_hw_query_get(push, q, 0x10, 0x06805002);
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      nv50_hw_query_get(push, q, 0x00, 0x00801002); /* VFETCH, VERTICES */
+      nv50_hw_query_get(push, q, 0x10, 0x01801002); /* VFETCH, PRIMS */
+      nv50_hw_query_get(push, q, 0x20, 0x02802002); /* VP, LAUNCHES */
+      nv50_hw_query_get(push, q, 0x30, 0x03806002); /* GP, LAUNCHES */
+      nv50_hw_query_get(push, q, 0x40, 0x04806002); /* GP, PRIMS_OUT */
+      nv50_hw_query_get(push, q, 0x50, 0x07804002); /* RAST, PRIMS_IN */
+      nv50_hw_query_get(push, q, 0x60, 0x08804002); /* RAST, PRIMS_OUT */
+      nv50_hw_query_get(push, q, 0x70, 0x0980a002); /* ROP, PIXELS */
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+      hq->sequence++;
+      /* fall through */
+   case PIPE_QUERY_TIME_ELAPSED:
+      nv50_hw_query_get(push, q, 0, 0x00005002);
+      break;
+   case PIPE_QUERY_GPU_FINISHED:
+      hq->sequence++;
+      nv50_hw_query_get(push, q, 0, 0x1000f010);
+      break;
+   case NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      hq->sequence++;
+      nv50_hw_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
+      break;
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      /* This query is not issued on GPU because disjoint is forced to false */
+      hq->state = NV50_HW_QUERY_STATE_READY;
+      break;
+   default:
+      assert(0);
+      break;
+   }
+   if (hq->is64bit)
+      nouveau_fence_ref(nv50->screen->base.fence.current, &hq->fence);
+}
+
+static boolean
+nv50_hw_get_query_result(struct nv50_context *nv50, struct nv50_query *q,
+                         boolean wait, union pipe_query_result *result)
+{
+   struct nv50_hw_query *hq = nv50_hw_query(q);
+   uint64_t *res64 = (uint64_t *)result;
+   uint32_t *res32 = (uint32_t *)result;
+   uint8_t *res8 = (uint8_t *)result;
+   uint64_t *data64 = (uint64_t *)hq->data;
+   int i;
+
+   if (hq->state != NV50_HW_QUERY_STATE_READY)
+      nv50_hw_query_update(q);
+
+   if (hq->state != NV50_HW_QUERY_STATE_READY) {
+      if (!wait) {
+         /* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
+         if (hq->state != NV50_HW_QUERY_STATE_FLUSHED) {
+            hq->state = NV50_HW_QUERY_STATE_FLUSHED;
+            PUSH_KICK(nv50->base.pushbuf);
+         }
+         return false;
+      }
+      if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nv50->screen->base.client))
+         return false;
+   }
+   hq->state = NV50_HW_QUERY_STATE_READY;
+
+   switch (q->type) {
+   case PIPE_QUERY_GPU_FINISHED:
+      res8[0] = true;
+      break;
+   case PIPE_QUERY_OCCLUSION_COUNTER: /* u32 sequence, u32 count, u64 time */
+      res64[0] = hq->data[1] - hq->data[5];
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
+   case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
+      res64[0] = data64[0] - data64[2];
+      break;
+   case PIPE_QUERY_SO_STATISTICS:
+      res64[0] = data64[0] - data64[4];
+      res64[1] = data64[2] - data64[6];
+      break;
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      for (i = 0; i < 8; ++i)
+         res64[i] = data64[i * 2] - data64[16 + i * 2];
+      break;
+   case PIPE_QUERY_TIMESTAMP:
+      res64[0] = data64[1];
+      break;
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+      res64[0] = 1000000000;
+      res8[8] = false;
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+      res64[0] = data64[1] - data64[3];
+      break;
+   case NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      res32[0] = hq->data[1];
+      break;
+   default:
+      assert(0);
+      return false;
+   }
+
+   return true;
+}
+
+static const struct nv50_query_funcs hw_query_funcs = {
+   .destroy_query = nv50_hw_destroy_query,
+   .begin_query = nv50_hw_begin_query,
+   .end_query = nv50_hw_end_query,
+   .get_query_result = nv50_hw_get_query_result,
+};
+
+struct nv50_query *
+nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
+{
+   struct nv50_hw_query *hq;
+   struct nv50_query *q;
+
+   hq = CALLOC_STRUCT(nv50_hw_query);
+   if (!hq)
+      return NULL;
+
+   q = &hq->base;
+   q->funcs = &hw_query_funcs;
+   q->type = type;
+
+   switch (q->type) {
+   case PIPE_QUERY_OCCLUSION_COUNTER:
+      hq->rotate = 32;
+      break;
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+   case PIPE_QUERY_PRIMITIVES_EMITTED:
+   case PIPE_QUERY_SO_STATISTICS:
+   case PIPE_QUERY_PIPELINE_STATISTICS:
+      hq->is64bit = true;
+      break;
+   case PIPE_QUERY_TIME_ELAPSED:
+   case PIPE_QUERY_TIMESTAMP:
+   case PIPE_QUERY_TIMESTAMP_DISJOINT:
+   case PIPE_QUERY_GPU_FINISHED:
+   case NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      break;
+   default:
+      debug_printf("invalid query type: %u\n", type);
+      FREE(q);
+      return NULL;
+   }
+
+   if (!nv50_hw_query_allocate(nv50, q, NV50_HW_QUERY_ALLOC_SPACE)) {
+      FREE(hq);
+      return NULL;
+   }
+
+   if (hq->rotate) {
+      /* we advance before query_begin ! */
+      hq->offset -= hq->rotate;
+      hq->data -= hq->rotate / sizeof(*hq->data);
+   }
+
+   return q;
+}
+
+void
+nv50_hw_query_pushbuf_submit(struct nouveau_pushbuf *push, uint16_t method,
+                             struct nv50_query *q, unsigned result_offset)
+{
+   struct nv50_hw_query *hq = nv50_hw_query(q);
+
+   nv50_hw_query_update(q);
+   if (hq->state != NV50_HW_QUERY_STATE_READY)
+      nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, push->client);
+   hq->state = NV50_HW_QUERY_STATE_READY;
+
+   BEGIN_NV04(push, SUBC_3D(method), 1);
+   PUSH_DATA (push, hq->data[result_offset / 4]);
+}
+
+void
+nv84_hw_query_fifo_wait(struct nouveau_pushbuf *push, struct nv50_query *q)
+{
+   struct nv50_hw_query *hq = nv50_hw_query(q);
+   unsigned offset = hq->offset;
+
+   PUSH_SPACE(push, 5);
+   PUSH_REFN (push, hq->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   BEGIN_NV04(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, hq->bo->offset + offset);
+   PUSH_DATA (push, hq->bo->offset + offset);
+   PUSH_DATA (push, hq->sequence);
+   PUSH_DATA (push, NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
new file mode 100644
index 00000000000..294c67de9a4
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
@@ -0,0 +1,40 @@
+#ifndef __NV50_QUERY_HW_H__
+#define __NV50_QUERY_HW_H__
+
+#include "nouveau_fence.h"
+#include "nouveau_mm.h"
+
+#include "nv50_query.h"
+
+#define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
+
+struct nv50_hw_query {
+   struct nv50_query base;
+   uint32_t *data;
+   uint32_t sequence;
+   struct nouveau_bo *bo;
+   uint32_t base_offset;
+   uint32_t offset; /* base + i * rotate */
+   uint8_t state;
+   bool is64bit;
+   uint8_t rotate;
+   int nesting; /* only used for occlusion queries */
+   struct nouveau_mm_allocation *mm;
+   struct nouveau_fence *fence;
+};
+
+static inline struct nv50_hw_query *
+nv50_hw_query(struct nv50_query *q)
+{
+   return (struct nv50_hw_query *)q;
+}
+
+struct nv50_query *
+nv50_hw_create_query(struct nv50_context *, unsigned, unsigned);
+void
+nv50_hw_query_pushbuf_submit(struct nouveau_pushbuf *, uint16_t,
+                             struct nv50_query *, unsigned);
+void
+nv84_hw_query_fifo_wait(struct nouveau_pushbuf *, struct nv50_query *);
+
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.c b/src/gallium/drivers/nouveau/nv50/nv50_resource.c
index d289b4a24e8..325c19fb80c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_resource.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.c
@@ -32,8 +32,8 @@ nv50_resource_from_handle(struct pipe_screen * screen,
 
 struct pipe_surface *
 nv50_surface_from_buffer(struct pipe_context *pipe,
-			 struct pipe_resource *pbuf,
-			 const struct pipe_surface *templ)
+                         struct pipe_resource *pbuf,
+                         const struct pipe_surface *templ)
 {
    struct nv50_surface *sf = CALLOC_STRUCT(nv50_surface);
    if (!sf)
@@ -65,8 +65,8 @@ nv50_surface_from_buffer(struct pipe_context *pipe,
 
 static struct pipe_surface *
 nv50_surface_create(struct pipe_context *pipe,
-		    struct pipe_resource *pres,
-		    const struct pipe_surface *templ)
+                    struct pipe_resource *pres,
+                    const struct pipe_surface *templ)
 {
    /* surfaces are assumed to be miptrees all over the place. */
    assert(pres->target != PIPE_BUFFER);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index ec51d00f266..a9e0c478322 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -180,6 +180,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -191,6 +193,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_INDEP_BLEND_FUNC:
    case PIPE_CAP_TEXTURE_QUERY_LOD:
    case PIPE_CAP_SAMPLE_SHADING:
+   case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
       return class_3d >= NVA3_3D_CLASS;
 
    /* unsupported caps */
@@ -215,8 +218,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
-   case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
-   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 941555ffbf8..9b911043132 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -27,6 +27,7 @@
 #include "util/u_inlines.h"
 
 #include "nv50/nv50_context.h"
+#include "nv50/nv50_query_hw.h"
 
 void
 nv50_constbufs_validate(struct nv50_context *nv50)
@@ -168,11 +169,23 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 {
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nv50_program *fp = nv50->fragprog;
+   struct pipe_rasterizer_state *rast = &nv50->rast->pipe;
 
-   fp->fp.sample_interp = nv50->min_samples > 1;
+   if (fp->fp.force_persample_interp != rast->force_persample_interp) {
+      /* Force the program to be reuploaded, which will trigger interp fixups
+       * to get applied
+       */
+      if (fp->mem)
+         nouveau_heap_free(&fp->mem);
+
+      fp->fp.force_persample_interp = rast->force_persample_interp;
+   }
+
+   if (fp->mem && !(nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_MIN_SAMPLES)))
+      return;
 
    if (!nv50_program_validate(nv50, fp))
-         return;
+      return;
    nv50_program_update_context_state(nv50, fp, 1);
 
    BEGIN_NV04(push, NV50_3D(FP_REG_ALLOC_TEMP), 1);
@@ -629,7 +642,7 @@ nv50_stream_output_validate(struct nv50_context *nv50)
       const unsigned n = nv50->screen->base.class_3d >= NVA0_3D_CLASS ? 4 : 3;
 
       if (n == 4 && !targ->clean)
-         nv84_query_fifo_wait(push, targ->pq);
+         nv84_hw_query_fifo_wait(push, nv50_query(targ->pq));
       BEGIN_NV04(push, NV50_3D(STRMOUT_ADDRESS_HIGH(i)), n);
       PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
       PUSH_DATA (push, buf->address + targ->pipe.buffer_offset);
@@ -638,8 +651,8 @@ nv50_stream_output_validate(struct nv50_context *nv50)
          PUSH_DATA(push, targ->pipe.buffer_size);
          if (!targ->clean) {
             assert(targ->pq);
-            nv50_query_pushbuf_submit(push, NVA0_3D_STRMOUT_OFFSET(i),
-                                      targ->pq, 0x4);
+            nv50_hw_query_pushbuf_submit(push, NVA0_3D_STRMOUT_OFFSET(i),
+                                         nv50_query(targ->pq), 0x4);
          } else {
             BEGIN_NV04(push, NVA0_3D(STRMOUT_OFFSET(i)), 1);
             PUSH_DATA(push, 0);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 410e6311e60..6c8c9f0b4e6 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -30,6 +30,7 @@
 
 #include "nv50/nv50_stateobj.h"
 #include "nv50/nv50_context.h"
+#include "nv50/nv50_query_hw.h"
 
 #include "nv50/nv50_3d.xml.h"
 #include "nv50/nv50_texture.xml.h"
@@ -725,6 +726,9 @@ nv50_sp_state_create(struct pipe_context *pipe,
    if (cso->stream_output.num_outputs)
       prog->pipe.stream_output = cso->stream_output;
 
+   prog->translated = nv50_program_translate(
+         prog, nv50_context(pipe)->screen->base.device->chipset);
+
    return (void *)prog;
 }
 
@@ -1033,7 +1037,7 @@ nv50_so_target_create(struct pipe_context *pipe,
 
    if (nouveau_context(pipe)->screen->class_3d >= NVA0_3D_CLASS) {
       targ->pq = pipe->create_query(pipe,
-                                    NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET, 0);
+                                    NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET, 0);
       if (!targ->pq) {
          FREE(targ);
          return NULL;
@@ -1057,6 +1061,24 @@ nv50_so_target_create(struct pipe_context *pipe,
 }
 
 static void
+nva0_so_target_save_offset(struct pipe_context *pipe,
+                           struct pipe_stream_output_target *ptarg,
+                           unsigned index, bool serialize)
+{
+   struct nv50_so_target *targ = nv50_so_target(ptarg);
+
+   if (serialize) {
+      struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
+      PUSH_SPACE(push, 2);
+      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   nv50_query(targ->pq)->index = index;
+   pipe->end_query(pipe, targ->pq);
+}
+
+static void
 nv50_so_target_destroy(struct pipe_context *pipe,
                        struct pipe_stream_output_target *ptarg)
 {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 66dcf43533b..b6181edf24f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -487,7 +487,7 @@ static struct state_validate {
     { nv50_validate_viewport,      NV50_NEW_VIEWPORT },
     { nv50_vertprog_validate,      NV50_NEW_VERTPROG },
     { nv50_gmtyprog_validate,      NV50_NEW_GMTYPROG },
-    { nv50_fragprog_validate,      NV50_NEW_FRAGPROG |
+    { nv50_fragprog_validate,      NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
                                    NV50_NEW_MIN_SAMPLES },
     { nv50_fp_linkage_validate,    NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
                                    NV50_NEW_GMTYPROG | NV50_NEW_RASTERIZER },
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 64348b3c378..237d76d6adb 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -220,10 +220,14 @@ nv50_resource_copy_region(struct pipe_context *pipe,
    nv04_resource(dst)->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
 
    if (m2mf) {
+      struct nv50_miptree *src_mt = nv50_miptree(src);
+      struct nv50_miptree *dst_mt = nv50_miptree(dst);
       struct nv50_m2mf_rect drect, srect;
       unsigned i;
-      unsigned nx = util_format_get_nblocksx(src->format, src_box->width);
-      unsigned ny = util_format_get_nblocksy(src->format, src_box->height);
+      unsigned nx = util_format_get_nblocksx(src->format, src_box->width)
+         << src_mt->ms_x;
+      unsigned ny = util_format_get_nblocksy(src->format, src_box->height)
+         << src_mt->ms_y;
 
       nv50_m2mf_rect_setup(&drect, dst, dst_level, dstx, dsty, dstz);
       nv50_m2mf_rect_setup(&srect, src, src_level,
@@ -232,15 +236,15 @@ nv50_resource_copy_region(struct pipe_context *pipe,
       for (i = 0; i < src_box->depth; ++i) {
          nv50_m2mf_transfer_rect(nv50, &drect, &srect, nx, ny);
 
-         if (nv50_miptree(dst)->layout_3d)
+         if (dst_mt->layout_3d)
             drect.z++;
          else
-            drect.base += nv50_miptree(dst)->layer_stride;
+            drect.base += dst_mt->layer_stride;
 
-         if (nv50_miptree(src)->layout_3d)
+         if (src_mt->layout_3d)
             srect.z++;
          else
-            srect.base += nv50_miptree(src)->layer_stride;
+            srect.base += src_mt->layer_stride;
       }
       return;
    }
@@ -270,7 +274,7 @@ nv50_resource_copy_region(struct pipe_context *pipe,
 static void
 nv50_clear_render_target(struct pipe_context *pipe,
                          struct pipe_surface *dst,
-			 const union pipe_color_union *color,
+                         const union pipe_color_union *color,
                          unsigned dstx, unsigned dsty,
                          unsigned width, unsigned height)
 {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index f5f47087bef..9fa6fceeefa 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -27,6 +27,7 @@
 #include "translate/translate.h"
 
 #include "nv50/nv50_context.h"
+#include "nv50/nv50_query_hw.h"
 #include "nv50/nv50_resource.h"
 
 #include "nv50/nv50_3d.xml.h"
@@ -745,7 +746,8 @@ nva0_draw_stream_output(struct nv50_context *nv50,
       PUSH_DATA (push, 0);
       BEGIN_NV04(push, NVA0_3D(DRAW_TFB_STRIDE), 1);
       PUSH_DATA (push, so->stride);
-      nv50_query_pushbuf_submit(push, NVA0_3D_DRAW_TFB_BYTES, so->pq, 0x4);
+      nv50_hw_query_pushbuf_submit(push, NVA0_3D_DRAW_TFB_BYTES,
+                                   nv50_query(so->pq), 0x4);
       BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
       PUSH_DATA (push, 0);
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video_vp.c b/src/gallium/drivers/nouveau/nv50/nv98_video_vp.c
index 7780a179399..d13480c21d5 100644
--- a/src/gallium/drivers/nouveau/nv50/nv98_video_vp.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video_vp.c
@@ -27,33 +27,33 @@
 static void dump_comm_vp(struct nouveau_vp3_decoder *dec, struct comm *comm, u32 comm_seq,
                          struct nouveau_bo *inter_bo, unsigned slice_size)
 {
-	unsigned i, idx = comm->pvp_cur_index & 0xf;
-	debug_printf("Status: %08x, stage: %08x\n", comm->status_vp[idx], comm->pvp_stage);
+   unsigned i, idx = comm->pvp_cur_index & 0xf;
+   debug_printf("Status: %08x, stage: %08x\n", comm->status_vp[idx], comm->pvp_stage);
 #if 0
-	debug_printf("Acked byte ofs: %x, bsp byte ofs: %x\n", comm->acked_byte_ofs, comm->byte_ofs);
-	debug_printf("Irq/parse indexes: %i %i\n", comm->irq_index, comm->parse_endpos_index);
+   debug_printf("Acked byte ofs: %x, bsp byte ofs: %x\n", comm->acked_byte_ofs, comm->byte_ofs);
+   debug_printf("Irq/parse indexes: %i %i\n", comm->irq_index, comm->parse_endpos_index);
 
-	for (i = 0; i != comm->irq_index; ++i)
-		debug_printf("irq[%i] = { @ %08x -> %04x }\n", i, comm->irq_pos[i], comm->irq_470[i]);
-	for (i = 0; i != comm->parse_endpos_index; ++i)
-		debug_printf("parse_endpos[%i] = { @ %08x}\n", i, comm->parse_endpos[i]);
+   for (i = 0; i != comm->irq_index; ++i)
+      debug_printf("irq[%i] = { @ %08x -> %04x }\n", i, comm->irq_pos[i], comm->irq_470[i]);
+   for (i = 0; i != comm->parse_endpos_index; ++i)
+      debug_printf("parse_endpos[%i] = { @ %08x}\n", i, comm->parse_endpos[i]);
 #endif
-	debug_printf("mb_y = %u\n", comm->mb_y[idx]);
-	if (comm->status_vp[idx] <= 1)
-		return;
-
-	if ((comm->pvp_stage & 0xff) != 0xff) {
-		unsigned *map;
-		int ret = nouveau_bo_map(inter_bo, NOUVEAU_BO_RD|NOUVEAU_BO_NOBLOCK, dec->client);
-		assert(ret >= 0);
-		map = inter_bo->map;
-		for (i = 0; i < comm->byte_ofs + slice_size; i += 0x10) {
-			debug_printf("%05x: %08x %08x %08x %08x\n", i, map[i/4], map[i/4+1], map[i/4+2], map[i/4+3]);
-		}
-		munmap(inter_bo->map, inter_bo->size);
-		inter_bo->map = NULL;
-	}
-	assert((comm->pvp_stage & 0xff) == 0xff);
+   debug_printf("mb_y = %u\n", comm->mb_y[idx]);
+   if (comm->status_vp[idx] <= 1)
+      return;
+
+   if ((comm->pvp_stage & 0xff) != 0xff) {
+      unsigned *map;
+      int ret = nouveau_bo_map(inter_bo, NOUVEAU_BO_RD|NOUVEAU_BO_NOBLOCK, dec->client);
+      assert(ret >= 0);
+      map = inter_bo->map;
+      for (i = 0; i < comm->byte_ofs + slice_size; i += 0x10) {
+         debug_printf("%05x: %08x %08x %08x %08x\n", i, map[i/4], map[i/4+1], map[i/4+2], map[i/4+3]);
+      }
+      munmap(inter_bo->map, inter_bo->size);
+      inter_bo->map = NULL;
+   }
+   assert((comm->pvp_stage & 0xff) == 0xff);
 }
 #endif
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index a168dd684ab..68048f9d6c0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -252,10 +252,10 @@ nvc0_vtgp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
       }
    }
 
-   vp->vp.clip_enable = info->io.clipDistanceMask;
-   for (i = 0; i < 8; ++i)
-      if (info->io.cullDistanceMask & (1 << i))
-         vp->vp.clip_mode |= 1 << (i * 4);
+   vp->vp.clip_enable =
+      (1 << (info->io.clipDistances + info->io.cullDistances)) - 1;
+   for (i = 0; i < info->io.cullDistances; ++i)
+      vp->vp.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
 
    if (info->io.genUserClip < 0)
       vp->vp.num_ucps = PIPE_MAX_CLIP_PLANES + 1; /* prevent rebuilding */
@@ -269,8 +269,6 @@ nvc0_vp_gen_header(struct nvc0_program *vp, struct nv50_ir_prog_info *info)
    vp->hdr[0] = 0x20061 | (1 << 10);
    vp->hdr[4] = 0xff000;
 
-   vp->hdr[18] = info->io.clipDistanceMask;
-
    return nvc0_vtgp_gen_header(vp, info);
 }
 
@@ -424,6 +422,11 @@ nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
 
    for (i = 0; i < info->numInputs; ++i) {
       m = nvc0_hdr_interp_mode(&info->in[i]);
+      if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {
+         fp->fp.colors |= 1 << info->in[i].si;
+         if (info->in[i].sc)
+            fp->fp.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);
+      }
       for (c = 0; c < 4; ++c) {
          if (!(info->in[i].mask & (1 << c)))
             continue;
@@ -531,7 +534,6 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
    info->io.genUserClip = prog->vp.num_ucps;
    info->io.ucpBase = 256;
    info->io.ucpCBSlot = 15;
-   info->io.sampleInterp = prog->fp.sample_interp;
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
@@ -575,6 +577,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
    prog->immd_data = info->immd.buf;
    prog->immd_size = info->immd.bufSize;
    prog->relocs = info->bin.relocData;
+   prog->interps = info->bin.interpData;
    prog->num_gprs = MAX2(4, (info->bin.maxGPR + 1));
    prog->num_barriers = info->numBarriers;
 
@@ -713,6 +716,23 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
 
    if (prog->relocs)
       nv50_ir_relocate_code(prog->relocs, prog->code, code_pos, lib_pos, 0);
+   if (prog->interps) {
+      nv50_ir_change_interp(prog->interps, prog->code,
+                            prog->fp.force_persample_interp,
+                            prog->fp.flatshade);
+      for (int i = 0; i < 2; i++) {
+         unsigned mask = prog->fp.color_interp[i] >> 4;
+         unsigned interp = prog->fp.color_interp[i] & 3;
+         if (!mask)
+            continue;
+         prog->hdr[14] &= ~(0xff << (8 * i));
+         if (prog->fp.flatshade)
+            interp = NVC0_INTERP_FLAT;
+         for (int c = 0; c < 4; c++)
+            if (mask & (1 << c))
+               prog->hdr[14] |= interp << (2 * (4 * i + c));
+      }
+   }
 
 #ifdef DEBUG
    if (debug_get_bool_option("NV50_PROG_DEBUG", false))
@@ -773,6 +793,7 @@ nvc0_program_destroy(struct nvc0_context *nvc0, struct nvc0_program *prog)
    FREE(prog->code); /* may be 0 for hardcoded shaders */
    FREE(prog->immd_data);
    FREE(prog->relocs);
+   FREE(prog->interps);
    if (prog->type == PIPE_SHADER_COMPUTE && prog->cp.syms)
       FREE(prog->cp.syms);
    if (prog->tfb) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
index 390e0c7a4f0..9c45e7b3e31 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.h
@@ -45,8 +45,10 @@ struct nvc0_program {
    } vp;
    struct {
       uint8_t early_z;
-      uint8_t in_pos[PIPE_MAX_SHADER_INPUTS];
-      uint8_t sample_interp;
+      uint8_t colors;
+      uint8_t color_interp[2];
+      bool force_persample_interp;
+      bool flatshade;
    } fp;
    struct {
       uint32_t tess_mode; /* ~0 if defined by the other stage */
@@ -61,6 +63,7 @@ struct nvc0_program {
    uint8_t num_barriers;
 
    void *relocs;
+   void *interps;
 
    struct nvc0_transform_feedback_state *tfb;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index e4752e2dbc5..f53921092a5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -28,6 +28,7 @@
 #include "nvc0/nvc0_query.h"
 #include "nvc0/nvc0_query_sw.h"
 #include "nvc0/nvc0_query_hw.h"
+#include "nvc0/nvc0_query_hw_metric.h"
 #include "nvc0/nvc0_query_hw_sm.h"
 
 static struct pipe_query *
@@ -188,7 +189,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
             count++;
          } else
          if (screen->base.class_3d < NVE4_3D_CLASS) {
-            count++;
+            count += 2;
          }
       }
    }
@@ -218,6 +219,17 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
             return 1;
          }
       }
+   } else
+   if (id == NVC0_HW_METRIC_QUERY_GROUP) {
+      if (screen->compute) {
+         if (screen->base.class_3d < NVE4_3D_CLASS) {
+            info->name = "Performance metrics";
+            info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
+            info->max_active_queries = 1;
+            info->num_queries = NVC0_HW_METRIC_QUERY_COUNT;
+            return 1;
+         }
+      }
    }
 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
    else if (id == NVC0_SW_QUERY_DRV_STAT_GROUP) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h
index 6883ab6ab9d..c46361c31aa 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.h
@@ -32,7 +32,8 @@ nvc0_query(struct pipe_query *pipe)
  * Driver queries groups:
  */
 #define NVC0_HW_SM_QUERY_GROUP       0
-#define NVC0_SW_QUERY_DRV_STAT_GROUP 1
+#define NVC0_HW_METRIC_QUERY_GROUP   1
+#define NVC0_SW_QUERY_DRV_STAT_GROUP 2
 
 void nvc0_init_query_functions(struct nvc0_context *);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
index 25aa09be42a..fb2806a805e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
@@ -431,7 +431,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
             id = nvc0_hw_metric_get_next_query_id(queries, id);
             info->name = nvc0_hw_metric_names[id];
             info->query_type = NVC0_HW_METRIC_QUERY(id);
-            info->group_id = -1;
+            info->group_id = NVC0_HW_METRIC_QUERY_GROUP;
             return 1;
          }
       }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
index 12b5a025064..15c803c4307 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
@@ -26,7 +26,8 @@ nvc0_resource_from_handle(struct pipe_screen * screen,
    } else {
       struct pipe_resource *res = nv50_miptree_from_handle(screen,
                                                            templ, whandle);
-      nv04_resource(res)->vtbl = &nvc0_miptree_vtbl;
+      if (res)
+         nv04_resource(res)->vtbl = &nvc0_miptree_vtbl;
       return res;
    }
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index af8e5f72670..6ad3980911d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -179,6 +179,9 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
    case PIPE_CAP_TGSI_TXQS:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -201,8 +204,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-   case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
-   case PIPE_CAP_SHAREABLE_SHADERS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -352,45 +353,51 @@ static int
 nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
                               enum pipe_compute_cap param, void *data)
 {
-   uint64_t *data64 = (uint64_t *)data;
-   uint32_t *data32 = (uint32_t *)data;
-   const uint16_t obj_class = nvc0_screen(pscreen)->compute->oclass;
+   struct nvc0_screen *screen = nvc0_screen(pscreen);
+   const uint16_t obj_class = screen->compute->oclass;
+
+#define RET(x) do {                  \
+   if (data)                         \
+      memcpy(data, x, sizeof(x));    \
+   return sizeof(x);                 \
+} while (0)
 
    switch (param) {
    case PIPE_COMPUTE_CAP_GRID_DIMENSION:
-      data64[0] = 3;
-      return 8;
+      RET((uint64_t []) { 3 });
    case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
-      data64[0] = (obj_class >= NVE4_COMPUTE_CLASS) ? 0x7fffffff : 65535;
-      data64[1] = 65535;
-      data64[2] = 65535;
-      return 24;
+      if (obj_class >= NVE4_COMPUTE_CLASS) {
+         RET(((uint64_t []) { 0x7fffffff, 65535, 65535 }));
+      } else {
+         RET(((uint64_t []) { 65535, 65535, 65535 }));
+      }
    case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
-      data64[0] = 1024;
-      data64[1] = 1024;
-      data64[2] = 64;
-      return 24;
+      RET(((uint64_t []) { 1024, 1024, 64 }));
    case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
-      data64[0] = 1024;
-      return 8;
+      RET((uint64_t []) { 1024 });
    case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g[] */
-      data64[0] = (uint64_t)1 << 40;
-      return 8;
+      RET((uint64_t []) { 1ULL << 40 });
    case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */
-      data64[0] = 48 << 10;
-      return 8;
+      RET((uint64_t []) { 48 << 10 });
    case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */
-      data64[0] = 512 << 10;
-      return 8;
+      RET((uint64_t []) { 512 << 10 });
    case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
-      data64[0] = 4096;
-      return 8;
+      RET((uint64_t []) { 4096 });
    case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
-      data32[0] = 32;
-      return 4;
+      RET((uint32_t []) { 32 });
+   case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+      RET((uint64_t []) { 1ULL << 40 });
+   case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+      RET((uint32_t []) { 0 });
+   case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+      RET((uint32_t []) { screen->mp_count_compute });
+   case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+      RET((uint32_t []) { 512 }); /* FIXME: arbitrary limit */
    default:
       return 0;
    }
+
+#undef RET
 }
 
 static void
@@ -827,6 +834,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    PUSH_DATA (push, 1);
    BEGIN_NVC0(push, NVC0_3D(BLEND_ENABLE_COMMON), 1);
    PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(SHADE_MODEL), 1);
+   PUSH_DATA (push, NVC0_3D_SHADE_MODEL_SMOOTH);
    if (screen->eng3d->oclass < NVE4_3D_CLASS) {
       BEGIN_NVC0(push, NVC0_3D(TEX_MISC), 1);
       PUSH_DATA (push, NVC0_3D_TEX_MISC_SEAMLESS_CUBE_MAP);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 857eb0316c7..8b73102b98b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -38,6 +38,7 @@ struct nvc0_graph_state {
    uint32_t constant_elts;
    int32_t index_bias;
    uint16_t scissor;
+   bool flatshade;
    uint8_t patch_vertices;
    uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
    uint8_t num_vtxbufs;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index af837fc4a33..8595800592c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -107,8 +107,54 @@ nvc0_fragprog_validate(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_program *fp = nvc0->fragprog;
+   struct pipe_rasterizer_state *rast = &nvc0->rast->pipe;
 
-   fp->fp.sample_interp = nvc0->min_samples > 1;
+   if (fp->fp.force_persample_interp != rast->force_persample_interp) {
+      /* Force the program to be reuploaded, which will trigger interp fixups
+       * to get applied
+       */
+      if (fp->mem)
+         nouveau_heap_free(&fp->mem);
+
+      fp->fp.force_persample_interp = rast->force_persample_interp;
+   }
+
+   /* Shade model works well enough when both colors follow it. However if one
+    * (or both) is explicitly set, then we have to go the patching route.
+    */
+   bool has_explicit_color = fp->fp.colors &&
+      (((fp->fp.colors & 1) && !fp->fp.color_interp[0]) ||
+       ((fp->fp.colors & 2) && !fp->fp.color_interp[1]));
+   bool hwflatshade = false;
+   if (has_explicit_color && fp->fp.flatshade != rast->flatshade) {
+      /* Force re-upload */
+      if (fp->mem)
+         nouveau_heap_free(&fp->mem);
+
+      fp->fp.flatshade = rast->flatshade;
+
+      /* Always smooth-shade in this mode, the shader will decide on its own
+       * when to flat-shade.
+       */
+   } else if (!has_explicit_color) {
+      hwflatshade = rast->flatshade;
+
+      /* No need to binary-patch the shader each time, make sure that it's set
+       * up for the default behaviour.
+       */
+      fp->fp.flatshade = 0;
+   }
+
+   if (hwflatshade != nvc0->state.flatshade) {
+      nvc0->state.flatshade = hwflatshade;
+      BEGIN_NVC0(push, NVC0_3D(SHADE_MODEL), 1);
+      PUSH_DATA (push, hwflatshade ? NVC0_3D_SHADE_MODEL_FLAT :
+                                     NVC0_3D_SHADE_MODEL_SMOOTH);
+   }
+
+   if (fp->mem && !(nvc0->dirty & NVC0_NEW_FRAGPROG)) {
+      return;
+   }
 
    if (!nvc0_program_validate(nvc0, fp))
          return;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 742bef39247..ba1714da010 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -212,9 +212,6 @@ nvc0_rasterizer_state_create(struct pipe_context *pipe,
      * always emit 16 commands, one for each scissor rectangle, here.
      */
 
-    SB_BEGIN_3D(so, SHADE_MODEL, 1);
-    SB_DATA    (so, cso->flatshade ? NVC0_3D_SHADE_MODEL_FLAT :
-                                     NVC0_3D_SHADE_MODEL_SMOOTH);
     SB_IMMED_3D(so, PROVOKING_VERTEX_LAST, !cso->flatshade_first);
     SB_IMMED_3D(so, VERTEX_TWO_SIDE_ENABLE, cso->light_twoside);
 
@@ -683,6 +680,9 @@ nvc0_sp_state_create(struct pipe_context *pipe,
    if (cso->stream_output.num_outputs)
       prog->pipe.stream_output = cso->stream_output;
 
+   prog->translated = nvc0_program_translate(
+      prog, nvc0_context(pipe)->screen->base.device->chipset);
+
    return (void *)prog;
 }
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index aec06097bbd..205e7dc6ae9 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -606,6 +606,9 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
       ctx_to->constbuf_dirty[s] = (1 << NVC0_MAX_PIPE_CONSTBUFS) - 1;
    }
 
+   /* Reset tfb as the shader that owns it may have been deleted. */
+   ctx_to->state.tfb = NULL;
+
    if (!ctx_to->vertex)
       ctx_to->dirty &= ~(NVC0_NEW_VERTEX | NVC0_NEW_ARRAYS);
    if (!ctx_to->idxbuf.buffer)
@@ -645,7 +648,7 @@ static struct state_validate {
     { nvc0_tevlprog_validate,      NVC0_NEW_TEVLPROG },
     { nvc0_validate_tess_state,    NVC0_NEW_TESSFACTOR },
     { nvc0_gmtyprog_validate,      NVC0_NEW_GMTYPROG },
-    { nvc0_fragprog_validate,      NVC0_NEW_FRAGPROG },
+    { nvc0_fragprog_validate,      NVC0_NEW_FRAGPROG | NVC0_NEW_RASTERIZER },
     { nvc0_validate_derived_1,     NVC0_NEW_FRAGPROG | NVC0_NEW_ZSA |
                                    NVC0_NEW_RASTERIZER },
     { nvc0_validate_derived_2,     NVC0_NEW_ZSA | NVC0_NEW_FRAMEBUFFER },
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
index 8bc33c6a0e0..f9680f5a90f 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -23,7 +23,7 @@ struct nvc0_blend_stateobj {
 struct nvc0_rasterizer_stateobj {
    struct pipe_rasterizer_state pipe;
    int size;
-   uint32_t state[44];
+   uint32_t state[42];
 };
 
 struct nvc0_zsa_stateobj {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index dbdf292c862..be123349148 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -225,10 +225,14 @@ nvc0_resource_copy_region(struct pipe_context *pipe,
    nv04_resource(dst)->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
 
    if (m2mf) {
+      struct nv50_miptree *src_mt = nv50_miptree(src);
+      struct nv50_miptree *dst_mt = nv50_miptree(dst);
       struct nv50_m2mf_rect drect, srect;
       unsigned i;
-      unsigned nx = util_format_get_nblocksx(src->format, src_box->width);
-      unsigned ny = util_format_get_nblocksy(src->format, src_box->height);
+      unsigned nx = util_format_get_nblocksx(src->format, src_box->width)
+         << src_mt->ms_x;
+      unsigned ny = util_format_get_nblocksy(src->format, src_box->height)
+         << src_mt->ms_y;
 
       nv50_m2mf_rect_setup(&drect, dst, dst_level, dstx, dsty, dstz);
       nv50_m2mf_rect_setup(&srect, src, src_level,
@@ -237,15 +241,15 @@ nvc0_resource_copy_region(struct pipe_context *pipe,
       for (i = 0; i < src_box->depth; ++i) {
          nvc0->m2mf_copy_rect(nvc0, &drect, &srect, nx, ny);
 
-         if (nv50_miptree(dst)->layout_3d)
+         if (dst_mt->layout_3d)
             drect.z++;
          else
-            drect.base += nv50_miptree(dst)->layer_stride;
+            drect.base += dst_mt->layer_stride;
 
-         if (nv50_miptree(src)->layout_3d)
+         if (src_mt->layout_3d)
             srect.z++;
          else
-            srect.base += nv50_miptree(src)->layer_stride;
+            srect.base += src_mt->layer_stride;
       }
       return;
    }
@@ -493,57 +497,57 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe,
                          unsigned dstx, unsigned dsty,
                          unsigned width, unsigned height)
 {
-	struct nvc0_context *nvc0 = nvc0_context(pipe);
-	struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-	struct nv50_miptree *mt = nv50_miptree(dst->texture);
-	struct nv50_surface *sf = nv50_surface(dst);
-	uint32_t mode = 0;
-	int unk = mt->base.base.target == PIPE_TEXTURE_2D;
-	unsigned z;
-
-	if (!PUSH_SPACE(push, 32 + sf->depth))
-		return;
-
-	PUSH_REFN (push, mt->base.bo, mt->base.domain | NOUVEAU_BO_WR);
-
-	if (clear_flags & PIPE_CLEAR_DEPTH) {
-		BEGIN_NVC0(push, NVC0_3D(CLEAR_DEPTH), 1);
-		PUSH_DATAf(push, depth);
-		mode |= NVC0_3D_CLEAR_BUFFERS_Z;
-	}
-
-	if (clear_flags & PIPE_CLEAR_STENCIL) {
-		BEGIN_NVC0(push, NVC0_3D(CLEAR_STENCIL), 1);
-		PUSH_DATA (push, stencil & 0xff);
-		mode |= NVC0_3D_CLEAR_BUFFERS_S;
-	}
-
-	BEGIN_NVC0(push, NVC0_3D(SCREEN_SCISSOR_HORIZ), 2);
-	PUSH_DATA (push, ( width << 16) | dstx);
-	PUSH_DATA (push, (height << 16) | dsty);
-
-	BEGIN_NVC0(push, NVC0_3D(ZETA_ADDRESS_HIGH), 5);
-	PUSH_DATAh(push, mt->base.address + sf->offset);
-	PUSH_DATA (push, mt->base.address + sf->offset);
-	PUSH_DATA (push, nvc0_format_table[dst->format].rt);
-	PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode);
-	PUSH_DATA (push, mt->layer_stride >> 2);
-	BEGIN_NVC0(push, NVC0_3D(ZETA_ENABLE), 1);
-	PUSH_DATA (push, 1);
-	BEGIN_NVC0(push, NVC0_3D(ZETA_HORIZ), 3);
-	PUSH_DATA (push, sf->width);
-	PUSH_DATA (push, sf->height);
-	PUSH_DATA (push, (unk << 16) | (dst->u.tex.first_layer + sf->depth));
-	BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1);
-	PUSH_DATA (push, dst->u.tex.first_layer);
-
-	BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
-	for (z = 0; z < sf->depth; ++z) {
-		PUSH_DATA (push, mode |
-			   (z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT));
-	}
-
-	nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nv50_miptree *mt = nv50_miptree(dst->texture);
+   struct nv50_surface *sf = nv50_surface(dst);
+   uint32_t mode = 0;
+   int unk = mt->base.base.target == PIPE_TEXTURE_2D;
+   unsigned z;
+
+   if (!PUSH_SPACE(push, 32 + sf->depth))
+      return;
+
+   PUSH_REFN (push, mt->base.bo, mt->base.domain | NOUVEAU_BO_WR);
+
+   if (clear_flags & PIPE_CLEAR_DEPTH) {
+      BEGIN_NVC0(push, NVC0_3D(CLEAR_DEPTH), 1);
+      PUSH_DATAf(push, depth);
+      mode |= NVC0_3D_CLEAR_BUFFERS_Z;
+   }
+
+   if (clear_flags & PIPE_CLEAR_STENCIL) {
+      BEGIN_NVC0(push, NVC0_3D(CLEAR_STENCIL), 1);
+      PUSH_DATA (push, stencil & 0xff);
+      mode |= NVC0_3D_CLEAR_BUFFERS_S;
+   }
+
+   BEGIN_NVC0(push, NVC0_3D(SCREEN_SCISSOR_HORIZ), 2);
+   PUSH_DATA (push, ( width << 16) | dstx);
+   PUSH_DATA (push, (height << 16) | dsty);
+
+   BEGIN_NVC0(push, NVC0_3D(ZETA_ADDRESS_HIGH), 5);
+   PUSH_DATAh(push, mt->base.address + sf->offset);
+   PUSH_DATA (push, mt->base.address + sf->offset);
+   PUSH_DATA (push, nvc0_format_table[dst->format].rt);
+   PUSH_DATA (push, mt->level[sf->base.u.tex.level].tile_mode);
+   PUSH_DATA (push, mt->layer_stride >> 2);
+   BEGIN_NVC0(push, NVC0_3D(ZETA_ENABLE), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NVC0(push, NVC0_3D(ZETA_HORIZ), 3);
+   PUSH_DATA (push, sf->width);
+   PUSH_DATA (push, sf->height);
+   PUSH_DATA (push, (unk << 16) | (dst->u.tex.first_layer + sf->depth));
+   BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1);
+   PUSH_DATA (push, dst->u.tex.first_layer);
+
+   BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
+   for (z = 0; z < sf->depth; ++z) {
+      PUSH_DATA (push, mode |
+                 (z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT));
+   }
+
+   nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
index 8b23a4887da..9c19ba20a7e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo_translate.c
@@ -27,6 +27,7 @@ struct push_context {
    struct {
       bool enabled;
       bool value;
+      uint8_t width;
       unsigned stride;
       const uint8_t *data;
    } edgeflag;
@@ -53,6 +54,7 @@ nvc0_push_context_init(struct nvc0_context *nvc0, struct push_context *ctx)
    /* silence warnings */
    ctx->edgeflag.data = NULL;
    ctx->edgeflag.stride = 0;
+   ctx->edgeflag.width = 0;
 }
 
 static inline void
@@ -100,6 +102,7 @@ nvc0_push_map_edgeflag(struct push_context *ctx, struct nvc0_context *nvc0,
    struct nv04_resource *buf = nv04_resource(vb->buffer);
 
    ctx->edgeflag.stride = vb->stride;
+   ctx->edgeflag.width = util_format_get_blocksize(ve->src_format);
    if (buf) {
       unsigned offset = vb->buffer_offset + ve->src_offset;
       ctx->edgeflag.data = nouveau_resource_map_offset(&nvc0->base,
@@ -137,10 +140,17 @@ prim_restart_search_i32(const uint32_t *elts, unsigned push, uint32_t index)
 }
 
 static inline bool
-ef_value(const struct push_context *ctx, uint32_t index)
+ef_value_8(const struct push_context *ctx, uint32_t index)
 {
-   float *pf = (float *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
-   return *pf ? true : false;
+   uint8_t *pf = (uint8_t *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
+   return !!*pf;
+}
+
+static inline bool
+ef_value_32(const struct push_context *ctx, uint32_t index)
+{
+   uint32_t *pf = (uint32_t *)&ctx->edgeflag.data[index * ctx->edgeflag.stride];
+   return !!*pf;
 }
 
 static inline bool
@@ -154,7 +164,11 @@ static inline unsigned
 ef_toggle_search_i08(struct push_context *ctx, const uint8_t *elts, unsigned n)
 {
    unsigned i;
-   for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
+   bool ef = ctx->edgeflag.value;
+   if (ctx->edgeflag.width == 1)
+      for (i = 0; i < n && ef_value_8(ctx, elts[i]) == ef; ++i);
+   else
+      for (i = 0; i < n && ef_value_32(ctx, elts[i]) == ef; ++i);
    return i;
 }
 
@@ -162,7 +176,11 @@ static inline unsigned
 ef_toggle_search_i16(struct push_context *ctx, const uint16_t *elts, unsigned n)
 {
    unsigned i;
-   for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
+   bool ef = ctx->edgeflag.value;
+   if (ctx->edgeflag.width == 1)
+      for (i = 0; i < n && ef_value_8(ctx, elts[i]) == ef; ++i);
+   else
+      for (i = 0; i < n && ef_value_32(ctx, elts[i]) == ef; ++i);
    return i;
 }
 
@@ -170,7 +188,11 @@ static inline unsigned
 ef_toggle_search_i32(struct push_context *ctx, const uint32_t *elts, unsigned n)
 {
    unsigned i;
-   for (i = 0; i < n && ef_value(ctx, elts[i]) == ctx->edgeflag.value; ++i);
+   bool ef = ctx->edgeflag.value;
+   if (ctx->edgeflag.width == 1)
+      for (i = 0; i < n && ef_value_8(ctx, elts[i]) == ef; ++i);
+   else
+      for (i = 0; i < n && ef_value_32(ctx, elts[i]) == ef; ++i);
    return i;
 }
 
@@ -178,7 +200,11 @@ static inline unsigned
 ef_toggle_search_seq(struct push_context *ctx, unsigned start, unsigned n)
 {
    unsigned i;
-   for (i = 0; i < n && ef_value(ctx, start++) == ctx->edgeflag.value; ++i);
+   bool ef = ctx->edgeflag.value;
+   if (ctx->edgeflag.width == 1)
+      for (i = 0; i < n && ef_value_8(ctx, start++) == ef; ++i);
+   else
+      for (i = 0; i < n && ef_value_32(ctx, start++) == ef; ++i);
    return i;
 }
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c b/src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c
index 28bcb629e43..91543782dfc 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_video_vp.c
@@ -27,33 +27,33 @@
 static void dump_comm_vp(struct nouveau_vp3_decoder *dec, struct comm *comm, u32 comm_seq,
                          struct nouveau_bo *inter_bo, unsigned slice_size)
 {
-	unsigned i, idx = comm->pvp_cur_index & 0xf;
-	debug_printf("Status: %08x, stage: %08x\n", comm->status_vp[idx], comm->pvp_stage);
+   unsigned i, idx = comm->pvp_cur_index & 0xf;
+   debug_printf("Status: %08x, stage: %08x\n", comm->status_vp[idx], comm->pvp_stage);
 #if 0
-	debug_printf("Acked byte ofs: %x, bsp byte ofs: %x\n", comm->acked_byte_ofs, comm->byte_ofs);
-	debug_printf("Irq/parse indexes: %i %i\n", comm->irq_index, comm->parse_endpos_index);
+   debug_printf("Acked byte ofs: %x, bsp byte ofs: %x\n", comm->acked_byte_ofs, comm->byte_ofs);
+   debug_printf("Irq/parse indexes: %i %i\n", comm->irq_index, comm->parse_endpos_index);
 
-	for (i = 0; i != comm->irq_index; ++i)
-		debug_printf("irq[%i] = { @ %08x -> %04x }\n", i, comm->irq_pos[i], comm->irq_470[i]);
-	for (i = 0; i != comm->parse_endpos_index; ++i)
-		debug_printf("parse_endpos[%i] = { @ %08x}\n", i, comm->parse_endpos[i]);
+   for (i = 0; i != comm->irq_index; ++i)
+      debug_printf("irq[%i] = { @ %08x -> %04x }\n", i, comm->irq_pos[i], comm->irq_470[i]);
+   for (i = 0; i != comm->parse_endpos_index; ++i)
+      debug_printf("parse_endpos[%i] = { @ %08x}\n", i, comm->parse_endpos[i]);
 #endif
-	debug_printf("mb_y = %u\n", comm->mb_y[idx]);
-	if (comm->status_vp[idx] <= 1)
-		return;
-
-	if ((comm->pvp_stage & 0xff) != 0xff) {
-		unsigned *map;
-		int ret = nouveau_bo_map(inter_bo, NOUVEAU_BO_RD|NOUVEAU_BO_NOBLOCK, dec->client);
-		assert(ret >= 0);
-		map = inter_bo->map;
-		for (i = 0; i < comm->byte_ofs + slice_size; i += 0x10) {
-			debug_printf("%05x: %08x %08x %08x %08x\n", i, map[i/4], map[i/4+1], map[i/4+2], map[i/4+3]);
-		}
-		munmap(inter_bo->map, inter_bo->size);
-		inter_bo->map = NULL;
-	}
-	assert((comm->pvp_stage & 0xff) == 0xff);
+   debug_printf("mb_y = %u\n", comm->mb_y[idx]);
+   if (comm->status_vp[idx] <= 1)
+      return;
+
+   if ((comm->pvp_stage & 0xff) != 0xff) {
+      unsigned *map;
+      int ret = nouveau_bo_map(inter_bo, NOUVEAU_BO_RD|NOUVEAU_BO_NOBLOCK, dec->client);
+      assert(ret >= 0);
+      map = inter_bo->map;
+      for (i = 0; i < comm->byte_ofs + slice_size; i += 0x10) {
+         debug_printf("%05x: %08x %08x %08x %08x\n", i, map[i/4], map[i/4+1], map[i/4+2], map[i/4+3]);
+      }
+      munmap(inter_bo->map, inter_bo->size);
+      inter_bo->map = NULL;
+   }
+   assert((comm->pvp_stage & 0xff) == 0xff);
 }
 #endif
 
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index a576abdfaf2..d5981248a86 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -198,6 +198,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_TGSI_TXQS:
         case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
         case PIPE_CAP_SHAREABLE_SHADERS:
+        case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index bc6980660a5..ee7beee3001 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -635,7 +635,7 @@ static int replace_gpr_with_pv_ps(struct r600_bytecode *bc,
 	return 0;
 }
 
-void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg)
+void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *neg, unsigned abs)
 {
 	switch(value) {
 	case 0:
@@ -655,11 +655,11 @@ void r600_bytecode_special_constants(uint32_t value, unsigned *sel, unsigned *ne
 		break;
 	case 0xBF800000: /* -1.0f */
 		*sel = V_SQ_ALU_SRC_1;
-		*neg ^= 1;
+		*neg ^= !abs;
 		break;
 	case 0xBF000000: /* -0.5f */
 		*sel = V_SQ_ALU_SRC_0_5;
-		*neg ^= 1;
+		*neg ^= !abs;
 		break;
 	default:
 		*sel = V_SQ_ALU_SRC_LITERAL;
@@ -1208,7 +1208,7 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc,
 		}
 		if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
 			r600_bytecode_special_constants(nalu->src[i].value,
-				&nalu->src[i].sel, &nalu->src[i].neg);
+				&nalu->src[i].sel, &nalu->src[i].neg, nalu->src[i].abs);
 	}
 	if (nalu->dst.sel >= bc->ngpr) {
 		bc->ngpr = nalu->dst.sel + 1;
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 7cf3a090908..d48ad1ebf01 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -255,7 +255,7 @@ int r600_bytecode_add_cfinst(struct r600_bytecode *bc,
 int r600_bytecode_add_alu_type(struct r600_bytecode *bc,
 		const struct r600_bytecode_alu *alu, unsigned type);
 void r600_bytecode_special_constants(uint32_t value,
-		unsigned *sel, unsigned *neg);
+		unsigned *sel, unsigned *neg, unsigned abs);
 void r600_bytecode_disasm(struct r600_bytecode *bc);
 void r600_bytecode_alu_read(struct r600_bytecode *bc,
 		struct r600_bytecode_alu *alu, uint32_t word0, uint32_t word1);
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 9a97de9965e..9f4cda2c142 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -344,6 +344,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
+	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
 		return 0;
 
 	/* Stream output. */
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 8efe902a329..fc6335ae8bc 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -162,10 +162,6 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 		goto error;
 	}
 
-    /* disable SB for geom shaders on R6xx/R7xx due to some mysterious gs piglit regressions with it enabled. */
-    if (rctx->b.chip_class <= R700) {
-	    use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY);
-    }
 	/* disable SB for shaders using doubles */
 	use_sb &= !shader->shader.uses_doubles;
 
@@ -1008,7 +1004,7 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
 			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
 
 			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
-			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
+			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
 			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
 				return;
 		}
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index f341ecb41a5..0dc6c918331 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -443,6 +443,27 @@ struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
 	return &rbuffer->b.b;
 }
 
+struct pipe_resource *r600_aligned_buffer_create(struct pipe_screen *screen,
+						 unsigned bind,
+						 unsigned usage,
+						 unsigned size,
+						 unsigned alignment)
+{
+	struct pipe_resource buffer;
+
+	memset(&buffer, 0, sizeof buffer);
+	buffer.target = PIPE_BUFFER;
+	buffer.format = PIPE_FORMAT_R8_UNORM;
+	buffer.bind = bind;
+	buffer.usage = usage;
+	buffer.flags = 0;
+	buffer.width0 = size;
+	buffer.height0 = 1;
+	buffer.depth0 = 1;
+	buffer.array_size = 1;
+	return r600_buffer_create(screen, &buffer, alignment);
+}
+
 struct pipe_resource *
 r600_buffer_from_user_memory(struct pipe_screen *screen,
 			     const struct pipe_resource *templ,
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 7ac94caad9f..0ad36849645 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -360,6 +360,8 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "precompile", DBG_PRECOMPILE, "Compile one shader variant at shader creation." },
 	{ "nowc", DBG_NO_WC, "Disable GTT write combining" },
 	{ "check_vm", DBG_CHECK_VM, "Check VM faults and dump debug info." },
+	{ "nodcc", DBG_NO_DCC, "Disable DCC." },
+	{ "nodccclear", DBG_NO_DCC_CLEAR, "Disable DCC fast clear." },
 
 	DEBUG_NAMED_VALUE_END /* must be last */
 };
@@ -416,6 +418,7 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 	case CHIP_ICELAND: return "AMD ICELAND";
 	case CHIP_CARRIZO: return "AMD CARRIZO";
 	case CHIP_FIJI: return "AMD FIJI";
+	case CHIP_STONEY: return "AMD STONEY";
 	default: return "AMD unknown";
 	}
 }
@@ -540,6 +543,11 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_ICELAND: return "iceland";
 	case CHIP_CARRIZO: return "carrizo";
 	case CHIP_FIJI: return "fiji";
+#if HAVE_LLVM <= 0x0307
+	case CHIP_STONEY: return "carrizo";
+#else
+	case CHIP_STONEY: return "stoney";
+#endif
 	default: return "";
 	}
 }
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index b58b500bd76..c300c0b3332 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -99,6 +99,8 @@
 #define DBG_INFO		(1llu << 40)
 #define DBG_NO_WC		(1llu << 41)
 #define DBG_CHECK_VM		(1llu << 42)
+#define DBG_NO_DCC		(1llu << 43)
+#define DBG_NO_DCC_CLEAR	(1llu << 44)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 
@@ -214,6 +216,7 @@ struct r600_texture {
 	struct r600_fmask_info		fmask;
 	struct r600_cmask_info		cmask;
 	struct r600_resource		*cmask_buffer;
+	struct r600_resource		*dcc_buffer;
 	unsigned			cb_color_info; /* fast clear enable bit */
 	unsigned			color_clear_value[2];
 
@@ -243,6 +246,7 @@ struct r600_surface {
 	unsigned cb_color_dim;		/* EG only */
 	unsigned cb_color_pitch;	/* EG and later */
 	unsigned cb_color_slice;	/* EG and later */
+	unsigned cb_dcc_base;		/* VI and later */
 	unsigned cb_color_attrib;	/* EG and later */
 	unsigned cb_dcc_control;	/* VI and later */
 	unsigned cb_color_fmask;	/* CB_COLORn_FMASK (EG and later) or CB_COLORn_FRAG (r600) */
@@ -489,6 +493,11 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
 					 const struct pipe_resource *templ,
 					 unsigned alignment);
+struct pipe_resource * r600_aligned_buffer_create(struct pipe_screen *screen,
+						  unsigned bind,
+						  unsigned usage,
+						  unsigned size,
+						  unsigned alignment);
 struct pipe_resource *
 r600_buffer_from_user_memory(struct pipe_screen *screen,
 			     const struct pipe_resource *templ,
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index fc69f48bb70..edfdfe33187 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -268,6 +268,7 @@ static void r600_texture_destroy(struct pipe_screen *screen,
 	if (rtex->cmask_buffer != &rtex->resource) {
 	    pipe_resource_reference((struct pipe_resource**)&rtex->cmask_buffer, NULL);
 	}
+	pipe_resource_reference((struct pipe_resource**)&rtex->dcc_buffer, NULL);
 	pb_reference(&resource->buf, NULL);
 	FREE(rtex);
 }
@@ -482,6 +483,25 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen
 		rtex->cb_color_info |= EG_S_028C70_FAST_CLEAR(1);
 }
 
+static void vi_texture_alloc_dcc_separate(struct r600_common_screen *rscreen,
+					      struct r600_texture *rtex)
+{
+	if (rscreen->debug_flags & DBG_NO_DCC)
+		return;
+
+	rtex->dcc_buffer = (struct r600_resource *)
+		r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
+				   PIPE_USAGE_DEFAULT, rtex->surface.dcc_size, rtex->surface.dcc_alignment);
+	if (rtex->dcc_buffer == NULL) {
+		return;
+	}
+
+	r600_screen_clear_buffer(rscreen, &rtex->dcc_buffer->b.b, 0, rtex->surface.dcc_size,
+				 0xFFFFFFFF, true);
+
+	rtex->cb_color_info |= VI_S_028C70_DCC_ENABLE(1);
+}
+
 static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 					    struct r600_texture *rtex)
 {
@@ -621,6 +641,8 @@ r600_texture_create_object(struct pipe_screen *screen,
 				return NULL;
 			}
 		}
+		if (rtex->surface.dcc_size)
+			vi_texture_alloc_dcc_separate(rscreen, rtex);
 	}
 
 	/* Now create the backing buffer. */
@@ -1219,6 +1241,81 @@ static void evergreen_set_clear_color(struct r600_texture *rtex,
 	memcpy(rtex->color_clear_value, &uc, 2 * sizeof(uint32_t));
 }
 
+static void vi_get_fast_clear_parameters(enum pipe_format surface_format,
+					 const union pipe_color_union *color,
+					 uint32_t* reset_value,
+					 bool* clear_words_needed)
+{
+	bool values[4] = {};
+	int i;
+	bool main_value = false;
+	bool extra_value = false;
+	int extra_channel;
+	const struct util_format_description *desc = util_format_description(surface_format);
+
+	*clear_words_needed = true;
+	*reset_value = 0x20202020U;
+
+	/* If we want to clear without needing a fast clear eliminate step, we
+	 * can set each channel to 0 or 1 (or 0/max for integer formats). We
+	 * have two sets of flags, one for the last or first channel(extra) and
+	 * one for the other channels(main).
+	 */
+
+	if (surface_format == PIPE_FORMAT_R11G11B10_FLOAT ||
+	    surface_format == PIPE_FORMAT_B5G6R5_UNORM ||
+	    surface_format == PIPE_FORMAT_B5G6R5_SRGB) {
+		extra_channel = -1;
+	} else if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
+		if(r600_translate_colorswap(surface_format) <= 1)
+			extra_channel = desc->nr_channels - 1;
+		else
+			extra_channel = 0;
+	} else
+		return;
+
+	for (i = 0; i < 4; ++i) {
+		int index = desc->swizzle[i] - UTIL_FORMAT_SWIZZLE_X;
+
+		if (desc->swizzle[i] < UTIL_FORMAT_SWIZZLE_X ||
+		    desc->swizzle[i] > UTIL_FORMAT_SWIZZLE_W)
+			continue;
+
+		if (util_format_is_pure_sint(surface_format)) {
+			values[i] = color->i[i] != 0;
+			if (color->i[i] != 0 && color->i[i] != INT32_MAX)
+				return;
+		} else if (util_format_is_pure_uint(surface_format)) {
+			values[i] = color->ui[i] != 0U;
+			if (color->ui[i] != 0U && color->ui[i] != UINT32_MAX)
+				return;
+		} else {
+			values[i] = color->f[i] != 0.0F;
+			if (color->f[i] != 0.0F && color->f[i] != 1.0F)
+				return;
+		}
+
+		if (index == extra_channel)
+			extra_value = values[i];
+		else
+			main_value = values[i];
+	}
+
+	for (int i = 0; i < 4; ++i)
+		if (values[i] != main_value &&
+		    desc->swizzle[i] - UTIL_FORMAT_SWIZZLE_X != extra_channel &&
+		    desc->swizzle[i] >= UTIL_FORMAT_SWIZZLE_X &&
+		    desc->swizzle[i] <= UTIL_FORMAT_SWIZZLE_W)
+			return;
+
+	*clear_words_needed = false;
+	if (main_value)
+		*reset_value |= 0x80808080U;
+
+	if (extra_value)
+		*reset_value |= 0x40404040U;
+}
+
 void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 				   struct pipe_framebuffer_state *fb,
 				   struct r600_atom *fb_state,
@@ -1272,18 +1369,36 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 			continue;
 		}
 
-		/* ensure CMASK is enabled */
-		r600_texture_alloc_cmask_separate(rctx->screen, tex);
-		if (tex->cmask.size == 0) {
-			continue;
+		if (tex->dcc_buffer) {
+			uint32_t reset_value;
+			bool clear_words_needed;
+
+			if (rctx->screen->debug_flags & DBG_NO_DCC_CLEAR)
+				continue;
+
+			vi_get_fast_clear_parameters(fb->cbufs[i]->format, color, &reset_value, &clear_words_needed);
+
+			rctx->clear_buffer(&rctx->b, &tex->dcc_buffer->b.b,
+					0, tex->surface.dcc_size, reset_value, true);
+
+			if (clear_words_needed)
+				tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
+		} else {
+			/* ensure CMASK is enabled */
+			r600_texture_alloc_cmask_separate(rctx->screen, tex);
+			if (tex->cmask.size == 0) {
+				continue;
+			}
+
+			/* Do the fast clear. */
+			rctx->clear_buffer(&rctx->b, &tex->cmask_buffer->b.b,
+					tex->cmask.offset, tex->cmask.size, 0, true);
+
+			tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
 		}
 
-		/* Do the fast clear. */
 		evergreen_set_clear_color(tex, fb->cbufs[i]->format, color);
-		rctx->clear_buffer(&rctx->b, &tex->cmask_buffer->b.b,
-				   tex->cmask.offset, tex->cmask.size, 0, true);
 
-		tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
 		if (dirty_cbufs)
 			*dirty_cbufs |= 1 << i;
 		rctx->set_atom_dirty(rctx, fb_state, true);
diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h
index 115042d153e..a3d182cd30f 100644
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -202,6 +202,7 @@
 
 #define   EG_S_028C70_FAST_CLEAR(x)                       (((x) & 0x1) << 17)
 #define   SI_S_028C70_FAST_CLEAR(x)                       (((x) & 0x1) << 13)
+#define   VI_S_028C70_DCC_ENABLE(x)                       (((x) & 0x1) << 28)
 
 /*CIK+*/
 #define R_0300FC_CP_STRMOUT_CNTL		     0x0300FC
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index c3ac7e7f2ef..33b01361aa5 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -478,6 +478,8 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video
 	result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8;
 	if (((struct r600_common_screen*)dec->screen)->family == CHIP_CARRIZO)
 		result.sps_info_flags |= 1 << 9;
+	if (pic->UseRefPicList == true)
+		result.sps_info_flags |= 1 << 10;
 
 	result.chroma_format = pic->pps->sps->chroma_format_idc;
 	result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8;
@@ -586,6 +588,11 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video
 	memcpy(dec->it + 480, pic->pps->sps->ScalingList16x16, 6 * 64);
 	memcpy(dec->it + 864, pic->pps->sps->ScalingList32x32, 2 * 64);
 
+	for (i = 0 ; i < 2 ; i++) {
+		for (int j = 0 ; j < 15 ; j++)
+			result.direct_reflist[i][j] = pic->RefPicList[i][j];
+	}
+
 	/* TODO
 	result.highestTid;
 	result.isNonRef;
diff --git a/src/gallium/drivers/radeon/radeon_uvd.h b/src/gallium/drivers/radeon/radeon_uvd.h
index 452fbd60880..9cc0a694c30 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -233,6 +233,15 @@ struct ruvd_h265 {
 
 	uint8_t		highestTid;
 	uint8_t		isNonRef;
+
+	uint8_t 	p010_mode;
+	uint8_t 	msb_mode;
+	uint8_t 	luma_10to8;
+	uint8_t 	chroma_10to8;
+	uint8_t 	sclr_luma10to8;
+	uint8_t 	sclr_chroma10to8;
+
+	uint8_t 	direct_reflist[2][15];
 };
 
 struct ruvd_vc1 {
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 3a1834b948f..32bfc32073b 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -205,11 +205,12 @@ int rvid_get_video_param(struct pipe_screen *screen,
 			 enum pipe_video_cap param)
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
+	enum pipe_video_format codec = u_reduce_video_profile(profile);
 
 	if (entrypoint == PIPE_VIDEO_ENTRYPOINT_ENCODE) {
 		switch (param) {
 		case PIPE_VIDEO_CAP_SUPPORTED:
-			return u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
+			return codec == PIPE_VIDEO_FORMAT_MPEG4_AVC &&
 				rvce_is_fw_version_supported(rscreen);
 	        case PIPE_VIDEO_CAP_NPOT_TEXTURES:
         	        return 1;
@@ -232,38 +233,19 @@ int rvid_get_video_param(struct pipe_screen *screen,
 		}
 	}
 
-	/* UVD 2.x limits */
-	if (rscreen->family < CHIP_PALM) {
-		enum pipe_video_format codec = u_reduce_video_profile(profile);
-		switch (param) {
-		case PIPE_VIDEO_CAP_SUPPORTED:
-			/* no support for MPEG4 */
-			return codec != PIPE_VIDEO_FORMAT_MPEG4 &&
-			       /* FIXME: VC-1 simple/main profile is broken */
-			       profile != PIPE_VIDEO_PROFILE_VC1_SIMPLE &&
-			       profile != PIPE_VIDEO_PROFILE_VC1_MAIN;
-		case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
-		case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
-			/* MPEG2 only with shaders and no support for
-			   interlacing on R6xx style UVD */
-			return codec != PIPE_VIDEO_FORMAT_MPEG12 &&
-			       rscreen->family > CHIP_RV770;
-		default:
-			break;
-		}
-	}
-
 	switch (param) {
 	case PIPE_VIDEO_CAP_SUPPORTED:
-		switch (u_reduce_video_profile(profile)) {
+		switch (codec) {
 		case PIPE_VIDEO_FORMAT_MPEG12:
 		case PIPE_VIDEO_FORMAT_MPEG4:
 		case PIPE_VIDEO_FORMAT_MPEG4_AVC:
-			return entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE;
+			if (rscreen->family < CHIP_PALM)
+				/* no support for MPEG4 */
+				return codec != PIPE_VIDEO_FORMAT_MPEG4;
+			return true;
 		case PIPE_VIDEO_FORMAT_VC1:
 			/* FIXME: VC-1 simple/main profile is broken */
-			return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED &&
-			       entrypoint != PIPE_VIDEO_ENTRYPOINT_ENCODE;
+			return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED;
 		case PIPE_VIDEO_FORMAT_HEVC:
 			/* Carrizo only supports HEVC Main */
 			return rscreen->family >= CHIP_CARRIZO &&
@@ -280,13 +262,17 @@ int rvid_get_video_param(struct pipe_screen *screen,
 	case PIPE_VIDEO_CAP_PREFERED_FORMAT:
 		return PIPE_FORMAT_NV12;
 	case PIPE_VIDEO_CAP_PREFERS_INTERLACED:
-		if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
-			return false; //The hardware doesn't support interlaced HEVC.
-		return true;
 	case PIPE_VIDEO_CAP_SUPPORTS_INTERLACED:
-		if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
-			return false; //The hardware doesn't support interlaced HEVC.
-		return true;
+		if (rscreen->family < CHIP_PALM) {
+			/* MPEG2 only with shaders and no support for
+			   interlacing on R6xx style UVD */
+			return codec != PIPE_VIDEO_FORMAT_MPEG12 &&
+			       rscreen->family > CHIP_RV770;
+		} else {
+			if (u_reduce_video_profile(profile) == PIPE_VIDEO_FORMAT_HEVC)
+				return false; //The firmware doesn't support interlaced HEVC.
+			return true;
+		}
 	case PIPE_VIDEO_CAP_SUPPORTS_PROGRESSIVE:
 		return true;
 	case PIPE_VIDEO_CAP_MAX_LEVEL:
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index b91e1adf41d..8bf1e15f3be 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -137,6 +137,7 @@ enum radeon_family {
     CHIP_ICELAND,
     CHIP_CARRIZO,
     CHIP_FIJI,
+    CHIP_STONEY,
     CHIP_LAST,
 };
 
@@ -331,6 +332,7 @@ struct radeon_surf_level {
     uint32_t                    nblk_z;
     uint32_t                    pitch_bytes;
     uint32_t                    mode;
+    uint64_t                    dcc_offset;
 };
 
 struct radeon_surf {
@@ -366,6 +368,9 @@ struct radeon_surf {
     uint32_t                    stencil_tiling_index[RADEON_SURF_MAX_LEVEL];
     uint32_t                    pipe_config;
     uint32_t                    num_banks;
+
+    uint64_t                    dcc_size;
+    uint64_t                    dcc_alignment;
 };
 
 struct radeon_bo_list_item {
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 6454b8ce8c0..e53af1dd6b5 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -242,7 +242,8 @@ void cik_sdma_copy(struct pipe_context *ctx,
 
 	if (src->format != dst->format ||
 	    rdst->surface.nsamples > 1 || rsrc->surface.nsamples > 1 ||
-	    (rdst->dirty_level_mask | rdst->stencil_dirty_level_mask) & (1 << dst_level)) {
+	    (rdst->dirty_level_mask | rdst->stencil_dirty_level_mask) & (1 << dst_level) ||
+	    rdst->dcc_buffer || rsrc->dcc_buffer) {
 		goto fallback;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 082ea850675..fce014a1e6b 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -326,7 +326,7 @@ void si_decompress_color_textures(struct si_context *sctx,
 		assert(view);
 
 		tex = (struct r600_texture *)view->texture;
-		assert(tex->cmask.size || tex->fmask.size);
+		assert(tex->cmask.size || tex->fmask.size || tex->dcc_buffer);
 
 		si_blit_decompress_color(&sctx->b.b, tex,
 					 view->u.tex.first_level, view->u.tex.last_level,
@@ -455,7 +455,7 @@ static void si_decompress_subresource(struct pipe_context *ctx,
 			si_blit_decompress_depth_in_place(sctx, rtex, true,
 							  level, level,
 							  first_layer, last_layer);
-	} else if (rtex->fmask.size || rtex->cmask.size) {
+	} else if (rtex->fmask.size || rtex->cmask.size || rtex->dcc_buffer) {
 		si_blit_decompress_color(ctx, rtex, level, level,
 					 first_layer, last_layer);
 	}
@@ -507,7 +507,7 @@ void si_resource_copy_region(struct pipe_context *ctx,
 	util_blitter_default_dst_texture(&dst_templ, dst, dst_level, dstz);
 	util_blitter_default_src_texture(&src_templ, src, src_level);
 
-	if (util_format_is_compressed(src->format) &&
+	if (util_format_is_compressed(src->format) ||
 	    util_format_is_compressed(dst->format)) {
 		unsigned blocksize = util_format_get_blocksize(src->format);
 
@@ -536,7 +536,7 @@ void si_resource_copy_region(struct pipe_context *ctx,
 		src_force_level = src_level;
 	} else if (!util_blitter_is_copy_supported(sctx->blitter, dst, src) ||
 		   /* also *8_SNORM has precision issues, use UNORM instead */
-		   util_format_is_snorm(src->format)) {
+		   util_format_is_snorm8(src->format)) {
 		if (util_format_is_subsampled_422(src->format)) {
 			src_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
 			dst_templ.format = PIPE_FORMAT_R8G8B8A8_UINT;
@@ -675,7 +675,8 @@ static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
 	    info->src.box.depth == 1 &&
 	    dst->surface.level[info->dst.level].mode >= RADEON_SURF_MODE_1D &&
 	    !(dst->surface.flags & RADEON_SURF_SCANOUT) &&
-	    (!dst->cmask.size || !dst->dirty_level_mask) /* dst cannot be fast-cleared */) {
+	    (!dst->cmask.size || !dst->dirty_level_mask) && /* dst cannot be fast-cleared */
+	    !dst->dcc_buffer) {
 		si_blitter_begin(ctx, SI_COLOR_RESOLVE |
 				 (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
 		util_blitter_custom_resolve_color(sctx->blitter,
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 13738da5e2c..a8ff6f27319 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -181,6 +181,11 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
 				rview->resource, RADEON_USAGE_READ,
 				r600_get_sampler_view_priority(rview->resource));
 
+		if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				rview->dcc_buffer, RADEON_USAGE_READ,
+				RADEON_PRIO_DCC);
+
 		pipe_sampler_view_reference(&views->views[slot], view);
 		memcpy(views->desc.list + slot*8, view_desc, 8*4);
 		views->desc.enabled_mask |= 1llu << slot;
@@ -229,7 +234,8 @@ static void si_set_sampler_views(struct pipe_context *ctx,
 			} else {
 				samplers->depth_texture_mask &= ~(1 << slot);
 			}
-			if (rtex->cmask.size || rtex->fmask.size) {
+			if (rtex->cmask.size || rtex->fmask.size ||
+			    (rtex->dcc_buffer && rtex->dirty_level_mask)) {
 				samplers->compressed_colortex_mask |= 1 << slot;
 			} else {
 				samplers->compressed_colortex_mask &= ~(1 << slot);
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 31b0b41e5a4..581e89f42d8 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -248,7 +248,8 @@ void si_dma_copy(struct pipe_context *ctx,
 	if (src->format != dst->format || src_box->depth > 1 ||
 	    (rdst->dirty_level_mask | rdst->stencil_dirty_level_mask) & (1 << dst_level) ||
 	    rdst->cmask.size || rdst->fmask.size ||
-	    rsrc->cmask.size || rsrc->fmask.size) {
+	    rsrc->cmask.size || rsrc->fmask.size ||
+	    rdst->dcc_buffer || rsrc->dcc_buffer) {
 		goto fallback;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 5f910c95ef3..60baad3d13c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -55,8 +55,6 @@ static void si_destroy_context(struct pipe_context *context)
 
 	if (sctx->pstipple_sampler_state)
 		sctx->b.b.delete_sampler_state(&sctx->b.b, sctx->pstipple_sampler_state);
-	if (sctx->dummy_pixel_shader)
-		sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader);
 	if (sctx->fixed_func_tcs_shader.cso)
 		sctx->b.b.delete_tcs_state(&sctx->b.b, sctx->fixed_func_tcs_shader.cso);
 	if (sctx->custom_dsa_flush)
@@ -300,6 +298,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_SM5:
 	case PIPE_CAP_TGSI_TXQS:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
 		return 1;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -578,6 +577,33 @@ static bool si_initialize_pipe_config(struct si_screen *sscreen)
 	return true;
 }
 
+static bool si_init_gs_info(struct si_screen *sscreen)
+{
+	switch (sscreen->b.family) {
+	case CHIP_OLAND:
+	case CHIP_HAINAN:
+	case CHIP_KAVERI:
+	case CHIP_KABINI:
+	case CHIP_MULLINS:
+	case CHIP_ICELAND:
+	case CHIP_CARRIZO:
+	case CHIP_STONEY:
+		sscreen->gs_table_depth = 16;
+		return true;
+	case CHIP_TAHITI:
+	case CHIP_PITCAIRN:
+	case CHIP_VERDE:
+	case CHIP_BONAIRE:
+	case CHIP_HAWAII:
+	case CHIP_TONGA:
+	case CHIP_FIJI:
+		sscreen->gs_table_depth = 32;
+		return true;
+	default:
+		return false;
+	}
+}
+
 struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 {
 	struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
@@ -595,7 +621,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 	sscreen->b.b.resource_create = r600_resource_create_common;
 
 	if (!r600_common_screen_init(&sscreen->b, ws) ||
-	    !si_initialize_pipe_config(sscreen)) {
+	    !si_initialize_pipe_config(sscreen) ||
+	    !si_init_gs_info(sscreen)) {
 		FREE(sscreen);
 		return NULL;
 	}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index d7a2282952a..42cd8803c36 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -42,6 +42,7 @@
 #define SI_BASE_VERTEX_UNKNOWN INT_MIN
 #define SI_RESTART_INDEX_UNKNOWN INT_MIN
 #define SI_NUM_SMOOTH_AA_SAMPLES 8
+#define SI_GS_PER_ES 128
 
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(R600_CONTEXT_PRIVATE_FLAG << 0)
@@ -85,6 +86,7 @@ struct si_compute;
 
 struct si_screen {
 	struct r600_common_screen	b;
+	unsigned			gs_table_depth;
 };
 
 struct si_blend_color {
@@ -96,6 +98,7 @@ struct si_sampler_view {
 	struct pipe_sampler_view	base;
 	struct list_head		list;
 	struct r600_resource		*resource;
+	struct r600_resource		*dcc_buffer;
         /* [0..7] = image descriptor
          * [4..7] = buffer descriptor */
 	uint32_t			state[8];
@@ -203,9 +206,6 @@ struct si_context {
 	struct si_pm4_state		*init_config;
 	bool				init_config_has_vgt_flush;
 	struct si_pm4_state		*vgt_shader_config[4];
-	/* With rasterizer discard, there doesn't have to be a pixel shader.
-	 * In that case, we bind this one: */
-	void				*dummy_pixel_shader;
 
 	/* shaders */
 	struct si_shader_ctx_state	ps_shader;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 243bdc6e6d7..18b64056bc7 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -266,6 +266,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at
 	 * Reproducible with Unigine Heaven 4.0 and drirc missing.
 	 */
 	if (blend->dual_src_blend &&
+	    sctx->ps_shader.cso &&
 	    (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3)
 		mask = 0;
 
@@ -697,6 +698,7 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 	rs->clamp_fragment_color = state->clamp_fragment_color;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
+	rs->rasterizer_discard = state->rasterizer_discard;
 	rs->pa_sc_line_stipple = state->line_stipple_enable ?
 				S_028A0C_LINE_PATTERN(state->line_stipple_pattern) |
 				S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0;
@@ -1924,8 +1926,21 @@ static void si_initialize_color_surface(struct si_context *sctx,
 	surf->cb_color_info = color_info;
 	surf->cb_color_attrib = color_attrib;
 
-	if (sctx->b.chip_class >= VI)
-		surf->cb_dcc_control = S_028C78_OVERWRITE_COMBINER_DISABLE(1);
+	if (sctx->b.chip_class >= VI && rtex->dcc_buffer) {
+		unsigned max_uncompressed_block_size = 2;
+		uint64_t dcc_offset = rtex->surface.level[level].dcc_offset;
+
+		if (rtex->surface.nsamples > 1) {
+			if (rtex->surface.bpe == 1)
+				max_uncompressed_block_size = 0;
+			else if (rtex->surface.bpe == 2)
+				max_uncompressed_block_size = 1;
+		}
+
+		surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
+		                       S_028C78_INDEPENDENT_64B_BLOCKS(1);
+		surf->cb_dcc_base = (rtex->dcc_buffer->gpu_address + dcc_offset) >> 8;
+	}
 
 	if (rtex->fmask.size) {
 		surf->cb_color_fmask = (offset + rtex->fmask.offset) >> 8;
@@ -2249,6 +2264,12 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 				RADEON_PRIO_CMASK);
 		}
 
+		if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) {
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				tex->dcc_buffer, RADEON_USAGE_READWRITE,
+				RADEON_PRIO_DCC);
+		}
+
 		radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
 					   sctx->b.chip_class >= VI ? 14 : 13);
 		radeon_emit(cs, cb->cb_color_base);	/* R_028C60_CB_COLOR0_BASE */
@@ -2266,7 +2287,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 		radeon_emit(cs, tex->color_clear_value[1]);	/* R_028C90_CB_COLOR0_CLEAR_WORD1 */
 
 		if (sctx->b.chip_class >= VI)
-			radeon_emit(cs, 0);	/* R_028C94_CB_COLOR0_DCC_BASE */
+			radeon_emit(cs, cb->cb_dcc_base);	/* R_028C94_CB_COLOR0_DCC_BASE */
 	}
 	/* set CB_COLOR1_INFO for possible dual-src blending */
 	if (i == 1 && state->cbufs[0] &&
@@ -2633,8 +2654,18 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 	view->state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1));
 	view->state[5] = (S_008F24_BASE_ARRAY(state->u.tex.first_layer) |
 			  S_008F24_LAST_ARRAY(last_layer));
-	view->state[6] = 0;
-	view->state[7] = 0;
+
+	if (tmp->dcc_buffer) {
+		uint64_t dcc_offset = surflevel[base_level].dcc_offset;
+		unsigned swap = r600_translate_colorswap(pipe_format);
+
+		view->state[6] = S_008F28_COMPRESSION_EN(1) | S_008F28_ALPHA_IS_ON_MSB(swap <= 1);
+		view->state[7] = (tmp->dcc_buffer->gpu_address + dcc_offset) >> 8;
+		view->dcc_buffer = tmp->dcc_buffer;
+	} else {
+		view->state[6] = 0;
+		view->state[7] = 0;
+	}
 
 	/* Initialize the sampler view for FMASK. */
 	if (tmp->fmask.size) {
@@ -3262,7 +3293,7 @@ static void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
 
 	/* FIXME calculate these values somehow ??? */
-	si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, 0x80);
+	si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
 	si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
 	si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
 
@@ -3336,6 +3367,7 @@ static void si_init_config(struct si_context *sctx)
 		break;
 	case CHIP_KABINI:
 	case CHIP_MULLINS:
+	case CHIP_STONEY:
 		raster_config = 0x00000000;
 		raster_config_1 = 0x00000000;
 		break;
@@ -3406,7 +3438,8 @@ static void si_init_config(struct si_context *sctx)
 
 	if (sctx->b.chip_class >= VI) {
 		si_pm4_set_reg(pm4, R_028424_CB_DCC_CONTROL,
-			       S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1));
+			       S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
+			       S_028424_OVERWRITE_COMBINER_WATERMARK(4));
 		si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 30);
 		si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32);
 	}
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index fba6619d2fd..8b9a311cd3f 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -61,6 +61,7 @@ struct si_state_rasterizer {
 	bool			poly_smooth;
 	bool			uses_poly_offset;
 	bool			clamp_fragment_color;
+	bool			rasterizer_discard;
 };
 
 struct si_dsa_stencil_ref_part {
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index ce6c98c3124..cf0891a2ab7 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -223,6 +223,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	unsigned prim = info->mode;
 	unsigned primgroup_size = 128; /* recommended without a GS */
+	unsigned max_primgroup_in_wave = 2;
 
 	/* SWITCH_ON_EOP(0) is always preferable. */
 	bool wd_switch_on_eop = false;
@@ -246,13 +247,10 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		/* primgroup_size must be set to a multiple of NUM_PATCHES */
 		primgroup_size = (primgroup_size / num_patches) * num_patches;
 
-		/* SWITCH_ON_EOI must be set if PrimID is used.
-		 * If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
+		/* SWITCH_ON_EOI must be set if PrimID is used. */
 		if ((sctx->tcs_shader.cso && sctx->tcs_shader.cso->info.uses_primid) ||
-		    sctx->tes_shader.cso->info.uses_primid) {
+		    sctx->tes_shader.cso->info.uses_primid)
 			ia_switch_on_eoi = true;
-			partial_es_wave = true;
-		}
 
 		/* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
 		if ((sctx->b.family == CHIP_TAHITI ||
@@ -269,10 +267,6 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		wd_switch_on_eop = true;
 	}
 
-	if (sctx->b.streamout.streamout_enabled ||
-	    sctx->b.streamout.prims_gen_query_enabled)
-		partial_vs_wave = true;
-
 	if (sctx->b.chip_class >= CIK) {
 		/* WD_SWITCH_ON_EOP has no effect on GPUs with less than
 		 * 4 shader engines. Set 1 to pass the assertion below.
@@ -282,7 +276,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		    prim == PIPE_PRIM_LINE_LOOP ||
 		    prim == PIPE_PRIM_TRIANGLE_FAN ||
 		    prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||
-		    info->primitive_restart)
+		    info->primitive_restart ||
+		    info->count_from_stream_output)
 			wd_switch_on_eop = true;
 
 		/* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
@@ -292,14 +287,34 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 		    (info->indirect || info->instance_count > 1))
 			wd_switch_on_eop = true;
 
-		/* USE_OPAQUE doesn't work when WD_SWITCH_ON_EOP is 0. */
-		if (info->count_from_stream_output)
-			wd_switch_on_eop = true;
+		/* Required on CIK and later. */
+		if (sctx->b.screen->info.max_se > 2 && !wd_switch_on_eop)
+			ia_switch_on_eoi = true;
+
+		/* Required by Hawaii and, for some special cases, by VI. */
+		if (ia_switch_on_eoi &&
+		    (sctx->b.family == CHIP_HAWAII ||
+		     (sctx->b.chip_class == VI &&
+		      (sctx->gs_shader.cso || max_primgroup_in_wave != 2))))
+			partial_vs_wave = true;
+
+		/* Instancing bug on Bonaire. */
+		if (sctx->b.family == CHIP_BONAIRE && ia_switch_on_eoi &&
+		    (info->indirect || info->instance_count > 1))
+			partial_vs_wave = true;
 
 		/* If the WD switch is false, the IA switch must be false too. */
 		assert(wd_switch_on_eop || !ia_switch_on_eop);
 	}
 
+	/* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
+	if (ia_switch_on_eoi)
+		partial_es_wave = true;
+
+	/* GS requirement. */
+	if (SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
+		partial_es_wave = true;
+
 	/* Hw bug with single-primitive instances and SWITCH_ON_EOI
 	 * on multi-SE chips. */
 	if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi &&
@@ -308,18 +323,14 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	      u_prims_for_vertices(info->mode, info->count) <= 1)))
 		sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
 
-	/* Instancing bug on 2 SE chips. */
-	if (sctx->b.screen->info.max_se == 2 && ia_switch_on_eoi &&
-	    (info->indirect || info->instance_count > 1))
-		partial_vs_wave = true;
-
 	return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
 		S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
 		S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) |
 		S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
 		S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) |
 		S_028AA8_WD_SWITCH_ON_EOP(sctx->b.chip_class >= CIK ? wd_switch_on_eop : 0) |
-		S_028AA8_MAX_PRIMGRP_IN_WAVE(sctx->b.chip_class >= VI ? 2 : 0);
+		S_028AA8_MAX_PRIMGRP_IN_WAVE(sctx->b.chip_class >= VI ?
+					     max_primgroup_in_wave : 0);
 }
 
 static unsigned si_get_ls_hs_config(struct si_context *sctx,
@@ -636,6 +647,17 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 			         S_0085F0_CB5_DEST_BASE_ENA(1) |
 			         S_0085F0_CB6_DEST_BASE_ENA(1) |
 			         S_0085F0_CB7_DEST_BASE_ENA(1);
+
+		/* Necessary for DCC */
+		if (sctx->chip_class >= VI) {
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0) | compute);
+			radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) |
+			                EVENT_INDEX(5));
+			radeon_emit(cs, 0);
+			radeon_emit(cs, 0);
+			radeon_emit(cs, 0);
+			radeon_emit(cs, 0);
+		}
 	}
 	if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
 		cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
@@ -728,6 +750,7 @@ static void si_get_draw_start_count(struct si_context *sctx,
 void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	struct pipe_index_buffer ib = {};
 	unsigned mask;
 
@@ -735,7 +758,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	    (info->indexed || !info->count_from_stream_output))
 		return;
 
-	if (!sctx->ps_shader.cso || !sctx->vs_shader.cso) {
+	if (!sctx->vs_shader.cso) {
+		assert(0);
+		return;
+	}
+	if (!sctx->ps_shader.cso && (!rs || !rs->rasterizer_discard)) {
 		assert(0);
 		return;
 	}
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index eea00e0fafc..4a3a04caa52 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -799,11 +799,11 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
 
-	if (sctx->vs_shader.cso == sel || !sel)
+	if (sctx->vs_shader.cso == sel)
 		return;
 
 	sctx->vs_shader.cso = sel;
-	sctx->vs_shader.current = sel->first_variant;
+	sctx->vs_shader.current = sel ? sel->first_variant : NULL;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
 	si_update_viewports_and_scissors(sctx);
 }
@@ -864,16 +864,6 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 	si_update_viewports_and_scissors(sctx);
 }
 
-static void si_make_dummy_ps(struct si_context *sctx)
-{
-	if (!sctx->dummy_pixel_shader) {
-		sctx->dummy_pixel_shader =
-			util_make_fragment_cloneinput_shader(&sctx->b.b, 0,
-							     TGSI_SEMANTIC_GENERIC,
-							     TGSI_INTERPOLATE_CONSTANT);
-	}
-}
-
 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
@@ -883,14 +873,8 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
 	if (sctx->ps_shader.cso == sel)
 		return;
 
-	/* use a dummy shader if binding a NULL shader */
-	if (!sel) {
-		si_make_dummy_ps(sctx);
-		sel = sctx->dummy_pixel_shader;
-	}
-
 	sctx->ps_shader.cso = sel;
-	sctx->ps_shader.current = sel->first_variant;
+	sctx->ps_shader.current = sel ? sel->first_variant : NULL;
 	si_mark_atom_dirty(sctx, &sctx->cb_target_mask);
 }
 
@@ -956,13 +940,15 @@ static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	struct si_shader *vs = si_get_vs_state(sctx);
-	struct tgsi_shader_info *psinfo = &ps->selector->info;
+	struct tgsi_shader_info *psinfo;
 	struct tgsi_shader_info *vsinfo = &vs->selector->info;
 	unsigned i, j, tmp, num_written = 0;
 
-	if (!ps->nparam)
+	if (!ps || !ps->nparam)
 		return;
 
+	psinfo = &ps->selector->info;
+
 	radeon_set_context_reg_seq(cs, R_028644_SPI_PS_INPUT_CNTL_0, ps->nparam);
 
 	for (i = 0; i < psinfo->num_inputs; i++) {
@@ -1025,7 +1011,12 @@ static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
-	unsigned input_ena = ps->spi_ps_input_ena;
+	unsigned input_ena;
+
+	if (!ps)
+		return;
+
+	input_ena = ps->spi_ps_input_ena;
 
 	/* we need to enable at least one of them, otherwise we hang the GPU */
 	assert(G_0286CC_PERSP_SAMPLE_ENA(input_ena) ||
@@ -1531,23 +1522,38 @@ bool si_update_shaders(struct si_context *sctx)
 
 	si_update_vgt_shader_config(sctx);
 
-	r = si_shader_select(ctx, &sctx->ps_shader);
-	if (r)
-		return false;
-	si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
-
-	if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
-	    sctx->sprite_coord_enable != rs->sprite_coord_enable ||
-	    sctx->flatshade != rs->flatshade) {
-		sctx->sprite_coord_enable = rs->sprite_coord_enable;
-		sctx->flatshade = rs->flatshade;
-		si_mark_atom_dirty(sctx, &sctx->spi_map);
-	}
+	if (sctx->ps_shader.cso) {
+		r = si_shader_select(ctx, &sctx->ps_shader);
+		if (r)
+			return false;
+		si_pm4_bind_state(sctx, ps, sctx->ps_shader.current->pm4);
+
+		if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
+		    sctx->sprite_coord_enable != rs->sprite_coord_enable ||
+		    sctx->flatshade != rs->flatshade) {
+			sctx->sprite_coord_enable = rs->sprite_coord_enable;
+			sctx->flatshade = rs->flatshade;
+			si_mark_atom_dirty(sctx, &sctx->spi_map);
+		}
+
+		if (si_pm4_state_changed(sctx, ps) ||
+		    sctx->force_persample_interp != rs->force_persample_interp) {
+			sctx->force_persample_interp = rs->force_persample_interp;
+			si_mark_atom_dirty(sctx, &sctx->spi_ps_input);
+		}
+
+		if (sctx->ps_db_shader_control != sctx->ps_shader.current->db_shader_control) {
+			sctx->ps_db_shader_control = sctx->ps_shader.current->db_shader_control;
+			si_mark_atom_dirty(sctx, &sctx->db_render_state);
+		}
+
+		if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.poly_line_smoothing) {
+			sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.poly_line_smoothing;
+			si_mark_atom_dirty(sctx, &sctx->msaa_config);
 
-	if (si_pm4_state_changed(sctx, ps) ||
-	    sctx->force_persample_interp != rs->force_persample_interp) {
-		sctx->force_persample_interp = rs->force_persample_interp;
-		si_mark_atom_dirty(sctx, &sctx->spi_ps_input);
+			if (sctx->b.chip_class == SI)
+				si_mark_atom_dirty(sctx, &sctx->db_render_state);
+		}
 	}
 
 	if (si_pm4_state_changed(sctx, ls) ||
@@ -1559,19 +1565,6 @@ bool si_update_shaders(struct si_context *sctx)
 		if (!si_update_spi_tmpring_size(sctx))
 			return false;
 	}
-
-	if (sctx->ps_db_shader_control != sctx->ps_shader.current->db_shader_control) {
-		sctx->ps_db_shader_control = sctx->ps_shader.current->db_shader_control;
-		si_mark_atom_dirty(sctx, &sctx->db_render_state);
-	}
-
-	if (sctx->smoothing_enabled != sctx->ps_shader.current->key.ps.poly_line_smoothing) {
-		sctx->smoothing_enabled = sctx->ps_shader.current->key.ps.poly_line_smoothing;
-		si_mark_atom_dirty(sctx, &sctx->msaa_config);
-
-		if (sctx->b.chip_class == SI)
-			si_mark_atom_dirty(sctx, &sctx->db_render_state);
-	}
 	return true;
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index e7006d2fa0d..c0fc82b2f2c 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -249,6 +249,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index d7a3360713f..23ec4ef3cb6 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -214,10 +214,10 @@ prepare_shader_sampling(
                   row_stride[j] = sp_tex->stride[j];
                   img_stride[j] = sp_tex->img_stride[j];
                }
-               if (view->target == PIPE_TEXTURE_1D_ARRAY ||
-                   view->target == PIPE_TEXTURE_2D_ARRAY ||
-                   view->target == PIPE_TEXTURE_CUBE ||
-                   view->target == PIPE_TEXTURE_CUBE_ARRAY) {
+               if (tex->target == PIPE_TEXTURE_1D_ARRAY ||
+                   tex->target == PIPE_TEXTURE_2D_ARRAY ||
+                   tex->target == PIPE_TEXTURE_CUBE ||
+                   tex->target == PIPE_TEXTURE_CUBE_ARRAY) {
                   num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1;
                   for (j = first_level; j <= last_level; j++) {
                      mip_offsets[j] += view->u.tex.first_layer *
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 8a0935062b6..e3e28a3ef32 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -1033,6 +1033,7 @@ img_filter_2d_linear_repeat_POT(const struct sp_sampler_view *sp_sview,
       
    addr.value = 0;
    addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
    /* Can we fetch all four at once:
     */
@@ -1081,6 +1082,7 @@ img_filter_2d_nearest_repeat_POT(const struct sp_sampler_view *sp_sview,
 
    addr.value = 0;
    addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
    out = get_texel_2d_no_border(sp_sview, addr, x0, y0);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1111,6 +1113,7 @@ img_filter_2d_nearest_clamp_POT(const struct sp_sampler_view *sp_sview,
 
    addr.value = 0;
    addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
    x0 = util_ifloor(u);
    if (x0 < 0) 
@@ -1154,7 +1157,8 @@ img_filter_1d_nearest(const struct sp_sampler_view *sp_sview,
 
    sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
 
-   out = get_texel_2d(sp_sview, sp_samp, addr, x, 0);
+   out = get_texel_1d_array(sp_sview, sp_samp, addr, x,
+                            sp_sview->base.u.tex.first_layer);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
       rgba[TGSI_NUM_CHANNELS*c] = out[c];
 
@@ -1215,6 +1219,7 @@ img_filter_2d_nearest(const struct sp_sampler_view *sp_sview,
  
    addr.value = 0;
    addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
    sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
    sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
@@ -1396,8 +1401,10 @@ img_filter_1d_linear(const struct sp_sampler_view *sp_sview,
 
    sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
 
-   tx0 = get_texel_2d(sp_sview, sp_samp, addr, x0, 0);
-   tx1 = get_texel_2d(sp_sview, sp_samp, addr, x1, 0);
+   tx0 = get_texel_1d_array(sp_sview, sp_samp, addr, x0,
+                            sp_sview->base.u.tex.first_layer);
+   tx1 = get_texel_1d_array(sp_sview, sp_samp, addr, x1,
+                            sp_sview->base.u.tex.first_layer);
 
    /* interpolate R, G, B, A */
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1523,6 +1530,7 @@ img_filter_2d_linear(const struct sp_sampler_view *sp_sview,
 
    addr.value = 0;
    addr.bits.level = args->level;
+   addr.bits.z = sp_sview->base.u.tex.first_layer;
 
    sp_samp->linear_texcoord_s(args->s, width,  args->offset[0], &x0, &x1, &xw);
    sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
@@ -3252,10 +3260,22 @@ sp_get_texels(const struct sp_sampler_view *sp_sview,
 
    switch (sp_sview->base.target) {
    case PIPE_BUFFER:
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+         const int x = CLAMP(v_i[j] + offset[0] +
+                             sp_sview->base.u.buf.first_element,
+                             sp_sview->base.u.buf.first_element,
+                             sp_sview->base.u.buf.last_element);
+         tx = get_texel_2d_no_border(sp_sview, addr, x, 0);
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = tx[c];
+         }
+      }
+      break;
    case PIPE_TEXTURE_1D:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
          const int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
-         tx = get_texel_2d_no_border(sp_sview, addr, x, 0);
+         tx = get_texel_2d_no_border(sp_sview, addr, x,
+                                     sp_sview->base.u.tex.first_layer);
          for (c = 0; c < 4; c++) {
             rgba[c][j] = tx[c];
          }
@@ -3277,7 +3297,8 @@ sp_get_texels(const struct sp_sampler_view *sp_sview,
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
          const int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
          const int y = CLAMP(v_j[j] + offset[1], 0, height - 1);
-         tx = get_texel_2d_no_border(sp_sview, addr, x, y);
+         tx = get_texel_3d_no_border(sp_sview, addr, x, y,
+                                     sp_sview->base.u.tex.first_layer);
          for (c = 0; c < 4; c++) {
             rgba[c][j] = tx[c];
          }
@@ -3307,6 +3328,7 @@ sp_get_texels(const struct sp_sampler_view *sp_sview,
       }
       break;
    case PIPE_TEXTURE_CUBE: /* TXF can't work on CUBE according to spec */
+   case PIPE_TEXTURE_CUBE_ARRAY:
    default:
       assert(!"Unknown or CUBE texture type in TXF processing\n");
       break;
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index e1ea5df24ca..3347f5f1883 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -127,7 +127,8 @@ softpipe_can_create_resource(struct pipe_screen *screen,
  */
 static boolean
 softpipe_displaytarget_layout(struct pipe_screen *screen,
-                              struct softpipe_resource *spr)
+                              struct softpipe_resource *spr,
+                              const void *map_front_private)
 {
    struct sw_winsys *winsys = softpipe_screen(screen)->winsys;
 
@@ -139,6 +140,7 @@ softpipe_displaytarget_layout(struct pipe_screen *screen,
                                           spr->base.width0, 
                                           spr->base.height0,
                                           64,
+                                          map_front_private,
                                           &spr->stride[0] );
 
    return spr->dt != NULL;
@@ -149,8 +151,9 @@ softpipe_displaytarget_layout(struct pipe_screen *screen,
  * Create new pipe_resource given the template information.
  */
 static struct pipe_resource *
-softpipe_resource_create(struct pipe_screen *screen,
-                         const struct pipe_resource *templat)
+softpipe_resource_create_front(struct pipe_screen *screen,
+                               const struct pipe_resource *templat,
+                               const void *map_front_private)
 {
    struct softpipe_resource *spr = CALLOC_STRUCT(softpipe_resource);
    if (!spr)
@@ -169,7 +172,7 @@ softpipe_resource_create(struct pipe_screen *screen,
    if (spr->base.bind & (PIPE_BIND_DISPLAY_TARGET |
 			 PIPE_BIND_SCANOUT |
 			 PIPE_BIND_SHARED)) {
-      if (!softpipe_displaytarget_layout(screen, spr))
+      if (!softpipe_displaytarget_layout(screen, spr, map_front_private))
          goto fail;
    }
    else {
@@ -184,6 +187,12 @@ softpipe_resource_create(struct pipe_screen *screen,
    return NULL;
 }
 
+static struct pipe_resource *
+softpipe_resource_create(struct pipe_screen *screen,
+                         const struct pipe_resource *templat)
+{
+   return softpipe_resource_create_front(screen, templat, NULL);
+}
 
 static void
 softpipe_resource_destroy(struct pipe_screen *pscreen,
@@ -514,6 +523,7 @@ void
 softpipe_init_screen_texture_funcs(struct pipe_screen *screen)
 {
    screen->resource_create = softpipe_resource_create;
+   screen->resource_create_front = softpipe_resource_create_front;
    screen->resource_destroy = softpipe_resource_destroy;
    screen->resource_from_handle = softpipe_resource_from_handle;
    screen->resource_get_handle = softpipe_resource_get_handle;
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index d3cf52f08e2..0e1e332d6cb 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -1016,6 +1016,8 @@ SVGA3D_BeginDrawPrimitives(struct svga_winsys_context *swc,
    *decls = declArray;
    *ranges = rangeArray;
 
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
+
    return PIPE_OK;
 }
 
diff --git a/src/gallium/drivers/svga/svga_cmd_vgpu10.c b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
index 596ba953cd2..5c121089f91 100644
--- a/src/gallium/drivers/svga/svga_cmd_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_cmd_vgpu10.c
@@ -535,6 +535,7 @@ SVGA3D_vgpu10_Draw(struct svga_winsys_context *swc,
 
    SVGA3D_COPY_BASIC_2(vertexCount, startVertexLocation);
 
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -550,6 +551,7 @@ SVGA3D_vgpu10_DrawIndexed(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_3(indexCount, startIndexLocation,
                        baseVertexLocation);
 
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -566,6 +568,7 @@ SVGA3D_vgpu10_DrawInstanced(struct svga_winsys_context *swc,
    SVGA3D_COPY_BASIC_4(vertexCountPerInstance, instanceCount,
                        startVertexLocation, startInstanceLocation);
 
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -584,6 +587,8 @@ SVGA3D_vgpu10_DrawIndexedInstanced(struct svga_winsys_context *swc,
                        startIndexLocation, baseVertexLocation,
                        startInstanceLocation);
 
+
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
    swc->commit(swc);
    return PIPE_OK;
 }
@@ -593,6 +598,7 @@ SVGA3D_vgpu10_DrawAuto(struct svga_winsys_context *swc)
 {
    SVGA3D_CREATE_COMMAND(DrawAuto, DRAW_AUTO);
 
+   swc->hints |= SVGA_HINT_FLAG_DRAW_EMITTED;
    swc->commit(swc);
    return PIPE_OK;
 }
diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c
index 5635411d938..caf4b17de16 100644
--- a/src/gallium/drivers/svga/svga_draw_arrays.c
+++ b/src/gallium/drivers/svga/svga_draw_arrays.c
@@ -32,6 +32,7 @@
 #include "svga_draw.h"
 #include "svga_draw_private.h"
 #include "svga_context.h"
+#include "svga_shader.h"
 
 
 #define DBG 0
@@ -206,6 +207,32 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
    unsigned gen_prim, gen_size, gen_nr, gen_type;
    u_generate_func gen_func;
    enum pipe_error ret = PIPE_OK;
+   unsigned api_pv = hwtnl->api_pv;
+   struct svga_context *svga = hwtnl->svga;
+
+   if (svga->curr.rast->templ.flatshade &&
+       svga->state.hw_draw.fs->constant_color_output) {
+      /* The fragment color is a constant, not per-vertex so the whole
+       * primitive will be the same color (except for possible blending).
+       * We can ignore the current provoking vertex state and use whatever
+       * the hardware wants.
+       */
+      api_pv = hwtnl->hw_pv;
+
+      if (hwtnl->api_fillmode == PIPE_POLYGON_MODE_FILL) {
+         /* Do some simple primitive conversions to avoid index buffer
+          * generation below.  Note that polygons and quads are not directly
+          * supported by the svga device.  Also note, we can only do this
+          * for flat/constant-colored rendering because of provoking vertex.
+          */
+         if (prim == PIPE_PRIM_POLYGON) {
+            prim = PIPE_PRIM_TRIANGLE_FAN;
+         }
+         else if (prim == PIPE_PRIM_QUADS && count == 4) {
+            prim = PIPE_PRIM_TRIANGLE_FAN;
+         }
+      }
+   }
 
    if (hwtnl->api_fillmode != PIPE_POLYGON_MODE_FILL &&
        prim >= PIPE_PRIM_TRIANGLES) {
@@ -226,7 +253,7 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
                                    prim,
                                    start,
                                    count,
-                                   hwtnl->api_pv,
+                                   api_pv,
                                    hwtnl->hw_pv,
                                    &gen_prim, &gen_size, &gen_nr, &gen_func);
    }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index f6fafca5c0b..5aa7b0d86eb 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -382,6 +382,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
       return 0;
    }
 
diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h
index efcac408626..f49fdb46d0e 100644
--- a/src/gallium/drivers/svga/svga_shader.h
+++ b/src/gallium/drivers/svga/svga_shader.h
@@ -155,6 +155,9 @@ struct svga_shader_variant
                                 *  applied to any of the varyings.
                                 */
 
+   /** Is the color output just a constant value? (fragment shader only) */
+   boolean constant_color_output;
+
    /** For FS-based polygon stipple */
    unsigned pstipple_sampler_unit;
 
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index 202eee276b7..4c16f4313a0 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -240,6 +240,13 @@ svga_tgsi_vgpu9_translate(struct svga_context *svga,
 
    variant->pstipple_sampler_unit = emit.pstipple_sampler_unit;
 
+   /* If there was exactly one write to a fragment shader output register
+    * and it came from a constant buffer, we know all fragments will have
+    * the same color (except for blending).
+    */
+   variant->constant_color_output =
+      emit.constant_color_output && emit.num_output_writes == 1;
+
 #if 0
    if (!svga_shader_verify(variant->tokens, variant->nr_tokens) ||
        SVGA_DEBUG & DEBUG_TGSI) {
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 0b82483ab2e..83f0c8bd4d0 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -84,6 +84,9 @@ struct svga_shader_emitter
 
    int dynamic_branching_level;
 
+   unsigned num_output_writes;
+   boolean constant_color_output;
+
    boolean in_main_func;
 
    boolean created_common_immediate;
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 00c91a4fa61..dbb90f7654e 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -99,6 +99,7 @@ translate_dst_register( struct svga_shader_emitter *emit,
        * Need to lookup a table built at decl time:
        */
       dest = emit->output_map[reg->Register.Index];
+      emit->num_output_writes++;
       break;
 
    default:
@@ -2103,6 +2104,29 @@ emit_simple_instruction(struct svga_shader_emitter *emit,
 
 
 /**
+ * TGSI_OPCODE_MOVE is only special-cased here to detect the
+ * svga_fragment_shader::constant_color_output case.
+ */
+static boolean
+emit_mov(struct svga_shader_emitter *emit,
+         const struct tgsi_full_instruction *insn)
+{
+   const struct tgsi_full_src_register *src = &insn->Src[0];
+   const struct tgsi_full_dst_register *dst = &insn->Dst[0];
+
+   if (emit->unit == PIPE_SHADER_FRAGMENT &&
+       dst->Register.File == TGSI_FILE_OUTPUT &&
+       dst->Register.Index == 0 &&
+       src->Register.File == TGSI_FILE_CONSTANT &&
+       !src->Register.Indirect) {
+      emit->constant_color_output = TRUE;
+   }
+
+   return emit_simple_instruction(emit, SVGA3DOP_MOV, insn);
+}
+
+
+/**
  * Translate/emit TGSI DDX, DDY instructions.
  */
 static boolean
@@ -3045,6 +3069,9 @@ svga_emit_instruction(struct svga_shader_emitter *emit,
    case TGSI_OPCODE_SSG:
       return emit_ssg( emit, insn );
 
+   case TGSI_OPCODE_MOV:
+      return emit_mov( emit, insn );
+
    default:
       {
          unsigned opcode = translate_opcode(insn->Instruction.Opcode);
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index d62f2bbcc96..e70ee689c59 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -202,6 +202,9 @@ struct svga_shader_emitter_v10
    /* user clip plane constant slot indexes */
    unsigned clip_plane_const[PIPE_MAX_CLIP_PLANES];
 
+   unsigned num_output_writes;
+   boolean constant_color_output;
+
    boolean uses_flat_interp;
 
    /* For all shaders: const reg index for RECT coord scaling */
@@ -913,6 +916,8 @@ emit_dst_register(struct svga_shader_emitter_v10 *emit,
              */
             assert(sem_name == TGSI_SEMANTIC_COLOR);
             index = emit->info.output_semantic_index[index];
+
+            emit->num_output_writes++;
          }
       }
    }
@@ -3097,7 +3102,7 @@ emit_clip_distance_instructions(struct svga_shader_emitter_v10 *emit)
    unsigned i;
    unsigned clip_plane_enable = emit->key.clip_plane_enable;
    unsigned clip_dist_tmp_index = emit->clip_dist_tmp_index;
-   unsigned num_written_clipdist = emit->info.num_written_clipdistance;
+   int num_written_clipdist = emit->info.num_written_clipdistance;
 
    assert(emit->clip_dist_out_index != INVALID_INDEX);
    assert(emit->clip_dist_tmp_index != INVALID_INDEX);
@@ -3109,7 +3114,7 @@ emit_clip_distance_instructions(struct svga_shader_emitter_v10 *emit)
     */
    emit->clip_dist_tmp_index = INVALID_INDEX;
 
-   for (i = 0; i < 2 && num_written_clipdist; i++, num_written_clipdist-=4) {
+   for (i = 0; i < 2 && num_written_clipdist > 0; i++, num_written_clipdist-=4) {
 
       tmp_clip_dist_src = make_src_temp_reg(clip_dist_tmp_index + i);
 
@@ -5573,6 +5578,29 @@ emit_simple(struct svga_shader_emitter_v10 *emit,
 
 
 /**
+ * We only special case the MOV instruction to try to detect constant
+ * color writes in the fragment shader.
+ */
+static boolean
+emit_mov(struct svga_shader_emitter_v10 *emit,
+         const struct tgsi_full_instruction *inst)
+{
+   const struct tgsi_full_src_register *src = &inst->Src[0];
+   const struct tgsi_full_dst_register *dst = &inst->Dst[0];
+
+   if (emit->unit == PIPE_SHADER_FRAGMENT &&
+       dst->Register.File == TGSI_FILE_OUTPUT &&
+       dst->Register.Index == 0 &&
+       src->Register.File == TGSI_FILE_CONSTANT &&
+       !src->Register.Indirect) {
+      emit->constant_color_output = TRUE;
+   }
+
+   return emit_simple(emit, inst);
+}
+
+
+/**
  * Emit a simple VGPU10 instruction which writes to multiple dest registers,
  * where TGSI only uses one dest register.
  */
@@ -5652,7 +5680,6 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
    case TGSI_OPCODE_MAD:
    case TGSI_OPCODE_MAX:
    case TGSI_OPCODE_MIN:
-   case TGSI_OPCODE_MOV:
    case TGSI_OPCODE_MUL:
    case TGSI_OPCODE_NOP:
    case TGSI_OPCODE_NOT:
@@ -5677,7 +5704,8 @@ emit_vgpu10_instruction(struct svga_shader_emitter_v10 *emit,
       /* simple instructions */
       return emit_simple(emit, inst);
 
-
+   case TGSI_OPCODE_MOV:
+      return emit_mov(emit, inst);
    case TGSI_OPCODE_EMIT:
       return emit_vertex(emit, inst);
    case TGSI_OPCODE_ENDPRIM:
@@ -6762,6 +6790,13 @@ svga_tgsi_vgpu10_translate(struct svga_context *svga,
 
    variant->pstipple_sampler_unit = emit->fs.pstipple_sampler_unit;
 
+   /* If there was exactly one write to a fragment shader output register
+    * and it came from a constant buffer, we know all fragments will have
+    * the same color (except for blending).
+    */
+   variant->constant_color_output =
+      emit->constant_color_output && emit->num_output_writes == 1;
+
    /** keep track in the variant if flat interpolation is used
     *  for any of the varyings.
     */
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index c750603989f..3129e46ed06 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -85,6 +85,8 @@ struct winsys_handle;
 #define SVGA_QUERY_FLAG_SET        (1 << 0)
 #define SVGA_QUERY_FLAG_REF        (1 << 1)
 
+#define SVGA_HINT_FLAG_DRAW_EMITTED (1 << 0)
+
 /** Opaque surface handle */
 struct svga_winsys_surface;
 
@@ -213,6 +215,11 @@ struct svga_winsys_context
    uint32 cid;
 
    /**
+    * Flags to hint the current context state
+    */
+   uint32 hints;
+
+   /**
     ** BEGIN new functions for guest-backed surfaces.
     **/
 
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 6d748010baf..476d2b5b0b1 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -22,6 +22,7 @@
  */
 
 #include "util/u_math.h"
+#include "util/u_prim.h"
 #include "util/macros.h"
 #include "vc4_context.h"
 
@@ -163,6 +164,26 @@ dump_VC4_PACKET_LOAD_TILE_BUFFER_GENERAL(void *cl, uint32_t offset, uint32_t hw_
 }
 
 static void
+dump_VC4_PACKET_GL_INDEXED_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        uint8_t *b = cl + offset;
+        uint32_t *count = cl + offset + 1;
+        uint32_t *ib_offset = cl + offset + 5;
+        uint32_t *max_index = cl + offset + 9;
+
+        fprintf(stderr, "0x%08x 0x%08x:      0x%02x %s %s\n",
+                offset, hw_offset,
+                b[0], (b[0] & VC4_INDEX_BUFFER_U16) ? "16-bit" : "8-bit",
+                u_prim_name(b[0] & 0x7));
+        fprintf(stderr, "0x%08x 0x%08x:           %d verts\n",
+                offset + 1, hw_offset + 1, *count);
+        fprintf(stderr, "0x%08x 0x%08x:      0x%08x IB offset\n",
+                offset + 5, hw_offset + 5, *ib_offset);
+        fprintf(stderr, "0x%08x 0x%08x:      0x%08x max index\n",
+                offset + 9, hw_offset + 9, *max_index);
+}
+
+static void
 dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset)
 {
         uint32_t *bits = cl + offset;
@@ -262,14 +283,14 @@ dump_VC4_PACKET_TILE_RENDERING_MODE_CONFIG(void *cl, uint32_t offset, uint32_t h
                 shorts[1]);
 
         const char *format = "???";
-        switch ((bytes[0] >> 2) & 3) {
-        case 0:
+        switch (VC4_GET_FIELD(shorts[2], VC4_RENDER_CONFIG_FORMAT)) {
+        case VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED:
                 format = "BGR565_DITHERED";
                 break;
-        case 1:
+        case VC4_RENDER_CONFIG_FORMAT_RGBA8888:
                 format = "RGBA8888";
                 break;
-        case 2:
+        case VC4_RENDER_CONFIG_FORMAT_BGR565:
                 format = "BGR565";
                 break;
         }
@@ -277,29 +298,31 @@ dump_VC4_PACKET_TILE_RENDERING_MODE_CONFIG(void *cl, uint32_t offset, uint32_t h
                 format = "64bit";
 
         const char *tiling = "???";
-        switch ((bytes[0] >> 6) & 3) {
-        case 0:
+        switch (VC4_GET_FIELD(shorts[2], VC4_RENDER_CONFIG_MEMORY_FORMAT)) {
+        case VC4_TILING_FORMAT_LINEAR:
                 tiling = "linear";
                 break;
-        case 1:
+        case VC4_TILING_FORMAT_T:
                 tiling = "T";
                 break;
-        case 2:
+        case VC4_TILING_FORMAT_LT:
                 tiling = "LT";
                 break;
         }
 
-        fprintf(stderr, "0x%08x 0x%08x: 0x%02x %s %s %s\n",
+        fprintf(stderr, "0x%08x 0x%08x: 0x%02x %s %s %s %s\n",
                 offset + 8, hw_offset + 8,
                 bytes[0],
                 format, tiling,
-                (bytes[0] & VC4_RENDER_CONFIG_MS_MODE_4X) ? "ms" : "ss");
+                (shorts[2] & VC4_RENDER_CONFIG_MS_MODE_4X) ? "ms" : "ss",
+                (shorts[2] & VC4_RENDER_CONFIG_DECIMATE_MODE_4X) ?
+                "ms_decimate" : "ss_decimate");
 
         const char *earlyz = "";
-        if (bytes[1] & (1 << 3)) {
+        if (shorts[2] & VC4_RENDER_CONFIG_EARLY_Z_COVERAGE_DISABLE) {
                 earlyz = "early_z disabled";
         } else {
-                if (bytes[1] & (1 << 2))
+                if (shorts[2] & VC4_RENDER_CONFIG_EARLY_Z_DIRECTION_G)
                         earlyz = "early_z >";
                 else
                         earlyz = "early_z <";
@@ -356,7 +379,7 @@ static const struct packet_info {
         PACKET_DUMP(VC4_PACKET_STORE_TILE_BUFFER_GENERAL),
         PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL),
 
-        PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE),
+        PACKET_DUMP(VC4_PACKET_GL_INDEXED_PRIMITIVE),
         PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE),
 
         PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE),
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index c7698422951..86f2ce5e608 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -250,10 +250,10 @@ struct vc4_context {
         bool needs_flush;
 
         /**
-         * Set when needs_flush, and the queued rendering is not just composed
-         * of full-buffer clears.
+         * Number of draw calls (not counting full buffer clears) queued in
+         * the current job.
          */
-        bool draw_call_queued;
+        uint32_t draw_calls_queued;
 
         /** Maximum index buffer valid for the current shader_rec. */
         uint32_t max_index;
@@ -291,7 +291,10 @@ struct vc4_context {
 
         struct vc4_vertex_stateobj *vtx;
 
-        struct pipe_blend_color blend_color;
+        struct {
+                struct pipe_blend_color f;
+                uint8_t ub[4];
+        } blend_color;
         struct pipe_stencil_ref stencil_ref;
         unsigned sample_mask;
         struct pipe_framebuffer_state framebuffer;
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index a4e5e092b1a..624a236c573 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -25,6 +25,7 @@
 #include "util/u_prim.h"
 #include "util/u_format.h"
 #include "util/u_pack_color.h"
+#include "util/u_upload_mgr.h"
 #include "indices/u_primconvert.h"
 
 #include "vc4_context.h"
@@ -100,7 +101,7 @@ vc4_start_draw(struct vc4_context *vc4)
                      VC4_PRIMITIVE_LIST_FORMAT_TYPE_TRIANGLES));
 
         vc4->needs_flush = true;
-        vc4->draw_call_queued = true;
+        vc4->draw_calls_queued++;
         vc4->draw_width = width;
         vc4->draw_height = height;
 
@@ -226,6 +227,38 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *i
         vc4->max_index = max_index;
 }
 
+/**
+ * HW-2116 workaround: Flush the batch before triggering the hardware state
+ * counter wraparound behavior.
+ *
+ * State updates are tracked by a global counter which increments at the first
+ * state update after a draw or a START_BINNING.  Tiles can then have their
+ * state updated at draw time with a set of cheap checks for whether the
+ * state's copy of the global counter matches the global counter the last time
+ * that state was written to the tile.
+ *
+ * The state counters are relatively small and wrap around quickly, so you
+ * could get false negatives for needing to update a particular state in the
+ * tile.  To avoid this, the hardware attempts to write all of the state in
+ * the tile at wraparound time.  This apparently is broken, so we just flush
+ * everything before that behavior is triggered.  A batch flush is sufficient
+ * to get our current contents drawn and reset the counters to 0.
+ *
+ * Note that we can't just use VC4_PACKET_FLUSH_ALL, because that caps the
+ * tiles with VC4_PACKET_RETURN_FROM_LIST.
+ */
+static void
+vc4_hw_2116_workaround(struct pipe_context *pctx)
+{
+        struct vc4_context *vc4 = vc4_context(pctx);
+
+        if (vc4->draw_calls_queued == 0x1ef0) {
+                perf_debug("Flushing batch due to HW-2116 workaround "
+                           "(too many draw calls per scene\n");
+                vc4_flush(pctx);
+        }
+}
+
 static void
 vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 {
@@ -244,6 +277,8 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
         vc4_update_shadow_textures(pctx, &vc4->verttex);
         vc4_update_shadow_textures(pctx, &vc4->fragtex);
 
+        vc4_hw_2116_workaround(pctx);
+
         vc4_get_draw_cl_space(vc4);
 
         if (vc4->prim_mode != info->mode) {
@@ -285,7 +320,15 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                                                            info->count, &offset);
                         index_size = 2;
                 } else {
-                        prsc = vc4->indexbuf.buffer;
+                        if (vc4->indexbuf.user_buffer) {
+                                prsc = NULL;
+                                u_upload_data(vc4->uploader, 0,
+                                              info->count * index_size,
+                                              vc4->indexbuf.user_buffer,
+                                              &offset, &prsc);
+                        } else {
+                                prsc = vc4->indexbuf.buffer;
+                        }
                 }
                 struct vc4_resource *rsc = vc4_resource(prsc);
 
@@ -300,7 +343,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 cl_reloc(vc4, &vc4->bcl, &bcl, rsc->bo, offset);
                 cl_u32(&bcl, vc4->max_index);
 
-                if (vc4->indexbuf.index_size == 4)
+                if (vc4->indexbuf.index_size == 4 || vc4->indexbuf.user_buffer)
                         pipe_resource_reference(&prsc, NULL);
         } else {
                 cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
@@ -343,8 +386,8 @@ vc4_clear(struct pipe_context *pctx, unsigned buffers,
         /* We can't flag new buffers for clearing once we've queued draws.  We
          * could avoid this by using the 3d engine to clear.
          */
-        if (vc4->draw_call_queued) {
-                perf_debug("Flushing rendering to process new clear.");
+        if (vc4->draw_calls_queued) {
+                perf_debug("Flushing rendering to process new clear.\n");
                 vc4_flush(pctx);
         }
 
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index 7ebd9f160eb..9ad79c2ea10 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -55,7 +55,7 @@ vc4_job_reset(struct vc4_context *vc4)
         vc4->shader_rec_count = 0;
 
         vc4->needs_flush = false;
-        vc4->draw_call_queued = false;
+        vc4->draw_calls_queued = 0;
 
         /* We have no hardware context saved between our draw calls, so we
          * need to flag the next draw as needing all state emitted.  Emitting
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index 17b524653bb..373c9e12d11 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -86,11 +86,11 @@ vc4_nir_srgb_encode(nir_builder *b, nir_ssa_def *linear)
 }
 
 static nir_ssa_def *
-vc4_blend_channel(nir_builder *b,
-                  nir_ssa_def **src,
-                  nir_ssa_def **dst,
-                  unsigned factor,
-                  int channel)
+vc4_blend_channel_f(nir_builder *b,
+                    nir_ssa_def **src,
+                    nir_ssa_def **dst,
+                    unsigned factor,
+                    int channel)
 {
         switch(factor) {
         case PIPE_BLENDFACTOR_ONE:
@@ -146,8 +146,75 @@ vc4_blend_channel(nir_builder *b,
 }
 
 static nir_ssa_def *
-vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
-               unsigned func)
+vc4_nir_set_packed_chan(nir_builder *b, nir_ssa_def *src0, nir_ssa_def *src1,
+                        int chan)
+{
+        unsigned chan_mask = 0xff << (chan * 8);
+        return nir_ior(b,
+                       nir_iand(b, src0, nir_imm_int(b, ~chan_mask)),
+                       nir_iand(b, src1, nir_imm_int(b, chan_mask)));
+}
+
+static nir_ssa_def *
+vc4_blend_channel_i(nir_builder *b,
+                    nir_ssa_def *src,
+                    nir_ssa_def *dst,
+                    nir_ssa_def *src_a,
+                    nir_ssa_def *dst_a,
+                    unsigned factor,
+                    int a_chan)
+{
+        switch (factor) {
+        case PIPE_BLENDFACTOR_ONE:
+                return nir_imm_int(b, ~0);
+        case PIPE_BLENDFACTOR_SRC_COLOR:
+                return src;
+        case PIPE_BLENDFACTOR_SRC_ALPHA:
+                return src_a;
+        case PIPE_BLENDFACTOR_DST_ALPHA:
+                return dst_a;
+        case PIPE_BLENDFACTOR_DST_COLOR:
+                return dst;
+        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+                return vc4_nir_set_packed_chan(b,
+                                               nir_umin_4x8(b,
+                                                            src_a,
+                                                            nir_inot(b, dst_a)),
+                                               nir_imm_int(b, ~0),
+                                               a_chan);
+        case PIPE_BLENDFACTOR_CONST_COLOR:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_RGBA);
+        case PIPE_BLENDFACTOR_CONST_ALPHA:
+                return vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_AAAA);
+        case PIPE_BLENDFACTOR_ZERO:
+                return nir_imm_int(b, 0);
+        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+                return nir_inot(b, src);
+        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+                return nir_inot(b, src_a);
+        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+                return nir_inot(b, dst_a);
+        case PIPE_BLENDFACTOR_INV_DST_COLOR:
+                return nir_inot(b, dst);
+        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+                return nir_inot(b, vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_RGBA));
+        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+                return nir_inot(b, vc4_nir_get_state_uniform(b, QUNIFORM_BLEND_CONST_COLOR_AAAA));
+
+        default:
+        case PIPE_BLENDFACTOR_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_SRC1_ALPHA:
+        case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+        case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend factor %d\n", factor);
+                return nir_imm_int(b, ~0);
+        }
+}
+
+static nir_ssa_def *
+vc4_blend_func_f(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
+                 unsigned func)
 {
         switch (func) {
         case PIPE_BLEND_ADD:
@@ -169,9 +236,33 @@ vc4_blend_func(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
         }
 }
 
+static nir_ssa_def *
+vc4_blend_func_i(nir_builder *b, nir_ssa_def *src, nir_ssa_def *dst,
+                 unsigned func)
+{
+        switch (func) {
+        case PIPE_BLEND_ADD:
+                return nir_usadd_4x8(b, src, dst);
+        case PIPE_BLEND_SUBTRACT:
+                return nir_ussub_4x8(b, src, dst);
+        case PIPE_BLEND_REVERSE_SUBTRACT:
+                return nir_ussub_4x8(b, dst, src);
+        case PIPE_BLEND_MIN:
+                return nir_umin_4x8(b, src, dst);
+        case PIPE_BLEND_MAX:
+                return nir_umax_4x8(b, src, dst);
+
+        default:
+                /* Unsupported. */
+                fprintf(stderr, "Unknown blend func %d\n", func);
+                return src;
+
+        }
+}
+
 static void
-vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result,
-                nir_ssa_def **src_color, nir_ssa_def **dst_color)
+vc4_do_blending_f(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result,
+                  nir_ssa_def **src_color, nir_ssa_def **dst_color)
 {
         struct pipe_rt_blend_state *blend = &c->fs_key->blend;
 
@@ -192,20 +283,106 @@ vc4_do_blending(struct vc4_compile *c, nir_builder *b, nir_ssa_def **result,
                 int dst_factor = ((i != 3) ? blend->rgb_dst_factor :
                                   blend->alpha_dst_factor);
                 src_blend[i] = nir_fmul(b, src_color[i],
-                                        vc4_blend_channel(b,
-                                                          src_color, dst_color,
-                                                          src_factor, i));
+                                        vc4_blend_channel_f(b,
+                                                            src_color, dst_color,
+                                                            src_factor, i));
                 dst_blend[i] = nir_fmul(b, dst_color[i],
-                                        vc4_blend_channel(b,
-                                                          src_color, dst_color,
-                                                          dst_factor, i));
+                                        vc4_blend_channel_f(b,
+                                                            src_color, dst_color,
+                                                            dst_factor, i));
         }
 
         for (int i = 0; i < 4; i++) {
-                result[i] = vc4_blend_func(b, src_blend[i], dst_blend[i],
-                                           ((i != 3) ? blend->rgb_func :
-                                            blend->alpha_func));
+                result[i] = vc4_blend_func_f(b, src_blend[i], dst_blend[i],
+                                             ((i != 3) ? blend->rgb_func :
+                                              blend->alpha_func));
+        }
+}
+
+static nir_ssa_def *
+vc4_nir_splat(nir_builder *b, nir_ssa_def *src)
+{
+        nir_ssa_def *or1 = nir_ior(b, src, nir_ishl(b, src, nir_imm_int(b, 8)));
+        return nir_ior(b, or1, nir_ishl(b, or1, nir_imm_int(b, 16)));
+}
+
+static nir_ssa_def *
+vc4_do_blending_i(struct vc4_compile *c, nir_builder *b,
+                  nir_ssa_def *src_color, nir_ssa_def *dst_color,
+                  nir_ssa_def *src_float_a)
+{
+        struct pipe_rt_blend_state *blend = &c->fs_key->blend;
+
+        if (!blend->blend_enable)
+                return src_color;
+
+        enum pipe_format color_format = c->fs_key->color_format;
+        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
+        nir_ssa_def *imm_0xff = nir_imm_int(b, 0xff);
+        nir_ssa_def *src_a = nir_pack_unorm_4x8(b, src_float_a);
+        nir_ssa_def *dst_a;
+        int alpha_chan;
+        for (alpha_chan = 0; alpha_chan < 4; alpha_chan++) {
+                if (format_swiz[alpha_chan] == 3)
+                        break;
+        }
+        if (alpha_chan != 4) {
+                nir_ssa_def *shift = nir_imm_int(b, alpha_chan * 8);
+                dst_a = vc4_nir_splat(b, nir_iand(b, nir_ushr(b, dst_color,
+                                                              shift), imm_0xff));
+        } else {
+                dst_a = nir_imm_int(b, ~0);
+        }
+
+        nir_ssa_def *src_factor = vc4_blend_channel_i(b,
+                                                      src_color, dst_color,
+                                                      src_a, dst_a,
+                                                      blend->rgb_src_factor,
+                                                      alpha_chan);
+        nir_ssa_def *dst_factor = vc4_blend_channel_i(b,
+                                                      src_color, dst_color,
+                                                      src_a, dst_a,
+                                                      blend->rgb_dst_factor,
+                                                      alpha_chan);
+
+        if (alpha_chan != 4 &&
+            blend->alpha_src_factor != blend->rgb_src_factor) {
+                nir_ssa_def *src_alpha_factor =
+                        vc4_blend_channel_i(b,
+                                            src_color, dst_color,
+                                            src_a, dst_a,
+                                            blend->alpha_src_factor,
+                                            alpha_chan);
+                src_factor = vc4_nir_set_packed_chan(b, src_factor,
+                                                     src_alpha_factor,
+                                                     alpha_chan);
+        }
+        if (alpha_chan != 4 &&
+            blend->alpha_dst_factor != blend->rgb_dst_factor) {
+                nir_ssa_def *dst_alpha_factor =
+                        vc4_blend_channel_i(b,
+                                            src_color, dst_color,
+                                            src_a, dst_a,
+                                            blend->alpha_dst_factor,
+                                            alpha_chan);
+                dst_factor = vc4_nir_set_packed_chan(b, dst_factor,
+                                                     dst_alpha_factor,
+                                                     alpha_chan);
+        }
+        nir_ssa_def *src_blend = nir_umul_unorm_4x8(b, src_color, src_factor);
+        nir_ssa_def *dst_blend = nir_umul_unorm_4x8(b, dst_color, dst_factor);
+
+        nir_ssa_def *result =
+                vc4_blend_func_i(b, src_blend, dst_blend, blend->rgb_func);
+        if (alpha_chan != 4 && blend->alpha_func != blend->rgb_func) {
+                nir_ssa_def *result_a = vc4_blend_func_i(b,
+                                                         src_blend,
+                                                         dst_blend,
+                                                         blend->alpha_func);
+                result = vc4_nir_set_packed_chan(b, result, result_a,
+                                                 alpha_chan);
         }
+        return result;
 }
 
 static nir_ssa_def *
@@ -299,12 +476,33 @@ vc4_nir_emit_alpha_test_discard(struct vc4_compile *c, nir_builder *b,
         nir_builder_instr_insert(b, &discard->instr);
 }
 
+static nir_ssa_def *
+vc4_nir_swizzle_and_pack(struct vc4_compile *c, nir_builder *b,
+                         nir_ssa_def **colors)
+{
+        enum pipe_format color_format = c->fs_key->color_format;
+        const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
+
+        nir_ssa_def *swizzled[4];
+        for (int i = 0; i < 4; i++) {
+                swizzled[i] = vc4_nir_get_swizzled_channel(b, colors,
+                                                           format_swiz[i]);
+        }
+
+        return nir_pack_unorm_4x8(b,
+                                  nir_vec4(b,
+                                           swizzled[0], swizzled[1],
+                                           swizzled[2], swizzled[3]));
+
+}
+
 static void
 vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
                           nir_intrinsic_instr *intr)
 {
         enum pipe_format color_format = c->fs_key->color_format;
         const uint8_t *format_swiz = vc4_get_format_swizzle(color_format);
+        bool srgb = util_format_is_srgb(color_format);
 
         /* Pull out the float src/dst color components. */
         nir_ssa_def *packed_dst_color = vc4_nir_get_dst_color(b);
@@ -315,45 +513,39 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
                 unpacked_dst_color[i] = nir_swizzle(b, dst_vec4, &i, 1, false);
         }
 
-        /* Unswizzle the destination color. */
-        nir_ssa_def *dst_color[4];
-        for (unsigned i = 0; i < 4; i++) {
-                dst_color[i] = vc4_nir_get_swizzled_channel(b,
-                                                            unpacked_dst_color,
-                                                            format_swiz[i]);
-        }
-
         vc4_nir_emit_alpha_test_discard(c, b, src_color[3]);
 
-        /* Turn dst color to linear. */
-        if (util_format_is_srgb(color_format)) {
+        nir_ssa_def *packed_color;
+        if (srgb) {
+                /* Unswizzle the destination color. */
+                nir_ssa_def *dst_color[4];
+                for (unsigned i = 0; i < 4; i++) {
+                        dst_color[i] = vc4_nir_get_swizzled_channel(b,
+                                                                    unpacked_dst_color,
+                                                                    format_swiz[i]);
+                }
+
+                /* Turn dst color to linear. */
                 for (int i = 0; i < 3; i++)
                         dst_color[i] = vc4_nir_srgb_decode(b, dst_color[i]);
-        }
 
-        nir_ssa_def *blend_color[4];
-        vc4_do_blending(c, b, blend_color, src_color, dst_color);
+                nir_ssa_def *blend_color[4];
+                vc4_do_blending_f(c, b, blend_color, src_color, dst_color);
 
-        /* sRGB encode the output color */
-        if (util_format_is_srgb(color_format)) {
+                /* sRGB encode the output color */
                 for (int i = 0; i < 3; i++)
                         blend_color[i] = vc4_nir_srgb_encode(b, blend_color[i]);
-        }
 
-        nir_ssa_def *swizzled_outputs[4];
-        for (int i = 0; i < 4; i++) {
-                swizzled_outputs[i] =
-                        vc4_nir_get_swizzled_channel(b, blend_color,
-                                                     format_swiz[i]);
-        }
+                packed_color = vc4_nir_swizzle_and_pack(c, b, blend_color);
+        } else {
+                nir_ssa_def *packed_src_color =
+                        vc4_nir_swizzle_and_pack(c, b, src_color);
 
-        nir_ssa_def *packed_color =
-                nir_pack_unorm_4x8(b,
-                                   nir_vec4(b,
-                                            swizzled_outputs[0],
-                                            swizzled_outputs[1],
-                                            swizzled_outputs[2],
-                                            swizzled_outputs[3]));
+                packed_color =
+                        vc4_do_blending_i(c, b,
+                                          packed_src_color, packed_dst_color,
+                                          src_color[3]);
+        }
 
         packed_color = vc4_logicop(b, c->fs_key->logicop_func,
                                    packed_color, packed_dst_color);
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index caf706aa2a6..7ea263afb68 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -406,6 +406,7 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
 
         case nir_intrinsic_load_uniform:
         case nir_intrinsic_load_uniform_indirect:
+        case nir_intrinsic_load_user_clip_plane:
                 vc4_nir_lower_uniform(c, b, intr);
                 break;
 
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index 5b435832b92..f1bab810eff 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -64,6 +64,7 @@ is_constant_value(struct vc4_compile *c, struct qreg reg,
                   uint32_t val)
 {
         if (reg.file == QFILE_UNIF &&
+            !reg.pack &&
             c->uniform_contents[reg.index] == QUNIFORM_CONSTANT &&
             c->uniform_data[reg.index] == val) {
                 return true;
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index fd2539aed95..0eee5c34e1d 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -41,34 +41,77 @@ qir_opt_copy_propagation(struct vc4_compile *c)
         bool debug = false;
 
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
-                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
-                        int index = inst->src[i].index;
-                        if (inst->src[i].file == QFILE_TEMP &&
-                            c->defs[index] &&
-                            c->defs[index]->op == QOP_MOV &&
-                            (c->defs[index]->src[0].file == QFILE_TEMP ||
-                             c->defs[index]->src[0].file == QFILE_UNIF)) {
-                                /* If it has a pack, it shouldn't be an SSA
-                                 * def.
+                int nsrc = qir_get_op_nsrc(inst->op);
+                for (int i = 0; i < nsrc; i++) {
+                        if (inst->src[i].file != QFILE_TEMP)
+                                continue;
+
+                        struct qinst *mov = c->defs[inst->src[i].index];
+                        if (!mov ||
+                            (mov->op != QOP_MOV &&
+                             mov->op != QOP_FMOV &&
+                             mov->op != QOP_MMOV)) {
+                                continue;
+                        }
+
+                        if (mov->src[0].file != QFILE_TEMP &&
+                            mov->src[0].file != QFILE_UNIF) {
+                                continue;
+                        }
+
+                        if (mov->dst.pack)
+                                continue;
+
+                        uint8_t unpack;
+                        if (mov->src[0].pack) {
+                                /* Make sure that the meaning of the unpack
+                                 * would be the same between the two
+                                 * instructions.
                                  */
-                                assert(!c->defs[index]->dst.pack);
+                                if (qir_is_float_input(inst) !=
+                                    qir_is_float_input(mov)) {
+                                        continue;
+                                }
 
-                                if (debug) {
-                                        fprintf(stderr, "Copy propagate: ");
-                                        qir_dump_inst(c, inst);
-                                        fprintf(stderr, "\n");
+                                /* There's only one unpack field, so make sure
+                                 * this instruction doesn't already use it.
+                                 */
+                                bool already_has_unpack = false;
+                                for (int j = 0; j < nsrc; j++) {
+                                        if (inst->src[j].pack)
+                                                already_has_unpack = true;
                                 }
+                                if (already_has_unpack)
+                                        continue;
 
-                                inst->src[i] = c->defs[index]->src[0];
+                                /* A destination pack requires the PM bit to
+                                 * be set to a specific value already, which
+                                 * may be different from ours.
+                                 */
+                                if (inst->dst.pack)
+                                        continue;
 
-                                if (debug) {
-                                        fprintf(stderr, "to: ");
-                                        qir_dump_inst(c, inst);
-                                        fprintf(stderr, "\n");
-                                }
+                                unpack = mov->src[0].pack;
+                        } else {
+                                unpack = inst->src[i].pack;
+                        }
 
-                                progress = true;
+                        if (debug) {
+                                fprintf(stderr, "Copy propagate: ");
+                                qir_dump_inst(c, inst);
+                                fprintf(stderr, "\n");
                         }
+
+                        inst->src[i] = mov->src[0];
+                        inst->src[i].pack = unpack;
+
+                        if (debug) {
+                                fprintf(stderr, "to: ");
+                                qir_dump_inst(c, inst);
+                                fprintf(stderr, "\n");
+                        }
+
+                        progress = true;
                 }
         }
         return progress;
diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c
index 0e5480ea781..8b4d429074c 100644
--- a/src/gallium/drivers/vc4/vc4_opt_cse.c
+++ b/src/gallium/drivers/vc4/vc4_opt_cse.c
@@ -65,6 +65,7 @@ vc4_find_cse(struct vc4_compile *c, struct hash_table *ht,
              struct qinst *inst, uint32_t sf_count)
 {
         if (inst->dst.file != QFILE_TEMP ||
+            !c->defs[inst->dst.index] ||
             inst->op == QOP_MOV ||
             qir_get_op_nsrc(inst->op) > 4) {
                 return NULL;
diff --git a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
index d6e98f0aebf..e61562171aa 100644
--- a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
+++ b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
@@ -56,6 +56,7 @@ qir_opt_small_immediates(struct vc4_compile *c)
                         struct qreg src = qir_follow_movs(c, inst->src[i]);
 
                         if (src.file != QFILE_UNIF ||
+                            src.pack ||
                             c->uniform_contents[src.index] !=
                             QUNIFORM_CONSTANT) {
                                 continue;
@@ -72,9 +73,6 @@ qir_opt_small_immediates(struct vc4_compile *c)
                                 continue;
                         }
 
-                        if (qir_src_needs_a_file(inst))
-                                continue;
-
                         uint32_t imm = c->uniform_data[src.index];
                         uint32_t small_imm = qpu_encode_small_immediate(imm);
                         if (small_imm == ~0)
diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
index f2cdf8f694f..73ded766db9 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
@@ -58,7 +58,7 @@ qir_opt_vpm_writes(struct vc4_compile *c)
         }
 
         for (int i = 0; i < vpm_write_count; i++) {
-                if (vpm_writes[i]->op != QOP_MOV ||
+                if (!qir_is_raw_mov(vpm_writes[i]) ||
                     vpm_writes[i]->src[0].file != QFILE_TEMP) {
                         continue;
                 }
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 6e9ec6530c6..a48dad804e2 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -738,6 +738,20 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
                 vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
         }
 
+        /* If the pack is replicating the same channel 4 times, use the 8888
+         * pack flag.  This is common for blending using the alpha
+         * channel.
+         */
+        if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
+            instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
+            instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
+                struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
+                *dest = qir_PACK_8888_F(c,
+                                        ntq_get_src(c, instr->src[0].src,
+                                                    instr->src[0].swizzle[0]));
+                return;
+        }
+
         for (int i = 0; i < 4; i++) {
                 int swiz = instr->src[0].swizzle[i];
                 struct qreg src;
@@ -1040,41 +1054,37 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
                 *dest = ntq_emit_ubfe(c, src[0], src[1], src[2]);
                 break;
 
-        default:
-                fprintf(stderr, "unknown NIR ALU inst: ");
-                nir_print_instr(&instr->instr, stderr);
-                fprintf(stderr, "\n");
-                abort();
-        }
-}
+        case nir_op_usadd_4x8:
+                *dest = qir_V8ADDS(c, src[0], src[1]);
+                break;
 
-static void
-clip_distance_discard(struct vc4_compile *c)
-{
-        for (int i = 0; i < PIPE_MAX_CLIP_PLANES; i++) {
-                if (!(c->key->ucp_enables & (1 << i)))
-                        continue;
+        case nir_op_ussub_4x8:
+                *dest = qir_V8SUBS(c, src[0], src[1]);
+                break;
 
-                struct qreg dist =
-                        emit_fragment_varying(c,
-                                              VARYING_SLOT_CLIP_DIST0 + (i / 4),
-                                              i % 4);
+        case nir_op_umin_4x8:
+                *dest = qir_V8MIN(c, src[0], src[1]);
+                break;
 
-                qir_SF(c, dist);
+        case nir_op_umax_4x8:
+                *dest = qir_V8MAX(c, src[0], src[1]);
+                break;
 
-                if (c->discard.file == QFILE_NULL)
-                        c->discard = qir_uniform_ui(c, 0);
+        case nir_op_umul_unorm_4x8:
+                *dest = qir_V8MULD(c, src[0], src[1]);
+                break;
 
-                c->discard = qir_SEL_X_Y_NS(c, qir_uniform_ui(c, ~0),
-                                            c->discard);
+        default:
+                fprintf(stderr, "unknown NIR ALU inst: ");
+                nir_print_instr(&instr->instr, stderr);
+                fprintf(stderr, "\n");
+                abort();
         }
 }
 
 static void
 emit_frag_end(struct vc4_compile *c)
 {
-        clip_distance_discard(c);
-
         struct qreg color;
         if (c->output_color_index != -1) {
                 color = c->outputs[c->output_color_index];
@@ -1190,45 +1200,6 @@ emit_stub_vpm_read(struct vc4_compile *c)
 }
 
 static void
-emit_ucp_clipdistance(struct vc4_compile *c)
-{
-        unsigned cv;
-        if (c->output_clipvertex_index != -1)
-                cv = c->output_clipvertex_index;
-        else if (c->output_position_index != -1)
-                cv = c->output_position_index;
-        else
-                return;
-
-        for (int plane = 0; plane < PIPE_MAX_CLIP_PLANES; plane++) {
-                if (!(c->key->ucp_enables & (1 << plane)))
-                        continue;
-
-                /* Pick the next outputs[] that hasn't been written to, since
-                 * there are no other program writes left to be processed at
-                 * this point.  If something had been declared but not written
-                 * (like a w component), we'll just smash over the top of it.
-                 */
-                uint32_t output_index = c->num_outputs++;
-                add_output(c, output_index,
-                           VARYING_SLOT_CLIP_DIST0 + plane / 4,
-                           plane % 4);
-
-
-                struct qreg dist = qir_uniform_f(c, 0.0);
-                for (int i = 0; i < 4; i++) {
-                        struct qreg pos_chan = c->outputs[cv + i];
-                        struct qreg ucp =
-                                qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
-                                            plane * 4 + i);
-                        dist = qir_FADD(c, dist, qir_FMUL(c, pos_chan, ucp));
-                }
-
-                c->outputs[output_index] = dist;
-        }
-}
-
-static void
 emit_vert_end(struct vc4_compile *c,
               struct vc4_varying_slot *fs_inputs,
               uint32_t num_fs_inputs)
@@ -1236,7 +1207,6 @@ emit_vert_end(struct vc4_compile *c,
         struct qreg rcp_w = qir_RCP(c, c->outputs[c->output_position_index + 3]);
 
         emit_stub_vpm_read(c);
-        emit_ucp_clipdistance(c);
 
         emit_scaled_viewport_write(c, rcp_w);
         emit_zs_write(c, rcp_w);
@@ -1391,9 +1361,6 @@ ntq_setup_outputs(struct vc4_compile *c)
                         case VARYING_SLOT_POS:
                                 c->output_position_index = loc;
                                 break;
-                        case VARYING_SLOT_CLIP_VERTEX:
-                                c->output_clipvertex_index = loc;
-                                break;
                         case VARYING_SLOT_PSIZ:
                                 c->output_point_size_index = loc;
                                 break;
@@ -1486,6 +1453,11 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
 
                 break;
 
+        case nir_intrinsic_load_user_clip_plane:
+                *dest = qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
+                                    instr->const_index[0]);
+                break;
+
         case nir_intrinsic_load_input:
                 assert(instr->num_components == 1);
                 if (instr->const_index[0] == VC4_NIR_TLB_COLOR_READ_INPUT) {
@@ -1683,10 +1655,18 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         c->s = tgsi_to_nir(tokens, &nir_options);
         nir_opt_global_to_local(c->s);
         nir_convert_to_ssa(c->s);
+
         if (stage == QSTAGE_FRAG)
                 vc4_nir_lower_blend(c);
+
         if (c->fs_key && c->fs_key->light_twoside)
                 nir_lower_two_sided_color(c->s);
+
+        if (stage == QSTAGE_FRAG)
+                nir_lower_clip_fs(c->s, c->key->ucp_enables);
+        else
+                nir_lower_clip_vs(c->s, c->key->ucp_enables);
+
         vc4_nir_lower_io(c);
         nir_lower_idiv(c->s);
         nir_lower_load_const_to_scalar(c->s);
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index e385fbb65ae..7894b081b19 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -36,10 +36,17 @@ struct qir_op_info {
 
 static const struct qir_op_info qir_op_info[] = {
         [QOP_MOV] = { "mov", 1, 1 },
+        [QOP_FMOV] = { "fmov", 1, 1 },
+        [QOP_MMOV] = { "mmov", 1, 1 },
         [QOP_FADD] = { "fadd", 1, 2 },
         [QOP_FSUB] = { "fsub", 1, 2 },
         [QOP_FMUL] = { "fmul", 1, 2 },
         [QOP_MUL24] = { "mul24", 1, 2 },
+        [QOP_V8MULD] = {"v8muld", 1, 2 },
+        [QOP_V8MIN] = {"v8min", 1, 2 },
+        [QOP_V8MAX] = {"v8max", 1, 2 },
+        [QOP_V8ADDS] = {"v8adds", 1, 2 },
+        [QOP_V8SUBS] = {"v8subs", 1, 2 },
         [QOP_FMIN] = { "fmin", 1, 2 },
         [QOP_FMAX] = { "fmax", 1, 2 },
         [QOP_FMINABS] = { "fminabs", 1, 2 },
@@ -71,11 +78,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_RSQ] = { "rsq", 1, 1, false, true },
         [QOP_EXP2] = { "exp2", 1, 2, false, true },
         [QOP_LOG2] = { "log2", 1, 2, false, true },
-        [QOP_PACK_8888_F] = { "pack_8888_f", 1, 1 },
-        [QOP_PACK_8A_F] = { "pack_8a_f", 1, 1 },
-        [QOP_PACK_8B_F] = { "pack_8b_f", 1, 1 },
-        [QOP_PACK_8C_F] = { "pack_8c_f", 1, 1 },
-        [QOP_PACK_8D_F] = { "pack_8d_f", 1, 1 },
         [QOP_TLB_DISCARD_SETUP] = { "discard", 0, 1, true },
         [QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true },
         [QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true },
@@ -95,18 +97,6 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_TEX_B] = { "tex_b", 0, 2 },
         [QOP_TEX_DIRECT] = { "tex_direct", 0, 2 },
         [QOP_TEX_RESULT] = { "tex_result", 1, 0, true },
-        [QOP_UNPACK_8A_F] = { "unpack_8a_f", 1, 1 },
-        [QOP_UNPACK_8B_F] = { "unpack_8b_f", 1, 1 },
-        [QOP_UNPACK_8C_F] = { "unpack_8c_f", 1, 1 },
-        [QOP_UNPACK_8D_F] = { "unpack_8d_f", 1, 1 },
-        [QOP_UNPACK_16A_F] = { "unpack_16a_f", 1, 1 },
-        [QOP_UNPACK_16B_F] = { "unpack_16b_f", 1, 1 },
-        [QOP_UNPACK_8A_I] = { "unpack_8a_i", 1, 1 },
-        [QOP_UNPACK_8B_I] = { "unpack_8b_i", 1, 1 },
-        [QOP_UNPACK_8C_I] = { "unpack_8c_i", 1, 1 },
-        [QOP_UNPACK_8D_I] = { "unpack_8d_i", 1, 1 },
-        [QOP_UNPACK_16A_I] = { "unpack_16a_i", 1, 1 },
-        [QOP_UNPACK_16B_I] = { "unpack_16b_i", 1, 1 },
 };
 
 static const char *
@@ -171,8 +161,14 @@ bool
 qir_is_mul(struct qinst *inst)
 {
         switch (inst->op) {
+        case QOP_MMOV:
         case QOP_FMUL:
         case QOP_MUL24:
+        case QOP_V8MULD:
+        case QOP_V8MIN:
+        case QOP_V8MAX:
+        case QOP_V8ADDS:
+        case QOP_V8SUBS:
                 return true;
         default:
                 return false;
@@ -180,6 +176,35 @@ qir_is_mul(struct qinst *inst)
 }
 
 bool
+qir_is_float_input(struct qinst *inst)
+{
+        switch (inst->op) {
+        case QOP_FMOV:
+        case QOP_FMUL:
+        case QOP_FADD:
+        case QOP_FSUB:
+        case QOP_FMIN:
+        case QOP_FMAX:
+        case QOP_FMINABS:
+        case QOP_FMAXABS:
+        case QOP_FTOI:
+                return true;
+        default:
+                return false;
+        }
+}
+
+bool
+qir_is_raw_mov(struct qinst *inst)
+{
+        return ((inst->op == QOP_MOV ||
+                 inst->op == QOP_FMOV ||
+                 inst->op == QOP_MMOV) &&
+                !inst->dst.pack &&
+                !inst->src[0].pack);
+}
+
+bool
 qir_is_tex(struct qinst *inst)
 {
         return inst->op >= QOP_TEX_S && inst->op <= QOP_TEX_DIRECT;
@@ -204,28 +229,6 @@ qir_depends_on_flags(struct qinst *inst)
 }
 
 bool
-qir_src_needs_a_file(struct qinst *inst)
-{
-        switch (inst->op) {
-        case QOP_UNPACK_8A_F:
-        case QOP_UNPACK_8B_F:
-        case QOP_UNPACK_8C_F:
-        case QOP_UNPACK_8D_F:
-        case QOP_UNPACK_16A_F:
-        case QOP_UNPACK_16B_F:
-        case QOP_UNPACK_8A_I:
-        case QOP_UNPACK_8B_I:
-        case QOP_UNPACK_8C_I:
-        case QOP_UNPACK_8D_I:
-        case QOP_UNPACK_16A_I:
-        case QOP_UNPACK_16B_I:
-                return true;
-        default:
-                return false;
-        }
-}
-
-bool
 qir_writes_r4(struct qinst *inst)
 {
         switch (inst->op) {
@@ -295,6 +298,7 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
         for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                 fprintf(stderr, ", ");
                 qir_print_reg(c, inst->src[i], false);
+                vc4_qpu_disasm_unpack(stderr, inst->src[i].pack);
         }
 }
 
@@ -385,7 +389,6 @@ qir_compile_init(void)
         list_inithead(&c->instructions);
 
         c->output_position_index = -1;
-        c->output_clipvertex_index = -1;
         c->output_color_index = -1;
         c->output_point_size_index = -1;
 
@@ -411,7 +414,8 @@ qir_follow_movs(struct vc4_compile *c, struct qreg reg)
 {
         while (reg.file == QFILE_TEMP &&
                c->defs[reg.index] &&
-               c->defs[reg.index]->op == QOP_MOV) {
+               c->defs[reg.index]->op == QOP_MOV &&
+               !c->defs[reg.index]->dst.pack) {
                 reg = c->defs[reg.index]->src[0];
         }
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index ddde96db6b4..a92ad93ee07 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -37,6 +37,7 @@
 #include "util/u_math.h"
 
 #include "vc4_screen.h"
+#include "vc4_qpu_defines.h"
 #include "pipe/p_state.h"
 
 struct nir_builder;
@@ -64,9 +65,16 @@ struct qreg {
 enum qop {
         QOP_UNDEF,
         QOP_MOV,
+        QOP_FMOV,
+        QOP_MMOV,
         QOP_FADD,
         QOP_FSUB,
         QOP_FMUL,
+        QOP_V8MULD,
+        QOP_V8MIN,
+        QOP_V8MAX,
+        QOP_V8ADDS,
+        QOP_V8SUBS,
         QOP_MUL24,
         QOP_FMIN,
         QOP_FMAX,
@@ -105,11 +113,6 @@ enum qop {
         QOP_LOG2,
         QOP_VW_SETUP,
         QOP_VR_SETUP,
-        QOP_PACK_8888_F,
-        QOP_PACK_8A_F,
-        QOP_PACK_8B_F,
-        QOP_PACK_8C_F,
-        QOP_PACK_8D_F,
         QOP_TLB_DISCARD_SETUP,
         QOP_TLB_STENCIL_SETUP,
         QOP_TLB_Z_WRITE,
@@ -123,20 +126,6 @@ enum qop {
         QOP_FRAG_W,
         QOP_FRAG_REV_FLAG,
 
-        QOP_UNPACK_8A_F,
-        QOP_UNPACK_8B_F,
-        QOP_UNPACK_8C_F,
-        QOP_UNPACK_8D_F,
-        QOP_UNPACK_16A_F,
-        QOP_UNPACK_16B_F,
-
-        QOP_UNPACK_8A_I,
-        QOP_UNPACK_8B_I,
-        QOP_UNPACK_8C_I,
-        QOP_UNPACK_8D_I,
-        QOP_UNPACK_16A_I,
-        QOP_UNPACK_16B_I,
-
         /** Texture x coordinate parameter write */
         QOP_TEX_S,
         /** Texture y coordinate parameter write */
@@ -248,6 +237,8 @@ enum quniform_contents {
         QUNIFORM_BLEND_CONST_COLOR_Y,
         QUNIFORM_BLEND_CONST_COLOR_Z,
         QUNIFORM_BLEND_CONST_COLOR_W,
+        QUNIFORM_BLEND_CONST_COLOR_RGBA,
+        QUNIFORM_BLEND_CONST_COLOR_AAAA,
 
         QUNIFORM_STENCIL,
 
@@ -399,7 +390,6 @@ struct vc4_compile {
         uint32_t num_outputs;
         uint32_t num_texture_samples;
         uint32_t output_position_index;
-        uint32_t output_clipvertex_index;
         uint32_t output_color_index;
         uint32_t output_point_size_index;
 
@@ -457,10 +447,11 @@ bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
 bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst);
 bool qir_is_multi_instruction(struct qinst *inst);
 bool qir_is_mul(struct qinst *inst);
+bool qir_is_raw_mov(struct qinst *inst);
 bool qir_is_tex(struct qinst *inst);
+bool qir_is_float_input(struct qinst *inst);
 bool qir_depends_on_flags(struct qinst *inst);
 bool qir_writes_r4(struct qinst *inst);
-bool qir_src_needs_a_file(struct qinst *inst);
 struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg);
 
 void qir_dump(struct vc4_compile *c);
@@ -561,9 +552,16 @@ qir_##name(struct vc4_compile *c, struct qreg dest, struct qreg a)       \
 }
 
 QIR_ALU1(MOV)
+QIR_ALU1(FMOV)
+QIR_ALU1(MMOV)
 QIR_ALU2(FADD)
 QIR_ALU2(FSUB)
 QIR_ALU2(FMUL)
+QIR_ALU2(V8MULD)
+QIR_ALU2(V8MIN)
+QIR_ALU2(V8MAX)
+QIR_ALU2(V8ADDS)
+QIR_ALU2(V8SUBS)
 QIR_ALU2(MUL24)
 QIR_ALU1(SEL_X_0_ZS)
 QIR_ALU1(SEL_X_0_ZC)
@@ -596,11 +594,6 @@ QIR_ALU1(RCP)
 QIR_ALU1(RSQ)
 QIR_ALU1(EXP2)
 QIR_ALU1(LOG2)
-QIR_ALU1(PACK_8888_F)
-QIR_PACK(PACK_8A_F)
-QIR_PACK(PACK_8B_F)
-QIR_PACK(PACK_8C_F)
-QIR_PACK(PACK_8D_F)
 QIR_ALU1(VARY_ADD_C)
 QIR_NODST_2(TEX_S)
 QIR_NODST_2(TEX_T)
@@ -622,41 +615,50 @@ QIR_NODST_1(TLB_STENCIL_SETUP)
 static inline struct qreg
 qir_UNPACK_8_F(struct vc4_compile *c, struct qreg src, int i)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_UNPACK_8A_F + i, t, src, c->undef));
+        struct qreg t = qir_FMOV(c, src);
+        c->defs[t.index]->src[0].pack = QPU_UNPACK_8A + i;
         return t;
 }
 
 static inline struct qreg
 qir_UNPACK_8_I(struct vc4_compile *c, struct qreg src, int i)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_UNPACK_8A_I + i, t, src, c->undef));
+        struct qreg t = qir_MOV(c, src);
+        c->defs[t.index]->src[0].pack = QPU_UNPACK_8A + i;
         return t;
 }
 
 static inline struct qreg
 qir_UNPACK_16_F(struct vc4_compile *c, struct qreg src, int i)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_UNPACK_16A_F + i, t, src, c->undef));
+        struct qreg t = qir_FMOV(c, src);
+        c->defs[t.index]->src[0].pack = QPU_UNPACK_16A + i;
         return t;
 }
 
 static inline struct qreg
 qir_UNPACK_16_I(struct vc4_compile *c, struct qreg src, int i)
 {
-        struct qreg t = qir_get_temp(c);
-        qir_emit(c, qir_inst(QOP_UNPACK_16A_I + i, t, src, c->undef));
+        struct qreg t = qir_MOV(c, src);
+        c->defs[t.index]->src[0].pack = QPU_UNPACK_16A + i;
         return t;
 }
 
-static inline struct qreg
+static inline void
 qir_PACK_8_F(struct vc4_compile *c, struct qreg dest, struct qreg val, int chan)
 {
-        qir_emit(c, qir_inst(QOP_PACK_8A_F + chan, dest, val, c->undef));
+        assert(!dest.pack);
+        dest.pack = QPU_PACK_MUL_8A + chan;
+        qir_emit(c, qir_inst(QOP_MMOV, dest, val, c->undef));
         if (dest.file == QFILE_TEMP)
                 c->defs[dest.index] = NULL;
+}
+
+static inline struct qreg
+qir_PACK_8888_F(struct vc4_compile *c, struct qreg val)
+{
+        struct qreg dest = qir_MMOV(c, val);
+        c->defs[dest.index]->dst.pack = QPU_PACK_MUL_8888;
         return dest;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h
index 0719d2828b5..866ca5c1300 100644
--- a/src/gallium/drivers/vc4/vc4_qpu.h
+++ b/src/gallium/drivers/vc4/vc4_qpu.h
@@ -213,6 +213,9 @@ void
 vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack);
 
 void
+vc4_qpu_disasm_unpack(FILE *out, uint32_t pack);
+
+void
 vc4_qpu_validate(uint64_t *insts, uint32_t num_inst);
 
 #endif /* VC4_QPU_H */
diff --git a/src/gallium/drivers/vc4/vc4_qpu_defines.h b/src/gallium/drivers/vc4/vc4_qpu_defines.h
index eb3dfb33827..626dc3be6be 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_defines.h
+++ b/src/gallium/drivers/vc4/vc4_qpu_defines.h
@@ -200,8 +200,8 @@ enum qpu_pack_a {
 
 enum qpu_unpack {
         QPU_UNPACK_NOP,
-        QPU_UNPACK_16A_TO_F32,
-        QPU_UNPACK_16B_TO_F32,
+        QPU_UNPACK_16A,
+        QPU_UNPACK_16B,
         QPU_UNPACK_8D_REP,
         QPU_UNPACK_8A,
         QPU_UNPACK_8B,
diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
index 0879787ec03..c46fd1a0e3f 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c
@@ -98,8 +98,8 @@ static const char *qpu_pack_mul[] = {
  */
 static const char *qpu_unpack[] = {
         [QPU_UNPACK_NOP] = "",
-        [QPU_UNPACK_16A_TO_F32] = "16a",
-        [QPU_UNPACK_16B_TO_F32] = "16b",
+        [QPU_UNPACK_16A] = "16a",
+        [QPU_UNPACK_16B] = "16b",
         [QPU_UNPACK_8D_REP] = "8d_rep",
         [QPU_UNPACK_8A] = "8a",
         [QPU_UNPACK_8B] = "8b",
@@ -257,6 +257,13 @@ vc4_qpu_disasm_pack_a(FILE *out, uint32_t pack)
         fprintf(out, "%s", DESC(qpu_pack_a, pack));
 }
 
+void
+vc4_qpu_disasm_unpack(FILE *out, uint32_t unpack)
+{
+        if (unpack != QPU_UNPACK_NOP)
+                fprintf(out, ".%s", DESC(qpu_unpack, unpack));
+}
+
 static void
 print_alu_dst(uint64_t inst, bool is_mul)
 {
@@ -315,10 +322,9 @@ print_alu_src(uint64_t inst, uint32_t mux)
                         fprintf(stderr, "%s", DESC(special_read_b, raddr - 32));
         }
 
-        if (unpack != QPU_UNPACK_NOP &&
-            ((mux == QPU_MUX_A && !(inst & QPU_PM)) ||
+        if (((mux == QPU_MUX_A && !(inst & QPU_PM)) ||
              (mux == QPU_MUX_R4 && (inst & QPU_PM)))) {
-                fprintf(stderr, ".%s", DESC(qpu_unpack, unpack));
+                vc4_qpu_disasm_unpack(stderr, unpack);
         }
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index adf3a8b3658..133e1385178 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -101,7 +101,8 @@ swap_file(struct qpu_reg *src)
 static void
 fixup_raddr_conflict(struct vc4_compile *c,
                      struct qpu_reg dst,
-                     struct qpu_reg *src0, struct qpu_reg *src1)
+                     struct qpu_reg *src0, struct qpu_reg *src1,
+                     struct qinst *inst, uint64_t *unpack)
 {
         uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
         uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;
@@ -117,7 +118,21 @@ fixup_raddr_conflict(struct vc4_compile *c,
                 return;
 
         if (mux0 == QPU_MUX_A) {
-                queue(c, qpu_a_MOV(qpu_rb(31), *src0));
+                /* Make sure we use the same type of MOV as the instruction,
+                 * in case of unpacks.
+                 */
+                if (qir_is_float_input(inst))
+                        queue(c, qpu_a_FMAX(qpu_rb(31), *src0, *src0));
+                else
+                        queue(c, qpu_a_MOV(qpu_rb(31), *src0));
+
+                /* If we had an unpack on this A-file source, we need to put
+                 * it into this MOV, not into the later move from regfile B.
+                 */
+                if (inst->src[0].pack) {
+                        *last_inst(c) |= *unpack;
+                        *unpack = 0;
+                }
                 *src0 = qpu_rb(31);
         } else {
                 queue(c, qpu_a_MOV(qpu_ra(31), *src0));
@@ -125,6 +140,27 @@ fixup_raddr_conflict(struct vc4_compile *c,
         }
 }
 
+static void
+set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
+{
+        bool had_pm = *last_inst(c) & QPU_PM;
+        bool had_ws = *last_inst(c) & QPU_WS;
+        uint32_t unpack = QPU_GET_FIELD(*last_inst(c), QPU_UNPACK);
+
+        if (!inst->dst.pack)
+                return;
+
+        *last_inst(c) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);
+
+        if (qir_is_mul(inst)) {
+                assert(!unpack || had_pm);
+                *last_inst(c) |= QPU_PM;
+        } else {
+                assert(!unpack || !had_pm);
+                assert(!had_ws); /* dst must be a-file to pack. */
+        }
+}
+
 void
 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 {
@@ -134,15 +170,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
         uint32_t vpm_read_fifo_count = 0;
         uint32_t vpm_read_offset = 0;
         int last_vpm_read_index = -1;
-        /* Map from the QIR ops enum order to QPU unpack bits. */
-        static const uint32_t unpack_map[] = {
-                QPU_UNPACK_8A,
-                QPU_UNPACK_8B,
-                QPU_UNPACK_8C,
-                QPU_UNPACK_8D,
-                QPU_UNPACK_16A_TO_F32,
-                QPU_UNPACK_16B_TO_F32,
-        };
 
         list_inithead(&c->qpu_inst_list);
 
@@ -203,9 +230,22 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         A(NOT),
 
                         M(FMUL),
+                        M(V8MULD),
+                        M(V8MIN),
+                        M(V8MAX),
+                        M(V8ADDS),
+                        M(V8SUBS),
                         M(MUL24),
+
+                        /* If we replicate src[0] out to src[1], this works
+                         * out the same as a MOV.
+                         */
+                        [QOP_MOV] = { QPU_A_OR },
+                        [QOP_FMOV] = { QPU_A_FMAX },
+                        [QOP_MMOV] = { QPU_M_V8MIN },
                 };
 
+                uint64_t unpack = 0;
                 struct qpu_reg src[4];
                 for (int i = 0; i < qir_get_op_nsrc(qinst->op); i++) {
                         int index = qinst->src[i].index;
@@ -215,6 +255,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 break;
                         case QFILE_TEMP:
                                 src[i] = temp_registers[index];
+                                if (qinst->src[i].pack) {
+                                        assert(!unpack ||
+                                               unpack == qinst->src[i].pack);
+                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
+                                                               QPU_UNPACK);
+                                        if (src[i].mux == QPU_MUX_R4)
+                                                unpack |= QPU_PM;
+                                }
                                 break;
                         case QFILE_UNIF:
                                 src[i] = qpu_unif();
@@ -259,19 +307,11 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 }
 
                 switch (qinst->op) {
-                case QOP_MOV:
-                        /* Skip emitting the MOV if it's a no-op. */
-                        if (dst.mux == QPU_MUX_A || dst.mux == QPU_MUX_B ||
-                            dst.mux != src[0].mux || dst.addr != src[0].addr) {
-                                queue(c, qpu_a_MOV(dst, src[0]));
-                        }
-                        break;
-
                 case QOP_SEL_X_0_ZS:
                 case QOP_SEL_X_0_ZC:
                 case QOP_SEL_X_0_NS:
                 case QOP_SEL_X_0_NC:
-                        queue(c, qpu_a_MOV(dst, src[0]));
+                        queue(c, qpu_a_MOV(dst, src[0]) | unpack);
                         set_last_cond_add(c, qinst->op - QOP_SEL_X_0_ZS +
                                           QPU_COND_ZS);
 
@@ -285,10 +325,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QOP_SEL_X_Y_NS:
                 case QOP_SEL_X_Y_NC:
                         queue(c, qpu_a_MOV(dst, src[0]));
+                        if (qinst->src[0].pack)
+                                *(last_inst(c)) |= unpack;
                         set_last_cond_add(c, qinst->op - QOP_SEL_X_Y_ZS +
                                           QPU_COND_ZS);
 
                         queue(c, qpu_a_MOV(dst, src[1]));
+                        if (qinst->src[1].pack)
+                                *(last_inst(c)) |= unpack;
                         set_last_cond_add(c, ((qinst->op - QOP_SEL_X_Y_ZS) ^
                                               1) + QPU_COND_ZS);
 
@@ -301,19 +345,19 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         switch (qinst->op) {
                         case QOP_RCP:
                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
-                                                   src[0]));
+                                                   src[0]) | unpack);
                                 break;
                         case QOP_RSQ:
                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
-                                                   src[0]));
+                                                   src[0]) | unpack);
                                 break;
                         case QOP_EXP2:
                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
-                                                   src[0]));
+                                                   src[0]) | unpack);
                                 break;
                         case QOP_LOG2:
                                 queue(c, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
-                                                   src[0]));
+                                                   src[0]) | unpack);
                                 break;
                         default:
                                 abort();
@@ -324,25 +368,6 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                         break;
 
-                case QOP_PACK_8888_F:
-                        queue(c, qpu_m_MOV(dst, src[0]));
-                        *last_inst(c) |= QPU_PM;
-                        *last_inst(c) |= QPU_SET_FIELD(QPU_PACK_MUL_8888,
-                                                       QPU_PACK);
-                        break;
-
-                case QOP_PACK_8A_F:
-                case QOP_PACK_8B_F:
-                case QOP_PACK_8C_F:
-                case QOP_PACK_8D_F:
-                        queue(c,
-                              qpu_m_MOV(dst, src[0]) |
-                              QPU_PM |
-                              QPU_SET_FIELD(QPU_PACK_MUL_8A +
-                                            qinst->op - QOP_PACK_8A_F,
-                                            QPU_PACK));
-                        break;
-
                 case QOP_FRAG_X:
                         queue(c, qpu_a_ITOF(dst,
                                             qpu_ra(QPU_R_XY_PIXEL_COORD)));
@@ -367,16 +392,19 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 
                 case QOP_TLB_DISCARD_SETUP:
                         discard = true;
-                        queue(c, qpu_a_MOV(src[0], src[0]));
+                        queue(c, qpu_a_MOV(src[0], src[0]) | unpack);
                         *last_inst(c) |= QPU_SF;
                         break;
 
                 case QOP_TLB_STENCIL_SETUP:
-                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), src[0]));
+                        assert(!unpack);
+                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
+                                           src[0]) | unpack);
                         break;
 
                 case QOP_TLB_Z_WRITE:
-                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), src[0]));
+                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
+                                           src[0]) | unpack);
                         if (discard) {
                                 set_last_cond_add(c, QPU_COND_ZS);
                         }
@@ -392,14 +420,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         break;
 
                 case QOP_TLB_COLOR_WRITE:
-                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]));
+                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack);
                         if (discard) {
                                 set_last_cond_add(c, QPU_COND_ZS);
                         }
                         break;
 
                 case QOP_VARY_ADD_C:
-                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()));
+                        queue(c, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                         break;
 
                 case QOP_TEX_S:
@@ -408,12 +436,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QOP_TEX_B:
                         queue(c, qpu_a_MOV(qpu_rb(QPU_W_TMU0_S +
                                                   (qinst->op - QOP_TEX_S)),
-                                           src[0]));
+                                           src[0]) | unpack);
                         break;
 
                 case QOP_TEX_DIRECT:
-                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
-                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
+                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
+                                             qinst, &unpack);
+                        queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S),
+                                           src[0], src[1]) | unpack);
                         break;
 
                 case QOP_TEX_RESULT:
@@ -424,67 +454,16 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 queue(c, qpu_a_MOV(dst, qpu_r4()));
                         break;
 
-                case QOP_UNPACK_8A_F:
-                case QOP_UNPACK_8B_F:
-                case QOP_UNPACK_8C_F:
-                case QOP_UNPACK_8D_F:
-                case QOP_UNPACK_16A_F:
-                case QOP_UNPACK_16B_F: {
-                        if (src[0].mux == QPU_MUX_R4) {
-                                queue(c, qpu_a_MOV(dst, src[0]));
-                                *last_inst(c) |= QPU_PM;
-                                *last_inst(c) |= QPU_SET_FIELD(QPU_UNPACK_8A +
-                                                               (qinst->op -
-                                                                QOP_UNPACK_8A_F),
-                                                               QPU_UNPACK);
-                        } else {
-                                assert(src[0].mux == QPU_MUX_A);
-
-                                /* Since we're setting the pack bits, if the
-                                 * destination is in A it would get re-packed.
-                                 */
-                                queue(c, qpu_a_FMAX((dst.mux == QPU_MUX_A ?
-                                                     qpu_rb(31) : dst),
-                                                    src[0], src[0]));
-                                *last_inst(c) |=
-                                        QPU_SET_FIELD(unpack_map[qinst->op -
-                                                                 QOP_UNPACK_8A_F],
-                                                      QPU_UNPACK);
-
-                                if (dst.mux == QPU_MUX_A) {
-                                        queue(c, qpu_a_MOV(dst, qpu_rb(31)));
-                                }
-                        }
-                }
-                        break;
-
-                case QOP_UNPACK_8A_I:
-                case QOP_UNPACK_8B_I:
-                case QOP_UNPACK_8C_I:
-                case QOP_UNPACK_8D_I:
-                case QOP_UNPACK_16A_I:
-                case QOP_UNPACK_16B_I: {
-                        assert(src[0].mux == QPU_MUX_A);
-
-                        /* Since we're setting the pack bits, if the
-                         * destination is in A it would get re-packed.
-                         */
-                        queue(c, qpu_a_MOV((dst.mux == QPU_MUX_A ?
-                                            qpu_rb(31) : dst), src[0]));
-                        *last_inst(c) |= QPU_SET_FIELD(unpack_map[qinst->op -
-                                                                  QOP_UNPACK_8A_I],
-                                                       QPU_UNPACK);
-
-                        if (dst.mux == QPU_MUX_A) {
-                                queue(c, qpu_a_MOV(dst, qpu_rb(31)));
-                        }
-                }
-                        break;
-
                 default:
                         assert(qinst->op < ARRAY_SIZE(translate));
                         assert(translate[qinst->op].op != 0); /* NOPs */
 
+                        /* Skip emitting the MOV if it's a no-op. */
+                        if (qir_is_raw_mov(qinst) &&
+                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
+                                break;
+                        }
+
                         /* If we have only one source, put it in the second
                          * argument slot as well so that we don't take up
                          * another raddr just to get unused data.
@@ -492,27 +471,19 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         if (qir_get_op_nsrc(qinst->op) == 1)
                                 src[1] = src[0];
 
-                        fixup_raddr_conflict(c, dst, &src[0], &src[1]);
+                        fixup_raddr_conflict(c, dst, &src[0], &src[1],
+                                             qinst, &unpack);
 
                         if (qir_is_mul(qinst)) {
                                 queue(c, qpu_m_alu2(translate[qinst->op].op,
                                                     dst,
-                                                    src[0], src[1]));
-                                if (qinst->dst.pack) {
-                                        *last_inst(c) |= QPU_PM;
-                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
-                                                                       QPU_PACK);
-                                }
+                                                    src[0], src[1]) | unpack);
                         } else {
                                 queue(c, qpu_a_alu2(translate[qinst->op].op,
                                                     dst,
-                                                    src[0], src[1]));
-                                if (qinst->dst.pack) {
-                                        assert(dst.mux == QPU_MUX_A);
-                                        *last_inst(c) |= QPU_SET_FIELD(qinst->dst.pack,
-                                                                       QPU_PACK);
-                                }
+                                                    src[0], src[1]) | unpack);
                         }
+                        set_last_dst_pack(c, qinst);
 
                         break;
                 }
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 3ced50f3a44..bca36c3e7f4 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -282,23 +282,23 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
                         class_bits[inst->dst.index] &= CLASS_BIT_A;
                 }
 
-                if (qir_src_needs_a_file(inst)) {
-                        switch (inst->op) {
-                        case QOP_UNPACK_8A_F:
-                        case QOP_UNPACK_8B_F:
-                        case QOP_UNPACK_8C_F:
-                        case QOP_UNPACK_8D_F:
-                                /* Special case: these can be done as R4
-                                 * unpacks, as well.
-                                 */
-                                class_bits[inst->src[0].index] &= (CLASS_BIT_A |
-                                                                   CLASS_BIT_R4);
-                                break;
-                        default:
-                                class_bits[inst->src[0].index] &= CLASS_BIT_A;
-                                break;
+                /* Apply restrictions for src unpacks.  The integer unpacks
+                 * can only be done from regfile A, while float unpacks can be
+                 * either A or R4.
+                 */
+                for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
+                        if (inst->src[i].file == QFILE_TEMP &&
+                            inst->src[i].pack) {
+                                if (qir_is_float_input(inst)) {
+                                        class_bits[inst->src[i].index] &=
+                                                CLASS_BIT_A | CLASS_BIT_R4;
+                                } else {
+                                        class_bits[inst->src[i].index] &=
+                                                CLASS_BIT_A;
+                                }
                         }
                 }
+
                 ip++;
         }
 
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 5d5166fd818..122bda0bac6 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -667,11 +667,16 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx,
                        shadow_offset, &shadow_rsc, &data);
         uint16_t *dst = data;
 
-        struct pipe_transfer *src_transfer;
-        uint32_t *src = pipe_buffer_map_range(pctx, &orig->base.b,
-                                              ib->offset,
-                                              count * 4,
-                                              PIPE_TRANSFER_READ, &src_transfer);
+        struct pipe_transfer *src_transfer = NULL;
+        uint32_t *src;
+        if (ib->user_buffer) {
+                src = ib->user_buffer;
+        } else {
+                src = pipe_buffer_map_range(pctx, &orig->base.b,
+                                            ib->offset,
+                                            count * 4,
+                                            PIPE_TRANSFER_READ, &src_transfer);
+        }
 
         for (int i = 0; i < count; i++) {
                 uint32_t src_index = src[i];
@@ -679,7 +684,8 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx,
                 dst[i] = src_index;
         }
 
-        pctx->transfer_unmap(pctx, src_transfer);
+        if (src_transfer)
+                pctx->transfer_unmap(pctx, src_transfer);
 
         return shadow_rsc;
 }
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 774ec095652..bb867611804 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -94,6 +94,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TEXTURE_SHADOW_MAP:
         case PIPE_CAP_BLEND_EQUATION_SEPARATE:
         case PIPE_CAP_TWO_SIDED_STENCIL:
+        case PIPE_CAP_USER_INDEX_BUFFERS:
                 return 1;
 
                 /* lying for GL 2.0 */
@@ -152,7 +153,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
         case PIPE_CAP_VERTEX_COLOR_CLAMPED:
         case PIPE_CAP_USER_VERTEX_BUFFERS:
-        case PIPE_CAP_USER_INDEX_BUFFERS:
         case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
         case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
         case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
@@ -183,6 +183,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TGSI_TXQS:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
+	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 76980ca32af..10dabd09f5e 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -32,6 +32,11 @@
 #include "vc4_simulator_validate.h"
 #include "simpenrose/simpenrose.h"
 
+/* A marker placed just after each BO, then checked after rendering to make
+ * sure it's still there.
+ */
+#define BO_SENTINEL		0xfedcba98
+
 #define OVERFLOW_SIZE (32 * 1024 * 1024)
 
 static struct drm_gem_cma_object *
@@ -49,10 +54,12 @@ vc4_wrap_bo_with_cma(struct drm_device *dev, struct vc4_bo *bo)
         obj->vaddr = screen->simulator_mem_base + dev->simulator_mem_next;
         obj->paddr = simpenrose_hw_addr(obj->vaddr);
 
-        dev->simulator_mem_next += size;
+        dev->simulator_mem_next += size + sizeof(uint32_t);
         dev->simulator_mem_next = align(dev->simulator_mem_next, 4096);
         assert(dev->simulator_mem_next <= screen->simulator_mem_size);
 
+        *(uint32_t *)(obj->vaddr + bo->size) = BO_SENTINEL;
+
         return obj;
 }
 
@@ -109,6 +116,7 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
                 struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base);
                 struct vc4_bo *bo = drm_bo->bo;
 
+                assert(*(uint32_t *)(obj->vaddr + bo->size) == BO_SENTINEL);
                 memcpy(bo->map, obj->vaddr, bo->size);
 
                 if (drm_bo->validated_shader) {
@@ -197,6 +205,8 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
         list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec.unref_list,
                                  unref_head) {
 		list_del(&bo->unref_head);
+                assert(*(uint32_t *)(bo->base.vaddr + bo->bo->size) ==
+                       BO_SENTINEL);
                 vc4_bo_unreference(&bo->bo);
                 free(bo);
         }
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 8a759c2ca4c..78aa344ab1d 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -51,7 +51,9 @@ vc4_set_blend_color(struct pipe_context *pctx,
                     const struct pipe_blend_color *blend_color)
 {
         struct vc4_context *vc4 = vc4_context(pctx);
-        vc4->blend_color = *blend_color;
+        vc4->blend_color.f = *blend_color;
+        for (int i = 0; i < 4; i++)
+                vc4->blend_color.ub[i] = float_to_ubyte(blend_color->color[i]);
         vc4->dirty |= VC4_DIRTY_BLEND_COLOR;
 }
 
@@ -303,10 +305,10 @@ vc4_set_index_buffer(struct pipe_context *pctx,
         struct vc4_context *vc4 = vc4_context(pctx);
 
         if (ib) {
-                assert(!ib->user_buffer);
                 pipe_resource_reference(&vc4->indexbuf.buffer, ib->buffer);
                 vc4->indexbuf.index_size = ib->index_size;
                 vc4->indexbuf.offset = ib->offset;
+                vc4->indexbuf.user_buffer = ib->user_buffer;
         } else {
                 pipe_resource_reference(&vc4->indexbuf.buffer, NULL);
         }
diff --git a/src/gallium/drivers/vc4/vc4_uniforms.c b/src/gallium/drivers/vc4/vc4_uniforms.c
index 85d6998205e..f5ad481f186 100644
--- a/src/gallium/drivers/vc4/vc4_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_uniforms.c
@@ -262,11 +262,35 @@ vc4_write_uniforms(struct vc4_context *vc4, struct vc4_compiled_shader *shader,
                 case QUNIFORM_BLEND_CONST_COLOR_Z:
                 case QUNIFORM_BLEND_CONST_COLOR_W:
                         cl_aligned_f(&uniforms,
-                                     CLAMP(vc4->blend_color.color[uinfo->contents[i] -
-                                                                  QUNIFORM_BLEND_CONST_COLOR_X],
+                                     CLAMP(vc4->blend_color.f.color[uinfo->contents[i] -
+                                                                    QUNIFORM_BLEND_CONST_COLOR_X],
                                            0, 1));
                         break;
 
+                case QUNIFORM_BLEND_CONST_COLOR_RGBA: {
+                        const uint8_t *format_swiz =
+                                vc4_get_format_swizzle(vc4->framebuffer.cbufs[0]->format);
+                        uint32_t color = 0;
+                        for (int i = 0; i < 4; i++) {
+                                if (format_swiz[i] >= 4)
+                                        continue;
+
+                                color |= (vc4->blend_color.ub[format_swiz[i]] <<
+                                          (i * 8));
+                        }
+                        cl_aligned_u32(&uniforms, color);
+                        break;
+                }
+
+                case QUNIFORM_BLEND_CONST_COLOR_AAAA: {
+                        uint8_t a = vc4->blend_color.ub[3];
+                        cl_aligned_u32(&uniforms, ((a) |
+                                                   (a << 8) |
+                                                   (a << 16) |
+                                                   (a << 24)));
+                        break;
+                }
+
                 case QUNIFORM_STENCIL:
                         cl_aligned_u32(&uniforms,
                                        vc4->zsa->stencil_uniforms[uinfo->data[i]] |
@@ -330,6 +354,8 @@ vc4_set_shader_uniform_dirty_flags(struct vc4_compiled_shader *shader)
                 case QUNIFORM_BLEND_CONST_COLOR_Y:
                 case QUNIFORM_BLEND_CONST_COLOR_Z:
                 case QUNIFORM_BLEND_CONST_COLOR_W:
+                case QUNIFORM_BLEND_CONST_COLOR_RGBA:
+                case QUNIFORM_BLEND_CONST_COLOR_AAAA:
                         dirty |= VC4_DIRTY_BLEND_COLOR;
                         break;
 
diff --git a/src/gallium/drivers/virgl/Automake.inc b/src/gallium/drivers/virgl/Automake.inc
new file mode 100644
index 00000000000..b05d3e314c8
--- /dev/null
+++ b/src/gallium/drivers/virgl/Automake.inc
@@ -0,0 +1,11 @@
+if HAVE_GALLIUM_VIRGL
+
+TARGET_DRIVERS += virtio_gpu
+TARGET_CPPFLAGS += -DGALLIUM_VIRGL
+TARGET_LIB_DEPS += \
+	$(top_builddir)/src/gallium/drivers/virgl/libvirgl.la \
+	$(top_builddir)/src/gallium/winsys/virgl/drm/libvirgldrm.la \
+	$(top_builddir)/src/gallium/winsys/virgl/vtest/libvirglvtest.la \
+	$(LIBDRM_LIBS)
+
+endif
diff --git a/src/gallium/drivers/virgl/Makefile.am b/src/gallium/drivers/virgl/Makefile.am
new file mode 100644
index 00000000000..82d9756143f
--- /dev/null
+++ b/src/gallium/drivers/virgl/Makefile.am
@@ -0,0 +1,32 @@
+# Copyright © 2014, 2015 Red Hat.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+include Makefile.sources
+include $(top_srcdir)/src/gallium/Automake.inc
+
+AM_CPPFLAGS = \
+	$(GALLIUM_DRIVER_CFLAGS) \
+	$(LIBDRM_CFLAGS)
+
+noinst_LTLIBRARIES = libvirgl.la
+
+libvirgl_la_SOURCES = $(C_SOURCES)
diff --git a/src/gallium/drivers/virgl/Makefile.sources b/src/gallium/drivers/virgl/Makefile.sources
new file mode 100644
index 00000000000..c27d284e248
--- /dev/null
+++ b/src/gallium/drivers/virgl/Makefile.sources
@@ -0,0 +1,18 @@
+C_SOURCES := \
+	virgl_buffer.c \
+	virgl_context.c \
+	virgl_context.h \
+	virgl_encode.c \
+	virgl_encode.h \
+	virgl_hw.h \
+	virgl_protocol.h \
+	virgl_public.h \
+	virgl_query.c \
+	virgl_resource.c \
+	virgl_resource.h \
+	virgl_screen.c \
+	virgl_screen.h \
+	virgl_streamout.c \
+	virgl_texture.c \
+	virgl_tgsi.c \
+	virgl_winsys.h
diff --git a/src/gallium/drivers/virgl/virgl_buffer.c b/src/gallium/drivers/virgl/virgl_buffer.c
new file mode 100644
index 00000000000..ce19fb949d0
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_buffer.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "virgl_context.h"
+#include "virgl_resource.h"
+#include "virgl_screen.h"
+
+static void virgl_buffer_destroy(struct pipe_screen *screen,
+                                 struct pipe_resource *buf)
+{
+   struct virgl_screen *vs = virgl_screen(screen);
+   struct virgl_buffer *vbuf = virgl_buffer(buf);
+
+   util_range_destroy(&vbuf->valid_buffer_range);
+   vs->vws->resource_unref(vs->vws, vbuf->base.hw_res);
+   FREE(vbuf);
+}
+
+static void *virgl_buffer_transfer_map(struct pipe_context *ctx,
+                                       struct pipe_resource *resource,
+                                       unsigned level,
+                                       unsigned usage,
+                                       const struct pipe_box *box,
+                                       struct pipe_transfer **transfer)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *vs = virgl_screen(ctx->screen);
+   struct virgl_buffer *vbuf = virgl_buffer(resource);
+   struct virgl_transfer *trans;
+   void *ptr;
+   bool readback;
+   uint32_t offset;
+   bool doflushwait = false;
+
+   if ((usage & PIPE_TRANSFER_READ) && (vbuf->on_list == TRUE))
+      doflushwait = true;
+   else
+      doflushwait = virgl_res_needs_flush_wait(vctx, &vbuf->base, usage);
+
+   if (doflushwait)
+      ctx->flush(ctx, NULL, 0);
+
+   trans = util_slab_alloc(&vctx->texture_transfer_pool);
+   if (trans == NULL)
+      return NULL;
+
+   trans->base.resource = resource;
+   trans->base.level = level;
+   trans->base.usage = usage;
+   trans->base.box = *box;
+   trans->base.stride = 0;
+   trans->base.layer_stride = 0;
+
+   offset = box->x;
+
+   readback = virgl_res_needs_readback(vctx, &vbuf->base, usage);
+   if (readback)
+      vs->vws->transfer_get(vs->vws, vbuf->base.hw_res, box, trans->base.stride, trans->base.layer_stride, offset, level);
+
+   if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED))
+      doflushwait = true;
+
+   if (doflushwait || readback)
+      vs->vws->resource_wait(vs->vws, vbuf->base.hw_res);
+
+   ptr = vs->vws->resource_map(vs->vws, vbuf->base.hw_res);
+   if (!ptr) {
+      return NULL;
+   }
+
+   trans->offset = offset;
+   *transfer = &trans->base;
+
+   return ptr + trans->offset;
+}
+
+static void virgl_buffer_transfer_unmap(struct pipe_context *ctx,
+                                        struct pipe_transfer *transfer)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_transfer *trans = virgl_transfer(transfer);
+   struct virgl_buffer *vbuf = virgl_buffer(transfer->resource);
+
+   if (trans->base.usage & PIPE_TRANSFER_WRITE) {
+      if (!(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) {
+         struct virgl_screen *vs = virgl_screen(ctx->screen);
+         vbuf->base.clean = FALSE;
+         vctx->num_transfers++;
+         vs->vws->transfer_put(vs->vws, vbuf->base.hw_res,
+                               &transfer->box, trans->base.stride, trans->base.layer_stride, trans->offset, transfer->level);
+
+      }
+   }
+
+   util_slab_free(&vctx->texture_transfer_pool, trans);
+}
+
+static void virgl_buffer_transfer_flush_region(struct pipe_context *ctx,
+                                               struct pipe_transfer *transfer,
+                                               const struct pipe_box *box)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_buffer *vbuf = virgl_buffer(transfer->resource);
+
+   if (!vbuf->on_list) {
+       struct pipe_resource *res = NULL;
+
+       list_addtail(&vbuf->flush_list, &vctx->to_flush_bufs);
+       vbuf->on_list = TRUE;
+       pipe_resource_reference(&res, &vbuf->base.u.b);
+   }
+
+   util_range_add(&vbuf->valid_buffer_range, transfer->box.x + box->x,
+                  transfer->box.x + box->x + box->width);
+
+   vbuf->base.clean = FALSE;
+}
+
+static const struct u_resource_vtbl virgl_buffer_vtbl =
+{
+   u_default_resource_get_handle,            /* get_handle */
+   virgl_buffer_destroy,                     /* resource_destroy */
+   virgl_buffer_transfer_map,                /* transfer_map */
+   virgl_buffer_transfer_flush_region,       /* transfer_flush_region */
+   virgl_buffer_transfer_unmap,              /* transfer_unmap */
+   virgl_transfer_inline_write               /* transfer_inline_write */
+};
+
+struct pipe_resource *virgl_buffer_create(struct virgl_screen *vs,
+                                          const struct pipe_resource *template)
+{
+   struct virgl_buffer *buf;
+   uint32_t size;
+   uint32_t vbind;
+   buf = CALLOC_STRUCT(virgl_buffer);
+   buf->base.clean = TRUE;
+   buf->base.u.b = *template;
+   buf->base.u.b.screen = &vs->base;
+   buf->base.u.vtbl = &virgl_buffer_vtbl;
+   pipe_reference_init(&buf->base.u.b.reference, 1);
+   util_range_init(&buf->valid_buffer_range);
+
+   vbind = pipe_to_virgl_bind(template->bind);
+   size = template->width0;
+
+   buf->base.hw_res = vs->vws->resource_create(vs->vws, template->target, template->format, vbind, template->width0, 1, 1, 1, 0, 0, size);
+
+   util_range_set_empty(&buf->valid_buffer_range);
+   return &buf->base.u.b;
+}
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
new file mode 100644
index 00000000000..e4f02ba1096
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -0,0 +1,963 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "pipe/p_shader_tokens.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_transfer.h"
+#include "util/u_helpers.h"
+#include "util/u_slab.h"
+#include "util/u_upload_mgr.h"
+#include "util/u_blitter.h"
+#include "tgsi/tgsi_text.h"
+#include "indices/u_primconvert.h"
+
+#include "pipebuffer/pb_buffer.h"
+#include "state_tracker/graw.h"
+
+#include "virgl_encode.h"
+#include "virgl_context.h"
+#include "virgl_protocol.h"
+#include "virgl_resource.h"
+#include "virgl_screen.h"
+
+static uint32_t next_handle;
+uint32_t virgl_object_assign_handle(void)
+{
+   return ++next_handle;
+}
+
+static void virgl_buffer_flush(struct virgl_context *vctx,
+                              struct virgl_buffer *vbuf)
+{
+   struct virgl_screen *rs = virgl_screen(vctx->base.screen);
+   struct pipe_box box;
+
+   assert(vbuf->on_list);
+
+   box.height = 1;
+   box.depth = 1;
+   box.y = 0;
+   box.z = 0;
+
+   box.x = vbuf->valid_buffer_range.start;
+   box.width = MIN2(vbuf->valid_buffer_range.end - vbuf->valid_buffer_range.start, vbuf->base.u.b.width0);
+
+   vctx->num_transfers++;
+   rs->vws->transfer_put(rs->vws, vbuf->base.hw_res,
+                         &box, 0, 0, box.x, 0);
+
+   util_range_set_empty(&vbuf->valid_buffer_range);
+}
+
+static void virgl_attach_res_framebuffer(struct virgl_context *vctx)
+{
+   struct virgl_winsys *vws = virgl_screen(vctx->base.screen)->vws;
+   struct pipe_surface *surf;
+   struct virgl_resource *res;
+   unsigned i;
+
+   surf = vctx->framebuffer.zsbuf;
+   if (surf) {
+      res = virgl_resource(surf->texture);
+      if (res)
+         vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
+   }
+   for (i = 0; i < vctx->framebuffer.nr_cbufs; i++) {
+      surf = vctx->framebuffer.cbufs[i];
+      if (surf) {
+         res = virgl_resource(surf->texture);
+         if (res)
+            vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
+      }
+   }
+}
+
+static void virgl_attach_res_sampler_views(struct virgl_context *vctx,
+                                           unsigned shader_type)
+{
+   struct virgl_winsys *vws = virgl_screen(vctx->base.screen)->vws;
+   struct virgl_textures_info *tinfo = &vctx->samplers[shader_type];
+   struct virgl_resource *res;
+   uint32_t remaining_mask = tinfo->enabled_mask;
+   unsigned i;
+   while (remaining_mask) {
+      i = u_bit_scan(&remaining_mask);
+      assert(tinfo->views[i]);
+
+      res = virgl_resource(tinfo->views[i]->base.texture);
+      if (res)
+         vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
+   }
+}
+
+static void virgl_attach_res_vertex_buffers(struct virgl_context *vctx)
+{
+   struct virgl_winsys *vws = virgl_screen(vctx->base.screen)->vws;
+   struct virgl_resource *res;
+   unsigned i;
+
+   for (i = 0; i < vctx->num_vertex_buffers; i++) {
+      res = virgl_resource(vctx->vertex_buffer[i].buffer);
+      if (res)
+         vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
+   }
+}
+
+static void virgl_attach_res_index_buffer(struct virgl_context *vctx)
+{
+   struct virgl_winsys *vws = virgl_screen(vctx->base.screen)->vws;
+   struct virgl_resource *res;
+
+   res = virgl_resource(vctx->index_buffer.buffer);
+   if (res)
+      vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
+}
+
+static void virgl_attach_res_so_targets(struct virgl_context *vctx)
+{
+   struct virgl_winsys *vws = virgl_screen(vctx->base.screen)->vws;
+   struct virgl_resource *res;
+   unsigned i;
+
+   for (i = 0; i < vctx->num_so_targets; i++) {
+      res = virgl_resource(vctx->so_targets[i].base.buffer);
+      if (res)
+         vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
+   }
+}
+
+static void virgl_attach_res_uniform_buffers(struct virgl_context *vctx,
+                                             unsigned shader_type)
+{
+   struct virgl_winsys *vws = virgl_screen(vctx->base.screen)->vws;
+   struct virgl_resource *res;
+   unsigned i;
+   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
+      res = virgl_resource(vctx->ubos[shader_type][i]);
+      if (res) {
+         vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
+      }
+   }
+}
+
+/*
+ * after flushing, the hw context still has a bunch of
+ * resources bound, so we need to rebind those here.
+ */
+static void virgl_reemit_res(struct virgl_context *vctx)
+{
+   unsigned shader_type;
+
+   /* reattach any flushed resources */
+   /* framebuffer, sampler views, vertex/index/uniform/stream buffers */
+   virgl_attach_res_framebuffer(vctx);
+
+   for (shader_type = 0; shader_type < PIPE_SHADER_TYPES; shader_type++) {
+      virgl_attach_res_sampler_views(vctx, shader_type);
+      virgl_attach_res_uniform_buffers(vctx, shader_type);
+   }
+   virgl_attach_res_index_buffer(vctx);
+   virgl_attach_res_vertex_buffers(vctx);
+   virgl_attach_res_so_targets(vctx);
+}
+
+static struct pipe_surface *virgl_create_surface(struct pipe_context *ctx,
+                                                struct pipe_resource *resource,
+                                                const struct pipe_surface *templ)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_surface *surf;
+   struct virgl_resource *res = virgl_resource(resource);
+   uint32_t handle;
+
+   surf = CALLOC_STRUCT(virgl_surface);
+   if (surf == NULL)
+      return NULL;
+
+   res->clean = FALSE;
+   handle = virgl_object_assign_handle();
+   pipe_reference_init(&surf->base.reference, 1);
+   pipe_resource_reference(&surf->base.texture, resource);
+   surf->base.context = ctx;
+   surf->base.format = templ->format;
+   if (resource->target != PIPE_BUFFER) {
+      surf->base.width = u_minify(resource->width0, templ->u.tex.level);
+      surf->base.height = u_minify(resource->height0, templ->u.tex.level);
+      surf->base.u.tex.level = templ->u.tex.level;
+      surf->base.u.tex.first_layer = templ->u.tex.first_layer;
+      surf->base.u.tex.last_layer = templ->u.tex.last_layer;
+   } else {
+      surf->base.width = templ->u.buf.last_element - templ->u.buf.first_element + 1;
+      surf->base.height = resource->height0;
+      surf->base.u.buf.first_element = templ->u.buf.first_element;
+      surf->base.u.buf.last_element = templ->u.buf.last_element;
+   }
+   virgl_encoder_create_surface(vctx, handle, res, &surf->base);
+   surf->handle = handle;
+   return &surf->base;
+}
+
+static void virgl_surface_destroy(struct pipe_context *ctx,
+                                 struct pipe_surface *psurf)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_surface *surf = virgl_surface(psurf);
+
+   pipe_resource_reference(&surf->base.texture, NULL);
+   virgl_encode_delete_object(vctx, surf->handle, VIRGL_OBJECT_SURFACE);
+   FREE(surf);
+}
+
+static void *virgl_create_blend_state(struct pipe_context *ctx,
+                                              const struct pipe_blend_state *blend_state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle;
+   handle = virgl_object_assign_handle();
+
+   virgl_encode_blend_state(vctx, handle, blend_state);
+   return (void *)(unsigned long)handle;
+
+}
+
+static void virgl_bind_blend_state(struct pipe_context *ctx,
+                                           void *blend_state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = (unsigned long)blend_state;
+   virgl_encode_bind_object(vctx, handle, VIRGL_OBJECT_BLEND);
+}
+
+static void virgl_delete_blend_state(struct pipe_context *ctx,
+                                     void *blend_state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = (unsigned long)blend_state;
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_BLEND);
+}
+
+static void *virgl_create_depth_stencil_alpha_state(struct pipe_context *ctx,
+                                                   const struct pipe_depth_stencil_alpha_state *blend_state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle;
+   handle = virgl_object_assign_handle();
+
+   virgl_encode_dsa_state(vctx, handle, blend_state);
+   return (void *)(unsigned long)handle;
+}
+
+static void virgl_bind_depth_stencil_alpha_state(struct pipe_context *ctx,
+                                                void *blend_state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = (unsigned long)blend_state;
+   virgl_encode_bind_object(vctx, handle, VIRGL_OBJECT_DSA);
+}
+
+static void virgl_delete_depth_stencil_alpha_state(struct pipe_context *ctx,
+                                                  void *dsa_state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = (unsigned long)dsa_state;
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_DSA);
+}
+
+static void *virgl_create_rasterizer_state(struct pipe_context *ctx,
+                                                   const struct pipe_rasterizer_state *rs_state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle;
+   handle = virgl_object_assign_handle();
+
+   virgl_encode_rasterizer_state(vctx, handle, rs_state);
+   return (void *)(unsigned long)handle;
+}
+
+static void virgl_bind_rasterizer_state(struct pipe_context *ctx,
+                                                void *rs_state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = (unsigned long)rs_state;
+
+   virgl_encode_bind_object(vctx, handle, VIRGL_OBJECT_RASTERIZER);
+}
+
+static void virgl_delete_rasterizer_state(struct pipe_context *ctx,
+                                         void *rs_state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = (unsigned long)rs_state;
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_RASTERIZER);
+}
+
+static void virgl_set_framebuffer_state(struct pipe_context *ctx,
+                                                const struct pipe_framebuffer_state *state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   vctx->framebuffer = *state;
+   virgl_encoder_set_framebuffer_state(vctx, state);
+   virgl_attach_res_framebuffer(vctx);
+}
+
+static void virgl_set_viewport_states(struct pipe_context *ctx,
+                                     unsigned start_slot,
+                                     unsigned num_viewports,
+                                     const struct pipe_viewport_state *state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   virgl_encoder_set_viewport_states(vctx, start_slot, num_viewports, state);
+}
+
+static void *virgl_create_vertex_elements_state(struct pipe_context *ctx,
+                                                        unsigned num_elements,
+                                                        const struct pipe_vertex_element *elements)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = virgl_object_assign_handle();
+   virgl_encoder_create_vertex_elements(vctx, handle,
+                                       num_elements, elements);
+   return (void*)(unsigned long)handle;
+
+}
+
+static void virgl_delete_vertex_elements_state(struct pipe_context *ctx,
+                                              void *ve)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = (unsigned long)ve;
+
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_VERTEX_ELEMENTS);
+}
+
+static void virgl_bind_vertex_elements_state(struct pipe_context *ctx,
+                                                     void *ve)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = (unsigned long)ve;
+   virgl_encode_bind_object(vctx, handle, VIRGL_OBJECT_VERTEX_ELEMENTS);
+}
+
+static void virgl_set_vertex_buffers(struct pipe_context *ctx,
+                                    unsigned start_slot,
+                                    unsigned num_buffers,
+                                    const struct pipe_vertex_buffer *buffers)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   util_set_vertex_buffers_count(vctx->vertex_buffer,
+                                 &vctx->num_vertex_buffers,
+                                 buffers, start_slot, num_buffers);
+
+   vctx->vertex_array_dirty = TRUE;
+}
+
+static void virgl_hw_set_vertex_buffers(struct pipe_context *ctx)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   if (vctx->vertex_array_dirty) {
+      virgl_encoder_set_vertex_buffers(vctx, vctx->num_vertex_buffers, vctx->vertex_buffer);
+      virgl_attach_res_vertex_buffers(vctx);
+   }
+}
+
+static void virgl_set_stencil_ref(struct pipe_context *ctx,
+                                 const struct pipe_stencil_ref *ref)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   virgl_encoder_set_stencil_ref(vctx, ref);
+}
+
+static void virgl_set_blend_color(struct pipe_context *ctx,
+                                 const struct pipe_blend_color *color)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   virgl_encoder_set_blend_color(vctx, color);
+}
+
+static void virgl_set_index_buffer(struct pipe_context *ctx,
+                                  const struct pipe_index_buffer *ib)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   if (ib) {
+      pipe_resource_reference(&vctx->index_buffer.buffer, ib->buffer);
+      memcpy(&vctx->index_buffer, ib, sizeof(*ib));
+   } else {
+      pipe_resource_reference(&vctx->index_buffer.buffer, NULL);
+   }
+}
+
+static void virgl_hw_set_index_buffer(struct pipe_context *ctx,
+                                     struct pipe_index_buffer *ib)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   virgl_encoder_set_index_buffer(vctx, ib);
+   virgl_attach_res_index_buffer(vctx);
+}
+
+static void virgl_set_constant_buffer(struct pipe_context *ctx,
+                                     uint shader, uint index,
+                                     struct pipe_constant_buffer *buf)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   if (buf) {
+      if (!buf->user_buffer){
+         struct virgl_resource *res = virgl_resource(buf->buffer);
+         virgl_encoder_set_uniform_buffer(vctx, shader, index, buf->buffer_offset,
+                                          buf->buffer_size, res);
+         pipe_resource_reference(&vctx->ubos[shader][index], buf->buffer);
+         return;
+      }
+      pipe_resource_reference(&vctx->ubos[shader][index], NULL);
+      virgl_encoder_write_constant_buffer(vctx, shader, index, buf->buffer_size / 4, buf->user_buffer);
+   } else {
+      virgl_encoder_write_constant_buffer(vctx, shader, index, 0, NULL);
+      pipe_resource_reference(&vctx->ubos[shader][index], NULL);
+   }
+}
+
+void virgl_transfer_inline_write(struct pipe_context *ctx,
+                                struct pipe_resource *res,
+                                unsigned level,
+                                unsigned usage,
+                                const struct pipe_box *box,
+                                const void *data,
+                                unsigned stride,
+                                unsigned layer_stride)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *vs = virgl_screen(ctx->screen);
+   struct virgl_resource *grres = virgl_resource(res);
+   struct virgl_buffer *vbuf = virgl_buffer(res);
+
+   grres->clean = FALSE;
+
+   if (virgl_res_needs_flush_wait(vctx, &vbuf->base, usage)) {
+      ctx->flush(ctx, NULL, 0);
+
+      vs->vws->resource_wait(vs->vws, vbuf->base.hw_res);
+   }
+
+   virgl_encoder_inline_write(vctx, grres, level, usage,
+                              box, data, stride, layer_stride);
+}
+
+static void *virgl_shader_encoder(struct pipe_context *ctx,
+                                  const struct pipe_shader_state *shader,
+                                  unsigned type)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle;
+   struct tgsi_token *new_tokens;
+   int ret;
+
+   new_tokens = virgl_tgsi_transform(shader->tokens);
+   if (!new_tokens)
+      return NULL;
+
+   handle = virgl_object_assign_handle();
+   /* encode VS state */
+   ret = virgl_encode_shader_state(vctx, handle, type,
+                                   &shader->stream_output,
+                                   new_tokens);
+   if (ret) {
+      return NULL;
+   }
+
+   FREE(new_tokens);
+   return (void *)(unsigned long)handle;
+
+}
+static void *virgl_create_vs_state(struct pipe_context *ctx,
+                                   const struct pipe_shader_state *shader)
+{
+   return virgl_shader_encoder(ctx, shader, PIPE_SHADER_VERTEX);
+}
+
+static void *virgl_create_gs_state(struct pipe_context *ctx,
+                                   const struct pipe_shader_state *shader)
+{
+   return virgl_shader_encoder(ctx, shader, PIPE_SHADER_GEOMETRY);
+}
+
+static void *virgl_create_fs_state(struct pipe_context *ctx,
+                                   const struct pipe_shader_state *shader)
+{
+   return virgl_shader_encoder(ctx, shader, PIPE_SHADER_FRAGMENT);
+}
+
+static void
+virgl_delete_fs_state(struct pipe_context *ctx,
+                     void *fs)
+{
+   uint32_t handle = (unsigned long)fs;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_SHADER);
+}
+
+static void
+virgl_delete_gs_state(struct pipe_context *ctx,
+                     void *gs)
+{
+   uint32_t handle = (unsigned long)gs;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_SHADER);
+}
+
+static void
+virgl_delete_vs_state(struct pipe_context *ctx,
+                     void *vs)
+{
+   uint32_t handle = (unsigned long)vs;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_SHADER);
+}
+
+static void virgl_bind_vs_state(struct pipe_context *ctx,
+                                        void *vss)
+{
+   uint32_t handle = (unsigned long)vss;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_bind_shader(vctx, handle, PIPE_SHADER_VERTEX);
+}
+
+static void virgl_bind_gs_state(struct pipe_context *ctx,
+                               void *vss)
+{
+   uint32_t handle = (unsigned long)vss;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_bind_shader(vctx, handle, PIPE_SHADER_GEOMETRY);
+}
+
+
+static void virgl_bind_fs_state(struct pipe_context *ctx,
+                                        void *vss)
+{
+   uint32_t handle = (unsigned long)vss;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_bind_shader(vctx, handle, PIPE_SHADER_FRAGMENT);
+}
+
+static void virgl_clear(struct pipe_context *ctx,
+                                unsigned buffers,
+                                const union pipe_color_union *color,
+                                double depth, unsigned stencil)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_clear(vctx, buffers, color, depth, stencil);
+}
+
+static void virgl_draw_vbo(struct pipe_context *ctx,
+                                   const struct pipe_draw_info *dinfo)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *rs = virgl_screen(ctx->screen);
+   struct pipe_index_buffer ib = {};
+   struct pipe_draw_info info = *dinfo;
+
+   if (!(rs->caps.caps.v1.prim_mask & (1 << dinfo->mode))) {
+      util_primconvert_save_index_buffer(vctx->primconvert, &vctx->index_buffer);
+      util_primconvert_draw_vbo(vctx->primconvert, dinfo);
+      return;
+   }
+   if (info.indexed) {
+           pipe_resource_reference(&ib.buffer, vctx->index_buffer.buffer);
+           ib.user_buffer = vctx->index_buffer.user_buffer;
+           ib.index_size = vctx->index_buffer.index_size;
+           ib.offset = vctx->index_buffer.offset + info.start * ib.index_size;
+
+           if (ib.user_buffer) {
+                   u_upload_data(vctx->uploader, 0, info.count * ib.index_size,
+                                 ib.user_buffer, &ib.offset, &ib.buffer);
+                   ib.user_buffer = NULL;
+           }
+   }
+
+   u_upload_unmap(vctx->uploader);
+
+   vctx->num_draws++;
+   virgl_hw_set_vertex_buffers(ctx);
+   if (info.indexed)
+      virgl_hw_set_index_buffer(ctx, &ib);
+
+   virgl_encoder_draw_vbo(vctx, &info);
+
+   pipe_resource_reference(&ib.buffer, NULL);
+
+}
+
+static void virgl_flush_eq(struct virgl_context *ctx, void *closure)
+{
+   struct virgl_screen *rs = virgl_screen(ctx->base.screen);
+
+   /* send the buffer to the remote side for decoding */
+   ctx->num_transfers = ctx->num_draws = 0;
+   rs->vws->submit_cmd(rs->vws, ctx->cbuf);
+
+   virgl_encoder_set_sub_ctx(ctx, ctx->hw_sub_ctx_id);
+
+   /* add back current framebuffer resources to reference list? */
+   virgl_reemit_res(ctx);
+}
+
+static void virgl_flush_from_st(struct pipe_context *ctx,
+                               struct pipe_fence_handle **fence,
+                               enum pipe_flush_flags flags)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *rs = virgl_screen(ctx->screen);
+   struct virgl_buffer *buf, *tmp;
+
+   if (fence)
+      *fence = rs->vws->cs_create_fence(rs->vws);
+
+   LIST_FOR_EACH_ENTRY_SAFE(buf, tmp, &vctx->to_flush_bufs, flush_list) {
+      struct pipe_resource *res = &buf->base.u.b;
+      virgl_buffer_flush(vctx, buf);
+      list_del(&buf->flush_list);
+      buf->on_list = FALSE;
+      pipe_resource_reference(&res, NULL);
+
+   }
+   virgl_flush_eq(vctx, vctx);
+}
+
+static struct pipe_sampler_view *virgl_create_sampler_view(struct pipe_context *ctx,
+                                      struct pipe_resource *texture,
+                                      const struct pipe_sampler_view *state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_sampler_view *grview;
+   uint32_t handle;
+   struct virgl_resource *res;
+
+   if (state == NULL)
+      return NULL;
+
+   grview = CALLOC_STRUCT(virgl_sampler_view);
+   if (!grview)
+      return NULL;
+
+   res = virgl_resource(texture);
+   handle = virgl_object_assign_handle();
+   virgl_encode_sampler_view(vctx, handle, res, state);
+
+   grview->base = *state;
+   grview->base.reference.count = 1;
+
+   grview->base.texture = NULL;
+   grview->base.context = ctx;
+   pipe_resource_reference(&grview->base.texture, texture);
+   grview->handle = handle;
+   return &grview->base;
+}
+
+static void virgl_set_sampler_views(struct pipe_context *ctx,
+                                   unsigned shader_type,
+                                   unsigned start_slot,
+                                   unsigned num_views,
+                                   struct pipe_sampler_view **views)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   int i;
+   uint32_t disable_mask = ~((1ull << num_views) - 1);
+   struct virgl_textures_info *tinfo = &vctx->samplers[shader_type];
+   uint32_t new_mask = 0;
+   uint32_t remaining_mask;
+
+   remaining_mask = tinfo->enabled_mask & disable_mask;
+
+   while (remaining_mask) {
+      i = u_bit_scan(&remaining_mask);
+      assert(tinfo->views[i]);
+
+      pipe_sampler_view_reference((struct pipe_sampler_view **)&tinfo->views[i], NULL);
+   }
+
+   for (i = 0; i < num_views; i++) {
+      struct virgl_sampler_view *grview = virgl_sampler_view(views[i]);
+
+      if (views[i] == (struct pipe_sampler_view *)tinfo->views[i])
+         continue;
+
+      if (grview) {
+         new_mask |= 1 << i;
+         pipe_sampler_view_reference((struct pipe_sampler_view **)&tinfo->views[i], views[i]);
+      } else {
+         pipe_sampler_view_reference((struct pipe_sampler_view **)&tinfo->views[i], NULL);
+         disable_mask |= 1 << i;
+      }
+   }
+
+   tinfo->enabled_mask &= ~disable_mask;
+   tinfo->enabled_mask |= new_mask;
+   virgl_encode_set_sampler_views(vctx, shader_type, start_slot, num_views, tinfo->views);
+   virgl_attach_res_sampler_views(vctx, shader_type);
+}
+
+static void virgl_destroy_sampler_view(struct pipe_context *ctx,
+                                 struct pipe_sampler_view *view)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_sampler_view *grview = virgl_sampler_view(view);
+
+   virgl_encode_delete_object(vctx, grview->handle, VIRGL_OBJECT_SAMPLER_VIEW);
+   pipe_resource_reference(&view->texture, NULL);
+   FREE(view);
+}
+
+static void *virgl_create_sampler_state(struct pipe_context *ctx,
+                                        const struct pipe_sampler_state *state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle;
+
+   handle = virgl_object_assign_handle();
+
+   virgl_encode_sampler_state(vctx, handle, state);
+   return (void *)(unsigned long)handle;
+}
+
+static void virgl_delete_sampler_state(struct pipe_context *ctx,
+                                      void *ss)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle = (unsigned long)ss;
+
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_SAMPLER_STATE);
+}
+
+static void virgl_bind_sampler_states(struct pipe_context *ctx,
+                                     unsigned shader, unsigned start_slot,
+                                     unsigned num_samplers,
+                                     void **samplers)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handles[32];
+   int i;
+   for (i = 0; i < num_samplers; i++) {
+      handles[i] = (unsigned long)(samplers[i]);
+   }
+   virgl_encode_bind_sampler_states(vctx, shader, start_slot, num_samplers, handles);
+}
+
+static void virgl_set_polygon_stipple(struct pipe_context *ctx,
+                                     const struct pipe_poly_stipple *ps)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   virgl_encoder_set_polygon_stipple(vctx, ps);
+}
+
+static void virgl_set_scissor_states(struct pipe_context *ctx,
+                                    unsigned start_slot,
+                                    unsigned num_scissor,
+                                   const struct pipe_scissor_state *ss)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   virgl_encoder_set_scissor_state(vctx, start_slot, num_scissor, ss);
+}
+
+static void virgl_set_sample_mask(struct pipe_context *ctx,
+                                 unsigned sample_mask)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   virgl_encoder_set_sample_mask(vctx, sample_mask);
+}
+
+static void virgl_set_clip_state(struct pipe_context *ctx,
+                                const struct pipe_clip_state *clip)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   virgl_encoder_set_clip_state(vctx, clip);
+}
+
+static void virgl_resource_copy_region(struct pipe_context *ctx,
+                                      struct pipe_resource *dst,
+                                      unsigned dst_level,
+                                      unsigned dstx, unsigned dsty, unsigned dstz,
+                                      struct pipe_resource *src,
+                                      unsigned src_level,
+                                      const struct pipe_box *src_box)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_resource *dres = virgl_resource(dst);
+   struct virgl_resource *sres = virgl_resource(src);
+
+   dres->clean = FALSE;
+   virgl_encode_resource_copy_region(vctx, dres,
+                                    dst_level, dstx, dsty, dstz,
+                                    sres, src_level,
+                                    src_box);
+}
+
+static void
+virgl_flush_resource(struct pipe_context *pipe,
+                    struct pipe_resource *resource)
+{
+}
+
+static void virgl_blit(struct pipe_context *ctx,
+                      const struct pipe_blit_info *blit)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_resource *dres = virgl_resource(blit->dst.resource);
+   struct virgl_resource *sres = virgl_resource(blit->src.resource);
+
+   dres->clean = FALSE;
+   virgl_encode_blit(vctx, dres, sres,
+                    blit);
+}
+
+static void
+virgl_context_destroy( struct pipe_context *ctx )
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *rs = virgl_screen(ctx->screen);
+
+   vctx->framebuffer.zsbuf = NULL;
+   vctx->framebuffer.nr_cbufs = 0;
+   virgl_encoder_destroy_sub_ctx(vctx, vctx->hw_sub_ctx_id);
+   virgl_flush_eq(vctx, vctx);
+
+   rs->vws->cmd_buf_destroy(vctx->cbuf);
+   if (vctx->uploader)
+      u_upload_destroy(vctx->uploader);
+   util_primconvert_destroy(vctx->primconvert);
+
+   util_slab_destroy(&vctx->texture_transfer_pool);
+   FREE(vctx);
+}
+
+struct pipe_context *virgl_context_create(struct pipe_screen *pscreen,
+                                          void *priv,
+                                          unsigned flags)
+{
+   struct virgl_context *vctx;
+   struct virgl_screen *rs = virgl_screen(pscreen);
+   vctx = CALLOC_STRUCT(virgl_context);
+
+   vctx->cbuf = rs->vws->cmd_buf_create(rs->vws);
+   if (!vctx->cbuf) {
+      FREE(vctx);
+      return NULL;
+   }
+
+   vctx->base.destroy = virgl_context_destroy;
+   vctx->base.create_surface = virgl_create_surface;
+   vctx->base.surface_destroy = virgl_surface_destroy;
+   vctx->base.set_framebuffer_state = virgl_set_framebuffer_state;
+   vctx->base.create_blend_state = virgl_create_blend_state;
+   vctx->base.bind_blend_state = virgl_bind_blend_state;
+   vctx->base.delete_blend_state = virgl_delete_blend_state;
+   vctx->base.create_depth_stencil_alpha_state = virgl_create_depth_stencil_alpha_state;
+   vctx->base.bind_depth_stencil_alpha_state = virgl_bind_depth_stencil_alpha_state;
+   vctx->base.delete_depth_stencil_alpha_state = virgl_delete_depth_stencil_alpha_state;
+   vctx->base.create_rasterizer_state = virgl_create_rasterizer_state;
+   vctx->base.bind_rasterizer_state = virgl_bind_rasterizer_state;
+   vctx->base.delete_rasterizer_state = virgl_delete_rasterizer_state;
+
+   vctx->base.set_viewport_states = virgl_set_viewport_states;
+   vctx->base.create_vertex_elements_state = virgl_create_vertex_elements_state;
+   vctx->base.bind_vertex_elements_state = virgl_bind_vertex_elements_state;
+   vctx->base.delete_vertex_elements_state = virgl_delete_vertex_elements_state;
+   vctx->base.set_vertex_buffers = virgl_set_vertex_buffers;
+   vctx->base.set_index_buffer = virgl_set_index_buffer;
+   vctx->base.set_constant_buffer = virgl_set_constant_buffer;
+   vctx->base.transfer_inline_write = virgl_transfer_inline_write;
+
+   vctx->base.create_vs_state = virgl_create_vs_state;
+   vctx->base.create_gs_state = virgl_create_gs_state;
+   vctx->base.create_fs_state = virgl_create_fs_state;
+
+   vctx->base.bind_vs_state = virgl_bind_vs_state;
+   vctx->base.bind_gs_state = virgl_bind_gs_state;
+   vctx->base.bind_fs_state = virgl_bind_fs_state;
+
+   vctx->base.delete_vs_state = virgl_delete_vs_state;
+   vctx->base.delete_gs_state = virgl_delete_gs_state;
+   vctx->base.delete_fs_state = virgl_delete_fs_state;
+
+   vctx->base.clear = virgl_clear;
+   vctx->base.draw_vbo = virgl_draw_vbo;
+   vctx->base.flush = virgl_flush_from_st;
+   vctx->base.screen = pscreen;
+   vctx->base.create_sampler_view = virgl_create_sampler_view;
+   vctx->base.sampler_view_destroy = virgl_destroy_sampler_view;
+   vctx->base.set_sampler_views = virgl_set_sampler_views;
+
+   vctx->base.create_sampler_state = virgl_create_sampler_state;
+   vctx->base.delete_sampler_state = virgl_delete_sampler_state;
+   vctx->base.bind_sampler_states = virgl_bind_sampler_states;
+
+   vctx->base.set_polygon_stipple = virgl_set_polygon_stipple;
+   vctx->base.set_scissor_states = virgl_set_scissor_states;
+   vctx->base.set_sample_mask = virgl_set_sample_mask;
+   vctx->base.set_stencil_ref = virgl_set_stencil_ref;
+   vctx->base.set_clip_state = virgl_set_clip_state;
+
+   vctx->base.set_blend_color = virgl_set_blend_color;
+
+   vctx->base.resource_copy_region = virgl_resource_copy_region;
+   vctx->base.flush_resource = virgl_flush_resource;
+   vctx->base.blit =  virgl_blit;
+
+   virgl_init_context_resource_functions(&vctx->base);
+   virgl_init_query_functions(vctx);
+   virgl_init_so_functions(vctx);
+
+   list_inithead(&vctx->to_flush_bufs);
+   util_slab_create(&vctx->texture_transfer_pool, sizeof(struct virgl_transfer),
+                    16, UTIL_SLAB_SINGLETHREADED);
+
+   vctx->primconvert = util_primconvert_create(&vctx->base, rs->caps.caps.v1.prim_mask);
+   vctx->uploader = u_upload_create(&vctx->base, 1024 * 1024, 256,
+                                     PIPE_BIND_INDEX_BUFFER);
+   if (!vctx->uploader)
+           goto fail;
+
+   vctx->hw_sub_ctx_id = rs->sub_ctx_id++;
+   virgl_encoder_create_sub_ctx(vctx, vctx->hw_sub_ctx_id);
+
+   virgl_encoder_set_sub_ctx(vctx, vctx->hw_sub_ctx_id);
+   return &vctx->base;
+fail:
+   return NULL;
+}
diff --git a/src/gallium/drivers/virgl/virgl_context.h b/src/gallium/drivers/virgl/virgl_context.h
new file mode 100644
index 00000000000..adb8adef33c
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_context.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_CONTEXT_H
+#define VIRGL_CONTEXT_H
+
+#include "pipe/p_state.h"
+#include "pipe/p_context.h"
+#include "util/u_slab.h"
+#include "util/list.h"
+
+struct pipe_screen;
+struct tgsi_token;
+struct u_upload_mgr;
+struct virgl_cmd_buf;
+
+struct virgl_sampler_view {
+   struct pipe_sampler_view base;
+   uint32_t handle;
+};
+
+struct virgl_so_target {
+   struct pipe_stream_output_target base;
+   uint32_t handle;
+};
+
+struct virgl_textures_info {
+   struct virgl_sampler_view *views[16];
+   uint32_t enabled_mask;
+};
+
+struct virgl_context {
+   struct pipe_context base;
+   struct virgl_cmd_buf *cbuf;
+
+   struct virgl_textures_info samplers[PIPE_SHADER_TYPES];
+
+   struct pipe_framebuffer_state framebuffer;
+
+   struct util_slab_mempool texture_transfer_pool;
+
+   struct pipe_index_buffer index_buffer;
+   struct u_upload_mgr *uploader;
+
+   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+   unsigned num_vertex_buffers;
+   boolean vertex_array_dirty;
+
+   struct virgl_so_target so_targets[PIPE_MAX_SO_BUFFERS];
+   unsigned num_so_targets;
+
+   struct pipe_resource *ubos[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
+   int num_transfers;
+   int num_draws;
+   struct list_head to_flush_bufs;
+
+   struct primconvert_context *primconvert;
+   uint32_t hw_sub_ctx_id;
+};
+
+static inline struct virgl_sampler_view *
+virgl_sampler_view(struct pipe_sampler_view *view)
+{
+   return (struct virgl_sampler_view *)view;
+};
+
+static inline struct virgl_so_target *
+virgl_so_target(struct pipe_stream_output_target *target)
+{
+   return (struct virgl_so_target *)target;
+}
+
+static inline struct virgl_context *virgl_context(struct pipe_context *ctx)
+{
+   return (struct virgl_context *)ctx;
+}
+
+struct pipe_context *virgl_context_create(struct pipe_screen *pscreen,
+                                          void *priv, unsigned flags);
+
+void virgl_init_blit_functions(struct virgl_context *vctx);
+void virgl_init_query_functions(struct virgl_context *vctx);
+void virgl_init_so_functions(struct virgl_context *vctx);
+
+void virgl_transfer_inline_write(struct pipe_context *ctx,
+                                struct pipe_resource *res,
+                                unsigned level,
+                                unsigned usage,
+                                const struct pipe_box *box,
+                                const void *data,
+                                unsigned stride,
+                                unsigned layer_stride);
+
+struct tgsi_token *virgl_tgsi_transform(const struct tgsi_token *tokens_in);
+
+#endif
diff --git a/src/gallium/drivers/virgl/virgl_encode.c b/src/gallium/drivers/virgl/virgl_encode.c
new file mode 100644
index 00000000000..22fb5292819
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_encode.c
@@ -0,0 +1,867 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <stdint.h>
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+
+#include "virgl_context.h"
+#include "virgl_encode.h"
+#include "virgl_protocol.h"
+#include "virgl_resource.h"
+#include "virgl_screen.h"
+
+static int virgl_encoder_write_cmd_dword(struct virgl_context *ctx,
+                                        uint32_t dword)
+{
+   int len = (dword >> 16);
+
+   if ((ctx->cbuf->cdw + len + 1) > VIRGL_MAX_CMDBUF_DWORDS)
+      ctx->base.flush(&ctx->base, NULL, 0);
+
+   virgl_encoder_write_dword(ctx->cbuf, dword);
+   return 0;
+}
+
+static void virgl_encoder_write_res(struct virgl_context *ctx,
+                                    struct virgl_resource *res)
+{
+   struct virgl_winsys *vws = virgl_screen(ctx->base.screen)->vws;
+
+   if (res && res->hw_res)
+      vws->emit_res(vws, ctx->cbuf, res->hw_res, TRUE);
+   else {
+      virgl_encoder_write_dword(ctx->cbuf, 0);
+   }
+}
+
+int virgl_encode_bind_object(struct virgl_context *ctx,
+                            uint32_t handle, uint32_t object)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_BIND_OBJECT, object, 1));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   return 0;
+}
+
+int virgl_encode_delete_object(struct virgl_context *ctx,
+                              uint32_t handle, uint32_t object)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_DESTROY_OBJECT, object, 1));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   return 0;
+}
+
+int virgl_encode_blend_state(struct virgl_context *ctx,
+                            uint32_t handle,
+                            const struct pipe_blend_state *blend_state)
+{
+   uint32_t tmp;
+   int i;
+
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_BLEND, VIRGL_OBJ_BLEND_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+
+   tmp =
+      VIRGL_OBJ_BLEND_S0_INDEPENDENT_BLEND_ENABLE(blend_state->independent_blend_enable) |
+      VIRGL_OBJ_BLEND_S0_LOGICOP_ENABLE(blend_state->logicop_enable) |
+      VIRGL_OBJ_BLEND_S0_DITHER(blend_state->dither) |
+      VIRGL_OBJ_BLEND_S0_ALPHA_TO_COVERAGE(blend_state->alpha_to_coverage) |
+      VIRGL_OBJ_BLEND_S0_ALPHA_TO_ONE(blend_state->alpha_to_one);
+
+   virgl_encoder_write_dword(ctx->cbuf, tmp);
+
+   tmp = VIRGL_OBJ_BLEND_S1_LOGICOP_FUNC(blend_state->logicop_func);
+   virgl_encoder_write_dword(ctx->cbuf, tmp);
+
+   for (i = 0; i < VIRGL_MAX_COLOR_BUFS; i++) {
+      tmp =
+         VIRGL_OBJ_BLEND_S2_RT_BLEND_ENABLE(blend_state->rt[i].blend_enable) |
+         VIRGL_OBJ_BLEND_S2_RT_RGB_FUNC(blend_state->rt[i].rgb_func) |
+         VIRGL_OBJ_BLEND_S2_RT_RGB_SRC_FACTOR(blend_state->rt[i].rgb_src_factor) |
+         VIRGL_OBJ_BLEND_S2_RT_RGB_DST_FACTOR(blend_state->rt[i].rgb_dst_factor)|
+         VIRGL_OBJ_BLEND_S2_RT_ALPHA_FUNC(blend_state->rt[i].alpha_func) |
+         VIRGL_OBJ_BLEND_S2_RT_ALPHA_SRC_FACTOR(blend_state->rt[i].alpha_src_factor) |
+         VIRGL_OBJ_BLEND_S2_RT_ALPHA_DST_FACTOR(blend_state->rt[i].alpha_dst_factor) |
+         VIRGL_OBJ_BLEND_S2_RT_COLORMASK(blend_state->rt[i].colormask);
+      virgl_encoder_write_dword(ctx->cbuf, tmp);
+   }
+   return 0;
+}
+
+int virgl_encode_dsa_state(struct virgl_context *ctx,
+                          uint32_t handle,
+                          const struct pipe_depth_stencil_alpha_state *dsa_state)
+{
+   uint32_t tmp;
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_DSA, VIRGL_OBJ_DSA_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+
+   tmp = VIRGL_OBJ_DSA_S0_DEPTH_ENABLE(dsa_state->depth.enabled) |
+      VIRGL_OBJ_DSA_S0_DEPTH_WRITEMASK(dsa_state->depth.writemask) |
+      VIRGL_OBJ_DSA_S0_DEPTH_FUNC(dsa_state->depth.func) |
+      VIRGL_OBJ_DSA_S0_ALPHA_ENABLED(dsa_state->alpha.enabled) |
+      VIRGL_OBJ_DSA_S0_ALPHA_FUNC(dsa_state->alpha.func);
+   virgl_encoder_write_dword(ctx->cbuf, tmp);
+
+   for (i = 0; i < 2; i++) {
+      tmp = VIRGL_OBJ_DSA_S1_STENCIL_ENABLED(dsa_state->stencil[i].enabled) |
+         VIRGL_OBJ_DSA_S1_STENCIL_FUNC(dsa_state->stencil[i].func) |
+         VIRGL_OBJ_DSA_S1_STENCIL_FAIL_OP(dsa_state->stencil[i].fail_op) |
+         VIRGL_OBJ_DSA_S1_STENCIL_ZPASS_OP(dsa_state->stencil[i].zpass_op) |
+         VIRGL_OBJ_DSA_S1_STENCIL_ZFAIL_OP(dsa_state->stencil[i].zfail_op) |
+         VIRGL_OBJ_DSA_S1_STENCIL_VALUEMASK(dsa_state->stencil[i].valuemask) |
+         VIRGL_OBJ_DSA_S1_STENCIL_WRITEMASK(dsa_state->stencil[i].writemask);
+      virgl_encoder_write_dword(ctx->cbuf, tmp);
+   }
+
+   virgl_encoder_write_dword(ctx->cbuf, fui(dsa_state->alpha.ref_value));
+   return 0;
+}
+int virgl_encode_rasterizer_state(struct virgl_context *ctx,
+                                  uint32_t handle,
+                                  const struct pipe_rasterizer_state *state)
+{
+   uint32_t tmp;
+
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_RASTERIZER, VIRGL_OBJ_RS_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+
+   tmp = VIRGL_OBJ_RS_S0_FLATSHADE(state->flatshade) |
+      VIRGL_OBJ_RS_S0_DEPTH_CLIP(state->depth_clip) |
+      VIRGL_OBJ_RS_S0_CLIP_HALFZ(state->clip_halfz) |
+      VIRGL_OBJ_RS_S0_RASTERIZER_DISCARD(state->rasterizer_discard) |
+      VIRGL_OBJ_RS_S0_FLATSHADE_FIRST(state->flatshade_first) |
+      VIRGL_OBJ_RS_S0_LIGHT_TWOSIZE(state->light_twoside) |
+      VIRGL_OBJ_RS_S0_SPRITE_COORD_MODE(state->sprite_coord_mode) |
+      VIRGL_OBJ_RS_S0_POINT_QUAD_RASTERIZATION(state->point_quad_rasterization) |
+      VIRGL_OBJ_RS_S0_CULL_FACE(state->cull_face) |
+      VIRGL_OBJ_RS_S0_FILL_FRONT(state->fill_front) |
+      VIRGL_OBJ_RS_S0_FILL_BACK(state->fill_back) |
+      VIRGL_OBJ_RS_S0_SCISSOR(state->scissor) |
+      VIRGL_OBJ_RS_S0_FRONT_CCW(state->front_ccw) |
+      VIRGL_OBJ_RS_S0_CLAMP_VERTEX_COLOR(state->clamp_vertex_color) |
+      VIRGL_OBJ_RS_S0_CLAMP_FRAGMENT_COLOR(state->clamp_fragment_color) |
+      VIRGL_OBJ_RS_S0_OFFSET_LINE(state->offset_line) |
+      VIRGL_OBJ_RS_S0_OFFSET_POINT(state->offset_point) |
+      VIRGL_OBJ_RS_S0_OFFSET_TRI(state->offset_tri) |
+      VIRGL_OBJ_RS_S0_POLY_SMOOTH(state->poly_smooth) |
+      VIRGL_OBJ_RS_S0_POLY_STIPPLE_ENABLE(state->poly_stipple_enable) |
+      VIRGL_OBJ_RS_S0_POINT_SMOOTH(state->point_smooth) |
+      VIRGL_OBJ_RS_S0_POINT_SIZE_PER_VERTEX(state->point_size_per_vertex) |
+      VIRGL_OBJ_RS_S0_MULTISAMPLE(state->multisample) |
+      VIRGL_OBJ_RS_S0_LINE_SMOOTH(state->line_smooth) |
+      VIRGL_OBJ_RS_S0_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
+      VIRGL_OBJ_RS_S0_LINE_LAST_PIXEL(state->line_last_pixel) |
+      VIRGL_OBJ_RS_S0_HALF_PIXEL_CENTER(state->half_pixel_center) |
+      VIRGL_OBJ_RS_S0_BOTTOM_EDGE_RULE(state->bottom_edge_rule);
+
+   virgl_encoder_write_dword(ctx->cbuf, tmp); /* S0 */
+   virgl_encoder_write_dword(ctx->cbuf, fui(state->point_size)); /* S1 */
+   virgl_encoder_write_dword(ctx->cbuf, state->sprite_coord_enable); /* S2 */
+   tmp = VIRGL_OBJ_RS_S3_LINE_STIPPLE_PATTERN(state->line_stipple_pattern) |
+      VIRGL_OBJ_RS_S3_LINE_STIPPLE_FACTOR(state->line_stipple_factor) |
+      VIRGL_OBJ_RS_S3_CLIP_PLANE_ENABLE(state->clip_plane_enable);
+   virgl_encoder_write_dword(ctx->cbuf, tmp); /* S3 */
+   virgl_encoder_write_dword(ctx->cbuf, fui(state->line_width)); /* S4 */
+   virgl_encoder_write_dword(ctx->cbuf, fui(state->offset_units)); /* S5 */
+   virgl_encoder_write_dword(ctx->cbuf, fui(state->offset_scale)); /* S6 */
+   virgl_encoder_write_dword(ctx->cbuf, fui(state->offset_clamp)); /* S7 */
+   return 0;
+}
+
+static void virgl_emit_shader_header(struct virgl_context *ctx,
+                                     uint32_t handle, uint32_t len,
+                                     uint32_t type, uint32_t offlen,
+                                     uint32_t num_tokens)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_SHADER, len));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   virgl_encoder_write_dword(ctx->cbuf, type);
+   virgl_encoder_write_dword(ctx->cbuf, offlen);
+   virgl_encoder_write_dword(ctx->cbuf, num_tokens);
+}
+
+static void virgl_emit_shader_streamout(struct virgl_context *ctx,
+                                        const struct pipe_stream_output_info *so_info)
+{
+   int num_outputs = 0;
+   int i;
+   uint32_t tmp;
+
+   if (so_info)
+      num_outputs = so_info->num_outputs;
+
+   virgl_encoder_write_dword(ctx->cbuf, num_outputs);
+   if (num_outputs) {
+      for (i = 0; i < 4; i++)
+         virgl_encoder_write_dword(ctx->cbuf, so_info->stride[i]);
+
+      for (i = 0; i < so_info->num_outputs; i++) {
+         tmp =
+           VIRGL_OBJ_SHADER_SO_OUTPUT_REGISTER_INDEX(so_info->output[i].register_index) |
+           VIRGL_OBJ_SHADER_SO_OUTPUT_START_COMPONENT(so_info->output[i].start_component) |
+           VIRGL_OBJ_SHADER_SO_OUTPUT_NUM_COMPONENTS(so_info->output[i].num_components) |
+           VIRGL_OBJ_SHADER_SO_OUTPUT_BUFFER(so_info->output[i].output_buffer) |
+           VIRGL_OBJ_SHADER_SO_OUTPUT_DST_OFFSET(so_info->output[i].dst_offset);
+         virgl_encoder_write_dword(ctx->cbuf, tmp);
+         virgl_encoder_write_dword(ctx->cbuf, 0);
+      }
+   }
+}
+
+int virgl_encode_shader_state(struct virgl_context *ctx,
+                              uint32_t handle,
+                              uint32_t type,
+                              const struct pipe_stream_output_info *so_info,
+                              const struct tgsi_token *tokens)
+{
+   char *str, *sptr;
+   uint32_t shader_len, len;
+   bool bret;
+   int num_tokens = tgsi_num_tokens(tokens);
+   int str_total_size = 65536;
+   int retry_size = 1;
+   uint32_t left_bytes, base_hdr_size, strm_hdr_size, thispass;
+   bool first_pass;
+   str = CALLOC(1, str_total_size);
+   if (!str)
+      return -1;
+
+   do {
+      int old_size;
+
+      bret = tgsi_dump_str(tokens, TGSI_DUMP_FLOAT_AS_HEX, str, str_total_size);
+      if (bret == false) {
+         fprintf(stderr, "Failed to translate shader in available space - trying again\n");
+         old_size = str_total_size;
+         str_total_size = 65536 * ++retry_size;
+         str = REALLOC(str, old_size, str_total_size);
+         if (!str)
+            return -1;
+      }
+   } while (bret == false && retry_size < 10);
+
+   if (bret == false)
+      return -1;
+
+   shader_len = strlen(str) + 1;
+
+   left_bytes = shader_len;
+
+   base_hdr_size = 5;
+   strm_hdr_size = so_info->num_outputs ? so_info->num_outputs * 2 + 4 : 0;
+   first_pass = true;
+   sptr = str;
+   while (left_bytes) {
+      uint32_t length, offlen;
+      int hdr_len = base_hdr_size + (first_pass ? strm_hdr_size : 0);
+      if (ctx->cbuf->cdw + hdr_len + 1 > VIRGL_MAX_CMDBUF_DWORDS)
+         ctx->base.flush(&ctx->base, NULL, 0);
+
+      thispass = (VIRGL_MAX_CMDBUF_DWORDS - ctx->cbuf->cdw - hdr_len - 1) * 4;
+
+      length = MIN2(thispass, left_bytes);
+      len = ((length + 3) / 4) + hdr_len;
+
+      if (first_pass)
+         offlen = VIRGL_OBJ_SHADER_OFFSET_VAL(shader_len);
+      else
+         offlen = VIRGL_OBJ_SHADER_OFFSET_VAL((uintptr_t)sptr - (uintptr_t)str) | VIRGL_OBJ_SHADER_OFFSET_CONT;
+
+      virgl_emit_shader_header(ctx, handle, len, type, offlen, num_tokens);
+
+      virgl_emit_shader_streamout(ctx, first_pass ? so_info : NULL);
+
+      virgl_encoder_write_block(ctx->cbuf, (uint8_t *)sptr, length);
+
+      sptr += length;
+      first_pass = false;
+      left_bytes -= length;
+   }
+
+   FREE(str);
+   return 0;
+}
+
+
+int virgl_encode_clear(struct virgl_context *ctx,
+                      unsigned buffers,
+                      const union pipe_color_union *color,
+                      double depth, unsigned stencil)
+{
+   int i;
+
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CLEAR, 0, VIRGL_OBJ_CLEAR_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, buffers);
+   for (i = 0; i < 4; i++)
+      virgl_encoder_write_dword(ctx->cbuf, color->ui[i]);
+   virgl_encoder_write_qword(ctx->cbuf, *(uint64_t *)&depth);
+   virgl_encoder_write_dword(ctx->cbuf, stencil);
+   return 0;
+}
+
+int virgl_encoder_set_framebuffer_state(struct virgl_context *ctx,
+                                       const struct pipe_framebuffer_state *state)
+{
+   struct virgl_surface *zsurf = virgl_surface(state->zsbuf);
+   int i;
+
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_FRAMEBUFFER_STATE, 0, VIRGL_SET_FRAMEBUFFER_STATE_SIZE(state->nr_cbufs)));
+   virgl_encoder_write_dword(ctx->cbuf, state->nr_cbufs);
+   virgl_encoder_write_dword(ctx->cbuf, zsurf ? zsurf->handle : 0);
+   for (i = 0; i < state->nr_cbufs; i++) {
+      struct virgl_surface *surf = virgl_surface(state->cbufs[i]);
+      virgl_encoder_write_dword(ctx->cbuf, surf ? surf->handle : 0);
+   }
+
+   return 0;
+}
+
+int virgl_encoder_set_viewport_states(struct virgl_context *ctx,
+                                      int start_slot,
+                                      int num_viewports,
+                                      const struct pipe_viewport_state *states)
+{
+   int i,v;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_VIEWPORT_STATE, 0, VIRGL_SET_VIEWPORT_STATE_SIZE(num_viewports)));
+   virgl_encoder_write_dword(ctx->cbuf, start_slot);
+   for (v = 0; v < num_viewports; v++) {
+      for (i = 0; i < 3; i++)
+         virgl_encoder_write_dword(ctx->cbuf, fui(states[v].scale[i]));
+      for (i = 0; i < 3; i++)
+         virgl_encoder_write_dword(ctx->cbuf, fui(states[v].translate[i]));
+   }
+   return 0;
+}
+
+int virgl_encoder_create_vertex_elements(struct virgl_context *ctx,
+                                        uint32_t handle,
+                                        unsigned num_elements,
+                                        const struct pipe_vertex_element *element)
+{
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_VERTEX_ELEMENTS, VIRGL_OBJ_VERTEX_ELEMENTS_SIZE(num_elements)));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   for (i = 0; i < num_elements; i++) {
+      virgl_encoder_write_dword(ctx->cbuf, element[i].src_offset);
+      virgl_encoder_write_dword(ctx->cbuf, element[i].instance_divisor);
+      virgl_encoder_write_dword(ctx->cbuf, element[i].vertex_buffer_index);
+      virgl_encoder_write_dword(ctx->cbuf, element[i].src_format);
+   }
+   return 0;
+}
+
+int virgl_encoder_set_vertex_buffers(struct virgl_context *ctx,
+                                    unsigned num_buffers,
+                                    const struct pipe_vertex_buffer *buffers)
+{
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_VERTEX_BUFFERS, 0, VIRGL_SET_VERTEX_BUFFERS_SIZE(num_buffers)));
+   for (i = 0; i < num_buffers; i++) {
+      struct virgl_resource *res = virgl_resource(buffers[i].buffer);
+      virgl_encoder_write_dword(ctx->cbuf, buffers[i].stride);
+      virgl_encoder_write_dword(ctx->cbuf, buffers[i].buffer_offset);
+      virgl_encoder_write_res(ctx, res);
+   }
+   return 0;
+}
+
+int virgl_encoder_set_index_buffer(struct virgl_context *ctx,
+                                  const struct pipe_index_buffer *ib)
+{
+   int length = VIRGL_SET_INDEX_BUFFER_SIZE(ib);
+   struct virgl_resource *res = NULL;
+   if (ib)
+      res = virgl_resource(ib->buffer);
+
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_INDEX_BUFFER, 0, length));
+   virgl_encoder_write_res(ctx, res);
+   if (ib) {
+      virgl_encoder_write_dword(ctx->cbuf, ib->index_size);
+      virgl_encoder_write_dword(ctx->cbuf, ib->offset);
+   }
+   return 0;
+}
+
+int virgl_encoder_draw_vbo(struct virgl_context *ctx,
+                          const struct pipe_draw_info *info)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_DRAW_VBO, 0, VIRGL_DRAW_VBO_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, info->start);
+   virgl_encoder_write_dword(ctx->cbuf, info->count);
+   virgl_encoder_write_dword(ctx->cbuf, info->mode);
+   virgl_encoder_write_dword(ctx->cbuf, info->indexed);
+   virgl_encoder_write_dword(ctx->cbuf, info->instance_count);
+   virgl_encoder_write_dword(ctx->cbuf, info->index_bias);
+   virgl_encoder_write_dword(ctx->cbuf, info->start_instance);
+   virgl_encoder_write_dword(ctx->cbuf, info->primitive_restart);
+   virgl_encoder_write_dword(ctx->cbuf, info->restart_index);
+   virgl_encoder_write_dword(ctx->cbuf, info->min_index);
+   virgl_encoder_write_dword(ctx->cbuf, info->max_index);
+   if (info->count_from_stream_output)
+      virgl_encoder_write_dword(ctx->cbuf, info->count_from_stream_output->buffer_size);
+   else
+      virgl_encoder_write_dword(ctx->cbuf, 0);
+   return 0;
+}
+
+int virgl_encoder_create_surface(struct virgl_context *ctx,
+                                uint32_t handle,
+                                struct virgl_resource *res,
+                                const struct pipe_surface *templat)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_SURFACE, VIRGL_OBJ_SURFACE_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   virgl_encoder_write_res(ctx, res);
+   virgl_encoder_write_dword(ctx->cbuf, templat->format);
+   if (templat->texture->target == PIPE_BUFFER) {
+      virgl_encoder_write_dword(ctx->cbuf, templat->u.buf.first_element);
+      virgl_encoder_write_dword(ctx->cbuf, templat->u.buf.last_element);
+
+   } else {
+      virgl_encoder_write_dword(ctx->cbuf, templat->u.tex.level);
+      virgl_encoder_write_dword(ctx->cbuf, templat->u.tex.first_layer | (templat->u.tex.last_layer << 16));
+   }
+   return 0;
+}
+
+int virgl_encoder_create_so_target(struct virgl_context *ctx,
+                                  uint32_t handle,
+                                  struct virgl_resource *res,
+                                  unsigned buffer_offset,
+                                  unsigned buffer_size)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_STREAMOUT_TARGET, VIRGL_OBJ_STREAMOUT_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   virgl_encoder_write_res(ctx, res);
+   virgl_encoder_write_dword(ctx->cbuf, buffer_offset);
+   virgl_encoder_write_dword(ctx->cbuf, buffer_size);
+   return 0;
+}
+
+static void virgl_encoder_iw_emit_header_1d(struct virgl_context *ctx,
+                                           struct virgl_resource *res,
+                                           unsigned level, unsigned usage,
+                                           const struct pipe_box *box,
+                                           unsigned stride, unsigned layer_stride)
+{
+   virgl_encoder_write_res(ctx, res);
+   virgl_encoder_write_dword(ctx->cbuf, level);
+   virgl_encoder_write_dword(ctx->cbuf, usage);
+   virgl_encoder_write_dword(ctx->cbuf, stride);
+   virgl_encoder_write_dword(ctx->cbuf, layer_stride);
+   virgl_encoder_write_dword(ctx->cbuf, box->x);
+   virgl_encoder_write_dword(ctx->cbuf, box->y);
+   virgl_encoder_write_dword(ctx->cbuf, box->z);
+   virgl_encoder_write_dword(ctx->cbuf, box->width);
+   virgl_encoder_write_dword(ctx->cbuf, box->height);
+   virgl_encoder_write_dword(ctx->cbuf, box->depth);
+}
+
+int virgl_encoder_inline_write(struct virgl_context *ctx,
+                              struct virgl_resource *res,
+                              unsigned level, unsigned usage,
+                              const struct pipe_box *box,
+                              const void *data, unsigned stride,
+                              unsigned layer_stride)
+{
+   uint32_t size = (stride ? stride : box->width) * box->height;
+   uint32_t length, thispass, left_bytes;
+   struct pipe_box mybox = *box;
+
+   length = 11 + (size + 3) / 4;
+   if ((ctx->cbuf->cdw + length + 1) > VIRGL_MAX_CMDBUF_DWORDS) {
+      if (box->height > 1 || box->depth > 1) {
+         debug_printf("inline transfer failed due to multi dimensions and too large\n");
+         assert(0);
+      }
+   }
+
+   left_bytes = size;
+   while (left_bytes) {
+      if (ctx->cbuf->cdw + 12 > VIRGL_MAX_CMDBUF_DWORDS)
+         ctx->base.flush(&ctx->base, NULL, 0);
+
+      thispass = (VIRGL_MAX_CMDBUF_DWORDS - ctx->cbuf->cdw - 12) * 4;
+
+      length = MIN2(thispass, left_bytes);
+
+      mybox.width = length;
+      virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_RESOURCE_INLINE_WRITE, 0, ((length + 3) / 4) + 11));
+      virgl_encoder_iw_emit_header_1d(ctx, res, level, usage, &mybox, stride, layer_stride);
+      virgl_encoder_write_block(ctx->cbuf, data, length);
+      left_bytes -= length;
+      mybox.x += length;
+      data += length;
+   }
+   return 0;
+}
+
+int virgl_encoder_flush_frontbuffer(struct virgl_context *ctx,
+                                   struct virgl_resource *res)
+{
+//   virgl_encoder_write_dword(ctx->cbuf, VIRGL_CMD0(VIRGL_CCMD_FLUSH_FRONTUBFFER, 0, 1));
+//   virgl_encoder_write_dword(ctx->cbuf, res_handle);
+   return 0;
+}
+
+int virgl_encode_sampler_state(struct virgl_context *ctx,
+                              uint32_t handle,
+                              const struct pipe_sampler_state *state)
+{
+   uint32_t tmp;
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_SAMPLER_STATE, VIRGL_OBJ_SAMPLER_STATE_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+
+   tmp = VIRGL_OBJ_SAMPLE_STATE_S0_WRAP_S(state->wrap_s) |
+      VIRGL_OBJ_SAMPLE_STATE_S0_WRAP_T(state->wrap_t) |
+      VIRGL_OBJ_SAMPLE_STATE_S0_WRAP_R(state->wrap_r) |
+      VIRGL_OBJ_SAMPLE_STATE_S0_MIN_IMG_FILTER(state->min_img_filter) |
+      VIRGL_OBJ_SAMPLE_STATE_S0_MIN_MIP_FILTER(state->min_mip_filter) |
+      VIRGL_OBJ_SAMPLE_STATE_S0_MAG_IMG_FILTER(state->mag_img_filter) |
+      VIRGL_OBJ_SAMPLE_STATE_S0_COMPARE_MODE(state->compare_mode) |
+      VIRGL_OBJ_SAMPLE_STATE_S0_COMPARE_FUNC(state->compare_func);
+
+   virgl_encoder_write_dword(ctx->cbuf, tmp);
+   virgl_encoder_write_dword(ctx->cbuf, fui(state->lod_bias));
+   virgl_encoder_write_dword(ctx->cbuf, fui(state->min_lod));
+   virgl_encoder_write_dword(ctx->cbuf, fui(state->max_lod));
+   for (i = 0; i <  4; i++)
+      virgl_encoder_write_dword(ctx->cbuf, state->border_color.ui[i]);
+   return 0;
+}
+
+
+int virgl_encode_sampler_view(struct virgl_context *ctx,
+                             uint32_t handle,
+                             struct virgl_resource *res,
+                             const struct pipe_sampler_view *state)
+{
+   uint32_t tmp;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_SAMPLER_VIEW, VIRGL_OBJ_SAMPLER_VIEW_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   virgl_encoder_write_res(ctx, res);
+   virgl_encoder_write_dword(ctx->cbuf, state->format);
+   if (res->u.b.target == PIPE_BUFFER) {
+      virgl_encoder_write_dword(ctx->cbuf, state->u.buf.first_element);
+      virgl_encoder_write_dword(ctx->cbuf, state->u.buf.last_element);
+   } else {
+      virgl_encoder_write_dword(ctx->cbuf, state->u.tex.first_layer | state->u.tex.last_layer << 16);
+      virgl_encoder_write_dword(ctx->cbuf, state->u.tex.first_level | state->u.tex.last_level << 8);
+   }
+   tmp = VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE_R(state->swizzle_r) |
+      VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE_G(state->swizzle_g) |
+      VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE_B(state->swizzle_b) |
+      VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE_A(state->swizzle_a);
+   virgl_encoder_write_dword(ctx->cbuf, tmp);
+   return 0;
+}
+
+int virgl_encode_set_sampler_views(struct virgl_context *ctx,
+                                  uint32_t shader_type,
+                                  uint32_t start_slot,
+                                  uint32_t num_views,
+                                  struct virgl_sampler_view **views)
+{
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_SAMPLER_VIEWS, 0, VIRGL_SET_SAMPLER_VIEWS_SIZE(num_views)));
+   virgl_encoder_write_dword(ctx->cbuf, shader_type);
+   virgl_encoder_write_dword(ctx->cbuf, start_slot);
+   for (i = 0; i < num_views; i++) {
+      uint32_t handle = views[i] ? views[i]->handle : 0;
+      virgl_encoder_write_dword(ctx->cbuf, handle);
+   }
+   return 0;
+}
+
+int virgl_encode_bind_sampler_states(struct virgl_context *ctx,
+                                    uint32_t shader_type,
+                                    uint32_t start_slot,
+                                    uint32_t num_handles,
+                                    uint32_t *handles)
+{
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_BIND_SAMPLER_STATES, 0, VIRGL_BIND_SAMPLER_STATES(num_handles)));
+   virgl_encoder_write_dword(ctx->cbuf, shader_type);
+   virgl_encoder_write_dword(ctx->cbuf, start_slot);
+   for (i = 0; i < num_handles; i++)
+      virgl_encoder_write_dword(ctx->cbuf, handles[i]);
+   return 0;
+}
+
+int virgl_encoder_write_constant_buffer(struct virgl_context *ctx,
+                                       uint32_t shader,
+                                       uint32_t index,
+                                       uint32_t size,
+                                       const void *data)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_CONSTANT_BUFFER, 0, size + 2));
+   virgl_encoder_write_dword(ctx->cbuf, shader);
+   virgl_encoder_write_dword(ctx->cbuf, index);
+   if (data)
+      virgl_encoder_write_block(ctx->cbuf, data, size * 4);
+   return 0;
+}
+
+int virgl_encoder_set_uniform_buffer(struct virgl_context *ctx,
+                                     uint32_t shader,
+                                     uint32_t index,
+                                     uint32_t offset,
+                                     uint32_t length,
+                                     struct virgl_resource *res)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_UNIFORM_BUFFER, 0, VIRGL_SET_UNIFORM_BUFFER_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, shader);
+   virgl_encoder_write_dword(ctx->cbuf, index);
+   virgl_encoder_write_dword(ctx->cbuf, offset);
+   virgl_encoder_write_dword(ctx->cbuf, length);
+   virgl_encoder_write_res(ctx, res);
+   return 0;
+}
+
+
+int virgl_encoder_set_stencil_ref(struct virgl_context *ctx,
+                                 const struct pipe_stencil_ref *ref)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_STENCIL_REF, 0, VIRGL_SET_STENCIL_REF_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, VIRGL_STENCIL_REF_VAL(ref->ref_value[0] , (ref->ref_value[1])));
+   return 0;
+}
+
+int virgl_encoder_set_blend_color(struct virgl_context *ctx,
+                                 const struct pipe_blend_color *color)
+{
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_BLEND_COLOR, 0, VIRGL_SET_BLEND_COLOR_SIZE));
+   for (i = 0; i < 4; i++)
+      virgl_encoder_write_dword(ctx->cbuf, fui(color->color[i]));
+   return 0;
+}
+
+int virgl_encoder_set_scissor_state(struct virgl_context *ctx,
+                                    unsigned start_slot,
+                                    int num_scissors,
+                                    const struct pipe_scissor_state *ss)
+{
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_SCISSOR_STATE, 0, VIRGL_SET_SCISSOR_STATE_SIZE(num_scissors)));
+   virgl_encoder_write_dword(ctx->cbuf, start_slot);
+   for (i = 0; i < num_scissors; i++) {
+      virgl_encoder_write_dword(ctx->cbuf, (ss[i].minx | ss[i].miny << 16));
+      virgl_encoder_write_dword(ctx->cbuf, (ss[i].maxx | ss[i].maxy << 16));
+   }
+   return 0;
+}
+
+void virgl_encoder_set_polygon_stipple(struct virgl_context *ctx,
+                                      const struct pipe_poly_stipple *ps)
+{
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_POLYGON_STIPPLE, 0, VIRGL_POLYGON_STIPPLE_SIZE));
+   for (i = 0; i < VIRGL_POLYGON_STIPPLE_SIZE; i++) {
+      virgl_encoder_write_dword(ctx->cbuf, ps->stipple[i]);
+   }
+}
+
+void virgl_encoder_set_sample_mask(struct virgl_context *ctx,
+                                  unsigned sample_mask)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_SAMPLE_MASK, 0, VIRGL_SET_SAMPLE_MASK_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, sample_mask);
+}
+
+void virgl_encoder_set_clip_state(struct virgl_context *ctx,
+                                 const struct pipe_clip_state *clip)
+{
+   int i, j;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_CLIP_STATE, 0, VIRGL_SET_CLIP_STATE_SIZE));
+   for (i = 0; i < VIRGL_MAX_CLIP_PLANES; i++) {
+      for (j = 0; j < 4; j++) {
+         virgl_encoder_write_dword(ctx->cbuf, fui(clip->ucp[i][j]));
+      }
+   }
+}
+
+int virgl_encode_resource_copy_region(struct virgl_context *ctx,
+                                     struct virgl_resource *dst_res,
+                                     unsigned dst_level,
+                                     unsigned dstx, unsigned dsty, unsigned dstz,
+                                     struct virgl_resource *src_res,
+                                     unsigned src_level,
+                                     const struct pipe_box *src_box)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_RESOURCE_COPY_REGION, 0, VIRGL_CMD_RESOURCE_COPY_REGION_SIZE));
+   virgl_encoder_write_res(ctx, dst_res);
+   virgl_encoder_write_dword(ctx->cbuf, dst_level);
+   virgl_encoder_write_dword(ctx->cbuf, dstx);
+   virgl_encoder_write_dword(ctx->cbuf, dsty);
+   virgl_encoder_write_dword(ctx->cbuf, dstz);
+   virgl_encoder_write_res(ctx, src_res);
+   virgl_encoder_write_dword(ctx->cbuf, src_level);
+   virgl_encoder_write_dword(ctx->cbuf, src_box->x);
+   virgl_encoder_write_dword(ctx->cbuf, src_box->y);
+   virgl_encoder_write_dword(ctx->cbuf, src_box->z);
+   virgl_encoder_write_dword(ctx->cbuf, src_box->width);
+   virgl_encoder_write_dword(ctx->cbuf, src_box->height);
+   virgl_encoder_write_dword(ctx->cbuf, src_box->depth);
+   return 0;
+}
+
+int virgl_encode_blit(struct virgl_context *ctx,
+                     struct virgl_resource *dst_res,
+                     struct virgl_resource *src_res,
+                     const struct pipe_blit_info *blit)
+{
+   uint32_t tmp;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_BLIT, 0, VIRGL_CMD_BLIT_SIZE));
+   tmp = VIRGL_CMD_BLIT_S0_MASK(blit->mask) |
+      VIRGL_CMD_BLIT_S0_FILTER(blit->filter) |
+      VIRGL_CMD_BLIT_S0_SCISSOR_ENABLE(blit->scissor_enable);
+   virgl_encoder_write_dword(ctx->cbuf, tmp);
+   virgl_encoder_write_dword(ctx->cbuf, (blit->scissor.minx | blit->scissor.miny << 16));
+   virgl_encoder_write_dword(ctx->cbuf, (blit->scissor.maxx | blit->scissor.maxy << 16));
+
+   virgl_encoder_write_res(ctx, dst_res);
+   virgl_encoder_write_dword(ctx->cbuf, blit->dst.level);
+   virgl_encoder_write_dword(ctx->cbuf, blit->dst.format);
+   virgl_encoder_write_dword(ctx->cbuf, blit->dst.box.x);
+   virgl_encoder_write_dword(ctx->cbuf, blit->dst.box.y);
+   virgl_encoder_write_dword(ctx->cbuf, blit->dst.box.z);
+   virgl_encoder_write_dword(ctx->cbuf, blit->dst.box.width);
+   virgl_encoder_write_dword(ctx->cbuf, blit->dst.box.height);
+   virgl_encoder_write_dword(ctx->cbuf, blit->dst.box.depth);
+
+   virgl_encoder_write_res(ctx, src_res);
+   virgl_encoder_write_dword(ctx->cbuf, blit->src.level);
+   virgl_encoder_write_dword(ctx->cbuf, blit->src.format);
+   virgl_encoder_write_dword(ctx->cbuf, blit->src.box.x);
+   virgl_encoder_write_dword(ctx->cbuf, blit->src.box.y);
+   virgl_encoder_write_dword(ctx->cbuf, blit->src.box.z);
+   virgl_encoder_write_dword(ctx->cbuf, blit->src.box.width);
+   virgl_encoder_write_dword(ctx->cbuf, blit->src.box.height);
+   virgl_encoder_write_dword(ctx->cbuf, blit->src.box.depth);
+   return 0;
+}
+
+int virgl_encoder_create_query(struct virgl_context *ctx,
+                              uint32_t handle,
+                              uint query_type,
+                              uint query_index,
+                              struct virgl_resource *res,
+                              uint32_t offset)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_QUERY, VIRGL_OBJ_QUERY_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   virgl_encoder_write_dword(ctx->cbuf, ((query_type & 0xffff) | (query_index << 16)));
+   virgl_encoder_write_dword(ctx->cbuf, offset);
+   virgl_encoder_write_res(ctx, res);
+   return 0;
+}
+
+int virgl_encoder_begin_query(struct virgl_context *ctx,
+                             uint32_t handle)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_BEGIN_QUERY, 0, 1));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   return 0;
+}
+
+int virgl_encoder_end_query(struct virgl_context *ctx,
+                           uint32_t handle)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_END_QUERY, 0, 1));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   return 0;
+}
+
+int virgl_encoder_get_query_result(struct virgl_context *ctx,
+                                  uint32_t handle, boolean wait)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_GET_QUERY_RESULT, 0, 2));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   virgl_encoder_write_dword(ctx->cbuf, wait ? 1 : 0);
+   return 0;
+}
+
+int virgl_encoder_render_condition(struct virgl_context *ctx,
+                                  uint32_t handle, boolean condition,
+                                  uint mode)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_RENDER_CONDITION, 0, VIRGL_RENDER_CONDITION_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   virgl_encoder_write_dword(ctx->cbuf, condition);
+   virgl_encoder_write_dword(ctx->cbuf, mode);
+   return 0;
+}
+
+int virgl_encoder_set_so_targets(struct virgl_context *ctx,
+                                unsigned num_targets,
+                                struct pipe_stream_output_target **targets,
+                                unsigned append_bitmask)
+{
+   int i;
+
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_STREAMOUT_TARGETS, 0, num_targets + 1));
+   virgl_encoder_write_dword(ctx->cbuf, append_bitmask);
+   for (i = 0; i < num_targets; i++) {
+      struct virgl_so_target *tg = virgl_so_target(targets[i]);
+      virgl_encoder_write_dword(ctx->cbuf, tg->handle);
+   }
+   return 0;
+}
+
+
+int virgl_encoder_set_sub_ctx(struct virgl_context *ctx, uint32_t sub_ctx_id)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_SUB_CTX, 0, 1));
+   virgl_encoder_write_dword(ctx->cbuf, sub_ctx_id);
+   return 0;
+}
+
+int virgl_encoder_create_sub_ctx(struct virgl_context *ctx, uint32_t sub_ctx_id)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_SUB_CTX, 0, 1));
+   virgl_encoder_write_dword(ctx->cbuf, sub_ctx_id);
+   return 0;
+}
+
+int virgl_encoder_destroy_sub_ctx(struct virgl_context *ctx, uint32_t sub_ctx_id)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_DESTROY_SUB_CTX, 0, 1));
+   virgl_encoder_write_dword(ctx->cbuf, sub_ctx_id);
+   return 0;
+}
+
+int virgl_encode_bind_shader(struct virgl_context *ctx,
+                             uint32_t handle, uint32_t type)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_BIND_SHADER, 0, 2));
+   virgl_encoder_write_dword(ctx->cbuf, handle);
+   virgl_encoder_write_dword(ctx->cbuf, type);
+   return 0;
+}
diff --git a/src/gallium/drivers/virgl/virgl_encode.h b/src/gallium/drivers/virgl/virgl_encode.h
new file mode 100644
index 00000000000..030bcd6d16e
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_encode.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_ENCODE_H
+#define VIRGL_ENCODE_H
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "virgl_winsys.h"
+
+struct tgsi_token;
+
+struct virgl_context;
+struct virgl_resource;
+struct virgl_sampler_view;
+
+struct virgl_surface {
+   struct pipe_surface base;
+   uint32_t handle;
+};
+
+static inline struct virgl_surface *virgl_surface(struct pipe_surface *surf)
+{
+   return (struct virgl_surface *)surf;
+}
+
+static inline void virgl_encoder_write_dword(struct virgl_cmd_buf *state,
+                                            uint32_t dword)
+{
+   state->buf[state->cdw++] = dword;
+}
+
+static inline void virgl_encoder_write_qword(struct virgl_cmd_buf *state,
+                                            uint64_t qword)
+{
+   memcpy(state->buf + state->cdw, &qword, sizeof(uint64_t));
+   state->cdw += 2;
+}
+
+static inline void virgl_encoder_write_block(struct virgl_cmd_buf *state,
+                                            const uint8_t *ptr, uint32_t len)
+{
+   int x;
+   memcpy(state->buf + state->cdw, ptr, len);
+   x = (len % 4);
+//   fprintf(stderr, "[%d] block %d x is %d\n", state->cdw, len, x);
+   if (x) {
+      uint8_t *mp = (uint8_t *)(state->buf + state->cdw);
+      mp += len;
+      memset(mp, 0, x);
+   }
+   state->cdw += (len + 3) / 4;
+}
+
+extern int virgl_encode_blend_state(struct virgl_context *ctx,
+                                   uint32_t handle,
+                                   const struct pipe_blend_state *blend_state);
+extern int virgl_encode_rasterizer_state(struct virgl_context *ctx,
+                                         uint32_t handle,
+                                         const struct pipe_rasterizer_state *state);
+
+extern int virgl_encode_shader_state(struct virgl_context *ctx,
+                                     uint32_t handle,
+                                     uint32_t type,
+                                     const struct pipe_stream_output_info *so_info,
+                                     const struct tgsi_token *tokens);
+
+int virgl_encode_stream_output_info(struct virgl_context *ctx,
+                                   uint32_t handle,
+                                   uint32_t type,
+                                   const struct pipe_shader_state *shader);
+
+int virgl_encoder_set_so_targets(struct virgl_context *ctx,
+                                unsigned num_targets,
+                                struct pipe_stream_output_target **targets,
+                                unsigned append_bitmask);
+
+int virgl_encoder_create_so_target(struct virgl_context *ctx,
+                                  uint32_t handle,
+                                  struct virgl_resource *res,
+                                  unsigned buffer_offset,
+                                  unsigned buffer_size);
+
+int virgl_encode_clear(struct virgl_context *ctx,
+                      unsigned buffers,
+                      const union pipe_color_union *color,
+                      double depth, unsigned stencil);
+
+int virgl_encode_bind_object(struct virgl_context *ctx,
+                            uint32_t handle, uint32_t object);
+int virgl_encode_delete_object(struct virgl_context *ctx,
+                              uint32_t handle, uint32_t object);
+
+int virgl_encoder_set_framebuffer_state(struct virgl_context *ctx,
+                                       const struct pipe_framebuffer_state *state);
+int virgl_encoder_set_viewport_states(struct virgl_context *ctx,
+                                      int start_slot,
+                                      int num_viewports,
+                                      const struct pipe_viewport_state *states);
+
+int virgl_encoder_draw_vbo(struct virgl_context *ctx,
+                          const struct pipe_draw_info *info);
+
+
+int virgl_encoder_create_surface(struct virgl_context *ctx,
+                                uint32_t handle,
+                                struct virgl_resource *res,
+                                const struct pipe_surface *templat);
+
+int virgl_encoder_flush_frontbuffer(struct virgl_context *ctx,
+                                   struct virgl_resource *res);
+
+int virgl_encoder_create_vertex_elements(struct virgl_context *ctx,
+                                        uint32_t handle,
+                                        unsigned num_elements,
+                                        const struct pipe_vertex_element *element);
+
+int virgl_encoder_set_vertex_buffers(struct virgl_context *ctx,
+                                    unsigned num_buffers,
+                                    const struct pipe_vertex_buffer *buffers);
+
+
+int virgl_encoder_inline_write(struct virgl_context *ctx,
+                              struct virgl_resource *res,
+                              unsigned level, unsigned usage,
+                              const struct pipe_box *box,
+                              const void *data, unsigned stride,
+                              unsigned layer_stride);
+int virgl_encode_sampler_state(struct virgl_context *ctx,
+                              uint32_t handle,
+                              const struct pipe_sampler_state *state);
+int virgl_encode_sampler_view(struct virgl_context *ctx,
+                             uint32_t handle,
+                             struct virgl_resource *res,
+                             const struct pipe_sampler_view *state);
+
+int virgl_encode_set_sampler_views(struct virgl_context *ctx,
+                                  uint32_t shader_type,
+                                  uint32_t start_slot,
+                                  uint32_t num_views,
+                                  struct virgl_sampler_view **views);
+
+int virgl_encode_bind_sampler_states(struct virgl_context *ctx,
+                                    uint32_t shader_type,
+                                    uint32_t start_slot,
+                                    uint32_t num_handles,
+                                    uint32_t *handles);
+
+int virgl_encoder_set_index_buffer(struct virgl_context *ctx,
+                                  const struct pipe_index_buffer *ib);
+
+uint32_t virgl_object_assign_handle(void);
+
+int virgl_encoder_write_constant_buffer(struct virgl_context *ctx,
+                                       uint32_t shader,
+                                       uint32_t index,
+                                       uint32_t size,
+                                       const void *data);
+
+int virgl_encoder_set_uniform_buffer(struct virgl_context *ctx,
+                                     uint32_t shader,
+                                     uint32_t index,
+                                     uint32_t offset,
+                                     uint32_t length,
+                                     struct virgl_resource *res);
+int virgl_encode_dsa_state(struct virgl_context *ctx,
+                          uint32_t handle,
+                          const struct pipe_depth_stencil_alpha_state *dsa_state);
+
+int virgl_encoder_set_stencil_ref(struct virgl_context *ctx,
+                                 const struct pipe_stencil_ref *ref);
+
+int virgl_encoder_set_blend_color(struct virgl_context *ctx,
+                                 const struct pipe_blend_color *color);
+
+int virgl_encoder_set_scissor_state(struct virgl_context *ctx,
+                                    unsigned start_slot,
+                                    int num_scissors,
+                                    const struct pipe_scissor_state *ss);
+
+void virgl_encoder_set_polygon_stipple(struct virgl_context *ctx,
+                                      const struct pipe_poly_stipple *ps);
+
+void virgl_encoder_set_sample_mask(struct virgl_context *ctx,
+                                  unsigned sample_mask);
+
+void virgl_encoder_set_clip_state(struct virgl_context *ctx,
+                                 const struct pipe_clip_state *clip);
+
+int virgl_encode_resource_copy_region(struct virgl_context *ctx,
+                                     struct virgl_resource *dst_res,
+                                     unsigned dst_level,
+                                     unsigned dstx, unsigned dsty, unsigned dstz,
+                                     struct virgl_resource *src_res,
+                                     unsigned src_level,
+                                     const struct pipe_box *src_box);
+
+int virgl_encode_blit(struct virgl_context *ctx,
+                     struct virgl_resource *dst_res,
+                     struct virgl_resource *src_res,
+                     const struct pipe_blit_info *blit);
+
+int virgl_encoder_create_query(struct virgl_context *ctx,
+                              uint32_t handle,
+                              uint query_type,
+                              uint query_index,
+                              struct virgl_resource *res,
+                              uint32_t offset);
+
+int virgl_encoder_begin_query(struct virgl_context *ctx,
+                             uint32_t handle);
+int virgl_encoder_end_query(struct virgl_context *ctx,
+                           uint32_t handle);
+int virgl_encoder_get_query_result(struct virgl_context *ctx,
+                                  uint32_t handle, boolean wait);
+
+int virgl_encoder_render_condition(struct virgl_context *ctx,
+                                  uint32_t handle, boolean condition,
+                                  uint mode);
+
+int virgl_encoder_set_sub_ctx(struct virgl_context *ctx, uint32_t sub_ctx_id);
+int virgl_encoder_create_sub_ctx(struct virgl_context *ctx, uint32_t sub_ctx_id);
+int virgl_encoder_destroy_sub_ctx(struct virgl_context *ctx, uint32_t sub_ctx_id);
+
+int virgl_encode_bind_shader(struct virgl_context *ctx,
+                             uint32_t handle, uint32_t type);
+#endif
diff --git a/src/gallium/drivers/virgl/virgl_hw.h b/src/gallium/drivers/virgl/virgl_hw.h
new file mode 100644
index 00000000000..e3c56db2ac6
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_hw.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_HW_H
+#define VIRGL_HW_H
+
+struct virgl_box {
+	uint32_t x, y, z;
+	uint32_t w, h, d;
+};
+
+/* formats known by the HW device - based on gallium subset */
+enum virgl_formats {
+   VIRGL_FORMAT_B8G8R8A8_UNORM          = 1,
+   VIRGL_FORMAT_B8G8R8X8_UNORM          = 2,
+   VIRGL_FORMAT_A8R8G8B8_UNORM          = 3,
+   VIRGL_FORMAT_X8R8G8B8_UNORM          = 4,
+   VIRGL_FORMAT_B5G5R5A1_UNORM          = 5,
+   VIRGL_FORMAT_B4G4R4A4_UNORM          = 6,
+   VIRGL_FORMAT_B5G6R5_UNORM            = 7,
+   VIRGL_FORMAT_L8_UNORM                = 9,    /**< ubyte luminance */
+   VIRGL_FORMAT_A8_UNORM                = 10,   /**< ubyte alpha */
+   VIRGL_FORMAT_L8A8_UNORM              = 12,   /**< ubyte alpha, luminance */
+   VIRGL_FORMAT_L16_UNORM               = 13,   /**< ushort luminance */
+
+   VIRGL_FORMAT_Z16_UNORM               = 16,
+   VIRGL_FORMAT_Z32_UNORM               = 17,
+   VIRGL_FORMAT_Z32_FLOAT               = 18,
+   VIRGL_FORMAT_Z24_UNORM_S8_UINT       = 19,
+   VIRGL_FORMAT_S8_UINT_Z24_UNORM       = 20,
+   VIRGL_FORMAT_Z24X8_UNORM             = 21,
+   VIRGL_FORMAT_S8_UINT                 = 23,   /**< ubyte stencil */
+
+   VIRGL_FORMAT_R32_FLOAT               = 28,
+   VIRGL_FORMAT_R32G32_FLOAT            = 29,
+   VIRGL_FORMAT_R32G32B32_FLOAT         = 30,
+   VIRGL_FORMAT_R32G32B32A32_FLOAT      = 31,
+
+   VIRGL_FORMAT_R16_UNORM               = 48,
+   VIRGL_FORMAT_R16G16_UNORM            = 49,
+
+   VIRGL_FORMAT_R16G16B16A16_UNORM      = 51,
+
+   VIRGL_FORMAT_R16_SNORM               = 56,
+   VIRGL_FORMAT_R16G16_SNORM            = 57,
+   VIRGL_FORMAT_R16G16B16A16_SNORM      = 59,
+
+   VIRGL_FORMAT_R8_UNORM                = 64,
+   VIRGL_FORMAT_R8G8_UNORM              = 65,
+
+   VIRGL_FORMAT_R8G8B8A8_UNORM          = 67,
+
+   VIRGL_FORMAT_R8_SNORM                = 74,
+   VIRGL_FORMAT_R8G8_SNORM              = 75,
+   VIRGL_FORMAT_R8G8B8_SNORM            = 76,
+   VIRGL_FORMAT_R8G8B8A8_SNORM          = 77,
+
+   VIRGL_FORMAT_R16_FLOAT               = 91,
+   VIRGL_FORMAT_R16G16_FLOAT            = 92,
+   VIRGL_FORMAT_R16G16B16_FLOAT         = 93,
+   VIRGL_FORMAT_R16G16B16A16_FLOAT      = 94,
+
+   VIRGL_FORMAT_L8_SRGB                 = 95,
+   VIRGL_FORMAT_L8A8_SRGB               = 96,
+   VIRGL_FORMAT_B8G8R8A8_SRGB           = 100,
+   VIRGL_FORMAT_B8G8R8X8_SRGB           = 101,
+
+   /* compressed formats */
+   VIRGL_FORMAT_DXT1_RGB                = 105,
+   VIRGL_FORMAT_DXT1_RGBA               = 106,
+   VIRGL_FORMAT_DXT3_RGBA               = 107,
+   VIRGL_FORMAT_DXT5_RGBA               = 108,
+
+   /* sRGB, compressed */
+   VIRGL_FORMAT_DXT1_SRGB               = 109,
+   VIRGL_FORMAT_DXT1_SRGBA              = 110,
+   VIRGL_FORMAT_DXT3_SRGBA              = 111,
+   VIRGL_FORMAT_DXT5_SRGBA              = 112,
+
+   /* rgtc compressed */
+   VIRGL_FORMAT_RGTC1_UNORM             = 113,
+   VIRGL_FORMAT_RGTC1_SNORM             = 114,
+   VIRGL_FORMAT_RGTC2_UNORM             = 115,
+   VIRGL_FORMAT_RGTC2_SNORM             = 116,
+
+   VIRGL_FORMAT_A8B8G8R8_UNORM          = 121,
+   VIRGL_FORMAT_B5G5R5X1_UNORM          = 122,
+   VIRGL_FORMAT_R11G11B10_FLOAT         = 124,
+   VIRGL_FORMAT_R9G9B9E5_FLOAT          = 125,
+   VIRGL_FORMAT_Z32_FLOAT_S8X24_UINT    = 126,
+
+   VIRGL_FORMAT_B10G10R10A2_UNORM       = 131,
+   VIRGL_FORMAT_R8G8B8X8_UNORM          = 134,
+   VIRGL_FORMAT_B4G4R4X4_UNORM          = 135,
+   VIRGL_FORMAT_B2G3R3_UNORM            = 139,
+
+   VIRGL_FORMAT_L16A16_UNORM            = 140,
+   VIRGL_FORMAT_A16_UNORM               = 141,
+
+   VIRGL_FORMAT_A8_SNORM                = 147,
+   VIRGL_FORMAT_L8_SNORM                = 148,
+   VIRGL_FORMAT_L8A8_SNORM              = 149,
+
+   VIRGL_FORMAT_A16_SNORM               = 151,
+   VIRGL_FORMAT_L16_SNORM               = 152,
+   VIRGL_FORMAT_L16A16_SNORM            = 153,
+
+   VIRGL_FORMAT_A16_FLOAT               = 155,
+   VIRGL_FORMAT_L16_FLOAT               = 156,
+   VIRGL_FORMAT_L16A16_FLOAT            = 157,
+
+   VIRGL_FORMAT_A32_FLOAT               = 159,
+   VIRGL_FORMAT_L32_FLOAT               = 160,
+   VIRGL_FORMAT_L32A32_FLOAT            = 161,
+
+   VIRGL_FORMAT_R8_UINT                 = 177,
+   VIRGL_FORMAT_R8G8_UINT               = 178,
+   VIRGL_FORMAT_R8G8B8_UINT             = 179,
+   VIRGL_FORMAT_R8G8B8A8_UINT           = 180,
+
+   VIRGL_FORMAT_R8_SINT                 = 181,
+   VIRGL_FORMAT_R8G8_SINT               = 182,
+   VIRGL_FORMAT_R8G8B8_SINT             = 183,
+   VIRGL_FORMAT_R8G8B8A8_SINT           = 184,
+
+   VIRGL_FORMAT_R16_UINT                = 185,
+   VIRGL_FORMAT_R16G16_UINT             = 186,
+   VIRGL_FORMAT_R16G16B16_UINT          = 187,
+   VIRGL_FORMAT_R16G16B16A16_UINT       = 188,
+
+   VIRGL_FORMAT_R16_SINT                = 189,
+   VIRGL_FORMAT_R16G16_SINT             = 190,
+   VIRGL_FORMAT_R16G16B16_SINT          = 191,
+   VIRGL_FORMAT_R16G16B16A16_SINT       = 192,
+   VIRGL_FORMAT_R32_UINT                = 193,
+   VIRGL_FORMAT_R32G32_UINT             = 194,
+   VIRGL_FORMAT_R32G32B32_UINT          = 195,
+   VIRGL_FORMAT_R32G32B32A32_UINT       = 196,
+
+   VIRGL_FORMAT_R32_SINT                = 197,
+   VIRGL_FORMAT_R32G32_SINT             = 198,
+   VIRGL_FORMAT_R32G32B32_SINT          = 199,
+   VIRGL_FORMAT_R32G32B32A32_SINT       = 200,
+
+   VIRGL_FORMAT_A8_UINT                 = 201,
+   VIRGL_FORMAT_L8_UINT                 = 203,
+   VIRGL_FORMAT_L8A8_UINT               = 204,
+
+   VIRGL_FORMAT_A8_SINT                 = 205,
+   VIRGL_FORMAT_L8_SINT                 = 207,
+   VIRGL_FORMAT_L8A8_SINT               = 208,
+
+   VIRGL_FORMAT_A16_UINT                = 209,
+   VIRGL_FORMAT_L16_UINT                = 211,
+   VIRGL_FORMAT_L16A16_UINT             = 212,
+
+   VIRGL_FORMAT_A16_SINT                = 213,
+   VIRGL_FORMAT_L16_SINT                = 215,
+   VIRGL_FORMAT_L16A16_SINT             = 216,
+
+   VIRGL_FORMAT_A32_UINT                = 217,
+   VIRGL_FORMAT_L32_UINT                = 219,
+   VIRGL_FORMAT_L32A32_UINT             = 220,
+
+   VIRGL_FORMAT_A32_SINT                = 221,
+   VIRGL_FORMAT_L32_SINT                = 223,
+   VIRGL_FORMAT_L32A32_SINT             = 224,
+
+   VIRGL_FORMAT_B10G10R10A2_UINT        = 225, 
+   VIRGL_FORMAT_R8G8B8X8_SNORM          = 229,
+
+   VIRGL_FORMAT_R8G8B8X8_SRGB           = 230,
+
+   VIRGL_FORMAT_B10G10R10X2_UNORM       = 233,
+   VIRGL_FORMAT_R16G16B16X16_UNORM      = 234,
+   VIRGL_FORMAT_R16G16B16X16_SNORM      = 235,
+   VIRGL_FORMAT_MAX,
+};
+
+#define VIRGL_BIND_DEPTH_STENCIL (1 << 0)
+#define VIRGL_BIND_RENDER_TARGET (1 << 1)
+#define VIRGL_BIND_SAMPLER_VIEW  (1 << 3)
+#define VIRGL_BIND_VERTEX_BUFFER (1 << 4)
+#define VIRGL_BIND_INDEX_BUFFER  (1 << 5)
+#define VIRGL_BIND_CONSTANT_BUFFER (1 << 6)
+#define VIRGL_BIND_DISPLAY_TARGET (1 << 7)
+#define VIRGL_BIND_STREAM_OUTPUT (1 << 11)
+#define VIRGL_BIND_CURSOR        (1 << 16)
+#define VIRGL_BIND_CUSTOM        (1 << 17)
+#define VIRGL_BIND_SCANOUT       (1 << 18)
+
+struct virgl_caps_bool_set1 {
+        unsigned indep_blend_enable:1;
+        unsigned indep_blend_func:1;
+        unsigned cube_map_array:1;
+        unsigned shader_stencil_export:1;
+        unsigned conditional_render:1;
+        unsigned start_instance:1;
+        unsigned primitive_restart:1;
+        unsigned blend_eq_sep:1;
+        unsigned instanceid:1;
+        unsigned vertex_element_instance_divisor:1;
+        unsigned seamless_cube_map:1;
+        unsigned occlusion_query:1;
+        unsigned timer_query:1;
+        unsigned streamout_pause_resume:1;
+        unsigned texture_multisample:1;
+        unsigned fragment_coord_conventions:1;
+        unsigned depth_clip_disable:1;
+        unsigned seamless_cube_map_per_texture:1;
+        unsigned ubo:1;
+        unsigned color_clamping:1; /* not in GL 3.1 core profile */
+        unsigned poly_stipple:1; /* not in GL 3.1 core profile */
+        unsigned mirror_clamp:1;
+        unsigned texture_query_lod:1;
+};
+
+/* endless expansion capabilites - current gallium has 252 formats */
+struct virgl_supported_format_mask {
+        uint32_t bitmask[16];
+};
+/* capabilities set 2 - version 1 - 32-bit and float values */
+struct virgl_caps_v1 {
+        uint32_t max_version;
+        struct virgl_supported_format_mask sampler;
+        struct virgl_supported_format_mask render;
+        struct virgl_supported_format_mask depthstencil;
+        struct virgl_supported_format_mask vertexbuffer;
+        struct virgl_caps_bool_set1 bset;
+        uint32_t glsl_level;
+        uint32_t max_texture_array_layers;
+        uint32_t max_streamout_buffers;
+        uint32_t max_dual_source_render_targets;
+        uint32_t max_render_targets;
+        uint32_t max_samples;
+        uint32_t prim_mask;
+        uint32_t max_tbo_size;
+        uint32_t max_uniform_blocks;
+        uint32_t max_viewports;
+        uint32_t max_texture_gather_components;
+};
+
+union virgl_caps {
+        uint32_t max_version;
+        struct virgl_caps_v1 v1;
+};
+
+enum virgl_errors {
+        VIRGL_ERROR_NONE,
+        VIRGL_ERROR_UNKNOWN,
+        VIRGL_ERROR_UNKNOWN_RESOURCE_FORMAT,
+};
+
+enum virgl_ctx_errors {
+        VIRGL_ERROR_CTX_NONE,
+        VIRGL_ERROR_CTX_UNKNOWN,
+        VIRGL_ERROR_CTX_ILLEGAL_SHADER,
+        VIRGL_ERROR_CTX_ILLEGAL_HANDLE,
+        VIRGL_ERROR_CTX_ILLEGAL_RESOURCE,
+        VIRGL_ERROR_CTX_ILLEGAL_SURFACE,
+        VIRGL_ERROR_CTX_ILLEGAL_VERTEX_FORMAT,
+        VIRGL_ERROR_CTX_ILLEGAL_CMD_BUFFER,
+};
+
+
+#define VIRGL_RESOURCE_Y_0_TOP (1 << 0)
+#endif
diff --git a/src/gallium/drivers/virgl/virgl_protocol.h b/src/gallium/drivers/virgl/virgl_protocol.h
new file mode 100644
index 00000000000..ca3142f5f72
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_protocol.h
@@ -0,0 +1,468 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_PROTOCOL_H
+#define VIRGL_PROTOCOL_H
+
+#define VIRGL_QUERY_STATE_NEW 0
+#define VIRGL_QUERY_STATE_DONE 1
+#define VIRGL_QUERY_STATE_WAIT_HOST 2
+
+struct virgl_host_query_state {
+   uint32_t query_state;
+   uint32_t result_size;
+   uint64_t result;
+};
+
+enum virgl_object_type {
+   VIRGL_OBJECT_NULL,
+   VIRGL_OBJECT_BLEND,
+   VIRGL_OBJECT_RASTERIZER,
+   VIRGL_OBJECT_DSA,
+   VIRGL_OBJECT_SHADER,
+   VIRGL_OBJECT_VERTEX_ELEMENTS,
+   VIRGL_OBJECT_SAMPLER_VIEW,
+   VIRGL_OBJECT_SAMPLER_STATE,
+   VIRGL_OBJECT_SURFACE,
+   VIRGL_OBJECT_QUERY,
+   VIRGL_OBJECT_STREAMOUT_TARGET,
+   VIRGL_MAX_OBJECTS,
+};
+
+/* context cmds to be encoded in the command stream */
+enum virgl_context_cmd {
+   VIRGL_CCMD_NOP = 0,
+   VIRGL_CCMD_CREATE_OBJECT = 1,
+   VIRGL_CCMD_BIND_OBJECT,
+   VIRGL_CCMD_DESTROY_OBJECT,
+   VIRGL_CCMD_SET_VIEWPORT_STATE,
+   VIRGL_CCMD_SET_FRAMEBUFFER_STATE,
+   VIRGL_CCMD_SET_VERTEX_BUFFERS,
+   VIRGL_CCMD_CLEAR,
+   VIRGL_CCMD_DRAW_VBO,
+   VIRGL_CCMD_RESOURCE_INLINE_WRITE,
+   VIRGL_CCMD_SET_SAMPLER_VIEWS,
+   VIRGL_CCMD_SET_INDEX_BUFFER,
+   VIRGL_CCMD_SET_CONSTANT_BUFFER,
+   VIRGL_CCMD_SET_STENCIL_REF,
+   VIRGL_CCMD_SET_BLEND_COLOR,
+   VIRGL_CCMD_SET_SCISSOR_STATE,
+   VIRGL_CCMD_BLIT,
+   VIRGL_CCMD_RESOURCE_COPY_REGION,
+   VIRGL_CCMD_BIND_SAMPLER_STATES,
+   VIRGL_CCMD_BEGIN_QUERY,
+   VIRGL_CCMD_END_QUERY,
+   VIRGL_CCMD_GET_QUERY_RESULT,
+   VIRGL_CCMD_SET_POLYGON_STIPPLE,
+   VIRGL_CCMD_SET_CLIP_STATE,
+   VIRGL_CCMD_SET_SAMPLE_MASK,
+   VIRGL_CCMD_SET_STREAMOUT_TARGETS,
+   VIRGL_CCMD_SET_RENDER_CONDITION,
+   VIRGL_CCMD_SET_UNIFORM_BUFFER,
+
+   VIRGL_CCMD_SET_SUB_CTX,
+   VIRGL_CCMD_CREATE_SUB_CTX,
+   VIRGL_CCMD_DESTROY_SUB_CTX,
+   VIRGL_CCMD_BIND_SHADER,
+};
+
+/*
+ 8-bit cmd headers
+ 8-bit object type
+ 16-bit length
+*/
+
+#define VIRGL_CMD0(cmd, obj, len) ((cmd) | ((obj) << 8) | ((len) << 16))
+
+/* hw specification */
+#define VIRGL_MAX_COLOR_BUFS 8
+#define VIRGL_MAX_CLIP_PLANES 8
+
+#define VIRGL_OBJ_CREATE_HEADER 0
+#define VIRGL_OBJ_CREATE_HANDLE 1
+
+#define VIRGL_OBJ_BIND_HEADER 0
+#define VIRGL_OBJ_BIND_HANDLE 1
+
+#define VIRGL_OBJ_DESTROY_HANDLE 1
+
+/* some of these defines are a specification - not used in the code */
+/* bit offsets for blend state object */
+#define VIRGL_OBJ_BLEND_SIZE (VIRGL_MAX_COLOR_BUFS + 3)
+#define VIRGL_OBJ_BLEND_HANDLE 1
+#define VIRGL_OBJ_BLEND_S0 2
+#define VIRGL_OBJ_BLEND_S0_INDEPENDENT_BLEND_ENABLE(x) ((x) & 0x1 << 0)
+#define VIRGL_OBJ_BLEND_S0_LOGICOP_ENABLE(x) (((x) & 0x1) << 1)
+#define VIRGL_OBJ_BLEND_S0_DITHER(x) (((x) & 0x1) << 2)
+#define VIRGL_OBJ_BLEND_S0_ALPHA_TO_COVERAGE(x) (((x) & 0x1) << 3)
+#define VIRGL_OBJ_BLEND_S0_ALPHA_TO_ONE(x) (((x) & 0x1) << 4)
+#define VIRGL_OBJ_BLEND_S1 3
+#define VIRGL_OBJ_BLEND_S1_LOGICOP_FUNC(x) (((x) & 0xf) << 0)
+/* repeated once per number of cbufs */
+
+#define VIRGL_OBJ_BLEND_S2(cbuf) (4 + (cbuf))
+#define VIRGL_OBJ_BLEND_S2_RT_BLEND_ENABLE(x) (((x) & 0x1) << 0)
+#define VIRGL_OBJ_BLEND_S2_RT_RGB_FUNC(x) (((x) & 0x7) << 1)
+#define VIRGL_OBJ_BLEND_S2_RT_RGB_SRC_FACTOR(x) (((x) & 0x1f) << 4)
+#define VIRGL_OBJ_BLEND_S2_RT_RGB_DST_FACTOR(x) (((x) & 0x1f) << 9)
+#define VIRGL_OBJ_BLEND_S2_RT_ALPHA_FUNC(x) (((x) & 0x7) << 14)
+#define VIRGL_OBJ_BLEND_S2_RT_ALPHA_SRC_FACTOR(x) (((x) & 0x1f) << 17)
+#define VIRGL_OBJ_BLEND_S2_RT_ALPHA_DST_FACTOR(x) (((x) & 0x1f) << 22)
+#define VIRGL_OBJ_BLEND_S2_RT_COLORMASK(x) (((x) & 0xf) << 27)
+
+/* bit offsets for DSA state */
+#define VIRGL_OBJ_DSA_SIZE 5
+#define VIRGL_OBJ_DSA_HANDLE 1
+#define VIRGL_OBJ_DSA_S0 2
+#define VIRGL_OBJ_DSA_S0_DEPTH_ENABLE(x) (((x) & 0x1) << 0)
+#define VIRGL_OBJ_DSA_S0_DEPTH_WRITEMASK(x) (((x) & 0x1) << 1)
+#define VIRGL_OBJ_DSA_S0_DEPTH_FUNC(x) (((x) & 0x7) << 2)
+#define VIRGL_OBJ_DSA_S0_ALPHA_ENABLED(x) (((x) & 0x1) << 8)
+#define VIRGL_OBJ_DSA_S0_ALPHA_FUNC(x) (((x) & 0x7) << 9)
+#define VIRGL_OBJ_DSA_S1 3
+#define VIRGL_OBJ_DSA_S2 4
+#define VIRGL_OBJ_DSA_S1_STENCIL_ENABLED(x) (((x) & 0x1) << 0)
+#define VIRGL_OBJ_DSA_S1_STENCIL_FUNC(x) (((x) & 0x7) << 1)
+#define VIRGL_OBJ_DSA_S1_STENCIL_FAIL_OP(x) (((x) & 0x7) << 4)
+#define VIRGL_OBJ_DSA_S1_STENCIL_ZPASS_OP(x) (((x) & 0x7) << 7)
+#define VIRGL_OBJ_DSA_S1_STENCIL_ZFAIL_OP(x) (((x) & 0x7) << 10)
+#define VIRGL_OBJ_DSA_S1_STENCIL_VALUEMASK(x) (((x) & 0xff) << 13)
+#define VIRGL_OBJ_DSA_S1_STENCIL_WRITEMASK(x) (((x) & 0xff) << 21)
+#define VIRGL_OBJ_DSA_ALPHA_REF 5
+
+/* offsets for rasterizer state */
+#define VIRGL_OBJ_RS_SIZE 9
+#define VIRGL_OBJ_RS_HANDLE 1
+#define VIRGL_OBJ_RS_S0 2
+#define VIRGL_OBJ_RS_S0_FLATSHADE(x) (((x) & 0x1) << 0)
+#define VIRGL_OBJ_RS_S0_DEPTH_CLIP(x) (((x) & 0x1) << 1)
+#define VIRGL_OBJ_RS_S0_CLIP_HALFZ(x) (((x) & 0x1) << 2)
+#define VIRGL_OBJ_RS_S0_RASTERIZER_DISCARD(x) (((x) & 0x1) << 3)
+#define VIRGL_OBJ_RS_S0_FLATSHADE_FIRST(x) (((x) & 0x1) << 4)
+#define VIRGL_OBJ_RS_S0_LIGHT_TWOSIZE(x) (((x) & 0x1) << 5)
+#define VIRGL_OBJ_RS_S0_SPRITE_COORD_MODE(x) (((x) & 0x1) << 6)
+#define VIRGL_OBJ_RS_S0_POINT_QUAD_RASTERIZATION(x) (((x) & 0x1) << 7)
+#define VIRGL_OBJ_RS_S0_CULL_FACE(x) (((x) & 0x3) << 8)
+#define VIRGL_OBJ_RS_S0_FILL_FRONT(x) (((x) & 0x3) << 10)
+#define VIRGL_OBJ_RS_S0_FILL_BACK(x) (((x) & 0x3) << 12)
+#define VIRGL_OBJ_RS_S0_SCISSOR(x) (((x) & 0x1) << 14)
+#define VIRGL_OBJ_RS_S0_FRONT_CCW(x) (((x) & 0x1) << 15)
+#define VIRGL_OBJ_RS_S0_CLAMP_VERTEX_COLOR(x) (((x) & 0x1) << 16)
+#define VIRGL_OBJ_RS_S0_CLAMP_FRAGMENT_COLOR(x) (((x) & 0x1) << 17)
+#define VIRGL_OBJ_RS_S0_OFFSET_LINE(x) (((x) & 0x1) << 18)
+#define VIRGL_OBJ_RS_S0_OFFSET_POINT(x) (((x) & 0x1) << 19)
+#define VIRGL_OBJ_RS_S0_OFFSET_TRI(x) (((x) & 0x1) << 20)
+#define VIRGL_OBJ_RS_S0_POLY_SMOOTH(x) (((x) & 0x1) << 21)
+#define VIRGL_OBJ_RS_S0_POLY_STIPPLE_ENABLE(x) (((x) & 0x1) << 22)
+#define VIRGL_OBJ_RS_S0_POINT_SMOOTH(x) (((x) & 0x1) << 23)
+#define VIRGL_OBJ_RS_S0_POINT_SIZE_PER_VERTEX(x) (((x) & 0x1) << 24)
+#define VIRGL_OBJ_RS_S0_MULTISAMPLE(x) (((x) & 0x1) << 25)
+#define VIRGL_OBJ_RS_S0_LINE_SMOOTH(x) (((x) & 0x1) << 26)
+#define VIRGL_OBJ_RS_S0_LINE_STIPPLE_ENABLE(x) (((x) & 0x1) << 27)
+#define VIRGL_OBJ_RS_S0_LINE_LAST_PIXEL(x) (((x) & 0x1) << 28)
+#define VIRGL_OBJ_RS_S0_HALF_PIXEL_CENTER(x) (((x) & 0x1) << 29)
+#define VIRGL_OBJ_RS_S0_BOTTOM_EDGE_RULE(x) (((x) & 0x1) << 30)
+
+#define VIRGL_OBJ_RS_POINT_SIZE 3
+#define VIRGL_OBJ_RS_SPRITE_COORD_ENABLE 4
+#define VIRGL_OBJ_RS_S3 5
+
+#define VIRGL_OBJ_RS_S3_LINE_STIPPLE_PATTERN(x) (((x) & 0xffff) << 0)
+#define VIRGL_OBJ_RS_S3_LINE_STIPPLE_FACTOR(x) (((x) & 0xff) << 16)
+#define VIRGL_OBJ_RS_S3_CLIP_PLANE_ENABLE(x) (((x) & 0xff) << 24)
+#define VIRGL_OBJ_RS_LINE_WIDTH 6
+#define VIRGL_OBJ_RS_OFFSET_UNITS 7
+#define VIRGL_OBJ_RS_OFFSET_SCALE 8
+#define VIRGL_OBJ_RS_OFFSET_CLAMP 9
+
+#define VIRGL_OBJ_CLEAR_SIZE 8
+#define VIRGL_OBJ_CLEAR_BUFFERS 1
+#define VIRGL_OBJ_CLEAR_COLOR_0 2 /* color is 4 * u32/f32/i32 */
+#define VIRGL_OBJ_CLEAR_COLOR_1 3
+#define VIRGL_OBJ_CLEAR_COLOR_2 4
+#define VIRGL_OBJ_CLEAR_COLOR_3 5
+#define VIRGL_OBJ_CLEAR_DEPTH_0 6 /* depth is a double precision float */
+#define VIRGL_OBJ_CLEAR_DEPTH_1 7
+#define VIRGL_OBJ_CLEAR_STENCIL 8
+
+/* shader object */
+#define VIRGL_OBJ_SHADER_HDR_SIZE(nso) (5 + ((nso) ? (2 * nso) + 4 : 0))
+#define VIRGL_OBJ_SHADER_HANDLE 1
+#define VIRGL_OBJ_SHADER_TYPE 2
+#define VIRGL_OBJ_SHADER_OFFSET 3
+#define VIRGL_OBJ_SHADER_OFFSET_VAL(x) (((x) & 0x7fffffff) << 0)
+/* start contains full length in VAL - also implies continuations */
+/* continuation contains offset in VAL */
+#define VIRGL_OBJ_SHADER_OFFSET_CONT (0x1 << 31)
+#define VIRGL_OBJ_SHADER_NUM_TOKENS 4
+#define VIRGL_OBJ_SHADER_SO_NUM_OUTPUTS 5
+#define VIRGL_OBJ_SHADER_SO_STRIDE(x) (6 + (x))
+#define VIRGL_OBJ_SHADER_SO_OUTPUT0(x) (10 + (x * 2))
+#define VIRGL_OBJ_SHADER_SO_OUTPUT_REGISTER_INDEX(x) (((x) & 0xff) << 0)
+#define VIRGL_OBJ_SHADER_SO_OUTPUT_START_COMPONENT(x) (((x) & 0x3) << 8)
+#define VIRGL_OBJ_SHADER_SO_OUTPUT_NUM_COMPONENTS(x) (((x) & 0x7) << 10)
+#define VIRGL_OBJ_SHADER_SO_OUTPUT_BUFFER(x) (((x) & 0x7) << 13)
+#define VIRGL_OBJ_SHADER_SO_OUTPUT_DST_OFFSET(x) (((x) & 0xffff) << 16)
+#define VIRGL_OBJ_SHADER_SO_OUTPUT0_SO(x) (11 + (x * 2))
+#define VIRGL_OBJ_SHADER_SO_OUTPUT_STREAM(x) (((x) & 0x03) << 0)
+
+/* viewport state */
+#define VIRGL_SET_VIEWPORT_STATE_SIZE(num_viewports) ((6 * num_viewports) + 1)
+#define VIRGL_SET_VIEWPORT_START_SLOT 1
+#define VIRGL_SET_VIEWPORT_STATE_SCALE_0(x) (2 + (x * 6))
+#define VIRGL_SET_VIEWPORT_STATE_SCALE_1(x) (3 + (x * 6))
+#define VIRGL_SET_VIEWPORT_STATE_SCALE_2(x) (4 + (x * 6))
+#define VIRGL_SET_VIEWPORT_STATE_TRANSLATE_0(x) (5 + (x * 6))
+#define VIRGL_SET_VIEWPORT_STATE_TRANSLATE_1(x) (6 + (x * 6))
+#define VIRGL_SET_VIEWPORT_STATE_TRANSLATE_2(x) (7 + (x * 6))
+
+/* framebuffer state */
+#define VIRGL_SET_FRAMEBUFFER_STATE_SIZE(nr_cbufs) (nr_cbufs + 2)
+#define VIRGL_SET_FRAMEBUFFER_STATE_NR_CBUFS 1
+#define VIRGL_SET_FRAMEBUFFER_STATE_NR_ZSURF_HANDLE 2
+#define VIRGL_SET_FRAMEBUFFER_STATE_CBUF_HANDLE(x) ((x) + 3)
+
+/* vertex elements object */
+#define VIRGL_OBJ_VERTEX_ELEMENTS_SIZE(num_elements) (((num_elements) * 4) + 1)
+#define VIRGL_OBJ_VERTEX_ELEMENTS_HANDLE 1
+#define VIRGL_OBJ_VERTEX_ELEMENTS_V0_SRC_OFFSET(x) (((x) * 4) + 2) /* repeated per VE */
+#define VIRGL_OBJ_VERTEX_ELEMENTS_V0_INSTANCE_DIVISOR(x) (((x) * 4) + 3)
+#define VIRGL_OBJ_VERTEX_ELEMENTS_V0_VERTEX_BUFFER_INDEX(x) (((x) * 4) + 4)
+#define VIRGL_OBJ_VERTEX_ELEMENTS_V0_SRC_FORMAT(x) (((x) * 4) + 5)
+
+/* vertex buffers */
+#define VIRGL_SET_VERTEX_BUFFERS_SIZE(num_buffers) ((num_buffers) * 3)
+#define VIRGL_SET_VERTEX_BUFFER_STRIDE(x) (((x) * 3) + 1)
+#define VIRGL_SET_VERTEX_BUFFER_OFFSET(x) (((x) * 3) + 2)
+#define VIRGL_SET_VERTEX_BUFFER_HANDLE(x) (((x) * 3) + 3)
+
+/* index buffer */
+#define VIRGL_SET_INDEX_BUFFER_SIZE(ib) (((ib) ? 2 : 0) + 1)
+#define VIRGL_SET_INDEX_BUFFER_HANDLE 1
+#define VIRGL_SET_INDEX_BUFFER_INDEX_SIZE 2 /* only if sending an IB handle */
+#define VIRGL_SET_INDEX_BUFFER_OFFSET 3     /* only if sending an IB handle */
+
+/* constant buffer */
+#define VIRGL_SET_CONSTANT_BUFFER_SHADER_TYPE 1
+#define VIRGL_SET_CONSTANT_BUFFER_INDEX 2
+#define VIRGL_SET_CONSTANT_BUFFER_DATA_START 3
+
+#define VIRGL_SET_UNIFORM_BUFFER_SIZE 5
+#define VIRGL_SET_UNIFORM_BUFFER_SHADER_TYPE 1
+#define VIRGL_SET_UNIFORM_BUFFER_INDEX 2
+#define VIRGL_SET_UNIFORM_BUFFER_OFFSET 3
+#define VIRGL_SET_UNIFORM_BUFFER_LENGTH 4
+#define VIRGL_SET_UNIFORM_BUFFER_RES_HANDLE 5
+
+/* draw VBO */
+#define VIRGL_DRAW_VBO_SIZE 12
+#define VIRGL_DRAW_VBO_START 1
+#define VIRGL_DRAW_VBO_COUNT 2
+#define VIRGL_DRAW_VBO_MODE 3
+#define VIRGL_DRAW_VBO_INDEXED 4
+#define VIRGL_DRAW_VBO_INSTANCE_COUNT 5
+#define VIRGL_DRAW_VBO_INDEX_BIAS 6
+#define VIRGL_DRAW_VBO_START_INSTANCE 7
+#define VIRGL_DRAW_VBO_PRIMITIVE_RESTART 8
+#define VIRGL_DRAW_VBO_RESTART_INDEX 9
+#define VIRGL_DRAW_VBO_MIN_INDEX 10
+#define VIRGL_DRAW_VBO_MAX_INDEX 11
+#define VIRGL_DRAW_VBO_COUNT_FROM_SO 12
+
+/* create surface */
+#define VIRGL_OBJ_SURFACE_SIZE 5
+#define VIRGL_OBJ_SURFACE_HANDLE 1
+#define VIRGL_OBJ_SURFACE_RES_HANDLE 2
+#define VIRGL_OBJ_SURFACE_FORMAT 3
+#define VIRGL_OBJ_SURFACE_BUFFER_FIRST_ELEMENT 4
+#define VIRGL_OBJ_SURFACE_BUFFER_LAST_ELEMENT 5
+#define VIRGL_OBJ_SURFACE_TEXTURE_LEVEL 4
+#define VIRGL_OBJ_SURFACE_TEXTURE_LAYERS 5
+
+/* create streamout target */
+#define VIRGL_OBJ_STREAMOUT_SIZE 4
+#define VIRGL_OBJ_STREAMOUT_HANDLE 1
+#define VIRGL_OBJ_STREAMOUT_RES_HANDLE 2
+#define VIRGL_OBJ_STREAMOUT_BUFFER_OFFSET 3
+#define VIRGL_OBJ_STREAMOUT_BUFFER_SIZE 4
+
+/* sampler state */
+#define VIRGL_OBJ_SAMPLER_STATE_SIZE 9
+#define VIRGL_OBJ_SAMPLER_STATE_HANDLE 1
+#define VIRGL_OBJ_SAMPLER_STATE_S0 2
+#define VIRGL_OBJ_SAMPLE_STATE_S0_WRAP_S(x) (((x) & 0x7) << 0)
+#define VIRGL_OBJ_SAMPLE_STATE_S0_WRAP_T(x) (((x) & 0x7) << 3)
+#define VIRGL_OBJ_SAMPLE_STATE_S0_WRAP_R(x) (((x) & 0x7) << 6)
+#define VIRGL_OBJ_SAMPLE_STATE_S0_MIN_IMG_FILTER(x) (((x) & 0x3) << 9)
+#define VIRGL_OBJ_SAMPLE_STATE_S0_MIN_MIP_FILTER(x) (((x) & 0x3) << 11)
+#define VIRGL_OBJ_SAMPLE_STATE_S0_MAG_IMG_FILTER(x) (((x) & 0x3) << 13)
+#define VIRGL_OBJ_SAMPLE_STATE_S0_COMPARE_MODE(x) (((x) & 0x1) << 15)
+#define VIRGL_OBJ_SAMPLE_STATE_S0_COMPARE_FUNC(x) (((x) & 0x7) << 16)
+
+#define VIRGL_OBJ_SAMPLER_STATE_LOD_BIAS 3
+#define VIRGL_OBJ_SAMPLER_STATE_MIN_LOD 4
+#define VIRGL_OBJ_SAMPLER_STATE_MAX_LOD 5
+#define VIRGL_OBJ_SAMPLER_STATE_BORDER_COLOR(x) ((x) + 6) /* 6 - 9 */
+
+
+/* sampler view */
+#define VIRGL_OBJ_SAMPLER_VIEW_SIZE 6
+#define VIRGL_OBJ_SAMPLER_VIEW_HANDLE 1
+#define VIRGL_OBJ_SAMPLER_VIEW_RES_HANDLE 2
+#define VIRGL_OBJ_SAMPLER_VIEW_FORMAT 3
+#define VIRGL_OBJ_SAMPLER_VIEW_BUFFER_FIRST_ELEMENT 4
+#define VIRGL_OBJ_SAMPLER_VIEW_BUFFER_LAST_ELEMENT 5
+#define VIRGL_OBJ_SAMPLER_VIEW_TEXTURE_LAYER 4
+#define VIRGL_OBJ_SAMPLER_VIEW_TEXTURE_LEVEL 5
+#define VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE 6
+#define VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE_R(x) (((x) & 0x7) << 0)
+#define VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE_G(x) (((x) & 0x7) << 3)
+#define VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE_B(x) (((x) & 0x7) << 6)
+#define VIRGL_OBJ_SAMPLER_VIEW_SWIZZLE_A(x) (((x) & 0x7) << 9)
+
+/* set sampler views */
+#define VIRGL_SET_SAMPLER_VIEWS_SIZE(num_views) ((num_views) + 2)
+#define VIRGL_SET_SAMPLER_VIEWS_SHADER_TYPE 1
+#define VIRGL_SET_SAMPLER_VIEWS_START_SLOT 2
+#define VIRGL_SET_SAMPLER_VIEWS_V0_HANDLE 3
+
+/* bind sampler states */
+#define VIRGL_BIND_SAMPLER_STATES(num_states) ((num_states) + 2)
+#define VIRGL_BIND_SAMPLER_STATES_SHADER_TYPE 1
+#define VIRGL_BIND_SAMPLER_STATES_START_SLOT 2
+#define VIRGL_BIND_SAMPLER_STATES_S0_HANDLE 3
+
+/* set stencil reference */
+#define VIRGL_SET_STENCIL_REF_SIZE 1
+#define VIRGL_SET_STENCIL_REF 1
+#define VIRGL_STENCIL_REF_VAL(f, s) ((f & 0xff) | (((s & 0xff) << 8)))
+
+/* set blend color */
+#define VIRGL_SET_BLEND_COLOR_SIZE 4
+#define VIRGL_SET_BLEND_COLOR(x) ((x) + 1)
+
+/* set scissor state */
+#define VIRGL_SET_SCISSOR_STATE_SIZE(x) (1 + 2 * x)
+#define VIRGL_SET_SCISSOR_START_SLOT 1
+#define VIRGL_SET_SCISSOR_MINX_MINY(x) (2 + (x * 2))
+#define VIRGL_SET_SCISSOR_MAXX_MAXY(x) (3 + (x * 2))
+
+/* resource copy region */
+#define VIRGL_CMD_RESOURCE_COPY_REGION_SIZE 13
+#define VIRGL_CMD_RCR_DST_RES_HANDLE 1
+#define VIRGL_CMD_RCR_DST_LEVEL 2
+#define VIRGL_CMD_RCR_DST_X 3
+#define VIRGL_CMD_RCR_DST_Y 4
+#define VIRGL_CMD_RCR_DST_Z 5
+#define VIRGL_CMD_RCR_SRC_RES_HANDLE 6
+#define VIRGL_CMD_RCR_SRC_LEVEL 7
+#define VIRGL_CMD_RCR_SRC_X 8
+#define VIRGL_CMD_RCR_SRC_Y 9
+#define VIRGL_CMD_RCR_SRC_Z 10
+#define VIRGL_CMD_RCR_SRC_W 11
+#define VIRGL_CMD_RCR_SRC_H 12
+#define VIRGL_CMD_RCR_SRC_D 13
+
+/* blit */
+#define VIRGL_CMD_BLIT_SIZE 21
+#define VIRGL_CMD_BLIT_S0 1
+#define VIRGL_CMD_BLIT_S0_MASK(x) (((x) & 0xff) << 0)
+#define VIRGL_CMD_BLIT_S0_FILTER(x) (((x) & 0x3) << 8)
+#define VIRGL_CMD_BLIT_S0_SCISSOR_ENABLE(x) (((x) & 0x1) << 10)
+#define VIRGL_CMD_BLIT_SCISSOR_MINX_MINY 2
+#define VIRGL_CMD_BLIT_SCISSOR_MAXX_MAXY 3
+#define VIRGL_CMD_BLIT_DST_RES_HANDLE 4
+#define VIRGL_CMD_BLIT_DST_LEVEL 5
+#define VIRGL_CMD_BLIT_DST_FORMAT 6
+#define VIRGL_CMD_BLIT_DST_X 7
+#define VIRGL_CMD_BLIT_DST_Y 8
+#define VIRGL_CMD_BLIT_DST_Z 9
+#define VIRGL_CMD_BLIT_DST_W 10
+#define VIRGL_CMD_BLIT_DST_H 11
+#define VIRGL_CMD_BLIT_DST_D 12
+#define VIRGL_CMD_BLIT_SRC_RES_HANDLE 13
+#define VIRGL_CMD_BLIT_SRC_LEVEL 14
+#define VIRGL_CMD_BLIT_SRC_FORMAT 15
+#define VIRGL_CMD_BLIT_SRC_X 16
+#define VIRGL_CMD_BLIT_SRC_Y 17
+#define VIRGL_CMD_BLIT_SRC_Z 18
+#define VIRGL_CMD_BLIT_SRC_W 19
+#define VIRGL_CMD_BLIT_SRC_H 20
+#define VIRGL_CMD_BLIT_SRC_D 21
+
+/* query object */
+#define VIRGL_OBJ_QUERY_SIZE 4
+#define VIRGL_OBJ_QUERY_HANDLE 1
+#define VIRGL_OBJ_QUERY_TYPE_INDEX 2
+#define VIRGL_OBJ_QUERY_TYPE(x) (x & 0xffff)
+#define VIRGL_OBJ_QUERY_INDEX(x) ((x & 0xffff) << 16)
+#define VIRGL_OBJ_QUERY_OFFSET 3
+#define VIRGL_OBJ_QUERY_RES_HANDLE 4
+
+#define VIRGL_QUERY_BEGIN_HANDLE 1
+
+#define VIRGL_QUERY_END_HANDLE 1
+
+#define VIRGL_QUERY_RESULT_HANDLE 1
+#define VIRGL_QUERY_RESULT_WAIT 2
+
+/* render condition */
+#define VIRGL_RENDER_CONDITION_SIZE 3
+#define VIRGL_RENDER_CONDITION_HANDLE 1
+#define VIRGL_RENDER_CONDITION_CONDITION 2
+#define VIRGL_RENDER_CONDITION_MODE 3
+
+/* resource inline write */
+#define VIRGL_RESOURCE_IW_RES_HANDLE 1
+#define VIRGL_RESOURCE_IW_LEVEL 2
+#define VIRGL_RESOURCE_IW_USAGE 3
+#define VIRGL_RESOURCE_IW_STRIDE 4
+#define VIRGL_RESOURCE_IW_LAYER_STRIDE 5
+#define VIRGL_RESOURCE_IW_X 6
+#define VIRGL_RESOURCE_IW_Y 7
+#define VIRGL_RESOURCE_IW_Z 8
+#define VIRGL_RESOURCE_IW_W 9
+#define VIRGL_RESOURCE_IW_H 10
+#define VIRGL_RESOURCE_IW_D 11
+#define VIRGL_RESOURCE_IW_DATA_START 12
+
+/* set streamout targets */
+#define VIRGL_SET_STREAMOUT_TARGETS_APPEND_BITMASK 1
+#define VIRGL_SET_STREAMOUT_TARGETS_H0 2
+
+/* set sample mask */
+#define VIRGL_SET_SAMPLE_MASK_SIZE 1
+#define VIRGL_SET_SAMPLE_MASK_MASK 1
+
+/* set clip state */
+#define VIRGL_SET_CLIP_STATE_SIZE 32
+#define VIRGL_SET_CLIP_STATE_C0 1
+
+/* polygon stipple */
+#define VIRGL_POLYGON_STIPPLE_SIZE 32
+#define VIRGL_POLYGON_STIPPLE_P0 1
+
+#define VIRGL_BIND_SHADER_SIZE 2
+#define VIRGL_BIND_SHADER_HANDLE 1
+#define VIRGL_BIND_SHADER_TYPE 2
+
+#endif
diff --git a/src/gallium/drivers/virgl/virgl_public.h b/src/gallium/drivers/virgl/virgl_public.h
new file mode 100644
index 00000000000..a3ea560df7b
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_public.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_PUBLIC_H
+#define VIRGL_PUBLIC_H
+
+struct pipe_screen;
+struct virgl_winsys;
+
+struct pipe_screen *
+virgl_create_screen(struct virgl_winsys *vws);
+#endif
diff --git a/src/gallium/drivers/virgl/virgl_query.c b/src/gallium/drivers/virgl/virgl_query.c
new file mode 100644
index 00000000000..b0200556342
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_query.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+#include "virgl_context.h"
+#include "virgl_encode.h"
+#include "virgl_protocol.h"
+#include "virgl_resource.h"
+
+struct virgl_query {
+   uint32_t handle;
+   struct virgl_resource *buf;
+
+   unsigned index;
+   unsigned type;
+   unsigned result_size;
+   unsigned result_gotten_sent;
+};
+
+static inline struct virgl_query *virgl_query(struct pipe_query *q)
+{
+   return (struct virgl_query *)q;
+}
+
+static void virgl_render_condition(struct pipe_context *ctx,
+                                  struct pipe_query *q,
+                                  boolean condition,
+                                  uint mode)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_query *query = virgl_query(q);
+   uint32_t handle = 0;
+   if (q)
+      handle = query->handle;
+   virgl_encoder_render_condition(vctx, handle, condition, mode);
+}
+
+static struct pipe_query *virgl_create_query(struct pipe_context *ctx,
+                                            unsigned query_type, unsigned index)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_query *query;
+   uint32_t handle;
+
+   query = CALLOC_STRUCT(virgl_query);
+   if (!query)
+      return NULL;
+
+   query->buf = (struct virgl_resource *)pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM,
+                                                           PIPE_USAGE_STAGING, sizeof(struct virgl_host_query_state));
+   if (!query->buf) {
+      FREE(query);
+      return NULL;
+   }
+
+   handle = virgl_object_assign_handle();
+   query->type = query_type;
+   query->index = index;
+   query->handle = handle;
+   query->buf->clean = FALSE;
+   virgl_encoder_create_query(vctx, handle, query_type, index, query->buf, 0);
+
+   return (struct pipe_query *)query;
+}
+
+static void virgl_destroy_query(struct pipe_context *ctx,
+                        struct pipe_query *q)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_query *query = virgl_query(q);
+
+   virgl_encode_delete_object(vctx, query->handle, VIRGL_OBJECT_QUERY);
+
+   pipe_resource_reference((struct pipe_resource **)&query->buf, NULL);
+   FREE(query);
+}
+
+static boolean virgl_begin_query(struct pipe_context *ctx,
+                             struct pipe_query *q)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_query *query = virgl_query(q);
+
+   query->buf->clean = FALSE;
+   virgl_encoder_begin_query(vctx, query->handle);
+   return true;
+}
+
+static void virgl_end_query(struct pipe_context *ctx,
+                           struct pipe_query *q)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_query *query = virgl_query(q);
+   struct pipe_box box;
+
+   uint32_t qs = VIRGL_QUERY_STATE_WAIT_HOST;
+   u_box_1d(0, 4, &box);
+   virgl_transfer_inline_write(ctx, &query->buf->u.b, 0, PIPE_TRANSFER_WRITE,
+                              &box, &qs, 0, 0);
+
+
+   virgl_encoder_end_query(vctx, query->handle);
+}
+
+static boolean virgl_get_query_result(struct pipe_context *ctx,
+                                     struct pipe_query *q,
+                                     boolean wait,
+                                     union pipe_query_result *result)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_query *query = virgl_query(q);
+   struct pipe_transfer *transfer;
+   struct virgl_host_query_state *host_state;
+
+   /* ask host for query result */
+   if (!query->result_gotten_sent) {
+      query->result_gotten_sent = 1;
+      virgl_encoder_get_query_result(vctx, query->handle, 0);
+      ctx->flush(ctx, NULL, 0);
+   }
+
+   /* do we  have to flush? */
+   /* now we can do the transfer to get the result back? */
+ remap:
+   host_state = pipe_buffer_map(ctx, &query->buf->u.b,
+                               PIPE_TRANSFER_READ, &transfer);
+
+   if (host_state->query_state != VIRGL_QUERY_STATE_DONE) {
+      pipe_buffer_unmap(ctx, transfer);
+      if (wait)
+         goto remap;
+      else
+         return FALSE;
+   }
+
+   if (query->type == PIPE_QUERY_TIMESTAMP || query->type == PIPE_QUERY_TIME_ELAPSED)
+      result->u64 = host_state->result;
+   else
+      result->u64 = (uint32_t)host_state->result;
+
+   pipe_buffer_unmap(ctx, transfer);
+   query->result_gotten_sent = 0;
+   return TRUE;
+}
+
+void virgl_init_query_functions(struct virgl_context *vctx)
+{
+   vctx->base.render_condition = virgl_render_condition;
+   vctx->base.create_query = virgl_create_query;
+   vctx->base.destroy_query = virgl_destroy_query;
+   vctx->base.begin_query = virgl_begin_query;
+   vctx->base.end_query = virgl_end_query;
+   vctx->base.get_query_result = virgl_get_query_result;
+}
diff --git a/src/gallium/drivers/virgl/virgl_resource.c b/src/gallium/drivers/virgl/virgl_resource.c
new file mode 100644
index 00000000000..0b2fc4ec497
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_resource.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "util/u_inlines.h"
+#include "virgl_context.h"
+#include "virgl_resource.h"
+#include "virgl_screen.h"
+
+bool virgl_res_needs_flush_wait(struct virgl_context *vctx,
+                                struct virgl_resource *res,
+                                unsigned usage)
+{
+   struct virgl_screen *vs = virgl_screen(vctx->base.screen);
+
+   if ((!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) && vs->vws->res_is_referenced(vs->vws, vctx->cbuf, res->hw_res)) {
+      return true;
+   }
+   return false;
+}
+
+bool virgl_res_needs_readback(struct virgl_context *vctx,
+                              struct virgl_resource *res,
+                              unsigned usage)
+{
+   bool readback = true;
+   if (res->clean)
+      readback = false;
+   else if (usage & PIPE_TRANSFER_DISCARD_RANGE)
+      readback = false;
+   else if ((usage & (PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT)) ==
+            (PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT))
+      readback = false;
+   return readback;
+}
+
+static struct pipe_resource *virgl_resource_create(struct pipe_screen *screen,
+                                                   const struct pipe_resource *templ)
+{
+    struct virgl_screen *vs = virgl_screen(screen);
+    if (templ->target == PIPE_BUFFER)
+        return virgl_buffer_create(vs, templ);
+    else
+        return virgl_texture_create(vs, templ);
+}
+
+static struct pipe_resource *virgl_resource_from_handle(struct pipe_screen *screen,
+                                                        const struct pipe_resource *templ,
+                                                        struct winsys_handle *whandle)
+{
+    struct virgl_screen *vs = virgl_screen(screen);
+    if (templ->target == PIPE_BUFFER)
+        return NULL;
+    else
+        return virgl_texture_from_handle(vs, templ, whandle);
+}
+
+void virgl_init_screen_resource_functions(struct pipe_screen *screen)
+{
+    screen->resource_create = virgl_resource_create;
+    screen->resource_from_handle = virgl_resource_from_handle;
+    screen->resource_get_handle = u_resource_get_handle_vtbl;
+    screen->resource_destroy = u_resource_destroy_vtbl;
+}
+
+void virgl_init_context_resource_functions(struct pipe_context *ctx)
+{
+    ctx->transfer_map = u_transfer_map_vtbl;
+    ctx->transfer_flush_region = u_transfer_flush_region_vtbl;
+    ctx->transfer_unmap = u_transfer_unmap_vtbl;
+    ctx->transfer_inline_write = u_transfer_inline_write_vtbl;
+}
diff --git a/src/gallium/drivers/virgl/virgl_resource.h b/src/gallium/drivers/virgl/virgl_resource.h
new file mode 100644
index 00000000000..bab9bcb9b4e
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_resource.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef VIRGL_RESOURCE_H
+#define VIRGL_RESOURCE_H
+
+#include "util/u_resource.h"
+#include "util/u_range.h"
+#include "util/list.h"
+#include "util/u_transfer.h"
+
+#include "virgl_hw.h"
+#define VR_MAX_TEXTURE_2D_LEVELS 15
+
+struct winsys_handle;
+struct virgl_screen;
+struct virgl_context;
+
+struct virgl_resource {
+   struct u_resource u;
+   struct virgl_hw_res *hw_res;
+   boolean clean;
+};
+
+struct virgl_buffer {
+   struct virgl_resource base;
+
+   struct list_head flush_list;
+   boolean on_list;
+
+   /* The buffer range which is initialized (with a write transfer,
+    * streamout, DMA, or as a random access target). The rest of
+    * the buffer is considered invalid and can be mapped unsynchronized.
+    *
+    * This allows unsychronized mapping of a buffer range which hasn't
+    * been used yet. It's for applications which forget to use
+    * the unsynchronized map flag and expect the driver to figure it out.
+    */
+   struct util_range valid_buffer_range;
+};
+
+struct virgl_texture {
+   struct virgl_resource base;
+
+   unsigned long level_offset[VR_MAX_TEXTURE_2D_LEVELS];
+   unsigned stride[VR_MAX_TEXTURE_2D_LEVELS];
+};
+
+struct virgl_transfer {
+   struct pipe_transfer base;
+   uint32_t offset;
+   struct virgl_resource *resolve_tmp;
+};
+
+void virgl_resource_destroy(struct pipe_screen *screen,
+                            struct pipe_resource *resource);
+
+void virgl_init_screen_resource_functions(struct pipe_screen *screen);
+
+void virgl_init_context_resource_functions(struct pipe_context *ctx);
+
+struct pipe_resource *virgl_texture_create(struct virgl_screen *vs,
+                                           const struct pipe_resource *templ);
+
+struct pipe_resource *virgl_texture_from_handle(struct virgl_screen *vs,
+                                                const struct pipe_resource *templ,
+                                                struct winsys_handle *whandle);
+
+static inline struct virgl_resource *virgl_resource(struct pipe_resource *r)
+{
+   return (struct virgl_resource *)r;
+}
+
+static inline struct virgl_buffer *virgl_buffer(struct pipe_resource *r)
+{
+   return (struct virgl_buffer *)r;
+}
+
+static inline struct virgl_texture *virgl_texture(struct pipe_resource *r)
+{
+   return (struct virgl_texture *)r;
+}
+
+static inline struct virgl_transfer *virgl_transfer(struct pipe_transfer *trans)
+{
+   return (struct virgl_transfer *)trans;
+}
+
+struct pipe_resource *virgl_buffer_create(struct virgl_screen *vs,
+                                          const struct pipe_resource *templ);
+
+static inline unsigned pipe_to_virgl_bind(unsigned pbind)
+{
+   unsigned outbind = 0;
+   if (pbind & PIPE_BIND_DEPTH_STENCIL)
+      outbind |= VIRGL_BIND_DEPTH_STENCIL;
+   if (pbind & PIPE_BIND_RENDER_TARGET)
+      outbind |= VIRGL_BIND_RENDER_TARGET;
+   if (pbind & PIPE_BIND_SAMPLER_VIEW)
+      outbind |= VIRGL_BIND_SAMPLER_VIEW;
+   if (pbind & PIPE_BIND_VERTEX_BUFFER)
+      outbind |= VIRGL_BIND_VERTEX_BUFFER;
+   if (pbind & PIPE_BIND_INDEX_BUFFER)
+      outbind |= VIRGL_BIND_INDEX_BUFFER;
+   if (pbind & PIPE_BIND_CONSTANT_BUFFER)
+      outbind |= VIRGL_BIND_CONSTANT_BUFFER;
+   if (pbind & PIPE_BIND_DISPLAY_TARGET)
+      outbind |= VIRGL_BIND_DISPLAY_TARGET;
+   if (pbind & PIPE_BIND_STREAM_OUTPUT)
+      outbind |= VIRGL_BIND_STREAM_OUTPUT;
+   if (pbind & PIPE_BIND_CURSOR)
+      outbind |= VIRGL_BIND_CURSOR;
+   if (pbind & PIPE_BIND_CUSTOM)
+      outbind |= VIRGL_BIND_CUSTOM;
+   if (pbind & PIPE_BIND_SCANOUT)
+      outbind |= VIRGL_BIND_SCANOUT;
+   return outbind;
+}
+
+bool virgl_res_needs_flush_wait(struct virgl_context *vctx,
+                                struct virgl_resource *res,
+                                unsigned usage);
+bool virgl_res_needs_readback(struct virgl_context *vctx,
+                              struct virgl_resource *res,
+                              unsigned usage);
+#endif
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
new file mode 100644
index 00000000000..cca379d47ab
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -0,0 +1,553 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_format_s3tc.h"
+#include "util/u_video.h"
+#include "os/os_time.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_screen.h"
+#include "draw/draw_context.h"
+
+#include "tgsi/tgsi_exec.h"
+
+#include "virgl_screen.h"
+#include "virgl_resource.h"
+#include "virgl_public.h"
+#include "virgl_context.h"
+
+#define SP_MAX_TEXTURE_2D_LEVELS 15  /* 16K x 16K */
+#define SP_MAX_TEXTURE_3D_LEVELS 9   /* 512 x 512 x 512 */
+#define SP_MAX_TEXTURE_CUBE_LEVELS 13  /* 4K x 4K */
+
+static const char *
+virgl_get_vendor(struct pipe_screen *screen)
+{
+   return "Red Hat";
+}
+
+
+static const char *
+virgl_get_name(struct pipe_screen *screen)
+{
+   return "virgl";
+}
+
+static int
+virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
+{
+   struct virgl_screen *vscreen = virgl_screen(screen);
+   switch (param) {
+   case PIPE_CAP_NPOT_TEXTURES:
+      return 1;
+   case PIPE_CAP_TWO_SIDED_STENCIL:
+      return 1;
+   case PIPE_CAP_SM3:
+      return 1;
+   case PIPE_CAP_ANISOTROPIC_FILTER:
+      return 1;
+   case PIPE_CAP_POINT_SPRITE:
+      return 1;
+   case PIPE_CAP_MAX_RENDER_TARGETS:
+      return vscreen->caps.caps.v1.max_render_targets;
+   case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+      return vscreen->caps.caps.v1.max_dual_source_render_targets;
+   case PIPE_CAP_OCCLUSION_QUERY:
+      return vscreen->caps.caps.v1.bset.occlusion_query;
+   case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+      return vscreen->caps.caps.v1.bset.mirror_clamp;
+   case PIPE_CAP_TEXTURE_SHADOW_MAP:
+      return 1;
+   case PIPE_CAP_TEXTURE_SWIZZLE:
+      return 1;
+   case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+      return SP_MAX_TEXTURE_2D_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+      return SP_MAX_TEXTURE_3D_LEVELS;
+   case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+      return SP_MAX_TEXTURE_CUBE_LEVELS;
+   case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+      return 1;
+   case PIPE_CAP_INDEP_BLEND_ENABLE:
+      return vscreen->caps.caps.v1.bset.indep_blend_enable;
+   case PIPE_CAP_INDEP_BLEND_FUNC:
+      return vscreen->caps.caps.v1.bset.indep_blend_func;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+   case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+      return vscreen->caps.caps.v1.bset.fragment_coord_conventions;
+   case PIPE_CAP_DEPTH_CLIP_DISABLE:
+      return vscreen->caps.caps.v1.bset.depth_clip_disable;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+      return vscreen->caps.caps.v1.max_streamout_buffers;
+   case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+   case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+      return 16*4;
+   case PIPE_CAP_PRIMITIVE_RESTART:
+      return vscreen->caps.caps.v1.bset.primitive_restart;
+   case PIPE_CAP_SHADER_STENCIL_EXPORT:
+      return vscreen->caps.caps.v1.bset.shader_stencil_export;
+   case PIPE_CAP_TGSI_INSTANCEID:
+   case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
+      return 1;
+   case PIPE_CAP_SEAMLESS_CUBE_MAP:
+      return vscreen->caps.caps.v1.bset.seamless_cube_map;
+   case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
+      return vscreen->caps.caps.v1.bset.seamless_cube_map_per_texture;
+   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+      return vscreen->caps.caps.v1.max_texture_array_layers;
+   case PIPE_CAP_MIN_TEXEL_OFFSET:
+   case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+      return -8;
+   case PIPE_CAP_MAX_TEXEL_OFFSET:
+   case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
+      return 7;
+   case PIPE_CAP_CONDITIONAL_RENDER:
+      return vscreen->caps.caps.v1.bset.conditional_render;
+   case PIPE_CAP_TEXTURE_BARRIER:
+      return 0;
+   case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
+      return 1;
+   case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
+   case PIPE_CAP_VERTEX_COLOR_CLAMPED:
+      return vscreen->caps.caps.v1.bset.color_clamping;
+   case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
+      return 1;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL:
+      return vscreen->caps.caps.v1.glsl_level;
+   case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+      return 0;
+   case PIPE_CAP_COMPUTE:
+      return 0;
+   case PIPE_CAP_USER_VERTEX_BUFFERS:
+      return 0;
+   case PIPE_CAP_USER_INDEX_BUFFERS:
+   case PIPE_CAP_USER_CONSTANT_BUFFERS:
+      return 1;
+   case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
+      return 16;
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+      return vscreen->caps.caps.v1.bset.streamout_pause_resume;
+   case PIPE_CAP_START_INSTANCE:
+      return vscreen->caps.caps.v1.bset.start_instance;
+   case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
+   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
+      return 0;
+   case PIPE_CAP_QUERY_TIMESTAMP:
+      return 1;
+   case PIPE_CAP_QUERY_TIME_ELAPSED:
+      return 0;
+   case PIPE_CAP_TGSI_TEXCOORD:
+      return 0;
+   case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
+      return VIRGL_MAP_BUFFER_ALIGNMENT;
+   case PIPE_CAP_TEXTURE_BUFFER_OBJECTS:
+      return vscreen->caps.caps.v1.max_tbo_size > 0;
+   case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
+      return 0;
+   case PIPE_CAP_CUBE_MAP_ARRAY:
+      return vscreen->caps.caps.v1.bset.cube_map_array;
+   case PIPE_CAP_TEXTURE_MULTISAMPLE:
+      return vscreen->caps.caps.v1.bset.texture_multisample;
+   case PIPE_CAP_MAX_VIEWPORTS:
+      return vscreen->caps.caps.v1.max_viewports;
+   case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
+      return vscreen->caps.caps.v1.max_tbo_size;
+   case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+   case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+   case PIPE_CAP_ENDIANNESS:
+      return 0;
+   case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+      return 1;
+   case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
+      return 0;
+   case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES:
+      return 1024;
+   case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
+      return 16384;
+   case PIPE_CAP_TEXTURE_QUERY_LOD:
+      return vscreen->caps.caps.v1.bset.texture_query_lod;
+   case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+      return vscreen->caps.caps.v1.max_texture_gather_components;
+   case PIPE_CAP_TEXTURE_GATHER_SM5:
+   case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
+   case PIPE_CAP_SAMPLE_SHADING:
+   case PIPE_CAP_FAKE_SW_MSAA:
+   case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
+   case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
+   case PIPE_CAP_MAX_VERTEX_STREAMS:
+   case PIPE_CAP_DRAW_INDIRECT:
+   case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+   case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+   case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
+   case PIPE_CAP_SAMPLER_VIEW_TARGET:
+   case PIPE_CAP_CLIP_HALFZ:
+   case PIPE_CAP_VERTEXID_NOBASE:
+   case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+   case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+   case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+   case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+   case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_DEPTH_BOUNDS_TEST:
+   case PIPE_CAP_TGSI_TXQS:
+   case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+   case PIPE_CAP_SHAREABLE_SHADERS:
+      return 0;
+   case PIPE_CAP_VENDOR_ID:
+      return 0x1af4;
+   case PIPE_CAP_DEVICE_ID:
+      return 0x1010;
+   case PIPE_CAP_ACCELERATED:
+      return 1;
+   case PIPE_CAP_UMA:
+   case PIPE_CAP_VIDEO_MEMORY:
+      return 0;
+   }
+   /* should only get here on unhandled cases */
+   debug_printf("Unexpected PIPE_CAP %d query\n", param);
+   return 0;
+}
+
+static int
+virgl_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_shader_cap param)
+{
+   struct virgl_screen *vscreen = virgl_screen(screen);
+   switch(shader)
+   {
+   case PIPE_SHADER_FRAGMENT:
+   case PIPE_SHADER_VERTEX:
+   case PIPE_SHADER_GEOMETRY:
+      switch (param) {
+      case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
+      case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
+      case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
+      case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
+         return INT_MAX;
+      case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
+      case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
+      case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+         return 1;
+      case PIPE_SHADER_CAP_MAX_INPUTS:
+         if (vscreen->caps.caps.v1.glsl_level < 150)
+            return 16;
+         return shader == PIPE_SHADER_VERTEX ? 16 : 32;
+      case PIPE_SHADER_CAP_MAX_OUTPUTS:
+         return 128;
+     // case PIPE_SHADER_CAP_MAX_CONSTS:
+     //    return 4096;
+      case PIPE_SHADER_CAP_MAX_TEMPS:
+         return 256;
+      case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+         return vscreen->caps.caps.v1.max_uniform_blocks;
+    //  case PIPE_SHADER_CAP_MAX_ADDRS:
+     //    return 1;
+      case PIPE_SHADER_CAP_MAX_PREDS:
+         return 0;
+      case PIPE_SHADER_CAP_SUBROUTINES:
+         return 1;
+      case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
+            return 16;
+      case PIPE_SHADER_CAP_INTEGERS:
+         return vscreen->caps.caps.v1.glsl_level >= 130;
+      case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH:
+         return 32;
+      case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
+         return 4096 * sizeof(float[4]);
+      default:
+         return 0;
+      }
+   default:
+      return 0;
+   }
+}
+
+static float
+virgl_get_paramf(struct pipe_screen *screen, enum pipe_capf param)
+{
+   switch (param) {
+   case PIPE_CAPF_MAX_LINE_WIDTH:
+      /* fall-through */
+   case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+      return 255.0; /* arbitrary */
+   case PIPE_CAPF_MAX_POINT_WIDTH:
+      /* fall-through */
+   case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+      return 255.0; /* arbitrary */
+   case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+      return 16.0;
+   case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+      return 16.0; /* arbitrary */
+   case PIPE_CAPF_GUARD_BAND_LEFT:
+   case PIPE_CAPF_GUARD_BAND_TOP:
+   case PIPE_CAPF_GUARD_BAND_RIGHT:
+   case PIPE_CAPF_GUARD_BAND_BOTTOM:
+      return 0.0;
+   }
+   /* should only get here on unhandled cases */
+   debug_printf("Unexpected PIPE_CAPF %d query\n", param);
+   return 0.0;
+}
+
+static boolean
+virgl_is_vertex_format_supported(struct pipe_screen *screen,
+                                 enum pipe_format format)
+{
+   struct virgl_screen *vscreen = virgl_screen(screen);
+   const struct util_format_description *format_desc;
+   int i;
+
+   format_desc = util_format_description(format);
+   if (!format_desc)
+      return FALSE;
+
+   if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      int vformat = VIRGL_FORMAT_R11G11B10_FLOAT;
+      int big = vformat / 32;
+      int small = vformat % 32;
+      if (!(vscreen->caps.caps.v1.vertexbuffer.bitmask[big] & (1 << small)))
+         return FALSE;
+      return TRUE;
+   }
+
+   /* Find the first non-VOID channel. */
+   for (i = 0; i < 4; i++) {
+      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+         break;
+      }
+   }
+
+   if (i == 4)
+      return FALSE;
+
+   if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+      return FALSE;
+
+   if (format_desc->channel[i].type == UTIL_FORMAT_TYPE_FIXED)
+      return FALSE;
+   return TRUE;
+}
+
+/**
+ * Query format support for creating a texture, drawing surface, etc.
+ * \param format  the format to test
+ * \param type  one of PIPE_TEXTURE, PIPE_SURFACE
+ */
+static boolean
+virgl_is_format_supported( struct pipe_screen *screen,
+                                 enum pipe_format format,
+                                 enum pipe_texture_target target,
+                                 unsigned sample_count,
+                                 unsigned bind)
+{
+   struct virgl_screen *vscreen = virgl_screen(screen);
+   const struct util_format_description *format_desc;
+   int i;
+
+   assert(target == PIPE_BUFFER ||
+          target == PIPE_TEXTURE_1D ||
+          target == PIPE_TEXTURE_1D_ARRAY ||
+          target == PIPE_TEXTURE_2D ||
+          target == PIPE_TEXTURE_2D_ARRAY ||
+          target == PIPE_TEXTURE_RECT ||
+          target == PIPE_TEXTURE_3D ||
+          target == PIPE_TEXTURE_CUBE ||
+          target == PIPE_TEXTURE_CUBE_ARRAY);
+
+   format_desc = util_format_description(format);
+   if (!format_desc)
+      return FALSE;
+
+   if (util_format_is_intensity(format))
+      return FALSE;
+
+   if (sample_count > 1) {
+      if (!vscreen->caps.caps.v1.bset.texture_multisample)
+         return FALSE;
+      if (sample_count > vscreen->caps.caps.v1.max_samples)
+         return FALSE;
+   }
+
+   if (bind & PIPE_BIND_VERTEX_BUFFER) {
+      return virgl_is_vertex_format_supported(screen, format);
+   }
+
+   if (bind & PIPE_BIND_RENDER_TARGET) {
+      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS)
+         return FALSE;
+
+      /*
+       * Although possible, it is unnatural to render into compressed or YUV
+       * surfaces. So disable these here to avoid going into weird paths
+       * inside the state trackers.
+       */
+      if (format_desc->block.width != 1 ||
+          format_desc->block.height != 1)
+         return FALSE;
+
+      {
+         int big = format / 32;
+         int small = format % 32;
+         if (!(vscreen->caps.caps.v1.render.bitmask[big] & (1 << small)))
+            return FALSE;
+      }
+   }
+
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
+      if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+         return FALSE;
+   }
+
+   /*
+    * All other operations (sampling, transfer, etc).
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+      if (util_format_s3tc_enabled)
+         goto out_lookup;
+      return FALSE;
+   }
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
+      goto out_lookup;
+   }
+
+   if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
+      goto out_lookup;
+   } else if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
+      goto out_lookup;
+   }
+
+   /* Find the first non-VOID channel. */
+   for (i = 0; i < 4; i++) {
+      if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
+         break;
+      }
+   }
+
+   if (i == 4)
+      return FALSE;
+
+   /* no L4A4 */
+   if (format_desc->nr_channels < 4 && format_desc->channel[i].size == 4)
+      return FALSE;
+
+ out_lookup:
+   {
+      int big = format / 32;
+      int small = format % 32;
+      if (!(vscreen->caps.caps.v1.sampler.bitmask[big] & (1 << small)))
+         return FALSE;
+   }
+   /*
+    * Everything else should be supported by u_format.
+    */
+   return TRUE;
+}
+
+static void virgl_flush_frontbuffer(struct pipe_screen *screen,
+                                      struct pipe_resource *res,
+                                      unsigned level, unsigned layer,
+                                    void *winsys_drawable_handle, struct pipe_box *sub_box)
+{
+   struct virgl_screen *vscreen = virgl_screen(screen);
+   struct virgl_winsys *vws = vscreen->vws;
+   struct virgl_resource *vres = virgl_resource(res);
+
+   if (vws->flush_frontbuffer)
+      vws->flush_frontbuffer(vws, vres->hw_res, level, layer, winsys_drawable_handle,
+                             sub_box);
+}
+
+static void virgl_fence_reference(struct pipe_screen *screen,
+                                  struct pipe_fence_handle **ptr,
+                                  struct pipe_fence_handle *fence)
+{
+   struct virgl_screen *vscreen = virgl_screen(screen);
+   struct virgl_winsys *vws = vscreen->vws;
+
+   vws->fence_reference(vws, ptr, fence);
+}
+
+static boolean virgl_fence_finish(struct pipe_screen *screen,
+                                  struct pipe_fence_handle *fence,
+                                  uint64_t timeout)
+{
+   struct virgl_screen *vscreen = virgl_screen(screen);
+   struct virgl_winsys *vws = vscreen->vws;
+
+   return vws->fence_wait(vws, fence, timeout);
+}
+
+static uint64_t
+virgl_get_timestamp(struct pipe_screen *_screen)
+{
+   return os_time_get_nano();
+}
+
+static void
+virgl_destroy_screen(struct pipe_screen *screen)
+{
+   struct virgl_screen *vscreen = virgl_screen(screen);
+   struct virgl_winsys *vws = vscreen->vws;
+
+   if (vws)
+      vws->destroy(vws);
+   FREE(vscreen);
+}
+
+struct pipe_screen *
+virgl_create_screen(struct virgl_winsys *vws)
+{
+   struct virgl_screen *screen = CALLOC_STRUCT(virgl_screen);
+
+   if (!screen)
+      return NULL;
+
+   screen->vws = vws;
+   screen->base.get_name = virgl_get_name;
+   screen->base.get_vendor = virgl_get_vendor;
+   screen->base.get_param = virgl_get_param;
+   screen->base.get_shader_param = virgl_get_shader_param;
+   screen->base.get_paramf = virgl_get_paramf;
+   screen->base.is_format_supported = virgl_is_format_supported;
+   screen->base.destroy = virgl_destroy_screen;
+   screen->base.context_create = virgl_context_create;
+   screen->base.flush_frontbuffer = virgl_flush_frontbuffer;
+   screen->base.get_timestamp = virgl_get_timestamp;
+   screen->base.fence_reference = virgl_fence_reference;
+   //screen->base.fence_signalled = virgl_fence_signalled;
+   screen->base.fence_finish = virgl_fence_finish;
+
+   virgl_init_screen_resource_functions(&screen->base);
+
+   vws->get_caps(vws, &screen->caps);
+
+
+   util_format_s3tc_init();
+   return &screen->base;
+}
diff --git a/src/gallium/drivers/virgl/virgl_screen.h b/src/gallium/drivers/virgl/virgl_screen.h
new file mode 100644
index 00000000000..52e72ca4958
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_screen.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_H
+#define VIRGL_H
+
+#include "pipe/p_screen.h"
+#include "virgl_winsys.h"
+
+struct virgl_screen {
+   struct pipe_screen base;
+   struct virgl_winsys *vws;
+
+   struct virgl_drm_caps caps;
+
+   uint32_t sub_ctx_id;
+};
+
+
+static inline struct virgl_screen *
+virgl_screen(struct pipe_screen *pipe)
+{
+   return (struct virgl_screen *)pipe;
+}
+
+#define VIRGL_MAP_BUFFER_ALIGNMENT 64
+
+#endif
diff --git a/src/gallium/drivers/virgl/virgl_streamout.c b/src/gallium/drivers/virgl/virgl_streamout.c
new file mode 100644
index 00000000000..b6a65fff29e
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_streamout.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "virgl_context.h"
+#include "virgl_encode.h"
+#include "virgl_protocol.h"
+#include "virgl_resource.h"
+
+static struct pipe_stream_output_target *virgl_create_so_target(
+   struct pipe_context *ctx,
+   struct pipe_resource *buffer,
+   unsigned buffer_offset,
+   unsigned buffer_size)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_resource *res = virgl_resource(buffer);
+   struct virgl_so_target *t = CALLOC_STRUCT(virgl_so_target);
+   uint32_t handle;
+
+   if (!t)
+      return NULL;
+   handle = virgl_object_assign_handle();
+
+   t->base.reference.count = 1;
+   t->base.context = ctx;
+   pipe_resource_reference(&t->base.buffer, buffer);
+   t->base.buffer_offset = buffer_offset;
+   t->base.buffer_size = buffer_size;
+   t->handle = handle;
+   res->clean = FALSE;
+   virgl_encoder_create_so_target(vctx, handle, res, buffer_offset, buffer_size);
+   return &t->base;
+}
+
+static void virgl_destroy_so_target(struct pipe_context *ctx,
+                                   struct pipe_stream_output_target *target)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_so_target *t = virgl_so_target(target);
+
+   pipe_resource_reference(&t->base.buffer, NULL);
+   virgl_encode_delete_object(vctx, t->handle, VIRGL_OBJECT_STREAMOUT_TARGET);
+   FREE(t);
+}
+
+static void virgl_set_so_targets(struct pipe_context *ctx,
+                                unsigned num_targets,
+                                struct pipe_stream_output_target **targets,
+                                const unsigned *offset)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   int i;
+   for (i = 0; i < num_targets; i++) {
+      pipe_resource_reference(&vctx->so_targets[i].base.buffer, targets[i]->buffer);
+   }
+   for (i = num_targets; i < vctx->num_so_targets; i++)
+      pipe_resource_reference(&vctx->so_targets[i].base.buffer, NULL);
+   vctx->num_so_targets = num_targets;
+   virgl_encoder_set_so_targets(vctx, num_targets, targets, 0);//append_bitmask);
+}
+
+void virgl_init_so_functions(struct virgl_context *vctx)
+{
+   vctx->base.create_stream_output_target = virgl_create_so_target;
+   vctx->base.stream_output_target_destroy = virgl_destroy_so_target;
+   vctx->base.set_stream_output_targets = virgl_set_so_targets;
+}
diff --git a/src/gallium/drivers/virgl/virgl_texture.c b/src/gallium/drivers/virgl/virgl_texture.c
new file mode 100644
index 00000000000..31189626144
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_texture.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
+#include "virgl_context.h"
+#include "virgl_resource.h"
+#include "virgl_screen.h"
+
+static void virgl_copy_region_with_blit(struct pipe_context *pipe,
+                                        struct pipe_resource *dst,
+                                        unsigned dst_level,
+                                        unsigned dstx, unsigned dsty, unsigned dstz,
+                                        struct pipe_resource *src,
+                                        unsigned src_level,
+                                        const struct pipe_box *src_box)
+{
+   struct pipe_blit_info blit;
+
+   memset(&blit, 0, sizeof(blit));
+   blit.src.resource = src;
+   blit.src.format = src->format;
+   blit.src.level = src_level;
+   blit.src.box = *src_box;
+   blit.dst.resource = dst;
+   blit.dst.format = dst->format;
+   blit.dst.level = dst_level;
+   blit.dst.box.x = dstx;
+   blit.dst.box.y = dsty;
+   blit.dst.box.z = dstz;
+   blit.dst.box.width = src_box->width;
+   blit.dst.box.height = src_box->height;
+   blit.dst.box.depth = src_box->depth;
+   blit.mask = util_format_get_mask(src->format) &
+      util_format_get_mask(dst->format);
+   blit.filter = PIPE_TEX_FILTER_NEAREST;
+
+   if (blit.mask) {
+      pipe->blit(pipe, &blit);
+   }
+}
+static void virgl_init_temp_resource_from_box(struct pipe_resource *res,
+                                              struct pipe_resource *orig,
+                                              const struct pipe_box *box,
+                                              unsigned level, unsigned flags)
+{
+   memset(res, 0, sizeof(*res));
+   res->format = orig->format;
+   res->width0 = box->width;
+   res->height0 = box->height;
+   res->depth0 = 1;
+   res->array_size = 1;
+   res->usage = PIPE_USAGE_STAGING;
+   res->flags = flags;
+
+   /* We must set the correct texture target and dimensions for a 3D box. */
+   if (box->depth > 1 && util_max_layer(orig, level) > 0)
+      res->target = orig->target;
+   else
+      res->target = PIPE_TEXTURE_2D;
+
+   switch (res->target) {
+   case PIPE_TEXTURE_1D_ARRAY:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      res->array_size = box->depth;
+      break;
+   case PIPE_TEXTURE_3D:
+      res->depth0 = box->depth;
+      break;
+   default:
+      break;
+   }
+}
+
+static unsigned
+vrend_get_tex_image_offset(const struct virgl_texture *res,
+                           unsigned level, unsigned layer)
+{
+   const struct pipe_resource *pres = &res->base.u.b;
+   const unsigned hgt = u_minify(pres->height0, level);
+   const unsigned nblocksy = util_format_get_nblocksy(pres->format, hgt);
+   unsigned offset = res->level_offset[level];
+
+   if (pres->target == PIPE_TEXTURE_CUBE ||
+       pres->target == PIPE_TEXTURE_CUBE_ARRAY ||
+       pres->target == PIPE_TEXTURE_3D ||
+       pres->target == PIPE_TEXTURE_2D_ARRAY) {
+      offset += layer * nblocksy * res->stride[level];
+   }
+   else if (pres->target == PIPE_TEXTURE_1D_ARRAY) {
+      offset += layer * res->stride[level];
+   }
+   else {
+      assert(layer == 0);
+   }
+
+   return offset;
+}
+
+static void *virgl_texture_transfer_map(struct pipe_context *ctx,
+                                        struct pipe_resource *resource,
+                                        unsigned level,
+                                        unsigned usage,
+                                        const struct pipe_box *box,
+                                        struct pipe_transfer **transfer)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *vs = virgl_screen(ctx->screen);
+   struct virgl_texture *vtex = virgl_texture(resource);
+   enum pipe_format format = resource->format;
+   struct virgl_transfer *trans;
+   void *ptr;
+   boolean readback = TRUE;
+   uint32_t offset;
+   struct virgl_hw_res *hw_res;
+   const unsigned h = u_minify(vtex->base.u.b.height0, level);
+   const unsigned nblocksy = util_format_get_nblocksy(format, h);
+   bool is_depth = util_format_has_depth(util_format_description(resource->format));
+   uint32_t l_stride;
+   bool doflushwait;
+
+   doflushwait = virgl_res_needs_flush_wait(vctx, &vtex->base, usage);
+   if (doflushwait)
+      ctx->flush(ctx, NULL, 0);
+
+   trans = util_slab_alloc(&vctx->texture_transfer_pool);
+   if (trans == NULL)
+      return NULL;
+
+   trans->base.resource = resource;
+   trans->base.level = level;
+   trans->base.usage = usage;
+   trans->base.box = *box;
+   trans->base.stride = vtex->stride[level];
+   trans->base.layer_stride = trans->base.stride * nblocksy;
+
+   if (resource->target != PIPE_TEXTURE_3D &&
+       resource->target != PIPE_TEXTURE_CUBE &&
+       resource->target != PIPE_TEXTURE_1D_ARRAY &&
+       resource->target != PIPE_TEXTURE_2D_ARRAY &&
+       resource->target != PIPE_TEXTURE_CUBE_ARRAY)
+      l_stride = 0;
+   else
+      l_stride = trans->base.layer_stride;
+
+   if (is_depth && resource->nr_samples > 1) {
+      struct pipe_resource tmp_resource;
+      virgl_init_temp_resource_from_box(&tmp_resource, resource, box,
+                                        level, 0);
+
+      trans->resolve_tmp = (struct virgl_resource *)ctx->screen->resource_create(ctx->screen, &tmp_resource);
+
+      virgl_copy_region_with_blit(ctx, &trans->resolve_tmp->u.b, 0, 0, 0, 0, resource, level, box);
+      ctx->flush(ctx, NULL, 0);
+      /* we want to do a resolve blit into the temporary */
+      hw_res = trans->resolve_tmp->hw_res;
+      offset = 0;
+   } else {
+      offset = vrend_get_tex_image_offset(vtex, level, box->z);
+
+      offset += box->y / util_format_get_blockheight(format) * trans->base.stride +
+      box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
+      hw_res = vtex->base.hw_res;
+      trans->resolve_tmp = NULL;
+   }
+
+   readback = virgl_res_needs_readback(vctx, &vtex->base, usage);
+   if (readback)
+      vs->vws->transfer_get(vs->vws, hw_res, box, trans->base.stride, l_stride, offset, level);
+
+   if (doflushwait || readback)
+      vs->vws->resource_wait(vs->vws, vtex->base.hw_res);
+
+   ptr = vs->vws->resource_map(vs->vws, hw_res);
+   if (!ptr) {
+      return NULL;
+   }
+
+   trans->offset = offset;
+   *transfer = &trans->base;
+
+   return ptr + trans->offset;
+}
+
+static void virgl_texture_transfer_unmap(struct pipe_context *ctx,
+                                         struct pipe_transfer *transfer)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_transfer *trans = virgl_transfer(transfer);
+   struct virgl_texture *vtex = virgl_texture(transfer->resource);
+   uint32_t l_stride;
+
+   if (transfer->resource->target != PIPE_TEXTURE_3D &&
+       transfer->resource->target != PIPE_TEXTURE_CUBE &&
+       transfer->resource->target != PIPE_TEXTURE_1D_ARRAY &&
+       transfer->resource->target != PIPE_TEXTURE_2D_ARRAY &&
+       transfer->resource->target != PIPE_TEXTURE_CUBE_ARRAY)
+      l_stride = 0;
+   else
+      l_stride = trans->base.layer_stride;
+
+   if (trans->base.usage & PIPE_TRANSFER_WRITE) {
+      if (!(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) {
+         struct virgl_screen *vs = virgl_screen(ctx->screen);
+         vtex->base.clean = FALSE;
+         vctx->num_transfers++;
+         vs->vws->transfer_put(vs->vws, vtex->base.hw_res,
+                               &transfer->box, trans->base.stride, l_stride, trans->offset, transfer->level);
+
+      }
+   }
+
+   if (trans->resolve_tmp)
+      pipe_resource_reference((struct pipe_resource **)&trans->resolve_tmp, NULL);
+
+   util_slab_free(&vctx->texture_transfer_pool, trans);
+}
+
+
+static boolean
+vrend_resource_layout(struct virgl_texture *res,
+                      uint32_t *total_size)
+{
+   struct pipe_resource *pt = &res->base.u.b;
+   unsigned level;
+   unsigned width = pt->width0;
+   unsigned height = pt->height0;
+   unsigned depth = pt->depth0;
+   unsigned buffer_size = 0;
+
+   for (level = 0; level <= pt->last_level; level++) {
+      unsigned slices;
+
+      if (pt->target == PIPE_TEXTURE_CUBE)
+         slices = 6;
+      else if (pt->target == PIPE_TEXTURE_3D)
+         slices = depth;
+      else
+         slices = pt->array_size;
+
+      res->stride[level] = util_format_get_stride(pt->format, width);
+      res->level_offset[level] = buffer_size;
+
+      buffer_size += (util_format_get_nblocksy(pt->format, height) *
+                      slices * res->stride[level]);
+
+      width = u_minify(width, 1);
+      height = u_minify(height, 1);
+      depth = u_minify(depth, 1);
+   }
+
+   if (pt->nr_samples <= 1)
+      *total_size = buffer_size;
+   else /* don't create guest backing store for MSAA */
+      *total_size = 0;
+   return TRUE;
+}
+
+static boolean virgl_texture_get_handle(struct pipe_screen *screen,
+                                         struct pipe_resource *ptex,
+                                         struct winsys_handle *whandle)
+{
+   struct virgl_screen *vs = virgl_screen(screen);
+   struct virgl_texture *vtex = virgl_texture(ptex);
+
+   return vs->vws->resource_get_handle(vs->vws, vtex->base.hw_res, vtex->stride[0], whandle);
+}
+
+static void virgl_texture_destroy(struct pipe_screen *screen,
+                                  struct pipe_resource *res)
+{
+   struct virgl_screen *vs = virgl_screen(screen);
+   struct virgl_texture *vtex = virgl_texture(res);
+   vs->vws->resource_unref(vs->vws, vtex->base.hw_res);
+   FREE(vtex);
+}
+
+static const struct u_resource_vtbl virgl_texture_vtbl =
+{
+   virgl_texture_get_handle,            /* get_handle */
+   virgl_texture_destroy,               /* resource_destroy */
+   virgl_texture_transfer_map,          /* transfer_map */
+   NULL,                                /* transfer_flush_region */
+   virgl_texture_transfer_unmap,        /* transfer_unmap */
+   NULL                                 /* transfer_inline_write */
+};
+
+struct pipe_resource *
+virgl_texture_from_handle(struct virgl_screen *vs,
+                          const struct pipe_resource *template,
+                          struct winsys_handle *whandle)
+{
+   struct virgl_texture *tex;
+   uint32_t size;
+
+   tex = CALLOC_STRUCT(virgl_texture);
+   tex->base.u.b = *template;
+   tex->base.u.b.screen = &vs->base;
+   pipe_reference_init(&tex->base.u.b.reference, 1);
+   tex->base.u.vtbl = &virgl_texture_vtbl;
+   vrend_resource_layout(tex, &size);
+
+   tex->base.hw_res = vs->vws->resource_create_from_handle(vs->vws, whandle);
+   return &tex->base.u.b;
+}
+
+struct pipe_resource *virgl_texture_create(struct virgl_screen *vs,
+                                           const struct pipe_resource *template)
+{
+   struct virgl_texture *tex;
+   uint32_t size;
+   unsigned vbind;
+
+   tex = CALLOC_STRUCT(virgl_texture);
+   tex->base.clean = TRUE;
+   tex->base.u.b = *template;
+   tex->base.u.b.screen = &vs->base;
+   pipe_reference_init(&tex->base.u.b.reference, 1);
+   tex->base.u.vtbl = &virgl_texture_vtbl;
+   vrend_resource_layout(tex, &size);
+
+   vbind = pipe_to_virgl_bind(template->bind);
+   tex->base.hw_res = vs->vws->resource_create(vs->vws, template->target, template->format, vbind, template->width0, template->height0, template->depth0, template->array_size, template->last_level, template->nr_samples, size);
+   if (!tex->base.hw_res) {
+      FREE(tex);
+      return NULL;
+   }
+   return &tex->base.u.b;
+}
diff --git a/src/gallium/drivers/virgl/virgl_tgsi.c b/src/gallium/drivers/virgl/virgl_tgsi.c
new file mode 100644
index 00000000000..641b0b3e3b5
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_tgsi.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/* the virgl hw tgsi vs what the current gallium want will diverge over time.
+   so add a transform stage to remove things we don't want to send unless
+   the receiver supports it.
+*/
+#include "tgsi/tgsi_transform.h"
+#include "virgl_context.h"
+struct virgl_transform_context {
+   struct tgsi_transform_context base;
+};
+
+/* for now just strip out the new properties the remote doesn't understand
+   yet */
+static void
+virgl_tgsi_transform_property(struct tgsi_transform_context *ctx,
+                              struct tgsi_full_property *prop)
+{
+   switch (prop->Property.PropertyName) {
+   case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
+   case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
+      break;
+   default:
+      ctx->emit_property(ctx, prop);
+      break;
+   }
+}
+
+struct tgsi_token *virgl_tgsi_transform(const struct tgsi_token *tokens_in)
+{
+
+   struct virgl_transform_context transform;
+   const uint newLen = tgsi_num_tokens(tokens_in);
+   struct tgsi_token *new_tokens;
+
+   new_tokens = tgsi_alloc_tokens(newLen);
+   if (!new_tokens)
+      return NULL;
+
+   memset(&transform, 0, sizeof(transform));
+   transform.base.transform_property = virgl_tgsi_transform_property;
+   tgsi_transform_shader(tokens_in, new_tokens, newLen, &transform.base);
+
+   return new_tokens;
+}
diff --git a/src/gallium/drivers/virgl/virgl_winsys.h b/src/gallium/drivers/virgl/virgl_winsys.h
new file mode 100644
index 00000000000..ea21f2b6712
--- /dev/null
+++ b/src/gallium/drivers/virgl/virgl_winsys.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_WINSYS_H
+#define VIRGL_WINSYS_H
+
+#include "pipe/p_defines.h"
+#include "virgl_hw.h"
+
+struct pipe_box;
+struct pipe_fence_handle;
+struct winsys_handle;
+struct virgl_hw_res;
+
+#define VIRGL_MAX_CMDBUF_DWORDS (16*1024)
+
+struct virgl_drm_caps {
+   union virgl_caps caps;
+};
+
+struct virgl_cmd_buf {
+   unsigned cdw;
+   uint32_t *buf;
+};
+
+struct virgl_winsys {
+   unsigned pci_id;
+
+   void (*destroy)(struct virgl_winsys *vws);
+
+   int (*transfer_put)(struct virgl_winsys *vws,
+                       struct virgl_hw_res *res,
+                       const struct pipe_box *box,
+                       uint32_t stride, uint32_t layer_stride,
+                       uint32_t buf_offset, uint32_t level);
+
+   int (*transfer_get)(struct virgl_winsys *vws,
+                       struct virgl_hw_res *res,
+                       const struct pipe_box *box,
+                       uint32_t stride, uint32_t layer_stride,
+                       uint32_t buf_offset, uint32_t level);
+
+   struct virgl_hw_res *(*resource_create)(struct virgl_winsys *vws,
+                               enum pipe_texture_target target,
+                               uint32_t format, uint32_t bind,
+                               uint32_t width, uint32_t height,
+                               uint32_t depth, uint32_t array_size,
+                               uint32_t last_level, uint32_t nr_samples,
+                               uint32_t size);
+
+   void (*resource_unref)(struct virgl_winsys *vws, struct virgl_hw_res *res);
+
+   void *(*resource_map)(struct virgl_winsys *vws, struct virgl_hw_res *res);
+   void (*resource_wait)(struct virgl_winsys *vws, struct virgl_hw_res *res);
+
+   struct virgl_hw_res *(*resource_create_from_handle)(struct virgl_winsys *vws,
+                                                       struct winsys_handle *whandle);
+   boolean (*resource_get_handle)(struct virgl_winsys *vws,
+                                  struct virgl_hw_res *res,
+                                  uint32_t stride,
+                                  struct winsys_handle *whandle);
+
+   struct virgl_cmd_buf *(*cmd_buf_create)(struct virgl_winsys *ws);
+   void (*cmd_buf_destroy)(struct virgl_cmd_buf *buf);
+
+   void (*emit_res)(struct virgl_winsys *vws, struct virgl_cmd_buf *buf, struct virgl_hw_res *res, boolean write_buffer);
+   int (*submit_cmd)(struct virgl_winsys *vws, struct virgl_cmd_buf *buf);
+
+   boolean (*res_is_referenced)(struct virgl_winsys *vws,
+                                struct virgl_cmd_buf *buf,
+                                struct virgl_hw_res *res);
+
+   int (*get_caps)(struct virgl_winsys *vws, struct virgl_drm_caps *caps);
+
+   /* fence */
+   struct pipe_fence_handle *(*cs_create_fence)(struct virgl_winsys *vws);
+   bool (*fence_wait)(struct virgl_winsys *vws,
+                      struct pipe_fence_handle *fence,
+                      uint64_t timeout);
+
+   void (*fence_reference)(struct virgl_winsys *vws,
+                           struct pipe_fence_handle **dst,
+                           struct pipe_fence_handle *src);
+
+   /* for sw paths */
+   void (*flush_frontbuffer)(struct virgl_winsys *vws,
+                             struct virgl_hw_res *res,
+                             unsigned level, unsigned layer,
+                             void *winsys_drawable_handle,
+                             struct pipe_box *sub_box);
+};
+
+
+#endif
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 1ad545aae09..b15c8809c1d 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -633,6 +633,7 @@ enum pipe_cap
    PIPE_CAP_TGSI_TXQS,
    PIPE_CAP_FORCE_PERSAMPLE_INTERP,
    PIPE_CAP_SHAREABLE_SHADERS,
+   PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h
index b2646d44c74..5f0690e5ae6 100644
--- a/src/gallium/include/pipe/p_format.h
+++ b/src/gallium/include/pipe/p_format.h
@@ -444,7 +444,8 @@ enum pipe_video_chroma_format
    PIPE_VIDEO_CHROMA_FORMAT_400,
    PIPE_VIDEO_CHROMA_FORMAT_420,
    PIPE_VIDEO_CHROMA_FORMAT_422,
-   PIPE_VIDEO_CHROMA_FORMAT_444
+   PIPE_VIDEO_CHROMA_FORMAT_444,
+   PIPE_VIDEO_CHROMA_FORMAT_NONE
 };
 
 #ifdef __cplusplus
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index a22fb938dbb..f868d71db23 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -169,6 +169,10 @@ struct pipe_screen {
    struct pipe_resource * (*resource_create)(struct pipe_screen *,
 					     const struct pipe_resource *templat);
 
+   struct pipe_resource * (*resource_create_front)(struct pipe_screen *,
+                                                   const struct pipe_resource *templat,
+                                                   const void *map_front_private);
+
    /**
     * Create a texture from a winsys_handle. The handle is often created in
     * another process by first creating a pipe texture and then calling
diff --git a/src/gallium/include/pipe/p_video_state.h b/src/gallium/include/pipe/p_video_state.h
index 7d13151e643..d353be60759 100644
--- a/src/gallium/include/pipe/p_video_state.h
+++ b/src/gallium/include/pipe/p_video_state.h
@@ -479,6 +479,8 @@ struct pipe_h265_picture_desc
    uint8_t RefPicSetStCurrBefore[8];
    uint8_t RefPicSetStCurrAfter[8];
    uint8_t RefPicSetLtCurr[8];
+   uint8_t RefPicList[2][15];
+   bool UseRefPicList;
 };
 
 #ifdef __cplusplus
diff --git a/src/gallium/include/state_tracker/drisw_api.h b/src/gallium/include/state_tracker/drisw_api.h
index 328440cf5ff..cd5a27e2482 100644
--- a/src/gallium/include/state_tracker/drisw_api.h
+++ b/src/gallium/include/state_tracker/drisw_api.h
@@ -11,6 +11,9 @@ struct dri_drawable;
  */
 struct drisw_loader_funcs
 {
+   void (*get_image) (struct dri_drawable *dri_drawable,
+                      int x, int y, unsigned width, unsigned height, unsigned stride,
+                      void *data);
    void (*put_image) (struct dri_drawable *dri_drawable,
                       void *data, unsigned width, unsigned height);
    void (*put_image2) (struct dri_drawable *dri_drawable,
diff --git a/src/gallium/include/state_tracker/sw_winsys.h b/src/gallium/include/state_tracker/sw_winsys.h
index a3479eb0bc3..0b792cd0ce4 100644
--- a/src/gallium/include/state_tracker/sw_winsys.h
+++ b/src/gallium/include/state_tracker/sw_winsys.h
@@ -90,6 +90,7 @@ struct sw_winsys
                             enum pipe_format format,
                             unsigned width, unsigned height,
                             unsigned alignment,
+                            const void *front_private,
                             unsigned *stride );
 
    /**
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index d74b50df45a..3b37f0802b0 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -247,8 +247,12 @@ namespace {
       // attribute.  This attribute will prevent Clang from creating
       // illegal uses of barrier() (e.g. Moving barrier() inside a conditional
       // that is no executed by all threads) during its optimizaton passes.
+#if HAVE_LLVM >= 0x0308
+      c.getCodeGenOpts().LinkBitcodeFiles.emplace_back(llvm::Linker::Flags::None,
+                                                       libclc_path);
+#else
       c.getCodeGenOpts().LinkBitcodeFile = libclc_path;
-
+#endif
       optimization_level = c.getCodeGenOpts().OptimizationLevel;
 
       // Compile the code
diff --git a/src/gallium/state_trackers/dri/drisw.c b/src/gallium/state_trackers/dri/drisw.c
index 4ec6992643a..753c59d696a 100644
--- a/src/gallium/state_trackers/dri/drisw.c
+++ b/src/gallium/state_trackers/dri/drisw.c
@@ -95,6 +95,21 @@ get_image(__DRIdrawable *dPriv, int x, int y, int width, int height, void *data)
                     data, dPriv->loaderPrivate);
 }
 
+static inline void
+get_image2(__DRIdrawable *dPriv, int x, int y, int width, int height, int stride, void *data)
+{
+   __DRIscreen *sPriv = dPriv->driScreenPriv;
+   const __DRIswrastLoaderExtension *loader = sPriv->swrast_loader;
+
+   /* getImage2 support is only in version 3 or newer */
+   if (loader->base.version < 3)
+      return;
+
+   loader->getImage2(dPriv,
+                     x, y, width, height, stride,
+                     data, dPriv->loaderPrivate);
+}
+
 static void
 drisw_update_drawable_info(struct dri_drawable *drawable)
 {
@@ -105,6 +120,18 @@ drisw_update_drawable_info(struct dri_drawable *drawable)
 }
 
 static void
+drisw_get_image(struct dri_drawable *drawable,
+                int x, int y, unsigned width, unsigned height, unsigned stride,
+                void *data)
+{
+   __DRIdrawable *dPriv = drawable->dPriv;
+   int draw_x, draw_y, draw_w, draw_h;
+
+   get_drawable_info(dPriv, &draw_x, &draw_y, &draw_w, &draw_h);
+   get_image2(dPriv, x, y, draw_w, draw_h, stride, data);
+}
+
+static void
 drisw_put_image(struct dri_drawable *drawable,
                 void *data, unsigned width, unsigned height)
 {
@@ -236,6 +263,7 @@ drisw_allocate_textures(struct dri_context *stctx,
                         unsigned count)
 {
    struct dri_screen *screen = dri_screen(drawable->sPriv);
+   const __DRIswrastLoaderExtension *loader = drawable->dPriv->driScreenPriv->swrast_loader;
    struct pipe_resource templ;
    unsigned width, height;
    boolean resized;
@@ -281,8 +309,14 @@ drisw_allocate_textures(struct dri_context *stctx,
       templ.format = format;
       templ.bind = bind;
 
-      drawable->textures[statts[i]] =
-         screen->base.screen->resource_create(screen->base.screen, &templ);
+      if (statts[i] == ST_ATTACHMENT_FRONT_LEFT &&
+          screen->base.screen->resource_create_front &&
+          loader->base.version >= 3) {
+         drawable->textures[statts[i]] =
+            screen->base.screen->resource_create_front(screen->base.screen, &templ, (const void *)drawable);
+      } else
+         drawable->textures[statts[i]] =
+            screen->base.screen->resource_create(screen->base.screen, &templ);
    }
 
    drawable->old_w = width;
@@ -338,6 +372,7 @@ static const __DRIextension *drisw_screen_extensions[] = {
 };
 
 static struct drisw_loader_funcs drisw_lf = {
+   .get_image = drisw_get_image,
    .put_image = drisw_put_image,
    .put_image2 = drisw_put_image2
 };
diff --git a/src/gallium/state_trackers/va/Makefile.am b/src/gallium/state_trackers/va/Makefile.am
index 2a93a904346..348cfe17759 100644
--- a/src/gallium/state_trackers/va/Makefile.am
+++ b/src/gallium/state_trackers/va/Makefile.am
@@ -30,6 +30,15 @@ AM_CFLAGS = \
 	$(VA_CFLAGS) \
 	-DVA_DRIVER_INIT_FUNC="__vaDriverInit_$(VA_MAJOR)_$(VA_MINOR)"
 
+AM_CFLAGS += \
+	$(GALLIUM_PIPE_LOADER_DEFINES) \
+	-DPIPE_SEARCH_DIR=\"$(libdir)/gallium-pipe\"
+
+if HAVE_GALLIUM_STATIC_TARGETS
+AM_CFLAGS += \
+	-DGALLIUM_STATIC_TARGETS=1
+endif
+
 AM_CPPFLAGS = \
 	-I$(top_srcdir)/include
 
diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c
index 8f9ba440a75..71a65037757 100644
--- a/src/gallium/state_trackers/va/buffer.c
+++ b/src/gallium/state_trackers/va/buffer.c
@@ -26,8 +26,12 @@
  *
  **************************************************************************/
 
+#include "pipe/p_screen.h"
+#include "state_tracker/drm_driver.h"
 #include "util/u_memory.h"
 #include "util/u_handle_table.h"
+#include "util/u_transfer.h"
+#include "vl/vl_winsys.h"
 
 #include "va_private.h"
 
@@ -73,6 +77,12 @@ vlVaBufferSetNumElements(VADriverContextP ctx, VABufferID buf_id,
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
+   if (!buf)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
+
+   if (buf->derived_surface.resource)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
+
    buf->data = REALLOC(buf->data, buf->size * buf->num_elements,
                        buf->size * num_elements);
    buf->num_elements = num_elements;
@@ -86,16 +96,37 @@ vlVaBufferSetNumElements(VADriverContextP ctx, VABufferID buf_id,
 VAStatus
 vlVaMapBuffer(VADriverContextP ctx, VABufferID buf_id, void **pbuff)
 {
+   vlVaDriver *drv;
    vlVaBuffer *buf;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
+   drv = VL_VA_DRIVER(ctx);
+   if (!drv)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   if (!pbuff)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   buf = handle_table_get(drv->htab, buf_id);
    if (!buf)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
-   *pbuff = buf->data;
+   if (buf->export_refcount > 0)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
+
+   if (buf->derived_surface.resource) {
+      *pbuff = pipe_buffer_map(drv->pipe, buf->derived_surface.resource,
+                               PIPE_TRANSFER_WRITE,
+                               &buf->derived_surface.transfer);
+
+      if (!buf->derived_surface.transfer || !*pbuff)
+         return VA_STATUS_ERROR_INVALID_BUFFER;
+
+   } else {
+      *pbuff = buf->data;
+   }
 
    return VA_STATUS_SUCCESS;
 }
@@ -103,16 +134,30 @@ vlVaMapBuffer(VADriverContextP ctx, VABufferID buf_id, void **pbuff)
 VAStatus
 vlVaUnmapBuffer(VADriverContextP ctx, VABufferID buf_id)
 {
+   vlVaDriver *drv;
    vlVaBuffer *buf;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
+   drv = VL_VA_DRIVER(ctx);
+   if (!drv)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   buf = handle_table_get(drv->htab, buf_id);
    if (!buf)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
-   /* Nothing to do here */
+   if (buf->export_refcount > 0)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
+
+   if (buf->derived_surface.resource) {
+     if (!buf->derived_surface.transfer)
+        return VA_STATUS_ERROR_INVALID_BUFFER;
+
+     pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer);
+     buf->derived_surface.transfer = NULL;
+   }
 
    return VA_STATUS_SUCCESS;
 }
@@ -129,6 +174,13 @@ vlVaDestroyBuffer(VADriverContextP ctx, VABufferID buf_id)
    if (!buf)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
+   if (buf->derived_surface.resource) {
+     if (buf->export_refcount > 0)
+       return VA_STATUS_ERROR_INVALID_BUFFER;
+
+     pipe_resource_reference(&buf->derived_surface.resource, NULL);
+   }
+
    FREE(buf->data);
    FREE(buf);
    handle_table_remove(VL_VA_DRIVER(ctx)->htab, buf_id);
@@ -155,3 +207,126 @@ vlVaBufferInfo(VADriverContextP ctx, VABufferID buf_id, VABufferType *type,
 
    return VA_STATUS_SUCCESS;
 }
+
+VAStatus
+vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
+                        VABufferInfo *out_buf_info)
+{
+   uint32_t i;
+   uint32_t mem_type;
+   vlVaBuffer *buf ;
+   struct pipe_screen *screen;
+
+   /* List of supported memory types, in preferred order. */
+   static const uint32_t mem_types[] = {
+      VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME,
+      0
+   };
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
+
+   if (!buf)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
+
+   /* Only VA surface|image like buffers are supported for now .*/
+   if (buf->type != VAImageBufferType)
+      return VA_STATUS_ERROR_UNSUPPORTED_BUFFERTYPE;
+
+   if (!out_buf_info)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   if (!out_buf_info->mem_type)
+      mem_type = mem_types[0];
+   else {
+      mem_type = 0;
+      for (i = 0; mem_types[i] != 0; i++) {
+         if (out_buf_info->mem_type & mem_types[i]) {
+            mem_type = out_buf_info->mem_type;
+            break;
+         }
+      }
+      if (!mem_type)
+         return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
+   }
+
+   if (!buf->derived_surface.resource)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
+
+   screen = VL_VA_PSCREEN(ctx);
+
+   if (buf->derived_surface.fence) {
+      screen->fence_finish(screen, buf->derived_surface.fence, PIPE_TIMEOUT_INFINITE);
+      screen->fence_reference(screen, &buf->derived_surface.fence, NULL);
+   }
+
+   if (buf->export_refcount > 0) {
+      if (buf->export_state.mem_type != mem_type)
+         return VA_STATUS_ERROR_INVALID_PARAMETER;
+   } else {
+      VABufferInfo * const buf_info = &buf->export_state;
+
+      switch (mem_type) {
+      case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: {
+         struct winsys_handle whandle;
+
+         memset(&whandle, 0, sizeof(whandle));
+         whandle.type = DRM_API_HANDLE_TYPE_FD;
+
+         if (!screen->resource_get_handle(screen, buf->derived_surface.resource, &whandle))
+            return VA_STATUS_ERROR_INVALID_BUFFER;
+
+         buf_info->handle = (intptr_t)whandle.handle;
+         break;
+      default:
+         return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
+      }
+   }
+
+   buf_info->type = buf->type;
+   buf_info->mem_type = mem_type;
+   buf_info->mem_size = buf->num_elements * buf->size;
+
+   }
+
+   buf->export_refcount++;
+
+   *out_buf_info = buf->export_state;
+
+   return VA_STATUS_SUCCESS;
+}
+
+VAStatus
+vlVaReleaseBufferHandle(VADriverContextP ctx, VABufferID buf_id)
+{
+   vlVaBuffer *buf;
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, buf_id);
+
+   if (!buf)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
+
+   if (buf->export_refcount == 0)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
+
+   if (--buf->export_refcount == 0) {
+      VABufferInfo * const buf_info = &buf->export_state;
+
+      switch (buf_info->mem_type) {
+      case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+         close((intptr_t)buf_info->handle);
+         break;
+      default:
+         return VA_STATUS_ERROR_INVALID_BUFFER;
+      }
+
+      buf_info->mem_type = 0;
+   }
+
+   return VA_STATUS_SUCCESS;
+}
diff --git a/src/gallium/state_trackers/va/config.c b/src/gallium/state_trackers/va/config.c
index cfb0b25b71f..0f47aacdbd6 100644
--- a/src/gallium/state_trackers/va/config.c
+++ b/src/gallium/state_trackers/va/config.c
@@ -45,13 +45,16 @@ vlVaQueryConfigProfiles(VADriverContextP ctx, VAProfile *profile_list, int *num_
    *num_profiles = 0;
 
    pscreen = VL_VA_PSCREEN(ctx);
-   for (p = PIPE_VIDEO_PROFILE_MPEG2_SIMPLE; p <= PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH; ++p)
+   for (p = PIPE_VIDEO_PROFILE_MPEG2_SIMPLE; p <= PIPE_VIDEO_PROFILE_HEVC_MAIN_444; ++p)
       if (pscreen->get_video_param(pscreen, p, PIPE_VIDEO_ENTRYPOINT_BITSTREAM, PIPE_VIDEO_CAP_SUPPORTED)) {
          vap = PipeToProfile(p);
          if (vap != VAProfileNone)
             profile_list[(*num_profiles)++] = vap;
       }
 
+   /* Support postprocessing through vl_compositor */
+   profile_list[(*num_profiles)++] = VAProfileNone;
+
    return VA_STATUS_SUCCESS;
 }
 
@@ -67,6 +70,11 @@ vlVaQueryConfigEntrypoints(VADriverContextP ctx, VAProfile profile,
 
    *num_entrypoints = 0;
 
+   if (profile == VAProfileNone) {
+       entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc;
+       return VA_STATUS_SUCCESS;
+   }
+
    p = ProfileToPipe(profile);
    if (p == PIPE_VIDEO_PROFILE_UNKNOWN)
       return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
@@ -118,6 +126,11 @@ vlVaCreateConfig(VADriverContextP ctx, VAProfile profile, VAEntrypoint entrypoin
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
+   if (profile == VAProfileNone && entrypoint == VAEntrypointVideoProc) {
+       *config_id = PIPE_VIDEO_PROFILE_UNKNOWN;
+       return VA_STATUS_SUCCESS;
+   }
+
    p = ProfileToPipe(profile);
    if (p == PIPE_VIDEO_PROFILE_UNKNOWN)
       return VA_STATUS_ERROR_UNSUPPORTED_PROFILE;
@@ -151,6 +164,13 @@ vlVaQueryConfigAttributes(VADriverContextP ctx, VAConfigID config_id, VAProfile
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    *profile = PipeToProfile(config_id);
+
+   if (config_id == PIPE_VIDEO_PROFILE_UNKNOWN) {
+      *entrypoint = VAEntrypointVideoProc;
+       *num_attribs = 0;
+      return VA_STATUS_SUCCESS;
+   }
+
    *entrypoint = VAEntrypointVLD;
 
    *num_attribs = 1;
diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c
index 8b003aedaec..ec9e0488d85 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -28,7 +28,8 @@
 
 #include "pipe/p_screen.h"
 #include "pipe/p_video_codec.h"
-
+#include "pipe-loader/pipe_loader.h"
+#include "state_tracker/drm_driver.h"
 #include "util/u_memory.h"
 #include "util/u_handle_table.h"
 #include "util/u_video.h"
@@ -36,6 +37,8 @@
 
 #include "va_private.h"
 
+#include <va/va_drmcommon.h>
+
 static struct VADriverVTable vtable =
 {
    &vlVaTerminate,
@@ -81,13 +84,28 @@ static struct VADriverVTable vtable =
    &vlVaSetDisplayAttributes,
    &vlVaBufferInfo,
    &vlVaLockSurface,
-   &vlVaUnlockSurface
+   &vlVaUnlockSurface,
+   NULL, /* DEPRECATED VaGetSurfaceAttributes */
+   &vlVaCreateSurfaces2,
+   &vlVaQuerySurfaceAttributes,
+   &vlVaAcquireBufferHandle,
+   &vlVaReleaseBufferHandle
+};
+
+static struct VADriverVTableVPP vtable_vpp =
+{
+   1,
+   &vlVaQueryVideoProcFilters,
+   &vlVaQueryVideoProcFilterCaps,
+   &vlVaQueryVideoProcPipelineCaps
 };
 
 PUBLIC VAStatus
 VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
 {
    vlVaDriver *drv;
+   int drm_fd;
+   struct drm_state *drm_info;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
@@ -96,9 +114,56 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
    if (!drv)
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
 
-   drv->vscreen = vl_screen_create(ctx->native_dpy, ctx->x11_screen);
-   if (!drv->vscreen)
-      goto error_screen;
+   switch (ctx->display_type) {
+   case VA_DISPLAY_ANDROID:
+   case VA_DISPLAY_WAYLAND:
+      FREE(drv);
+      return VA_STATUS_ERROR_UNIMPLEMENTED;
+   case VA_DISPLAY_GLX:
+   case VA_DISPLAY_X11:
+      drv->vscreen = vl_screen_create(ctx->native_dpy, ctx->x11_screen);
+      if (!drv->vscreen)
+         goto error_screen;
+      break;
+   case VA_DISPLAY_DRM:
+   case VA_DISPLAY_DRM_RENDERNODES: {
+      drm_info = (struct drm_state *) ctx->drm_state;
+      if (!drm_info) {
+         FREE(drv);
+         return VA_STATUS_ERROR_INVALID_PARAMETER;
+      }
+
+#if GALLIUM_STATIC_TARGETS
+      drm_fd = drm_info->fd;
+#else
+      drm_fd = dup(drm_info->fd);
+#endif
+
+      if (drm_fd < 0) {
+         FREE(drv);
+         return VA_STATUS_ERROR_INVALID_PARAMETER;
+      }
+
+      drv->vscreen = CALLOC_STRUCT(vl_screen);
+      if (!drv->vscreen)
+         goto error_screen;
+
+#if GALLIUM_STATIC_TARGETS
+      drv->vscreen->pscreen = dd_create_screen(drm_fd);
+#else
+      if (pipe_loader_drm_probe_fd(&drv->dev, drm_fd))
+         drv->vscreen->pscreen = pipe_loader_create_screen(drv->dev, PIPE_SEARCH_DIR);
+#endif
+
+      if (!drv->vscreen->pscreen)
+         goto error_pipe;
+
+      }
+      break;
+   default:
+      FREE(drv);
+      return VA_STATUS_ERROR_INVALID_DISPLAY;
+   }
 
    drv->pipe = drv->vscreen->pscreen->context_create(drv->vscreen->pscreen,
                                                      drv->vscreen, 0);
@@ -119,6 +184,7 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
    ctx->version_major = 0;
    ctx->version_minor = 1;
    *ctx->vtable = vtable;
+   *ctx->vtable_vpp = vtable_vpp;
    ctx->max_profiles = PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH - PIPE_VIDEO_PROFILE_UNKNOWN;
    ctx->max_entrypoints = 1;
    ctx->max_attributes = 1;
@@ -133,7 +199,10 @@ error_htab:
    drv->pipe->destroy(drv->pipe);
 
 error_pipe:
-   vl_screen_destroy(drv->vscreen);
+   if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11)
+      vl_screen_destroy(drv->vscreen);
+   else
+      FREE(drv->vscreen);
 
 error_screen:
    FREE(drv);
@@ -148,11 +217,15 @@ vlVaCreateContext(VADriverContextP ctx, VAConfigID config_id, int picture_width,
    struct pipe_video_codec templat = {};
    vlVaDriver *drv;
    vlVaContext *context;
+   int is_vpp;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   if (!(picture_width && picture_height))
+   is_vpp = config_id == PIPE_VIDEO_PROFILE_UNKNOWN && !picture_width &&
+            !picture_height && !flag && !render_targets && !num_render_targets;
+
+   if (!(picture_width && picture_height) && !is_vpp)
       return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
 
    drv = VL_VA_DRIVER(ctx);
@@ -160,38 +233,61 @@ vlVaCreateContext(VADriverContextP ctx, VAConfigID config_id, int picture_width,
    if (!context)
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
 
-   templat.profile = config_id;
-   templat.entrypoint = PIPE_VIDEO_ENTRYPOINT_BITSTREAM;
-   templat.chroma_format = PIPE_VIDEO_CHROMA_FORMAT_420;
-   templat.width = picture_width;
-   templat.height = picture_height;
-   templat.max_references = num_render_targets;
-   templat.expect_chunked_decode = true;
-
-   if (u_reduce_video_profile(templat.profile) ==
-       PIPE_VIDEO_FORMAT_MPEG4_AVC)
-      templat.level = u_get_h264_level(templat.width, templat.height,
-                            &templat.max_references);
-
-   context->decoder = drv->pipe->create_video_codec(drv->pipe, &templat);
-   if (!context->decoder) {
-      FREE(context);
-      return VA_STATUS_ERROR_ALLOCATION_FAILED;
-   }
-
-   if (u_reduce_video_profile(context->decoder->profile) ==
-         PIPE_VIDEO_FORMAT_MPEG4_AVC) {
-      context->desc.h264.pps = CALLOC_STRUCT(pipe_h264_pps);
-      if (!context->desc.h264.pps) {
+   if (is_vpp) {
+      context->decoder = NULL;
+      if (!drv->compositor.upload) {
          FREE(context);
-         return VA_STATUS_ERROR_ALLOCATION_FAILED;
+         return VA_STATUS_ERROR_INVALID_CONTEXT;
       }
-      context->desc.h264.pps->sps = CALLOC_STRUCT(pipe_h264_sps);
-      if (!context->desc.h264.pps->sps) {
-         FREE(context->desc.h264.pps);
+   } else {
+      templat.profile = config_id;
+      templat.entrypoint = PIPE_VIDEO_ENTRYPOINT_BITSTREAM;
+      templat.chroma_format = PIPE_VIDEO_CHROMA_FORMAT_420;
+      templat.width = picture_width;
+      templat.height = picture_height;
+      templat.max_references = num_render_targets;
+      templat.expect_chunked_decode = true;
+
+      if (u_reduce_video_profile(templat.profile) ==
+        PIPE_VIDEO_FORMAT_MPEG4_AVC)
+        templat.level = u_get_h264_level(templat.width, templat.height,
+                             &templat.max_references);
+
+      context->decoder = drv->pipe->create_video_codec(drv->pipe, &templat);
+      if (!context->decoder) {
          FREE(context);
          return VA_STATUS_ERROR_ALLOCATION_FAILED;
       }
+
+      if (u_reduce_video_profile(context->decoder->profile) ==
+         PIPE_VIDEO_FORMAT_MPEG4_AVC) {
+         context->desc.h264.pps = CALLOC_STRUCT(pipe_h264_pps);
+         if (!context->desc.h264.pps) {
+            FREE(context);
+            return VA_STATUS_ERROR_ALLOCATION_FAILED;
+         }
+         context->desc.h264.pps->sps = CALLOC_STRUCT(pipe_h264_sps);
+         if (!context->desc.h264.pps->sps) {
+            FREE(context->desc.h264.pps);
+            FREE(context);
+            return VA_STATUS_ERROR_ALLOCATION_FAILED;
+         }
+      }
+
+      if (u_reduce_video_profile(context->decoder->profile) ==
+            PIPE_VIDEO_FORMAT_HEVC) {
+         context->desc.h265.pps = CALLOC_STRUCT(pipe_h265_pps);
+         if (!context->desc.h265.pps) {
+            FREE(context);
+            return VA_STATUS_ERROR_ALLOCATION_FAILED;
+         }
+         context->desc.h265.pps->sps = CALLOC_STRUCT(pipe_h265_sps);
+         if (!context->desc.h265.pps->sps) {
+            FREE(context->desc.h265.pps);
+            FREE(context);
+            return VA_STATUS_ERROR_ALLOCATION_FAILED;
+         }
+      }
    }
 
    context->desc.base.profile = config_id;
@@ -211,12 +307,20 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID context_id)
 
    drv = VL_VA_DRIVER(ctx);
    context = handle_table_get(drv->htab, context_id);
-   if (u_reduce_video_profile(context->decoder->profile) ==
-         PIPE_VIDEO_FORMAT_MPEG4_AVC) {
-      FREE(context->desc.h264.pps->sps);
-      FREE(context->desc.h264.pps);
+
+   if (context->decoder) {
+      if (u_reduce_video_profile(context->decoder->profile) ==
+            PIPE_VIDEO_FORMAT_MPEG4_AVC) {
+         FREE(context->desc.h264.pps->sps);
+         FREE(context->desc.h264.pps);
+      }
+      if (u_reduce_video_profile(context->decoder->profile) ==
+            PIPE_VIDEO_FORMAT_HEVC) {
+         FREE(context->desc.h265.pps->sps);
+         FREE(context->desc.h265.pps);
+      }
+      context->decoder->destroy(context->decoder);
    }
-   context->decoder->destroy(context->decoder);
    FREE(context);
    handle_table_remove(drv->htab, context_id);
 
@@ -235,7 +339,10 @@ vlVaTerminate(VADriverContextP ctx)
    vl_compositor_cleanup_state(&drv->cstate);
    vl_compositor_cleanup(&drv->compositor);
    drv->pipe->destroy(drv->pipe);
-   vl_screen_destroy(drv->vscreen);
+   if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11)
+      vl_screen_destroy(drv->vscreen);
+   else
+      FREE(drv->vscreen);
    handle_table_destroy(drv->htab);
    FREE(drv);
 
diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c
index b37a9714437..c6d0c5abf65 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -37,14 +37,21 @@
 
 #include "va_private.h"
 
-static const VAImageFormat formats[VL_VA_MAX_IMAGE_FORMATS] =
+static const VAImageFormat formats[] =
 {
    {VA_FOURCC('N','V','1','2')},
    {VA_FOURCC('I','4','2','0')},
    {VA_FOURCC('Y','V','1','2')},
    {VA_FOURCC('Y','U','Y','V')},
    {VA_FOURCC('U','Y','V','Y')},
-   {VA_FOURCC('B','G','R','A')}
+   {.fourcc = VA_FOURCC('B','G','R','A'), .byte_order = VA_LSB_FIRST, 32, 32,
+    0x00ff0000, 0x0000ff00, 0x000000ff, 0xff000000},
+   {.fourcc = VA_FOURCC('R','G','B','A'), .byte_order = VA_LSB_FIRST, 32, 32,
+    0x000000ff, 0x0000ff00, 0x00ff0000, 0xff000000},
+   {.fourcc = VA_FOURCC('B','G','R','X'), .byte_order = VA_LSB_FIRST, 32, 24,
+    0x00ff0000, 0x0000ff00, 0x000000ff, 0x00000000},
+   {.fourcc = VA_FOURCC('R','G','B','X'), .byte_order = VA_LSB_FIRST, 32, 24,
+    0x000000ff, 0x0000ff00, 0x00ff0000, 0x00000000}
 };
 
 static void
@@ -72,6 +79,8 @@ vlVaQueryImageFormats(VADriverContextP ctx, VAImageFormat *format_list, int *num
    enum pipe_format format;
    int i;
 
+   STATIC_ASSERT(ARRAY_SIZE(formats) == VL_VA_MAX_IMAGE_FORMATS);
+
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
@@ -80,8 +89,8 @@ vlVaQueryImageFormats(VADriverContextP ctx, VAImageFormat *format_list, int *num
 
    *num_formats = 0;
    pscreen = VL_VA_PSCREEN(ctx);
-   for (i = 0; i < VL_VA_MAX_IMAGE_FORMATS; ++i) {
-      format = YCbCrToPipe(formats[i].fourcc);
+   for (i = 0; i < ARRAY_SIZE(formats); ++i) {
+      format = VaFourccToPipeFormat(formats[i].fourcc);
       if (pscreen->is_video_format_supported(pscreen, format,
           PIPE_VIDEO_PROFILE_UNKNOWN,
           PIPE_VIDEO_ENTRYPOINT_BITSTREAM))
@@ -149,6 +158,9 @@ vlVaCreateImage(VADriverContextP ctx, VAImageFormat *format, int width, int heig
       break;
 
    case VA_FOURCC('B','G','R','A'):
+   case VA_FOURCC('R','G','B','A'):
+   case VA_FOURCC('B','G','R','X'):
+   case VA_FOURCC('R','G','B','X'):
       img->num_planes = 1;
       img->pitches[0] = w * 4;
       img->offsets[0] = 0;
@@ -172,10 +184,97 @@ vlVaCreateImage(VADriverContextP ctx, VAImageFormat *format, int width, int heig
 VAStatus
 vlVaDeriveImage(VADriverContextP ctx, VASurfaceID surface, VAImage *image)
 {
+   vlVaDriver *drv;
+   vlVaSurface *surf;
+   vlVaBuffer *img_buf;
+   VAImage *img;
+   struct pipe_surface **surfaces;
+   int w;
+   int h;
+   int i;
+
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-   return VA_STATUS_ERROR_UNIMPLEMENTED;
+   drv = VL_VA_DRIVER(ctx);
+
+   if (!drv)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   surf = handle_table_get(drv->htab, surface);
+
+   if (!surf || !surf->buffer || surf->buffer->interlaced)
+      return VA_STATUS_ERROR_INVALID_SURFACE;
+
+   surfaces = surf->buffer->get_surfaces(surf->buffer);
+   if (!surfaces || !surfaces[0]->texture)
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+   img = CALLOC(1, sizeof(VAImage));
+   if (!img)
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+   img->format.fourcc = PipeFormatToVaFourcc(surf->buffer->buffer_format);
+   img->buf = VA_INVALID_ID;
+   img->width = surf->buffer->width;
+   img->height = surf->buffer->height;
+   img->num_palette_entries = 0;
+   img->entry_bytes = 0;
+   w = align(surf->buffer->width, 2);
+   h = align(surf->buffer->height, 2);
+
+   for (i = 0; i < ARRAY_SIZE(formats); ++i) {
+      if (img->format.fourcc == formats[i].fourcc) {
+         img->format = formats[i];
+         break;
+      }
+   }
+
+   switch (img->format.fourcc) {
+   case VA_FOURCC('U','Y','V','Y'):
+   case VA_FOURCC('Y','U','Y','V'):
+      img->num_planes = 1;
+      img->pitches[0] = w * 2;
+      img->offsets[0] = 0;
+      img->data_size  = w * h * 2;
+      break;
+
+   case VA_FOURCC('B','G','R','A'):
+   case VA_FOURCC('R','G','B','A'):
+   case VA_FOURCC('B','G','R','X'):
+   case VA_FOURCC('R','G','B','X'):
+      img->num_planes = 1;
+      img->pitches[0] = w * 4;
+      img->offsets[0] = 0;
+      img->data_size  = w * h * 4;
+      break;
+
+   default:
+      /* VaDeriveImage is designed for contiguous planes. */
+      FREE(img);
+      return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
+   }
+
+   img_buf = CALLOC(1, sizeof(vlVaBuffer));
+   if (!img_buf) {
+      FREE(img);
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+   }
+
+   img->image_id = handle_table_add(drv->htab, img);
+
+   img_buf->type = VAImageBufferType;
+   img_buf->size = image->data_size;
+   img_buf->num_elements = 1;
+   img_buf->derived_surface.fence = surf->fence;
+
+   pipe_resource_reference(&img_buf->derived_surface.resource, surfaces[0]->texture);
+
+   img->buf = handle_table_add(VL_VA_DRIVER(ctx)->htab, img_buf);
+
+   *image = *img;
+
+   return VA_STATUS_SUCCESS;
 }
 
 VAStatus
@@ -235,7 +334,7 @@ vlVaGetImage(VADriverContextP ctx, VASurfaceID surface, int x, int y,
    if (!img_buf)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
-   format = YCbCrToPipe(vaimage->format.fourcc);
+   format = VaFourccToPipeFormat(vaimage->format.fourcc);
    if (format == PIPE_FORMAT_NONE)
       return VA_STATUS_ERROR_OPERATION_FAILED;
 
@@ -330,17 +429,30 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image,
    if (!img_buf)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
-   format = YCbCrToPipe(vaimage->format.fourcc);
+   if (img_buf->derived_surface.resource) {
+      /* Attempting to transfer derived image to surface */
+      return VA_STATUS_ERROR_UNIMPLEMENTED;
+   }
+
+   format = VaFourccToPipeFormat(vaimage->format.fourcc);
+
    if (format == PIPE_FORMAT_NONE)
       return VA_STATUS_ERROR_OPERATION_FAILED;
 
-   if (surf->buffer == NULL || format != surf->buffer->buffer_format) {
-      if (surf->buffer)
-         surf->buffer->destroy(surf->buffer);
+   if (format != surf->buffer->buffer_format) {
+      struct pipe_video_buffer *tmp_buf;
+      enum pipe_format old_surf_format = surf->templat.buffer_format;
+
       surf->templat.buffer_format = format;
-      surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &surf->templat);
-      if (!surf->buffer)
-         return VA_STATUS_ERROR_ALLOCATION_FAILED;
+      tmp_buf = drv->pipe->create_video_buffer(drv->pipe, &surf->templat);
+
+      if (!tmp_buf) {
+          surf->templat.buffer_format = old_surf_format;
+          return VA_STATUS_ERROR_ALLOCATION_FAILED;
+      }
+
+      surf->buffer->destroy(surf->buffer);
+      surf->buffer = tmp_buf;
    }
 
    views = surf->buffer->get_sampler_view_planes(surf->buffer);
diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index 9b94b397b07..e850689005d 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -32,6 +32,7 @@
 #include "util/u_video.h"
 
 #include "vl/vl_vlc.h"
+#include "vl/vl_winsys.h"
 
 #include "va_private.h"
 
@@ -58,7 +59,17 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende
       return VA_STATUS_ERROR_INVALID_SURFACE;
 
    context->target = surf->buffer;
-   context->decoder->begin_frame(context->decoder, context->target, NULL);
+
+   if (!context->decoder) {
+      /* VPP */
+      if ((context->target->buffer_format != PIPE_FORMAT_B8G8R8A8_UNORM  &&
+           context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM) ||
+           context->target->interlaced)
+          return VA_STATUS_ERROR_UNIMPLEMENTED;
+      return VA_STATUS_SUCCESS;
+   }
+
+   context->decoder->begin_frame(context->decoder, context->target, &context->desc.base);
 
    return VA_STATUS_SUCCESS;
 }
@@ -81,6 +92,7 @@ handlePictureParameterBuffer(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *
    VAPictureParameterBufferH264 *h264;
    VAPictureParameterBufferVC1 * vc1;
    VAPictureParameterBufferMPEG4 *mpeg4;
+   VAPictureParameterBufferHEVC *hevc;
    vlVaSurface *surf_forward;
    vlVaSurface *surf_backward;
    unsigned int i;
@@ -286,6 +298,157 @@ handlePictureParameterBuffer(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *
 
       break;
 
+  case PIPE_VIDEO_FORMAT_HEVC:
+      assert(buf->size >= sizeof(VAPictureParameterBufferHEVC) && buf->num_elements == 1);
+      hevc = buf->data;
+      context->desc.h265.pps->sps->chroma_format_idc = hevc->pic_fields.bits.chroma_format_idc;
+      context->desc.h265.pps->sps->separate_colour_plane_flag =
+         hevc->pic_fields.bits.separate_colour_plane_flag;
+      context->desc.h265.pps->sps->pic_width_in_luma_samples = hevc->pic_width_in_luma_samples;
+      context->desc.h265.pps->sps->pic_height_in_luma_samples = hevc->pic_height_in_luma_samples;
+      context->desc.h265.pps->sps->bit_depth_luma_minus8 = hevc->bit_depth_luma_minus8;
+      context->desc.h265.pps->sps->bit_depth_chroma_minus8 = hevc->bit_depth_chroma_minus8;
+      context->desc.h265.pps->sps->log2_max_pic_order_cnt_lsb_minus4 =
+         hevc->log2_max_pic_order_cnt_lsb_minus4;
+      context->desc.h265.pps->sps->sps_max_dec_pic_buffering_minus1 =
+         hevc->sps_max_dec_pic_buffering_minus1;
+      context->desc.h265.pps->sps->log2_min_luma_coding_block_size_minus3 =
+         hevc->log2_min_luma_coding_block_size_minus3;
+      context->desc.h265.pps->sps->log2_diff_max_min_luma_coding_block_size =
+         hevc->log2_diff_max_min_luma_coding_block_size;
+      context->desc.h265.pps->sps->log2_min_transform_block_size_minus2 =
+         hevc->log2_min_transform_block_size_minus2;
+      context->desc.h265.pps->sps->log2_diff_max_min_transform_block_size =
+         hevc->log2_diff_max_min_transform_block_size;
+      context->desc.h265.pps->sps->max_transform_hierarchy_depth_inter =
+         hevc->max_transform_hierarchy_depth_inter;
+      context->desc.h265.pps->sps->max_transform_hierarchy_depth_intra =
+         hevc->max_transform_hierarchy_depth_intra;
+      context->desc.h265.pps->sps->scaling_list_enabled_flag =
+         hevc->pic_fields.bits.scaling_list_enabled_flag;
+      context->desc.h265.pps->sps->amp_enabled_flag = hevc->pic_fields.bits.amp_enabled_flag;
+      context->desc.h265.pps->sps->sample_adaptive_offset_enabled_flag =
+         hevc->slice_parsing_fields.bits.sample_adaptive_offset_enabled_flag;
+      context->desc.h265.pps->sps->pcm_enabled_flag = hevc->pic_fields.bits.pcm_enabled_flag;
+      if (hevc->pic_fields.bits.pcm_enabled_flag == 1) {
+         context->desc.h265.pps->sps->pcm_sample_bit_depth_luma_minus1 =
+            hevc->pcm_sample_bit_depth_luma_minus1;
+         context->desc.h265.pps->sps->pcm_sample_bit_depth_chroma_minus1 =
+            hevc->pcm_sample_bit_depth_chroma_minus1;
+         context->desc.h265.pps->sps->log2_min_pcm_luma_coding_block_size_minus3 =
+            hevc->log2_min_pcm_luma_coding_block_size_minus3;
+         context->desc.h265.pps->sps->log2_diff_max_min_pcm_luma_coding_block_size =
+            hevc->log2_diff_max_min_pcm_luma_coding_block_size;
+         context->desc.h265.pps->sps->pcm_loop_filter_disabled_flag =
+            hevc->pic_fields.bits.pcm_loop_filter_disabled_flag;
+      }
+      context->desc.h265.pps->sps->num_short_term_ref_pic_sets = hevc->num_short_term_ref_pic_sets;
+      context->desc.h265.pps->sps->long_term_ref_pics_present_flag =
+         hevc->slice_parsing_fields.bits.long_term_ref_pics_present_flag;
+      context->desc.h265.pps->sps->num_long_term_ref_pics_sps = hevc->num_long_term_ref_pic_sps;
+      context->desc.h265.pps->sps->sps_temporal_mvp_enabled_flag =
+         hevc->slice_parsing_fields.bits.sps_temporal_mvp_enabled_flag;
+      context->desc.h265.pps->sps->strong_intra_smoothing_enabled_flag =
+         hevc->pic_fields.bits.strong_intra_smoothing_enabled_flag;
+
+      context->desc.h265.pps->dependent_slice_segments_enabled_flag =
+         hevc->slice_parsing_fields.bits.dependent_slice_segments_enabled_flag;
+      context->desc.h265.pps->output_flag_present_flag =
+         hevc->slice_parsing_fields.bits.output_flag_present_flag;
+      context->desc.h265.pps->num_extra_slice_header_bits = hevc->num_extra_slice_header_bits;
+      context->desc.h265.pps->sign_data_hiding_enabled_flag =
+         hevc->pic_fields.bits.sign_data_hiding_enabled_flag;
+      context->desc.h265.pps->cabac_init_present_flag =
+         hevc->slice_parsing_fields.bits.cabac_init_present_flag;
+      context->desc.h265.pps->num_ref_idx_l0_default_active_minus1 =
+         hevc->num_ref_idx_l0_default_active_minus1;
+      context->desc.h265.pps->num_ref_idx_l1_default_active_minus1 =
+         hevc->num_ref_idx_l1_default_active_minus1;
+      context->desc.h265.pps->init_qp_minus26 = hevc->init_qp_minus26;
+      context->desc.h265.pps->constrained_intra_pred_flag =
+         hevc->pic_fields.bits.constrained_intra_pred_flag;
+      context->desc.h265.pps->transform_skip_enabled_flag =
+         hevc->pic_fields.bits.transform_skip_enabled_flag;
+      context->desc.h265.pps->cu_qp_delta_enabled_flag =
+         hevc->pic_fields.bits.cu_qp_delta_enabled_flag;
+      context->desc.h265.pps->diff_cu_qp_delta_depth = hevc->diff_cu_qp_delta_depth;
+      context->desc.h265.pps->pps_cb_qp_offset = hevc->pps_cb_qp_offset;
+      context->desc.h265.pps->pps_cr_qp_offset = hevc->pps_cr_qp_offset;
+      context->desc.h265.pps->pps_slice_chroma_qp_offsets_present_flag =
+         hevc->slice_parsing_fields.bits.pps_slice_chroma_qp_offsets_present_flag;
+      context->desc.h265.pps->weighted_pred_flag = hevc->pic_fields.bits.weighted_pred_flag;
+      context->desc.h265.pps->weighted_bipred_flag = hevc->pic_fields.bits.weighted_bipred_flag;
+      context->desc.h265.pps->transquant_bypass_enabled_flag =
+         hevc->pic_fields.bits.transquant_bypass_enabled_flag;
+      context->desc.h265.pps->tiles_enabled_flag = hevc->pic_fields.bits.tiles_enabled_flag;
+      context->desc.h265.pps->entropy_coding_sync_enabled_flag =
+         hevc->pic_fields.bits.entropy_coding_sync_enabled_flag;
+      if (hevc->pic_fields.bits.tiles_enabled_flag == 1) {
+         context->desc.h265.pps->num_tile_columns_minus1 = hevc->num_tile_columns_minus1;
+         context->desc.h265.pps->num_tile_rows_minus1 = hevc->num_tile_rows_minus1;
+         for (i = 0 ; i < 19 ; i++)
+            context->desc.h265.pps->column_width_minus1[i] = hevc->column_width_minus1[i];
+         for (i = 0 ; i < 21 ; i++)
+            context->desc.h265.pps->row_height_minus1[i] = hevc->row_height_minus1[i];
+         context->desc.h265.pps->loop_filter_across_tiles_enabled_flag =
+            hevc->pic_fields.bits.loop_filter_across_tiles_enabled_flag;
+      }
+      context->desc.h265.pps->pps_loop_filter_across_slices_enabled_flag =
+         hevc->pic_fields.bits.pps_loop_filter_across_slices_enabled_flag;
+      context->desc.h265.pps->deblocking_filter_override_enabled_flag =
+         hevc->slice_parsing_fields.bits.deblocking_filter_override_enabled_flag;
+      context->desc.h265.pps->pps_deblocking_filter_disabled_flag =
+         hevc->slice_parsing_fields.bits.pps_disable_deblocking_filter_flag;
+      context->desc.h265.pps->pps_beta_offset_div2 = hevc->pps_beta_offset_div2;
+      context->desc.h265.pps->pps_tc_offset_div2 = hevc->pps_tc_offset_div2;
+      context->desc.h265.pps->lists_modification_present_flag =
+         hevc->slice_parsing_fields.bits.lists_modification_present_flag;
+      context->desc.h265.pps->log2_parallel_merge_level_minus2 =
+         hevc->log2_parallel_merge_level_minus2;
+      context->desc.h265.pps->slice_segment_header_extension_present_flag =
+         hevc->slice_parsing_fields.bits.slice_segment_header_extension_present_flag;
+
+      context->desc.h265.IDRPicFlag = hevc->slice_parsing_fields.bits.IdrPicFlag;
+      context->desc.h265.RAPPicFlag = hevc->slice_parsing_fields.bits.RapPicFlag;
+
+      context->desc.h265.CurrPicOrderCntVal = hevc->CurrPic.pic_order_cnt;
+
+      for (i = 0 ; i < 8 ; i++) {
+         context->desc.h265.RefPicSetStCurrBefore[i] = 0xFF;
+         context->desc.h265.RefPicSetStCurrAfter[i] = 0xFF;
+         context->desc.h265.RefPicSetLtCurr[i] = 0xFF;
+      }
+      context->desc.h265.NumPocStCurrBefore = 0;
+      context->desc.h265.NumPocStCurrAfter = 0;
+      context->desc.h265.NumPocLtCurr = 0;
+      unsigned int iBefore = 0;
+      unsigned int iAfter = 0;
+      unsigned int iCurr = 0;
+      for (i = 0 ; i < 15 ; i++) {
+         context->desc.h265.PicOrderCntVal[i] = hevc->ReferenceFrames[i].pic_order_cnt;
+
+         unsigned int index = hevc->ReferenceFrames[i].picture_id & 0x7F;
+
+         if (index == 0x7F)
+            continue;
+
+         getReferenceFrame(drv, hevc->ReferenceFrames[i].picture_id, &context->desc.h265.ref[i]);
+
+         if ((hevc->ReferenceFrames[i].flags & VA_PICTURE_HEVC_RPS_ST_CURR_BEFORE) && (iBefore < 8)) {
+            context->desc.h265.RefPicSetStCurrBefore[iBefore++] = i;
+            context->desc.h265.NumPocStCurrBefore++;
+         }
+         if ((hevc->ReferenceFrames[i].flags & VA_PICTURE_HEVC_RPS_ST_CURR_AFTER) && (iAfter < 8)) {
+            context->desc.h265.RefPicSetStCurrAfter[iAfter++] = i;
+            context->desc.h265.NumPocStCurrAfter++;
+         }
+         if ((hevc->ReferenceFrames[i].flags & VA_PICTURE_HEVC_RPS_LT_CURR) && (iCurr < 8)) {
+            context->desc.h265.RefPicSetLtCurr[iCurr++] = i;
+            context->desc.h265.NumPocLtCurr++;
+         }
+      }
+      break;
+
    default:
       break;
    }
@@ -297,6 +460,7 @@ handleIQMatrixBuffer(vlVaContext *context, vlVaBuffer *buf)
    VAIQMatrixBufferMPEG2 *mpeg2;
    VAIQMatrixBufferH264 *h264;
    VAIQMatrixBufferMPEG4 *mpeg4;
+   VAIQMatrixBufferHEVC *h265;
 
    switch (u_reduce_video_profile(context->decoder->profile)) {
    case PIPE_VIDEO_FORMAT_MPEG12:
@@ -320,6 +484,17 @@ handleIQMatrixBuffer(vlVaContext *context, vlVaBuffer *buf)
       memcpy(&context->desc.h264.pps->ScalingList8x8, h264->ScalingList8x8, 2 * 64);
       break;
 
+   case PIPE_VIDEO_FORMAT_HEVC:
+      assert(buf->size >= sizeof(VAIQMatrixBufferH264) && buf->num_elements == 1);
+      h265 = buf->data;
+      memcpy(&context->desc.h265.pps->sps->ScalingList4x4, h265->ScalingList4x4, 6 * 16);
+      memcpy(&context->desc.h265.pps->sps->ScalingList8x8, h265->ScalingList8x8, 6 * 64);
+      memcpy(&context->desc.h265.pps->sps->ScalingList16x16, h265->ScalingList16x16, 6 * 64);
+      memcpy(&context->desc.h265.pps->sps->ScalingList32x32, h265->ScalingList32x32, 2 * 64);
+      memcpy(&context->desc.h265.pps->sps->ScalingListDCCoeff16x16, h265->ScalingListDC16x16, 6);
+      memcpy(&context->desc.h265.pps->sps->ScalingListDCCoeff32x32, h265->ScalingListDC32x32, 2);
+      break;
+
    case PIPE_VIDEO_FORMAT_MPEG4:
       assert(buf->size >= sizeof(VAIQMatrixBufferMPEG4) && buf->num_elements == 1);
       mpeg4 = buf->data;
@@ -345,6 +520,7 @@ handleSliceParameterBuffer(vlVaContext *context, vlVaBuffer *buf)
 {
    VASliceParameterBufferH264 *h264;
    VASliceParameterBufferMPEG4 *mpeg4;
+   VASliceParameterBufferHEVC *h265;
 
    switch (u_reduce_video_profile(context->decoder->profile)) {
    case PIPE_VIDEO_FORMAT_MPEG4_AVC:
@@ -361,6 +537,15 @@ handleSliceParameterBuffer(vlVaContext *context, vlVaBuffer *buf)
 
       context->mpeg4.quant_scale = mpeg4->quant_scale;
       break;
+   case PIPE_VIDEO_FORMAT_HEVC:
+      assert(buf->size >= sizeof(VASliceParameterBufferHEVC) && buf->num_elements == 1);
+      h265 = buf->data;
+      for (int i = 0 ; i < 2 ; i++) {
+         for (int j = 0 ; j < 15 ; j++)
+            context->desc.h265.RefPicList[i][j] = h265->RefPicList[i][j];
+      }
+      context->desc.h265.UseRefPicList = true;
+      break;
    default:
       break;
    }
@@ -483,6 +668,7 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
    void * const *buffers[2];
    unsigned sizes[2];
    static const uint8_t start_code_h264[] = { 0x00, 0x00, 0x01 };
+   static const uint8_t start_code_h265[] = { 0x00, 0x00, 0x01 };
    static const uint8_t start_code_vc1[] = { 0x00, 0x00, 0x01, 0x0d };
 
    format = u_reduce_video_profile(context->decoder->profile);
@@ -494,6 +680,13 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
          buffers[num_buffers] = (void *const)&start_code_h264;
          sizes[num_buffers++] = sizeof(start_code_h264);
       break;
+   case PIPE_VIDEO_FORMAT_HEVC:
+      if (bufHasStartcode(buf, 0x000001, 24))
+         break;
+
+         buffers[num_buffers] = (void *const)&start_code_h265;
+         sizes[num_buffers++] = sizeof(start_code_h265);
+         break;
    case PIPE_VIDEO_FORMAT_VC1:
       if (bufHasStartcode(buf, 0x0000010d, 32) ||
           bufHasStartcode(buf, 0x0000010c, 32) ||
@@ -517,15 +710,75 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
    buffers[num_buffers] = buf->data;
    sizes[num_buffers] = buf->size;
    ++num_buffers;
-   context->decoder->decode_bitstream(context->decoder, context->target, NULL,
+   context->decoder->decode_bitstream(context->decoder, context->target, &context->desc.base,
       num_buffers, (const void * const*)buffers, sizes);
 }
 
+static VAStatus
+handleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
+{
+    struct u_rect src_rect;
+    struct u_rect dst_rect;
+    struct u_rect *dirty_area;
+    vlVaSurface *src_surface;
+    VAProcPipelineParameterBuffer *pipeline_param;
+    struct pipe_surface **surfaces;
+    struct pipe_screen *screen;
+    struct pipe_surface *psurf;
+
+    if (!drv || !context)
+       return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+    if (!buf || !buf->data)
+       return VA_STATUS_ERROR_INVALID_BUFFER;
+
+    if (!context->target)
+        return VA_STATUS_ERROR_INVALID_SURFACE;
+
+    pipeline_param = (VAProcPipelineParameterBuffer *)buf->data;
+
+    src_surface = handle_table_get(drv->htab, pipeline_param->surface);
+    if (!src_surface || !src_surface->buffer)
+       return VA_STATUS_ERROR_INVALID_SURFACE;
+
+    surfaces = context->target->get_surfaces(context->target);
+
+    if (!surfaces || !surfaces[0])
+        return VA_STATUS_ERROR_INVALID_SURFACE;
+
+    screen = drv->pipe->screen;
+
+    psurf = surfaces[0];
+
+    src_rect.x0 = pipeline_param->surface_region->x;
+    src_rect.y0 = pipeline_param->surface_region->y;
+    src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width;
+    src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height;
+
+    dst_rect.x0 = pipeline_param->output_region->x;
+    dst_rect.y0 = pipeline_param->output_region->y;
+    dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width;
+    dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height;
+
+    dirty_area = vl_screen_get_dirty_area(drv->vscreen);
+
+    vl_compositor_clear_layers(&drv->cstate);
+    vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
+    vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect);
+    vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true);
+
+    screen->fence_reference(screen, &src_surface->fence, NULL);
+    drv->pipe->flush(drv->pipe, &src_surface->fence, 0);
+
+    return VA_STATUS_SUCCESS;
+}
+
 VAStatus
 vlVaRenderPicture(VADriverContextP ctx, VAContextID context_id, VABufferID *buffers, int num_buffers)
 {
    vlVaDriver *drv;
    vlVaContext *context;
+   VAStatus vaStatus = VA_STATUS_SUCCESS;
 
    unsigned i;
 
@@ -561,13 +814,16 @@ vlVaRenderPicture(VADriverContextP ctx, VAContextID context_id, VABufferID *buff
       case VASliceDataBufferType:
          handleVASliceDataBufferType(context, buf);
          break;
+      case VAProcPipelineParameterBufferType:
+         vaStatus = handleVAProcPipelineParameterBufferType(drv, context, buf);
+         break;
 
       default:
          break;
       }
    }
 
-   return VA_STATUS_SUCCESS;
+   return vaStatus;
 }
 
 VAStatus
@@ -587,6 +843,11 @@ vlVaEndPicture(VADriverContextP ctx, VAContextID context_id)
    if (!context)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
+   if (!context->decoder) {
+      /* VPP */
+      return VA_STATUS_SUCCESS;
+   }
+
    context->mpeg4.frame_num++;
    context->decoder->end_frame(context->decoder, context->target, &context->desc.base);
 
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index 8d4487bfb5a..8f406e09990 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -29,6 +29,8 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_video_codec.h"
 
+#include "state_tracker/drm_driver.h"
+
 #include "util/u_memory.h"
 #include "util/u_handle_table.h"
 #include "util/u_rect.h"
@@ -36,64 +38,19 @@
 #include "util/u_surface.h"
 
 #include "vl/vl_compositor.h"
+#include "vl/vl_video_buffer.h"
 #include "vl/vl_winsys.h"
 
 #include "va_private.h"
 
+#include <va/va_drmcommon.h>
+
 VAStatus
 vlVaCreateSurfaces(VADriverContextP ctx, int width, int height, int format,
                    int num_surfaces, VASurfaceID *surfaces)
 {
-   struct pipe_video_buffer templat = {};
-   struct pipe_screen *pscreen;
-   vlVaDriver *drv;
-   int i;
-
-   if (!ctx)
-      return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-   if (!(width && height))
-      return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
-
-   drv = VL_VA_DRIVER(ctx);
-   pscreen = VL_VA_PSCREEN(ctx);
-
-   templat.buffer_format = pscreen->get_video_param
-   (
-      pscreen,
-      PIPE_VIDEO_PROFILE_UNKNOWN,
-      PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
-      PIPE_VIDEO_CAP_PREFERED_FORMAT
-   );
-   templat.chroma_format = ChromaToPipe(format);
-   templat.width = width;
-   templat.height = height;
-   templat.interlaced = pscreen->get_video_param
-   (
-      pscreen,
-      PIPE_VIDEO_PROFILE_UNKNOWN,
-      PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
-      PIPE_VIDEO_CAP_PREFERS_INTERLACED
-   );
-
-   for (i = 0; i < num_surfaces; ++i) {
-      vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface));
-      if (!surf)
-         goto no_res;
-
-      surf->templat = templat;
-      surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat);
-      util_dynarray_init(&surf->subpics);
-      surfaces[i] = handle_table_add(drv->htab, surf);
-   }
-
-   return VA_STATUS_SUCCESS;
-
-no_res:
-   if (i)
-      vlVaDestroySurfaces(ctx, surfaces, i);
-
-   return VA_STATUS_ERROR_ALLOCATION_FAILED;
+   return vlVaCreateSurfaces2(ctx, format, width, height, surfaces, num_surfaces,
+                              NULL, 0);
 }
 
 VAStatus
@@ -349,3 +306,427 @@ vlVaUnlockSurface(VADriverContextP ctx, VASurfaceID surface)
 
    return VA_STATUS_ERROR_UNIMPLEMENTED;
 }
+
+VAStatus
+vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config,
+                           VASurfaceAttrib *attrib_list, unsigned int *num_attribs)
+{
+    vlVaDriver *drv;
+    VASurfaceAttrib *attribs;
+    struct pipe_screen *pscreen;
+    int i;
+
+    if (config == VA_INVALID_ID)
+        return VA_STATUS_ERROR_INVALID_CONFIG;
+
+    if (!attrib_list && !num_attribs)
+        return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+    if (!attrib_list) {
+        *num_attribs = VASurfaceAttribCount;
+        return VA_STATUS_SUCCESS;
+    }
+
+    if (!ctx)
+       return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+    drv = VL_VA_DRIVER(ctx);
+
+    if (!drv)
+        return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+    pscreen = VL_VA_PSCREEN(ctx);
+
+    if (!pscreen)
+       return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+    attribs = CALLOC(VASurfaceAttribCount, sizeof(VASurfaceAttrib));
+
+    if (!attribs)
+        return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+    i = 0;
+
+    if (config == PIPE_VIDEO_PROFILE_UNKNOWN) {
+       /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN
+          only for VAEntrypointVideoProc. */
+       attribs[i].type = VASurfaceAttribPixelFormat;
+       attribs[i].value.type = VAGenericValueTypeInteger;
+       attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+       attribs[i].value.value.i = VA_FOURCC_BGRA;
+       i++;
+
+       attribs[i].type = VASurfaceAttribPixelFormat;
+       attribs[i].value.type = VAGenericValueTypeInteger;
+       attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+       attribs[i].value.value.i = VA_FOURCC_RGBA;
+       i++;
+    } else {
+       /* Assume VAEntrypointVLD for now. */
+       attribs[i].type = VASurfaceAttribPixelFormat;
+       attribs[i].value.type = VAGenericValueTypeInteger;
+       attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+       attribs[i].value.value.i = VA_FOURCC_NV12;
+       i++;
+    }
+
+    attribs[i].type = VASurfaceAttribMemoryType;
+    attribs[i].value.type = VAGenericValueTypeInteger;
+    attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+    attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA |
+        VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
+    i++;
+
+    attribs[i].type = VASurfaceAttribExternalBufferDescriptor;
+    attribs[i].value.type = VAGenericValueTypePointer;
+    attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE;
+    attribs[i].value.value.p = NULL; /* ignore */
+    i++;
+
+    attribs[i].type = VASurfaceAttribMaxWidth;
+    attribs[i].value.type = VAGenericValueTypeInteger;
+    attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
+    attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
+    i++;
+
+    attribs[i].type = VASurfaceAttribMaxHeight;
+    attribs[i].value.type = VAGenericValueTypeInteger;
+    attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
+    attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
+    i++;
+
+    if (i > *num_attribs) {
+        *num_attribs = i;
+        FREE(attribs);
+        return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
+    }
+
+    *num_attribs = i;
+    memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib));
+    FREE(attribs);
+
+    return VA_STATUS_SUCCESS;
+}
+
+static VAStatus
+suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface,
+                            VASurfaceAttribExternalBuffers *memory_attibute,
+                            int index, VASurfaceID *surfaces,
+                            struct pipe_video_buffer *templat)
+{
+    vlVaDriver *drv;
+    struct pipe_screen *pscreen;
+    struct pipe_resource *resource;
+    struct pipe_resource res_templ;
+    struct winsys_handle whandle;
+    struct pipe_resource *resources[VL_NUM_COMPONENTS];
+
+    if (!ctx)
+        return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+    pscreen = VL_VA_PSCREEN(ctx);
+    drv = VL_VA_DRIVER(ctx);
+
+    if (!memory_attibute || !memory_attibute->buffers ||
+        index > memory_attibute->num_buffers)
+        return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+    if (surface->templat.width != memory_attibute->width ||
+        surface->templat.height != memory_attibute->height ||
+        memory_attibute->num_planes < 1)
+        return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+    switch (memory_attibute->pixel_format) {
+    case VA_FOURCC_RGBA:
+    case VA_FOURCC_RGBX:
+    case VA_FOURCC_BGRA:
+    case VA_FOURCC_BGRX:
+        if (memory_attibute->num_planes != 1)
+            return VA_STATUS_ERROR_INVALID_PARAMETER;
+        break;
+    default:
+        return VA_STATUS_ERROR_INVALID_PARAMETER;
+    }
+
+    memset(&res_templ, 0, sizeof(res_templ));
+    res_templ.target = PIPE_TEXTURE_2D;
+    res_templ.last_level = 0;
+    res_templ.depth0 = 1;
+    res_templ.array_size = 1;
+    res_templ.width0 = memory_attibute->width;
+    res_templ.height0 = memory_attibute->height;
+    res_templ.format = surface->templat.buffer_format;
+    res_templ.bind = PIPE_BIND_SAMPLER_VIEW;
+    res_templ.usage = PIPE_USAGE_DEFAULT;
+
+    memset(&whandle, 0, sizeof(struct winsys_handle));
+    whandle.type = DRM_API_HANDLE_TYPE_FD;
+    whandle.handle = memory_attibute->buffers[index];
+    whandle.stride = memory_attibute->pitches[index];
+
+    resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle);
+
+    if (!resource)
+       return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+    memset(resources, 0, sizeof resources);
+    resources[0] = resource;
+
+    surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources);
+    if (!surface->buffer)
+        return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+    util_dynarray_init(&surface->subpics);
+    surfaces[index] = handle_table_add(drv->htab, surface);
+
+    if (!surfaces[index])
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+    return VA_STATUS_SUCCESS;
+}
+
+VAStatus
+vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format,
+                    unsigned int width, unsigned int height,
+                    VASurfaceID *surfaces, unsigned int num_surfaces,
+                    VASurfaceAttrib *attrib_list, unsigned int num_attribs)
+{
+    vlVaDriver *drv;
+    VASurfaceAttribExternalBuffers *memory_attibute;
+    struct pipe_video_buffer templat;
+    struct pipe_screen *pscreen;
+    int i;
+    int memory_type;
+    int expected_fourcc;
+    VAStatus vaStatus;
+
+    if (!ctx)
+       return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+    if (!(width && height))
+       return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
+
+    drv = VL_VA_DRIVER(ctx);
+
+    if (!drv)
+        return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+    pscreen = VL_VA_PSCREEN(ctx);
+
+    if (!pscreen)
+        return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+    /* Default. */
+    memory_attibute = NULL;
+    memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA;
+    expected_fourcc = 0;
+
+    for (i = 0; i < num_attribs && attrib_list; i++) {
+        if ((attrib_list[i].type == VASurfaceAttribPixelFormat) &&
+            (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
+            if (attrib_list[i].value.type != VAGenericValueTypeInteger)
+                return VA_STATUS_ERROR_INVALID_PARAMETER;
+            expected_fourcc = attrib_list[i].value.value.i;
+        }
+
+        if ((attrib_list[i].type == VASurfaceAttribMemoryType) &&
+            (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
+
+            if (attrib_list[i].value.type != VAGenericValueTypeInteger)
+                return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+            switch (attrib_list[i].value.value.i) {
+                case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+                case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+                   memory_type = attrib_list[i].value.value.i;
+                   break;
+                default:
+                   return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
+            }
+        }
+
+        if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) &&
+            (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) {
+            if (attrib_list[i].value.type != VAGenericValueTypePointer)
+                return VA_STATUS_ERROR_INVALID_PARAMETER;
+            memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p;
+        }
+    }
+
+    if (VA_RT_FORMAT_YUV420 != format &&
+        VA_RT_FORMAT_YUV422 != format &&
+        VA_RT_FORMAT_YUV444 != format &&
+        VA_RT_FORMAT_RGB32  != format) {
+        return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT;
+    }
+
+    switch (memory_type) {
+        case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+            break;
+        case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+            if (!memory_attibute)
+               return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+            expected_fourcc = memory_attibute->pixel_format;
+            break;
+        default:
+            assert(0);
+    }
+
+    memset(&templat, 0, sizeof(templat));
+
+    if (expected_fourcc) {
+       templat.buffer_format = VaFourccToPipeFormat(expected_fourcc);
+       templat.interlaced = 0;
+    } else {
+        templat.buffer_format = pscreen->get_video_param
+        (
+           pscreen,
+           PIPE_VIDEO_PROFILE_UNKNOWN,
+           PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+           PIPE_VIDEO_CAP_PREFERED_FORMAT
+        );
+        templat.interlaced = pscreen->get_video_param
+        (
+           pscreen,
+           PIPE_VIDEO_PROFILE_UNKNOWN,
+           PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+           PIPE_VIDEO_CAP_PREFERS_INTERLACED
+        );
+    }
+
+    templat.chroma_format = ChromaToPipe(format);
+
+    templat.width = width;
+    templat.height = height;
+
+    memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID));
+
+    for (i = 0; i < num_surfaces; i++) {
+        vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface));
+        if (!surf)
+            goto no_res;
+
+        surf->templat = templat;
+
+        switch (memory_type) {
+            case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+                surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat);
+                if (!surf->buffer)
+                    goto no_res;
+                util_dynarray_init(&surf->subpics);
+                surfaces[i] = handle_table_add(drv->htab, surf);
+                break;
+            case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+                vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat);
+                if (vaStatus != VA_STATUS_SUCCESS)
+                  goto no_res;
+                break;
+            default:
+                assert(0);
+        }
+    }
+
+    return VA_STATUS_SUCCESS;
+
+no_res:
+   if (i)
+      vlVaDestroySurfaces(ctx, surfaces, i);
+
+   return VA_STATUS_ERROR_ALLOCATION_FAILED;
+}
+
+VAStatus
+vlVaQueryVideoProcFilters(VADriverContextP ctx, VAContextID context,
+                          VAProcFilterType *filters, unsigned int *num_filters)
+{
+   unsigned int num = 0;
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   if (!num_filters || !filters)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   filters[num++] = VAProcFilterNone;
+
+   *num_filters = num;
+
+   return VA_STATUS_SUCCESS;
+}
+
+VAStatus
+vlVaQueryVideoProcFilterCaps(VADriverContextP ctx, VAContextID context,
+                             VAProcFilterType type, void *filter_caps,
+                             unsigned int *num_filter_caps)
+{
+   unsigned int i;
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   if (!filter_caps || !num_filter_caps)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   i = 0;
+
+   switch (type) {
+   case VAProcFilterNone:
+      break;
+   case VAProcFilterNoiseReduction:
+   case VAProcFilterDeinterlacing:
+   case VAProcFilterSharpening:
+   case VAProcFilterColorBalance:
+   case VAProcFilterSkinToneEnhancement:
+      return VA_STATUS_ERROR_UNIMPLEMENTED;
+   default:
+      assert(0);
+   }
+
+   *num_filter_caps = i;
+
+   return VA_STATUS_SUCCESS;
+}
+
+static VAProcColorStandardType vpp_input_color_standards[VAProcColorStandardCount] = {
+   VAProcColorStandardBT601
+};
+
+static VAProcColorStandardType vpp_output_color_standards[VAProcColorStandardCount] = {
+   VAProcColorStandardBT601
+};
+
+VAStatus
+vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context,
+                               VABufferID *filters, unsigned int num_filters,
+                               VAProcPipelineCaps *pipeline_cap)
+{
+   unsigned int i = 0;
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   if (!pipeline_cap)
+   return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   if (num_filters && !filters)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   pipeline_cap->pipeline_flags = 0;
+   pipeline_cap->filter_flags = 0;
+   pipeline_cap->num_forward_references = 0;
+   pipeline_cap->num_backward_references = 0;
+   pipeline_cap->num_input_color_standards = 1;
+   pipeline_cap->input_color_standards = vpp_input_color_standards;
+   pipeline_cap->num_output_color_standards = 1;
+   pipeline_cap->output_color_standards = vpp_output_color_standards;
+
+   for (i = 0; i < num_filters; i++) {
+      vlVaBuffer *buf = handle_table_get(VL_VA_DRIVER(ctx)->htab, filters[i]);
+
+      if (!buf || buf->type >= VABufferTypeMax)
+         return VA_STATUS_ERROR_INVALID_BUFFER;
+   }
+
+   return VA_STATUS_SUCCESS;
+}
diff --git a/src/gallium/state_trackers/va/va_private.h b/src/gallium/state_trackers/va/va_private.h
index 1ea7be79aa3..2b645d08a03 100644
--- a/src/gallium/state_trackers/va/va_private.h
+++ b/src/gallium/state_trackers/va/va_private.h
@@ -33,6 +33,8 @@
 
 #include <va/va.h>
 #include <va/va_backend.h>
+#include <va/va_backend_vpp.h>
+#include <va/va_drmcommon.h>
 
 #include "pipe/p_video_enums.h"
 #include "pipe/p_video_codec.h"
@@ -46,7 +48,7 @@
 #define VL_VA_DRIVER(ctx) ((vlVaDriver *)ctx->pDriverData)
 #define VL_VA_PSCREEN(ctx) (VL_VA_DRIVER(ctx)->vscreen->pscreen)
 
-#define VL_VA_MAX_IMAGE_FORMATS 6
+#define VL_VA_MAX_IMAGE_FORMATS 9
 
 static inline enum pipe_video_chroma_format
 ChromaToPipe(int format)
@@ -59,13 +61,12 @@ ChromaToPipe(int format)
    case VA_RT_FORMAT_YUV444:
       return PIPE_VIDEO_CHROMA_FORMAT_444;
    default:
-      assert(0);
-      return PIPE_VIDEO_CHROMA_FORMAT_420;
+      return PIPE_VIDEO_CHROMA_FORMAT_NONE;
    }
 }
 
 static inline enum pipe_format
-YCbCrToPipe(unsigned format)
+VaFourccToPipeFormat(unsigned format)
 {
    switch(format) {
    case VA_FOURCC('N','V','1','2'):
@@ -80,12 +81,46 @@ YCbCrToPipe(unsigned format)
       return PIPE_FORMAT_UYVY;
    case VA_FOURCC('B','G','R','A'):
       return PIPE_FORMAT_B8G8R8A8_UNORM;
+   case VA_FOURCC('R','G','B','A'):
+      return PIPE_FORMAT_R8G8B8A8_UNORM;
+   case VA_FOURCC('B','G','R','X'):
+      return PIPE_FORMAT_B8G8R8X8_UNORM;
+   case VA_FOURCC('R','G','B','X'):
+      return PIPE_FORMAT_R8G8B8X8_UNORM;
    default:
       assert(0);
       return PIPE_FORMAT_NONE;
    }
 }
 
+static inline unsigned
+PipeFormatToVaFourcc(enum pipe_format p_format)
+{
+   switch (p_format) {
+   case PIPE_FORMAT_NV12:
+      return VA_FOURCC('N','V','1','2');
+   case PIPE_FORMAT_IYUV:
+      return VA_FOURCC('I','4','2','0');
+   case PIPE_FORMAT_YV12:
+      return VA_FOURCC('Y','V','1','2');
+   case PIPE_FORMAT_UYVY:
+      return VA_FOURCC('U','Y','V','Y');
+   case PIPE_FORMAT_YUYV:
+      return VA_FOURCC('Y','U','Y','V');
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return VA_FOURCC('B','G','R','A');
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return VA_FOURCC('R','G','B','A');
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return VA_FOURCC('B','G','R','X');
+   case PIPE_FORMAT_R8G8B8X8_UNORM:
+      return VA_FOURCC('R','G','B','X');
+   default:
+      assert(0);
+      return -1;
+   }
+}
+
 static inline VAProfile
 PipeToProfile(enum pipe_video_profile profile)
 {
@@ -110,8 +145,11 @@ PipeToProfile(enum pipe_video_profile profile)
       return VAProfileH264Main;
    case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH:
       return VAProfileH264High;
+   case PIPE_VIDEO_PROFILE_HEVC_MAIN:
+      return VAProfileHEVCMain;
    case PIPE_VIDEO_PROFILE_MPEG4_AVC_EXTENDED:
-       return VAProfileNone;
+   case PIPE_VIDEO_PROFILE_UNKNOWN:
+      return VAProfileNone;
    default:
       assert(0);
       return -1;
@@ -142,6 +180,10 @@ ProfileToPipe(VAProfile profile)
       return PIPE_VIDEO_PROFILE_MPEG4_AVC_MAIN;
    case VAProfileH264High:
       return PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH;
+   case VAProfileHEVCMain:
+      return PIPE_VIDEO_PROFILE_HEVC_MAIN;
+   case VAProfileNone:
+       return PIPE_VIDEO_PROFILE_UNKNOWN;
    default:
       return PIPE_VIDEO_PROFILE_UNKNOWN;
    }
@@ -174,6 +216,7 @@ typedef struct {
       struct pipe_mpeg4_picture_desc mpeg4;
       struct pipe_vc1_picture_desc vc1;
       struct pipe_h264_picture_desc h264;
+      struct pipe_h265_picture_desc h265;
    } desc;
 
    struct {
@@ -191,6 +234,13 @@ typedef struct {
    unsigned int size;
    unsigned int num_elements;
    void *data;
+   struct {
+      struct pipe_resource *resource;
+      struct pipe_transfer *transfer;
+      struct pipe_fence_handle *fence;
+   } derived_surface;
+   unsigned int export_refcount;
+   VABufferInfo export_state;
 } vlVaBuffer;
 
 typedef struct {
@@ -275,5 +325,19 @@ VAStatus vlVaLockSurface(VADriverContextP ctx, VASurfaceID surface, unsigned int
                          unsigned int *luma_offset, unsigned int *chroma_u_offset, unsigned int *chroma_v_offset,
                          unsigned int *buffer_name, void **buffer);
 VAStatus vlVaUnlockSurface(VADriverContextP ctx, VASurfaceID surface);
+VAStatus vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format, unsigned int width, unsigned int height,
+                             VASurfaceID *surfaces, unsigned int num_surfaces, VASurfaceAttrib *attrib_list,
+                             unsigned int num_attribs);
+VAStatus vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config, VASurfaceAttrib *attrib_list,
+                                    unsigned int *num_attribs);
+
+VAStatus vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id, VABufferInfo *out_buf_info);
+VAStatus vlVaReleaseBufferHandle(VADriverContextP ctx, VABufferID buf_id);
 
+VAStatus vlVaQueryVideoProcFilters(VADriverContextP ctx, VAContextID context, VAProcFilterType *filters,
+                                   unsigned int *num_filters);
+VAStatus vlVaQueryVideoProcFilterCaps(VADriverContextP ctx, VAContextID context, VAProcFilterType type,
+                                      void *filter_caps, unsigned int *num_filter_caps);
+VAStatus vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context, VABufferID *filters,
+                                        unsigned int num_filters, VAProcPipelineCaps *pipeline_cap);
 #endif //VA_PRIVATE_H
diff --git a/src/gallium/state_trackers/vdpau/decode.c b/src/gallium/state_trackers/vdpau/decode.c
index 3233799d650..f85bce823bb 100644
--- a/src/gallium/state_trackers/vdpau/decode.c
+++ b/src/gallium/state_trackers/vdpau/decode.c
@@ -518,6 +518,7 @@ vlVdpDecoderRenderH265(struct pipe_h265_picture_desc *picture,
    memcpy(picture->RefPicSetStCurrBefore, picture_info->RefPicSetStCurrBefore, 8);
    memcpy(picture->RefPicSetStCurrAfter, picture_info->RefPicSetStCurrAfter, 8);
    memcpy(picture->RefPicSetLtCurr, picture_info->RefPicSetLtCurr, 8);
+   picture->UseRefPicList = false;
 
    return VDP_STATUS_OK;
 }
diff --git a/src/gallium/targets/dri/Makefile.am b/src/gallium/targets/dri/Makefile.am
index 7f945d14b5c..95efdd4451c 100644
--- a/src/gallium/targets/dri/Makefile.am
+++ b/src/gallium/targets/dri/Makefile.am
@@ -83,6 +83,8 @@ include $(top_srcdir)/src/gallium/drivers/freedreno/Automake.inc
 
 include $(top_srcdir)/src/gallium/drivers/vc4/Automake.inc
 
+include $(top_srcdir)/src/gallium/drivers/virgl/Automake.inc
+
 include $(top_srcdir)/src/gallium/drivers/softpipe/Automake.inc
 include $(top_srcdir)/src/gallium/drivers/llvmpipe/Automake.inc
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
index 8882c418e12..90fe0cd50f1 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
@@ -151,11 +151,15 @@ enum {
 
 /* CZ specific rev IDs */
 enum {
-	CZ_CARRIZO_A0      = 0x01,
+	CARRIZO_A0   = 0x01,
+    STONEY_A0    = 0x61,
 	CZ_UNKNOWN      = 0xFF
 };
 
 #define ASICREV_IS_CARRIZO(eChipRev) \
-	(eChipRev >= CARRIZO_A0)
+	((eChipRev >= CARRIZO_A0) && (eChipRev < STONEY_A0))
+
+#define ASICREV_IS_STONEY(eChipRev) \
+	((eChipRev >= STONEY_A0) && (eChipRev < CZ_UNKNOWN))
 
 #endif /* AMDGPU_ID_H */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
index 358df381011..3006bd17958 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
@@ -175,7 +175,9 @@ static int compute_level(struct amdgpu_winsys *ws,
                          struct radeon_surf *surf, bool is_stencil,
                          unsigned level, unsigned type, bool compressed,
                          ADDR_COMPUTE_SURFACE_INFO_INPUT *AddrSurfInfoIn,
-                         ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut)
+                         ADDR_COMPUTE_SURFACE_INFO_OUTPUT *AddrSurfInfoOut,
+                         ADDR_COMPUTE_DCCINFO_INPUT *AddrDccIn,
+                         ADDR_COMPUTE_DCCINFO_OUTPUT *AddrDccOut)
 {
    struct radeon_surf_level *surf_level;
    ADDR_E_RETURNCODE ret;
@@ -248,6 +250,31 @@ static int compute_level(struct amdgpu_winsys *ws,
       surf->tiling_index[level] = AddrSurfInfoOut->tileIndex;
 
    surf->bo_size = surf_level->offset + AddrSurfInfoOut->surfSize;
+
+   if (AddrSurfInfoIn->flags.dccCompatible) {
+      AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize;
+      AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
+      AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
+      AddrDccIn->tileIndex = AddrSurfInfoOut->tileIndex;
+      AddrDccIn->macroModeIndex = AddrSurfInfoOut->macroModeIndex;
+
+      ret = AddrComputeDccInfo(ws->addrlib,
+                               AddrDccIn,
+                               AddrDccOut);
+
+      if (ret == ADDR_OK) {
+         surf_level->dcc_offset = surf->dcc_size;
+         surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
+         surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
+      } else {
+         surf->dcc_size = 0;
+         surf_level->dcc_offset = 0;
+      }
+   } else {
+      surf->dcc_size = 0;
+      surf_level->dcc_offset = 0;
+   }
+
    return 0;
 }
 
@@ -259,6 +286,8 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    bool compressed;
    ADDR_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
    ADDR_COMPUTE_SURFACE_INFO_OUTPUT AddrSurfInfoOut = {0};
+   ADDR_COMPUTE_DCCINFO_INPUT AddrDccIn = {0};
+   ADDR_COMPUTE_DCCINFO_OUTPUT AddrDccOut = {0};
    ADDR_TILEINFO AddrTileInfoIn = {0};
    ADDR_TILEINFO AddrTileInfoOut = {0};
    int r;
@@ -269,6 +298,8 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
 
    AddrSurfInfoIn.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_INPUT);
    AddrSurfInfoOut.size = sizeof(ADDR_COMPUTE_SURFACE_INFO_OUTPUT);
+   AddrDccIn.size = sizeof(ADDR_COMPUTE_DCCINFO_INPUT);
+   AddrDccOut.size = sizeof(ADDR_COMPUTE_DCCINFO_OUTPUT);
    AddrSurfInfoOut.pTileInfo = &AddrTileInfoOut;
 
    type = RADEON_SURF_GET(surf->flags, TYPE);
@@ -318,10 +349,10 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
       }
    }
    else {
-      AddrSurfInfoIn.bpp = surf->bpe * 8;
+      AddrDccIn.bpp = AddrSurfInfoIn.bpp = surf->bpe * 8;
    }
 
-   AddrSurfInfoIn.numSamples = surf->nsamples;
+   AddrDccIn.numSamples = AddrSurfInfoIn.numSamples = surf->nsamples;
    AddrSurfInfoIn.tileIndex = -1;
 
    /* Set the micro tile type. */
@@ -339,6 +370,9 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    AddrSurfInfoIn.flags.display = (surf->flags & RADEON_SURF_SCANOUT) != 0;
    AddrSurfInfoIn.flags.pow2Pad = surf->last_level > 0;
    AddrSurfInfoIn.flags.degrade4Space = 1;
+   AddrSurfInfoIn.flags.dccCompatible = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
+                                        !(surf->flags & RADEON_SURF_SCANOUT) &&
+                                        !compressed && AddrDccIn.numSamples <= 1;
 
    /* This disables incorrect calculations (hacks) in addrlib. */
    AddrSurfInfoIn.flags.noStencil = 1;
@@ -375,11 +409,13 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    }
 
    surf->bo_size = 0;
+   surf->dcc_size = 0;
+   surf->dcc_alignment = 1;
 
    /* Calculate texture layout information. */
    for (level = 0; level <= surf->last_level; level++) {
       r = compute_level(ws, surf, false, level, type, compressed,
-                        &AddrSurfInfoIn, &AddrSurfInfoOut);
+                        &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
       if (r)
          return r;
 
@@ -406,7 +442,7 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
 
       for (level = 0; level <= surf->last_level; level++) {
          r = compute_level(ws, surf, true, level, type, compressed,
-                           &AddrSurfInfoIn, &AddrSurfInfoOut);
+                           &AddrSurfInfoIn, &AddrSurfInfoOut, &AddrDccIn, &AddrDccOut);
          if (r)
             return r;
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index c8772490e74..32cd9d9aa50 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -226,7 +226,11 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws)
       break;
    case CHIP_CARRIZO:
       ws->family = FAMILY_CZ;
-      ws->rev_id = CZ_CARRIZO_A0;
+      ws->rev_id = CARRIZO_A0;
+      break;
+   case CHIP_STONEY:
+      ws->family = FAMILY_CZ;
+      ws->rev_id = STONEY_A0;
       break;
    case CHIP_FIJI:
       ws->family = FAMILY_VI;
diff --git a/src/gallium/winsys/svga/drm/vmw_context.c b/src/gallium/winsys/svga/drm/vmw_context.c
index 1675af4cbc8..4dc32366d61 100644
--- a/src/gallium/winsys/svga/drm/vmw_context.c
+++ b/src/gallium/winsys/svga/drm/vmw_context.c
@@ -251,6 +251,7 @@ vmw_swc_flush(struct svga_winsys_context *swc,
    vswc->must_flush = FALSE;
    debug_flush_flush(vswc->fctx);
 #endif
+   swc->hints &= ~SVGA_HINT_FLAG_DRAW_EMITTED;
    vswc->preemptive_flush = FALSE;
    vswc->seen_surfaces = 0;
    vswc->seen_regions = 0;
@@ -372,7 +373,8 @@ vmw_swc_region_relocation(struct svga_winsys_context *swc,
 
    if (vmw_swc_add_validate_buffer(vswc, reloc->buffer, flags)) {
       vswc->seen_regions += reloc->buffer->size;
-      if(vswc->seen_regions >= VMW_GMR_POOL_SIZE/5)
+      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+          vswc->seen_regions >= VMW_GMR_POOL_SIZE/5)
          vswc->preemptive_flush = TRUE;
    }
 
@@ -413,8 +415,10 @@ vmw_swc_mob_relocation(struct svga_winsys_context *swc,
 
    if (vmw_swc_add_validate_buffer(vswc, pb_buffer, flags)) {
       vswc->seen_mobs += pb_buffer->size;
-      /* divide by 5, tested for best performance */
-      if (vswc->seen_mobs >= vswc->vws->ioctl.max_mob_memory / VMW_MAX_MOB_MEM_FACTOR)
+
+      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+          vswc->seen_mobs >=
+            vswc->vws->ioctl.max_mob_memory / VMW_MAX_MOB_MEM_FACTOR)
          vswc->preemptive_flush = TRUE;
    }
 
@@ -475,8 +479,9 @@ vmw_swc_surface_only_relocation(struct svga_winsys_context *swc,
       ++vswc->surface.staged;
 
       vswc->seen_surfaces += vsurf->size;
-      /* divide by 5 not well tuned for performance */
-      if (vswc->seen_surfaces >= vswc->vws->ioctl.max_surface_memory / VMW_MAX_SURF_MEM_FACTOR)
+      if ((swc->hints & SVGA_HINT_FLAG_DRAW_EMITTED) &&
+          vswc->seen_surfaces >=
+            vswc->vws->ioctl.max_surface_memory / VMW_MAX_SURF_MEM_FACTOR)
          vswc->preemptive_flush = TRUE;
    }
 
diff --git a/src/gallium/winsys/sw/dri/dri_sw_winsys.c b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
index 8451d832806..5c98f2603c7 100644
--- a/src/gallium/winsys/sw/dri/dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
@@ -44,8 +44,10 @@ struct dri_sw_displaytarget
    unsigned height;
    unsigned stride;
 
+   unsigned map_flags;
    void *data;
    void *mapped;
+   const void *front_private;
 };
 
 struct dri_sw_winsys
@@ -83,6 +85,7 @@ dri_sw_displaytarget_create(struct sw_winsys *winsys,
                             enum pipe_format format,
                             unsigned width, unsigned height,
                             unsigned alignment,
+                            const void *front_private,
                             unsigned *stride)
 {
    struct dri_sw_displaytarget *dri_sw_dt;
@@ -95,6 +98,7 @@ dri_sw_displaytarget_create(struct sw_winsys *winsys,
    dri_sw_dt->format = format;
    dri_sw_dt->width = width;
    dri_sw_dt->height = height;
+   dri_sw_dt->front_private = front_private;
 
    format_stride = util_format_get_stride(format, width);
    dri_sw_dt->stride = align(format_stride, alignment);
@@ -133,6 +137,12 @@ dri_sw_displaytarget_map(struct sw_winsys *ws,
 {
    struct dri_sw_displaytarget *dri_sw_dt = dri_sw_displaytarget(dt);
    dri_sw_dt->mapped = dri_sw_dt->data;
+
+   if (dri_sw_dt->front_private && (flags & PIPE_TRANSFER_READ)) {
+      struct dri_sw_winsys *dri_sw_ws = dri_sw_winsys(ws);
+      dri_sw_ws->lf->get_image((void *)dri_sw_dt->front_private, 0, 0, dri_sw_dt->width, dri_sw_dt->height, dri_sw_dt->stride, dri_sw_dt->data);
+   }
+   dri_sw_dt->map_flags = flags;
    return dri_sw_dt->mapped;
 }
 
@@ -141,6 +151,11 @@ dri_sw_displaytarget_unmap(struct sw_winsys *ws,
                            struct sw_displaytarget *dt)
 {
    struct dri_sw_displaytarget *dri_sw_dt = dri_sw_displaytarget(dt);
+   if (dri_sw_dt->front_private && (dri_sw_dt->map_flags & PIPE_TRANSFER_WRITE)) {
+      struct dri_sw_winsys *dri_sw_ws = dri_sw_winsys(ws);
+      dri_sw_ws->lf->put_image2((void *)dri_sw_dt->front_private, dri_sw_dt->data, 0, 0, dri_sw_dt->width, dri_sw_dt->height, dri_sw_dt->stride);
+   }
+   dri_sw_dt->map_flags = 0;
    dri_sw_dt->mapped = NULL;
 }
 
diff --git a/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c b/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c
index dc725f4b90c..16f641833c6 100644
--- a/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c
+++ b/src/gallium/winsys/sw/gdi/gdi_sw_winsys.c
@@ -124,6 +124,7 @@ gdi_sw_displaytarget_create(struct sw_winsys *winsys,
                                   enum pipe_format format,
                                   unsigned width, unsigned height,
                                   unsigned alignment,
+                                  const void *front_private,
                                   unsigned *stride)
 {
    struct gdi_sw_displaytarget *gdt;
diff --git a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
index 900c49f83e6..1e859717f1c 100644
--- a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
@@ -111,6 +111,7 @@ kms_sw_displaytarget_create(struct sw_winsys *ws,
                             enum pipe_format format,
                             unsigned width, unsigned height,
                             unsigned alignment,
+                            const void *front_private,
                             unsigned *stride)
 {
    struct kms_sw_winsys *kms_sw = kms_sw_winsys(ws);
diff --git a/src/gallium/winsys/sw/null/null_sw_winsys.c b/src/gallium/winsys/sw/null/null_sw_winsys.c
index 9c8b3ec4396..10ce2508507 100644
--- a/src/gallium/winsys/sw/null/null_sw_winsys.c
+++ b/src/gallium/winsys/sw/null/null_sw_winsys.c
@@ -84,6 +84,7 @@ null_sw_displaytarget_create(struct sw_winsys *winsys,
                              enum pipe_format format,
                              unsigned width, unsigned height,
                              unsigned alignment,
+                             const void *front_private,
                              unsigned *stride)
 {
    fprintf(stderr, "null_sw_displaytarget_create() returning NULL\n");
diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
index 5c179930d9b..4d87a580cb1 100644
--- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
+++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
@@ -148,6 +148,7 @@ wsw_dt_create(struct sw_winsys *ws,
               enum pipe_format format,
               unsigned width, unsigned height,
               unsigned alignment,
+              const void *front_private,
               unsigned *stride)
 {
    struct wrapper_sw_winsys *wsw = wrapper_sw_winsys(ws);
diff --git a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
index 515ecd9f7b7..cc2a3de9dd3 100644
--- a/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
+++ b/src/gallium/winsys/sw/xlib/xlib_sw_winsys.c
@@ -391,6 +391,7 @@ xlib_displaytarget_create(struct sw_winsys *winsys,
                           enum pipe_format format,
                           unsigned width, unsigned height,
                           unsigned alignment,
+                          const void *front_private,
                           unsigned *stride)
 {
    struct xlib_displaytarget *xlib_dt;
diff --git a/src/gallium/winsys/virgl/drm/Makefile.am b/src/gallium/winsys/virgl/drm/Makefile.am
new file mode 100644
index 00000000000..2473b88ad6e
--- /dev/null
+++ b/src/gallium/winsys/virgl/drm/Makefile.am
@@ -0,0 +1,33 @@
+# Copyright © 2015 Red Hat Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+include Makefile.sources
+include $(top_srcdir)/src/gallium/Automake.inc
+
+AM_CFLAGS = \
+	-I$(top_srcdir)/src/gallium/drivers \
+	$(GALLIUM_WINSYS_CFLAGS) \
+	$(LIBDRM_CFLAGS)
+
+noinst_LTLIBRARIES = libvirgldrm.la
+
+libvirgldrm_la_SOURCES = $(C_SOURCES)
diff --git a/src/gallium/winsys/virgl/drm/Makefile.sources b/src/gallium/winsys/virgl/drm/Makefile.sources
new file mode 100644
index 00000000000..0430d4ada4a
--- /dev/null
+++ b/src/gallium/winsys/virgl/drm/Makefile.sources
@@ -0,0 +1,5 @@
+C_SOURCES := \
+	virgl_drm_public.h \
+	virgl_drm_winsys.c \
+	virgl_drm_winsys.h \
+	virtgpu_drm.h
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_public.h b/src/gallium/winsys/virgl/drm/virgl_drm_public.h
new file mode 100644
index 00000000000..be01021ca9a
--- /dev/null
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_public.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_DRM_PUBLIC_H
+#define VIRGL_DRM_PUBLIC_H
+
+struct virgl_winsys;
+
+struct virgl_winsys *virgl_drm_winsys_create(int drmFD);
+
+#endif
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
new file mode 100644
index 00000000000..d77ebd6ca15
--- /dev/null
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
@@ -0,0 +1,774 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+
+#include "os/os_mman.h"
+#include "os/os_time.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_hash_table.h"
+#include "util/u_inlines.h"
+#include "state_tracker/drm_driver.h"
+
+#include <xf86drm.h>
+#include "virtgpu_drm.h"
+
+#include "virgl_drm_winsys.h"
+#include "virgl_drm_public.h"
+
+static inline boolean can_cache_resource(struct virgl_hw_res *res)
+{
+   return res->cacheable == TRUE;
+}
+
+static void virgl_hw_res_destroy(struct virgl_drm_winsys *qdws,
+                                 struct virgl_hw_res *res)
+{
+      struct drm_gem_close args;
+
+      if (res->name) {
+         pipe_mutex_lock(qdws->bo_handles_mutex);
+         util_hash_table_remove(qdws->bo_handles,
+                                (void *)(uintptr_t)res->name);
+         pipe_mutex_unlock(qdws->bo_handles_mutex);
+      }
+
+      if (res->ptr)
+         os_munmap(res->ptr, res->size);
+
+      memset(&args, 0, sizeof(args));
+      args.handle = res->bo_handle;
+      drmIoctl(qdws->fd, DRM_IOCTL_GEM_CLOSE, &args);
+      FREE(res);
+}
+
+static boolean virgl_drm_resource_is_busy(struct virgl_drm_winsys *qdws,
+                                          struct virgl_hw_res *res)
+{
+   struct drm_virtgpu_3d_wait waitcmd;
+   int ret;
+
+   memset(&waitcmd, 0, sizeof(waitcmd));
+   waitcmd.handle = res->bo_handle;
+   waitcmd.flags = VIRTGPU_WAIT_NOWAIT;
+
+   ret = drmIoctl(qdws->fd, DRM_IOCTL_VIRTGPU_WAIT, &waitcmd);
+   if (ret && errno == EBUSY)
+      return TRUE;
+   return FALSE;
+}
+
+static void
+virgl_cache_flush(struct virgl_drm_winsys *qdws)
+{
+   struct list_head *curr, *next;
+   struct virgl_hw_res *res;
+
+   pipe_mutex_lock(qdws->mutex);
+   curr = qdws->delayed.next;
+   next = curr->next;
+
+   while (curr != &qdws->delayed) {
+      res = LIST_ENTRY(struct virgl_hw_res, curr, head);
+      LIST_DEL(&res->head);
+      virgl_hw_res_destroy(qdws, res);
+      curr = next;
+      next = curr->next;
+   }
+   pipe_mutex_unlock(qdws->mutex);
+}
+static void
+virgl_drm_winsys_destroy(struct virgl_winsys *qws)
+{
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+
+   virgl_cache_flush(qdws);
+
+   util_hash_table_destroy(qdws->bo_handles);
+   pipe_mutex_destroy(qdws->bo_handles_mutex);
+   pipe_mutex_destroy(qdws->mutex);
+
+   FREE(qdws);
+}
+
+static void
+virgl_cache_list_check_free(struct virgl_drm_winsys *qdws)
+{
+   struct list_head *curr, *next;
+   struct virgl_hw_res *res;
+   int64_t now;
+
+   now = os_time_get();
+   curr = qdws->delayed.next;
+   next = curr->next;
+   while (curr != &qdws->delayed) {
+      res = LIST_ENTRY(struct virgl_hw_res, curr, head);
+      if (!os_time_timeout(res->start, res->end, now))
+         break;
+
+      LIST_DEL(&res->head);
+      virgl_hw_res_destroy(qdws, res);
+      curr = next;
+      next = curr->next;
+   }
+}
+
+static void virgl_drm_resource_reference(struct virgl_drm_winsys *qdws,
+                                       struct virgl_hw_res **dres,
+                                       struct virgl_hw_res *sres)
+{
+   struct virgl_hw_res *old = *dres;
+   if (pipe_reference(&(*dres)->reference, &sres->reference)) {
+
+      if (!can_cache_resource(old)) {
+         virgl_hw_res_destroy(qdws, old);
+      } else {
+         pipe_mutex_lock(qdws->mutex);
+         virgl_cache_list_check_free(qdws);
+
+         old->start = os_time_get();
+         old->end = old->start + qdws->usecs;
+         LIST_ADDTAIL(&old->head, &qdws->delayed);
+         qdws->num_delayed++;
+         pipe_mutex_unlock(qdws->mutex);
+      }
+   }
+   *dres = sres;
+}
+
+static struct virgl_hw_res *
+virgl_drm_winsys_resource_create(struct virgl_winsys *qws,
+                                 enum pipe_texture_target target,
+                                 uint32_t format,
+                                 uint32_t bind,
+                                 uint32_t width,
+                                 uint32_t height,
+                                 uint32_t depth,
+                                 uint32_t array_size,
+                                 uint32_t last_level,
+                                 uint32_t nr_samples,
+                                 uint32_t size)
+{
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+   struct drm_virtgpu_resource_create createcmd;
+   int ret;
+   struct virgl_hw_res *res;
+   uint32_t stride = width * util_format_get_blocksize(format);
+
+   res = CALLOC_STRUCT(virgl_hw_res);
+   if (!res)
+      return NULL;
+
+   memset(&createcmd, 0, sizeof(createcmd));
+   createcmd.target = target;
+   createcmd.format = format;
+   createcmd.bind = bind;
+   createcmd.width = width;
+   createcmd.height = height;
+   createcmd.depth = depth;
+   createcmd.array_size = array_size;
+   createcmd.last_level = last_level;
+   createcmd.nr_samples = nr_samples;
+   createcmd.stride = stride;
+   createcmd.size = size;
+
+   ret = drmIoctl(qdws->fd, DRM_IOCTL_VIRTGPU_RESOURCE_CREATE, &createcmd);
+   if (ret != 0) {
+      FREE(res);
+      return NULL;
+   }
+
+   res->bind = bind;
+   res->format = format;
+
+   res->res_handle = createcmd.res_handle;
+   res->bo_handle = createcmd.bo_handle;
+   res->size = size;
+   res->stride = stride;
+   pipe_reference_init(&res->reference, 1);
+   res->num_cs_references = 0;
+   return res;
+}
+
+static inline int virgl_is_res_compat(struct virgl_drm_winsys *qdws,
+                                      struct virgl_hw_res *res,
+                                      uint32_t size, uint32_t bind,
+                                      uint32_t format)
+{
+   if (res->bind != bind)
+      return 0;
+   if (res->format != format)
+      return 0;
+   if (res->size < size)
+      return 0;
+   if (res->size > size * 2)
+      return 0;
+
+   if (virgl_drm_resource_is_busy(qdws, res)) {
+      return -1;
+   }
+
+   return 1;
+}
+
+static int
+virgl_bo_transfer_put(struct virgl_winsys *vws,
+                      struct virgl_hw_res *res,
+                      const struct pipe_box *box,
+                      uint32_t stride, uint32_t layer_stride,
+                      uint32_t buf_offset, uint32_t level)
+{
+   struct virgl_drm_winsys *vdws = virgl_drm_winsys(vws);
+   struct drm_virtgpu_3d_transfer_to_host tohostcmd;
+
+   memset(&tohostcmd, 0, sizeof(tohostcmd));
+   tohostcmd.bo_handle = res->bo_handle;
+   tohostcmd.box = *(struct drm_virtgpu_3d_box *)box;
+   tohostcmd.offset = buf_offset;
+   tohostcmd.level = level;
+  // tohostcmd.stride = stride;
+  // tohostcmd.layer_stride = stride;
+   return drmIoctl(vdws->fd, DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST, &tohostcmd);
+}
+
+static int
+virgl_bo_transfer_get(struct virgl_winsys *vws,
+                      struct virgl_hw_res *res,
+                      const struct pipe_box *box,
+                      uint32_t stride, uint32_t layer_stride,
+                      uint32_t buf_offset, uint32_t level)
+{
+   struct virgl_drm_winsys *vdws = virgl_drm_winsys(vws);
+   struct drm_virtgpu_3d_transfer_from_host fromhostcmd;
+
+   memset(&fromhostcmd, 0, sizeof(fromhostcmd));
+   fromhostcmd.bo_handle = res->bo_handle;
+   fromhostcmd.level = level;
+   fromhostcmd.offset = buf_offset;
+  // fromhostcmd.stride = stride;
+  // fromhostcmd.layer_stride = layer_stride;
+   fromhostcmd.box = *(struct drm_virtgpu_3d_box *)box;
+   return drmIoctl(vdws->fd, DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST, &fromhostcmd);
+}
+
+static struct virgl_hw_res *
+virgl_drm_winsys_resource_cache_create(struct virgl_winsys *qws,
+                                       enum pipe_texture_target target,
+                                       uint32_t format,
+                                       uint32_t bind,
+                                       uint32_t width,
+                                       uint32_t height,
+                                       uint32_t depth,
+                                       uint32_t array_size,
+                                       uint32_t last_level,
+                                       uint32_t nr_samples,
+                                       uint32_t size)
+{
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+   struct virgl_hw_res *res, *curr_res;
+   struct list_head *curr, *next;
+   int64_t now;
+   int ret;
+
+   /* only store binds for vertex/index/const buffers */
+   if (bind != VIRGL_BIND_CONSTANT_BUFFER && bind != VIRGL_BIND_INDEX_BUFFER &&
+       bind != VIRGL_BIND_VERTEX_BUFFER && bind != VIRGL_BIND_CUSTOM)
+      goto alloc;
+
+   pipe_mutex_lock(qdws->mutex);
+
+   res = NULL;
+   curr = qdws->delayed.next;
+   next = curr->next;
+
+   now = os_time_get();
+   while (curr != &qdws->delayed) {
+      curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
+
+      if (!res && (ret = virgl_is_res_compat(qdws, curr_res, size, bind, format) > 0))
+         res = curr_res;
+      else if (os_time_timeout(curr_res->start, curr_res->end, now)) {
+         LIST_DEL(&curr_res->head);
+         virgl_hw_res_destroy(qdws, curr_res);
+      } else
+         break;
+
+      if (ret == -1)
+         break;
+
+      curr = next;
+      next = curr->next;
+   }
+
+   if (!res && ret != -1) {
+      while (curr != &qdws->delayed) {
+         curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
+         ret = virgl_is_res_compat(qdws, curr_res, size, bind, format);
+         if (ret > 0) {
+            res = curr_res;
+            break;
+         }
+         if (ret == -1)
+            break;
+         curr = next;
+         next = curr->next;
+      }
+   }
+
+   if (res) {
+      LIST_DEL(&res->head);
+      --qdws->num_delayed;
+      pipe_mutex_unlock(qdws->mutex);
+      pipe_reference_init(&res->reference, 1);
+      return res;
+   }
+
+   pipe_mutex_unlock(qdws->mutex);
+
+alloc:
+   res = virgl_drm_winsys_resource_create(qws, target, format, bind,
+                                           width, height, depth, array_size,
+                                           last_level, nr_samples, size);
+   if (bind == VIRGL_BIND_CONSTANT_BUFFER || bind == VIRGL_BIND_INDEX_BUFFER ||
+       bind == VIRGL_BIND_VERTEX_BUFFER)
+      res->cacheable = TRUE;
+   return res;
+}
+
+static struct virgl_hw_res *
+virgl_drm_winsys_resource_create_handle(struct virgl_winsys *qws,
+                                        struct winsys_handle *whandle)
+{
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+   struct drm_gem_open open_arg = {};
+   struct drm_virtgpu_resource_info info_arg = {};
+   struct virgl_hw_res *res;
+
+   pipe_mutex_lock(qdws->bo_handles_mutex);
+
+   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+      res = util_hash_table_get(qdws->bo_handles, (void*)(uintptr_t)whandle->handle);
+      if (res) {
+         struct virgl_hw_res *r = NULL;
+         virgl_drm_resource_reference(qdws, &r, res);
+         goto done;
+      }
+   }
+
+   res = CALLOC_STRUCT(virgl_hw_res);
+   if (!res)
+      goto done;
+
+   if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+      int r;
+      uint32_t handle;
+      r = drmPrimeFDToHandle(qdws->fd, whandle->handle, &handle);
+      if (r) {
+         FREE(res);
+         res = NULL;
+         goto done;
+      }
+      res->bo_handle = handle;
+   } else {
+      memset(&open_arg, 0, sizeof(open_arg));
+      open_arg.name = whandle->handle;
+      if (drmIoctl(qdws->fd, DRM_IOCTL_GEM_OPEN, &open_arg)) {
+         FREE(res);
+         res = NULL;
+         goto done;
+      }
+      res->bo_handle = open_arg.handle;
+   }
+   res->name = whandle->handle;
+
+   memset(&info_arg, 0, sizeof(info_arg));
+   info_arg.bo_handle = res->bo_handle;
+
+   if (drmIoctl(qdws->fd, DRM_IOCTL_VIRTGPU_RESOURCE_INFO, &info_arg)) {
+      /* close */
+      FREE(res);
+      res = NULL;
+      goto done;
+   }
+
+   res->res_handle = info_arg.res_handle;
+
+   res->size = info_arg.size;
+   res->stride = info_arg.stride;
+   pipe_reference_init(&res->reference, 1);
+   res->num_cs_references = 0;
+
+   util_hash_table_set(qdws->bo_handles, (void *)(uintptr_t)whandle->handle, res);
+
+done:
+   pipe_mutex_unlock(qdws->bo_handles_mutex);
+   return res;
+}
+
+static boolean virgl_drm_winsys_resource_get_handle(struct virgl_winsys *qws,
+                                                    struct virgl_hw_res *res,
+                                                    uint32_t stride,
+                                                    struct winsys_handle *whandle)
+ {
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+   struct drm_gem_flink flink;
+
+   if (!res)
+       return FALSE;
+
+   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+      if (!res->flinked) {
+         memset(&flink, 0, sizeof(flink));
+         flink.handle = res->bo_handle;
+
+         if (drmIoctl(qdws->fd, DRM_IOCTL_GEM_FLINK, &flink)) {
+            return FALSE;
+         }
+         res->flinked = TRUE;
+         res->flink = flink.name;
+
+         pipe_mutex_lock(qdws->bo_handles_mutex);
+         util_hash_table_set(qdws->bo_handles, (void *)(uintptr_t)res->flink, res);
+         pipe_mutex_unlock(qdws->bo_handles_mutex);
+      }
+      whandle->handle = res->flink;
+   } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+      whandle->handle = res->bo_handle;
+   } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+      if (drmPrimeHandleToFD(qdws->fd, res->bo_handle, DRM_CLOEXEC, (int*)&whandle->handle))
+            return FALSE;
+   }
+   whandle->stride = stride;
+   return TRUE;
+}
+
+static void virgl_drm_winsys_resource_unref(struct virgl_winsys *qws,
+                                            struct virgl_hw_res *hres)
+{
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+
+   virgl_drm_resource_reference(qdws, &hres, NULL);
+}
+
+static void *virgl_drm_resource_map(struct virgl_winsys *qws,
+                                    struct virgl_hw_res *res)
+{
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+   struct drm_virtgpu_map mmap_arg;
+   void *ptr;
+
+   if (res->ptr)
+      return res->ptr;
+
+   memset(&mmap_arg, 0, sizeof(mmap_arg));
+   mmap_arg.handle = res->bo_handle;
+   if (drmIoctl(qdws->fd, DRM_IOCTL_VIRTGPU_MAP, &mmap_arg))
+      return NULL;
+
+   ptr = os_mmap(0, res->size, PROT_READ|PROT_WRITE, MAP_SHARED,
+                 qdws->fd, mmap_arg.offset);
+   if (ptr == MAP_FAILED)
+      return NULL;
+
+   res->ptr = ptr;
+   return ptr;
+
+}
+
+static void virgl_drm_resource_wait(struct virgl_winsys *qws,
+                                    struct virgl_hw_res *res)
+{
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+   struct drm_virtgpu_3d_wait waitcmd;
+   int ret;
+
+   memset(&waitcmd, 0, sizeof(waitcmd));
+   waitcmd.handle = res->bo_handle;
+ again:
+   ret = drmIoctl(qdws->fd, DRM_IOCTL_VIRTGPU_WAIT, &waitcmd);
+   if (ret == -EAGAIN)
+      goto again;
+}
+
+static struct virgl_cmd_buf *virgl_drm_cmd_buf_create(struct virgl_winsys *qws)
+{
+   struct virgl_drm_cmd_buf *cbuf;
+
+   cbuf = CALLOC_STRUCT(virgl_drm_cmd_buf);
+   if (!cbuf)
+      return NULL;
+
+   cbuf->ws = qws;
+
+   cbuf->nres = 512;
+   cbuf->res_bo = CALLOC(cbuf->nres, sizeof(struct virgl_hw_buf*));
+   if (!cbuf->res_bo) {
+      FREE(cbuf);
+      return NULL;
+   }
+   cbuf->res_hlist = MALLOC(cbuf->nres * sizeof(uint32_t));
+   if (!cbuf->res_hlist) {
+      FREE(cbuf->res_bo);
+      FREE(cbuf);
+      return NULL;
+   }
+
+   cbuf->base.buf = cbuf->buf;
+   return &cbuf->base;
+}
+
+static void virgl_drm_cmd_buf_destroy(struct virgl_cmd_buf *_cbuf)
+{
+   struct virgl_drm_cmd_buf *cbuf = virgl_drm_cmd_buf(_cbuf);
+
+   FREE(cbuf->res_hlist);
+   FREE(cbuf->res_bo);
+   FREE(cbuf);
+
+}
+
+static boolean virgl_drm_lookup_res(struct virgl_drm_cmd_buf *cbuf,
+                                    struct virgl_hw_res *res)
+{
+   unsigned hash = res->res_handle & (sizeof(cbuf->is_handle_added)-1);
+   int i;
+
+   if (cbuf->is_handle_added[hash]) {
+      i = cbuf->reloc_indices_hashlist[hash];
+      if (cbuf->res_bo[i] == res)
+         return true;
+
+      for (i = 0; i < cbuf->cres; i++) {
+         if (cbuf->res_bo[i] == res) {
+            cbuf->reloc_indices_hashlist[hash] = i;
+            return true;
+         }
+      }
+   }
+   return false;
+}
+
+static void virgl_drm_add_res(struct virgl_drm_winsys *qdws,
+                              struct virgl_drm_cmd_buf *cbuf,
+                              struct virgl_hw_res *res)
+{
+   unsigned hash = res->res_handle & (sizeof(cbuf->is_handle_added)-1);
+
+   if (cbuf->cres > cbuf->nres) {
+      fprintf(stderr,"failure to add relocation\n");
+      return;
+   }
+
+   cbuf->res_bo[cbuf->cres] = NULL;
+   virgl_drm_resource_reference(qdws, &cbuf->res_bo[cbuf->cres], res);
+   cbuf->res_hlist[cbuf->cres] = res->bo_handle;
+   cbuf->is_handle_added[hash] = TRUE;
+
+   cbuf->reloc_indices_hashlist[hash] = cbuf->cres;
+   p_atomic_inc(&res->num_cs_references);
+   cbuf->cres++;
+}
+
+static void virgl_drm_release_all_res(struct virgl_drm_winsys *qdws,
+                                      struct virgl_drm_cmd_buf *cbuf)
+{
+   int i;
+
+   for (i = 0; i < cbuf->cres; i++) {
+      p_atomic_dec(&cbuf->res_bo[i]->num_cs_references);
+      virgl_drm_resource_reference(qdws, &cbuf->res_bo[i], NULL);
+   }
+   cbuf->cres = 0;
+}
+
+static void virgl_drm_emit_res(struct virgl_winsys *qws,
+                               struct virgl_cmd_buf *_cbuf,
+                               struct virgl_hw_res *res, boolean write_buf)
+{
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+   struct virgl_drm_cmd_buf *cbuf = virgl_drm_cmd_buf(_cbuf);
+   boolean already_in_list = virgl_drm_lookup_res(cbuf, res);
+
+   if (write_buf)
+      cbuf->base.buf[cbuf->base.cdw++] = res->res_handle;
+
+   if (!already_in_list)
+      virgl_drm_add_res(qdws, cbuf, res);
+}
+
+static boolean virgl_drm_res_is_ref(struct virgl_winsys *qws,
+                                    struct virgl_cmd_buf *_cbuf,
+                                    struct virgl_hw_res *res)
+{
+   if (!res->num_cs_references)
+      return FALSE;
+
+   return TRUE;
+}
+
+static int virgl_drm_winsys_submit_cmd(struct virgl_winsys *qws,
+                                       struct virgl_cmd_buf *_cbuf)
+{
+   struct virgl_drm_winsys *qdws = virgl_drm_winsys(qws);
+   struct virgl_drm_cmd_buf *cbuf = virgl_drm_cmd_buf(_cbuf);
+   struct drm_virtgpu_execbuffer eb;
+   int ret;
+
+   if (cbuf->base.cdw == 0)
+      return 0;
+
+   memset(&eb, 0, sizeof(struct drm_virtgpu_execbuffer));
+   eb.command = (unsigned long)(void*)cbuf->buf;
+   eb.size = cbuf->base.cdw * 4;
+   eb.num_bo_handles = cbuf->cres;
+   eb.bo_handles = (unsigned long)(void *)cbuf->res_hlist;
+
+   ret = drmIoctl(qdws->fd, DRM_IOCTL_VIRTGPU_EXECBUFFER, &eb);
+   if (ret == -1)
+      fprintf(stderr,"got error from kernel - expect bad rendering %d\n", errno);
+   cbuf->base.cdw = 0;
+
+   virgl_drm_release_all_res(qdws, cbuf);
+
+   memset(cbuf->is_handle_added, 0, sizeof(cbuf->is_handle_added));
+   return ret;
+}
+
+static int virgl_drm_get_caps(struct virgl_winsys *vws,
+                              struct virgl_drm_caps *caps)
+{
+   struct virgl_drm_winsys *vdws = virgl_drm_winsys(vws);
+   struct drm_virtgpu_get_caps args;
+
+   memset(&args, 0, sizeof(args));
+
+   args.cap_set_id = 1;
+   args.addr = (unsigned long)&caps->caps;
+   args.size = sizeof(union virgl_caps);
+   return drmIoctl(vdws->fd, DRM_IOCTL_VIRTGPU_GET_CAPS, &args);
+}
+
+#define PTR_TO_UINT(x) ((unsigned)((intptr_t)(x)))
+
+static unsigned handle_hash(void *key)
+{
+    return PTR_TO_UINT(key);
+}
+
+static int handle_compare(void *key1, void *key2)
+{
+    return PTR_TO_UINT(key1) != PTR_TO_UINT(key2);
+}
+
+static struct pipe_fence_handle *
+virgl_cs_create_fence(struct virgl_winsys *vws)
+{
+   struct virgl_hw_res *res;
+
+   res = virgl_drm_winsys_resource_cache_create(vws,
+                                                PIPE_BUFFER,
+                                                PIPE_FORMAT_R8_UNORM,
+                                                VIRGL_BIND_CUSTOM,
+                                                8, 1, 1, 0, 0, 0, 8);
+
+   return (struct pipe_fence_handle *)res;
+}
+
+static bool virgl_fence_wait(struct virgl_winsys *vws,
+                             struct pipe_fence_handle *fence,
+                             uint64_t timeout)
+{
+   struct virgl_drm_winsys *vdws = virgl_drm_winsys(vws);
+   struct virgl_hw_res *res = virgl_hw_res(fence);
+
+   if (timeout == 0)
+      return virgl_drm_resource_is_busy(vdws, res);
+
+   if (timeout != PIPE_TIMEOUT_INFINITE) {
+      int64_t start_time = os_time_get();
+      timeout /= 1000;
+      while (virgl_drm_resource_is_busy(vdws, res)) {
+         if (os_time_get() - start_time >= timeout)
+            return FALSE;
+         os_time_sleep(10);
+      }
+      return TRUE;
+   }
+   virgl_drm_resource_wait(vws, res);
+   return TRUE;
+}
+
+static void virgl_fence_reference(struct virgl_winsys *vws,
+                                  struct pipe_fence_handle **dst,
+                                  struct pipe_fence_handle *src)
+{
+   struct virgl_drm_winsys *vdws = virgl_drm_winsys(vws);
+   virgl_drm_resource_reference(vdws, (struct virgl_hw_res **)dst,
+                                virgl_hw_res(src));
+}
+
+
+struct virgl_winsys *
+virgl_drm_winsys_create(int drmFD)
+{
+   struct virgl_drm_winsys *qdws;
+
+   qdws = CALLOC_STRUCT(virgl_drm_winsys);
+   if (!qdws)
+      return NULL;
+
+   qdws->fd = drmFD;
+   qdws->num_delayed = 0;
+   qdws->usecs = 1000000;
+   LIST_INITHEAD(&qdws->delayed);
+   pipe_mutex_init(qdws->mutex);
+   pipe_mutex_init(qdws->bo_handles_mutex);
+   qdws->bo_handles = util_hash_table_create(handle_hash, handle_compare);
+   qdws->base.destroy = virgl_drm_winsys_destroy;
+
+   qdws->base.transfer_put = virgl_bo_transfer_put;
+   qdws->base.transfer_get = virgl_bo_transfer_get;
+   qdws->base.resource_create = virgl_drm_winsys_resource_cache_create;
+   qdws->base.resource_unref = virgl_drm_winsys_resource_unref;
+   qdws->base.resource_create_from_handle = virgl_drm_winsys_resource_create_handle;
+   qdws->base.resource_get_handle = virgl_drm_winsys_resource_get_handle;
+   qdws->base.resource_map = virgl_drm_resource_map;
+   qdws->base.resource_wait = virgl_drm_resource_wait;
+   qdws->base.cmd_buf_create = virgl_drm_cmd_buf_create;
+   qdws->base.cmd_buf_destroy = virgl_drm_cmd_buf_destroy;
+   qdws->base.submit_cmd = virgl_drm_winsys_submit_cmd;
+   qdws->base.emit_res = virgl_drm_emit_res;
+   qdws->base.res_is_referenced = virgl_drm_res_is_ref;
+
+   qdws->base.cs_create_fence = virgl_cs_create_fence;
+   qdws->base.fence_wait = virgl_fence_wait;
+   qdws->base.fence_reference = virgl_fence_reference;
+
+   qdws->base.get_caps = virgl_drm_get_caps;
+   return &qdws->base;
+
+}
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h
new file mode 100644
index 00000000000..da85ff87d2a
--- /dev/null
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_DRM_WINSYS_H
+#define VIRGL_DRM_WINSYS_H
+
+#include <stdint.h>
+#include "os/os_thread.h"
+#include "pipe/p_state.h"
+#include "util/list.h"
+
+#include "virgl/virgl_winsys.h"
+
+struct pipe_fence_handle;
+struct util_hash_table;
+
+struct virgl_hw_res {
+   struct pipe_reference reference;
+   uint32_t res_handle;
+   uint32_t bo_handle;
+   uint32_t name;
+   int num_cs_references;
+   uint32_t size;
+   void *ptr;
+   uint32_t stride;
+
+   struct list_head head;
+   uint32_t format;
+   uint32_t bind;
+   boolean cacheable;
+   int64_t start, end;
+   boolean flinked;
+   uint32_t flink;
+};
+
+struct virgl_drm_winsys
+{
+   struct virgl_winsys base;
+   int fd;
+   struct list_head delayed;
+   int num_delayed;
+   unsigned usecs;
+   pipe_mutex mutex;
+
+   struct util_hash_table *bo_handles;
+   pipe_mutex bo_handles_mutex;
+};
+
+struct virgl_drm_cmd_buf {
+   struct virgl_cmd_buf base;
+
+   uint32_t buf[VIRGL_MAX_CMDBUF_DWORDS];
+
+   unsigned nres;
+   unsigned cres;
+   struct virgl_hw_res **res_bo;
+   struct virgl_winsys *ws;
+   uint32_t *res_hlist;
+
+   char                        is_handle_added[512];
+   unsigned                    reloc_indices_hashlist[512];
+
+};
+
+static inline struct virgl_hw_res *
+virgl_hw_res(struct pipe_fence_handle *f)
+{
+   return (struct virgl_hw_res *)f;
+}
+
+static inline struct virgl_drm_winsys *
+virgl_drm_winsys(struct virgl_winsys *iws)
+{
+   return (struct virgl_drm_winsys *)iws;
+}
+
+static inline struct virgl_drm_cmd_buf *
+virgl_drm_cmd_buf(struct virgl_cmd_buf *cbuf)
+{
+   return (struct virgl_drm_cmd_buf *)cbuf;
+}
+
+#endif
diff --git a/src/gallium/winsys/virgl/drm/virtgpu_drm.h b/src/gallium/winsys/virgl/drm/virtgpu_drm.h
new file mode 100644
index 00000000000..30bc3afdd81
--- /dev/null
+++ b/src/gallium/winsys/virgl/drm/virtgpu_drm.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2013 Red Hat
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRTGPU_DRM_H
+#define VIRTGPU_DRM_H
+
+#include <stddef.h>
+#include "drm.h"
+
+/* Please note that modifications to all structs defined here are
+ * subject to backwards-compatibility constraints.
+ *
+ * Do not use pointers, use uint64_t instead for 32 bit / 64 bit user/kernel
+ * compatibility Keep fields aligned to their size
+ */
+
+#define DRM_VIRTGPU_MAP         0x01
+#define DRM_VIRTGPU_EXECBUFFER  0x02
+#define DRM_VIRTGPU_GETPARAM    0x03
+#define DRM_VIRTGPU_RESOURCE_CREATE 0x04
+#define DRM_VIRTGPU_RESOURCE_INFO     0x05
+#define DRM_VIRTGPU_TRANSFER_FROM_HOST 0x06
+#define DRM_VIRTGPU_TRANSFER_TO_HOST 0x07
+#define DRM_VIRTGPU_WAIT     0x08
+#define DRM_VIRTGPU_GET_CAPS  0x09
+
+struct drm_virtgpu_map {
+	uint64_t offset; /* use for mmap system call */
+	uint32_t handle;
+	uint32_t pad;
+};
+
+struct drm_virtgpu_execbuffer {
+	uint32_t		flags;		/* for future use */
+	uint32_t size;
+	uint64_t command; /* void* */
+	uint64_t bo_handles;
+	uint32_t num_bo_handles;
+	uint32_t pad;
+};
+
+#define VIRTGPU_PARAM_3D_FEATURES 1 /* do we have 3D features in the hw */
+
+struct drm_virtgpu_getparam {
+	uint64_t param;
+	uint64_t value;
+};
+
+/* NO_BO flags? NO resource flag? */
+/* resource flag for y_0_top */
+struct drm_virtgpu_resource_create {
+	uint32_t target;
+	uint32_t format;
+	uint32_t bind;
+	uint32_t width;
+	uint32_t height;
+	uint32_t depth;
+	uint32_t array_size;
+	uint32_t last_level;
+	uint32_t nr_samples;
+	uint32_t flags;
+	uint32_t bo_handle; /* if this is set - recreate a new resource attached to this bo ? */
+	uint32_t res_handle;  /* returned by kernel */
+	uint32_t size;        /* validate transfer in the host */
+	uint32_t stride;      /* validate transfer in the host */
+};
+
+struct drm_virtgpu_resource_info {
+	uint32_t bo_handle;
+	uint32_t res_handle;
+	uint32_t size;
+	uint32_t stride;
+};
+
+struct drm_virtgpu_3d_box {
+	uint32_t x, y, z;
+	uint32_t w, h, d;
+};
+
+struct drm_virtgpu_3d_transfer_to_host {
+	uint32_t bo_handle;
+	struct drm_virtgpu_3d_box box;
+	uint32_t level;
+	uint32_t offset;
+};
+
+struct drm_virtgpu_3d_transfer_from_host {
+	uint32_t bo_handle;
+	struct drm_virtgpu_3d_box box;
+	uint32_t level;
+	uint32_t offset;
+};
+
+#define VIRTGPU_WAIT_NOWAIT 1 /* like it */
+struct drm_virtgpu_3d_wait {
+	uint32_t handle; /* 0 is an invalid handle */
+	uint32_t flags;
+};
+
+struct drm_virtgpu_get_caps {
+	uint32_t cap_set_id;
+	uint32_t cap_set_ver;
+	uint64_t addr;
+	uint32_t size;
+	uint32_t pad;
+};
+
+#define DRM_IOCTL_VIRTGPU_MAP \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_MAP, struct drm_virtgpu_map)
+
+#define DRM_IOCTL_VIRTGPU_EXECBUFFER \
+	DRM_IOW(DRM_COMMAND_BASE + DRM_VIRTGPU_EXECBUFFER,\
+		struct drm_virtgpu_execbuffer)
+
+#define DRM_IOCTL_VIRTGPU_GETPARAM \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GETPARAM,\
+		struct drm_virtgpu_getparam)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_CREATE			\
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_CREATE,	\
+		struct drm_virtgpu_resource_create)
+
+#define DRM_IOCTL_VIRTGPU_RESOURCE_INFO \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_RESOURCE_INFO, \
+		 struct drm_virtgpu_resource_info)
+
+#define DRM_IOCTL_VIRTGPU_TRANSFER_FROM_HOST \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_FROM_HOST,	\
+		struct drm_virtgpu_3d_transfer_from_host)
+
+#define DRM_IOCTL_VIRTGPU_TRANSFER_TO_HOST \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_TRANSFER_TO_HOST,	\
+		struct drm_virtgpu_3d_transfer_to_host)
+
+#define DRM_IOCTL_VIRTGPU_WAIT				\
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_WAIT,	\
+		struct drm_virtgpu_3d_wait)
+
+#define DRM_IOCTL_VIRTGPU_GET_CAPS \
+	DRM_IOWR(DRM_COMMAND_BASE + DRM_VIRTGPU_GET_CAPS, \
+	struct drm_virtgpu_get_caps)
+
+#endif
diff --git a/src/gallium/winsys/virgl/vtest/Makefile.am b/src/gallium/winsys/virgl/vtest/Makefile.am
new file mode 100644
index 00000000000..b15a3afd60c
--- /dev/null
+++ b/src/gallium/winsys/virgl/vtest/Makefile.am
@@ -0,0 +1,32 @@
+# Copyright © 2015 Red Hat
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+include Makefile.sources
+include $(top_srcdir)/src/gallium/Automake.inc
+
+AM_CFLAGS = \
+	-I$(top_srcdir)/src/gallium/drivers \
+	$(GALLIUM_WINSYS_CFLAGS)
+
+noinst_LTLIBRARIES = libvirglvtest.la
+
+libvirglvtest_la_SOURCES = $(C_SOURCES)
diff --git a/src/gallium/winsys/virgl/vtest/Makefile.sources b/src/gallium/winsys/virgl/vtest/Makefile.sources
new file mode 100644
index 00000000000..12370d96fa8
--- /dev/null
+++ b/src/gallium/winsys/virgl/vtest/Makefile.sources
@@ -0,0 +1,6 @@
+C_SOURCES := \
+	virgl_vtest_public.h \
+	virgl_vtest_socket.c \
+	virgl_vtest_winsys.c \
+	virgl_vtest_winsys.h \
+	vtest_protocol.h
diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_public.h b/src/gallium/winsys/virgl/vtest/virgl_vtest_public.h
new file mode 100644
index 00000000000..47379d9c735
--- /dev/null
+++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_public.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_VTEST_PUBLIC_H
+#define VIRGL_VTEST_PUBLIC_H
+
+struct virgl_winsys;
+struct sw_winsys;
+
+struct virgl_winsys *virgl_vtest_winsys_wrap(struct sw_winsys *sws);
+
+#endif
diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c
new file mode 100644
index 00000000000..4541419d8e8
--- /dev/null
+++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <sys/socket.h>
+#include <errno.h>
+#include <stdio.h>
+#include <netinet/in.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+#include <os/os_process.h>
+#include <util/u_format.h>
+/* connect to remote socket */
+#define VTEST_SOCKET_NAME "/tmp/.virgl_test"
+
+#include "virgl_vtest_winsys.h"
+#include "virgl_vtest_public.h"
+
+/* block read/write routines */
+static int virgl_block_write(int fd, void *buf, int size)
+{
+   void *ptr = buf;
+   int left;
+   int ret;
+   left = size;
+   do {
+      ret = write(fd, ptr, left);
+      if (ret < 0)
+         return -errno;
+      left -= ret;
+      ptr += ret;
+   } while (left);
+   return size;
+}
+
+static int virgl_block_read(int fd, void *buf, int size)
+{
+   void *ptr = buf;
+   int left;
+   int ret;
+   left = size;
+   do {
+      ret = read(fd, ptr, left);
+      if (ret <= 0) {
+         fprintf(stderr,
+                 "lost connection to rendering server on %d read %d %d\n",
+                 size, ret, errno);
+         abort();
+         return ret < 0 ? -errno : 0;
+      }
+      left -= ret;
+      ptr += ret;
+   } while (left);
+   return size;
+}
+
+static int virgl_vtest_send_init(struct virgl_vtest_winsys *vws)
+{
+   uint32_t buf[VTEST_HDR_SIZE];
+   const char *nstr = "virtest";
+   char cmdline[64];
+   int ret;
+
+   ret = os_get_process_name(cmdline, 63);
+   if (ret == FALSE)
+      strcpy(cmdline, nstr);
+#if defined(__GLIBC__) || defined(__CYGWIN__)
+   if (!strcmp(cmdline, "shader_runner")) {
+      const char *name;
+      /* hack to get better testname */
+      name = program_invocation_short_name;
+      name += strlen(name) + 1;
+      strncpy(cmdline, name, 63);
+   }
+#endif
+   buf[VTEST_CMD_LEN] = strlen(cmdline) + 1;
+   buf[VTEST_CMD_ID] = VCMD_CREATE_RENDERER;
+
+   virgl_block_write(vws->sock_fd, &buf, sizeof(buf));
+   virgl_block_write(vws->sock_fd, (void *)cmdline, strlen(cmdline) + 1);
+   return 0;
+}
+
+int virgl_vtest_connect(struct virgl_vtest_winsys *vws)
+{
+   struct sockaddr_un un;
+   int sock, ret;
+
+   sock = socket(PF_UNIX, SOCK_STREAM, 0);
+   if (sock < 0)
+      return -1;
+
+   memset(&un, 0, sizeof(un));
+   un.sun_family = AF_UNIX;
+   snprintf(un.sun_path, sizeof(un.sun_path), "%s", VTEST_SOCKET_NAME);
+
+   do {
+      ret = 0;
+      if (connect(sock, (struct sockaddr *)&un, sizeof(un)) < 0) {
+         ret = -errno;
+      }
+   } while (ret == -EINTR);
+
+   vws->sock_fd = sock;
+   virgl_vtest_send_init(vws);
+   return 0;
+}
+
+int virgl_vtest_send_get_caps(struct virgl_vtest_winsys *vws,
+                              struct virgl_drm_caps *caps)
+{
+   uint32_t get_caps_buf[VTEST_HDR_SIZE];
+   uint32_t resp_buf[VTEST_HDR_SIZE];
+
+   int ret;
+   get_caps_buf[VTEST_CMD_LEN] = 0;
+   get_caps_buf[VTEST_CMD_ID] = VCMD_GET_CAPS;
+
+   virgl_block_write(vws->sock_fd, &get_caps_buf, sizeof(get_caps_buf));
+
+   ret = virgl_block_read(vws->sock_fd, resp_buf, sizeof(resp_buf));
+   if (ret <= 0)
+      return 0;
+
+   ret = virgl_block_read(vws->sock_fd, &caps->caps, sizeof(union virgl_caps));
+
+   return 0;
+}
+
+int virgl_vtest_send_resource_create(struct virgl_vtest_winsys *vws,
+                                     uint32_t handle,
+                                     enum pipe_texture_target target,
+                                     uint32_t format,
+                                     uint32_t bind,
+                                     uint32_t width,
+                                     uint32_t height,
+                                     uint32_t depth,
+                                     uint32_t array_size,
+                                     uint32_t last_level,
+                                     uint32_t nr_samples)
+{
+   uint32_t res_create_buf[VCMD_RES_CREATE_SIZE], vtest_hdr[VTEST_HDR_SIZE];
+
+   vtest_hdr[VTEST_CMD_LEN] = VCMD_RES_CREATE_SIZE;
+   vtest_hdr[VTEST_CMD_ID] = VCMD_RESOURCE_CREATE;
+
+   res_create_buf[VCMD_RES_CREATE_RES_HANDLE] = handle;
+   res_create_buf[VCMD_RES_CREATE_TARGET] = target;
+   res_create_buf[VCMD_RES_CREATE_FORMAT] = format;
+   res_create_buf[VCMD_RES_CREATE_BIND] = bind;
+   res_create_buf[VCMD_RES_CREATE_WIDTH] = width;
+   res_create_buf[VCMD_RES_CREATE_HEIGHT] = height;
+   res_create_buf[VCMD_RES_CREATE_DEPTH] = depth;
+   res_create_buf[VCMD_RES_CREATE_ARRAY_SIZE] = array_size;
+   res_create_buf[VCMD_RES_CREATE_LAST_LEVEL] = last_level;
+   res_create_buf[VCMD_RES_CREATE_NR_SAMPLES] = nr_samples;
+
+   virgl_block_write(vws->sock_fd, &vtest_hdr, sizeof(vtest_hdr));
+   virgl_block_write(vws->sock_fd, &res_create_buf, sizeof(res_create_buf));
+
+   return 0;
+}
+
+int virgl_vtest_submit_cmd(struct virgl_vtest_winsys *vws,
+                           struct virgl_vtest_cmd_buf *cbuf)
+{
+   uint32_t vtest_hdr[VTEST_HDR_SIZE];
+
+   vtest_hdr[VTEST_CMD_LEN] = cbuf->base.cdw;
+   vtest_hdr[VTEST_CMD_ID] = VCMD_SUBMIT_CMD;
+
+   virgl_block_write(vws->sock_fd, &vtest_hdr, sizeof(vtest_hdr));
+   virgl_block_write(vws->sock_fd, cbuf->buf, cbuf->base.cdw * 4);
+   return 0;
+}
+
+int virgl_vtest_send_resource_unref(struct virgl_vtest_winsys *vws,
+                                    uint32_t handle)
+{
+   uint32_t vtest_hdr[VTEST_HDR_SIZE];
+   uint32_t cmd[1];
+   vtest_hdr[VTEST_CMD_LEN] = 1;
+   vtest_hdr[VTEST_CMD_ID] = VCMD_RESOURCE_UNREF;
+
+   cmd[0] = handle;
+   virgl_block_write(vws->sock_fd, &vtest_hdr, sizeof(vtest_hdr));
+   virgl_block_write(vws->sock_fd, &cmd, sizeof(cmd));
+   return 0;
+}
+
+int virgl_vtest_send_transfer_cmd(struct virgl_vtest_winsys *vws,
+                                  uint32_t vcmd,
+                                  uint32_t handle,
+                                  uint32_t level, uint32_t stride,
+                                  uint32_t layer_stride,
+                                  const struct pipe_box *box,
+                                  uint32_t data_size)
+{
+   uint32_t vtest_hdr[VTEST_HDR_SIZE];
+   uint32_t cmd[VCMD_TRANSFER_HDR_SIZE];
+   vtest_hdr[VTEST_CMD_LEN] = VCMD_TRANSFER_HDR_SIZE;
+   vtest_hdr[VTEST_CMD_ID] = vcmd;
+
+   if (vcmd == VCMD_TRANSFER_PUT)
+      vtest_hdr[VTEST_CMD_LEN] += data_size + 3 / 4;
+
+   cmd[0] = handle;
+   cmd[1] = level;
+   cmd[2] = stride;
+   cmd[3] = layer_stride;
+   cmd[4] = box->x;
+   cmd[5] = box->y;
+   cmd[6] = box->z;
+   cmd[7] = box->width;
+   cmd[8] = box->height;
+   cmd[9] = box->depth;
+   cmd[10] = data_size;
+   virgl_block_write(vws->sock_fd, &vtest_hdr, sizeof(vtest_hdr));
+   virgl_block_write(vws->sock_fd, &cmd, sizeof(cmd));
+
+   return 0;
+}
+
+int virgl_vtest_send_transfer_put_data(struct virgl_vtest_winsys *vws,
+                                       void *data,
+                                       uint32_t data_size)
+{
+   return virgl_block_write(vws->sock_fd, data, data_size);
+}
+
+int virgl_vtest_recv_transfer_get_data(struct virgl_vtest_winsys *vws,
+                                       void *data,
+                                       uint32_t data_size,
+                                       uint32_t stride,
+                                       const struct pipe_box *box,
+                                       uint32_t format)
+{
+   void *line;
+   void *ptr = data;
+   int hblocks = util_format_get_nblocksy(format, box->height);
+
+   line = malloc(stride);
+   while (hblocks) {
+      virgl_block_read(vws->sock_fd, line, stride);
+      memcpy(ptr, line, util_format_get_stride(format, box->width));
+      ptr += stride;
+      hblocks--;
+   }
+   free(line);
+   return 0;
+}
+
+int virgl_vtest_busy_wait(struct virgl_vtest_winsys *vws, int handle,
+                          int flags)
+{
+   uint32_t vtest_hdr[VTEST_HDR_SIZE];
+   uint32_t cmd[VCMD_BUSY_WAIT_SIZE];
+   uint32_t result[1];
+   int ret;
+   vtest_hdr[VTEST_CMD_LEN] = VCMD_BUSY_WAIT_SIZE;
+   vtest_hdr[VTEST_CMD_ID] = VCMD_RESOURCE_BUSY_WAIT;
+   cmd[VCMD_BUSY_WAIT_HANDLE] = handle;
+   cmd[VCMD_BUSY_WAIT_FLAGS] = flags;
+
+   virgl_block_write(vws->sock_fd, &vtest_hdr, sizeof(vtest_hdr));
+   virgl_block_write(vws->sock_fd, &cmd, sizeof(cmd));
+
+   ret = virgl_block_read(vws->sock_fd, vtest_hdr, sizeof(vtest_hdr));
+   assert(ret);
+   ret = virgl_block_read(vws->sock_fd, result, sizeof(result));
+   assert(ret);
+   return result[0];
+}
diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
new file mode 100644
index 00000000000..b19c4561493
--- /dev/null
+++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
@@ -0,0 +1,666 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#include <stdio.h>
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "os/os_time.h"
+#include "state_tracker/sw_winsys.h"
+
+#include "virgl_vtest_winsys.h"
+#include "virgl_vtest_public.h"
+
+static void *virgl_vtest_resource_map(struct virgl_winsys *vws,
+                                      struct virgl_hw_res *res);
+static void virgl_vtest_resource_unmap(struct virgl_winsys *vws,
+                                       struct virgl_hw_res *res);
+
+static inline boolean can_cache_resource(struct virgl_hw_res *res)
+{
+   return res->cacheable == TRUE;
+}
+
+static uint32_t vtest_get_transfer_size(struct virgl_hw_res *res,
+                                        const struct pipe_box *box,
+                                        uint32_t stride, uint32_t layer_stride,
+                                        uint32_t level, uint32_t *valid_stride_p)
+{
+   uint32_t valid_stride, valid_layer_stride;
+
+   valid_stride = util_format_get_stride(res->format, box->width);
+   if (stride) {
+      if (box->height > 1)
+         valid_stride = stride;
+   }
+
+   valid_layer_stride = util_format_get_2d_size(res->format, valid_stride,
+                                                box->height);
+   if (layer_stride) {
+      if (box->depth > 1)
+         valid_layer_stride = layer_stride;
+   }
+
+   *valid_stride_p = valid_stride;
+   return valid_layer_stride * box->depth;
+}
+
+static int
+virgl_vtest_transfer_put(struct virgl_winsys *vws,
+                         struct virgl_hw_res *res,
+                         const struct pipe_box *box,
+                         uint32_t stride, uint32_t layer_stride,
+                         uint32_t buf_offset, uint32_t level)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   uint32_t size;
+   void *ptr;
+   uint32_t valid_stride;
+
+   size = vtest_get_transfer_size(res, box, stride, layer_stride, level,
+                                  &valid_stride);
+
+   virgl_vtest_send_transfer_cmd(vtws, VCMD_TRANSFER_PUT, res->res_handle,
+                                 level, stride, layer_stride,
+                                 box, size);
+   ptr = virgl_vtest_resource_map(vws, res);
+   virgl_vtest_send_transfer_put_data(vtws, ptr + buf_offset, size);
+   virgl_vtest_resource_unmap(vws, res);
+   return 0;
+}
+
+static int
+virgl_vtest_transfer_get(struct virgl_winsys *vws,
+                         struct virgl_hw_res *res,
+                         const struct pipe_box *box,
+                         uint32_t stride, uint32_t layer_stride,
+                         uint32_t buf_offset, uint32_t level)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   uint32_t size;
+   void *ptr;
+   uint32_t valid_stride;
+
+   size = vtest_get_transfer_size(res, box, stride, layer_stride, level,
+                                  &valid_stride);
+
+   virgl_vtest_send_transfer_cmd(vtws, VCMD_TRANSFER_GET, res->res_handle,
+                                 level, stride, layer_stride,
+                                 box, size);
+
+
+   ptr = virgl_vtest_resource_map(vws, res);
+   virgl_vtest_recv_transfer_get_data(vtws, ptr + buf_offset, size,
+                                      valid_stride, box, res->format);
+   virgl_vtest_resource_unmap(vws, res);
+   return 0;
+}
+
+static void virgl_hw_res_destroy(struct virgl_vtest_winsys *vtws,
+                                 struct virgl_hw_res *res)
+{
+   virgl_vtest_send_resource_unref(vtws, res->res_handle);
+   if (res->dt)
+      vtws->sws->displaytarget_destroy(vtws->sws, res->dt);
+   free(res->ptr);
+   FREE(res);
+}
+
+static boolean virgl_vtest_resource_is_busy(struct virgl_vtest_winsys *vtws,
+                                            struct virgl_hw_res *res)
+{
+   /* implement busy check */
+   int ret;
+   ret = virgl_vtest_busy_wait(vtws, res->res_handle, 0);
+
+   if (ret < 0)
+      return FALSE;
+
+   return ret == 1 ? TRUE : FALSE;
+}
+
+static void
+virgl_cache_flush(struct virgl_vtest_winsys *vtws)
+{
+   struct list_head *curr, *next;
+   struct virgl_hw_res *res;
+
+   pipe_mutex_lock(vtws->mutex);
+   curr = vtws->delayed.next;
+   next = curr->next;
+
+   while (curr != &vtws->delayed) {
+      res = LIST_ENTRY(struct virgl_hw_res, curr, head);
+      LIST_DEL(&res->head);
+      virgl_hw_res_destroy(vtws, res);
+      curr = next;
+      next = curr->next;
+   }
+   pipe_mutex_unlock(vtws->mutex);
+}
+
+static void
+virgl_cache_list_check_free(struct virgl_vtest_winsys *vtws)
+{
+   struct list_head *curr, *next;
+   struct virgl_hw_res *res;
+   int64_t now;
+
+   now = os_time_get();
+   curr = vtws->delayed.next;
+   next = curr->next;
+   while (curr != &vtws->delayed) {
+      res = LIST_ENTRY(struct virgl_hw_res, curr, head);
+      if (!os_time_timeout(res->start, res->end, now))
+         break;
+
+      LIST_DEL(&res->head);
+      virgl_hw_res_destroy(vtws, res);
+      curr = next;
+      next = curr->next;
+   }
+}
+
+static void virgl_vtest_resource_reference(struct virgl_vtest_winsys *vtws,
+                                           struct virgl_hw_res **dres,
+                                           struct virgl_hw_res *sres)
+{
+   struct virgl_hw_res *old = *dres;
+   if (pipe_reference(&(*dres)->reference, &sres->reference)) {
+      if (!can_cache_resource(old)) {
+         virgl_hw_res_destroy(vtws, old);
+      } else {
+         pipe_mutex_lock(vtws->mutex);
+         virgl_cache_list_check_free(vtws);
+
+         old->start = os_time_get();
+         old->end = old->start + vtws->usecs;
+         LIST_ADDTAIL(&old->head, &vtws->delayed);
+         vtws->num_delayed++;
+         pipe_mutex_unlock(vtws->mutex);
+      }
+   }
+   *dres = sres;
+}
+
+static struct virgl_hw_res *
+virgl_vtest_winsys_resource_create(struct virgl_winsys *vws,
+                                   enum pipe_texture_target target,
+                                   uint32_t format,
+                                   uint32_t bind,
+                                   uint32_t width,
+                                   uint32_t height,
+                                   uint32_t depth,
+                                   uint32_t array_size,
+                                   uint32_t last_level,
+                                   uint32_t nr_samples,
+                                   uint32_t size)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   struct virgl_hw_res *res;
+   static int handle = 1;
+
+   res = CALLOC_STRUCT(virgl_hw_res);
+   if (!res)
+      return NULL;
+
+   if (bind & (VIRGL_BIND_DISPLAY_TARGET | VIRGL_BIND_SCANOUT)) {
+      res->dt = vtws->sws->displaytarget_create(vtws->sws, bind, format,
+                                                width, height, 64, NULL,
+                                                &res->stride);
+
+   } else {
+      res->ptr = align_malloc(size, 64);
+      if (!res->ptr) {
+         FREE(res);
+         return NULL;
+      }
+   }
+
+   res->bind = bind;
+   res->format = format;
+   res->height = height;
+   res->width = width;
+   virgl_vtest_send_resource_create(vtws, handle, target, format, bind,
+                                    width, height, depth, array_size,
+                                    last_level, nr_samples);
+
+   res->res_handle = handle++;
+   pipe_reference_init(&res->reference, 1);
+   return res;
+}
+
+static void virgl_vtest_winsys_resource_unref(struct virgl_winsys *vws,
+                                              struct virgl_hw_res *hres)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   virgl_vtest_resource_reference(vtws, &hres, NULL);
+}
+
+static void *virgl_vtest_resource_map(struct virgl_winsys *vws,
+                                      struct virgl_hw_res *res)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+
+   if (res->dt) {
+      return vtws->sws->displaytarget_map(vtws->sws, res->dt, 0);
+   } else {
+      res->mapped = res->ptr;
+      return res->mapped;
+   }
+}
+
+static void virgl_vtest_resource_unmap(struct virgl_winsys *vws,
+                                       struct virgl_hw_res *res)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   if (res->mapped)
+      res->mapped = NULL;
+
+   if (res->dt)
+      vtws->sws->displaytarget_unmap(vtws->sws, res->dt);
+}
+
+static void virgl_vtest_resource_wait(struct virgl_winsys *vws,
+                                      struct virgl_hw_res *res)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+
+   virgl_vtest_busy_wait(vtws, res->res_handle, VCMD_BUSY_WAIT_FLAG_WAIT);
+}
+
+static inline int virgl_is_res_compat(struct virgl_vtest_winsys *vtws,
+                                      struct virgl_hw_res *res,
+                                      uint32_t size, uint32_t bind,
+                                      uint32_t format)
+{
+   if (res->bind != bind)
+      return 0;
+   if (res->format != format)
+      return 0;
+   if (res->size < size)
+      return 0;
+   if (res->size > size * 2)
+      return 0;
+
+   if (virgl_vtest_resource_is_busy(vtws, res)) {
+      return -1;
+   }
+
+   return 1;
+}
+
+static struct virgl_hw_res *
+virgl_vtest_winsys_resource_cache_create(struct virgl_winsys *vws,
+                                         enum pipe_texture_target target,
+                                         uint32_t format,
+                                         uint32_t bind,
+                                         uint32_t width,
+                                         uint32_t height,
+                                         uint32_t depth,
+                                         uint32_t array_size,
+                                         uint32_t last_level,
+                                         uint32_t nr_samples,
+                                         uint32_t size)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   struct virgl_hw_res *res, *curr_res;
+   struct list_head *curr, *next;
+   int64_t now;
+   int ret;
+
+   /* only store binds for vertex/index/const buffers */
+   if (bind != VIRGL_BIND_CONSTANT_BUFFER && bind != VIRGL_BIND_INDEX_BUFFER &&
+       bind != VIRGL_BIND_VERTEX_BUFFER && bind != VIRGL_BIND_CUSTOM)
+      goto alloc;
+
+   pipe_mutex_lock(vtws->mutex);
+
+   res = NULL;
+   curr = vtws->delayed.next;
+   next = curr->next;
+
+   now = os_time_get();
+   while (curr != &vtws->delayed) {
+      curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
+
+      if (!res && (ret = virgl_is_res_compat(vtws, curr_res, size, bind, format) > 0))
+         res = curr_res;
+      else if (os_time_timeout(curr_res->start, curr_res->end, now)) {
+         LIST_DEL(&curr_res->head);
+         virgl_hw_res_destroy(vtws, curr_res);
+      } else
+         break;
+
+      if (ret == -1)
+         break;
+
+      curr = next;
+      next = curr->next;
+   }
+
+   if (!res && ret != -1) {
+      while (curr != &vtws->delayed) {
+         curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
+         ret = virgl_is_res_compat(vtws, curr_res, size, bind, format);
+         if (ret > 0) {
+            res = curr_res;
+            break;
+         }
+         if (ret == -1)
+            break;
+         curr = next;
+         next = curr->next;
+      }
+   }
+
+   if (res) {
+      LIST_DEL(&res->head);
+      --vtws->num_delayed;
+      pipe_mutex_unlock(vtws->mutex);
+      pipe_reference_init(&res->reference, 1);
+      return res;
+   }
+
+   pipe_mutex_unlock(vtws->mutex);
+
+alloc:
+   res = virgl_vtest_winsys_resource_create(vws, target, format, bind,
+                                            width, height, depth, array_size,
+                                            last_level, nr_samples, size);
+   if (bind == VIRGL_BIND_CONSTANT_BUFFER || bind == VIRGL_BIND_INDEX_BUFFER ||
+       bind == VIRGL_BIND_VERTEX_BUFFER)
+      res->cacheable = TRUE;
+   return res;
+}
+
+static struct virgl_cmd_buf *virgl_vtest_cmd_buf_create(struct virgl_winsys *vws)
+{
+   struct virgl_vtest_cmd_buf *cbuf;
+
+   cbuf = CALLOC_STRUCT(virgl_vtest_cmd_buf);
+   if (!cbuf)
+      return NULL;
+
+   cbuf->nres = 512;
+   cbuf->res_bo = CALLOC(cbuf->nres, sizeof(struct virgl_hw_buf*));
+   if (!cbuf->res_bo) {
+      FREE(cbuf);
+      return NULL;
+   }
+   cbuf->ws = vws;
+   cbuf->base.buf = cbuf->buf;
+   return &cbuf->base;
+}
+
+static void virgl_vtest_cmd_buf_destroy(struct virgl_cmd_buf *_cbuf)
+{
+   struct virgl_vtest_cmd_buf *cbuf = virgl_vtest_cmd_buf(_cbuf);
+
+   FREE(cbuf->res_bo);
+   FREE(cbuf);
+}
+
+static boolean virgl_vtest_lookup_res(struct virgl_vtest_cmd_buf *cbuf,
+                                      struct virgl_hw_res *res)
+{
+   unsigned hash = res->res_handle & (sizeof(cbuf->is_handle_added)-1);
+   int i;
+
+   if (cbuf->is_handle_added[hash]) {
+      i = cbuf->reloc_indices_hashlist[hash];
+      if (cbuf->res_bo[i] == res)
+         return true;
+
+      for (i = 0; i < cbuf->cres; i++) {
+         if (cbuf->res_bo[i] == res) {
+            cbuf->reloc_indices_hashlist[hash] = i;
+            return true;
+         }
+      }
+   }
+   return false;
+}
+
+static void virgl_vtest_release_all_res(struct virgl_vtest_winsys *vtws,
+                                        struct virgl_vtest_cmd_buf *cbuf)
+{
+   int i;
+
+   for (i = 0; i < cbuf->cres; i++) {
+      p_atomic_dec(&cbuf->res_bo[i]->num_cs_references);
+      virgl_vtest_resource_reference(vtws, &cbuf->res_bo[i], NULL);
+   }
+   cbuf->cres = 0;
+}
+
+static void virgl_vtest_add_res(struct virgl_vtest_winsys *vtws,
+                                struct virgl_vtest_cmd_buf *cbuf,
+                                struct virgl_hw_res *res)
+{
+   unsigned hash = res->res_handle & (sizeof(cbuf->is_handle_added)-1);
+
+   if (cbuf->cres > cbuf->nres) {
+      fprintf(stderr,"failure to add relocation\n");
+      return;
+   }
+
+   cbuf->res_bo[cbuf->cres] = NULL;
+   virgl_vtest_resource_reference(vtws, &cbuf->res_bo[cbuf->cres], res);
+   cbuf->is_handle_added[hash] = TRUE;
+
+   cbuf->reloc_indices_hashlist[hash] = cbuf->cres;
+   p_atomic_inc(&res->num_cs_references);
+   cbuf->cres++;
+}
+
+static int virgl_vtest_winsys_submit_cmd(struct virgl_winsys *vws,
+                                         struct virgl_cmd_buf *_cbuf)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   struct virgl_vtest_cmd_buf *cbuf = virgl_vtest_cmd_buf(_cbuf);
+   int ret;
+
+   if (cbuf->base.cdw == 0)
+      return 0;
+
+   ret = virgl_vtest_submit_cmd(vtws, cbuf);
+
+   virgl_vtest_release_all_res(vtws, cbuf);
+   memset(cbuf->is_handle_added, 0, sizeof(cbuf->is_handle_added));
+   cbuf->base.cdw = 0;
+   return ret;
+}
+
+static void virgl_vtest_emit_res(struct virgl_winsys *vws,
+                                 struct virgl_cmd_buf *_cbuf,
+                                 struct virgl_hw_res *res, boolean write_buf)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   struct virgl_vtest_cmd_buf *cbuf = virgl_vtest_cmd_buf(_cbuf);
+   boolean already_in_list = virgl_vtest_lookup_res(cbuf, res);
+
+   if (write_buf)
+      cbuf->base.buf[cbuf->base.cdw++] = res->res_handle;
+   if (!already_in_list)
+      virgl_vtest_add_res(vtws, cbuf, res);
+}
+
+static boolean virgl_vtest_res_is_ref(struct virgl_winsys *vws,
+                                      struct virgl_cmd_buf *_cbuf,
+                                      struct virgl_hw_res *res)
+{
+   if (!res->num_cs_references)
+      return FALSE;
+
+   return TRUE;
+}
+
+static int virgl_vtest_get_caps(struct virgl_winsys *vws,
+                                struct virgl_drm_caps *caps)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   return virgl_vtest_send_get_caps(vtws, caps);
+}
+
+static struct pipe_fence_handle *
+virgl_cs_create_fence(struct virgl_winsys *vws)
+{
+   struct virgl_hw_res *res;
+
+   res = virgl_vtest_winsys_resource_cache_create(vws,
+                                                PIPE_BUFFER,
+                                                PIPE_FORMAT_R8_UNORM,
+                                                PIPE_BIND_CUSTOM,
+                                                8, 1, 1, 0, 0, 0, 8);
+
+   return (struct pipe_fence_handle *)res;
+}
+
+static bool virgl_fence_wait(struct virgl_winsys *vws,
+                             struct pipe_fence_handle *fence,
+                             uint64_t timeout)
+{
+   struct virgl_vtest_winsys *vdws = virgl_vtest_winsys(vws);
+   struct virgl_hw_res *res = virgl_hw_res(fence);
+
+   if (timeout == 0)
+      return virgl_vtest_resource_is_busy(vdws, res);
+
+   if (timeout != PIPE_TIMEOUT_INFINITE) {
+      int64_t start_time = os_time_get();
+      timeout /= 1000;
+      while (virgl_vtest_resource_is_busy(vdws, res)) {
+         if (os_time_get() - start_time >= timeout)
+            return FALSE;
+         os_time_sleep(10);
+      }
+      return TRUE;
+   }
+   virgl_vtest_resource_wait(vws, res);
+   return TRUE;
+}
+
+static void virgl_fence_reference(struct virgl_winsys *vws,
+                                  struct pipe_fence_handle **dst,
+                                  struct pipe_fence_handle *src)
+{
+   struct virgl_vtest_winsys *vdws = virgl_vtest_winsys(vws);
+   virgl_vtest_resource_reference(vdws, (struct virgl_hw_res **)dst,
+                                  virgl_hw_res(src));
+}
+
+static void virgl_vtest_flush_frontbuffer(struct virgl_winsys *vws,
+                                          struct virgl_hw_res *res,
+                                          unsigned level, unsigned layer,
+                                          void *winsys_drawable_handle,
+                                          struct pipe_box *sub_box)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+   struct pipe_box box;
+   void *map;
+   uint32_t size;
+   uint32_t offset = 0, valid_stride;
+   if (!res->dt)
+      return;
+
+   memset(&box, 0, sizeof(box));
+
+   if (sub_box) {
+      box = *sub_box;
+      offset = box.y / util_format_get_blockheight(res->format) * res->stride +
+               box.x / util_format_get_blockwidth(res->format) * util_format_get_blocksize(res->format);
+   } else {
+      box.z = layer;
+      box.width = res->width;
+      box.height = res->height;
+      box.depth = 1;
+   }
+
+   size = vtest_get_transfer_size(res, &box, res->stride, 0, level, &valid_stride);
+
+   virgl_vtest_busy_wait(vtws, res->res_handle, VCMD_BUSY_WAIT_FLAG_WAIT);
+   map = vtws->sws->displaytarget_map(vtws->sws, res->dt, 0);
+
+   /* execute a transfer */
+   virgl_vtest_send_transfer_cmd(vtws, VCMD_TRANSFER_GET, res->res_handle,
+                                 level, res->stride, 0, &box, size);
+   virgl_vtest_recv_transfer_get_data(vtws, map + offset, size, valid_stride,
+                                      &box, res->format);
+   vtws->sws->displaytarget_unmap(vtws->sws, res->dt);
+
+   vtws->sws->displaytarget_display(vtws->sws, res->dt, winsys_drawable_handle,
+                                    sub_box);
+}
+
+static void
+virgl_vtest_winsys_destroy(struct virgl_winsys *vws)
+{
+   struct virgl_vtest_winsys *vtws = virgl_vtest_winsys(vws);
+
+   virgl_cache_flush(vtws);
+
+   pipe_mutex_destroy(vtws->mutex);
+   FREE(vtws);
+}
+
+struct virgl_winsys *
+virgl_vtest_winsys_wrap(struct sw_winsys *sws)
+{
+   struct virgl_vtest_winsys *vtws;
+
+   vtws = CALLOC_STRUCT(virgl_vtest_winsys);
+   if (!vtws)
+      return NULL;
+
+   virgl_vtest_connect(vtws);
+   vtws->sws = sws;
+
+   vtws->usecs = 1000000;
+   LIST_INITHEAD(&vtws->delayed);
+   pipe_mutex_init(vtws->mutex);
+
+   vtws->base.destroy = virgl_vtest_winsys_destroy;
+
+   vtws->base.transfer_put = virgl_vtest_transfer_put;
+   vtws->base.transfer_get = virgl_vtest_transfer_get;
+
+   vtws->base.resource_create = virgl_vtest_winsys_resource_cache_create;
+   vtws->base.resource_unref = virgl_vtest_winsys_resource_unref;
+   vtws->base.resource_map = virgl_vtest_resource_map;
+   vtws->base.resource_wait = virgl_vtest_resource_wait;
+   vtws->base.cmd_buf_create = virgl_vtest_cmd_buf_create;
+   vtws->base.cmd_buf_destroy = virgl_vtest_cmd_buf_destroy;
+   vtws->base.submit_cmd = virgl_vtest_winsys_submit_cmd;
+
+   vtws->base.emit_res = virgl_vtest_emit_res;
+   vtws->base.res_is_referenced = virgl_vtest_res_is_ref;
+   vtws->base.get_caps = virgl_vtest_get_caps;
+
+   vtws->base.cs_create_fence = virgl_cs_create_fence;
+   vtws->base.fence_wait = virgl_fence_wait;
+   vtws->base.fence_reference = virgl_fence_reference;
+
+   vtws->base.flush_frontbuffer = virgl_vtest_flush_frontbuffer;
+
+   return &vtws->base;
+}
diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.h b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.h
new file mode 100644
index 00000000000..b4faa70b67e
--- /dev/null
+++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VIRGL_DRM_WINSYS_H
+#define VIRGL_DRM_WINSYS_H
+
+#include <stdint.h>
+#include "pipe/p_compiler.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/list.h"
+#include "os/os_thread.h"
+
+#include "virgl/virgl_winsys.h"
+#include "vtest_protocol.h"
+
+struct pipe_fence_handle;
+struct sw_winsys;
+struct sw_displaytarget;
+
+struct virgl_vtest_winsys {
+   struct virgl_winsys base;
+
+   struct sw_winsys *sws;
+
+   /* fd to remote renderer */
+   int sock_fd;
+
+   struct list_head delayed;
+   int num_delayed;
+   unsigned usecs;
+   pipe_mutex mutex;
+};
+
+struct virgl_hw_res {
+   struct pipe_reference reference;
+   uint32_t res_handle;
+   int num_cs_references;
+
+   void *ptr;
+   int size;
+
+   uint32_t format;
+   uint32_t stride;
+   uint32_t width;
+   uint32_t height;
+
+   struct sw_displaytarget *dt;
+   void *mapped;
+
+   struct list_head head;
+   uint32_t bind;
+   boolean cacheable;
+   int64_t start, end;
+
+};
+
+struct virgl_vtest_cmd_buf {
+   struct virgl_cmd_buf base;
+   uint32_t buf[VIRGL_MAX_CMDBUF_DWORDS];
+   unsigned nres;
+   unsigned cres;
+   struct virgl_winsys *ws;
+   struct virgl_hw_res **res_bo;
+
+   char                        is_handle_added[512];
+   unsigned                    reloc_indices_hashlist[512];
+};
+
+static inline struct virgl_hw_res *
+virgl_hw_res(struct pipe_fence_handle *f)
+{
+   return (struct virgl_hw_res *)f;
+}
+
+static inline struct virgl_vtest_winsys *
+virgl_vtest_winsys(struct virgl_winsys *iws)
+{
+   return (struct virgl_vtest_winsys *)iws;
+}
+
+static inline struct virgl_vtest_cmd_buf *
+virgl_vtest_cmd_buf(struct virgl_cmd_buf *cbuf)
+{
+   return (struct virgl_vtest_cmd_buf *)cbuf;
+}
+
+
+int virgl_vtest_connect(struct virgl_vtest_winsys *vws);
+int virgl_vtest_send_get_caps(struct virgl_vtest_winsys *vws,
+                              struct virgl_drm_caps *caps);
+
+int virgl_vtest_send_resource_create(struct virgl_vtest_winsys *vws,
+                                     uint32_t handle,
+                                     enum pipe_texture_target target,
+                                     uint32_t format,
+                                     uint32_t bind,
+                                     uint32_t width,
+                                     uint32_t height,
+                                     uint32_t depth,
+                                     uint32_t array_size,
+                                     uint32_t last_level,
+                                     uint32_t nr_samples);
+
+int virgl_vtest_send_resource_unref(struct virgl_vtest_winsys *vws,
+                                    uint32_t handle);
+int virgl_vtest_submit_cmd(struct virgl_vtest_winsys *vtws,
+                           struct virgl_vtest_cmd_buf *cbuf);
+
+int virgl_vtest_send_transfer_cmd(struct virgl_vtest_winsys *vws,
+                                  uint32_t vcmd,
+                                  uint32_t handle,
+                                  uint32_t level, uint32_t stride,
+                                  uint32_t layer_stride,
+                                  const struct pipe_box *box,
+                                  uint32_t data_size);
+
+int virgl_vtest_send_transfer_put_data(struct virgl_vtest_winsys *vws,
+                                       void *data,
+                                       uint32_t data_size);
+int virgl_vtest_recv_transfer_get_data(struct virgl_vtest_winsys *vws,
+                                       void *data,
+                                       uint32_t data_size,
+                                       uint32_t stride,
+                                       const struct pipe_box *box,
+                                       uint32_t format);
+
+int virgl_vtest_busy_wait(struct virgl_vtest_winsys *vws, int handle,
+                          int flags);
+#endif
diff --git a/src/gallium/winsys/virgl/vtest/vtest_protocol.h b/src/gallium/winsys/virgl/vtest/vtest_protocol.h
new file mode 100644
index 00000000000..86d197f006c
--- /dev/null
+++ b/src/gallium/winsys/virgl/vtest/vtest_protocol.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2014, 2015 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VTEST_PROTOCOL
+#define VTEST_PROTOCOL
+
+#define VTEST_DEFAULT_SOCKET_NAME "/tmp/.virgl_test"
+
+/* 32-bit length field */
+/* 32-bit cmd field */
+#define VTEST_HDR_SIZE 2
+#define VTEST_CMD_LEN 0 /* length of data */
+#define VTEST_CMD_ID  1
+#define VTEST_CMD_DATA_START 2
+
+/* vtest cmds */
+#define VCMD_GET_CAPS 1
+
+#define VCMD_RESOURCE_CREATE 2
+#define VCMD_RESOURCE_UNREF 3
+
+#define VCMD_TRANSFER_GET 4
+#define VCMD_TRANSFER_PUT 5
+
+#define VCMD_SUBMIT_CMD 6
+
+#define VCMD_RESOURCE_BUSY_WAIT 7
+
+/* pass the process cmd line for debugging */
+#define VCMD_CREATE_RENDERER 8
+/* get caps */
+/* 0 length cmd */
+/* resp VCMD_GET_CAPS + caps */
+
+#define VCMD_RES_CREATE_SIZE 10
+#define VCMD_RES_CREATE_RES_HANDLE 0
+#define VCMD_RES_CREATE_TARGET 1
+#define VCMD_RES_CREATE_FORMAT 2
+#define VCMD_RES_CREATE_BIND 3
+#define VCMD_RES_CREATE_WIDTH 4
+#define VCMD_RES_CREATE_HEIGHT 5
+#define VCMD_RES_CREATE_DEPTH 6
+#define VCMD_RES_CREATE_ARRAY_SIZE 7
+#define VCMD_RES_CREATE_LAST_LEVEL 8
+#define VCMD_RES_CREATE_NR_SAMPLES 9
+
+#define VCMD_RES_UNREF_SIZE 1
+#define VCMD_RES_UNREF_RES_HANDLE 0
+
+#define VCMD_TRANSFER_HDR_SIZE 11
+#define VCMD_TRANSFER_RES_HANDLE 0
+#define VCMD_TRANSFER_LEVEL 1
+#define VCMD_TRANSFER_STRIDE 2
+#define VCMD_TRANSFER_LAYER_STRIDE 3
+#define VCMD_TRANSFER_X 4
+#define VCMD_TRANSFER_Y 5
+#define VCMD_TRANSFER_Z 6
+#define VCMD_TRANSFER_WIDTH 7
+#define VCMD_TRANSFER_HEIGHT 8
+#define VCMD_TRANSFER_DEPTH 9
+#define VCMD_TRANSFER_DATA_SIZE 10
+
+#define VCMD_BUSY_WAIT_FLAG_WAIT 1
+
+#define VCMD_BUSY_WAIT_SIZE 2
+#define VCMD_BUSY_WAIT_HANDLE 0
+#define VCMD_BUSY_WAIT_FLAGS 1
+
+#endif
diff --git a/src/gbm/main/gbm.h b/src/gbm/main/gbm.h
index 2708e50a45c..8db2153e84b 100644
--- a/src/gbm/main/gbm.h
+++ b/src/gbm/main/gbm.h
@@ -35,6 +35,7 @@ extern "C" {
 
 #define __GBM__ 1
 
+#include <stddef.h>
 #include <stdint.h>
 
 /**
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index 961183636a9..0a79fb14633 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -2423,21 +2423,6 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
       const struct gl_context *const ctx = state->ctx;
       unsigned max_loc = qual->location + var->type->uniform_locations() - 1;
 
-      /* ARB_explicit_uniform_location specification states:
-       *
-       *     "The explicitly defined locations and the generated locations
-       *     must be in the range of 0 to MAX_UNIFORM_LOCATIONS minus one."
-       *
-       *     "Valid locations for default-block uniform variable locations
-       *     are in the range of 0 to the implementation-defined maximum
-       *     number of uniform locations."
-       */
-      if (qual->location < 0) {
-         _mesa_glsl_error(loc, state,
-                          "explicit location < 0 for uniform %s", var->name);
-         return;
-      }
-
       if (max_loc >= ctx->Const.MaxUserAssignableUniformLocations) {
          _mesa_glsl_error(loc, state, "location(s) consumed by uniform %s "
                           ">= MAX_UNIFORM_LOCATIONS (%u)", var->name,
@@ -2528,41 +2513,30 @@ validate_explicit_location(const struct ast_type_qualifier *qual,
    } else {
       var->data.explicit_location = true;
 
-      /* This bit of silliness is needed because invalid explicit locations
-       * are supposed to be flagged during linking.  Small negative values
-       * biased by VERT_ATTRIB_GENERIC0 or FRAG_RESULT_DATA0 could alias
-       * built-in values (e.g., -16+VERT_ATTRIB_GENERIC0 = VERT_ATTRIB_POS).
-       * The linker needs to be able to differentiate these cases.  This
-       * ensures that negative values stay negative.
-       */
-      if (qual->location >= 0) {
-         switch (state->stage) {
-         case MESA_SHADER_VERTEX:
-            var->data.location = (var->data.mode == ir_var_shader_in)
-               ? (qual->location + VERT_ATTRIB_GENERIC0)
-               : (qual->location + VARYING_SLOT_VAR0);
-            break;
+      switch (state->stage) {
+      case MESA_SHADER_VERTEX:
+         var->data.location = (var->data.mode == ir_var_shader_in)
+            ? (qual->location + VERT_ATTRIB_GENERIC0)
+            : (qual->location + VARYING_SLOT_VAR0);
+         break;
 
-         case MESA_SHADER_TESS_CTRL:
-         case MESA_SHADER_TESS_EVAL:
-         case MESA_SHADER_GEOMETRY:
-            if (var->data.patch)
-               var->data.location = qual->location + VARYING_SLOT_PATCH0;
-            else
-               var->data.location = qual->location + VARYING_SLOT_VAR0;
-            break;
+      case MESA_SHADER_TESS_CTRL:
+      case MESA_SHADER_TESS_EVAL:
+      case MESA_SHADER_GEOMETRY:
+         if (var->data.patch)
+            var->data.location = qual->location + VARYING_SLOT_PATCH0;
+         else
+            var->data.location = qual->location + VARYING_SLOT_VAR0;
+         break;
 
-         case MESA_SHADER_FRAGMENT:
-            var->data.location = (var->data.mode == ir_var_shader_out)
-               ? (qual->location + FRAG_RESULT_DATA0)
-               : (qual->location + VARYING_SLOT_VAR0);
-            break;
-         case MESA_SHADER_COMPUTE:
-            assert(!"Unexpected shader type");
-            break;
-         }
-      } else {
-         var->data.location = qual->location;
+      case MESA_SHADER_FRAGMENT:
+         var->data.location = (var->data.mode == ir_var_shader_out)
+            ? (qual->location + FRAG_RESULT_DATA0)
+            : (qual->location + VARYING_SLOT_VAR0);
+         break;
+      case MESA_SHADER_COMPUTE:
+         assert(!"Unexpected shader type");
+         break;
       }
 
       if (qual->flags.q.explicit_index) {
diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index aae25f893e8..509a57b8813 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -401,6 +401,12 @@ shader_atomic_counters(const _mesa_glsl_parse_state *state)
 }
 
 static bool
+shader_clock(const _mesa_glsl_parse_state *state)
+{
+   return state->ARB_shader_clock_enable;
+}
+
+static bool
 shader_storage_buffer_object(const _mesa_glsl_parse_state *state)
 {
    return state->has_shader_storage_buffer_objects();
@@ -782,6 +788,11 @@ private:
    ir_function_signature *_memory_barrier(
       builtin_available_predicate avail);
 
+   ir_function_signature *_shader_clock_intrinsic(builtin_available_predicate avail,
+                                                  const glsl_type *type);
+   ir_function_signature *_shader_clock(builtin_available_predicate avail,
+                                        const glsl_type *type);
+
 #undef B0
 #undef B1
 #undef B2
@@ -952,6 +963,11 @@ builtin_builder::create_intrinsics()
    add_function("__intrinsic_memory_barrier",
                 _memory_barrier_intrinsic(shader_image_load_store),
                 NULL);
+
+   add_function("__intrinsic_shader_clock",
+                _shader_clock_intrinsic(shader_clock,
+                                        glsl_type::uvec2_type),
+                NULL);
 }
 
 /**
@@ -2741,6 +2757,11 @@ builtin_builder::create_builtins()
                 _memory_barrier(shader_image_load_store),
                 NULL);
 
+   add_function("clock2x32ARB",
+                _shader_clock(shader_clock,
+                              glsl_type::uvec2_type),
+                NULL);
+
 #undef F
 #undef FI
 #undef FIUD
@@ -5251,6 +5272,28 @@ builtin_builder::_memory_barrier(builtin_available_predicate avail)
    return sig;
 }
 
+ir_function_signature *
+builtin_builder::_shader_clock_intrinsic(builtin_available_predicate avail,
+                                         const glsl_type *type)
+{
+   MAKE_INTRINSIC(type, avail, 0);
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_shader_clock(builtin_available_predicate avail,
+                               const glsl_type *type)
+{
+   MAKE_SIG(type, avail, 0);
+
+   ir_variable *retval = body.make_temp(type, "clock_retval");
+
+   body.emit(call(shader->symbols->get_function("__intrinsic_shader_clock"),
+                  retval, sig->parameters));
+   body.emit(ret(retval));
+   return sig;
+}
+
 /** @} */
 
 /******************************************************************************/
diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index a6ad1050552..c30fb9226e5 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -710,7 +710,7 @@ builtin_variable_generator::generate_constants()
       }
    }
 
-   if (state->is_version(430, 0) || state->ARB_compute_shader_enable) {
+   if (state->is_version(430, 310) || state->ARB_compute_shader_enable) {
       add_const("gl_MaxComputeAtomicCounterBuffers", MAX_COMPUTE_ATOMIC_COUNTER_BUFFERS);
       add_const("gl_MaxComputeAtomicCounters", MAX_COMPUTE_ATOMIC_COUNTERS);
       add_const("gl_MaxComputeImageUniforms", MAX_COMPUTE_IMAGE_UNIFORMS);
@@ -887,16 +887,22 @@ builtin_variable_generator::generate_uniforms()
 void
 builtin_variable_generator::generate_vs_special_vars()
 {
+   ir_variable *var;
+
    if (state->is_version(130, 300))
       add_system_value(SYSTEM_VALUE_VERTEX_ID, int_t, "gl_VertexID");
    if (state->ARB_draw_instanced_enable)
       add_system_value(SYSTEM_VALUE_INSTANCE_ID, int_t, "gl_InstanceIDARB");
    if (state->ARB_draw_instanced_enable || state->is_version(140, 300))
       add_system_value(SYSTEM_VALUE_INSTANCE_ID, int_t, "gl_InstanceID");
-   if (state->AMD_vertex_shader_layer_enable)
-      add_output(VARYING_SLOT_LAYER, int_t, "gl_Layer");
-   if (state->AMD_vertex_shader_viewport_index_enable)
-      add_output(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex");
+   if (state->AMD_vertex_shader_layer_enable) {
+      var = add_output(VARYING_SLOT_LAYER, int_t, "gl_Layer");
+      var->data.interpolation = INTERP_QUALIFIER_FLAT;
+   }
+   if (state->AMD_vertex_shader_viewport_index_enable) {
+      var = add_output(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex");
+      var->data.interpolation = INTERP_QUALIFIER_FLAT;
+   }
    if (compatibility) {
       add_input(VERT_ATTRIB_POS, vec4_t, "gl_Vertex");
       add_input(VERT_ATTRIB_NORMAL, vec3_t, "gl_Normal");
@@ -954,9 +960,14 @@ builtin_variable_generator::generate_tes_special_vars()
 void
 builtin_variable_generator::generate_gs_special_vars()
 {
-   add_output(VARYING_SLOT_LAYER, int_t, "gl_Layer");
-   if (state->is_version(410, 0) || state->ARB_viewport_array_enable)
-      add_output(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex");
+   ir_variable *var;
+
+   var = add_output(VARYING_SLOT_LAYER, int_t, "gl_Layer");
+   var->data.interpolation = INTERP_QUALIFIER_FLAT;
+   if (state->is_version(410, 0) || state->ARB_viewport_array_enable) {
+      var = add_output(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex");
+      var->data.interpolation = INTERP_QUALIFIER_FLAT;
+   }
    if (state->is_version(400, 0) || state->ARB_gpu_shader5_enable)
       add_system_value(SYSTEM_VALUE_INVOCATION_ID, int_t, "gl_InvocationID");
 
@@ -970,7 +981,6 @@ builtin_variable_generator::generate_gs_special_vars()
     * the specific case of gl_PrimitiveIDIn.  So we don't need to treat
     * gl_PrimitiveIDIn as an {ARB,EXT}_geometry_shader4-only variable.
     */
-   ir_variable *var;
    var = add_input(VARYING_SLOT_PRIMITIVE_ID, int_t, "gl_PrimitiveIDIn");
    var->data.interpolation = INTERP_QUALIFIER_FLAT;
    var = add_output(VARYING_SLOT_PRIMITIVE_ID, int_t, "gl_PrimitiveID");
@@ -984,14 +994,15 @@ builtin_variable_generator::generate_gs_special_vars()
 void
 builtin_variable_generator::generate_fs_special_vars()
 {
+   ir_variable *var;
+
    add_input(VARYING_SLOT_POS, vec4_t, "gl_FragCoord");
    add_input(VARYING_SLOT_FACE, bool_t, "gl_FrontFacing");
    if (state->is_version(120, 100))
       add_input(VARYING_SLOT_PNTC, vec2_t, "gl_PointCoord");
 
    if (state->is_version(150, 0)) {
-      ir_variable *var =
-         add_input(VARYING_SLOT_PRIMITIVE_ID, int_t, "gl_PrimitiveID");
+      var = add_input(VARYING_SLOT_PRIMITIVE_ID, int_t, "gl_PrimitiveID");
       var->data.interpolation = INTERP_QUALIFIER_FLAT;
    }
 
@@ -1043,8 +1054,10 @@ builtin_variable_generator::generate_fs_special_vars()
    }
 
    if (state->is_version(430, 0) || state->ARB_fragment_layer_viewport_enable) {
-      add_input(VARYING_SLOT_LAYER, int_t, "gl_Layer");
-      add_input(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex");
+      var = add_input(VARYING_SLOT_LAYER, int_t, "gl_Layer");
+      var->data.interpolation = INTERP_QUALIFIER_FLAT;
+      var = add_input(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex");
+      var->data.interpolation = INTERP_QUALIFIER_FLAT;
    }
 }
 
diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index 1d7a3af8b74..4acccf74065 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2426,6 +2426,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	      if (extensions->ARB_shader_bit_encoding)
 	         add_builtin_define(parser, "GL_ARB_shader_bit_encoding", 1);
 
+	      if (extensions->ARB_shader_clock)
+	         add_builtin_define(parser, "GL_ARB_shader_clock", 1);
+
 	      if (extensions->ARB_uniform_buffer_object)
 	         add_builtin_define(parser, "GL_ARB_uniform_buffer_object", 1);
 
diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy
index 2f2e10d7992..4636435f191 100644
--- a/src/glsl/glsl_parser.yy
+++ b/src/glsl/glsl_parser.yy
@@ -948,7 +948,8 @@ parameter_qualifier:
       if ($2.precision != ast_precision_none)
          _mesa_glsl_error(&@1, state, "duplicate precision qualifier");
 
-      if (!state->has_420pack() && $2.flags.i != 0)
+      if (!(state->has_420pack() || state->is_version(420, 310)) &&
+          $2.flags.i != 0)
          _mesa_glsl_error(&@1, state, "precision qualifiers must come last");
 
       $$ = $2;
@@ -1847,7 +1848,8 @@ type_qualifier:
       if ($2.precision != ast_precision_none)
          _mesa_glsl_error(&@1, state, "duplicate precision qualifier");
 
-      if (!state->has_420pack() && $2.flags.i != 0)
+      if (!(state->has_420pack() || state->is_version(420, 310)) &&
+          $2.flags.i != 0)
          _mesa_glsl_error(&@1, state, "precision qualifiers must come last");
 
       $$ = $2;
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 692b1228ee9..f856a200e09 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -606,6 +606,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(ARB_separate_shader_objects,      true,  false,     dummy_true),
    EXT(ARB_shader_atomic_counters,       true,  false,     ARB_shader_atomic_counters),
    EXT(ARB_shader_bit_encoding,          true,  false,     ARB_shader_bit_encoding),
+   EXT(ARB_shader_clock,                 true,  false,     ARB_shader_clock),
    EXT(ARB_shader_image_load_store,      true,  false,     ARB_shader_image_load_store),
    EXT(ARB_shader_image_size,            true,  false,     ARB_shader_image_size),
    EXT(ARB_shader_precision,             true,  false,     ARB_shader_precision),
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index e8740f9ecb9..b54c5359149 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -519,6 +519,8 @@ struct _mesa_glsl_parse_state {
    bool ARB_shader_atomic_counters_warn;
    bool ARB_shader_bit_encoding_enable;
    bool ARB_shader_bit_encoding_warn;
+   bool ARB_shader_clock_enable;
+   bool ARB_shader_clock_warn;
    bool ARB_shader_image_load_store_enable;
    bool ARB_shader_image_load_store_warn;
    bool ARB_shader_image_size_enable;
diff --git a/src/glsl/link_atomics.cpp b/src/glsl/link_atomics.cpp
index 70ef0e1c891..cdcc06d53e2 100644
--- a/src/glsl/link_atomics.cpp
+++ b/src/glsl/link_atomics.cpp
@@ -198,6 +198,7 @@ link_assign_atomic_counter_resources(struct gl_context *ctx,
                                      struct gl_shader_program *prog)
 {
    unsigned num_buffers;
+   unsigned num_atomic_buffers[MESA_SHADER_STAGES] = {};
    active_atomic_buffer *abs =
       find_active_atomic_counters(ctx, prog, &num_buffers);
 
@@ -242,13 +243,49 @@ link_assign_atomic_counter_resources(struct gl_context *ctx,
       }
 
       /* Assign stage-specific fields. */
-      for (unsigned j = 0; j < MESA_SHADER_STAGES; ++j)
-         mab.StageReferences[j] =
-            (ab.stage_references[j] ? GL_TRUE : GL_FALSE);
+      for (unsigned j = 0; j < MESA_SHADER_STAGES; ++j) {
+         if (ab.stage_references[j]) {
+            mab.StageReferences[j] = GL_TRUE;
+            num_atomic_buffers[j]++;
+         } else {
+            mab.StageReferences[j] = GL_FALSE;
+         }
+      }
 
       i++;
    }
 
+   /* Store a list pointers to atomic buffers per stage and store the index
+    * to the intra-stage buffer list in uniform storage.
+    */
+   for (unsigned j = 0; j < MESA_SHADER_STAGES; ++j) {
+      if (prog->_LinkedShaders[j] && num_atomic_buffers[j] > 0) {
+         prog->_LinkedShaders[j]->NumAtomicBuffers = num_atomic_buffers[j];
+         prog->_LinkedShaders[j]->AtomicBuffers =
+            rzalloc_array(prog, gl_active_atomic_buffer *,
+                          num_atomic_buffers[j]);
+
+         unsigned intra_stage_idx = 0;
+         for (unsigned i = 0; i < num_buffers; i++) {
+            struct gl_active_atomic_buffer *atomic_buffer =
+               &prog->AtomicBuffers[i];
+            if (atomic_buffer->StageReferences[j]) {
+               prog->_LinkedShaders[j]->AtomicBuffers[intra_stage_idx] =
+                  atomic_buffer;
+
+               for (unsigned u = 0; u < atomic_buffer->NumUniforms; u++) {
+                  prog->UniformStorage[atomic_buffer->Uniforms[u]].opaque[j].index =
+                     intra_stage_idx;
+                  prog->UniformStorage[atomic_buffer->Uniforms[u]].opaque[j].active =
+                     true;
+               }
+
+               intra_stage_idx++;
+            }
+         }
+      }
+   }
+
    delete [] abs;
    assert(i == num_buffers);
 }
diff --git a/src/glsl/link_uniform_blocks.cpp b/src/glsl/link_uniform_blocks.cpp
index 5285d8d01e4..d5d30bb0a0d 100644
--- a/src/glsl/link_uniform_blocks.cpp
+++ b/src/glsl/link_uniform_blocks.cpp
@@ -100,7 +100,7 @@ private:
    virtual void visit_field(const glsl_type *type, const char *name,
                             bool row_major, const glsl_type *,
                             const unsigned packing,
-                            bool /* last_field */)
+                            bool last_field)
    {
       assert(this->index < this->num_variables);
 
@@ -131,12 +131,28 @@ private:
       unsigned alignment = 0;
       unsigned size = 0;
 
+      /* From ARB_program_interface_query:
+       *
+       *     "If the final member of an active shader storage block is array
+       *      with no declared size, the minimum buffer size is computed
+       *      assuming the array was declared as an array with one element."
+       *
+       * For that reason, we use the base type of the unsized array to calculate
+       * its size. We don't need to check if the unsized array is the last member
+       * of a shader storage block (that check was already done by the parser).
+       */
+      const glsl_type *type_for_size = type;
+      if (type->is_unsized_array()) {
+         assert(last_field);
+         type_for_size = type->without_array();
+      }
+
       if (packing == GLSL_INTERFACE_PACKING_STD430) {
          alignment = type->std430_base_alignment(v->RowMajor);
-         size = type->std430_size(v->RowMajor);
+         size = type_for_size->std430_size(v->RowMajor);
       } else {
          alignment = type->std140_base_alignment(v->RowMajor);
-         size = type->std140_size(v->RowMajor);
+         size = type_for_size->std140_size(v->RowMajor);
       }
 
       this->offset = glsl_align(this->offset, alignment);
diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 8183e65d2f5..47bb7717f84 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -1010,38 +1010,37 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
    }
 }
 
-/**
- * Scan the program for image uniforms and store image unit access
- * information into the gl_shader data structure.
- */
 static void
-link_set_image_access_qualifiers(struct gl_shader_program *prog)
+link_set_image_access_qualifiers(struct gl_shader_program *prog,
+                                 gl_shader *sh, unsigned shader_stage,
+                                 ir_variable *var, const glsl_type *type,
+                                 char **name, size_t name_length)
 {
-   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
-      gl_shader *sh = prog->_LinkedShaders[i];
-
-      if (sh == NULL)
-	 continue;
+   /* Handle arrays of arrays */
+   if (type->is_array() && type->fields.array->is_array()) {
+      for (unsigned i = 0; i < type->length; i++) {
+	 size_t new_length = name_length;
 
-      foreach_in_list(ir_instruction, node, sh->ir) {
-	 ir_variable *var = node->as_variable();
+	 /* Append the subscript to the current variable name */
+	 ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i);
 
-         if (var && var->data.mode == ir_var_uniform &&
-             var->type->contains_image()) {
-            unsigned id = 0;
-            bool found = prog->UniformHash->get(id, var->name);
-            assert(found);
-            (void) found;
-            const gl_uniform_storage *storage = &prog->UniformStorage[id];
-            const unsigned index = storage->opaque[i].index;
-            const GLenum access = (var->data.image_read_only ? GL_READ_ONLY :
-                                   var->data.image_write_only ? GL_WRITE_ONLY :
-                                   GL_READ_WRITE);
-
-            for (unsigned j = 0; j < MAX2(1, storage->array_elements); ++j)
-               sh->ImageAccess[index + j] = access;
-         }
+         link_set_image_access_qualifiers(prog, sh, shader_stage, var,
+                                          type->fields.array, name,
+                                          new_length);
       }
+   } else {
+      unsigned id = 0;
+      bool found = prog->UniformHash->get(id, *name);
+      assert(found);
+      (void) found;
+      const gl_uniform_storage *storage = &prog->UniformStorage[id];
+      const unsigned index = storage->opaque[shader_stage].index;
+      const GLenum access = (var->data.image_read_only ? GL_READ_ONLY :
+                             var->data.image_write_only ? GL_WRITE_ONLY :
+                             GL_READ_WRITE);
+
+      for (unsigned j = 0; j < MAX2(1, storage->array_elements); ++j)
+         sh->ImageAccess[index + j] = access;
    }
 }
 
@@ -1305,7 +1304,29 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
    prog->NumHiddenUniforms = hidden_uniforms;
    prog->UniformStorage = uniforms;
 
-   link_set_image_access_qualifiers(prog);
+   /**
+    * Scan the program for image uniforms and store image unit access
+    * information into the gl_shader data structure.
+    */
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      gl_shader *sh = prog->_LinkedShaders[i];
+
+      if (sh == NULL)
+	 continue;
+
+      foreach_in_list(ir_instruction, node, sh->ir) {
+	 ir_variable *var = node->as_variable();
+
+         if (var && var->data.mode == ir_var_uniform &&
+             var->type->contains_image()) {
+            char *name_copy = ralloc_strdup(NULL, var->name);
+            link_set_image_access_qualifiers(prog, sh, i, var, var->type,
+                                             &name_copy, strlen(var->name));
+            ralloc_free(name_copy);
+         }
+      }
+   }
+
    link_set_uniform_initializers(prog, boolean_true);
 
    return;
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 07ea0e0c7e5..c35d87acea6 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -2282,6 +2282,22 @@ resize_tes_inputs(struct gl_context *ctx,
    foreach_in_list(ir_instruction, ir, tes->ir) {
       ir->accept(&input_resize_visitor);
    }
+
+   if (tcs) {
+      /* Convert the gl_PatchVerticesIn system value into a constant, since
+       * the value is known at this point.
+       */
+      foreach_in_list(ir_instruction, ir, tes->ir) {
+         ir_variable *var = ir->as_variable();
+         if (var && var->data.mode == ir_var_system_value &&
+             var->data.location == SYSTEM_VALUE_VERTICES_IN) {
+            void *mem_ctx = ralloc_parent(var);
+            var->data.mode = ir_var_auto;
+            var->data.location = 0;
+            var->constant_value = new(mem_ctx) ir_constant(num_vertices);
+         }
+      }
+   }
 }
 
 /**
@@ -3137,7 +3153,8 @@ should_add_buffer_variable(struct gl_shader_program *shProg,
                            GLenum type, const char *name)
 {
    bool found_interface = false;
-   const char *block_name = NULL;
+   unsigned block_name_len = 0;
+   const char *block_name_dot = strchr(name, '.');
 
    /* These rules only apply to buffer variables. So we return
     * true for the rest of types.
@@ -3146,8 +3163,28 @@ should_add_buffer_variable(struct gl_shader_program *shProg,
       return true;
 
    for (unsigned i = 0; i < shProg->NumBufferInterfaceBlocks; i++) {
-      block_name = shProg->BufferInterfaceBlocks[i].Name;
-      if (strncmp(block_name, name, strlen(block_name)) == 0) {
+      const char *block_name = shProg->BufferInterfaceBlocks[i].Name;
+      block_name_len = strlen(block_name);
+
+      const char *block_square_bracket = strchr(block_name, '[');
+      if (block_square_bracket) {
+         /* The block is part of an array of named interfaces,
+          * for the name comparison we ignore the "[x]" part.
+          */
+         block_name_len -= strlen(block_square_bracket);
+      }
+
+      if (block_name_dot) {
+         /* Check if the variable name starts with the interface
+          * name. The interface name (if present) should have the
+          * length than the interface block name we are comparing to.
+          */
+         unsigned len = strlen(name) - strlen(block_name_dot);
+         if (len != block_name_len)
+            continue;
+      }
+
+      if (strncmp(block_name, name, block_name_len) == 0) {
          found_interface = true;
          break;
       }
@@ -3157,7 +3194,7 @@ should_add_buffer_variable(struct gl_shader_program *shProg,
     * including the dot that follows it.
     */
    if (found_interface)
-      name = name + strlen(block_name) + 1;
+      name = name + block_name_len + 1;
 
    /* From: ARB_program_interface_query extension:
     *
@@ -3166,14 +3203,14 @@ should_add_buffer_variable(struct gl_shader_program *shProg,
     *   of its type.  For arrays of aggregate types, the enumeration rules are
     *   applied recursively for the single enumerated array element.
     */
-   const char *first_dot = strchr(name, '.');
+   const char *struct_first_dot = strchr(name, '.');
    const char *first_square_bracket = strchr(name, '[');
 
    /* The buffer variable is on top level and it is not an array */
    if (!first_square_bracket) {
       return true;
    /* The shader storage block member is a struct, then generate the entry */
-   } else if (first_dot && first_dot < first_square_bracket) {
+   } else if (struct_first_dot && struct_first_dot < first_square_bracket) {
       return true;
    } else {
       /* Shader storage block member is an array, only generate an entry for the
@@ -3349,6 +3386,12 @@ add_interface_variables(struct gl_shader_program *shProg,
       if (strncmp(var->name, "packed:", 7) == 0)
          continue;
 
+      /* Skip fragdata arrays, these are handled separately
+       * by add_fragdata_arrays.
+       */
+      if (strncmp(var->name, "gl_out_FragData", 15) == 0)
+         continue;
+
       if (!add_program_resource(shProg, programInterface, var,
                                 build_stageref(shProg, var->name,
                                                var->data.mode) | mask))
@@ -3388,6 +3431,26 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage)
    return true;
 }
 
+static bool
+add_fragdata_arrays(struct gl_shader_program *shProg)
+{
+   struct gl_shader *sh = shProg->_LinkedShaders[MESA_SHADER_FRAGMENT];
+
+   if (!sh || !sh->fragdata_arrays)
+      return true;
+
+   foreach_in_list(ir_instruction, node, sh->fragdata_arrays) {
+      ir_variable *var = node->as_variable();
+      if (var) {
+         assert(var->data.mode == ir_var_shader_out);
+         if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, var,
+                                   1 << MESA_SHADER_FRAGMENT))
+            return false;
+      }
+   }
+   return true;
+}
+
 static char*
 get_top_level_name(const char *name)
 {
@@ -3467,80 +3530,78 @@ is_top_level_shader_storage_block_member(const char* name,
    return result;
 }
 
-static void
-calculate_array_size(struct gl_shader_program *shProg,
-                     struct gl_uniform_storage *uni)
+static int
+get_array_size(struct gl_uniform_storage *uni, const glsl_struct_field *field,
+               char *interface_name, char *var_name)
 {
-   int block_index = uni->block_index;
-   int array_size = -1;
-   char *var_name = get_top_level_name(uni->name);
-   char *interface_name =
-      get_top_level_name(shProg->BufferInterfaceBlocks[block_index].Name);
-
-   if (strcmp(var_name, interface_name) == 0) {
-      /* Deal with instanced array of SSBOs */
-      char *temp_name = get_var_name(uni->name);
-      free(var_name);
-      var_name = get_top_level_name(temp_name);
-      free(temp_name);
-   }
-
-   for (unsigned i = 0; i < shProg->NumShaders; i++) {
-      if (shProg->Shaders[i] == NULL)
-         continue;
-
-      const gl_shader *stage = shProg->Shaders[i];
-      foreach_in_list(ir_instruction, node, stage->ir) {
-         ir_variable *var = node->as_variable();
-         if (!var || !var->get_interface_type() ||
-             var->data.mode != ir_var_shader_storage)
-            continue;
-
-         const glsl_type *interface = var->get_interface_type();
-
-         if (strcmp(interface_name, interface->name) != 0)
-            continue;
-
-         for (unsigned i = 0; i < interface->length; i++) {
-            const glsl_struct_field *field = &interface->fields.structure[i];
-            if (strcmp(field->name, var_name) != 0)
-               continue;
-            /* From GL_ARB_program_interface_query spec:
-             *
-             * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer
-             * identifying the number of active array elements of the top-level
-             * shader storage block member containing to the active variable is
-             * written to <params>.  If the top-level block member is not
-             * declared as an array, the value one is written to <params>.  If
-             * the top-level block member is an array with no declared size,
-             * the value zero is written to <params>.
-             */
-            if (is_top_level_shader_storage_block_member(uni->name,
-                                                         interface_name,
-                                                         var_name))
-               array_size = 1;
-            else if (field->type->is_unsized_array())
-               array_size = 0;
-            else if (field->type->is_array())
-               array_size = field->type->length;
-            else
-               array_size = 1;
+   /* From GL_ARB_program_interface_query spec:
+    *
+    * "For the property TOP_LEVEL_ARRAY_SIZE, a single integer
+    * identifying the number of active array elements of the top-level
+    * shader storage block member containing to the active variable is
+    * written to <params>.  If the top-level block member is not
+    * declared as an array, the value one is written to <params>.  If
+    * the top-level block member is an array with no declared size,
+    * the value zero is written to <params>.
+    */
+   if (is_top_level_shader_storage_block_member(uni->name,
+                                                interface_name,
+                                                var_name))
+      return  1;
+   else if (field->type->is_unsized_array())
+      return 0;
+   else if (field->type->is_array())
+      return field->type->length;
+
+   return 1;
+}
 
-            goto found_top_level_array_size;
-         }
+static int
+get_array_stride(struct gl_uniform_storage *uni, const glsl_type *interface,
+                 const glsl_struct_field *field, char *interface_name,
+                 char *var_name)
+{
+   /* From GL_ARB_program_interface_query:
+    *
+    * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer
+    *  identifying the stride between array elements of the top-level
+    *  shader storage block member containing the active variable is
+    *  written to <params>.  For top-level block members declared as
+    *  arrays, the value written is the difference, in basic machine
+    *  units, between the offsets of the active variable for
+    *  consecutive elements in the top-level array.  For top-level
+    *  block members not declared as an array, zero is written to
+    *  <params>."
+    */
+   if (field->type->is_array()) {
+      const enum glsl_matrix_layout matrix_layout =
+         glsl_matrix_layout(field->matrix_layout);
+      bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR;
+      const glsl_type *array_type = field->type->fields.array;
+
+      if (is_top_level_shader_storage_block_member(uni->name,
+                                                   interface_name,
+                                                   var_name))
+         return 0;
+
+      if (interface->interface_packing != GLSL_INTERFACE_PACKING_STD430) {
+         if (array_type->is_record() || array_type->is_array())
+            return glsl_align(array_type->std140_size(row_major), 16);
+         else
+            return MAX2(array_type->std140_base_alignment(row_major), 16);
+      } else {
+         return array_type->std430_array_stride(row_major);
       }
    }
-found_top_level_array_size:
-   free(interface_name);
-   free(var_name);
-   uni->top_level_array_size = array_size;
+   return 0;
 }
 
 static void
-calculate_array_stride(struct gl_shader_program *shProg,
-                       struct gl_uniform_storage *uni)
+calculate_array_size_and_stride(struct gl_shader_program *shProg,
+                                struct gl_uniform_storage *uni)
 {
    int block_index = uni->block_index;
+   int array_size = -1;
    int array_stride = -1;
    char *var_name = get_top_level_name(uni->name);
    char *interface_name =
@@ -3549,9 +3610,17 @@ calculate_array_stride(struct gl_shader_program *shProg,
    if (strcmp(var_name, interface_name) == 0) {
       /* Deal with instanced array of SSBOs */
       char *temp_name = get_var_name(uni->name);
+      if (!temp_name) {
+         linker_error(shProg, "Out of memory during linking.\n");
+         goto write_top_level_array_size_and_stride;
+      }
       free(var_name);
       var_name = get_top_level_name(temp_name);
       free(temp_name);
+      if (!var_name) {
+         linker_error(shProg, "Out of memory during linking.\n");
+         goto write_top_level_array_size_and_stride;
+      }
    }
 
    for (unsigned i = 0; i < shProg->NumShaders; i++) {
@@ -3567,61 +3636,26 @@ calculate_array_stride(struct gl_shader_program *shProg,
 
          const glsl_type *interface = var->get_interface_type();
 
-         if (strcmp(interface_name, interface->name) != 0) {
+         if (strcmp(interface_name, interface->name) != 0)
             continue;
-         }
 
          for (unsigned i = 0; i < interface->length; i++) {
             const glsl_struct_field *field = &interface->fields.structure[i];
             if (strcmp(field->name, var_name) != 0)
                continue;
-            /* From GL_ARB_program_interface_query:
-             *
-             * "For the property TOP_LEVEL_ARRAY_STRIDE, a single integer
-             *  identifying the stride between array elements of the top-level
-             *  shader storage block member containing the active variable is
-             *  written to <params>.  For top-level block members declared as
-             *  arrays, the value written is the difference, in basic machine
-             *  units, between the offsets of the active variable for
-             *  consecutive elements in the top-level array.  For top-level
-             *  block members not declared as an array, zero is written to
-             *  <params>."
-             */
-            if (field->type->is_array()) {
-               const enum glsl_matrix_layout matrix_layout =
-                  glsl_matrix_layout(field->matrix_layout);
-               bool row_major = matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR;
-               const glsl_type *array_type = field->type->fields.array;
-
-               if (is_top_level_shader_storage_block_member(uni->name,
-                                                            interface_name,
-                                                            var_name)) {
-                  array_stride = 0;
-                  goto found_top_level_array_stride;
-               }
-               if (interface->interface_packing != GLSL_INTERFACE_PACKING_STD430) {
-                  if (array_type->is_record() || array_type->is_array()) {
-                     array_stride = array_type->std140_size(row_major);
-                     array_stride = glsl_align(array_stride, 16);
-                  } else {
-                     unsigned element_base_align = 0;
-                     element_base_align = array_type->std140_base_alignment(row_major);
-                     array_stride = MAX2(element_base_align, 16);
-                  }
-               } else {
-                  array_stride = array_type->std430_array_stride(row_major);
-               }
-            } else {
-               array_stride = 0;
-            }
-            goto found_top_level_array_stride;
+
+            array_stride = get_array_stride(uni, interface, field,
+                                            interface_name, var_name);
+            array_size = get_array_size(uni, field, interface_name, var_name);
+            goto write_top_level_array_size_and_stride;
          }
       }
    }
-found_top_level_array_stride:
+write_top_level_array_size_and_stride:
    free(interface_name);
    free(var_name);
    uni->top_level_array_stride = array_stride;
+   uni->top_level_array_size = array_size;
 }
 
 /**
@@ -3664,6 +3698,9 @@ build_program_resource_list(struct gl_shader_program *shProg)
          return;
    }
 
+   if (!add_fragdata_arrays(shProg))
+      return;
+
    /* Add inputs and outputs to the resource list. */
    if (!add_interface_variables(shProg, shProg->_LinkedShaders[input_stage]->ir,
                                 GL_PROGRAM_INPUT))
@@ -3709,8 +3746,7 @@ build_program_resource_list(struct gl_shader_program *shProg)
          continue;
 
       if (is_shader_storage) {
-         calculate_array_size(shProg, &shProg->UniformStorage[i]);
-         calculate_array_stride(shProg, &shProg->UniformStorage[i]);
+         calculate_array_size_and_stride(shProg, &shProg->UniformStorage[i]);
       }
 
       if (!add_program_resource(shProg, type,
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 129dd02781b..ba14bbbeb6a 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -306,6 +306,7 @@ nir_visitor::visit(ir_variable *ir)
    var->data.read_only = ir->data.read_only;
    var->data.centroid = ir->data.centroid;
    var->data.sample = ir->data.sample;
+   var->data.patch = ir->data.patch;
    var->data.invariant = ir->data.invariant;
    var->data.location = ir->data.location;
 
@@ -396,8 +397,6 @@ nir_visitor::visit(ir_variable *ir)
    var->data.index = ir->data.index;
    var->data.descriptor_set = 0;
    var->data.binding = ir->data.binding;
-   /* XXX Get rid of buffer_index */
-   var->data.atomic.buffer_index = ir->data.binding;
    var->data.atomic.offset = ir->data.atomic.offset;
    var->data.image.read_only = ir->data.image_read_only;
    var->data.image.write_only = ir->data.image_write_only;
@@ -722,6 +721,8 @@ nir_visitor::visit(ir_call *ir)
          op = nir_intrinsic_ssbo_atomic_exchange;
       } else if (strcmp(ir->callee_name(), "__intrinsic_ssbo_atomic_comp_swap_internal") == 0) {
          op = nir_intrinsic_ssbo_atomic_comp_swap;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_shader_clock") == 0) {
+         op = nir_intrinsic_shader_clock;
       } else {
          unreachable("not reached");
       }
@@ -826,6 +827,10 @@ nir_visitor::visit(ir_call *ir)
       case nir_intrinsic_memory_barrier:
          nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
          break;
+      case nir_intrinsic_shader_clock:
+         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+         nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
+         break;
       case nir_intrinsic_store_ssbo: {
          exec_node *param = ir->actual_parameters.get_head();
          ir_rvalue *block = ((ir_instruction *)param)->as_rvalue();
diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index 793bdafb54b..5f03095d673 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -1557,12 +1557,14 @@ nir_intrinsic_from_system_value(gl_system_value val)
       return nir_intrinsic_load_num_work_groups;
    case SYSTEM_VALUE_PRIMITIVE_ID:
       return nir_intrinsic_load_primitive_id;
-   /* FINISHME: Add tessellation intrinsics.
    case SYSTEM_VALUE_TESS_COORD:
-   case SYSTEM_VALUE_VERTICES_IN:
+      return nir_intrinsic_load_tess_coord;
    case SYSTEM_VALUE_TESS_LEVEL_OUTER:
+      return nir_intrinsic_load_tess_level_outer;
    case SYSTEM_VALUE_TESS_LEVEL_INNER:
-    */
+      return nir_intrinsic_load_tess_level_inner;
+   case SYSTEM_VALUE_VERTICES_IN:
+      return nir_intrinsic_load_patch_vertices_in;
    default:
       unreachable("system value does not directly correspond to intrinsic");
    }
@@ -1598,13 +1600,14 @@ nir_system_value_from_intrinsic(nir_intrinsic_op intrin)
       return SYSTEM_VALUE_WORK_GROUP_ID;
    case nir_intrinsic_load_primitive_id:
       return SYSTEM_VALUE_PRIMITIVE_ID;
-   /* FINISHME: Add tessellation intrinsics.
+   case nir_intrinsic_load_tess_coord:
       return SYSTEM_VALUE_TESS_COORD;
-      return SYSTEM_VALUE_VERTICES_IN;
-      return SYSTEM_VALUE_PRIMITIVE_ID;
+   case nir_intrinsic_load_tess_level_outer:
       return SYSTEM_VALUE_TESS_LEVEL_OUTER;
+   case nir_intrinsic_load_tess_level_inner:
       return SYSTEM_VALUE_TESS_LEVEL_INNER;
-    */
+   case nir_intrinsic_load_patch_vertices_in:
+      return SYSTEM_VALUE_VERTICES_IN;
    default:
       unreachable("intrinsic doesn't produce a system value");
    }
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 229d534bf3d..9b278d6a767 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -171,6 +171,7 @@ typedef struct {
       unsigned read_only:1;
       unsigned centroid:1;
       unsigned sample:1;
+      unsigned patch:1;
       unsigned invariant:1;
 
       /**
@@ -313,7 +314,6 @@ typedef struct {
        * Location an atomic counter is stored at.
        */
       struct {
-         unsigned buffer_index;
          unsigned offset;
       } atomic;
 
@@ -2016,7 +2016,8 @@ void nir_lower_clip_fs(nir_shader *shader, unsigned ucp_enables);
 
 void nir_lower_two_sided_color(nir_shader *shader);
 
-void nir_lower_atomics(nir_shader *shader);
+void nir_lower_atomics(nir_shader *shader,
+                       const struct gl_shader_program *shader_program);
 void nir_lower_to_source_mods(nir_shader *shader);
 
 bool nir_lower_gs_intrinsics(nir_shader *shader);
diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h
index b2ceff566cf..9fd91de157f 100644
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -83,6 +83,14 @@ BARRIER(discard)
  */
 BARRIER(memory_barrier)
 
+/*
+ * Shader clock intrinsic with semantics analogous to the clock2x32ARB()
+ * GLSL intrinsic.
+ * The latter can be used as code motion barrier, which is currently not
+ * feasible with NIR.
+ */
+INTRINSIC(shader_clock, 0, ARR(), true, 1, 0, 0, NIR_INTRINSIC_CAN_ELIMINATE)
+
 /** A conditional discard, with a single boolean source. */
 INTRINSIC(discard_if, 1, ARR(1), false, 0, 0, 0, 0)
 
@@ -217,6 +225,10 @@ SYSTEM_VALUE(sample_pos, 2, 0)
 SYSTEM_VALUE(sample_mask_in, 1, 0)
 SYSTEM_VALUE(primitive_id, 1, 0)
 SYSTEM_VALUE(invocation_id, 1, 0)
+SYSTEM_VALUE(tess_coord, 3, 0)
+SYSTEM_VALUE(tess_level_outer, 4, 0)
+SYSTEM_VALUE(tess_level_inner, 2, 0)
+SYSTEM_VALUE(patch_vertices_in, 1, 0)
 SYSTEM_VALUE(local_invocation_id, 3, 0)
 SYSTEM_VALUE(work_group_id, 3, 0)
 SYSTEM_VALUE(user_clip_plane, 4, 1) /* const_index[0] is user_clip_plane[idx] */
diff --git a/src/glsl/nir/nir_lower_atomics.c b/src/glsl/nir/nir_lower_atomics.c
index 46e137652a1..40ca3de96cf 100644
--- a/src/glsl/nir/nir_lower_atomics.c
+++ b/src/glsl/nir/nir_lower_atomics.c
@@ -25,17 +25,24 @@
  *
  */
 
+#include "ir_uniform.h"
 #include "nir.h"
 #include "main/config.h"
 #include <assert.h>
 
+typedef struct {
+   const struct gl_shader_program *shader_program;
+   nir_shader   *shader;
+} lower_atomic_state;
+
 /*
  * replace atomic counter intrinsics that use a variable with intrinsics
  * that directly store the buffer index and byte offset
  */
 
 static void
-lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl)
+lower_instr(nir_intrinsic_instr *instr,
+            lower_atomic_state *state)
 {
    nir_intrinsic_op op;
    switch (instr->intrinsic) {
@@ -60,10 +67,11 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl)
       return; /* atomics passed as function arguments can't be lowered */
 
    void *mem_ctx = ralloc_parent(instr);
+   unsigned uniform_loc = instr->variables[0]->var->data.location;
 
    nir_intrinsic_instr *new_instr = nir_intrinsic_instr_create(mem_ctx, op);
    new_instr->const_index[0] =
-      (int) instr->variables[0]->var->data.atomic.buffer_index;
+      state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index;
 
    nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
    offset_const->value.u[0] = instr->variables[0]->var->data.atomic.offset;
@@ -132,18 +140,25 @@ lower_block(nir_block *block, void *state)
 {
    nir_foreach_instr_safe(block, instr) {
       if (instr->type == nir_instr_type_intrinsic)
-         lower_instr(nir_instr_as_intrinsic(instr), state);
+         lower_instr(nir_instr_as_intrinsic(instr),
+                     (lower_atomic_state *) state);
    }
 
    return true;
 }
 
 void
-nir_lower_atomics(nir_shader *shader)
+nir_lower_atomics(nir_shader *shader,
+                  const struct gl_shader_program *shader_program)
 {
+   lower_atomic_state state = {
+      .shader = shader,
+      .shader_program = shader_program,
+   };
+
    nir_foreach_overload(shader, overload) {
       if (overload->impl) {
-         nir_foreach_block(overload->impl, lower_block, overload->impl);
+         nir_foreach_block(overload->impl, lower_block, (void *) &state);
          nir_metadata_preserve(overload->impl, nir_metadata_block_index |
                                                nir_metadata_dominance);
       }
diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
index f2d584fe484..3c0f1da94af 100644
--- a/src/glsl/nir/nir_opcodes.py
+++ b/src/glsl/nir/nir_opcodes.py
@@ -468,6 +468,51 @@ binop("fmax", tfloat, "", "fmaxf(src0, src1)")
 binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
 binop("umax", tunsigned, commutative + associative, "src1 > src0 ? src1 : src0")
 
+# Saturated vector add for 4 8bit ints.
+binop("usadd_4x8", tint, commutative + associative, """
+dst = 0;
+for (int i = 0; i < 32; i += 8) {
+   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
+}
+""")
+
+# Saturated vector subtract for 4 8bit ints.
+binop("ussub_4x8", tint, "", """
+dst = 0;
+for (int i = 0; i < 32; i += 8) {
+   int src0_chan = (src0 >> i) & 0xff;
+   int src1_chan = (src1 >> i) & 0xff;
+   if (src0_chan > src1_chan)
+      dst |= (src0_chan - src1_chan) << i;
+}
+""")
+
+# vector min for 4 8bit ints.
+binop("umin_4x8", tint, commutative + associative, """
+dst = 0;
+for (int i = 0; i < 32; i += 8) {
+   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
+}
+""")
+
+# vector max for 4 8bit ints.
+binop("umax_4x8", tint, commutative + associative, """
+dst = 0;
+for (int i = 0; i < 32; i += 8) {
+   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
+}
+""")
+
+# unorm multiply: (a * b) / 255.
+binop("umul_unorm_4x8", tint, commutative + associative, """
+dst = 0;
+for (int i = 0; i < 32; i += 8) {
+   int src0_chan = (src0 >> i) & 0xff;
+   int src1_chan = (src1 >> i) & 0xff;
+   dst |= ((src0_chan * src1_chan) / 255) << i;
+}
+""")
+
 binop("fpow", tfloat, "", "powf(src0, src1)")
 
 binop_horiz("pack_half_2x16_split", 1, tunsigned, 1, tfloat, 1, tfloat,
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index cafbd6d66a5..30ede52b146 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -56,12 +56,16 @@ optimizations = [
    (('iabs', ('ineg', a)), ('iabs', a)),
    (('fadd', a, 0.0), a),
    (('iadd', a, 0), a),
+   (('usadd_4x8', a, 0), a),
+   (('usadd_4x8', a, ~0), ~0),
    (('fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
    (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
    (('fadd', ('fneg', a), a), 0.0),
    (('iadd', ('ineg', a), a), 0),
    (('fmul', a, 0.0), 0.0),
    (('imul', a, 0), 0),
+   (('umul_unorm_4x8', a, 0), 0),
+   (('umul_unorm_4x8', a, ~0), a),
    (('fmul', a, 1.0), a),
    (('imul', a, 1), a),
    (('fmul', a, -1.0), ('fneg', a)),
@@ -202,6 +206,8 @@ optimizations = [
    # Subtracts
    (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
    (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
+   (('ussub_4x8', a, 0), a),
+   (('ussub_4x8', a, ~0), 0),
    (('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'),
    (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
    (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c
index 09663996869..30220c5e48d 100644
--- a/src/glsl/nir/nir_print.c
+++ b/src/glsl/nir/nir_print.c
@@ -228,12 +228,13 @@ print_var_decl(nir_variable *var, print_state *state)
 
    const char *const cent = (var->data.centroid) ? "centroid " : "";
    const char *const samp = (var->data.sample) ? "sample " : "";
+   const char *const patch = (var->data.patch) ? "patch " : "";
    const char *const inv = (var->data.invariant) ? "invariant " : "";
    const char *const mode[] = { "shader_in ", "shader_out ", "", "",
                                 "uniform ", "shader_storage", "system " };
 
-   fprintf(fp, "%s%s%s%s%s ",
-      cent, samp, inv, mode[var->data.mode],
+   fprintf(fp, "%s%s%s%s%s%s ",
+      cent, samp, patch, inv, mode[var->data.mode],
 	  glsl_interp_qualifier_name(var->data.interpolation));
 
    glsl_print_type(var->type, fp);
diff --git a/src/glsl/opt_dead_builtin_varyings.cpp b/src/glsl/opt_dead_builtin_varyings.cpp
index 31719d20c05..68b70eedf92 100644
--- a/src/glsl/opt_dead_builtin_varyings.cpp
+++ b/src/glsl/opt_dead_builtin_varyings.cpp
@@ -269,14 +269,14 @@ public:
  */
 class replace_varyings_visitor : public ir_rvalue_visitor {
 public:
-   replace_varyings_visitor(exec_list *ir,
+   replace_varyings_visitor(struct gl_shader *sha,
                             const varying_info_visitor *info,
                             unsigned external_texcoord_usage,
                             unsigned external_color_usage,
                             bool external_has_fog)
-      : info(info), new_fog(NULL)
+      : shader(sha), info(info), new_fog(NULL)
    {
-      void *const ctx = ir;
+      void *const ctx = shader->ir;
 
       memset(this->new_fragdata, 0, sizeof(this->new_fragdata));
       memset(this->new_texcoord, 0, sizeof(this->new_texcoord));
@@ -293,14 +293,16 @@ public:
        * occurrences of gl_TexCoord will be replaced with.
        */
       if (info->lower_texcoord_array) {
-         prepare_array(ir, this->new_texcoord, ARRAY_SIZE(this->new_texcoord),
+         prepare_array(shader->ir, this->new_texcoord,
+                       ARRAY_SIZE(this->new_texcoord),
                        VARYING_SLOT_TEX0, "TexCoord", mode_str,
                        info->texcoord_usage, external_texcoord_usage);
       }
 
       /* Handle gl_FragData in the same way like gl_TexCoord. */
       if (info->lower_fragdata_array) {
-         prepare_array(ir, this->new_fragdata, ARRAY_SIZE(this->new_fragdata),
+         prepare_array(shader->ir, this->new_fragdata,
+                       ARRAY_SIZE(this->new_fragdata),
                        FRAG_RESULT_DATA0, "FragData", mode_str,
                        info->fragdata_usage, (1 << MAX_DRAW_BUFFERS) - 1);
       }
@@ -340,7 +342,7 @@ public:
       }
 
       /* Now do the replacing. */
-      visit_list_elements(this, ir);
+      visit_list_elements(this, shader->ir);
    }
 
    void prepare_array(exec_list *ir,
@@ -389,6 +391,13 @@ public:
       /* Remove the gl_FragData array. */
       if (this->info->lower_fragdata_array &&
           var == this->info->fragdata_array) {
+
+         /* Clone variable for program resource list before it is removed. */
+         if (!shader->fragdata_arrays)
+            shader->fragdata_arrays = new (shader) exec_list;
+
+         shader->fragdata_arrays->push_tail(var->clone(shader, NULL));
+
          var->remove();
       }
 
@@ -487,6 +496,7 @@ public:
    }
 
 private:
+   struct gl_shader *shader;
    const varying_info_visitor *info;
    ir_variable *new_fragdata[MAX_DRAW_BUFFERS];
    ir_variable *new_texcoord[MAX_TEXTURE_COORD_UNITS];
@@ -498,20 +508,20 @@ private:
 } /* anonymous namespace */
 
 static void
-lower_texcoord_array(exec_list *ir, const varying_info_visitor *info)
+lower_texcoord_array(struct gl_shader *shader, const varying_info_visitor *info)
 {
-   replace_varyings_visitor(ir, info,
+   replace_varyings_visitor(shader, info,
                             (1 << MAX_TEXTURE_COORD_UNITS) - 1,
                             1 | 2, true);
 }
 
 static void
-lower_fragdata_array(exec_list *ir)
+lower_fragdata_array(struct gl_shader *shader)
 {
    varying_info_visitor info(ir_var_shader_out, true);
-   info.get(ir, 0, NULL);
+   info.get(shader->ir, 0, NULL);
 
-   replace_varyings_visitor(ir, &info, 0, 0, 0);
+   replace_varyings_visitor(shader, &info, 0, 0, 0);
 }
 
 
@@ -523,7 +533,7 @@ do_dead_builtin_varyings(struct gl_context *ctx,
 {
    /* Lower the gl_FragData array to separate variables. */
    if (consumer && consumer->Stage == MESA_SHADER_FRAGMENT) {
-      lower_fragdata_array(consumer->ir);
+      lower_fragdata_array(consumer);
    }
 
    /* Lowering of built-in varyings has no effect with the core context and
@@ -544,7 +554,7 @@ do_dead_builtin_varyings(struct gl_context *ctx,
       if (!consumer) {
          /* At least eliminate unused gl_TexCoord elements. */
          if (producer_info.lower_texcoord_array) {
-            lower_texcoord_array(producer->ir, &producer_info);
+            lower_texcoord_array(producer, &producer_info);
          }
          return;
       }
@@ -556,7 +566,7 @@ do_dead_builtin_varyings(struct gl_context *ctx,
       if (!producer) {
          /* At least eliminate unused gl_TexCoord elements. */
          if (consumer_info.lower_texcoord_array) {
-            lower_texcoord_array(consumer->ir, &consumer_info);
+            lower_texcoord_array(consumer, &consumer_info);
          }
          return;
       }
@@ -566,7 +576,7 @@ do_dead_builtin_varyings(struct gl_context *ctx,
    if (producer_info.lower_texcoord_array ||
        producer_info.color_usage ||
        producer_info.has_fog) {
-      replace_varyings_visitor(producer->ir,
+      replace_varyings_visitor(producer,
                                &producer_info,
                                consumer_info.texcoord_usage,
                                consumer_info.color_usage,
@@ -587,7 +597,7 @@ do_dead_builtin_varyings(struct gl_context *ctx,
    if (consumer_info.lower_texcoord_array ||
        consumer_info.color_usage ||
        consumer_info.has_fog) {
-      replace_varyings_visitor(consumer->ir,
+      replace_varyings_visitor(consumer,
                                &consumer_info,
                                producer_info.texcoord_usage,
                                producer_info.color_usage,
diff --git a/src/glx/drisw_glx.c b/src/glx/drisw_glx.c
index 749ceb08aac..76cc3214b7b 100644
--- a/src/glx/drisw_glx.c
+++ b/src/glx/drisw_glx.c
@@ -177,9 +177,9 @@ swrastPutImage(__DRIdrawable * draw, int op,
 }
 
 static void
-swrastGetImage(__DRIdrawable * read,
-               int x, int y, int w, int h,
-               char *data, void *loaderPrivate)
+swrastGetImage2(__DRIdrawable * read,
+                int x, int y, int w, int h, int stride,
+                char *data, void *loaderPrivate)
 {
    struct drisw_drawable *prp = loaderPrivate;
    __GLXDRIdrawable *pread = &(prp->base);
@@ -193,20 +193,29 @@ swrastGetImage(__DRIdrawable * read,
    ximage->data = data;
    ximage->width = w;
    ximage->height = h;
-   ximage->bytes_per_line = bytes_per_line(w * ximage->bits_per_pixel, 32);
+   ximage->bytes_per_line = stride ? stride : bytes_per_line(w * ximage->bits_per_pixel, 32);
 
    XGetSubImage(dpy, readable, x, y, w, h, ~0L, ZPixmap, ximage, 0, 0);
 
    ximage->data = NULL;
 }
 
+static void
+swrastGetImage(__DRIdrawable * read,
+               int x, int y, int w, int h,
+               char *data, void *loaderPrivate)
+{
+   swrastGetImage2(read, x, y, w, h, 0, data, loaderPrivate);
+}
+
 static const __DRIswrastLoaderExtension swrastLoaderExtension = {
-   .base = {__DRI_SWRAST_LOADER, 2 },
+   .base = {__DRI_SWRAST_LOADER, 3 },
 
    .getDrawableInfo     = swrastGetDrawableInfo,
    .putImage            = swrastPutImage,
    .getImage            = swrastGetImage,
    .putImage2           = swrastPutImage2,
+   .getImage2           = swrastGetImage2,
 };
 
 static const __DRIextension *loader_extensions[] = {
diff --git a/src/mapi/glapi/gen/ARB_draw_elements_base_vertex.xml b/src/mapi/glapi/gen/ARB_draw_elements_base_vertex.xml
index 120bda13dd8..72aa62c7751 100644
--- a/src/mapi/glapi/gen/ARB_draw_elements_base_vertex.xml
+++ b/src/mapi/glapi/gen/ARB_draw_elements_base_vertex.xml
@@ -8,7 +8,7 @@
 
 <category name="GL_ARB_draw_elements_base_vertex" number="62">
 
-    <function name="DrawElementsBaseVertex" exec="dynamic">
+    <function name="DrawElementsBaseVertex" es2="3.2" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="count" type="GLsizei"/>
         <param name="type" type="GLenum"/>
@@ -16,7 +16,7 @@
         <param name="basevertex" type="GLint"/>
     </function>
 
-    <function name="DrawRangeElementsBaseVertex" exec="dynamic">
+    <function name="DrawRangeElementsBaseVertex" es2="3.2" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="start" type="GLuint"/>
         <param name="end" type="GLuint"/>
@@ -35,7 +35,7 @@
         <param name="basevertex" type="const GLint *"/>
     </function>
 
-    <function name="DrawElementsInstancedBaseVertex" exec="dynamic">
+    <function name="DrawElementsInstancedBaseVertex" es2="3.2" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="count" type="GLsizei"/>
         <param name="type" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index cfca5a980bb..bf20e4801cc 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -817,4 +817,92 @@
     </function>
 </category>
 
+<category name="GL_EXT_draw_elements_base_vertex" number="204">
+
+    <function name="DrawElementsBaseVertexEXT" alias="DrawElementsBaseVertex"
+              es2="2.0" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="count" type="GLsizei"/>
+        <param name="type" type="GLenum"/>
+        <param name="indices" type="const GLvoid *"/>
+        <param name="basevertex" type="GLint"/>
+    </function>
+
+    <function name="DrawRangeElementsBaseVertexEXT" alias="DrawRangeElementsBaseVertex"
+              es2="3.0" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="start" type="GLuint"/>
+        <param name="end" type="GLuint"/>
+        <param name="count" type="GLsizei"/>
+        <param name="type" type="GLenum"/>
+        <param name="indices" type="const GLvoid *"/>
+        <param name="basevertex" type="GLint"/>
+    </function>
+
+    <function name="MultiDrawElementsBaseVertexEXT" alias="MultiDrawElementsBaseVertex"
+              es2="2.0" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="count" type="const GLsizei *"/>
+        <param name="type" type="GLenum"/>
+        <param name="indices" type="const GLvoid * const *"/>
+        <param name="primcount" type="GLsizei"/>
+        <param name="basevertex" type="const GLint *"/>
+    </function>
+
+    <function name="DrawElementsInstancedBaseVertexEXT" alias="DrawElementsInstancedBaseVertex"
+              es2="3.0" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="count" type="GLsizei"/>
+        <param name="type" type="GLenum"/>
+        <param name="indices" type="const GLvoid *"/>
+        <param name="primcount" type="GLsizei"/>
+        <param name="basevertex" type="GLint"/>
+    </function>
+
+</category>
+
+<category name="GL_OES_draw_elements_base_vertex" number="219">
+
+    <function name="DrawElementsBaseVertexOES" alias="DrawElementsBaseVertex"
+              es2="2.0" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="count" type="GLsizei"/>
+        <param name="type" type="GLenum"/>
+        <param name="indices" type="const GLvoid *"/>
+        <param name="basevertex" type="GLint"/>
+    </function>
+
+    <function name="DrawRangeElementsBaseVertexOES" alias="DrawRangeElementsBaseVertex"
+              es2="3.0" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="start" type="GLuint"/>
+        <param name="end" type="GLuint"/>
+        <param name="count" type="GLsizei"/>
+        <param name="type" type="GLenum"/>
+        <param name="indices" type="const GLvoid *"/>
+        <param name="basevertex" type="GLint"/>
+    </function>
+
+    <function name="MultiDrawElementsBaseVertexOES" alias="MultiDrawElementsBaseVertex"
+              es2="2.0" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="count" type="const GLsizei *"/>
+        <param name="type" type="GLenum"/>
+        <param name="indices" type="const GLvoid * const *"/>
+        <param name="primcount" type="GLsizei"/>
+        <param name="basevertex" type="const GLint *"/>
+    </function>
+
+    <function name="DrawElementsInstancedBaseVertexOES" alias="DrawElementsInstancedBaseVertex"
+              es2="3.0" exec="dynamic">
+        <param name="mode" type="GLenum"/>
+        <param name="count" type="GLsizei"/>
+        <param name="type" type="GLenum"/>
+        <param name="indices" type="const GLvoid *"/>
+        <param name="primcount" type="GLsizei"/>
+        <param name="basevertex" type="GLint"/>
+    </function>
+
+</category>
+
 </OpenGLAPI>
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index 34fb4461985..de0e330b7d1 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -345,7 +345,6 @@ TNL_FILES = \
 	tnl/tnl.h \
 	tnl/t_pipeline.c \
 	tnl/t_pipeline.h \
-	tnl/t_rasterpos.c \
 	tnl/t_vb_cliptmp.h \
 	tnl/t_vb_fog.c \
 	tnl/t_vb_light.c \
@@ -424,6 +423,8 @@ STATETRACKER_FILES = \
 	state_tracker/st_cb_clear.h \
 	state_tracker/st_cb_condrender.c \
 	state_tracker/st_cb_condrender.h \
+	state_tracker/st_cb_copyimage.c \
+	state_tracker/st_cb_copyimage.h \
 	state_tracker/st_cb_drawpixels.c \
 	state_tracker/st_cb_drawpixels.h \
 	state_tracker/st_cb_drawpixels_shader.c \
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 3d1fccb3ab4..752aaf6c006 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -33,6 +33,7 @@
 #include "main/mipmap.h"
 #include "main/queryobj.h"
 #include "main/readpix.h"
+#include "main/rastpos.h"
 #include "main/renderbuffer.h"
 #include "main/shaderobj.h"
 #include "main/texcompress.h"
@@ -81,7 +82,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
 
    /* framebuffer/image functions */
    driver->Clear = _swrast_Clear;
-   driver->RasterPos = _tnl_RasterPos;
+   driver->RasterPos = _mesa_RasterPos;
    driver->DrawPixels = _swrast_DrawPixels;
    driver->ReadPixels = _mesa_readpixels;
    driver->CopyPixels = _swrast_CopyPixels;
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index 04b3f9cc8ce..9d003e48bd8 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -59,6 +59,7 @@ TESTS = \
 	test_fs_saturate_propagation \
         test_eu_compact \
 	test_vf_float_conversions \
+	test_vec4_cmod_propagation \
         test_vec4_copy_propagation \
         test_vec4_register_coalesce
 
@@ -94,6 +95,12 @@ test_vec4_copy_propagation_LDADD = \
         $(top_builddir)/src/gtest/libgtest.la \
         $(TEST_LIBS)
 
+test_vec4_cmod_propagation_SOURCES = \
+	test_vec4_cmod_propagation.cpp
+test_vec4_cmod_propagation_LDADD = \
+	$(top_builddir)/src/gtest/libgtest.la \
+	$(TEST_LIBS)
+
 test_eu_compact_SOURCES = \
 	test_eu_compact.c
 nodist_EXTRA_test_eu_compact_SOURCES = dummy.cpp
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index ccd540dabca..ed2654ef329 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -58,6 +58,7 @@ i965_compiler_FILES = \
 	brw_util.c \
 	brw_util.h \
 	brw_vec4_builder.h \
+	brw_vec4_cmod_propagation.cpp \
 	brw_vec4_copy_propagation.cpp \
 	brw_vec4.cpp \
 	brw_vec4_cse.cpp \
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp
index 10bcd4bafd4..5d46615bc7b 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp
@@ -528,7 +528,9 @@ cfg_t::dump_domtree()
 {
    printf("digraph DominanceTree {\n");
    foreach_block(block, this) {
-      printf("\t%d -> %d\n", block->idom->num, block->num);
+      if (block->idom) {
+         printf("\t%d -> %d\n", block->idom->num, block->num);
+      }
    }
    printf("}\n");
 }
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
index a06b0aa1cd0..69e39e8964d 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.h
+++ b/src/mesa/drivers/dri/i965/brw_cfg.h
@@ -90,6 +90,8 @@ struct bblock_t {
    struct exec_list parents;
    struct exec_list children;
    int num;
+
+   unsigned cycle_count;
 };
 
 static inline struct backend_instruction *
@@ -285,6 +287,8 @@ struct cfg_t {
    int num_blocks;
 
    bool idom_dirty;
+
+   unsigned cycle_count;
 };
 
 /* Note that this is implemented with a double for loop -- break will
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index d9967143d8a..e5133ef5a3d 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -338,6 +338,7 @@ struct brw_wm_prog_data {
    } binding_table;
 
    uint8_t computed_depth_mode;
+   bool computed_stencil;
 
    bool early_fragment_tests;
    bool no_8;
@@ -443,9 +444,7 @@ struct brw_vue_map {
     * directly correspond to a gl_varying_slot, the value comes from
     * brw_varying_slot.
     *
-    * For slots that are not in use, the value is BRW_VARYING_SLOT_COUNT (this
-    * simplifies code that uses the value stored in slot_to_varying to
-    * create a bit mask).
+    * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
     */
    signed char slot_to_varying[BRW_VARYING_SLOT_COUNT];
 
@@ -467,8 +466,8 @@ static inline GLuint brw_vue_slot_to_offset(GLuint slot)
  * Convert a vertex output (brw_varying_slot) into a byte offset within the
  * VUE.
  */
-static inline GLuint brw_varying_to_offset(struct brw_vue_map *vue_map,
-                                           GLuint varying)
+static inline
+GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying)
 {
    return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 4f503ae4869..c83f47bdff7 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -501,8 +501,6 @@ struct brw_cache_item {
 };
 
 
-typedef void (*cache_aux_free_func)(const void *aux);
-
 struct brw_cache {
    struct brw_context *brw;
 
@@ -512,9 +510,6 @@ struct brw_cache {
 
    uint32_t next_offset;
    bool bo_used_by_gpu;
-
-   /** Optional functions for freeing other pointers attached to a prog_data. */
-   cache_aux_free_func aux_free[BRW_MAX_CACHE];
 };
 
 
@@ -1177,7 +1172,7 @@ struct brw_context
 
    int num_atoms[BRW_NUM_PIPELINES];
    const struct brw_tracked_state render_atoms[60];
-   const struct brw_tracked_state compute_atoms[8];
+   const struct brw_tracked_state compute_atoms[9];
 
    /* If (INTEL_DEBUG & DEBUG_BATCH) */
    struct {
@@ -1463,7 +1458,7 @@ void brw_upload_ubo_surfaces(struct brw_context *brw,
                              struct brw_stage_prog_data *prog_data,
                              bool dword_pitch);
 void brw_upload_abo_surfaces(struct brw_context *brw,
-                             struct gl_shader_program *prog,
+                             struct gl_shader *shader,
                              struct brw_stage_state *stage_state,
                              struct brw_stage_prog_data *prog_data);
 void brw_upload_image_surfaces(struct brw_context *brw,
@@ -1680,6 +1675,7 @@ struct opcode_desc {
 
 extern const struct opcode_desc opcode_descs[128];
 extern const char * const conditional_modifier[16];
+extern const char *const pred_ctrl_align16[16];
 
 void
 brw_emit_depthbuffer(struct brw_context *brw);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 169d092f90e..754da9fc3da 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -913,20 +913,15 @@ enum opcode {
 
    /**
     * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as
-    * individual sources instead of as a single payload blob:
-    *
-    * Source 0: [required] Color 0.
-    * Source 1: [optional] Color 1 (for dual source blend messages).
-    * Source 2: [optional] Src0 Alpha.
-    * Source 3: [optional] Source Depth (gl_FragDepth)
-    * Source 4: [optional (gen4-5)] Destination Depth passthrough from thread
-    * Source 5: [optional] Sample Mask (gl_SampleMask).
-    * Source 6: [required] Number of color components (as a UD immediate).
+    * individual sources instead of as a single payload blob. The
+    * position/ordering of the arguments are defined by the enum
+    * fb_write_logical_srcs.
     */
    FS_OPCODE_FB_WRITE_LOGICAL,
 
    FS_OPCODE_BLORP_FB_WRITE,
    FS_OPCODE_REP_FB_WRITE,
+   FS_OPCODE_PACK_STENCIL_REF,
    SHADER_OPCODE_RCP,
    SHADER_OPCODE_RSQ,
    SHADER_OPCODE_SQRT,
@@ -1332,6 +1327,17 @@ enum brw_urb_write_flags {
       BRW_URB_WRITE_ALLOCATE | BRW_URB_WRITE_COMPLETE,
 };
 
+enum fb_write_logical_srcs {
+   FB_WRITE_LOGICAL_SRC_COLOR0,      /* REQUIRED */
+   FB_WRITE_LOGICAL_SRC_COLOR1,      /* for dual source blend messages */
+   FB_WRITE_LOGICAL_SRC_SRC0_ALPHA,
+   FB_WRITE_LOGICAL_SRC_SRC_DEPTH,   /* gl_FragDepth */
+   FB_WRITE_LOGICAL_SRC_DST_DEPTH,   /* GEN4-5: passthrough from thread */
+   FB_WRITE_LOGICAL_SRC_SRC_STENCIL, /* gl_FragStencilRefARB */
+   FB_WRITE_LOGICAL_SRC_OMASK,       /* Sample Mask (gl_SampleMask) */
+   FB_WRITE_LOGICAL_SRC_COMPONENTS,  /* REQUIRED */
+};
+
 #ifdef __cplusplus
 /**
  * Allow brw_urb_write_flags enums to be ORed together.
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 65172490da3..6372fb5c55f 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -311,7 +311,7 @@ static const struct brw_device_info brw_device_info_chv = {
    .max_gs_threads = 336,                           \
    .max_hs_threads = 336,                           \
    .max_ds_threads = 336,                           \
-   .max_wm_threads = 64 * 6,                        \
+   .max_wm_threads = 64 * 9,                        \
    .max_cs_threads = 56,                            \
    .urb = {                                         \
       .size = 384,                                  \
@@ -335,6 +335,10 @@ static const struct brw_device_info brw_device_info_skl_gt3 = {
    GEN9_FEATURES, .gt = 3,
 };
 
+static const struct brw_device_info brw_device_info_skl_gt4 = {
+   GEN9_FEATURES, .gt = 4,
+};
+
 static const struct brw_device_info brw_device_info_bxt = {
    GEN9_FEATURES,
    .is_broxton = 1,
@@ -359,7 +363,7 @@ static const struct brw_device_info brw_device_info_bxt = {
 };
 
 const struct brw_device_info *
-brw_get_device_info(int devid, int revision)
+brw_get_device_info(int devid)
 {
    const struct brw_device_info *devinfo;
    switch (devid) {
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h
index 7bab5716b43..6f4a250e874 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.h
+++ b/src/mesa/drivers/dri/i965/brw_device_info.h
@@ -86,5 +86,5 @@ struct brw_device_info
    /** @} */
 };
 
-const struct brw_device_info *brw_get_device_info(int devid, int revision);
+const struct brw_device_info *brw_get_device_info(int devid);
 const char *brw_get_device_name(int devid);
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index db23a187a93..df747107188 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -252,7 +252,7 @@ static const char *const pred_inv[2] = {
    [1] = "-"
 };
 
-static const char *const pred_ctrl_align16[16] = {
+const char *const pred_ctrl_align16[16] = {
    [1] = "",
    [2] = ".x",
    [3] = ".y",
@@ -726,7 +726,7 @@ reg(FILE *file, unsigned _reg_file, unsigned _reg_nr)
       switch (_reg_nr & 0xf0) {
       case BRW_ARF_NULL:
          string(file, "null");
-         return -1;
+         break;
       case BRW_ARF_ADDRESS:
          format(file, "a%d", _reg_nr & 0x0f);
          break;
@@ -908,7 +908,6 @@ src_ia1(FILE *file,
         unsigned _addr_subreg_nr,
         unsigned _negate,
         unsigned __abs,
-        unsigned _addr_mode,
         unsigned _horiz_stride, unsigned _width, unsigned _vert_stride)
 {
    int err = 0;
@@ -1143,7 +1142,6 @@ src0(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst)
                         brw_inst_src0_ia_subreg_nr(devinfo, inst),
                         brw_inst_src0_negate(devinfo, inst),
                         brw_inst_src0_abs(devinfo, inst),
-                        brw_inst_src0_address_mode(devinfo, inst),
                         brw_inst_src0_hstride(devinfo, inst),
                         brw_inst_src0_width(devinfo, inst),
                         brw_inst_src0_vstride(devinfo, inst));
@@ -1200,7 +1198,6 @@ src1(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst)
                         brw_inst_src1_ia_subreg_nr(devinfo, inst),
                         brw_inst_src1_negate(devinfo, inst),
                         brw_inst_src1_abs(devinfo, inst),
-                        brw_inst_src1_address_mode(devinfo, inst),
                         brw_inst_src1_hstride(devinfo, inst),
                         brw_inst_src1_width(devinfo, inst),
                         brw_inst_src1_vstride(devinfo, inst));
diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c
index 1f4a3516fa2..40ec87d38f0 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.c
+++ b/src/mesa/drivers/dri/i965/brw_eu.c
@@ -261,7 +261,7 @@ void
 brw_disassemble(const struct brw_device_info *devinfo,
                 void *assembly, int start, int end, FILE *out)
 {
-   bool dump_hex = false;
+   bool dump_hex = (INTEL_DEBUG & DEBUG_HEX) != 0;
 
    for (int offset = start; offset < end;) {
       brw_inst *insn = assembly + offset;
diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c
index f787ea3d4f8..07ace6bfbcb 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_compact.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c
@@ -1407,6 +1407,9 @@ void
 brw_compact_instructions(struct brw_codegen *p, int start_offset,
                          int num_annotations, struct annotation *annotation)
 {
+   if (unlikely(INTEL_DEBUG & DEBUG_NO_COMPACTION))
+      return;
+
    const struct brw_device_info *devinfo = p->devinfo;
    void *store = p->store + start_offset / 16;
    /* For an instruction at byte offset 16*i before compaction, this is the
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index bf2fee9ed48..a6fbb542919 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -410,7 +410,7 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
             brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset);
 	 } else {
-            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.dw1.bits.indirect_offset);
+            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset);
 	 }
       }
 
@@ -2511,12 +2511,20 @@ brw_send_indirect_message(struct brw_codegen *p,
                           struct brw_reg desc)
 {
    const struct brw_device_info *devinfo = p->devinfo;
-   struct brw_inst *send, *setup;
+   struct brw_inst *send;
+   int setup;
 
    assert(desc.type == BRW_REGISTER_TYPE_UD);
 
+   /* We hold on to the setup instruction (the SEND in the direct case, the OR
+    * in the indirect case) by its index in the instruction store.  The
+    * pointer returned by next_insn() may become invalid if emitting the SEND
+    * in the indirect case reallocs the store.
+    */
+
    if (desc.file == BRW_IMMEDIATE_VALUE) {
-      setup = send = next_insn(p, BRW_OPCODE_SEND);
+      setup = p->nr_insn;
+      send = next_insn(p, BRW_OPCODE_SEND);
       brw_set_src1(p, send, desc);
 
    } else {
@@ -2531,7 +2539,8 @@ brw_send_indirect_message(struct brw_codegen *p,
        * caller can specify additional descriptor bits with the usual
        * brw_set_*_message() helper functions.
        */
-      setup = brw_OR(p, addr, desc, brw_imm_ud(0));
+      setup = p->nr_insn;
+      brw_OR(p, addr, desc, brw_imm_ud(0));
 
       brw_pop_insn_state(p);
 
@@ -2543,7 +2552,7 @@ brw_send_indirect_message(struct brw_codegen *p,
    brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
    brw_inst_set_sfid(devinfo, send, sfid);
 
-   return setup;
+   return &p->store[setup];
 }
 
 static struct brw_inst *
@@ -2906,11 +2915,10 @@ brw_untyped_surface_read(struct brw_codegen *p,
    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
                           GEN7_SFID_DATAPORT_DATA_CACHE);
-   const bool align1 = (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1);
    struct brw_inst *insn = brw_send_indirect_surface_message(
       p, sfid, dst, payload, surface, msg_length,
       brw_surface_payload_size(p, num_channels, true, true),
-      align1);
+      false);
 
    brw_set_dp_untyped_surface_read_message(
       p, insn, num_channels);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8320cd77299..e218a85a363 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -88,8 +88,6 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
    case IMM:
    case UNIFORM:
       unreachable("Invalid destination register file");
-   default:
-      unreachable("Invalid register file");
    }
 
    this->writes_accumulator = false;
@@ -538,18 +536,6 @@ fs_visitor::get_timestamp(const fs_builder &bld)
     */
    bld.group(4, 0).exec_all().MOV(dst, ts);
 
-   /* The caller wants the low 32 bits of the timestamp.  Since it's running
-    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
-    * which is plenty of time for our purposes.  It is identical across the
-    * EUs, but since it's tracking GPU core speed it will increment at a
-    * varying rate as render P-states change.
-    *
-    * The caller could also check if render P-states have changed (or anything
-    * else that might disrupt timing) by setting smear to 2 and checking if
-    * that field is != 0.
-    */
-   dst.set_smear(0);
-
    return dst;
 }
 
@@ -557,6 +543,14 @@ void
 fs_visitor::emit_shader_time_begin()
 {
    shader_start_time = get_timestamp(bld.annotate("shader time start"));
+
+   /* We want only the low 32 bits of the timestamp.  Since it's running
+    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
+    * which is plenty of time for our purposes.  It is identical across the
+    * EUs, but since it's tracking GPU core speed it will increment at a
+    * varying rate as render P-states change.
+    */
+   shader_start_time.set_smear(0);
 }
 
 void
@@ -570,6 +564,15 @@ fs_visitor::emit_shader_time_end()
 
    fs_reg shader_end_time = get_timestamp(ibld);
 
+   /* We only use the low 32 bits of the timestamp - see
+    * emit_shader_time_begin()).
+    *
+    * We could also check if render P-states have changed (or anything
+    * else that might disrupt timing) by setting smear to 2 and checking if
+    * that field is != 0.
+    */
+   shader_end_time.set_smear(0);
+
    /* Check that there weren't any timestamp reset events (assuming these
     * were the only two timestamp reads that happened).
     */
@@ -700,10 +703,10 @@ fs_inst::components_read(unsigned i) const
       return 2;
 
    case FS_OPCODE_FB_WRITE_LOGICAL:
-      assert(src[6].file == IMM);
+      assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
       /* First/second FB write color. */
       if (i < 2)
-         return src[6].fixed_hw_reg.dw1.ud;
+         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud;
       else
          return 1;
 
@@ -841,9 +844,8 @@ fs_inst::regs_read(int arg) const
                           REG_SIZE);
    case MRF:
       unreachable("MRF registers are not allowed as sources");
-   default:
-      unreachable("Invalid register file");
    }
+   return 0;
 }
 
 bool
@@ -1283,9 +1285,9 @@ fs_visitor::emit_sampleid_setup()
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
 
    if (key->compute_sample_id) {
-      fs_reg t1 = vgrf(glsl_type::int_type);
-      fs_reg t2 = vgrf(glsl_type::int_type);
-      t2.type = BRW_REGISTER_TYPE_UW;
+      fs_reg t1(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
+      t1.set_smear(0);
+      fs_reg t2(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
 
       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
        * 8x multisampling, subspan 0 will represent sample N (where N
@@ -1306,13 +1308,13 @@ fs_visitor::emit_sampleid_setup()
        * are sample 1 of subspan 0; the third group is sample 0 of
        * subspan 1, and finally sample 1 of subspan 1.
        */
-      abld.exec_all()
-          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+      abld.exec_all().group(1, 0)
+          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
                fs_reg(0xc0));
-      abld.exec_all().SHR(t1, t1, fs_reg(5));
+      abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5));
 
       /* This works for both SIMD8 and SIMD16 */
-      abld.exec_all()
+      abld.exec_all().group(4, 0)
           .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
 
       /* This special instruction takes care of setting vstride=1,
@@ -1443,6 +1445,9 @@ fs_visitor::calculate_urb_setup()
             }
          }
       } else {
+         bool include_vue_header =
+            nir->info.inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+
          /* We have enough input varyings that the SF/SBE pipeline stage can't
           * arbitrarily rearrange them to suit our whim; we have to put them
           * in an order that matches the output of the previous pipeline stage
@@ -1452,15 +1457,14 @@ fs_visitor::calculate_urb_setup()
          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                              key->input_slots_valid,
                              nir->info.separate_shader);
-         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
+         int first_slot =
+            include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
+
          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
               slot++) {
             int varying = prev_stage_vue_map.slot_to_varying[slot];
-            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
-             * unused.
-             */
-            if (varying != BRW_VARYING_SLOT_COUNT &&
+            if (varying != BRW_VARYING_SLOT_PAD &&
                 (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                  BITFIELD64_BIT(varying))) {
                prog_data->urb_setup[varying] = slot - first_slot;
@@ -2615,7 +2619,7 @@ fs_visitor::eliminate_find_live_channel()
       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
          if (depth == 0) {
             inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0] = fs_reg(0);
+            inst->src[0] = fs_reg(0u);
             inst->sources = 1;
             inst->force_writemask_all = true;
             progress = true;
@@ -2643,8 +2647,9 @@ fs_visitor::emit_repclear_shader()
    fs_inst *mov;
 
    if (uniforms == 1) {
-      mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
-                               fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+      mov = bld.exec_all().group(4, 0)
+               .MOV(brw_message_reg(color_mrf),
+                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
    } else {
       struct brw_reg reg =
          brw_reg(BRW_GENERAL_REGISTER_FILE,
@@ -2653,8 +2658,8 @@ fs_visitor::emit_repclear_shader()
                  BRW_WIDTH_2,
                  BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 
-      mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
-                               fs_reg(reg));
+      mov = bld.exec_all().group(4, 0)
+               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
    }
 
    fs_inst *write;
@@ -3366,15 +3371,17 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                             const brw_wm_prog_key *key,
                             const fs_visitor::thread_payload &payload)
 {
-   assert(inst->src[6].file == IMM);
+   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
    const brw_device_info *devinfo = bld.shader->devinfo;
-   const fs_reg &color0 = inst->src[0];
-   const fs_reg &color1 = inst->src[1];
-   const fs_reg &src0_alpha = inst->src[2];
-   const fs_reg &src_depth = inst->src[3];
-   const fs_reg &dst_depth = inst->src[4];
-   fs_reg sample_mask = inst->src[5];
-   const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;
+   const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
+   const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
+   const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
+   const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
+   const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
+   const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
+   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
+   const unsigned components =
+      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud;
 
    /* We can potentially have a message length of up to 15, so we have to set
     * base_mrf to either 0 or 1 in order to fit in m0..m15.
@@ -3464,6 +3471,17 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
       length++;
    }
 
+   if (src_stencil.file != BAD_FILE) {
+      assert(devinfo->gen >= 9);
+      assert(bld.dispatch_width() != 16);
+
+      sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.exec_all().annotate("FB write OS")
+         .emit(FS_OPCODE_PACK_STENCIL_REF, sources[length],
+               retype(src_stencil, BRW_REGISTER_TYPE_UB));
+      length++;
+   }
+
    fs_inst *load;
    if (devinfo->gen >= 7) {
       /* Send from the GRF */
@@ -4073,7 +4091,7 @@ fs_visitor::lower_logical_sends()
       case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
          lower_surface_logical_send(ibld, inst,
                                     SHADER_OPCODE_UNTYPED_SURFACE_READ,
-                                    fs_reg(0xffff));
+                                    fs_reg());
          break;
 
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
@@ -4202,10 +4220,12 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
       /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
        * here.
        */
-      assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE ||
+      assert(devinfo->gen != 6 ||
+             inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
              inst->exec_size == 8);
       /* Dual-source FB writes are unsupported in SIMD16 mode. */
-      return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
+      return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
+              8 : inst->exec_size);
 
    case SHADER_OPCODE_TXD_LOGICAL:
       /* TXD is unsupported in SIMD16 mode. */
@@ -4499,9 +4519,8 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       if (inst->dst.fixed_hw_reg.subnr)
          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
       break;
-   default:
-      fprintf(file, "???");
-      break;
+   case IMM:
+      unreachable("not reached");
    }
    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
 
@@ -4594,9 +4613,6 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          if (inst->src[i].fixed_hw_reg.abs)
             fprintf(file, "|");
          break;
-      default:
-         fprintf(file, "???");
-         break;
       }
       if (inst->src[i].abs)
          fprintf(file, "|");
@@ -4977,8 +4993,7 @@ fs_visitor::allocate_registers()
    if (failed)
       return;
 
-   if (!allocated_without_spills)
-      schedule_instructions(SCHEDULE_POST);
+   schedule_instructions(SCHEDULE_POST);
 
    if (last_scratch > 0)
       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
@@ -5236,6 +5251,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
    prog_data->uses_omask =
       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
    prog_data->computed_depth_mode = computed_depth_mode(shader);
+   prog_data->computed_stencil =
+      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
 
    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 50e98becf03..8058b344b7a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -145,6 +145,8 @@ public:
    void assign_vs_urb_setup();
    bool assign_regs(bool allow_spilling);
    void assign_regs_trivial();
+   void calculate_payload_ranges(int payload_node_count,
+                                 int *payload_last_use_ip);
    void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
                                    int first_payload_node);
    int choose_spill_reg(struct ra_graph *g);
@@ -337,6 +339,7 @@ public:
    int *push_constant_loc;
 
    fs_reg frag_depth;
+   fs_reg frag_stencil;
    fs_reg sample_mask;
    fs_reg outputs[VARYING_SLOT_MAX];
    unsigned output_components[VARYING_SLOT_MAX];
@@ -427,6 +430,8 @@ private:
    void generate_urb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg payload);
    void generate_urb_write(fs_inst *inst, struct brw_reg payload);
    void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
+   void generate_stencil_ref_packing(fs_inst *inst, struct brw_reg dst,
+                                     struct brw_reg src);
    void generate_barrier(fs_inst *inst, struct brw_reg src);
    void generate_blorp_fb_write(fs_inst *inst);
    void generate_linterp(fs_inst *inst, struct brw_reg dst,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index df10a9de293..f121f3463d3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -390,14 +390,21 @@ namespace brw {
       src_reg
       emit_uniformize(const src_reg &src) const
       {
+         /* FIXME: We use a vector chan_index and dst to allow constant and
+          * copy propagration to move result all the way into the consuming
+          * instruction (typically a surface index or sampler index for a
+          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
+          * dispatch. Once we teach const/copy propagation about scalars we
+          * should go back to scalar destinations here.
+          */
          const fs_builder ubld = exec_all();
-         const dst_reg chan_index = component(vgrf(BRW_REGISTER_TYPE_UD), 0);
-         const dst_reg dst = component(vgrf(src.type), 0);
+         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
+         const dst_reg dst = vgrf(src.type);
 
          ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
-         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index);
+         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
 
-         return src_reg(dst);
+         return src_reg(component(dst, 0));
       }
 
       /**
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 5589716239a..26204827156 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -416,9 +416,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
          inst->src[arg].subreg_offset = offset % 32;
       }
       break;
-   default:
-      unreachable("Invalid register file");
-      break;
+
+   case MRF:
+   case IMM:
+      unreachable("not reached");
    }
 
    if (has_source_modifiers) {
@@ -612,6 +613,21 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
          }
          break;
 
+      case SHADER_OPCODE_UNTYPED_ATOMIC:
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+      case SHADER_OPCODE_TYPED_ATOMIC:
+      case SHADER_OPCODE_TYPED_SURFACE_READ:
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+         /* We only propagate into the surface argument of the
+          * instruction. Everything else goes through LOAD_PAYLOAD.
+          */
+         if (i == 1) {
+            inst->src[i] = val;
+            progress = true;
+         }
+         break;
+
       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
       case SHADER_OPCODE_BROADCAST:
          inst->src[i] = val;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index c7628dcc2f4..3a28c8d591d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -93,7 +93,8 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
    case SHADER_OPCODE_LOAD_PAYLOAD:
       return !inst->is_copy_payload(v->alloc);
    default:
-      return inst->is_send_from_grf() && !inst->has_side_effects();
+      return inst->is_send_from_grf() && !inst->has_side_effects() &&
+         !inst->is_volatile();
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index bb7e792044f..e207a77fdc1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -42,9 +42,13 @@ static uint32_t brw_file_from_reg(fs_reg *reg)
       return BRW_MESSAGE_REGISTER_FILE;
    case IMM:
       return BRW_IMMEDIATE_VALUE;
-   default:
+   case BAD_FILE:
+   case HW_REG:
+   case ATTR:
+   case UNIFORM:
       unreachable("not reached");
    }
+   return 0;
 }
 
 static struct brw_reg
@@ -116,7 +120,8 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen)
       /* Probably unused. */
       brw_reg = brw_null_reg();
       break;
-   default:
+   case ATTR:
+   case UNIFORM:
       unreachable("not reached");
    }
    if (reg->abs)
@@ -317,6 +322,14 @@ fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
 		    brw_imm_ud(inst->target));
 	 }
 
+         /* Set computes stencil to render target */
+         if (prog_data->computed_stencil) {
+            brw_OR(p,
+                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
+                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+                   brw_imm_ud(0x1 << 14));
+         }
+
 	 implied_header = brw_null_reg();
       } else {
 	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
@@ -437,6 +450,47 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
 }
 
 void
+fs_generator::generate_stencil_ref_packing(fs_inst *inst,
+                                           struct brw_reg dst,
+                                           struct brw_reg src)
+{
+   assert(dispatch_width == 8);
+   assert(devinfo->gen >= 9);
+
+   /* Stencil value updates are provided in 8 slots of 1 byte per slot.
+    * Presumably, in order to save memory bandwidth, the stencil reference
+    * values written from the FS need to be packed into 2 dwords (this makes
+    * sense because the stencil values are limited to 1 byte each and a SIMD8
+    * send, so stencil slots 0-3 in dw0, and 4-7 in dw1.)
+    *
+    * The spec is confusing here because in the payload definition of MDP_RTW_S8
+    * (Message Data Payload for Render Target Writes with Stencil 8b) the
+    * stencil value seems to be dw4.0-dw4.7. However, if you look at the type of
+    * dw4 it is type MDPR_STENCIL (Message Data Payload Register) which is the
+    * packed values specified above and diagrammed below:
+    *
+    *     31                             0
+    *     --------------------------------
+    * DW  |                              |
+    * 2-7 |            IGNORED           |
+    *     |                              |
+    *     --------------------------------
+    * DW1 | STC   | STC   | STC   | STC  |
+    *     | slot7 | slot6 | slot5 | slot4|
+    *     --------------------------------
+    * DW0 | STC   | STC   | STC   | STC  |
+    *     | slot3 | slot2 | slot1 | slot0|
+    *     --------------------------------
+    */
+
+   src.vstride = BRW_VERTICAL_STRIDE_4;
+   src.width = BRW_WIDTH_1;
+   src.hstride = BRW_HORIZONTAL_STRIDE_0;
+   assert(src.type == BRW_REGISTER_TYPE_UB);
+   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UB), src);
+}
+
+void
 fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
 {
    brw_barrier(p, src);
@@ -1455,18 +1509,18 @@ fs_generator::generate_set_sample_id(fs_inst *inst,
    assert(src0.type == BRW_REGISTER_TYPE_D ||
           src0.type == BRW_REGISTER_TYPE_UD);
 
-   brw_push_insn_state(p);
-   brw_set_default_exec_size(p, BRW_EXECUTE_8);
-   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-   struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
-   if (dispatch_width == 8) {
+   struct brw_reg reg = stride(src1, 1, 4, 0);
+   if (devinfo->gen >= 8 || dispatch_width == 8) {
       brw_ADD(p, dst, src0, reg);
    } else if (dispatch_width == 16) {
+      brw_push_insn_state(p);
+      brw_set_default_exec_size(p, BRW_EXECUTE_8);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
       brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
       brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
+      brw_pop_insn_state(p);
    }
-   brw_pop_insn_state(p);
 }
 
 void
@@ -2182,6 +2236,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 	 generate_barrier(inst, src[0]);
 	 break;
 
+      case FS_OPCODE_PACK_STENCIL_REF:
+         generate_stencil_ref_packing(inst, dst, src[0]);
+         break;
+
       default:
          unreachable("Unsupported opcode");
 
@@ -2216,9 +2274,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 
    if (unlikely(debug_flag)) {
       fprintf(stderr, "Native code for %s\n"
-              "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
+              "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
               " bytes (%.0f%%)\n",
-              shader_name, dispatch_width, before_size / 16, loop_count,
+              shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
               spill_count, fill_count, promoted_constants, before_size, after_size,
               100.0f * (before_size - after_size) / before_size);
 
@@ -2228,12 +2286,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
    }
 
    compiler->shader_debug_log(log_data,
-                              "%s SIMD%d shader: %d inst, %d loops, "
+                              "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
                               "%d:%d spills:fills, Promoted %u constants, "
                               "compacted %d to %d bytes.\n",
                               stage_abbrev, dispatch_width, before_size / 16,
-                              loop_count, spill_count, fill_count,
-                              promoted_constants, before_size, after_size);
+                              loop_count, cfg->cycle_count, spill_count,
+                              fill_count, promoted_constants, before_size,
+                              after_size);
 
    return start_offset;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 7b5a0482519..486741bea31 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -71,6 +71,14 @@ fs_visitor::nir_setup_inputs()
                                              var->data.origin_upper_left);
          emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
                                    input, reg), 0xF);
+      } else if (var->data.location == VARYING_SLOT_LAYER) {
+         struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER, 1), 3);
+         reg.type = BRW_REGISTER_TYPE_D;
+         bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg);
+      } else if (var->data.location == VARYING_SLOT_VIEWPORT) {
+         struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT, 2), 3);
+         reg.type = BRW_REGISTER_TYPE_D;
+         bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg);
       } else {
          emit_general_interpolation(input, var->name, var->type,
                                     (glsl_interp_qualifier) var->data.interpolation,
@@ -114,6 +122,8 @@ fs_visitor::nir_setup_outputs()
             }
          } else if (var->data.location == FRAG_RESULT_DEPTH) {
             this->frag_depth = reg;
+         } else if (var->data.location == FRAG_RESULT_STENCIL) {
+            this->frag_stencil = reg;
          } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
             this->sample_mask = reg;
          } else {
@@ -896,12 +906,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
        * subtract the result from 31 to convert the MSB count into an LSB count.
        */
-
       bld.CMP(bld.null_reg_d(), result, fs_reg(-1), BRW_CONDITIONAL_NZ);
-      fs_reg neg_result(result);
-      neg_result.negate = true;
-      inst = bld.ADD(result, neg_result, fs_reg(31));
+
+      inst = bld.ADD(result, result, fs_reg(31));
       inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->src[0].negate = true;
       break;
    }
 
@@ -1322,6 +1331,15 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
+   case nir_intrinsic_shader_clock: {
+      /* We cannot do anything if there is an event, so ignore it for now */
+      fs_reg shader_clock = get_timestamp(bld);
+      const fs_reg srcs[] = { shader_clock.set_smear(0), shader_clock.set_smear(1) };
+
+      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
+      break;
+   }
+
    case nir_intrinsic_image_size: {
       /* Get the referenced image variable and type. */
       const nir_variable *var = instr->variables[0]->var;
@@ -1509,7 +1527,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          surf_index = vgrf(glsl_type::uint_type);
          bld.ADD(surf_index, get_nir_src(instr->src[0]),
                  fs_reg(stage_prog_data->binding_table.ssbo_start));
-         surf_index = bld.emit_uniformize(surf_index);
 
          /* Assume this may touch any UBO. It would be nice to provide
           * a tighter bound, but the array information is already lowered away.
@@ -1520,34 +1537,21 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       }
 
       /* Get the offset to read from */
-      fs_reg offset_reg = vgrf(glsl_type::uint_type);
-      unsigned const_offset_bytes = 0;
+      fs_reg offset_reg;
       if (has_indirect) {
-         bld.MOV(offset_reg, get_nir_src(instr->src[1]));
+         offset_reg = get_nir_src(instr->src[1]);
       } else {
-         const_offset_bytes = instr->const_index[0];
-         bld.MOV(offset_reg, fs_reg(const_offset_bytes));
+         offset_reg = fs_reg(instr->const_index[0]);
       }
 
       /* Read the vector */
-      for (int i = 0; i < instr->num_components; i++) {
-         fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
-                                                1 /* dims */, 1 /* size */,
-                                                BRW_PREDICATE_NONE);
-         read_result.type = dest.type;
-         bld.MOV(dest, read_result);
-         dest = offset(dest, bld, 1);
-
-         /* Vector components are stored contiguous in memory */
-         if (i < instr->num_components) {
-            if (!has_indirect) {
-               const_offset_bytes += 4;
-               bld.MOV(offset_reg, fs_reg(const_offset_bytes));
-            } else {
-               bld.ADD(offset_reg, offset_reg, brw_imm_ud(4));
-            }
-         }
-      }
+      fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
+                                             1 /* dims */,
+                                             instr->num_components,
+                                             BRW_PREDICATE_NONE);
+      read_result.type = dest.type;
+      for (int i = 0; i < instr->num_components; i++)
+         bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
 
       break;
    }
@@ -1765,52 +1769,46 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          surf_index = vgrf(glsl_type::uint_type);
          bld.ADD(surf_index, get_nir_src(instr->src[1]),
                   fs_reg(stage_prog_data->binding_table.ssbo_start));
-         surf_index = bld.emit_uniformize(surf_index);
 
          brw_mark_surface_used(prog_data,
                                stage_prog_data->binding_table.ssbo_start +
                                nir->info.num_ssbos - 1);
       }
 
-      /* Offset */
-      fs_reg offset_reg = vgrf(glsl_type::uint_type);
-      unsigned const_offset_bytes = 0;
-      if (has_indirect) {
-         bld.MOV(offset_reg, get_nir_src(instr->src[2]));
-      } else {
-         const_offset_bytes = instr->const_index[0];
-         bld.MOV(offset_reg, fs_reg(const_offset_bytes));
-      }
-
       /* Value */
       fs_reg val_reg = get_nir_src(instr->src[0]);
 
       /* Writemask */
       unsigned writemask = instr->const_index[1];
 
-      /* Write each component present in the writemask */
-      unsigned skipped_channels = 0;
-      for (int i = 0; i < instr->num_components; i++) {
-         int component_mask = 1 << i;
-         if (writemask & component_mask) {
-            if (skipped_channels) {
-               if (!has_indirect) {
-                  const_offset_bytes += 4 * skipped_channels;
-                  bld.MOV(offset_reg, fs_reg(const_offset_bytes));
-               } else {
-                  bld.ADD(offset_reg, offset_reg,
-                           brw_imm_ud(4 * skipped_channels));
-               }
-               skipped_channels = 0;
-            }
+      /* Combine groups of consecutive enabled channels in one write
+       * message. We use ffs to find the first enabled channel and then ffs on
+       * the bit-inverse, down-shifted writemask to determine the length of
+       * the block of enabled bits.
+       */
+      while (writemask) {
+         unsigned first_component = ffs(writemask) - 1;
+         unsigned length = ffs(~(writemask >> first_component)) - 1;
+         fs_reg offset_reg;
 
-            emit_untyped_write(bld, surf_index, offset_reg,
-                               offset(val_reg, bld, i),
-                               1 /* dims */, 1 /* size */,
-                               BRW_PREDICATE_NONE);
+         if (!has_indirect) {
+            offset_reg = fs_reg(instr->const_index[0] + 4 * first_component);
+         } else {
+            offset_reg = vgrf(glsl_type::uint_type);
+            bld.ADD(offset_reg,
+                    retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
+                    fs_reg(4 * first_component));
          }
 
-         skipped_channels++;
+         emit_untyped_write(bld, surf_index, offset_reg,
+                            offset(val_reg, bld, first_component),
+                            1 /* dims */, length,
+                            BRW_PREDICATE_NONE);
+
+         /* Clear the bits in the writemask that we just wrote, then try
+          * again to see if more channels are left.
+          */
+         writemask &= (15 << (first_component + length));
       }
       break;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 36388fad98d..9251d9552a5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -330,32 +330,12 @@ count_to_loop_end(const bblock_t *block)
    unreachable("not reached");
 }
 
-/**
- * Sets up interference between thread payload registers and the virtual GRFs
- * to be allocated for program temporaries.
- *
- * We want to be able to reallocate the payload for our virtual GRFs, notably
- * because the setup coefficients for a full set of 16 FS inputs takes up 8 of
- * our 128 registers.
- *
- * The layout of the payload registers is:
- *
- * 0..payload.num_regs-1: fixed function setup (including bary coordinates).
- * payload.num_regs..payload.num_regs+curb_read_lengh-1: uniform data
- * payload.num_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients.
- *
- * And we have payload_node_count nodes covering these registers in order
- * (note that in SIMD16, a node is two registers).
- */
-void
-fs_visitor::setup_payload_interference(struct ra_graph *g,
-                                       int payload_node_count,
-                                       int first_payload_node)
+void fs_visitor::calculate_payload_ranges(int payload_node_count,
+                                          int *payload_last_use_ip)
 {
    int loop_depth = 0;
    int loop_end_ip = 0;
 
-   int payload_last_use_ip[payload_node_count];
    for (int i = 0; i < payload_node_count; i++)
       payload_last_use_ip[i] = -1;
 
@@ -426,6 +406,33 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
 
       ip++;
    }
+}
+
+
+/**
+ * Sets up interference between thread payload registers and the virtual GRFs
+ * to be allocated for program temporaries.
+ *
+ * We want to be able to reallocate the payload for our virtual GRFs, notably
+ * because the setup coefficients for a full set of 16 FS inputs takes up 8 of
+ * our 128 registers.
+ *
+ * The layout of the payload registers is:
+ *
+ * 0..payload.num_regs-1: fixed function setup (including bary coordinates).
+ * payload.num_regs..payload.num_regs+curb_read_lengh-1: uniform data
+ * payload.num_regs+curb_read_lengh..first_non_payload_grf-1: setup coefficients.
+ *
+ * And we have payload_node_count nodes covering these registers in order
+ * (note that in SIMD16, a node is two registers).
+ */
+void
+fs_visitor::setup_payload_interference(struct ra_graph *g,
+                                       int payload_node_count,
+                                       int first_payload_node)
+{
+   int payload_last_use_ip[payload_node_count];
+   calculate_payload_ranges(payload_node_count, payload_last_use_ip);
 
    for (int i = 0; i < payload_node_count; i++) {
       if (payload_last_use_ip[i] == -1)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 7cc4f3c927a..5c57944ca39 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -697,7 +697,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
    const fs_reg dst_depth = (payload.dest_depth_reg ?
                              fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)) :
                              fs_reg());
-   fs_reg src_depth;
+   fs_reg src_depth, src_stencil;
 
    if (source_depth_to_render_target) {
       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
@@ -706,10 +706,14 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
          src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
    }
 
+   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
+      src_stencil = frag_stencil;
+
    const fs_reg sources[] = {
-      color0, color1, src0_alpha, src_depth, dst_depth, sample_mask,
-      fs_reg(components)
+      color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
+      sample_mask, fs_reg(components)
    };
+   assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
    fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
                              sources, ARRAY_SIZE(sources));
 
@@ -740,6 +744,16 @@ fs_visitor::emit_fb_writes()
       no16("Missing support for simd16 depth writes on gen6\n");
    }
 
+   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
+      /* From the 'Render Target Write message' section of the docs:
+       * "Output Stencil is not supported with SIMD16 Render Target Write
+       * Messages."
+       *
+       * FINISHME: split 16 into 2 8s
+       */
+      no16("FINISHME: support 2 simd8 writes for gl_FragStencilRefARB\n");
+   }
+
    if (do_dual_src) {
       const fs_builder abld = bld.annotate("FB dual-source write");
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
index 00125c0f405..76ed237d88a 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
@@ -105,8 +105,8 @@ brw_upload_gs_abo_surfaces(struct brw_context *brw)
 
    if (prog) {
       /* BRW_NEW_GS_PROG_DATA */
-      brw_upload_abo_surfaces(brw, prog, &brw->gs.base,
-                              &brw->gs.prog_data->base.base);
+      brw_upload_abo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY],
+                              &brw->gs.base, &brw->gs.prog_data->base.base);
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index 7726e4b78a0..4417555f18e 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -97,7 +97,9 @@ byte_offset(fs_reg reg, unsigned delta)
    case MRF:
       reg.reg += delta / 32;
       break;
-   default:
+   case IMM:
+   case HW_REG:
+   case UNIFORM:
       assert(delta == 0);
    }
    reg.subreg_offset += delta % 32;
@@ -119,7 +121,7 @@ horiz_offset(fs_reg reg, unsigned delta)
    case MRF:
    case ATTR:
       return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
-   default:
+   case HW_REG:
       assert(delta == 0);
    }
    return reg;
@@ -163,7 +165,6 @@ half(fs_reg reg, unsigned idx)
 
    case ATTR:
    case HW_REG:
-   default:
       unreachable("Cannot take half of this register type");
    }
    return reg;
diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
index 1b57b65db27..29642c6d2a4 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
@@ -161,9 +161,6 @@ public:
                     const src_reg &src1 = src_reg(),
                     const src_reg &src2 = src_reg());
 
-   struct brw_reg get_dst(unsigned gen);
-   struct brw_reg get_src(const struct brw_vue_prog_data *prog_data, int i);
-
    dst_reg dst;
    src_reg src[3];
 
@@ -186,6 +183,27 @@ public:
       return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2;
    }
 
+   bool reads_flag(unsigned c)
+   {
+      if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
+         return true;
+
+      switch (predicate) {
+      case BRW_PREDICATE_NONE:
+         return false;
+      case BRW_PREDICATE_ALIGN16_REPLICATE_X:
+         return c == 0;
+      case BRW_PREDICATE_ALIGN16_REPLICATE_Y:
+         return c == 1;
+      case BRW_PREDICATE_ALIGN16_REPLICATE_Z:
+         return c == 2;
+      case BRW_PREDICATE_ALIGN16_REPLICATE_W:
+         return c == 3;
+      default:
+         return true;
+      }
+   }
+
    bool writes_flag()
    {
       return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 9a33188cb5c..8c1a34ee17a 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -205,6 +205,9 @@ brw_create_nir(struct brw_context *brw,
    if (shader_prog) {
       nir_lower_samplers(nir, shader_prog);
       nir_validate_shader(nir);
+
+      nir_lower_atomics(nir, shader_prog);
+      nir_validate_shader(nir);
    }
 
    brw_postprocess_nir(nir, brw->intelScreen->devinfo, is_scalar);
@@ -278,9 +281,6 @@ brw_postprocess_nir(nir_shader *nir,
    nir_lower_system_values(nir);
    nir_validate_shader(nir);
 
-   nir_lower_atomics(nir);
-   nir_validate_shader(nir);
-
    nir_optimize(nir, is_scalar);
 
    if (devinfo->gen >= 6) {
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index 87e7e011541..083c46a3726 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -205,7 +205,7 @@ enum PACKED brw_reg_type {
    /** @} */
 
    /** Immediates only: @{ */
-   BRW_REGISTER_TYPE_UV,
+   BRW_REGISTER_TYPE_UV, /* Gen6+ */
    BRW_REGISTER_TYPE_V,
    BRW_REGISTER_TYPE_VF,
    /** @} */
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index b710c60148c..88c45f74333 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -26,6 +26,7 @@
  */
 
 #include "brw_fs.h"
+#include "brw_fs_live_variables.h"
 #include "brw_vec4.h"
 #include "brw_cfg.h"
 #include "brw_shader.h"
@@ -400,22 +401,49 @@ schedule_node::set_latency_gen7(bool is_haswell)
 class instruction_scheduler {
 public:
    instruction_scheduler(backend_shader *s, int grf_count,
+                         int hw_reg_count, int block_count,
                          instruction_scheduler_mode mode)
    {
       this->bs = s;
       this->mem_ctx = ralloc_context(NULL);
       this->grf_count = grf_count;
+      this->hw_reg_count = hw_reg_count;
       this->instructions.make_empty();
       this->instructions_to_schedule = 0;
       this->post_reg_alloc = (mode == SCHEDULE_POST);
       this->mode = mode;
       this->time = 0;
       if (!post_reg_alloc) {
-         this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
-         this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
+         this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);
+
+         this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+         for (int i = 0; i < block_count; i++)
+            this->livein[i] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                            BITSET_WORDS(grf_count));
+
+         this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+         for (int i = 0; i < block_count; i++)
+            this->liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                             BITSET_WORDS(grf_count));
+
+         this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+         for (int i = 0; i < block_count; i++)
+            this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                                BITSET_WORDS(hw_reg_count));
+
+         this->written = rzalloc_array(mem_ctx, bool, grf_count);
+
+         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);
+
+         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
       } else {
-         this->remaining_grf_uses = NULL;
-         this->grf_active = NULL;
+         this->reg_pressure_in = NULL;
+         this->livein = NULL;
+         this->liveout = NULL;
+         this->hw_liveout = NULL;
+         this->written = NULL;
+         this->reads_remaining = NULL;
+         this->hw_reads_remaining = NULL;
       }
    }
 
@@ -442,7 +470,8 @@ public:
     */
    virtual int issue_time(backend_instruction *inst) = 0;
 
-   virtual void count_remaining_grf_uses(backend_instruction *inst) = 0;
+   virtual void count_reads_remaining(backend_instruction *inst) = 0;
+   virtual void setup_liveness(cfg_t *cfg) = 0;
    virtual void update_register_pressure(backend_instruction *inst) = 0;
    virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
 
@@ -453,33 +482,63 @@ public:
    bool post_reg_alloc;
    int instructions_to_schedule;
    int grf_count;
+   int hw_reg_count;
    int time;
+   int reg_pressure;
+   int block_idx;
    exec_list instructions;
    backend_shader *bs;
 
    instruction_scheduler_mode mode;
 
-   /**
-    * Number of instructions left to schedule that reference each vgrf.
-    *
-    * Used so that we can prefer scheduling instructions that will end the
-    * live intervals of multiple variables, to reduce register pressure.
+   /*
+    * The register pressure at the beginning of each basic block.
     */
-   int *remaining_grf_uses;
 
-   /**
-    * Tracks whether each VGRF has had an instruction scheduled that uses it.
-    *
-    * This is used to estimate whether scheduling a new instruction will
-    * increase register pressure.
+   int *reg_pressure_in;
+
+   /*
+    * The virtual GRF's whose range overlaps the beginning of each basic block.
+    */
+
+   BITSET_WORD **livein;
+
+   /*
+    * The virtual GRF's whose range overlaps the end of each basic block.
+    */
+
+   BITSET_WORD **liveout;
+
+   /*
+    * The hardware GRF's whose range overlaps the end of each basic block.
+    */
+
+   BITSET_WORD **hw_liveout;
+
+   /*
+    * Whether we've scheduled a write for this virtual GRF yet.
+    */
+
+   bool *written;
+
+   /*
+    * How many reads we haven't scheduled for this virtual GRF yet.
+    */
+
+   int *reads_remaining;
+
+   /*
+    * How many reads we haven't scheduled for this hardware GRF yet.
     */
-   bool *grf_active;
+
+   int *hw_reads_remaining;
 };
 
 class fs_instruction_scheduler : public instruction_scheduler
 {
 public:
-   fs_instruction_scheduler(fs_visitor *v, int grf_count,
+   fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count,
+                            int block_count,
                             instruction_scheduler_mode mode);
    void calculate_deps();
    bool is_compressed(fs_inst *inst);
@@ -487,35 +546,109 @@ public:
    int issue_time(backend_instruction *inst);
    fs_visitor *v;
 
-   void count_remaining_grf_uses(backend_instruction *inst);
+   void count_reads_remaining(backend_instruction *inst);
+   void setup_liveness(cfg_t *cfg);
    void update_register_pressure(backend_instruction *inst);
    int get_register_pressure_benefit(backend_instruction *inst);
 };
 
 fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
-                                                   int grf_count,
+                                                   int grf_count, int hw_reg_count,
+                                                   int block_count,
                                                    instruction_scheduler_mode mode)
-   : instruction_scheduler(v, grf_count, mode),
+   : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
      v(v)
 {
 }
 
+static bool
+is_src_duplicate(fs_inst *inst, int src)
+{
+   for (int i = 0; i < src; i++)
+     if (inst->src[i].equals(inst->src[src]))
+       return true;
+
+  return false;
+}
+
 void
-fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
+fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
 {
    fs_inst *inst = (fs_inst *)be;
 
-   if (!remaining_grf_uses)
+   if (!reads_remaining)
       return;
 
-   if (inst->dst.file == GRF)
-      remaining_grf_uses[inst->dst.reg]++;
-
    for (int i = 0; i < inst->sources; i++) {
-      if (inst->src[i].file != GRF)
+      if (is_src_duplicate(inst, i))
          continue;
 
-      remaining_grf_uses[inst->src[i].reg]++;
+      if (inst->src[i].file == GRF) {
+         reads_remaining[inst->src[i].reg]++;
+      } else if (inst->src[i].file == HW_REG &&
+               inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
+         if (inst->src[i].fixed_hw_reg.nr >= hw_reg_count)
+            continue;
+
+         for (int j = 0; j < inst->regs_read(i); j++)
+            hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + j]++;
+      }
+   }
+}
+
+void
+fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
+{
+   /* First, compute liveness on a per-GRF level using the in/out sets from
+    * liveness calculation.
+    */
+   for (int block = 0; block < cfg->num_blocks; block++) {
+      for (int i = 0; i < v->live_intervals->num_vars; i++) {
+         if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) {
+            int vgrf = v->live_intervals->vgrf_from_var[i];
+            if (!BITSET_TEST(livein[block], vgrf)) {
+               reg_pressure_in[block] += v->alloc.sizes[vgrf];
+               BITSET_SET(livein[block], vgrf);
+            }
+         }
+
+         if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i))
+            BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]);
+      }
+   }
+
+   /* Now, extend the live in/live out sets for when a range crosses a block
+    * boundary, which matches what our register allocator/interference code
+    * does to account for force_writemask_all and incompatible exec_mask's.
+    */
+   for (int block = 0; block < cfg->num_blocks - 1; block++) {
+      for (int i = 0; i < grf_count; i++) {
+         if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip &&
+             v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) {
+            if (!BITSET_TEST(livein[block + 1], i)) {
+                reg_pressure_in[block + 1] += v->alloc.sizes[i];
+                BITSET_SET(livein[block + 1], i);
+            }
+
+            BITSET_SET(liveout[block], i);
+         }
+      }
+   }
+
+   int payload_last_use_ip[hw_reg_count];
+   v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);
+
+   for (int i = 0; i < hw_reg_count; i++) {
+      if (payload_last_use_ip[i] == -1)
+         continue;
+
+      for (int block = 0; block < cfg->num_blocks; block++) {
+         if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
+            reg_pressure_in[block]++;
+
+         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
+            BITSET_SET(hw_liveout[block], i);
+      }
    }
 }
 
@@ -524,18 +657,24 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
 {
    fs_inst *inst = (fs_inst *)be;
 
-   if (!remaining_grf_uses)
+   if (!reads_remaining)
       return;
 
    if (inst->dst.file == GRF) {
-      remaining_grf_uses[inst->dst.reg]--;
-      grf_active[inst->dst.reg] = true;
+      written[inst->dst.reg] = true;
    }
 
    for (int i = 0; i < inst->sources; i++) {
+      if (is_src_duplicate(inst, i))
+          continue;
+
       if (inst->src[i].file == GRF) {
-         remaining_grf_uses[inst->src[i].reg]--;
-         grf_active[inst->src[i].reg] = true;
+         reads_remaining[inst->src[i].reg]--;
+      } else if (inst->src[i].file == HW_REG &&
+                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE &&
+                 inst->src[i].fixed_hw_reg.nr < hw_reg_count) {
+         for (int off = 0; off < inst->regs_read(i); off++)
+            hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + off]--;
       }
    }
 }
@@ -547,20 +686,31 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
    int benefit = 0;
 
    if (inst->dst.file == GRF) {
-      if (remaining_grf_uses[inst->dst.reg] == 1)
-         benefit += v->alloc.sizes[inst->dst.reg];
-      if (!grf_active[inst->dst.reg])
+      if (!BITSET_TEST(livein[block_idx], inst->dst.reg) &&
+          !written[inst->dst.reg])
          benefit -= v->alloc.sizes[inst->dst.reg];
    }
 
    for (int i = 0; i < inst->sources; i++) {
-      if (inst->src[i].file != GRF)
+      if (is_src_duplicate(inst, i))
          continue;
 
-      if (remaining_grf_uses[inst->src[i].reg] == 1)
+      if (inst->src[i].file == GRF &&
+          !BITSET_TEST(liveout[block_idx], inst->src[i].reg) &&
+          reads_remaining[inst->src[i].reg] == 1)
          benefit += v->alloc.sizes[inst->src[i].reg];
-      if (!grf_active[inst->src[i].reg])
-         benefit -= v->alloc.sizes[inst->src[i].reg];
+
+      if (inst->src[i].file == HW_REG &&
+          inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE &&
+          inst->src[i].fixed_hw_reg.nr < hw_reg_count) {
+         for (int off = 0; off < inst->regs_read(i); off++) {
+            int reg = inst->src[i].fixed_hw_reg.nr + off;
+            if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
+                hw_reads_remaining[reg] == 1) {
+               benefit++;
+            }
+         }
+      }
    }
 
    return benefit;
@@ -575,20 +725,26 @@ public:
    int issue_time(backend_instruction *inst);
    vec4_visitor *v;
 
-   void count_remaining_grf_uses(backend_instruction *inst);
+   void count_reads_remaining(backend_instruction *inst);
+   void setup_liveness(cfg_t *cfg);
    void update_register_pressure(backend_instruction *inst);
    int get_register_pressure_benefit(backend_instruction *inst);
 };
 
 vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
                                                        int grf_count)
-   : instruction_scheduler(v, grf_count, SCHEDULE_POST),
+   : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST),
      v(v)
 {
 }
 
 void
-vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
+vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+{
+}
+
+void
+vec4_instruction_scheduler::setup_liveness(cfg_t *cfg)
 {
 }
 
@@ -822,7 +978,7 @@ fs_instruction_scheduler::calculate_deps()
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM &&
                     (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != IMM)) {
+                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
             assert(inst->src[i].file != MRF);
             add_barrier_deps(n);
          }
@@ -927,10 +1083,10 @@ fs_instruction_scheduler::calculate_deps()
          if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
                for (int r = 0; r < inst->regs_read(i); r++)
-                  add_dep(n, last_grf_write[inst->src[i].reg + r]);
+                  add_dep(n, last_grf_write[inst->src[i].reg + r], 0);
             } else {
                for (int r = 0; r < inst->regs_read(i); r++) {
-                  add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r]);
+                  add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], 0);
                }
             }
          } else if (inst->src[i].file == HW_REG &&
@@ -941,17 +1097,17 @@ fs_instruction_scheduler::calculate_deps()
                if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
                   size = 1;
                for (int r = 0; r < size; r++)
-                  add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
+                  add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r], 0);
             } else {
-               add_dep(n, last_fixed_grf_write);
+               add_dep(n, last_fixed_grf_write, 0);
             }
          } else if (inst->src[i].is_accumulator()) {
-            add_dep(n, last_accumulator_write);
+            add_dep(n, last_accumulator_write, 0);
          } else if (inst->src[i].file != BAD_FILE &&
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM &&
                     (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != IMM)) {
+                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
             assert(inst->src[i].file != MRF);
             add_barrier_deps(n);
          }
@@ -1080,7 +1236,7 @@ vec4_instruction_scheduler::calculate_deps()
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM &&
                     (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != IMM)) {
+                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
             /* No reads from MRF, and ATTR is already translated away */
             assert(inst->src[i].file != MRF &&
                    inst->src[i].file != ATTR);
@@ -1177,7 +1333,7 @@ vec4_instruction_scheduler::calculate_deps()
                     inst->src[i].file != IMM &&
                     inst->src[i].file != UNIFORM &&
                     (inst->src[i].file != HW_REG ||
-                     inst->src[i].fixed_hw_reg.file != IMM)) {
+                     inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) {
             assert(inst->src[i].file != MRF &&
                    inst->src[i].file != ATTR);
             add_barrier_deps(n);
@@ -1387,6 +1543,9 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
    const struct brw_device_info *devinfo = bs->devinfo;
    backend_instruction *inst = block->end();
    time = 0;
+   if (!post_reg_alloc)
+      reg_pressure = reg_pressure_in[block->num];
+   block_idx = block->num;
 
    /* Remove non-DAG heads from the list. */
    foreach_in_list_safe(schedule_node, n, &instructions) {
@@ -1403,23 +1562,30 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
       chosen->remove();
       inst->insert_before(block, chosen->inst);
       instructions_to_schedule--;
-      update_register_pressure(chosen->inst);
 
-      /* Update the clock for how soon an instruction could start after the
-       * chosen one.
-       */
-      time += issue_time(chosen->inst);
+      if (!post_reg_alloc) {
+         reg_pressure -= get_register_pressure_benefit(chosen->inst);
+         update_register_pressure(chosen->inst);
+      }
 
       /* If we expected a delay for scheduling, then bump the clock to reflect
-       * that as well.  In reality, the hardware will switch to another
-       * hyperthread and may not return to dispatching our thread for a while
-       * even after we're unblocked.
+       * that.  In reality, the hardware will switch to another hyperthread
+       * and may not return to dispatching our thread for a while even after
+       * we're unblocked.  After this, we have the time when the chosen
+       * instruction will start executing.
        */
       time = MAX2(time, chosen->unblocked_time);
 
+      /* Update the clock for how soon an instruction could start after the
+       * chosen one.
+       */
+      time += issue_time(chosen->inst);
+
       if (debug) {
          fprintf(stderr, "clock %4d, scheduled: ", time);
          bs->dump_instruction(chosen->inst);
+         if (!post_reg_alloc)
+            fprintf(stderr, "(register pressure %d)\n", reg_pressure);
       }
 
       /* Now that we've scheduled a new instruction, some of its
@@ -1466,30 +1632,53 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
    if (block->end()->opcode == BRW_OPCODE_NOP)
       block->end()->remove(block);
    assert(instructions_to_schedule == 0);
+
+   block->cycle_count = time;
+}
+
+static unsigned get_cycle_count(cfg_t *cfg)
+{
+   unsigned count = 0, multiplier = 1;
+   foreach_block(block, cfg) {
+      if (block->start()->opcode == BRW_OPCODE_DO)
+         multiplier *= 10; /* assume that loops execute ~10 times */
+
+      count += block->cycle_count * multiplier;
+
+      if (block->end()->opcode == BRW_OPCODE_WHILE)
+         multiplier /= 10;
+   }
+
+   return count;
 }
 
 void
 instruction_scheduler::run(cfg_t *cfg)
 {
-   if (debug) {
+   if (debug && !post_reg_alloc) {
       fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
               post_reg_alloc);
-      bs->dump_instructions();
+         bs->dump_instructions();
    }
 
-   /* Populate the remaining GRF uses array to improve the pre-regalloc
-    * scheduling.
-    */
-   if (remaining_grf_uses) {
-      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
-         count_remaining_grf_uses(inst);
-      }
-   }
+   if (!post_reg_alloc)
+      setup_liveness(cfg);
 
    foreach_block(block, cfg) {
       if (block->end_ip - block->start_ip <= 1)
          continue;
 
+      if (reads_remaining) {
+         memset(reads_remaining, 0,
+                grf_count * sizeof(*reads_remaining));
+         memset(hw_reads_remaining, 0,
+                hw_reg_count * sizeof(*hw_reads_remaining));
+         memset(written, 0, grf_count * sizeof(*written));
+
+         foreach_inst_in_block(fs_inst, inst, block)
+            count_reads_remaining(inst);
+      }
+
       add_insts_from_block(block);
 
       calculate_deps();
@@ -1501,23 +1690,29 @@ instruction_scheduler::run(cfg_t *cfg)
       schedule_instructions(block);
    }
 
-   if (debug) {
+   if (debug && !post_reg_alloc) {
       fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
               post_reg_alloc);
       bs->dump_instructions();
    }
+
+   cfg->cycle_count = get_cycle_count(cfg);
 }
 
 void
 fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
 {
+   if (mode != SCHEDULE_POST)
+      calculate_live_intervals();
+
    int grf_count;
    if (mode == SCHEDULE_POST)
       grf_count = grf_used;
    else
       grf_count = alloc.count;
 
-   fs_instruction_scheduler sched(this, grf_count, mode);
+   fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf,
+                                  cfg->num_blocks, mode);
    sched.run(cfg);
 
    if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index e48f559afa7..063cb84a958 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -298,6 +298,8 @@ brw_instruction_name(enum opcode op)
       return "fb_write";
    case FS_OPCODE_FB_WRITE_LOGICAL:
       return "fb_write_logical";
+   case FS_OPCODE_PACK_STENCIL_REF:
+      return "pack_stencil_ref";
    case FS_OPCODE_BLORP_FB_WRITE:
       return "blorp_fb_write";
    case FS_OPCODE_REP_FB_WRITE:
@@ -988,6 +990,20 @@ backend_instruction::has_side_effects() const
    }
 }
 
+bool
+backend_instruction::is_volatile() const
+{
+   switch (opcode) {
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      return true;
+   default:
+      return false;
+   }
+}
+
 #ifndef NDEBUG
 static bool
 inst_is_in_block(const bblock_t *block, const backend_instruction *inst)
@@ -1178,9 +1194,9 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage,
       stage_prog_data->binding_table.gather_texture_start = 0xd0d0d0d0;
    }
 
-   if (shader_prog && shader_prog->NumAtomicBuffers) {
+   if (shader && shader->NumAtomicBuffers) {
       stage_prog_data->binding_table.abo_start = next_binding_table_offset;
-      next_binding_table_offset += shader_prog->NumAtomicBuffers;
+      next_binding_table_offset += shader->NumAtomicBuffers;
    } else {
       stage_prog_data->binding_table.abo_start = 0xd0d0d0d0;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 8899b30c1ae..f4647cca4f9 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -115,6 +115,12 @@ struct backend_instruction : public exec_node {
     * optimize these out unless you know what you are doing.
     */
    bool has_side_effects() const;
+
+   /**
+    * True if the instruction might be affected by side effects of other
+    * instructions.
+    */
+   bool is_volatile() const;
 #else
 struct backend_instruction {
    struct exec_node link;
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index dc2b9415673..2aa1248fea6 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -49,6 +49,7 @@ extern const struct brw_tracked_state brw_clip_unit;
 extern const struct brw_tracked_state brw_vs_pull_constants;
 extern const struct brw_tracked_state brw_gs_pull_constants;
 extern const struct brw_tracked_state brw_wm_pull_constants;
+extern const struct brw_tracked_state brw_cs_pull_constants;
 extern const struct brw_tracked_state brw_constant_buffer;
 extern const struct brw_tracked_state brw_curbe_offsets;
 extern const struct brw_tracked_state brw_invariant_state;
@@ -220,7 +221,7 @@ bool brw_search_cache(struct brw_cache *cache,
 		      enum brw_cache_id cache_id,
 		      const void *key,
 		      GLuint key_size,
-		      uint32_t *inout_offset, void *out_aux);
+		      uint32_t *inout_offset, void *inout_aux);
 void brw_state_cache_check_size( struct brw_context *brw );
 
 void brw_init_caches( struct brw_context *brw );
@@ -345,7 +346,8 @@ calculate_attr_overrides(const struct brw_context *brw,
                          uint16_t *attr_overrides,
                          uint32_t *point_sprite_enables,
                          uint32_t *flat_enables,
-                         uint32_t *urb_entry_read_length);
+                         uint32_t *urb_entry_read_length,
+                         uint32_t *urb_entry_read_offset);
 
 /* gen6_surface_state.c */
 void gen6_init_vtable_surface_functions(struct brw_context *brw);
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 2fbcd146750..f7c0a2037d9 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -137,7 +137,7 @@ bool
 brw_search_cache(struct brw_cache *cache,
                  enum brw_cache_id cache_id,
                  const void *key, GLuint key_size,
-                 uint32_t *inout_offset, void *out_aux)
+                 uint32_t *inout_offset, void *inout_aux)
 {
    struct brw_context *brw = cache->brw;
    struct brw_cache_item *item;
@@ -155,11 +155,12 @@ brw_search_cache(struct brw_cache *cache,
    if (item == NULL)
       return false;
 
-   *(void **)out_aux = ((char *)item->key + item->key_size);
+   void *aux = ((char *) item->key) + item->key_size;
 
-   if (item->offset != *inout_offset) {
+   if (item->offset != *inout_offset || aux != *((void **) inout_aux)) {
       brw->ctx.NewDriverState |= (1 << cache_id);
       *inout_offset = item->offset;
+      *((void **) inout_aux) = aux;
    }
 
    return true;
@@ -349,11 +350,6 @@ brw_init_caches(struct brw_context *brw)
 				  4096, 64);
    if (brw->has_llc)
       drm_intel_gem_bo_map_unsynchronized(cache->bo);
-
-   cache->aux_free[BRW_CACHE_VS_PROG] = brw_stage_prog_data_free;
-   cache->aux_free[BRW_CACHE_GS_PROG] = brw_stage_prog_data_free;
-   cache->aux_free[BRW_CACHE_FS_PROG] = brw_stage_prog_data_free;
-   cache->aux_free[BRW_CACHE_CS_PROG] = brw_stage_prog_data_free;
 }
 
 static void
@@ -367,9 +363,12 @@ brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
    for (i = 0; i < cache->size; i++) {
       for (c = cache->items[i]; c; c = next) {
 	 next = c->next;
-         if (cache->aux_free[c->cache_id]) {
+         if (c->cache_id == BRW_CACHE_VS_PROG ||
+             c->cache_id == BRW_CACHE_GS_PROG ||
+             c->cache_id == BRW_CACHE_FS_PROG ||
+             c->cache_id == BRW_CACHE_CS_PROG) {
             const void *item_aux = c->key + c->key_size;
-            cache->aux_free[c->cache_id](item_aux);
+            brw_stage_prog_data_free(item_aux);
          }
 	 free((void *)c->key);
 	 free(c);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 79b8301954e..0344b8a7fb0 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -259,6 +259,7 @@ static const struct brw_tracked_state *gen7_compute_atoms[] =
    &brw_state_base_address,
    &brw_cs_image_surfaces,
    &gen7_cs_push_constants,
+   &brw_cs_pull_constants,
    &brw_cs_ubo_surfaces,
    &brw_cs_abo_surfaces,
    &brw_texture_surfaces,
@@ -353,6 +354,7 @@ static const struct brw_tracked_state *gen8_compute_atoms[] =
    &gen8_state_base_address,
    &brw_cs_image_surfaces,
    &gen7_cs_push_constants,
+   &brw_cs_pull_constants,
    &brw_cs_ubo_surfaces,
    &brw_cs_abo_surfaces,
    &brw_texture_surfaces,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 3e7078d0b32..01eb1580953 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -1370,9 +1370,10 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    vec4_instruction *inst = (vec4_instruction *)be_inst;
 
    if (inst->predicate) {
-      fprintf(file, "(%cf0.%d) ",
+      fprintf(file, "(%cf0.%d%s) ",
               inst->predicate_inverse ? '-' : '+',
-              inst->flag_subreg);
+              inst->flag_subreg,
+              pred_ctrl_align16[inst->predicate]);
    }
 
    fprintf(file, "%s", brw_instruction_name(inst->opcode));
@@ -1426,9 +1427,10 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    case BAD_FILE:
       fprintf(file, "(null)");
       break;
-   default:
-      fprintf(file, "???");
-      break;
+   case IMM:
+   case ATTR:
+   case UNIFORM:
+      unreachable("not reached");
    }
    if (inst->dst.writemask != WRITEMASK_XYZW) {
       fprintf(file, ".");
@@ -1520,9 +1522,8 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       case BAD_FILE:
          fprintf(file, "(null)");
          break;
-      default:
-         fprintf(file, "???");
-         break;
+      case MRF:
+         unreachable("not reached");
       }
 
       /* Don't print .0; and only VGRFs have reg_offsets and sizes */
@@ -1787,13 +1788,100 @@ vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
    emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE)));
 
    time.type = BRW_REGISTER_TYPE_UD;
-   emit(MOV(time, src_reg(value)));
+   emit(MOV(time, value));
 
    vec4_instruction *inst =
       emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
    inst->mlen = 2;
 }
 
+void
+vec4_visitor::convert_to_hw_regs()
+{
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         struct src_reg &src = inst->src[i];
+         struct brw_reg reg;
+         switch (src.file) {
+         case GRF:
+            reg = brw_vec8_grf(src.reg + src.reg_offset, 0);
+            reg.type = src.type;
+            reg.dw1.bits.swizzle = src.swizzle;
+            reg.abs = src.abs;
+            reg.negate = src.negate;
+            break;
+
+         case IMM:
+            reg = brw_imm_reg(src.type);
+            reg.dw1.ud = src.fixed_hw_reg.dw1.ud;
+            break;
+
+         case UNIFORM:
+            reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
+                                      (src.reg + src.reg_offset) / 2,
+                                      ((src.reg + src.reg_offset) % 2) * 4),
+                         0, 4, 1);
+            reg.type = src.type;
+            reg.dw1.bits.swizzle = src.swizzle;
+            reg.abs = src.abs;
+            reg.negate = src.negate;
+
+            /* This should have been moved to pull constants. */
+            assert(!src.reladdr);
+            break;
+
+         case HW_REG:
+            assert(src.type == src.fixed_hw_reg.type);
+            continue;
+
+         case BAD_FILE:
+            /* Probably unused. */
+            reg = brw_null_reg();
+            break;
+
+         case MRF:
+         case ATTR:
+            unreachable("not reached");
+         }
+         src.fixed_hw_reg = reg;
+      }
+
+      dst_reg &dst = inst->dst;
+      struct brw_reg reg;
+
+      switch (inst->dst.file) {
+      case GRF:
+         reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
+         reg.type = dst.type;
+         reg.dw1.bits.writemask = dst.writemask;
+         break;
+
+      case MRF:
+         assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) < BRW_MAX_MRF(devinfo->gen));
+         reg = brw_message_reg(dst.reg + dst.reg_offset);
+         reg.type = dst.type;
+         reg.dw1.bits.writemask = dst.writemask;
+         break;
+
+      case HW_REG:
+         assert(dst.type == dst.fixed_hw_reg.type);
+         reg = dst.fixed_hw_reg;
+         break;
+
+      case BAD_FILE:
+         reg = brw_null_reg();
+         break;
+
+      case IMM:
+      case ATTR:
+      case UNIFORM:
+         unreachable("not reached");
+      }
+
+      dst.fixed_hw_reg = reg;
+   }
+}
+
 bool
 vec4_visitor::run()
 {
@@ -1862,6 +1950,7 @@ vec4_visitor::run()
       OPT(dead_code_eliminate);
       OPT(dead_control_flow_eliminate, this);
       OPT(opt_copy_propagation);
+      OPT(opt_cmod_propagation);
       OPT(opt_cse);
       OPT(opt_algebraic);
       OPT(opt_register_coalesce);
@@ -1914,6 +2003,8 @@ vec4_visitor::run()
 
    opt_set_dependency_control();
 
+   convert_to_hw_regs();
+
    if (last_scratch > 0) {
       prog_data->base.total_scratch =
          brw_get_scratch_size(last_scratch * REG_SIZE);
@@ -2020,9 +2111,9 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
          return NULL;
       }
 
-      vec4_generator g(compiler, log_data, &prog_data->base,
-                       mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
-      assembly = g.generate_assembly(v.cfg, final_assembly_size, shader);
+      assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
+                                            shader, &prog_data->base, v.cfg,
+                                            final_assembly_size);
    }
 
    return assembly;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index d861b2e85df..ec8abf49cd8 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -52,6 +52,15 @@ extern "C" {
 extern "C" {
 #endif
 
+const unsigned *
+brw_vec4_generate_assembly(const struct brw_compiler *compiler,
+                           void *log_data,
+                           void *mem_ctx,
+                           const nir_shader *nir,
+                           struct brw_vue_prog_data *prog_data,
+                           const struct cfg_t *cfg,
+                           unsigned *out_assembly_size);
+
 #ifdef __cplusplus
 } /* extern "C" */
 
@@ -149,6 +158,7 @@ public:
    int var_range_start(unsigned v, unsigned n) const;
    int var_range_end(unsigned v, unsigned n) const;
    bool virtual_grf_interferes(int a, int b);
+   bool opt_cmod_propagation();
    bool opt_copy_propagation(bool do_constant_prop = true);
    bool opt_cse_local(bblock_t *block);
    bool opt_cse();
@@ -158,6 +168,7 @@ public:
    bool is_dep_ctrl_unsafe(const vec4_instruction *inst);
    void opt_set_dependency_control();
    void opt_schedule_instructions();
+   void convert_to_hw_regs();
 
    vec4_instruction *emit(vec4_instruction *inst);
 
@@ -381,117 +392,6 @@ private:
    unsigned last_scratch; /**< measured in 32-byte (register size) units */
 };
 
-
-/**
- * The vertex shader code generator.
- *
- * Translates VS IR to actual i965 assembly code.
- */
-class vec4_generator
-{
-public:
-   vec4_generator(const struct brw_compiler *compiler, void *log_data,
-                  struct brw_vue_prog_data *prog_data,
-                  void *mem_ctx,
-                  bool debug_flag,
-                  const char *stage_name,
-                  const char *stage_abbrev);
-   ~vec4_generator();
-
-   const unsigned *generate_assembly(const cfg_t *cfg, unsigned *asm_size,
-                                     const nir_shader *nir);
-
-private:
-   void generate_code(const cfg_t *cfg, const nir_shader *nir);
-
-   void generate_math1_gen4(vec4_instruction *inst,
-			    struct brw_reg dst,
-			    struct brw_reg src);
-   void generate_math2_gen4(vec4_instruction *inst,
-			    struct brw_reg dst,
-			    struct brw_reg src0,
-			    struct brw_reg src1);
-   void generate_math_gen6(vec4_instruction *inst,
-                           struct brw_reg dst,
-                           struct brw_reg src0,
-                           struct brw_reg src1);
-
-   void generate_tex(vec4_instruction *inst,
-                     struct brw_reg dst,
-                     struct brw_reg src,
-                     struct brw_reg sampler_index);
-
-   void generate_vs_urb_write(vec4_instruction *inst);
-   void generate_gs_urb_write(vec4_instruction *inst);
-   void generate_gs_urb_write_allocate(vec4_instruction *inst);
-   void generate_gs_thread_end(vec4_instruction *inst);
-   void generate_gs_set_write_offset(struct brw_reg dst,
-                                     struct brw_reg src0,
-                                     struct brw_reg src1);
-   void generate_gs_set_vertex_count(struct brw_reg dst,
-                                     struct brw_reg src);
-   void generate_gs_svb_write(vec4_instruction *inst,
-                              struct brw_reg dst,
-                              struct brw_reg src0,
-                              struct brw_reg src1);
-   void generate_gs_svb_set_destination_index(vec4_instruction *inst,
-                                              struct brw_reg dst,
-                                              struct brw_reg src);
-   void generate_gs_set_dword_2(struct brw_reg dst, struct brw_reg src);
-   void generate_gs_prepare_channel_masks(struct brw_reg dst);
-   void generate_gs_set_channel_masks(struct brw_reg dst, struct brw_reg src);
-   void generate_gs_get_instance_id(struct brw_reg dst);
-   void generate_gs_ff_sync_set_primitives(struct brw_reg dst,
-                                           struct brw_reg src0,
-                                           struct brw_reg src1,
-                                           struct brw_reg src2);
-   void generate_gs_ff_sync(vec4_instruction *inst,
-                            struct brw_reg dst,
-                            struct brw_reg src0,
-                            struct brw_reg src1);
-   void generate_gs_set_primitive_id(struct brw_reg dst);
-   void generate_oword_dual_block_offsets(struct brw_reg m1,
-					  struct brw_reg index);
-   void generate_scratch_write(vec4_instruction *inst,
-			       struct brw_reg dst,
-			       struct brw_reg src,
-			       struct brw_reg index);
-   void generate_scratch_read(vec4_instruction *inst,
-			      struct brw_reg dst,
-			      struct brw_reg index);
-   void generate_pull_constant_load(vec4_instruction *inst,
-				    struct brw_reg dst,
-				    struct brw_reg index,
-				    struct brw_reg offset);
-   void generate_pull_constant_load_gen7(vec4_instruction *inst,
-                                         struct brw_reg dst,
-                                         struct brw_reg surf_index,
-                                         struct brw_reg offset);
-   void generate_set_simd4x2_header_gen9(vec4_instruction *inst,
-                                         struct brw_reg dst);
-
-   void generate_get_buffer_size(vec4_instruction *inst,
-                                 struct brw_reg dst,
-                                 struct brw_reg src,
-                                 struct brw_reg index);
-
-   void generate_unpack_flags(struct brw_reg dst);
-
-   const struct brw_compiler *compiler;
-   void *log_data; /* Passed to compiler->*_log functions */
-
-   const struct brw_device_info *devinfo;
-
-   struct brw_codegen *p;
-
-   struct brw_vue_prog_data *prog_data;
-
-   void *mem_ctx;
-   const char *stage_name;
-   const char *stage_abbrev;
-   const bool debug_flag;
-};
-
 } /* namespace brw */
 #endif /* __cplusplus */
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
new file mode 100644
index 00000000000..329f24269ce
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+/** @file brw_vec4_cmod_propagation.cpp
+ *
+ * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check
+ * brw_fs_cmod_propagation for further details on the rationale behind this
+ * optimization.
+ */
+
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+
+namespace brw {
+
+static bool
+opt_cmod_propagation_local(bblock_t *block)
+{
+   bool progress = false;
+   int ip = block->end_ip + 1;
+
+   foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
+      ip--;
+
+      if ((inst->opcode != BRW_OPCODE_AND &&
+           inst->opcode != BRW_OPCODE_CMP &&
+           inst->opcode != BRW_OPCODE_MOV) ||
+          inst->predicate != BRW_PREDICATE_NONE ||
+          !inst->dst.is_null() ||
+          inst->src[0].file != GRF ||
+          inst->src[0].abs)
+         continue;
+
+      if (inst->opcode == BRW_OPCODE_AND &&
+          !(inst->src[1].is_one() &&
+            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+            !inst->src[0].negate))
+         continue;
+
+      if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero())
+         continue;
+
+      if (inst->opcode == BRW_OPCODE_MOV &&
+          inst->conditional_mod != BRW_CONDITIONAL_NZ)
+         continue;
+
+      bool read_flag = false;
+      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
+         if (inst->src[0].in_range(scan_inst->dst,
+                                   scan_inst->regs_written)) {
+            if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
+                scan_inst->dst.reg_offset != inst->src[0].reg_offset ||
+                (scan_inst->dst.writemask != WRITEMASK_X &&
+                 scan_inst->dst.writemask != WRITEMASK_XYZW) ||
+                (scan_inst->dst.writemask == WRITEMASK_XYZW &&
+                 inst->src[0].swizzle != BRW_SWIZZLE_XYZW) ||
+                (inst->dst.writemask & ~scan_inst->dst.writemask) != 0) {
+               break;
+            }
+
+            /* CMP's result is the same regardless of dest type. */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                scan_inst->opcode == BRW_OPCODE_CMP &&
+                (inst->dst.type == BRW_REGISTER_TYPE_D ||
+                 inst->dst.type == BRW_REGISTER_TYPE_UD)) {
+               inst->remove(block);
+               progress = true;
+               break;
+            }
+
+            /* If the AND wasn't handled by the previous case, it isn't safe
+             * to remove it.
+             */
+            if (inst->opcode == BRW_OPCODE_AND)
+               break;
+
+            /* Comparisons operate differently for ints and floats */
+            if (scan_inst->dst.type != inst->dst.type &&
+                (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
+                 inst->dst.type == BRW_REGISTER_TYPE_F))
+               break;
+
+            /* If the instruction generating inst's source also wrote the
+             * flag, and inst is doing a simple .nz comparison, then inst
+             * is redundant - the appropriate value is already in the flag
+             * register.  Delete inst.
+             */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                !inst->src[0].negate &&
+                scan_inst->writes_flag()) {
+               inst->remove(block);
+               progress = true;
+               break;
+            }
+
+            /* Otherwise, try propagating the conditional. */
+            enum brw_conditional_mod cond =
+               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
+                                   : inst->conditional_mod;
+
+            if (scan_inst->can_do_cmod() &&
+                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+                 scan_inst->conditional_mod == cond)) {
+               scan_inst->conditional_mod = cond;
+               inst->remove(block);
+               progress = true;
+            }
+            break;
+         }
+
+         if (scan_inst->writes_flag())
+            break;
+
+         read_flag = read_flag || scan_inst->reads_flag();
+      }
+   }
+
+   return progress;
+}
+
+bool
+vec4_visitor::opt_cmod_propagation()
+{
+   bool progress = false;
+
+   foreach_block_reverse(block, cfg) {
+      progress = opt_cmod_propagation_local(block) || progress;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+} /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
index 8fc7a365bfc..284e0a8d0a5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp
@@ -78,13 +78,19 @@ vec4_visitor::dead_code_eliminate()
              sizeof(BITSET_WORD));
 
       foreach_inst_in_block_reverse(vec4_instruction, inst, block) {
-         if (inst->dst.file == GRF && !inst->has_side_effects()) {
+         if ((inst->dst.file == GRF && !inst->has_side_effects()) ||
+             (inst->dst.is_null() && inst->writes_flag())){
             bool result_live[4] = { false };
 
-            for (unsigned i = 0; i < inst->regs_written; i++) {
-               for (int c = 0; c < 4; c++)
-                  result_live[c] |= BITSET_TEST(
-                     live, var_from_reg(alloc, offset(inst->dst, i), c));
+            if (inst->dst.file == GRF) {
+               for (unsigned i = 0; i < inst->regs_written; i++) {
+                  for (int c = 0; c < 4; c++)
+                     result_live[c] |= BITSET_TEST(
+                        live, var_from_reg(alloc, offset(inst->dst, i), c));
+               }
+            } else {
+               for (unsigned c = 0; c < 4; c++)
+                  result_live[c] = BITSET_TEST(flag_live, c);
             }
 
             /* If the instruction can't do writemasking, then it's all or
@@ -117,7 +123,11 @@ vec4_visitor::dead_code_eliminate()
          }
 
          if (inst->dst.is_null() && inst->writes_flag()) {
-            if (!BITSET_TEST(flag_live, 0)) {
+            bool combined_live = false;
+            for (unsigned c = 0; c < 4; c++)
+               combined_live |= BITSET_TEST(flag_live, c);
+
+            if (!combined_live) {
                inst->opcode = BRW_OPCODE_NOP;
                progress = true;
                continue;
@@ -136,7 +146,8 @@ vec4_visitor::dead_code_eliminate()
          }
 
          if (inst->writes_flag()) {
-            BITSET_CLEAR(flag_live, 0);
+            for (unsigned c = 0; c < 4; c++)
+               BITSET_CLEAR(flag_live, c);
          }
 
          for (int i = 0; i < 3; i++) {
@@ -150,8 +161,10 @@ vec4_visitor::dead_code_eliminate()
             }
          }
 
-         if (inst->reads_flag()) {
-            BITSET_SET(flag_live, 0);
+         for (unsigned c = 0; c < 4; c++) {
+            if (inst->reads_flag(c)) {
+               BITSET_SET(flag_live, c);
+            }
          }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index a84f6c47471..8bc21df5ffc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -20,146 +20,17 @@
  * IN THE SOFTWARE.
  */
 
-#include <ctype.h>
 #include "glsl/glsl_parser_extras.h"
 #include "brw_vec4.h"
 #include "brw_cfg.h"
 
-extern "C" {
-#include "brw_eu.h"
-#include "main/macros.h"
-#include "program/prog_print.h"
-#include "program/prog_parameter.h"
-};
+using namespace brw;
 
-namespace brw {
-
-struct brw_reg
-vec4_instruction::get_dst(unsigned gen)
-{
-   struct brw_reg brw_reg;
-
-   switch (dst.file) {
-   case GRF:
-      brw_reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
-      brw_reg = retype(brw_reg, dst.type);
-      brw_reg.dw1.bits.writemask = dst.writemask;
-      break;
-
-   case MRF:
-      assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) < BRW_MAX_MRF(gen));
-      brw_reg = brw_message_reg(dst.reg + dst.reg_offset);
-      brw_reg = retype(brw_reg, dst.type);
-      brw_reg.dw1.bits.writemask = dst.writemask;
-      break;
-
-   case HW_REG:
-      assert(dst.type == dst.fixed_hw_reg.type);
-      brw_reg = dst.fixed_hw_reg;
-      break;
-
-   case BAD_FILE:
-      brw_reg = brw_null_reg();
-      break;
-
-   default:
-      unreachable("not reached");
-   }
-   return brw_reg;
-}
-
-struct brw_reg
-vec4_instruction::get_src(const struct brw_vue_prog_data *prog_data, int i)
-{
-   struct brw_reg brw_reg;
-
-   switch (src[i].file) {
-   case GRF:
-      brw_reg = brw_vec8_grf(src[i].reg + src[i].reg_offset, 0);
-      brw_reg = retype(brw_reg, src[i].type);
-      brw_reg.dw1.bits.swizzle = src[i].swizzle;
-      if (src[i].abs)
-	 brw_reg = brw_abs(brw_reg);
-      if (src[i].negate)
-	 brw_reg = negate(brw_reg);
-      break;
-
-   case IMM:
-      switch (src[i].type) {
-      case BRW_REGISTER_TYPE_F:
-	 brw_reg = brw_imm_f(src[i].fixed_hw_reg.dw1.f);
-	 break;
-      case BRW_REGISTER_TYPE_D:
-	 brw_reg = brw_imm_d(src[i].fixed_hw_reg.dw1.d);
-	 break;
-      case BRW_REGISTER_TYPE_UD:
-	 brw_reg = brw_imm_ud(src[i].fixed_hw_reg.dw1.ud);
-	 break;
-      case BRW_REGISTER_TYPE_VF:
-         brw_reg = brw_imm_vf(src[i].fixed_hw_reg.dw1.ud);
-         break;
-      default:
-	 unreachable("not reached");
-      }
-      break;
-
-   case UNIFORM:
-      brw_reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg +
-                                    (src[i].reg + src[i].reg_offset) / 2,
-				    ((src[i].reg + src[i].reg_offset) % 2) * 4),
-		       0, 4, 1);
-      brw_reg = retype(brw_reg, src[i].type);
-      brw_reg.dw1.bits.swizzle = src[i].swizzle;
-      if (src[i].abs)
-	 brw_reg = brw_abs(brw_reg);
-      if (src[i].negate)
-	 brw_reg = negate(brw_reg);
-
-      /* This should have been moved to pull constants. */
-      assert(!src[i].reladdr);
-      break;
-
-   case HW_REG:
-      assert(src[i].type == src[i].fixed_hw_reg.type);
-      brw_reg = src[i].fixed_hw_reg;
-      break;
-
-   case BAD_FILE:
-      /* Probably unused. */
-      brw_reg = brw_null_reg();
-      break;
-   case ATTR:
-   default:
-      unreachable("not reached");
-   }
-
-   return brw_reg;
-}
-
-vec4_generator::vec4_generator(const struct brw_compiler *compiler,
-                               void *log_data,
-                               struct brw_vue_prog_data *prog_data,
-                               void *mem_ctx,
-                               bool debug_flag,
-                               const char *stage_name,
-                               const char *stage_abbrev)
-   : compiler(compiler), log_data(log_data), devinfo(compiler->devinfo),
-     prog_data(prog_data),
-     mem_ctx(mem_ctx), stage_name(stage_name), stage_abbrev(stage_abbrev),
-     debug_flag(debug_flag)
-{
-   p = rzalloc(mem_ctx, struct brw_codegen);
-   brw_init_codegen(devinfo, p, mem_ctx);
-}
-
-vec4_generator::~vec4_generator()
-{
-}
-
-void
-vec4_generator::generate_math1_gen4(vec4_instruction *inst,
-                                    struct brw_reg dst,
-                                    struct brw_reg src)
+static void
+generate_math1_gen4(struct brw_codegen *p,
+                    vec4_instruction *inst,
+                    struct brw_reg dst,
+                    struct brw_reg src)
 {
    gen4_math(p,
 	     dst,
@@ -178,11 +49,12 @@ check_gen6_math_src_arg(struct brw_reg src)
    assert(src.dw1.bits.swizzle == BRW_SWIZZLE_XYZW);
 }
 
-void
-vec4_generator::generate_math_gen6(vec4_instruction *inst,
-                                   struct brw_reg dst,
-                                   struct brw_reg src0,
-                                   struct brw_reg src1)
+static void
+generate_math_gen6(struct brw_codegen *p,
+                   vec4_instruction *inst,
+                   struct brw_reg dst,
+                   struct brw_reg src0,
+                   struct brw_reg src1)
 {
    /* Can't do writemask because math can't be align16. */
    assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
@@ -196,11 +68,12 @@ vec4_generator::generate_math_gen6(vec4_instruction *inst,
    brw_set_default_access_mode(p, BRW_ALIGN_16);
 }
 
-void
-vec4_generator::generate_math2_gen4(vec4_instruction *inst,
-                                    struct brw_reg dst,
-                                    struct brw_reg src0,
-                                    struct brw_reg src1)
+static void
+generate_math2_gen4(struct brw_codegen *p,
+                    vec4_instruction *inst,
+                    struct brw_reg dst,
+                    struct brw_reg src0,
+                    struct brw_reg src1)
 {
    /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
     * "Message Payload":
@@ -229,12 +102,15 @@ vec4_generator::generate_math2_gen4(vec4_instruction *inst,
 	     BRW_MATH_PRECISION_FULL);
 }
 
-void
-vec4_generator::generate_tex(vec4_instruction *inst,
-                             struct brw_reg dst,
-                             struct brw_reg src,
-                             struct brw_reg sampler_index)
+static void
+generate_tex(struct brw_codegen *p,
+             struct brw_vue_prog_data *prog_data,
+             vec4_instruction *inst,
+             struct brw_reg dst,
+             struct brw_reg src,
+             struct brw_reg sampler_index)
 {
+   const struct brw_device_info *devinfo = p->devinfo;
    int msg_type = -1;
 
    if (devinfo->gen >= 5) {
@@ -440,8 +316,8 @@ vec4_generator::generate_tex(vec4_instruction *inst,
    }
 }
 
-void
-vec4_generator::generate_vs_urb_write(vec4_instruction *inst)
+static void
+generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
 {
    brw_urb_WRITE(p,
 		 brw_null_reg(), /* dest */
@@ -454,8 +330,8 @@ vec4_generator::generate_vs_urb_write(vec4_instruction *inst)
 		 BRW_URB_SWIZZLE_INTERLEAVE);
 }
 
-void
-vec4_generator::generate_gs_urb_write(vec4_instruction *inst)
+static void
+generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
 {
    struct brw_reg src = brw_message_reg(inst->base_mrf);
    brw_urb_WRITE(p,
@@ -469,14 +345,14 @@ vec4_generator::generate_gs_urb_write(vec4_instruction *inst)
                  BRW_URB_SWIZZLE_INTERLEAVE);
 }
 
-void
-vec4_generator::generate_gs_urb_write_allocate(vec4_instruction *inst)
+static void
+generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
 {
    struct brw_reg src = brw_message_reg(inst->base_mrf);
 
    /* We pass the temporary passed in src0 as the writeback register */
    brw_urb_WRITE(p,
-                 inst->get_src(this->prog_data, 0), /* dest */
+                 inst->src[0].fixed_hw_reg, /* dest */
                  inst->base_mrf, /* starting mrf reg nr */
                  src,
                  BRW_URB_WRITE_ALLOCATE_COMPLETE,
@@ -489,14 +365,13 @@ vec4_generator::generate_gs_urb_write_allocate(vec4_instruction *inst)
    brw_push_insn_state(p);
    brw_set_default_access_mode(p, BRW_ALIGN_1);
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-   brw_MOV(p, get_element_ud(inst->get_dst(devinfo->gen), 0),
-           get_element_ud(inst->get_src(this->prog_data, 0), 0));
-   brw_set_default_access_mode(p, BRW_ALIGN_16);
+   brw_MOV(p, get_element_ud(inst->dst.fixed_hw_reg, 0),
+           get_element_ud(inst->src[0].fixed_hw_reg, 0));
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_thread_end(vec4_instruction *inst)
+static void
+generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
 {
    struct brw_reg src = brw_message_reg(inst->base_mrf);
    brw_urb_WRITE(p,
@@ -510,10 +385,11 @@ vec4_generator::generate_gs_thread_end(vec4_instruction *inst)
                  BRW_URB_SWIZZLE_INTERLEAVE);
 }
 
-void
-vec4_generator::generate_gs_set_write_offset(struct brw_reg dst,
-                                             struct brw_reg src0,
-                                             struct brw_reg src1)
+static void
+generate_gs_set_write_offset(struct brw_codegen *p,
+                             struct brw_reg dst,
+                             struct brw_reg src0,
+                             struct brw_reg src1)
 {
    /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
     * Header: M0.3):
@@ -536,29 +412,29 @@ vec4_generator::generate_gs_set_write_offset(struct brw_reg dst,
    brw_push_insn_state(p);
    brw_set_default_access_mode(p, BRW_ALIGN_1);
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-   assert(devinfo->gen >= 7 &&
+   assert(p->devinfo->gen >= 7 &&
           src1.file == BRW_IMMEDIATE_VALUE &&
           src1.type == BRW_REGISTER_TYPE_UD &&
           src1.dw1.ud <= USHRT_MAX);
-   if (src0.file == IMM) {
+   if (src0.file == BRW_IMMEDIATE_VALUE) {
       brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
               brw_imm_ud(src0.dw1.ud * src1.dw1.ud));
    } else {
       brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
               retype(src1, BRW_REGISTER_TYPE_UW));
    }
-   brw_set_default_access_mode(p, BRW_ALIGN_16);
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst,
-                                             struct brw_reg src)
+static void
+generate_gs_set_vertex_count(struct brw_codegen *p,
+                             struct brw_reg dst,
+                             struct brw_reg src)
 {
    brw_push_insn_state(p);
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
 
-   if (devinfo->gen >= 8) {
+   if (p->devinfo->gen >= 8) {
       /* Move the vertex count into the second MRF for the EOT write. */
       brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
               src);
@@ -580,16 +456,17 @@ vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst,
       brw_MOV(p,
               suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
               stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
-      brw_set_default_access_mode(p, BRW_ALIGN_16);
    }
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_svb_write(vec4_instruction *inst,
-                                      struct brw_reg dst,
-                                      struct brw_reg src0,
-                                      struct brw_reg src1)
+static void
+generate_gs_svb_write(struct brw_codegen *p,
+                      struct brw_vue_prog_data *prog_data,
+                      vec4_instruction *inst,
+                      struct brw_reg dst,
+                      struct brw_reg src0,
+                      struct brw_reg src1)
 {
    int binding = inst->sol_binding;
    bool final_write = inst->sol_final_write;
@@ -623,12 +500,12 @@ vec4_generator::generate_gs_svb_write(vec4_instruction *inst,
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_svb_set_destination_index(vec4_instruction *inst,
-                                                      struct brw_reg dst,
-                                                      struct brw_reg src)
+static void
+generate_gs_svb_set_destination_index(struct brw_codegen *p,
+                                      vec4_instruction *inst,
+                                      struct brw_reg dst,
+                                      struct brw_reg src)
 {
-
    int vertex = inst->sol_vertex;
    brw_push_insn_state(p);
    brw_set_default_access_mode(p, BRW_ALIGN_1);
@@ -637,8 +514,10 @@ vec4_generator::generate_gs_svb_set_destination_index(vec4_instruction *inst,
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_set_dword_2(struct brw_reg dst, struct brw_reg src)
+static void
+generate_gs_set_dword_2(struct brw_codegen *p,
+                        struct brw_reg dst,
+                        struct brw_reg src)
 {
    brw_push_insn_state(p);
    brw_set_default_access_mode(p, BRW_ALIGN_1);
@@ -647,8 +526,9 @@ vec4_generator::generate_gs_set_dword_2(struct brw_reg dst, struct brw_reg src)
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst)
+static void
+generate_gs_prepare_channel_masks(struct brw_codegen *p,
+                                  struct brw_reg dst)
 {
    /* We want to left shift just DWORD 4 (the x component belonging to the
     * second geometry shader invocation) by 4 bits.  So generate the
@@ -664,9 +544,10 @@ vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst)
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst,
-                                              struct brw_reg src)
+static void
+generate_gs_set_channel_masks(struct brw_codegen *p,
+                              struct brw_reg dst,
+                              struct brw_reg src)
 {
    /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
     * Header: M0.5):
@@ -727,8 +608,9 @@ vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst,
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_get_instance_id(struct brw_reg dst)
+static void
+generate_gs_get_instance_id(struct brw_codegen *p,
+                            struct brw_reg dst)
 {
    /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
     * and store into dst.0 & dst.4. So generate the instruction:
@@ -744,11 +626,12 @@ vec4_generator::generate_gs_get_instance_id(struct brw_reg dst)
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_ff_sync_set_primitives(struct brw_reg dst,
-                                                   struct brw_reg src0,
-                                                   struct brw_reg src1,
-                                                   struct brw_reg src2)
+static void
+generate_gs_ff_sync_set_primitives(struct brw_codegen *p,
+                                   struct brw_reg dst,
+                                   struct brw_reg src0,
+                                   struct brw_reg src1,
+                                   struct brw_reg src2)
 {
    brw_push_insn_state(p);
    brw_set_default_access_mode(p, BRW_ALIGN_1);
@@ -765,11 +648,12 @@ vec4_generator::generate_gs_ff_sync_set_primitives(struct brw_reg dst,
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_ff_sync(vec4_instruction *inst,
-                                    struct brw_reg dst,
-                                    struct brw_reg src0,
-                                    struct brw_reg src1)
+static void
+generate_gs_ff_sync(struct brw_codegen *p,
+                    vec4_instruction *inst,
+                    struct brw_reg dst,
+                    struct brw_reg src0,
+                    struct brw_reg src1)
 {
    /* This opcode uses an implied MRF register for:
     *  - the header of the ff_sync message. And as such it is expected to be
@@ -811,8 +695,8 @@ vec4_generator::generate_gs_ff_sync(vec4_instruction *inst,
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_gs_set_primitive_id(struct brw_reg dst)
+static void
+generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
 {
    /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
    struct brw_reg src = brw_vec8_grf(0, 0);
@@ -823,13 +707,14 @@ vec4_generator::generate_gs_set_primitive_id(struct brw_reg dst)
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
-                                                  struct brw_reg index)
+static void
+generate_oword_dual_block_offsets(struct brw_codegen *p,
+                                  struct brw_reg m1,
+                                  struct brw_reg index)
 {
    int second_vertex_offset;
 
-   if (devinfo->gen >= 6)
+   if (p->devinfo->gen >= 6)
       second_vertex_offset = 1;
    else
       second_vertex_offset = 16;
@@ -860,8 +745,9 @@ vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_unpack_flags(struct brw_reg dst)
+static void
+generate_unpack_flags(struct brw_codegen *p,
+                      struct brw_reg dst)
 {
    brw_push_insn_state(p);
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
@@ -878,16 +764,18 @@ vec4_generator::generate_unpack_flags(struct brw_reg dst)
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_scratch_read(vec4_instruction *inst,
-                                      struct brw_reg dst,
-                                      struct brw_reg index)
+static void
+generate_scratch_read(struct brw_codegen *p,
+                      vec4_instruction *inst,
+                      struct brw_reg dst,
+                      struct brw_reg index)
 {
+   const struct brw_device_info *devinfo = p->devinfo;
    struct brw_reg header = brw_vec8_grf(0, 0);
 
    gen6_resolve_implied_move(p, &header, inst->base_mrf);
 
-   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
+   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
 				     index);
 
    uint32_t msg_type;
@@ -906,7 +794,7 @@ vec4_generator::generate_scratch_read(vec4_instruction *inst,
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
    if (devinfo->gen < 6)
-      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
+      brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
    brw_set_dp_read_message(p, send,
 			   255, /* binding table index: stateless access */
 			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
@@ -917,12 +805,14 @@ vec4_generator::generate_scratch_read(vec4_instruction *inst,
 			   1 /* rlen */);
 }
 
-void
-vec4_generator::generate_scratch_write(vec4_instruction *inst,
-                                       struct brw_reg dst,
-                                       struct brw_reg src,
-                                       struct brw_reg index)
+static void
+generate_scratch_write(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg dst,
+                       struct brw_reg src,
+                       struct brw_reg index)
 {
+   const struct brw_device_info *devinfo = p->devinfo;
    struct brw_reg header = brw_vec8_grf(0, 0);
    bool write_commit;
 
@@ -933,7 +823,7 @@ vec4_generator::generate_scratch_write(vec4_instruction *inst,
 
    gen6_resolve_implied_move(p, &header, inst->base_mrf);
 
-   generate_oword_dual_block_offsets(brw_message_reg(inst->base_mrf + 1),
+   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
 				     index);
 
    brw_MOV(p,
@@ -990,12 +880,15 @@ vec4_generator::generate_scratch_write(vec4_instruction *inst,
 			    write_commit);
 }
 
-void
-vec4_generator::generate_pull_constant_load(vec4_instruction *inst,
-                                            struct brw_reg dst,
-                                            struct brw_reg index,
-                                            struct brw_reg offset)
+static void
+generate_pull_constant_load(struct brw_codegen *p,
+                            struct brw_vue_prog_data *prog_data,
+                            vec4_instruction *inst,
+                            struct brw_reg dst,
+                            struct brw_reg index,
+                            struct brw_reg offset)
 {
+   const struct brw_device_info *devinfo = p->devinfo;
    assert(index.file == BRW_IMMEDIATE_VALUE &&
 	  index.type == BRW_REGISTER_TYPE_UD);
    uint32_t surf_index = index.dw1.ud;
@@ -1036,13 +929,15 @@ vec4_generator::generate_pull_constant_load(vec4_instruction *inst,
    brw_mark_surface_used(&prog_data->base, surf_index);
 }
 
-void
-vec4_generator::generate_get_buffer_size(vec4_instruction *inst,
-                                         struct brw_reg dst,
-                                         struct brw_reg src,
-                                         struct brw_reg surf_index)
+static void
+generate_get_buffer_size(struct brw_codegen *p,
+                         struct brw_vue_prog_data *prog_data,
+                         vec4_instruction *inst,
+                         struct brw_reg dst,
+                         struct brw_reg src,
+                         struct brw_reg surf_index)
 {
-   assert(devinfo->gen >= 7);
+   assert(p->devinfo->gen >= 7);
    assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
           surf_index.file == BRW_IMMEDIATE_VALUE);
 
@@ -1062,11 +957,13 @@ vec4_generator::generate_get_buffer_size(vec4_instruction *inst,
    brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
 }
 
-void
-vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
-                                                 struct brw_reg dst,
-                                                 struct brw_reg surf_index,
-                                                 struct brw_reg offset)
+static void
+generate_pull_constant_load_gen7(struct brw_codegen *p,
+                                 struct brw_vue_prog_data *prog_data,
+                                 vec4_instruction *inst,
+                                 struct brw_reg dst,
+                                 struct brw_reg surf_index,
+                                 struct brw_reg offset)
 {
    assert(surf_index.type == BRW_REGISTER_TYPE_UD);
 
@@ -1123,9 +1020,10 @@ vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst,
    }
 }
 
-void
-vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst,
-                                                 struct brw_reg dst)
+static void
+generate_set_simd4x2_header_gen9(struct brw_codegen *p,
+                                 vec4_instruction *inst,
+                                 struct brw_reg dst)
 {
    brw_push_insn_state(p);
    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
@@ -1140,9 +1038,18 @@ vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst,
    brw_pop_insn_state(p);
 }
 
-void
-vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir)
+static void
+generate_code(struct brw_codegen *p,
+              const struct brw_compiler *compiler,
+              void *log_data,
+              const nir_shader *nir,
+              struct brw_vue_prog_data *prog_data,
+              const struct cfg_t *cfg)
 {
+   const struct brw_device_info *devinfo = p->devinfo;
+   const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage);
+   bool debug_flag = INTEL_DEBUG &
+      intel_debug_flag_for_shader_stage(nir->stage);
    struct annotation_info annotation;
    memset(&annotation, 0, sizeof(annotation));
    int loop_count = 0;
@@ -1154,9 +1061,9 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir)
          annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
 
       for (unsigned int i = 0; i < 3; i++) {
-	 src[i] = inst->get_src(this->prog_data, i);
+	 src[i] = inst->src[i].fixed_hw_reg;
       }
-      dst = inst->get_dst(devinfo->gen);
+      dst = inst->dst.fixed_hw_reg;
 
       brw_set_default_predicate_control(p, inst->predicate);
       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
@@ -1383,9 +1290,9 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir)
             gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
                       brw_null_reg());
          } else if (devinfo->gen == 6) {
-            generate_math_gen6(inst, dst, src[0], brw_null_reg());
+            generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
          } else {
-            generate_math1_gen4(inst, dst, src[0]);
+            generate_math1_gen4(p, inst, dst, src[0]);
          }
          break;
 
@@ -1396,9 +1303,9 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir)
          if (devinfo->gen >= 7) {
             gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
          } else if (devinfo->gen == 6) {
-            generate_math_gen6(inst, dst, src[0], src[1]);
+            generate_math_gen6(p, inst, dst, src[0], src[1]);
          } else {
-            generate_math2_gen4(inst, dst, src[0], src[1]);
+            generate_math2_gen4(p, inst, dst, src[0], src[1]);
          }
          break;
 
@@ -1412,92 +1319,92 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir)
       case SHADER_OPCODE_TG4:
       case SHADER_OPCODE_TG4_OFFSET:
       case SHADER_OPCODE_SAMPLEINFO:
-         generate_tex(inst, dst, src[0], src[1]);
+         generate_tex(p, prog_data, inst, dst, src[0], src[1]);
          break;
 
       case VS_OPCODE_URB_WRITE:
-         generate_vs_urb_write(inst);
+         generate_vs_urb_write(p, inst);
          break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
-         generate_scratch_read(inst, dst, src[0]);
+         generate_scratch_read(p, inst, dst, src[0]);
          break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
-         generate_scratch_write(inst, dst, src[0], src[1]);
+         generate_scratch_write(p, inst, dst, src[0], src[1]);
          break;
 
       case VS_OPCODE_PULL_CONSTANT_LOAD:
-         generate_pull_constant_load(inst, dst, src[0], src[1]);
+         generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
          break;
 
       case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
-         generate_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+         generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
          break;
 
       case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
-         generate_set_simd4x2_header_gen9(inst, dst);
+         generate_set_simd4x2_header_gen9(p, inst, dst);
          break;
 
 
       case VS_OPCODE_GET_BUFFER_SIZE:
-         generate_get_buffer_size(inst, dst, src[0], src[1]);
+         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
          break;
 
       case GS_OPCODE_URB_WRITE:
-         generate_gs_urb_write(inst);
+         generate_gs_urb_write(p, inst);
          break;
 
       case GS_OPCODE_URB_WRITE_ALLOCATE:
-         generate_gs_urb_write_allocate(inst);
+         generate_gs_urb_write_allocate(p, inst);
          break;
 
       case GS_OPCODE_SVB_WRITE:
-         generate_gs_svb_write(inst, dst, src[0], src[1]);
+         generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
          break;
 
       case GS_OPCODE_SVB_SET_DST_INDEX:
-         generate_gs_svb_set_destination_index(inst, dst, src[0]);
+         generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
          break;
 
       case GS_OPCODE_THREAD_END:
-         generate_gs_thread_end(inst);
+         generate_gs_thread_end(p, inst);
          break;
 
       case GS_OPCODE_SET_WRITE_OFFSET:
-         generate_gs_set_write_offset(dst, src[0], src[1]);
+         generate_gs_set_write_offset(p, dst, src[0], src[1]);
          break;
 
       case GS_OPCODE_SET_VERTEX_COUNT:
-         generate_gs_set_vertex_count(dst, src[0]);
+         generate_gs_set_vertex_count(p, dst, src[0]);
          break;
 
       case GS_OPCODE_FF_SYNC:
-         generate_gs_ff_sync(inst, dst, src[0], src[1]);
+         generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
          break;
 
       case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
-         generate_gs_ff_sync_set_primitives(dst, src[0], src[1], src[2]);
+         generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
          break;
 
       case GS_OPCODE_SET_PRIMITIVE_ID:
-         generate_gs_set_primitive_id(dst);
+         generate_gs_set_primitive_id(p, dst);
          break;
 
       case GS_OPCODE_SET_DWORD_2:
-         generate_gs_set_dword_2(dst, src[0]);
+         generate_gs_set_dword_2(p, dst, src[0]);
          break;
 
       case GS_OPCODE_PREPARE_CHANNEL_MASKS:
-         generate_gs_prepare_channel_masks(dst);
+         generate_gs_prepare_channel_masks(p, dst);
          break;
 
       case GS_OPCODE_SET_CHANNEL_MASKS:
-         generate_gs_set_channel_masks(dst, src[0]);
+         generate_gs_set_channel_masks(p, dst, src[0]);
          break;
 
       case GS_OPCODE_GET_INSTANCE_ID:
-         generate_gs_get_instance_id(dst);
+         generate_gs_get_instance_id(p, dst);
          break;
 
       case SHADER_OPCODE_SHADER_TIME_ADD:
@@ -1556,7 +1463,7 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir)
          break;
 
       case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
-         generate_unpack_flags(dst);
+         generate_unpack_flags(p, dst);
          break;
 
       case VEC4_OPCODE_MOV_BYTES: {
@@ -1651,10 +1558,10 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir)
               nir->info.label ? nir->info.label : "unnamed",
               _mesa_shader_stage_to_string(nir->stage), nir->info.name);
 
-      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. Compacted %d to %d"
-                      " bytes (%.0f%%)\n",
+      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles."
+                      "Compacted %d to %d bytes (%.0f%%)\n",
               stage_abbrev,
-              before_size / 16, loop_count, before_size, after_size,
+              before_size / 16, loop_count, cfg->cycle_count, before_size, after_size,
               100.0f * (before_size - after_size) / before_size);
 
       dump_assembly(p->store, annotation.ann_count, annotation.ann,
@@ -1663,21 +1570,27 @@ vec4_generator::generate_code(const cfg_t *cfg, const nir_shader *nir)
    }
 
    compiler->shader_debug_log(log_data,
-                              "%s vec4 shader: %d inst, %d loops, "
+                              "%s vec4 shader: %d inst, %d loops, %u cycles, "
                               "compacted %d to %d bytes.\n",
-                              stage_abbrev, before_size / 16, loop_count,
+                              stage_abbrev, before_size / 16,
+                              loop_count, cfg->cycle_count,
                               before_size, after_size);
 }
 
-const unsigned *
-vec4_generator::generate_assembly(const cfg_t *cfg,
-                                  unsigned *assembly_size,
-                                  const nir_shader *nir)
+extern "C" const unsigned *
+brw_vec4_generate_assembly(const struct brw_compiler *compiler,
+                           void *log_data,
+                           void *mem_ctx,
+                           const nir_shader *nir,
+                           struct brw_vue_prog_data *prog_data,
+                           const struct cfg_t *cfg,
+                           unsigned *out_assembly_size)
 {
+   struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
+   brw_init_codegen(compiler->devinfo, p, mem_ctx);
    brw_set_default_access_mode(p, BRW_ALIGN_16);
-   generate_code(cfg, nir);
 
-   return brw_get_program(p, assembly_size);
-}
+   generate_code(p, compiler, log_data, nir, prog_data, cfg);
 
-} /* namespace brw */
+   return brw_get_program(p, out_assembly_size);
+}
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 9402489e628..cfb5cd95cb1 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -768,7 +768,7 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
       output_size_bytes += 32;
 
    assert(output_size_bytes >= 1);
-   int max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
+   unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
    if (compiler->devinfo->gen == 6)
       max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
    if (output_size_bytes > max_output_size_bytes)
@@ -824,9 +824,9 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
          vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader,
                            mem_ctx, true /* no_spills */, shader_time_index);
          if (v.run()) {
-            vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx,
-                             INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
-            return g.generate_assembly(v.cfg, final_assembly_size, shader);
+            return brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
+                                              shader, &prog_data->base, v.cfg,
+                                              final_assembly_size);
          }
       }
    }
@@ -875,9 +875,9 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
       if (error_str)
          *error_str = ralloc_strdup(mem_ctx, gs->fail_msg);
    } else {
-      vec4_generator g(compiler, log_data, &prog_data->base, mem_ctx,
-                       INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
-      ret = g.generate_assembly(gs->cfg, final_assembly_size, shader);
+      ret = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, shader,
+                                       &prog_data->base, gs->cfg,
+                                       final_assembly_size);
    }
 
    delete gs;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
index 678237901f2..aa9a6572eee 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp
@@ -86,9 +86,10 @@ vec4_live_variables::setup_def_use()
                }
 	    }
 	 }
-         if (inst->reads_flag()) {
-            if (!BITSET_TEST(bd->flag_def, 0)) {
-               BITSET_SET(bd->flag_use, 0);
+         for (unsigned c = 0; c < 4; c++) {
+            if (inst->reads_flag(c) &&
+                !BITSET_TEST(bd->flag_def, c)) {
+               BITSET_SET(bd->flag_use, c);
             }
          }
 
@@ -110,8 +111,11 @@ vec4_live_variables::setup_def_use()
             }
          }
          if (inst->writes_flag()) {
-            if (!BITSET_TEST(bd->flag_use, 0)) {
-               BITSET_SET(bd->flag_def, 0);
+            for (unsigned c = 0; c < 4; c++) {
+               if ((inst->dst.writemask & (1 << c)) &&
+                   !BITSET_TEST(bd->flag_use, c)) {
+                  BITSET_SET(bd->flag_def, c);
+               }
             }
          }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index e79a9f3b5b9..1fb1773f856 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -193,7 +193,9 @@ vec4_visitor::nir_emit_if(nir_if *if_stmt)
    vec4_instruction *inst = emit(MOV(dst_null_d(), condition));
    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 
-   emit(IF(BRW_PREDICATE_NORMAL));
+   /* We can just predicate based on the X channel, as the condition only
+    * goes on its own line */
+   emit(IF(BRW_PREDICATE_ALIGN16_REPLICATE_X));
 
    nir_emit_cf_list(&if_stmt->then_list);
 
@@ -806,6 +808,16 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       break;
    }
 
+   case nir_intrinsic_shader_clock: {
+      /* We cannot do anything if there is an event, so ignore it for now */
+      const src_reg shader_clock = get_timestamp();
+      const enum brw_reg_type type = brw_type_for_base_type(glsl_type::uvec2_type);
+
+      dest = get_nir_dest(instr->dest, type);
+      emit(MOV(dest, shader_clock));
+      break;
+   }
+
    default:
       unreachable("Unknown intrinsic");
    }
@@ -1144,26 +1156,10 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_ball_iequal3:
    case nir_op_ball_fequal4:
    case nir_op_ball_iequal4: {
-      dst_reg tmp = dst_reg(this, glsl_type::bool_type);
-
-      switch (instr->op) {
-      case nir_op_ball_fequal2:
-      case nir_op_ball_iequal2:
-         tmp.writemask = WRITEMASK_XY;
-         break;
-      case nir_op_ball_fequal3:
-      case nir_op_ball_iequal3:
-         tmp.writemask = WRITEMASK_XYZ;
-         break;
-      case nir_op_ball_fequal4:
-      case nir_op_ball_iequal4:
-         tmp.writemask = WRITEMASK_XYZW;
-         break;
-      default:
-         unreachable("not reached");
-      }
+      unsigned swiz =
+         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
 
-      emit(CMP(tmp, op[0], op[1],
+      emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
                brw_conditional_for_nir_comparison(instr->op)));
       emit(MOV(dst, src_reg(0)));
       inst = emit(MOV(dst, src_reg(~0)));
@@ -1177,26 +1173,10 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_bany_inequal3:
    case nir_op_bany_fnequal4:
    case nir_op_bany_inequal4: {
-      dst_reg tmp = dst_reg(this, glsl_type::bool_type);
+      unsigned swiz =
+         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
 
-      switch (instr->op) {
-      case nir_op_bany_fnequal2:
-      case nir_op_bany_inequal2:
-         tmp.writemask = WRITEMASK_XY;
-         break;
-      case nir_op_bany_fnequal3:
-      case nir_op_bany_inequal3:
-         tmp.writemask = WRITEMASK_XYZ;
-         break;
-      case nir_op_bany_fnequal4:
-      case nir_op_bany_inequal4:
-         tmp.writemask = WRITEMASK_XYZW;
-         break;
-      default:
-         unreachable("not reached");
-      }
-
-      emit(CMP(tmp, op[0], op[1],
+      emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
                brw_conditional_for_nir_comparison(instr->op)));
 
       emit(MOV(dst, src_reg(0)));
@@ -1321,26 +1301,18 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
 
    case nir_op_ufind_msb:
    case nir_op_ifind_msb: {
-      src_reg temp = src_reg(this, glsl_type::uint_type);
-
-      inst = emit(FBH(dst_reg(temp), op[0]));
-      inst->dst.writemask = WRITEMASK_XYZW;
+      emit(FBH(retype(dst, BRW_REGISTER_TYPE_UD), op[0]));
 
       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
        * subtract the result from 31 to convert the MSB count into an LSB count.
        */
+      src_reg src(dst);
+      emit(CMP(dst_null_d(), src, src_reg(-1), BRW_CONDITIONAL_NZ));
 
-      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
-      temp.swizzle = BRW_SWIZZLE_NOOP;
-      emit(MOV(dst, temp));
-
-      src_reg src_tmp = src_reg(dst);
-      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
-
-      src_tmp.negate = true;
-      inst = emit(ADD(dst, src_tmp, src_reg(31)));
+      inst = emit(ADD(dst, src, src_reg(31)));
       inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->src[0].negate = true;
       break;
    }
 
@@ -1461,11 +1433,11 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_bany2:
    case nir_op_bany3:
    case nir_op_bany4: {
-      dst_reg tmp = dst_reg(this, glsl_type::bool_type);
-      tmp.writemask = brw_writemask_for_size(nir_op_infos[instr->op].input_sizes[0]);
-
-      emit(CMP(tmp, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
+      unsigned swiz =
+         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
 
+      emit(CMP(dst_null_d(), swizzle(op[0], swiz), src_reg(0),
+               BRW_CONDITIONAL_NZ));
       emit(MOV(dst, src_reg(0)));
       inst = emit(MOV(dst, src_reg(~0)));
       inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 6d155285820..92b089d7ff6 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -883,6 +883,18 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
                            uint32_t sampler,
                            src_reg sampler_reg)
 {
+   /* The sampler can only meaningfully compute LOD for fragment shader
+    * messages. For all other stages, we change the opcode to TXL and hardcode
+    * the LOD to 0.
+    *
+    * textureQueryLevels() is implemented in terms of TXS so we need to pass a
+    * valid LOD argument.
+    */
+   if (op == ir_tex || op == ir_query_levels) {
+      assert(lod.file == BAD_FILE);
+      lod = src_reg(0.0f);
+   }
+
    enum opcode opcode;
    switch (op) {
    case ir_tex: opcode = SHADER_OPCODE_TXL; break;
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 5db4b3a86af..0b805b1c0c4 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -311,7 +311,8 @@ brw_vs_populate_key(struct brw_context *brw,
    key->program_string_id = vp->id;
 
    if (ctx->Transform.ClipPlanesEnabled != 0 &&
-       ctx->API == API_OPENGL_COMPAT &&
+       (ctx->API == API_OPENGL_COMPAT ||
+        ctx->API == API_OPENGLES) &&
        vp->program.Base.ClipDistanceArraySize == 0) {
       key->nr_userclip_plane_consts =
          _mesa_logbase2(ctx->Transform.ClipPlanesEnabled) + 1;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index f65258a52a5..d7473845c72 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -177,8 +177,8 @@ brw_upload_vs_abo_surfaces(struct brw_context *brw)
 
    if (prog) {
       /* BRW_NEW_VS_PROG_DATA */
-      brw_upload_abo_surfaces(brw, prog, &brw->vs.base,
-                              &brw->vs.prog_data->base.base);
+      brw_upload_abo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_VERTEX],
+                              &brw->vs.base, &brw->vs.prog_data->base.base);
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 6ebe6481c32..f88f8d59196 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1029,7 +1029,7 @@ const struct brw_tracked_state brw_cs_ubo_surfaces = {
 
 void
 brw_upload_abo_surfaces(struct brw_context *brw,
-			struct gl_shader_program *prog,
+                        struct gl_shader *shader,
                         struct brw_stage_state *stage_state,
                         struct brw_stage_prog_data *prog_data)
 {
@@ -1037,21 +1037,22 @@ brw_upload_abo_surfaces(struct brw_context *brw,
    uint32_t *surf_offsets =
       &stage_state->surf_offset[prog_data->binding_table.abo_start];
 
-   for (unsigned i = 0; i < prog->NumAtomicBuffers; i++) {
-      struct gl_atomic_buffer_binding *binding =
-         &ctx->AtomicBufferBindings[prog->AtomicBuffers[i].Binding];
-      struct intel_buffer_object *intel_bo =
-         intel_buffer_object(binding->BufferObject);
-      drm_intel_bo *bo = intel_bufferobj_buffer(
-         brw, intel_bo, binding->Offset, intel_bo->Base.Size - binding->Offset);
-
-      brw->vtbl.emit_buffer_surface_state(brw, &surf_offsets[i], bo,
-                                          binding->Offset, BRW_SURFACEFORMAT_RAW,
-                                          bo->size - binding->Offset, 1, true);
-   }
+   if (shader && shader->NumAtomicBuffers) {
+      for (unsigned i = 0; i < shader->NumAtomicBuffers; i++) {
+         struct gl_atomic_buffer_binding *binding =
+            &ctx->AtomicBufferBindings[shader->AtomicBuffers[i]->Binding];
+         struct intel_buffer_object *intel_bo =
+            intel_buffer_object(binding->BufferObject);
+         drm_intel_bo *bo = intel_bufferobj_buffer(
+            brw, intel_bo, binding->Offset, intel_bo->Base.Size - binding->Offset);
+
+         brw->vtbl.emit_buffer_surface_state(brw, &surf_offsets[i], bo,
+                                             binding->Offset, BRW_SURFACEFORMAT_RAW,
+                                             bo->size - binding->Offset, 1, true);
+      }
 
-   if (prog->NumAtomicBuffers)
       brw->ctx.NewDriverState |= BRW_NEW_SURFACES;
+   }
 }
 
 static void
@@ -1063,8 +1064,8 @@ brw_upload_wm_abo_surfaces(struct brw_context *brw)
 
    if (prog) {
       /* BRW_NEW_FS_PROG_DATA */
-      brw_upload_abo_surfaces(brw, prog, &brw->wm.base,
-                              &brw->wm.prog_data->base);
+      brw_upload_abo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_FRAGMENT],
+                              &brw->wm.base, &brw->wm.prog_data->base);
    }
 }
 
@@ -1088,8 +1089,8 @@ brw_upload_cs_abo_surfaces(struct brw_context *brw)
 
    if (prog) {
       /* BRW_NEW_CS_PROG_DATA */
-      brw_upload_abo_surfaces(brw, prog, &brw->cs.base,
-                              &brw->cs.prog_data->base);
+      brw_upload_abo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_COMPUTE],
+                              &brw->cs.base, &brw->cs.prog_data->base);
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 4068f2844a2..2634e6ba6fd 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -60,6 +60,23 @@ get_attr_override(const struct brw_vue_map *vue_map, int urb_entry_read_offset,
    /* Find the VUE slot for this attribute. */
    int slot = vue_map->varying_to_slot[fs_attr];
 
+   /* Viewport and Layer are stored in the VUE header.  We need to override
+    * them to zero if earlier stages didn't write them, as GL requires that
+    * they read back as zero when not explicitly set.
+    */
+   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
+      unsigned override =
+         ATTRIBUTE_0_OVERRIDE_X | ATTRIBUTE_0_OVERRIDE_W |
+         ATTRIBUTE_CONST_0000 << ATTRIBUTE_0_CONST_SOURCE_SHIFT;
+
+      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
+         override |= ATTRIBUTE_0_OVERRIDE_Y;
+      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
+         override |= ATTRIBUTE_0_OVERRIDE_Z;
+
+      return override;
+   }
+
    /* If there was only a back color written but not front, use back
     * as the color instead of undefined
     */
@@ -159,14 +176,30 @@ calculate_attr_overrides(const struct brw_context *brw,
                          uint16_t *attr_overrides,
                          uint32_t *point_sprite_enables,
                          uint32_t *flat_enables,
-                         uint32_t *urb_entry_read_length)
+                         uint32_t *urb_entry_read_length,
+                         uint32_t *urb_entry_read_offset)
 {
-   const int urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
    uint32_t max_source_attr = 0;
 
    *point_sprite_enables = 0;
    *flat_enables = 0;
 
+   *urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM
+    *
+    * If the fragment shader reads VARYING_SLOT_LAYER, then we need to pass in
+    * the full vertex header.  Otherwise, we can program the SF to start
+    * reading at an offset of 1 (2 varying slots) to skip unnecessary data:
+    * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
+    * - VARYING_SLOT_{PSIZ,LAYER} and VARYING_SLOT_POS on gen6+
+    */
+
+   bool fs_needs_vue_header = brw->fragment_program->Base.InputsRead &
+      (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+
+   *urb_entry_read_offset = fs_needs_vue_header ? 0 : 1;
+
    /* _NEW_LIGHT */
    bool shade_model_flat = brw->ctx.Light.ShadeModel == GL_FLAT;
 
@@ -228,7 +261,7 @@ calculate_attr_overrides(const struct brw_context *brw,
       /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
       uint16_t attr_override = point_sprite ? 0 :
          get_attr_override(&brw->vue_map_geom_out,
-			   urb_entry_read_offset, attr,
+			   *urb_entry_read_offset, attr,
                            brw->ctx.VertexProgram._TwoSideEnabled,
                            &max_source_attr);
 
@@ -276,7 +309,6 @@ upload_sf_state(struct brw_context *brw)
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
-   const int urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
    float point_size;
    uint16_t attr_overrides[16];
    uint32_t point_sprite_origin;
@@ -411,8 +443,10 @@ upload_sf_state(struct brw_context *brw)
     * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
     */
    uint32_t urb_entry_read_length;
+   uint32_t urb_entry_read_offset;
    calculate_attr_overrides(brw, attr_overrides, &point_sprite_enables,
-                            &flat_enables, &urb_entry_read_length);
+                            &flat_enables, &urb_entry_read_length,
+                            &urb_entry_read_offset);
    dw1 |= (urb_entry_read_length << GEN6_SF_URB_ENTRY_READ_LENGTH_SHIFT |
            urb_entry_read_offset << GEN6_SF_URB_ENTRY_READ_OFFSET_SHIFT);
 
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index 6aeb0cb243f..2d7c04f4ad2 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -285,3 +285,34 @@ const struct brw_tracked_state gen7_cs_push_constants = {
    },
    .emit = gen7_upload_cs_push_constants,
 };
+
+/**
+ * Creates a new CS constant buffer reflecting the current CS program's
+ * constants, if needed by the CS program.
+ */
+static void
+brw_upload_cs_pull_constants(struct brw_context *brw)
+{
+   struct brw_stage_state *stage_state = &brw->cs.base;
+
+   /* BRW_NEW_COMPUTE_PROGRAM */
+   struct brw_compute_program *cp =
+      (struct brw_compute_program *) brw->compute_program;
+
+   /* BRW_NEW_CS_PROG_DATA */
+   const struct brw_stage_prog_data *prog_data = &brw->cs.prog_data->base;
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program.Base,
+                             stage_state, prog_data, true);
+}
+
+const struct brw_tracked_state brw_cs_pull_constants = {
+   .dirty = {
+      .mesa = _NEW_PROGRAM_CONSTANTS,
+      .brw = BRW_NEW_BATCH |
+             BRW_NEW_COMPUTE_PROGRAM |
+             BRW_NEW_CS_PROG_DATA,
+   },
+   .emit = brw_upload_cs_pull_constants,
+};
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index 698b3d491bc..b1f13aceba4 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -40,7 +40,6 @@ upload_sbe_state(struct brw_context *brw)
    uint32_t point_sprite_enables;
    uint32_t flat_enables;
    int i;
-   const int urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
    uint16_t attr_overrides[16];
    /* _NEW_BUFFERS */
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
@@ -65,8 +64,10 @@ upload_sbe_state(struct brw_context *brw)
     * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
     */
    uint32_t urb_entry_read_length;
+   uint32_t urb_entry_read_offset;
    calculate_attr_overrides(brw, attr_overrides, &point_sprite_enables,
-                            &flat_enables, &urb_entry_read_length);
+                            &flat_enables, &urb_entry_read_length,
+                            &urb_entry_read_offset);
    dw1 |= urb_entry_read_length << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
           urb_entry_read_offset << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT;
 
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 8f0507413a7..10e433b1d59 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -95,6 +95,11 @@ gen8_upload_ps_extra(struct brw_context *brw,
        !brw_color_buffer_write_enabled(brw))
       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 
+   if (prog_data->computed_stencil) {
+      assert(brw->gen >= 9);
+      dw1 |= GEN9_PSX_SHADER_COMPUTES_STENCIL;
+   }
+
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_PS_EXTRA << 16 | (2 - 2));
    OUT_BATCH(dw1);
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
index 6b655ee493e..8b6f31f3be6 100644
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
@@ -37,6 +37,7 @@ upload_sbe(struct brw_context *brw)
    uint32_t num_outputs = brw->wm.prog_data->num_varying_inputs;
    uint16_t attr_overrides[VARYING_SLOT_MAX];
    uint32_t urb_entry_read_length;
+   uint32_t urb_entry_read_offset;
    uint32_t point_sprite_enables;
    uint32_t flat_enables;
    int sbe_cmd_length;
@@ -66,7 +67,8 @@ upload_sbe(struct brw_context *brw)
    calculate_attr_overrides(brw, attr_overrides,
                             &point_sprite_enables,
                             &flat_enables,
-                            &urb_entry_read_length);
+                            &urb_entry_read_length,
+                            &urb_entry_read_offset);
 
    /* Typically, the URB entry read length and offset should be programmed in
     * 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active stage
@@ -78,7 +80,7 @@ upload_sbe(struct brw_context *brw)
     */
    dw1 |=
       urb_entry_read_length << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
-      BRW_SF_URB_ENTRY_READ_OFFSET << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT |
+      urb_entry_read_offset << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT |
       GEN8_SBE_FORCE_URB_ENTRY_READ_LENGTH |
       GEN8_SBE_FORCE_URB_ENTRY_READ_OFFSET;
 
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index 18b86652fd2..140a6544983 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -183,6 +183,14 @@ gen8_emit_buffer_surface_state(struct brw_context *brw,
 }
 
 static void
+gen8_emit_fast_clear_color(struct brw_context *brw,
+                           struct intel_mipmap_tree *mt,
+                           uint32_t *surf)
+{
+   surf[7] |= mt->fast_clear_color_value;
+}
+
+static void
 gen8_emit_texture_surface_state(struct brw_context *brw,
                                 struct intel_mipmap_tree *mt,
                                 GLenum target,
@@ -284,11 +292,10 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
                 SET_FIELD((aux_mt->pitch / tile_w) - 1,
                           GEN8_SURFACE_AUX_PITCH) |
                 aux_mode;
-   } else {
-      surf[6] = 0;
    }
 
-   surf[7] = mt->fast_clear_color_value |
+   gen8_emit_fast_clear_color(brw, mt, surf);
+   surf[7] |=
       SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 0)), GEN7_SURFACE_SCS_R) |
       SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 1)), GEN7_SURFACE_SCS_G) |
       SET_FIELD(swizzle_to_scs(GET_SWZ(swizzle, 2)), GEN7_SURFACE_SCS_B) |
@@ -302,11 +309,7 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
                               aux_mt->bo, 0,
                               I915_GEM_DOMAIN_SAMPLER,
                               (rw ? I915_GEM_DOMAIN_SAMPLER : 0));
-   } else {
-      surf[10] = 0;
-      surf[11] = 0;
    }
-   surf[12] = 0;
 
    /* Emit relocation to surface contents */
    drm_intel_bo_emit_reloc(brw->batch.bo,
@@ -514,15 +517,13 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
                 SET_FIELD((aux_mt->pitch / tile_w) - 1,
                           GEN8_SURFACE_AUX_PITCH) |
                 aux_mode;
-   } else {
-      surf[6] = 0;
    }
 
-   surf[7] = mt->fast_clear_color_value |
-             SET_FIELD(HSW_SCS_RED,   GEN7_SURFACE_SCS_R) |
-             SET_FIELD(HSW_SCS_GREEN, GEN7_SURFACE_SCS_G) |
-             SET_FIELD(HSW_SCS_BLUE,  GEN7_SURFACE_SCS_B) |
-             SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A);
+   gen8_emit_fast_clear_color(brw, mt, surf);
+   surf[7] |= SET_FIELD(HSW_SCS_RED,   GEN7_SURFACE_SCS_R) |
+              SET_FIELD(HSW_SCS_GREEN, GEN7_SURFACE_SCS_G) |
+              SET_FIELD(HSW_SCS_BLUE,  GEN7_SURFACE_SCS_B) |
+              SET_FIELD(HSW_SCS_ALPHA, GEN7_SURFACE_SCS_A);
 
    assert(mt->offset % mt->cpp == 0);
    *((uint64_t *) &surf[8]) = mt->bo->offset64 + mt->offset; /* reloc */
@@ -533,11 +534,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
                               offset + 10 * 4,
                               aux_mt->bo, 0,
                               I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER);
-   } else {
-      surf[10] = 0;
-      surf[11] = 0;
    }
-   surf[12] = 0;
 
    drm_intel_bo_emit_reloc(brw->batch.bo,
                            offset + 8 * 4,
diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
index f7c02c8a38d..c00d2e786f3 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.c
+++ b/src/mesa/drivers/dri/i965/intel_debug.c
@@ -73,6 +73,8 @@ static const struct debug_control debug_control[] = {
    { "spill_fs",    DEBUG_SPILL_FS },
    { "spill_vec4",  DEBUG_SPILL_VEC4 },
    { "cs",          DEBUG_CS },
+   { "hex",         DEBUG_HEX },
+   { "nocompact",   DEBUG_NO_COMPACTION },
    { NULL,    0 }
 };
 
diff --git a/src/mesa/drivers/dri/i965/intel_debug.h b/src/mesa/drivers/dri/i965/intel_debug.h
index 0a6e1b90b98..98bd7e93956 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.h
+++ b/src/mesa/drivers/dri/i965/intel_debug.h
@@ -67,6 +67,8 @@ extern uint64_t INTEL_DEBUG;
 #define DEBUG_SPILL_FS            (1ull << 31)
 #define DEBUG_SPILL_VEC4          (1ull << 32)
 #define DEBUG_CS                  (1ull << 33)
+#define DEBUG_HEX                 (1ull << 34)
+#define DEBUG_NO_COMPACTION       (1ull << 35)
 
 #ifdef HAVE_ANDROID_PLATFORM
 #define LOG_TAG "INTEL-MESA"
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 3f9afd16c71..4643ea3e87b 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -287,6 +287,7 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.ARB_conditional_render_inverted = true;
       ctx->Extensions.ARB_draw_buffers_blend = true;
       ctx->Extensions.ARB_ES3_compatibility = true;
+      ctx->Extensions.ARB_fragment_layer_viewport = true;
       ctx->Extensions.ARB_sample_shading = true;
       ctx->Extensions.ARB_shading_language_420pack = true;
       ctx->Extensions.ARB_shading_language_packing = true;
@@ -324,6 +325,7 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.ARB_framebuffer_no_attachments = true;
       ctx->Extensions.ARB_gpu_shader5 = true;
       ctx->Extensions.ARB_shader_atomic_counters = true;
+      ctx->Extensions.ARB_shader_clock = true;
       ctx->Extensions.ARB_shader_image_load_store = true;
       ctx->Extensions.ARB_shader_image_size = true;
       ctx->Extensions.ARB_shader_texture_image_samples = true;
@@ -358,6 +360,7 @@ intelInitExtensions(struct gl_context *ctx)
    if (brw->gen >= 9) {
       ctx->Extensions.KHR_texture_compression_astc_ldr = true;
       ctx->Extensions.KHR_texture_compression_astc_hdr = true;
+      ctx->Extensions.ARB_shader_stencil_export = true;
    }
 
    if (ctx->API == API_OPENGL_CORE)
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 5a6b0dd1ec5..3a4a53a07e6 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -343,19 +343,15 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
    if (image->planar_format && image->planar_format->nplanes > 1) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
             "glEGLImageTargetRenderbufferStorage(planar buffers are not "
-               "supported as render targets.");
+               "supported as render targets.)");
       return;
    }
 
    /* __DRIimage is opaque to the core so it has to be checked here */
-   switch (image->format) {
-   case MESA_FORMAT_R8G8B8A8_UNORM:
+   if (!brw->format_supported_as_render_target[image->format]) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-            "glEGLImageTargetRenderbufferStorage(unsupported image format");
+            "glEGLImageTargetRenderbufferStorage(unsupported image format)");
       return;
-      break;
-   default:
-      break;
    }
 
    irb = intel_renderbuffer(rb);
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 590c45d93ea..fb95fb629ad 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1357,7 +1357,16 @@ set_max_gl_versions(struct intel_screen *screen)
    }
 }
 
-static int
+/**
+ * Return the revision (generally the revid field of the PCI header) of the
+ * graphics device.
+ *
+ * XXX: This function is useful to keep around even if it is not currently in
+ * use. It is necessary for new platforms and revision specific workarounds or
+ * features. Please don't remove it so that we know it at least continues to
+ * build.
+ */
+static __attribute__((__unused__)) int
 brw_get_revision(int fd)
 {
    struct drm_i915_getparam gp;
@@ -1416,8 +1425,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
        return false;
 
    intelScreen->deviceID = drm_intel_bufmgr_gem_get_devid(intelScreen->bufmgr);
-   intelScreen->devinfo = brw_get_device_info(intelScreen->deviceID,
-                                              brw_get_revision(psp->fd));
+   intelScreen->devinfo = brw_get_device_info(intelScreen->deviceID);
    if (!intelScreen->devinfo)
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
index 5f80f90a91d..62d39f70ec4 100644
--- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
@@ -84,7 +84,7 @@ instruction(bblock_t *block, int num)
 static bool
 cmod_propagation(fs_visitor *v)
 {
-   const bool print = false;
+   const bool print = getenv("TEST_DEBUG");
 
    if (print) {
       fprintf(stderr, "= Before =\n");
diff --git a/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
new file mode 100644
index 00000000000..9aa2fcc7907
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/test_vec4_cmod_propagation.cpp
@@ -0,0 +1,822 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Based on test_fs_cmod_propagation.cpp
+ */
+
+#include <gtest/gtest.h>
+#include "brw_vec4.h"
+#include "brw_vec4_builder.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+
+class cmod_propagation_test : public ::testing::Test {
+   virtual void SetUp();
+
+public:
+   struct brw_compiler *compiler;
+   struct brw_device_info *devinfo;
+   struct gl_context *ctx;
+   struct gl_shader_program *shader_prog;
+   struct brw_vertex_program *vp;
+   vec4_visitor *v;
+};
+
+class cmod_propagation_vec4_visitor : public vec4_visitor
+{
+public:
+   cmod_propagation_vec4_visitor(struct brw_compiler *compiler,
+                                 nir_shader *shader)
+      : vec4_visitor(compiler, NULL, NULL, NULL, shader, NULL,
+                     false, -1) {}
+
+protected:
+   /* Dummy implementation for pure virtual methods */
+   virtual dst_reg *make_reg_for_system_value(int location,
+                                              const glsl_type *type)
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void setup_payload()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_prolog()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_program_code()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_thread_end()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_urb_write_header(int mrf)
+   {
+      unreachable("Not reached");
+   }
+
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete)
+   {
+      unreachable("Not reached");
+   }
+};
+
+
+void cmod_propagation_test::SetUp()
+{
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
+
+   vp = ralloc(NULL, struct brw_vertex_program);
+
+   nir_shader *shader = nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL);
+
+   v = new cmod_propagation_vec4_visitor(compiler, shader);
+
+   _mesa_init_gl_program(&vp->program.Base, GL_VERTEX_SHADER, 0);
+
+   devinfo->gen = 4;
+}
+
+static vec4_instruction *
+instruction(bblock_t *block, int num)
+{
+   vec4_instruction *inst = (vec4_instruction *)block->start();
+   for (int i = 0; i < num; i++) {
+      inst = (vec4_instruction *)inst->next;
+   }
+   return inst;
+}
+
+static bool
+cmod_propagation(vec4_visitor *v)
+{
+   const bool print = getenv("TEST_DEBUG");
+
+   if (print) {
+      fprintf(stderr, "= Before =\n");
+      v->dump_instructions();
+   }
+
+   bool ret = v->opt_cmod_propagation();
+
+   if (print) {
+      fprintf(stderr, "\n= After =\n");
+      v->dump_instructions();
+   }
+
+   return ret;
+}
+
+TEST_F(cmod_propagation_test, basic)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg zero(0.0f);
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   bld.ADD(dest, src0, src1);
+   bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add        dest.x  src0.xxxx  src1.xxxx
+    * 1: cmp.ge.f0  null.x  dest.xxxx  0.0f
+    *
+    * = After =
+    * 0: add.ge.f0  dest.x  src0.xxxx  src1.xxxx
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, basic_different_dst_writemask)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg zero(0.0f);
+   dst_reg dest_null = bld.null_reg_f();
+
+   bld.ADD(dest, src0, src1);
+   bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add        dest.x     src0  src1
+    * 1: cmp.ge.f0  null.xyzw  dest  0.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andz_one)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::int_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg zero(0.0f);
+   src_reg one(1);
+
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_Z,
+               bld.AND(bld.null_reg_d(), src_reg(dest), one));
+
+   /* = Before =
+    * 0: cmp.l.f0     dest:F  src0:F  0F
+    * 1: and.z.f0     null:D  dest:D  1D
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_EQ, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, non_cmod_instruction)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::uint_type);
+   src_reg src0 = src_reg(v, glsl_type::uint_type);
+   src_reg zero(0u);
+   bld.FBL(dest, src0);
+   bld.CMP(bld.null_reg_ud(), src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: fbl        dest  src0
+    * 1: cmp.ge.f0  null  dest  0u
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_FBL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_write)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg src2 = src_reg(v, glsl_type::float_type);
+   src_reg zero(0.0f);
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE);
+   bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add        dest  src0  src1
+    * 1: cmp.ge.f0  null  src2  0.0f
+    * 2: cmp.ge.f0  null  dest  0.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest0 = dst_reg(v, glsl_type::float_type);
+   dst_reg dest1 = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg src2 = src_reg(v, glsl_type::float_type);
+   src_reg zero(0.0f);
+   bld.ADD(dest0, src0, src1);
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+   bld.CMP(bld.null_reg_f(), src_reg(dest0), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add        dest0 src0  src1
+    * 1: (+f0) sel  dest1 src2  0.0f
+    * 2: cmp.ge.f0  null  dest0 0.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_dest_write)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg src2 = src_reg(v, glsl_type::vec2_type);
+   src_reg zero(0.0f);
+   bld.ADD(offset(dest, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dest, src2)
+      ->regs_written = 4;
+   bld.CMP(bld.null_reg_f(), offset(src_reg(dest), 2), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add        dest+2  src0    src1
+    * 1: tex rlen 4 dest+0  src2
+    * 2: cmp.ge.f0  null    dest+2  0.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest0 = dst_reg(v, glsl_type::float_type);
+   dst_reg dest1 = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg src2 = src_reg(v, glsl_type::float_type);
+   src_reg zero(0.0f);
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1));
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+   bld.CMP(dest_null, src_reg(dest0), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add.ge.f0  dest0   src0  src1
+    * 1: (+f0) sel  dest1   src2  0.0f
+    * 2: cmp.ge.f0  null.x  dest0 0.0f
+    *
+    * = After =
+    * 0: add.ge.f0  dest0 src0  src1
+    * 1: (+f0) sel  dest1 src2  0.0f
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+}
+
+TEST_F(cmod_propagation_test, negate)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg zero(0.0f);
+   bld.ADD(dest, src0, src1);
+   src_reg tmp_src = src_reg(dest);
+   tmp_src.negate = true;
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+   bld.CMP(dest_null, tmp_src, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add        dest     src0  src1
+    * 1: cmp.ge.f0  null.x  -dest 0.0f
+    *
+    * = After =
+    * 0: add.le.f0  dest     src0  src1
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, movnz)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   bld.CMP(dest, src0, src1, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.MOV(dest_null, src_reg(dest)));
+
+   /* = Before =
+    *
+    * 0: cmp.l.f0  dest:F  src0:F  src1:F
+    * 1: mov.nz.f0 null.x  dest:F
+    *
+    * = After =
+    * 0: cmp.l.f0  dest  src0:F  src1:F
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::int_type);
+   src_reg src0 = src_reg(v, glsl_type::int_type);
+   src_reg src1 = src_reg(v, glsl_type::int_type);
+   src_reg zero(0.0f);
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), retype(src_reg(dest), BRW_REGISTER_TYPE_F), zero,
+           BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add        dest:D  src0:D  src1:D
+    * 1: cmp.ge.f0  null:F  dest:F  0.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andnz_non_one)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::int_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg zero(0.0f);
+   src_reg nonone(38);
+
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.AND(bld.null_reg_d(), src_reg(dest), nonone));
+
+   /* = Before =
+    * 0: cmp.l.f0     dest:F  src0:F  0F
+    * 1: and.nz.f0    null:D  dest:D  38D
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
+
+/* Note that basic is using glsl_type:float types, while this one is using
+ * glsl_type::vec4 */
+TEST_F(cmod_propagation_test, basic_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg zero(0.0f);
+
+   bld.MUL(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_NZ);
+
+   /* = Before =
+    * 0: mul         dest.xyzw  src0.xyzw  src1.xyzw
+    * 1: cmp.nz.f0.0 null.xyzw  dest.xyzw  0.0f
+    *
+    * = After =
+    * 0: mul.nz.f0.0 dest.xyzw  src0.xyzw  src1.xyzw
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, basic_vec4_different_dst_writemask)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   dest.writemask = WRITEMASK_X;
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg zero(0.0f);
+   dst_reg dest_null = bld.null_reg_f();
+
+   bld.MUL(dest, src0, src1);
+   bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_NZ);
+
+   /* = Before =
+    * 0: mul         dest.x  src0  src1
+    * 1: cmp.nz.f0.0 null    dest  0.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, mad_one_component_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   dest.writemask = WRITEMASK_X;
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg src2 = src_reg(v, glsl_type::vec4_type);
+   src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX;
+   src2.negate = true;
+   src_reg zero(0.0f);
+   src_reg tmp(dest);
+   tmp.swizzle = BRW_SWIZZLE_XXXX;
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   bld.MAD(dest, src0, src1, src2);
+   bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L);
+
+   /* = Before =
+    *
+    * 0: mad         dest.x:F  src0.xxxx:F  src10.xxxx:F  -src2.xxxx:F
+    * 1: cmp.l.f0.0  null.x:F  dest.xxxx:F  0.0f
+    *
+    * = After =
+    * 0: mad.l.f0    dest.x:F  src0.xxxx:F  src10.xxxx:F  -src2.xxxx:F
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, mad_more_one_component_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   dest.writemask = WRITEMASK_XW;
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg src2 = src_reg(v, glsl_type::vec4_type);
+   src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX;
+   src2.negate = true;
+   src_reg zero(0.0f);
+   src_reg tmp(dest);
+   tmp.swizzle = BRW_SWIZZLE_XXXX;
+   dst_reg dest_null = bld.null_reg_f();
+
+   bld.MAD(dest, src0, src1, src2);
+   bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L);
+
+   /* = Before =
+    *
+    * 0: mad         dest.xw:F  src0.xxxx:F  src10.xxxx:F  -src2.xxxx:F
+    * 1: cmp.l.f0.0  null:F  dest.xxxx:F  zeroF
+    *
+    * = After =
+    * (No changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_mov_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::ivec4_type);
+   dest.writemask = WRITEMASK_X;
+   src_reg src0 = src_reg(v, glsl_type::ivec4_type);
+   src0.swizzle = BRW_SWIZZLE_XXXX;
+   src0.file = UNIFORM;
+   src_reg nonone = retype(src_reg(16), BRW_REGISTER_TYPE_D);
+   src_reg mov_src = src_reg(dest);
+   mov_src.swizzle = BRW_SWIZZLE_XXXX;
+   dst_reg dest_null = bld.null_reg_d();
+   dest_null.writemask = WRITEMASK_X;
+
+   bld.CMP(dest, src0, nonone, BRW_CONDITIONAL_GE);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.MOV(dest_null, mov_src));
+
+   /* = Before =
+    *
+    * 0: cmp.ge.f0  dest.x:D  u.xxxx:D  16D
+    * 1: mov.nz.f0  null.x:D  dest.xxxx:D
+    *
+    * = After =
+    * 0: cmp.ge.f0  dest.x:D  u.xxxx:D  16D
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, mul_cmp_different_channels_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg zero(0.0f);
+   src_reg cmp_src = src_reg(dest);
+   cmp_src.swizzle = BRW_SWIZZLE4(0,1,3,2);
+
+   bld.MUL(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), cmp_src, zero, BRW_CONDITIONAL_NZ);
+
+   /* = Before =
+    * 0: mul         dest  src0       src1
+    * 1: cmp.nz.f0.0 null  dest.xywz  0.0f
+    *
+    * = After =
+    * (No changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_context.c b/src/mesa/drivers/dri/nouveau/nouveau_context.c
index a049d9b8de7..cb854b81933 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_context.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_context.c
@@ -188,7 +188,7 @@ nouveau_context_init(struct gl_context *ctx, gl_api api,
 	ctx->Extensions.EXT_blend_minmax = true;
 	ctx->Extensions.EXT_texture_filter_anisotropic = true;
 	ctx->Extensions.NV_texture_env_combine4 = true;
-	ctx->Const.MaxColorAttachments = 1;
+	ctx->Const.MaxDrawBuffers = ctx->Const.MaxColorAttachments = 1;
 
 	/* This effectively disables 3D textures */
 	ctx->Const.Max3DTextureLevels = 1;
diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c
index a46c1944e96..a49018953ae 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/api_validate.c
@@ -698,16 +698,39 @@ valid_draw_indirect(struct gl_context *ctx,
 {
    const GLsizeiptr end = (GLsizeiptr)indirect + size;
 
+   /* OpenGL ES 3.1 spec. section 10.5:
+    *
+    *      "DrawArraysIndirect requires that all data sourced for the
+    *      command, including the DrawArraysIndirectCommand
+    *      structure,  be in buffer objects,  and may not be called when
+    *      the default vertex array object is bound."
+    */
+   if (ctx->Array.VAO == ctx->Array.DefaultVAO) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "(no VAO bound)");
+      return GL_FALSE;
+   }
+
    if (!_mesa_valid_prim_mode(ctx, mode, name))
       return GL_FALSE;
 
+   /* OpenGL ES 3.1 specification, section 10.5:
+    *
+    *      "An INVALID_OPERATION error is generated if
+    *      transform feedback is active and not paused."
+    */
+   if (_mesa_is_gles31(ctx) && _mesa_is_xfb_active_and_unpaused(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(TransformFeedback is active and not paused)", name);
+   }
 
-   /* From the ARB_draw_indirect specification:
-    * "An INVALID_OPERATION error is generated [...] if <indirect> is no
-    *  word aligned."
+   /* From OpenGL version 4.4. section 10.5
+    * and OpenGL ES 3.1, section 10.6:
+    *
+    *      "An INVALID_VALUE error is generated if indirect is not a
+    *       multiple of the size, in basic machine units, of uint."
     */
    if ((GLsizeiptr)indirect & (sizeof(GLuint) - 1)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
+      _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(indirect is not aligned)", name);
       return GL_FALSE;
    }
@@ -895,7 +918,12 @@ check_valid_to_compute(struct gl_context *ctx, const char *function)
       return false;
    }
 
-   prog = ctx->Shader.CurrentProgram[MESA_SHADER_COMPUTE];
+   /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
+    *
+    * "An INVALID_OPERATION error is generated if there is no active program
+    *  for the compute shader stage."
+    */
+   prog = ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
    if (prog == NULL || prog->_LinkedShaders[MESA_SHADER_COMPUTE] == NULL) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(no active compute shader)",
@@ -917,6 +945,24 @@ _mesa_validate_DispatchCompute(struct gl_context *ctx,
       return GL_FALSE;
 
    for (i = 0; i < 3; i++) {
+      /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
+       *
+       * "An INVALID_VALUE error is generated if any of num_groups_x,
+       *  num_groups_y and num_groups_z are greater than or equal to the
+       *  maximum work group count for the corresponding dimension."
+       *
+       * However, the "or equal to" portions appears to be a specification
+       * bug. In all other areas, the specification appears to indicate that
+       * the number of workgroups can match the MAX_COMPUTE_WORK_GROUP_COUNT
+       * value. For example, under DispatchComputeIndirect:
+       *
+       * "If any of num_groups_x, num_groups_y or num_groups_z is greater than
+       *  the value of MAX_COMPUTE_WORK_GROUP_COUNT for the corresponding
+       *  dimension then the results are undefined."
+       *
+       * Additionally, the OpenGLES 3.1 specification does not contain "or
+       * equal to" as an error condition.
+       */
       if (num_groups[i] > ctx->Const.MaxComputeWorkGroupCount[i]) {
          _mesa_error(ctx, GL_INVALID_VALUE,
                      "glDispatchCompute(num_groups_%c)", 'x' + i);
@@ -937,24 +983,29 @@ valid_dispatch_indirect(struct gl_context *ctx,
    if (!check_valid_to_compute(ctx, name))
       return GL_FALSE;
 
-   /* From the ARB_compute_shader specification:
+   /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
     *
-    * "An INVALID_OPERATION error is generated [...] if <indirect> is less
-    *  than zero or not a multiple of the size, in basic machine units, of
-    *  uint."
+    * "An INVALID_VALUE error is generated if indirect is negative or is not a
+    *  multiple of four."
     */
    if ((GLintptr)indirect & (sizeof(GLuint) - 1)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
+      _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(indirect is not aligned)", name);
       return GL_FALSE;
    }
 
    if ((GLintptr)indirect < 0) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
+      _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(indirect is less than zero)", name);
       return GL_FALSE;
    }
 
+   /* From the OpenGL 4.3 Core Specification, Chapter 19, Compute Shaders:
+    *
+    * "An INVALID_OPERATION error is generated if no buffer is bound to the
+    *  DRAW_INDIRECT_BUFFER binding, or if the command would source data
+    *  beyond the end of the buffer object."
+    */
    if (!_mesa_is_bufferobj(ctx->DispatchIndirectBuffer)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s: no buffer bound to DISPATCH_INDIRECT_BUFFER", name);
@@ -967,11 +1018,6 @@ valid_dispatch_indirect(struct gl_context *ctx,
       return GL_FALSE;
    }
 
-   /* From the ARB_compute_shader specification:
-    *
-    * "An INVALID_OPERATION error is generated if this command sources data
-    *  beyond the end of the buffer object [...]"
-    */
    if (ctx->DispatchIndirectBuffer->Size < end) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(DISPATCH_INDIRECT_BUFFER too small)", name);
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index b2c88c37366..d964f030ecb 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -152,6 +152,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_separate_shader_objects",             o(dummy_true),                              GL,             2010 },
    { "GL_ARB_shader_atomic_counters",              o(ARB_shader_atomic_counters),              GL,             2011 },
    { "GL_ARB_shader_bit_encoding",                 o(ARB_shader_bit_encoding),                 GL,             2010 },
+   { "GL_ARB_shader_clock",                        o(ARB_shader_clock),                        GL,             2015 },
    { "GL_ARB_shader_image_load_store",             o(ARB_shader_image_load_store),             GL,             2011 },
    { "GL_ARB_shader_image_size",                   o(ARB_shader_image_size),                   GL,             2012 },
    { "GL_ARB_shader_objects",                      o(dummy_true),                              GL,             2002 },
@@ -229,6 +230,7 @@ static const struct extension extension_table[] = {
    { "GL_EXT_depth_bounds_test",                   o(EXT_depth_bounds_test),                   GL,             2002 },
    { "GL_EXT_draw_buffers",                        o(dummy_true),                                         ES2, 2012 },
    { "GL_EXT_draw_buffers2",                       o(EXT_draw_buffers2),                       GL,             2006 },
+   { "GL_EXT_draw_elements_base_vertex",           o(ARB_draw_elements_base_vertex),                      ES2, 2014 },
    { "GL_EXT_draw_instanced",                      o(ARB_draw_instanced),                      GL,             2006 },
    { "GL_EXT_draw_range_elements",                 o(dummy_true),                              GLL,            1997 },
    { "GL_EXT_fog_coord",                           o(dummy_true),                              GLL,            1999 },
@@ -305,6 +307,7 @@ static const struct extension extension_table[] = {
    { "GL_OES_depth32",                             o(dummy_false),                     DISABLE,                2005 },
    { "GL_OES_depth_texture",                       o(ARB_depth_texture),                                  ES2, 2006 },
    { "GL_OES_depth_texture_cube_map",              o(OES_depth_texture_cube_map),                         ES2, 2012 },
+   { "GL_OES_draw_elements_base_vertex",           o(ARB_draw_elements_base_vertex),                      ES2, 2014 },
    { "GL_OES_draw_texture",                        o(OES_draw_texture),                             ES1,       2004 },
    { "GL_OES_EGL_sync",                            o(dummy_true),                                   ES1 | ES2, 2010 },
    /*  FIXME: Mesa expects GL_OES_EGL_image to be available in OpenGL contexts. */
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index c295615b475..fbc7b8f8602 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -460,6 +460,7 @@ descriptor=[
   [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader_es31" ],
   [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader_es31" ],
   [ "DISPATCH_INDIRECT_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_compute_shader_es31" ],
+  [ "MAX_COMBINED_COMPUTE_UNIFORM_COMPONENTS", "CONTEXT_INT(Const.Program[MESA_SHADER_COMPUTE].MaxCombinedUniformComponents), extra_ARB_compute_shader_es31" ],
 
 # GL_ARB_framebuffer_no_attachments / GLES 3.1
   ["MAX_FRAMEBUFFER_WIDTH", "CONTEXT_INT(Const.MaxFramebufferWidth), extra_ARB_framebuffer_no_attachments"],
diff --git a/src/mesa/main/lines.c b/src/mesa/main/lines.c
index c020fb3eb9e..93b80af0dc4 100644
--- a/src/mesa/main/lines.c
+++ b/src/mesa/main/lines.c
@@ -45,6 +45,10 @@ _mesa_LineWidth( GLfloat width )
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glLineWidth %f\n", width);
 
+   /* If width is unchanged, there can't be an error */
+   if (ctx->Line.Width == width)
+      return;
+
    if (width <= 0.0F) {
       _mesa_error( ctx, GL_INVALID_VALUE, "glLineWidth" );
       return;
@@ -68,9 +72,6 @@ _mesa_LineWidth( GLfloat width )
       return;
    }
 
-   if (ctx->Line.Width == width)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_LINE);
    ctx->Line.Width = width;
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index ab4fa083672..02dd257d79d 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2292,6 +2292,7 @@ struct gl_shader
 
    struct exec_list *ir;
    struct exec_list *packed_varyings;
+   struct exec_list *fragdata_arrays;
    struct glsl_symbol_table *symbols;
 
    bool uses_builtin_functions;
@@ -2389,6 +2390,9 @@ struct gl_shader
     */
    GLuint NumImages;
 
+   struct gl_active_atomic_buffer **AtomicBuffers;
+   unsigned NumAtomicBuffers;
+
    /**
     * Whether early fragment tests are enabled as defined by
     * ARB_shader_image_load_store.
@@ -3680,6 +3684,7 @@ struct gl_extensions
    GLboolean ARB_seamless_cube_map;
    GLboolean ARB_shader_atomic_counters;
    GLboolean ARB_shader_bit_encoding;
+   GLboolean ARB_shader_clock;
    GLboolean ARB_shader_image_load_store;
    GLboolean ARB_shader_image_size;
    GLboolean ARB_shader_precision;
@@ -4501,7 +4506,7 @@ static inline bool
 _mesa_active_fragment_shader_has_atomic_ops(const struct gl_context *ctx)
 {
    return ctx->Shader._CurrentFragmentProgram != NULL &&
-      ctx->Shader._CurrentFragmentProgram->NumAtomicBuffers > 0;
+      ctx->Shader._CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT]->NumAtomicBuffers > 0;
 }
 
 #ifdef __cplusplus
diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index 51ee10ff858..699a2ae47eb 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -230,6 +230,10 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
    struct gl_shader_program *shProg = NULL;
    GLbitfield any_valid_stages;
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glUseProgramStages(%u, 0x%x, %u)\n",
+                  pipeline, stages, program);
+
    if (!pipe) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glUseProgramStages(pipeline)");
       return;
@@ -251,6 +255,8 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
    if (_mesa_has_tessellation(ctx))
       any_valid_stages |= GL_TESS_CONTROL_SHADER_BIT |
                           GL_TESS_EVALUATION_SHADER_BIT;
+   if (_mesa_has_compute_shaders(ctx))
+      any_valid_stages |= GL_COMPUTE_SHADER_BIT;
 
    if (stages != GL_ALL_SHADER_BITS && (stages & ~any_valid_stages) != 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glUseProgramStages(Stages)");
@@ -332,6 +338,9 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
 
    if ((stages & GL_TESS_EVALUATION_SHADER_BIT) != 0)
       _mesa_use_shader_program(ctx, GL_TESS_EVALUATION_SHADER, shProg, pipe);
+
+   if ((stages & GL_COMPUTE_SHADER_BIT) != 0)
+      _mesa_use_shader_program(ctx, GL_COMPUTE_SHADER, shProg, pipe);
 }
 
 /**
@@ -345,6 +354,9 @@ _mesa_ActiveShaderProgram(GLuint pipeline, GLuint program)
    struct gl_shader_program *shProg = NULL;
    struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glActiveShaderProgram(%u, %u)\n", pipeline, program);
+
    if (program != 0) {
       shProg = _mesa_lookup_shader_program_err(ctx, program,
                                                "glActiveShaderProgram(program)");
@@ -380,6 +392,9 @@ _mesa_BindProgramPipeline(GLuint pipeline)
    GET_CURRENT_CONTEXT(ctx);
    struct gl_pipeline_object *newObj = NULL;
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glBindProgramPipeline(%u)\n", pipeline);
+
    /* Rebinding the same pipeline object: no change.
     */
    if (ctx->_Shader->Name == pipeline)
@@ -467,6 +482,9 @@ _mesa_DeleteProgramPipelines(GLsizei n, const GLuint *pipelines)
    GET_CURRENT_CONTEXT(ctx);
    GLsizei i;
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glDeleteProgramPipelines(%d, %p)\n", n, pipelines);
+
    if (n < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glDeleteProgramPipelines(n<0)");
       return;
@@ -551,6 +569,9 @@ _mesa_GenProgramPipelines(GLsizei n, GLuint *pipelines)
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glGenProgramPipelines(%d, %p)\n", n, pipelines);
+
    create_program_pipelines(ctx, n, pipelines, false);
 }
 
@@ -559,6 +580,9 @@ _mesa_CreateProgramPipelines(GLsizei n, GLuint *pipelines)
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glCreateProgramPipelines(%d, %p)\n", n, pipelines);
+
    create_program_pipelines(ctx, n, pipelines, true);
 }
 
@@ -574,6 +598,9 @@ _mesa_IsProgramPipeline(GLuint pipeline)
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glIsProgramPipeline(%u)\n", pipeline);
+
    struct gl_pipeline_object *obj = _mesa_lookup_pipeline_object(ctx, pipeline);
    if (obj == NULL)
       return GL_FALSE;
@@ -590,6 +617,10 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
    GET_CURRENT_CONTEXT(ctx);
    struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glGetProgramPipelineiv(%u, %d, %p)\n",
+                  pipeline, pname, params);
+
    /* Are geometry shaders available in this context?
     */
    const bool has_gs = _mesa_has_geometry_shaders(ctx);
@@ -643,6 +674,12 @@ _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
       *params = pipe->CurrentProgram[MESA_SHADER_FRAGMENT]
          ? pipe->CurrentProgram[MESA_SHADER_FRAGMENT]->Name : 0;
       return;
+   case GL_COMPUTE_SHADER:
+      if (!_mesa_has_compute_shaders(ctx))
+         break;
+      *params = pipe->CurrentProgram[MESA_SHADER_COMPUTE]
+         ? pipe->CurrentProgram[MESA_SHADER_COMPUTE]->Name : 0;
+      return;
    default:
       break;
    }
@@ -857,6 +894,9 @@ _mesa_ValidateProgramPipeline(GLuint pipeline)
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glValidateProgramPipeline(%u)\n", pipeline);
+
    struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    if (!pipe) {
@@ -875,6 +915,10 @@ _mesa_GetProgramPipelineInfoLog(GLuint pipeline, GLsizei bufSize,
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glGetProgramPipelineInfoLog(%u, %d, %p, %p)\n",
+                  pipeline, bufSize, length, infoLog);
+
    struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    if (!pipe) {
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index eb71fdde703..b7e25fe3840 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -119,7 +119,6 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
    case GL_MAX_NUM_ACTIVE_VARIABLES:
       switch (programInterface) {
       case GL_UNIFORM_BLOCK:
-      case GL_SHADER_STORAGE_BLOCK:
          for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) {
             if (shProg->ProgramResourceList[i].Type == programInterface) {
                struct gl_uniform_block *block =
@@ -129,6 +128,26 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
             }
          }
          break;
+      case GL_SHADER_STORAGE_BLOCK:
+         for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) {
+            if (shProg->ProgramResourceList[i].Type == programInterface) {
+               struct gl_uniform_block *block =
+                  (struct gl_uniform_block *)
+                  shProg->ProgramResourceList[i].Data;
+               GLint block_params = 0;
+               for (unsigned j = 0; j < block->NumUniforms; j++) {
+                  const char *iname = block->Uniforms[j].IndexName;
+                  struct gl_program_resource *uni =
+                     _mesa_program_resource_find_name(shProg, GL_BUFFER_VARIABLE,
+                                                      iname, NULL);
+                  if (!uni)
+                     continue;
+                  block_params++;
+               }
+               *params = MAX2(*params, block_params);
+            }
+         }
+         break;
       case GL_ATOMIC_COUNTER_BUFFER:
          for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) {
             if (shProg->ProgramResourceList[i].Type == programInterface) {
diff --git a/src/mesa/main/rastpos.c b/src/mesa/main/rastpos.c
index 54b2125a80f..b468219e688 100644
--- a/src/mesa/main/rastpos.c
+++ b/src/mesa/main/rastpos.c
@@ -36,6 +36,447 @@
 #include "rastpos.h"
 #include "state.h"
 #include "main/dispatch.h"
+#include "main/viewport.h"
+#include "util/simple_list.h"
+
+
+
+/**
+ * Clip a point against the view volume.
+ *
+ * \param v vertex vector describing the point to clip.
+ *
+ * \return zero if outside view volume, or one if inside.
+ */
+static GLuint
+viewclip_point_xy( const GLfloat v[] )
+{
+   if (   v[0] > v[3] || v[0] < -v[3]
+       || v[1] > v[3] || v[1] < -v[3] ) {
+      return 0;
+   }
+   else {
+      return 1;
+   }
+}
+
+
+/**
+ * Clip a point against the far/near Z clipping planes.
+ *
+ * \param v vertex vector describing the point to clip.
+ *
+ * \return zero if outside view volume, or one if inside.
+ */
+static GLuint
+viewclip_point_z( const GLfloat v[] )
+{
+   if (v[2] > v[3] || v[2] < -v[3] ) {
+      return 0;
+   }
+   else {
+      return 1;
+   }
+}
+
+
+/**
+ * Clip a point against the user clipping planes.
+ *
+ * \param ctx GL context.
+ * \param v vertex vector describing the point to clip.
+ *
+ * \return zero if the point was clipped, or one otherwise.
+ */
+static GLuint
+userclip_point( struct gl_context *ctx, const GLfloat v[] )
+{
+   GLuint p;
+
+   for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
+      if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
+	 GLfloat dot = v[0] * ctx->Transform._ClipUserPlane[p][0]
+		     + v[1] * ctx->Transform._ClipUserPlane[p][1]
+		     + v[2] * ctx->Transform._ClipUserPlane[p][2]
+		     + v[3] * ctx->Transform._ClipUserPlane[p][3];
+         if (dot < 0.0F) {
+            return 0;
+         }
+      }
+   }
+
+   return 1;
+}
+
+
+/**
+ * Compute lighting for the raster position.  RGB modes computed.
+ * \param ctx the context
+ * \param vertex vertex location
+ * \param normal normal vector
+ * \param Rcolor returned color
+ * \param Rspec returned specular color (if separate specular enabled)
+ */
+static void
+shade_rastpos(struct gl_context *ctx,
+              const GLfloat vertex[4],
+              const GLfloat normal[3],
+              GLfloat Rcolor[4],
+              GLfloat Rspec[4])
+{
+   /*const*/ GLfloat (*base)[3] = ctx->Light._BaseColor;
+   const struct gl_light *light;
+   GLfloat diffuseColor[4], specularColor[4];  /* for RGB mode only */
+
+   COPY_3V(diffuseColor, base[0]);
+   diffuseColor[3] = CLAMP(
+      ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_DIFFUSE][3], 0.0F, 1.0F );
+   ASSIGN_4V(specularColor, 0.0, 0.0, 0.0, 1.0);
+
+   foreach (light, &ctx->Light.EnabledList) {
+      GLfloat attenuation = 1.0;
+      GLfloat VP[3]; /* vector from vertex to light pos */
+      GLfloat n_dot_VP;
+      GLfloat diffuseContrib[3], specularContrib[3];
+
+      if (!(light->_Flags & LIGHT_POSITIONAL)) {
+         /* light at infinity */
+	 COPY_3V(VP, light->_VP_inf_norm);
+	 attenuation = light->_VP_inf_spot_attenuation;
+      }
+      else {
+         /* local/positional light */
+	 GLfloat d;
+
+         /* VP = vector from vertex pos to light[i].pos */
+	 SUB_3V(VP, light->_Position, vertex);
+         /* d = length(VP) */
+	 d = (GLfloat) LEN_3FV( VP );
+	 if (d > 1.0e-6F) {
+            /* normalize VP */
+	    GLfloat invd = 1.0F / d;
+	    SELF_SCALE_SCALAR_3V(VP, invd);
+	 }
+
+         /* atti */
+	 attenuation = 1.0F / (light->ConstantAttenuation + d *
+			       (light->LinearAttenuation + d *
+				light->QuadraticAttenuation));
+
+	 if (light->_Flags & LIGHT_SPOT) {
+	    GLfloat PV_dot_dir = - DOT3(VP, light->_NormSpotDirection);
+
+	    if (PV_dot_dir<light->_CosCutoff) {
+	       continue;
+	    }
+	    else {
+               GLfloat spot = powf(PV_dot_dir, light->SpotExponent);
+	       attenuation *= spot;
+	    }
+	 }
+      }
+
+      if (attenuation < 1e-3F)
+	 continue;
+
+      n_dot_VP = DOT3( normal, VP );
+
+      if (n_dot_VP < 0.0F) {
+	 ACC_SCALE_SCALAR_3V(diffuseColor, attenuation, light->_MatAmbient[0]);
+	 continue;
+      }
+
+      /* Ambient + diffuse */
+      COPY_3V(diffuseContrib, light->_MatAmbient[0]);
+      ACC_SCALE_SCALAR_3V(diffuseContrib, n_dot_VP, light->_MatDiffuse[0]);
+
+      /* Specular */
+      {
+         const GLfloat *h;
+         GLfloat n_dot_h;
+
+         ASSIGN_3V(specularContrib, 0.0, 0.0, 0.0);
+
+	 if (ctx->Light.Model.LocalViewer) {
+	    GLfloat v[3];
+	    COPY_3V(v, vertex);
+	    NORMALIZE_3FV(v);
+	    SUB_3V(VP, VP, v);
+            NORMALIZE_3FV(VP);
+	    h = VP;
+	 }
+	 else if (light->_Flags & LIGHT_POSITIONAL) {
+	    ACC_3V(VP, ctx->_EyeZDir);
+            NORMALIZE_3FV(VP);
+	    h = VP;
+	 }
+         else {
+	    h = light->_h_inf_norm;
+	 }
+
+	 n_dot_h = DOT3(normal, h);
+
+	 if (n_dot_h > 0.0F) {
+	    GLfloat shine;
+	    GLfloat spec_coef;
+
+	    shine = ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_SHININESS][0];
+	    spec_coef = powf(n_dot_h, shine);
+
+	    if (spec_coef > 1.0e-10F) {
+               if (ctx->Light.Model.ColorControl==GL_SEPARATE_SPECULAR_COLOR) {
+                  ACC_SCALE_SCALAR_3V( specularContrib, spec_coef,
+                                       light->_MatSpecular[0]);
+               }
+               else {
+                  ACC_SCALE_SCALAR_3V( diffuseContrib, spec_coef,
+                                       light->_MatSpecular[0]);
+               }
+	    }
+	 }
+      }
+
+      ACC_SCALE_SCALAR_3V( diffuseColor, attenuation, diffuseContrib );
+      ACC_SCALE_SCALAR_3V( specularColor, attenuation, specularContrib );
+   }
+
+   Rcolor[0] = CLAMP(diffuseColor[0], 0.0F, 1.0F);
+   Rcolor[1] = CLAMP(diffuseColor[1], 0.0F, 1.0F);
+   Rcolor[2] = CLAMP(diffuseColor[2], 0.0F, 1.0F);
+   Rcolor[3] = CLAMP(diffuseColor[3], 0.0F, 1.0F);
+   Rspec[0] = CLAMP(specularColor[0], 0.0F, 1.0F);
+   Rspec[1] = CLAMP(specularColor[1], 0.0F, 1.0F);
+   Rspec[2] = CLAMP(specularColor[2], 0.0F, 1.0F);
+   Rspec[3] = CLAMP(specularColor[3], 0.0F, 1.0F);
+}
+
+
+/**
+ * Do texgen needed for glRasterPos.
+ * \param ctx  rendering context
+ * \param vObj  object-space vertex coordinate
+ * \param vEye  eye-space vertex coordinate
+ * \param normal  vertex normal
+ * \param unit  texture unit number
+ * \param texcoord  incoming texcoord and resulting texcoord
+ */
+static void
+compute_texgen(struct gl_context *ctx, const GLfloat vObj[4], const GLfloat vEye[4],
+               const GLfloat normal[3], GLuint unit, GLfloat texcoord[4])
+{
+   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+
+   /* always compute sphere map terms, just in case */
+   GLfloat u[3], two_nu, rx, ry, rz, m, mInv;
+   COPY_3V(u, vEye);
+   NORMALIZE_3FV(u);
+   two_nu = 2.0F * DOT3(normal, u);
+   rx = u[0] - normal[0] * two_nu;
+   ry = u[1] - normal[1] * two_nu;
+   rz = u[2] - normal[2] * two_nu;
+   m = rx * rx + ry * ry + (rz + 1.0F) * (rz + 1.0F);
+   if (m > 0.0F)
+      mInv = 0.5F * (1.0f / sqrtf(m));
+   else
+      mInv = 0.0F;
+
+   if (texUnit->TexGenEnabled & S_BIT) {
+      switch (texUnit->GenS.Mode) {
+         case GL_OBJECT_LINEAR:
+            texcoord[0] = DOT4(vObj, texUnit->GenS.ObjectPlane);
+            break;
+         case GL_EYE_LINEAR:
+            texcoord[0] = DOT4(vEye, texUnit->GenS.EyePlane);
+            break;
+         case GL_SPHERE_MAP:
+            texcoord[0] = rx * mInv + 0.5F;
+            break;
+         case GL_REFLECTION_MAP:
+            texcoord[0] = rx;
+            break;
+         case GL_NORMAL_MAP:
+            texcoord[0] = normal[0];
+            break;
+         default:
+            _mesa_problem(ctx, "Bad S texgen in compute_texgen()");
+            return;
+      }
+   }
+
+   if (texUnit->TexGenEnabled & T_BIT) {
+      switch (texUnit->GenT.Mode) {
+         case GL_OBJECT_LINEAR:
+            texcoord[1] = DOT4(vObj, texUnit->GenT.ObjectPlane);
+            break;
+         case GL_EYE_LINEAR:
+            texcoord[1] = DOT4(vEye, texUnit->GenT.EyePlane);
+            break;
+         case GL_SPHERE_MAP:
+            texcoord[1] = ry * mInv + 0.5F;
+            break;
+         case GL_REFLECTION_MAP:
+            texcoord[1] = ry;
+            break;
+         case GL_NORMAL_MAP:
+            texcoord[1] = normal[1];
+            break;
+         default:
+            _mesa_problem(ctx, "Bad T texgen in compute_texgen()");
+            return;
+      }
+   }
+
+   if (texUnit->TexGenEnabled & R_BIT) {
+      switch (texUnit->GenR.Mode) {
+         case GL_OBJECT_LINEAR:
+            texcoord[2] = DOT4(vObj, texUnit->GenR.ObjectPlane);
+            break;
+         case GL_EYE_LINEAR:
+            texcoord[2] = DOT4(vEye, texUnit->GenR.EyePlane);
+            break;
+         case GL_REFLECTION_MAP:
+            texcoord[2] = rz;
+            break;
+         case GL_NORMAL_MAP:
+            texcoord[2] = normal[2];
+            break;
+         default:
+            _mesa_problem(ctx, "Bad R texgen in compute_texgen()");
+            return;
+      }
+   }
+
+   if (texUnit->TexGenEnabled & Q_BIT) {
+      switch (texUnit->GenQ.Mode) {
+         case GL_OBJECT_LINEAR:
+            texcoord[3] = DOT4(vObj, texUnit->GenQ.ObjectPlane);
+            break;
+         case GL_EYE_LINEAR:
+            texcoord[3] = DOT4(vEye, texUnit->GenQ.EyePlane);
+            break;
+         default:
+            _mesa_problem(ctx, "Bad Q texgen in compute_texgen()");
+            return;
+      }
+   }
+}
+
+
+/**
+ * glRasterPos transformation.  Typically called via ctx->Driver.RasterPos().
+ *
+ * \param vObj  vertex position in object space
+ */
+void
+_mesa_RasterPos(struct gl_context *ctx, const GLfloat vObj[4])
+{
+   if (ctx->VertexProgram._Enabled) {
+      /* XXX implement this */
+      _mesa_problem(ctx, "Vertex programs not implemented for glRasterPos");
+      return;
+   }
+   else {
+      GLfloat eye[4], clip[4], ndc[3], d;
+      GLfloat *norm, eyenorm[3];
+      GLfloat *objnorm = ctx->Current.Attrib[VERT_ATTRIB_NORMAL];
+      float scale[3], translate[3];
+
+      /* apply modelview matrix:  eye = MV * obj */
+      TRANSFORM_POINT( eye, ctx->ModelviewMatrixStack.Top->m, vObj );
+      /* apply projection matrix:  clip = Proj * eye */
+      TRANSFORM_POINT( clip, ctx->ProjectionMatrixStack.Top->m, eye );
+
+      /* clip to view volume. */
+      if (!ctx->Transform.DepthClamp) {
+         if (viewclip_point_z(clip) == 0) {
+            ctx->Current.RasterPosValid = GL_FALSE;
+            return;
+         }
+      }
+      if (!ctx->Transform.RasterPositionUnclipped) {
+         if (viewclip_point_xy(clip) == 0) {
+            ctx->Current.RasterPosValid = GL_FALSE;
+            return;
+         }
+      }
+
+      /* clip to user clipping planes */
+      if (ctx->Transform.ClipPlanesEnabled && !userclip_point(ctx, clip)) {
+         ctx->Current.RasterPosValid = GL_FALSE;
+         return;
+      }
+
+      /* ndc = clip / W */
+      d = (clip[3] == 0.0F) ? 1.0F : 1.0F / clip[3];
+      ndc[0] = clip[0] * d;
+      ndc[1] = clip[1] * d;
+      ndc[2] = clip[2] * d;
+      /* wincoord = viewport_mapping(ndc) */
+      _mesa_get_viewport_xform(ctx, 0, scale, translate);
+      ctx->Current.RasterPos[0] = ndc[0] * scale[0] + translate[0];
+      ctx->Current.RasterPos[1] = ndc[1] * scale[1] + translate[1];
+      ctx->Current.RasterPos[2] = ndc[2] * scale[2] + translate[2];
+      ctx->Current.RasterPos[3] = clip[3];
+
+      if (ctx->Transform.DepthClamp) {
+	 ctx->Current.RasterPos[3] = CLAMP(ctx->Current.RasterPos[3],
+					   ctx->ViewportArray[0].Near,
+					   ctx->ViewportArray[0].Far);
+      }
+
+      /* compute raster distance */
+      if (ctx->Fog.FogCoordinateSource == GL_FOG_COORDINATE_EXT)
+         ctx->Current.RasterDistance = ctx->Current.Attrib[VERT_ATTRIB_FOG][0];
+      else
+         ctx->Current.RasterDistance =
+                        sqrtf( eye[0]*eye[0] + eye[1]*eye[1] + eye[2]*eye[2] );
+
+      /* compute transformed normal vector (for lighting or texgen) */
+      if (ctx->_NeedEyeCoords) {
+         const GLfloat *inv = ctx->ModelviewMatrixStack.Top->inv;
+         TRANSFORM_NORMAL( eyenorm, objnorm, inv );
+         norm = eyenorm;
+      }
+      else {
+         norm = objnorm;
+      }
+
+      /* update raster color */
+      if (ctx->Light.Enabled) {
+         /* lighting */
+         shade_rastpos( ctx, vObj, norm,
+                        ctx->Current.RasterColor,
+                        ctx->Current.RasterSecondaryColor );
+      }
+      else {
+         /* use current color */
+	 COPY_4FV(ctx->Current.RasterColor,
+		  ctx->Current.Attrib[VERT_ATTRIB_COLOR0]);
+	 COPY_4FV(ctx->Current.RasterSecondaryColor,
+		  ctx->Current.Attrib[VERT_ATTRIB_COLOR1]);
+      }
+
+      /* texture coords */
+      {
+         GLuint u;
+         for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
+            GLfloat tc[4];
+            COPY_4V(tc, ctx->Current.Attrib[VERT_ATTRIB_TEX0 + u]);
+            if (ctx->Texture.Unit[u].TexGenEnabled) {
+               compute_texgen(ctx, vObj, eye, norm, u, tc);
+            }
+            TRANSFORM_POINT(ctx->Current.RasterTexCoords[u],
+                            ctx->TextureMatrixStack[u].Top->m, tc);
+         }
+      }
+
+      ctx->Current.RasterPosValid = GL_TRUE;
+   }
+
+   if (ctx->RenderMode == GL_SELECT) {
+      _mesa_update_hitflag( ctx, ctx->Current.RasterPos[2] );
+   }
+}
 
 
 /**
diff --git a/src/mesa/main/rastpos.h b/src/mesa/main/rastpos.h
index dc28c68d41b..90b8f957b9f 100644
--- a/src/mesa/main/rastpos.h
+++ b/src/mesa/main/rastpos.h
@@ -41,6 +41,9 @@ struct gl_context;
 extern void 
 _mesa_init_rastpos(struct gl_context *ctx);
 
+void
+_mesa_RasterPos(struct gl_context *ctx, const GLfloat vObj[4]);
+
 void GLAPIENTRY
 _mesa_RasterPos2d(GLdouble x, GLdouble y);
 void GLAPIENTRY
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 8182d3dcc04..dd51bba3386 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -543,13 +543,55 @@ _mesa_program_resource_find_name(struct gl_shader_program *shProg,
       /* Resource basename. */
       const char *rname = _mesa_program_resource_name(res);
       unsigned baselen = strlen(rname);
+      unsigned baselen_without_array_index = baselen;
+      const char *rname_last_square_bracket = strrchr(rname, '[');
+      bool found = false;
+      bool rname_has_array_index_zero = false;
+      /* From ARB_program_interface_query spec:
+       *
+       * "uint GetProgramResourceIndex(uint program, enum programInterface,
+       *                               const char *name);
+       *  [...]
+       *  If <name> exactly matches the name string of one of the active
+       *  resources for <programInterface>, the index of the matched resource is
+       *  returned. Additionally, if <name> would exactly match the name string
+       *  of an active resource if "[0]" were appended to <name>, the index of
+       *  the matched resource is returned. [...]"
+       *
+       * "A string provided to GetProgramResourceLocation or
+       * GetProgramResourceLocationIndex is considered to match an active variable
+       * if:
+       *
+       *  * the string exactly matches the name of the active variable;
+       *
+       *  * if the string identifies the base name of an active array, where the
+       *    string would exactly match the name of the variable if the suffix
+       *    "[0]" were appended to the string; [...]"
+       */
+      /* Remove array's index from interface block name comparison only if
+       * array's index is zero and the resulting string length is the same
+       * than the provided name's length.
+       */
+      if (rname_last_square_bracket) {
+         baselen_without_array_index -= strlen(rname_last_square_bracket);
+         rname_has_array_index_zero =
+            (strncmp(rname_last_square_bracket, "[0]\0", 4) == 0) &&
+            (baselen_without_array_index == strlen(name));
+      }
+
+      if (strncmp(rname, name, baselen) == 0)
+         found = true;
+      else if (rname_has_array_index_zero &&
+               strncmp(rname, name, baselen_without_array_index) == 0)
+         found = true;
 
-      if (strncmp(rname, name, baselen) == 0) {
+      if (found) {
          switch (programInterface) {
          case GL_UNIFORM_BLOCK:
          case GL_SHADER_STORAGE_BLOCK:
             /* Basename match, check if array or struct. */
-            if (name[baselen] == '\0' ||
+            if (rname_has_array_index_zero ||
+                name[baselen] == '\0' ||
                 name[baselen] == '[' ||
                 name[baselen] == '.') {
                return res;
@@ -627,6 +669,20 @@ _mesa_program_resource_index(struct gl_shader_program *shProg,
    }
 }
 
+/**
+ * Find a program resource that points to given data.
+ */
+static struct gl_program_resource*
+program_resource_find_data(struct gl_shader_program *shProg, void *data)
+{
+   struct gl_program_resource *res = shProg->ProgramResourceList;
+   for (unsigned i = 0; i < shProg->NumProgramResourceList; i++, res++) {
+      if (res->Data == data)
+         return res;
+   }
+   return NULL;
+}
+
 /* Find a program resource with specific index in given interface.
  */
 struct gl_program_resource *
@@ -808,6 +864,14 @@ program_resource_location(struct gl_shader_program *shProg,
       if (RESOURCE_UNI(res)->builtin)
          return -1;
 
+     /* From page 79 of the OpenGL 4.2 spec:
+      *
+      *     "A valid name cannot be a structure, an array of structures, or any
+      *     portion of a single vector or a matrix."
+      */
+      if (RESOURCE_UNI(res)->type->without_array()->is_record())
+         return -1;
+
       /* From the GL_ARB_uniform_buffer_object spec:
        *
        *     "The value -1 will be returned if <name> does not correspond to an
@@ -1016,8 +1080,18 @@ get_buffer_property(struct gl_shader_program *shProg,
          *val = RESOURCE_ATC(res)->NumUniforms;
          return 1;
       case GL_ACTIVE_VARIABLES:
-         for (unsigned i = 0; i < RESOURCE_ATC(res)->NumUniforms; i++)
-            *val++ = RESOURCE_ATC(res)->Uniforms[i];
+         for (unsigned i = 0; i < RESOURCE_ATC(res)->NumUniforms; i++) {
+            /* Active atomic buffer contains index to UniformStorage. Find
+             * out gl_program_resource via data pointer and then calculate
+             * index of that uniform.
+             */
+            unsigned idx = RESOURCE_ATC(res)->Uniforms[i];
+            struct gl_program_resource *uni =
+               program_resource_find_data(shProg,
+                                          &shProg->UniformStorage[idx]);
+            assert(uni);
+            *val++ = _mesa_program_resource_index(shProg, uni);
+         }
          return RESOURCE_ATC(res)->NumUniforms;
       }
    }
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 765602e50db..ac40891f435 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -630,9 +630,16 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
    case GL_ACTIVE_ATTRIBUTE_MAX_LENGTH:
       *params = _mesa_longest_attribute_name_length(shProg);
       return;
-   case GL_ACTIVE_UNIFORMS:
-      *params = shProg->NumUniformStorage - shProg->NumHiddenUniforms;
+   case GL_ACTIVE_UNIFORMS: {
+      unsigned i;
+      const unsigned num_uniforms =
+         shProg->NumUniformStorage - shProg->NumHiddenUniforms;
+      for (*params = 0, i = 0; i < num_uniforms; i++) {
+         if (!shProg->UniformStorage[i].is_shader_storage)
+            (*params)++;
+      }
       return;
+   }
    case GL_ACTIVE_UNIFORM_MAX_LENGTH: {
       unsigned i;
       GLint max_len = 0;
@@ -640,6 +647,9 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
          shProg->NumUniformStorage - shProg->NumHiddenUniforms;
 
       for (i = 0; i < num_uniforms; i++) {
+         if (shProg->UniformStorage[i].is_shader_storage)
+            continue;
+
 	 /* Add one for the terminating NUL character for a non-array, and
 	  * 4 for the "[0]" and the NUL for an array.
 	  */
diff --git a/src/mesa/main/texcompress.c b/src/mesa/main/texcompress.c
index 84973d3fe5d..a8ac19e40d7 100644
--- a/src/mesa/main/texcompress.c
+++ b/src/mesa/main/texcompress.c
@@ -243,28 +243,6 @@ _mesa_gl_compressed_format_base_format(GLenum format)
  *        what GL_NUM_COMPRESSED_TEXTURE_FORMATS and
  *        GL_COMPRESSED_TEXTURE_FORMATS return."
  *
- * The KHR_texture_compression_astc_hdr spec says:
- *
- *    "Interactions with OpenGL 4.2
- *
- *        OpenGL 4.2 supports the feature that compressed textures can be
- *        compressed online, by passing the compressed texture format enum as
- *        the internal format when uploading a texture using TexImage1D,
- *        TexImage2D or TexImage3D (see Section 3.9.3, Texture Image
- *        Specification, subsection Encoding of Special Internal Formats).
- *
- *        Due to the complexity of the ASTC compression algorithm, it is not
- *        usually suitable for online use, and therefore ASTC support will be
- *        limited to pre-compressed textures only. Where on-device compression
- *        is required, a domain-specific limited compressor will typically
- *        be used, and this is therefore not suitable for implementation in
- *        the driver.
- *
- *        In particular, the ASTC format specifiers will not be added to
- *        Table 3.14, and thus will not be accepted by the TexImage*D
- *        functions, and will not be returned by the (already deprecated)
- *        COMPRESSED_TEXTURE_FORMATS query."
- *
  * There is no formal spec for GL_ATI_texture_compression_3dc.  Since the
  * formats added by this extension are luminance-alpha formats, it is
  * reasonable to expect them to follow the same rules as
@@ -286,7 +264,8 @@ GLuint
 _mesa_get_compressed_formats(struct gl_context *ctx, GLint *formats)
 {
    GLuint n = 0;
-   if (ctx->Extensions.TDFX_texture_compression_FXT1) {
+   if (_mesa_is_desktop_gl(ctx) &&
+       ctx->Extensions.TDFX_texture_compression_FXT1) {
       if (formats) {
          formats[n++] = GL_COMPRESSED_RGB_FXT1_3DFX;
          formats[n++] = GL_COMPRESSED_RGBA_FXT1_3DFX;
@@ -396,6 +375,69 @@ _mesa_get_compressed_formats(struct gl_context *ctx, GLint *formats)
          n += 10;
       }
    }
+
+   /* The KHR_texture_compression_astc_hdr spec says:
+    *
+    *    "Interactions with OpenGL 4.2
+    *
+    *        OpenGL 4.2 supports the feature that compressed textures can be
+    *        compressed online, by passing the compressed texture format enum as
+    *        the internal format when uploading a texture using TexImage1D,
+    *        TexImage2D or TexImage3D (see Section 3.9.3, Texture Image
+    *        Specification, subsection Encoding of Special Internal Formats).
+    *
+    *        Due to the complexity of the ASTC compression algorithm, it is not
+    *        usually suitable for online use, and therefore ASTC support will be
+    *        limited to pre-compressed textures only. Where on-device compression
+    *        is required, a domain-specific limited compressor will typically
+    *        be used, and this is therefore not suitable for implementation in
+    *        the driver.
+    *
+    *        In particular, the ASTC format specifiers will not be added to
+    *        Table 3.14, and thus will not be accepted by the TexImage*D
+    *        functions, and will not be returned by the (already deprecated)
+    *        COMPRESSED_TEXTURE_FORMATS query."
+    *
+    * The ES and the desktop specs diverge here. In OpenGL ES, the COMPRESSED_TEXTURE_FORMATS
+    * query returns the set of supported specific compressed formats.
+    */
+   if (ctx->API == API_OPENGLES2 &&
+       ctx->Extensions.KHR_texture_compression_astc_ldr) {
+      if (formats) {
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_4x4_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_5x4_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_5x5_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_6x5_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_6x6_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_8x5_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_8x6_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_8x8_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_10x5_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_10x6_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_10x8_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_10x10_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_12x10_KHR;
+         formats[n++] = GL_COMPRESSED_RGBA_ASTC_12x12_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR;
+         formats[n++] = GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR;
+      }
+      else {
+         n += 28;
+      }
+   }
+
    return n;
 }
 
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 682b72755c7..945890aeeb5 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -297,8 +297,7 @@ get_tex_rgba_compressed(struct gl_context *ctx, GLuint dimensions,
    uint8_t rebaseSwizzle[4];
 
    /* Decompress into temp float buffer, then pack into user buffer */
-   tempImage = malloc(width * height * depth
-                                  * 4 * sizeof(GLfloat));
+   tempImage = malloc(width * height * depth * 4 * sizeof(GLfloat));
    if (!tempImage) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage()");
       return;
diff --git a/src/mesa/main/vdpau.c b/src/mesa/main/vdpau.c
index 0efa56e4f41..44be3a37443 100644
--- a/src/mesa/main/vdpau.c
+++ b/src/mesa/main/vdpau.c
@@ -163,9 +163,10 @@ register_surface(struct gl_context *ctx, GLboolean isOutput,
          return (GLintptr)NULL;
       }
 
-      if (tex->Target == 0)
+      if (tex->Target == 0) {
          tex->Target = target;
-      else if (tex->Target != target) {
+         tex->TargetIndex = _mesa_tex_target_to_index(ctx, target);
+      } else if (tex->Target != target) {
          _mesa_unlock_texture(ctx, tex);
          free(surf);
          _mesa_error(ctx, GL_INVALID_OPERATION,
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index acaa85d9356..20f8b3df99d 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -73,7 +73,8 @@ void st_upload_constants( struct st_context *st,
        * the parameters list are explicitly set by the user with glUniform,
        * glProgramParameter(), etc.
        */
-      _mesa_load_state_parameters(st->ctx, params);
+      if (params->StateFlags)
+         _mesa_load_state_parameters(st->ctx, params);
 
       /* We always need to get a new buffer, to keep the drivers simple and
        * avoid gratuitous rendering synchronization.
diff --git a/src/mesa/state_tracker/st_cb_copyimage.c b/src/mesa/state_tracker/st_cb_copyimage.c
new file mode 100644
index 00000000000..75114cdb712
--- /dev/null
+++ b/src/mesa/state_tracker/st_cb_copyimage.c
@@ -0,0 +1,582 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "state_tracker/st_context.h"
+#include "state_tracker/st_cb_copyimage.h"
+#include "state_tracker/st_cb_fbo.h"
+#include "state_tracker/st_texture.h"
+
+#include "util/u_box.h"
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+
+
+/**
+ * Return an equivalent canonical format without "X" channels.
+ *
+ * Copying between incompatible formats is easier when the format is
+ * canonicalized, meaning that it is in a standard form.
+ *
+ * The returned format has the same component sizes and swizzles as
+ * the source format, the type is changed to UINT or UNORM, depending on
+ * which one has the most swizzle combinations in their group.
+ *
+ * If it's not an array format, return a memcpy-equivalent array format.
+ *
+ * The key feature is that swizzled versions of formats of the same
+ * component size always return the same component type.
+ *
+ * X returns A.
+ * Luminance, intensity, alpha, depth, stencil, and 8-bit and 16-bit packed
+ * formats are not supported. (same as ARB_copy_image)
+ */
+static enum pipe_format
+get_canonical_format(enum pipe_format format)
+{
+   const struct util_format_description *desc =
+      util_format_description(format);
+
+   /* Packed formats. Return the equivalent array format. */
+   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
+       format == PIPE_FORMAT_R9G9B9E5_FLOAT)
+      return get_canonical_format(PIPE_FORMAT_R8G8B8A8_UINT);
+
+   if (desc->nr_channels == 4 &&
+       desc->channel[0].size == 10 &&
+       desc->channel[1].size == 10 &&
+       desc->channel[2].size == 10 &&
+       desc->channel[3].size == 2) {
+      if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_X &&
+          desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_Y &&
+          desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_Z)
+         return get_canonical_format(PIPE_FORMAT_R8G8B8A8_UINT);
+
+      return PIPE_FORMAT_NONE;
+   }
+
+#define RETURN_FOR_SWIZZLE1(x, format) \
+   if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_##x) \
+      return format
+
+#define RETURN_FOR_SWIZZLE2(x, y, format) \
+   if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_##x && \
+       desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_##y) \
+      return format
+
+#define RETURN_FOR_SWIZZLE3(x, y, z, format) \
+   if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_##x && \
+       desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_##y && \
+       desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_##z) \
+      return format
+
+#define RETURN_FOR_SWIZZLE4(x, y, z, w, format) \
+   if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_##x && \
+       desc->swizzle[1] == UTIL_FORMAT_SWIZZLE_##y && \
+       desc->swizzle[2] == UTIL_FORMAT_SWIZZLE_##z && \
+       desc->swizzle[3] == UTIL_FORMAT_SWIZZLE_##w) \
+      return format
+
+   /* Array formats. */
+   if (desc->is_array) {
+      switch (desc->nr_channels) {
+      case 1:
+         switch (desc->channel[0].size) {
+         case 8:
+            RETURN_FOR_SWIZZLE1(X, PIPE_FORMAT_R8_UINT);
+            break;
+
+         case 16:
+            RETURN_FOR_SWIZZLE1(X, PIPE_FORMAT_R16_UINT);
+            break;
+
+         case 32:
+            RETURN_FOR_SWIZZLE1(X, PIPE_FORMAT_R32_UINT);
+            break;
+         }
+         break;
+
+      case 2:
+         switch (desc->channel[0].size) {
+         case 8:
+            /* All formats in each group must be of the same type.
+             * We can't use UINT for R8G8 while using UNORM for G8R8.
+             */
+            RETURN_FOR_SWIZZLE2(X, Y, PIPE_FORMAT_R8G8_UNORM);
+            RETURN_FOR_SWIZZLE2(Y, X, PIPE_FORMAT_G8R8_UNORM);
+            break;
+
+         case 16:
+            RETURN_FOR_SWIZZLE2(X, Y, PIPE_FORMAT_R16G16_UNORM);
+            RETURN_FOR_SWIZZLE2(Y, X, PIPE_FORMAT_G16R16_UNORM);
+            break;
+
+         case 32:
+            RETURN_FOR_SWIZZLE2(X, Y, PIPE_FORMAT_R32G32_UINT);
+            break;
+         }
+         break;
+
+      case 3:
+         switch (desc->channel[0].size) {
+         case 8:
+            RETURN_FOR_SWIZZLE3(X, Y, Z, PIPE_FORMAT_R8G8B8_UINT);
+            break;
+
+         case 16:
+            RETURN_FOR_SWIZZLE3(X, Y, Z, PIPE_FORMAT_R16G16B16_UINT);
+            break;
+
+         case 32:
+            RETURN_FOR_SWIZZLE3(X, Y, Z, PIPE_FORMAT_R32G32B32_UINT);
+            break;
+         }
+         break;
+
+      case 4:
+         switch (desc->channel[0].size) {
+         case 8:
+            RETURN_FOR_SWIZZLE4(X, Y, Z, W, PIPE_FORMAT_R8G8B8A8_UNORM);
+            RETURN_FOR_SWIZZLE4(X, Y, Z, 1, PIPE_FORMAT_R8G8B8A8_UNORM);
+            RETURN_FOR_SWIZZLE4(Z, Y, X, W, PIPE_FORMAT_B8G8R8A8_UNORM);
+            RETURN_FOR_SWIZZLE4(Z, Y, X, 1, PIPE_FORMAT_B8G8R8A8_UNORM);
+            RETURN_FOR_SWIZZLE4(W, Z, Y, X, PIPE_FORMAT_A8B8G8R8_UNORM);
+            RETURN_FOR_SWIZZLE4(1, Z, Y, X, PIPE_FORMAT_A8B8G8R8_UNORM);
+            RETURN_FOR_SWIZZLE4(W, X, Y, Z, PIPE_FORMAT_A8R8G8B8_UNORM);
+            RETURN_FOR_SWIZZLE4(1, X, Y, Z, PIPE_FORMAT_A8R8G8B8_UNORM);
+            break;
+
+         case 16:
+            RETURN_FOR_SWIZZLE4(X, Y, Z, W, PIPE_FORMAT_R16G16B16A16_UINT);
+            RETURN_FOR_SWIZZLE4(X, Y, Z, 1, PIPE_FORMAT_R16G16B16A16_UINT);
+            break;
+
+         case 32:
+            RETURN_FOR_SWIZZLE4(X, Y, Z, W, PIPE_FORMAT_R32G32B32A32_UINT);
+            RETURN_FOR_SWIZZLE4(X, Y, Z, 1, PIPE_FORMAT_R32G32B32A32_UINT);
+            break;
+         }
+      }
+
+      assert(!"unknown array format");
+      return PIPE_FORMAT_NONE;
+   }
+
+   assert(!"unknown packed format");
+   return PIPE_FORMAT_NONE;
+}
+
+/**
+ * Return true if the swizzle is XYZW in case of a 4-channel format,
+ * XY in case of a 2-channel format, or X in case of a 1-channel format.
+ */
+static bool
+has_identity_swizzle(const struct util_format_description *desc)
+{
+   int i;
+
+   for (i = 0; i < desc->nr_channels; i++)
+      if (desc->swizzle[i] != UTIL_FORMAT_SWIZZLE_X + i)
+         return false;
+
+   return true;
+}
+
+/**
+ * Return a canonical format for the given bits and channel size.
+ */
+static enum pipe_format
+canonical_format_from_bits(unsigned bits, unsigned channel_size)
+{
+   switch (bits) {
+   case 8:
+      if (channel_size == 8)
+         return get_canonical_format(PIPE_FORMAT_R8_UINT);
+      break;
+
+   case 16:
+      if (channel_size == 8)
+         return get_canonical_format(PIPE_FORMAT_R8G8_UINT);
+      if (channel_size == 16)
+         return get_canonical_format(PIPE_FORMAT_R16_UINT);
+      break;
+
+   case 32:
+      if (channel_size == 8)
+         return get_canonical_format(PIPE_FORMAT_R8G8B8A8_UINT);
+      if (channel_size == 16)
+         return get_canonical_format(PIPE_FORMAT_R16G16_UINT);
+      if (channel_size == 32)
+         return get_canonical_format(PIPE_FORMAT_R32_UINT);
+      break;
+
+   case 64:
+      if (channel_size == 16)
+         return get_canonical_format(PIPE_FORMAT_R16G16B16A16_UINT);
+      if (channel_size == 32)
+         return get_canonical_format(PIPE_FORMAT_R32G32_UINT);
+      break;
+
+   case 128:
+      if (channel_size == 32)
+         return get_canonical_format(PIPE_FORMAT_R32G32B32A32_UINT);
+      break;
+   }
+
+   assert(!"impossible format");
+   return PIPE_FORMAT_NONE;
+}
+
+static void
+blit(struct pipe_context *pipe,
+     struct pipe_resource *dst,
+     enum pipe_format dst_format,
+     unsigned dst_level,
+     unsigned dstx, unsigned dsty, unsigned dstz,
+     struct pipe_resource *src,
+     enum pipe_format src_format,
+     unsigned src_level,
+     const struct pipe_box *src_box)
+{
+   struct pipe_blit_info blit = {{0}};
+
+   blit.src.resource = src;
+   blit.dst.resource = dst;
+   blit.src.format = src_format;
+   blit.dst.format = dst_format;
+   blit.src.level = src_level;
+   blit.dst.level = dst_level;
+   blit.src.box = *src_box;
+   u_box_3d(dstx, dsty, dstz, src_box->width, src_box->height,
+            src_box->depth, &blit.dst.box);
+   blit.mask = PIPE_MASK_RGBA;
+   blit.filter = PIPE_TEX_FILTER_NEAREST;
+
+   pipe->blit(pipe, &blit);
+}
+
+static void
+swizzled_copy(struct pipe_context *pipe,
+              struct pipe_resource *dst,
+              unsigned dst_level,
+              unsigned dstx, unsigned dsty, unsigned dstz,
+              struct pipe_resource *src,
+              unsigned src_level,
+              const struct pipe_box *src_box)
+{
+   const struct util_format_description *src_desc, *dst_desc;
+   unsigned bits;
+   enum pipe_format blit_src_format, blit_dst_format;
+
+   /* Get equivalent canonical formats. Those are always array formats and
+    * copying between compatible canonical formats behaves either like
+    * memcpy or like swizzled memcpy. The idea is that we won't have to care
+    * about the channel type from this point on.
+    * Only the swizzle and channel size.
+    */
+   blit_src_format = get_canonical_format(src->format);
+   blit_dst_format = get_canonical_format(dst->format);
+
+   assert(blit_src_format != PIPE_FORMAT_NONE);
+   assert(blit_dst_format != PIPE_FORMAT_NONE);
+
+   src_desc = util_format_description(blit_src_format);
+   dst_desc = util_format_description(blit_dst_format);
+
+   assert(src_desc->block.bits == dst_desc->block.bits);
+   bits = src_desc->block.bits;
+
+   if (dst_desc->channel[0].size == src_desc->channel[0].size) {
+      /* Only the swizzle is different, which means we can just blit,
+       * e.g. RGBA -> BGRA.
+       */
+   } else if (has_identity_swizzle(src_desc)) {
+      /* Src is unswizzled and dst can be swizzled, so src is typecast
+       * to an equivalent dst-compatible format.
+       * e.g. R32 -> BGRA8 is realized as RGBA8 -> BGRA8
+       */
+      blit_src_format =
+         canonical_format_from_bits(bits, dst_desc->channel[0].size);
+   } else if (has_identity_swizzle(dst_desc)) {
+      /* Dst is unswizzled and src can be swizzled, so dst is typecast
+       * to an equivalent src-compatible format.
+       * e.g. BGRA8 -> R32 is realized as BGRA8 -> RGBA8
+       */
+      blit_dst_format =
+         canonical_format_from_bits(bits, src_desc->channel[0].size);
+   } else {
+      assert(!"This should have been handled by handle_complex_copy.");
+      return;
+   }
+
+   blit(pipe, dst, blit_dst_format, dst_level, dstx, dsty, dstz,
+        src, blit_src_format, src_level, src_box);
+}
+
+static bool
+same_size_and_swizzle(const struct util_format_description *d1,
+                      const struct util_format_description *d2)
+{
+   int i;
+
+   if (d1->layout != d2->layout ||
+       d1->nr_channels != d2->nr_channels ||
+       d1->is_array != d2->is_array)
+      return false;
+
+   for (i = 0; i < d1->nr_channels; i++) {
+      if (d1->channel[i].size != d2->channel[i].size)
+         return false;
+
+      if (d1->swizzle[i] <= UTIL_FORMAT_SWIZZLE_W &&
+          d2->swizzle[i] <= UTIL_FORMAT_SWIZZLE_W &&
+          d1->swizzle[i] != d2->swizzle[i])
+         return false;
+   }
+
+   return true;
+}
+
+static struct pipe_resource *
+create_texture(struct pipe_screen *screen, enum pipe_format format,
+               unsigned nr_samples,
+               unsigned width, unsigned height, unsigned depth)
+{
+   struct pipe_resource templ;
+
+   memset(&templ, 0, sizeof(templ));
+   templ.format = format;
+   templ.width0 = width;
+   templ.height0 = height;
+   templ.depth0 = 1;
+   templ.array_size = depth;
+   templ.nr_samples = nr_samples;
+   templ.usage = PIPE_USAGE_DEFAULT;
+   templ.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
+
+   if (depth > 1)
+      templ.target = PIPE_TEXTURE_2D_ARRAY;
+   else
+      templ.target = PIPE_TEXTURE_2D;
+
+   return screen->resource_create(screen, &templ);
+}
+
+/**
+ * Handle complex format conversions using 2 blits with a temporary texture
+ * in between, e.g. blitting from B10G10R10A2 to G16R16.
+ *
+ * This example is implemented this way:
+ * 1) First, blit from B10G10R10A2 to R10G10B10A2, which is canonical, so it
+ *    can be reinterpreted as a different canonical format of the same bpp,
+ *    such as R16G16. This blit only swaps R and B 10-bit components.
+ * 2) Finally, blit the result, which is R10G10B10A2, as R16G16 to G16R16.
+ *    This blit only swaps R and G 16-bit components.
+ */
+static bool
+handle_complex_copy(struct pipe_context *pipe,
+                    struct pipe_resource *dst,
+                    unsigned dst_level,
+                    unsigned dstx, unsigned dsty, unsigned dstz,
+                    struct pipe_resource *src,
+                    unsigned src_level,
+                    const struct pipe_box *src_box,
+                    enum pipe_format noncanon_format,
+                    enum pipe_format canon_format)
+{
+   struct pipe_box temp_box;
+   struct pipe_resource *temp = NULL;
+   const struct util_format_description *src_desc, *dst_desc;
+   const struct util_format_description *canon_desc, *noncanon_desc;
+   bool src_is_canon;
+   bool src_is_noncanon;
+   bool dst_is_canon;
+   bool dst_is_noncanon;
+
+   src_desc = util_format_description(src->format);
+   dst_desc = util_format_description(dst->format);
+   canon_desc = util_format_description(canon_format);
+   noncanon_desc = util_format_description(noncanon_format);
+
+   src_is_canon = same_size_and_swizzle(src_desc, canon_desc);
+   dst_is_canon = same_size_and_swizzle(dst_desc, canon_desc);
+   src_is_noncanon = same_size_and_swizzle(src_desc, noncanon_desc);
+   dst_is_noncanon = same_size_and_swizzle(dst_desc, noncanon_desc);
+
+   if (src_is_noncanon) {
+      /* Simple case - only types differ (e.g. UNORM and UINT). */
+      if (dst_is_noncanon) {
+         blit(pipe, dst, noncanon_format, dst_level, dstx, dsty, dstz, src,
+              noncanon_format, src_level, src_box);
+         return true;
+      }
+
+      /* Simple case - only types and swizzles differ. */
+      if (dst_is_canon) {
+         blit(pipe, dst, canon_format, dst_level, dstx, dsty, dstz, src,
+              noncanon_format, src_level, src_box);
+         return true;
+      }
+
+      /* Use the temporary texture. Src is converted to a canonical format,
+       * then proceed the generic swizzled_copy.
+       */
+      temp = create_texture(pipe->screen, canon_format, src->nr_samples,
+                            src_box->width,
+                            src_box->height, src_box->depth);
+
+      u_box_3d(0, 0, 0, src_box->width, src_box->height, src_box->depth,
+               &temp_box);
+
+      blit(pipe, temp, canon_format, 0, 0, 0, 0, src, noncanon_format,
+           src_level, src_box);
+      swizzled_copy(pipe, dst, dst_level, dstx, dsty, dstz, temp, 0,
+                    &temp_box);
+      pipe_resource_reference(&temp, NULL);
+      return true;
+   }
+
+   if (dst_is_noncanon) {
+      /* Simple case - only types and swizzles differ. */
+      if (src_is_canon) {
+         blit(pipe, dst, noncanon_format, dst_level, dstx, dsty, dstz, src,
+              canon_format, src_level, src_box);
+         return true;
+      }
+
+      /* Use the temporary texture. First, use the generic copy, but use
+       * a canonical format in the destination. Then convert */
+      temp = create_texture(pipe->screen, canon_format, dst->nr_samples,
+                            src_box->width,
+                            src_box->height, src_box->depth);
+
+      u_box_3d(0, 0, 0, src_box->width, src_box->height, src_box->depth,
+               &temp_box);
+
+      swizzled_copy(pipe, temp, 0, 0, 0, 0, src, src_level, src_box);
+      blit(pipe, dst, noncanon_format, dst_level, dstx, dsty, dstz, temp,
+           canon_format, 0, &temp_box);
+      pipe_resource_reference(&temp, NULL);
+      return true;
+   }
+
+   return false;
+}
+
+static void
+copy_image(struct pipe_context *pipe,
+           struct pipe_resource *dst,
+           unsigned dst_level,
+           unsigned dstx, unsigned dsty, unsigned dstz,
+           struct pipe_resource *src,
+           unsigned src_level,
+           const struct pipe_box *src_box)
+{
+   if (src->format == dst->format ||
+       util_format_is_compressed(src->format) ||
+       util_format_is_compressed(dst->format)) {
+      pipe->resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz,
+                                 src, src_level, src_box);
+      return;
+   }
+
+   /* Copying to/from B10G10R10*2 needs 2 blits with R10G10B10A2
+    * as a temporary texture in between.
+    */
+   if (handle_complex_copy(pipe, dst, dst_level, dstx, dsty, dstz, src,
+                           src_level, src_box, PIPE_FORMAT_B10G10R10A2_UINT,
+                           PIPE_FORMAT_R10G10B10A2_UINT))
+      return;
+
+   /* Copying to/from G8R8 needs 2 blits with R8G8 as a temporary texture
+    * in between.
+    */
+   if (handle_complex_copy(pipe, dst, dst_level, dstx, dsty, dstz, src,
+                           src_level, src_box, PIPE_FORMAT_G8R8_UNORM,
+                           PIPE_FORMAT_R8G8_UNORM))
+      return;
+
+   /* Copying to/from G16R16 needs 2 blits with R16G16 as a temporary texture
+    * in between.
+    */
+   if (handle_complex_copy(pipe, dst, dst_level, dstx, dsty, dstz, src,
+                           src_level, src_box, PIPE_FORMAT_G16R16_UNORM,
+                           PIPE_FORMAT_R16G16_UNORM))
+      return;
+
+   /* Only allow non-identity swizzling on RGBA8 formats. */
+
+   /* Simple copy, memcpy with swizzling, no format conversion. */
+   swizzled_copy(pipe, dst, dst_level, dstx, dsty, dstz, src, src_level,
+                 src_box);
+}
+
+static void
+st_CopyImageSubData(struct gl_context *ctx,
+                    struct gl_texture_image *src_image,
+                    struct gl_renderbuffer *src_renderbuffer,
+                    int src_x, int src_y, int src_z,
+                    struct gl_texture_image *dst_image,
+                    struct gl_renderbuffer *dst_renderbuffer,
+                    int dst_x, int dst_y, int dst_z,
+                    int src_width, int src_height)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_resource *src_res, *dst_res;
+   struct pipe_box box;
+   int src_level, dst_level;
+
+   if (src_image) {
+      struct st_texture_image *src = st_texture_image(src_image);
+      src_res = src->pt;
+      src_level = src_image->Level;
+      src_z += src_image->Face;
+   } else {
+      struct st_renderbuffer *src = st_renderbuffer(src_renderbuffer);
+      src_res = src->texture;
+      src_level = 0;
+   }
+
+   if (dst_image) {
+      struct st_texture_image *dst = st_texture_image(dst_image);
+      dst_res = dst->pt;
+      dst_level = dst_image->Level;
+      dst_z += dst_image->Face;
+   } else {
+      struct st_renderbuffer *dst = st_renderbuffer(dst_renderbuffer);
+      dst_res = dst->texture;
+      dst_level = 0;
+   }
+
+   u_box_2d_zslice(src_x, src_y, src_z, src_width, src_height, &box);
+
+   copy_image(pipe, dst_res, dst_level, dst_x, dst_y, dst_z,
+              src_res, src_level, &box);
+}
+
+void
+st_init_copy_image_functions(struct dd_function_table *functions)
+{
+   functions->CopyImageSubData = st_CopyImageSubData;
+}
diff --git a/src/mesa/state_tracker/st_cb_copyimage.h b/src/mesa/state_tracker/st_cb_copyimage.h
new file mode 100644
index 00000000000..d17f35c0953
--- /dev/null
+++ b/src/mesa/state_tracker/st_cb_copyimage.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef ST_CB_COPY_IMAGE_H
+#define ST_CB_COPY_IMAGE_H
+
+struct dd_function_table;
+
+extern void
+st_init_copy_image_functions(struct dd_function_table *functions);
+
+#endif /* ST_CB_COPY_IMAGE_H */
diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c
index b9997dacfd2..747b41464ae 100644
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -39,6 +39,7 @@
 #include "main/imports.h"
 #include "main/macros.h"
 #include "main/feedback.h"
+#include "main/rastpos.h"
 
 #include "st_context.h"
 #include "st_atom.h"
@@ -224,6 +225,15 @@ st_RasterPos(struct gl_context *ctx, const GLfloat v[4])
    struct rastpos_stage *rs;
    const struct gl_client_array **saved_arrays = ctx->Array._DrawArrays;
 
+   if (ctx->VertexProgram._Current == NULL ||
+       ctx->VertexProgram._Current == ctx->VertexProgram._TnlProgram) {
+      /* No vertex shader/program is enabled, used the simple/fast fixed-
+       * function implementation of RasterPos.
+       */
+      _mesa_RasterPos(ctx, v);
+      return;
+   }
+
    if (st->rastpos_stage) {
       /* get rastpos stage info */
       rs = rastpos_stage(st->rastpos_stage);
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 5d25fed317e..d4c916e8057 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -1873,55 +1873,6 @@ st_TextureView(struct gl_context *ctx,
    return GL_TRUE;
 }
 
-/* HACK: this is only enough for the most basic uses of CopyImage. Must fix
- * before actually exposing the extension.
- */
-static void
-st_CopyImageSubData(struct gl_context *ctx,
-                    struct gl_texture_image *src_image,
-                    struct gl_renderbuffer *src_renderbuffer,
-                    int src_x, int src_y, int src_z,
-                    struct gl_texture_image *dst_image,
-                    struct gl_renderbuffer *dst_renderbuffer,
-                    int dst_x, int dst_y, int dst_z,
-                    int src_width, int src_height)
-{
-   struct st_context *st = st_context(ctx);
-   struct pipe_context *pipe = st->pipe;
-   struct pipe_resource *src_res, *dst_res;
-   struct pipe_box box;
-   int src_level, dst_level;
-
-   if (src_image) {
-      struct st_texture_image *src = st_texture_image(src_image);
-      src_res = src->pt;
-      src_level = src_image->Level;
-   }
-   else {
-      struct st_renderbuffer *src = st_renderbuffer(src_renderbuffer);
-      src_res = src->texture;
-      src_level = 0;
-   }
-
-   if (dst_image) {
-      struct st_texture_image *dst = st_texture_image(dst_image);
-      dst_res = dst->pt;
-      dst_level = dst_image->Level;
-   }
-   else {
-      struct st_renderbuffer *dst = st_renderbuffer(dst_renderbuffer);
-      dst_res = dst->texture;
-      dst_level = 0;
-   }
-
-   u_box_2d_zslice(src_x, src_y, src_z, src_width, src_height, &box);
-   pipe->resource_copy_region(pipe, dst_res, dst_level,
-                              dst_x, dst_y, dst_z,
-                              src_res, src_level,
-                              &box);
-}
-
-
 void
 st_init_texture_functions(struct dd_function_table *functions)
 {
@@ -1953,6 +1904,4 @@ st_init_texture_functions(struct dd_function_table *functions)
 
    functions->AllocTextureStorage = st_AllocTextureStorage;
    functions->TextureView = st_TextureView;
-
-   functions->CopyImageSubData = st_CopyImageSubData;
 }
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 5abb17385c2..6e20fd1fda2 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -44,6 +44,7 @@
 #include "st_cb_bufferobjects.h"
 #include "st_cb_clear.h"
 #include "st_cb_condrender.h"
+#include "st_cb_copyimage.h"
 #include "st_cb_drawpixels.h"
 #include "st_cb_rasterpos.h"
 #include "st_cb_drawtex.h"
@@ -430,6 +431,7 @@ void st_init_driver_functions(struct pipe_screen *screen,
    st_init_bufferobject_functions(functions);
    st_init_clear_functions(functions);
    st_init_bitmap_functions(functions);
+   st_init_copy_image_functions(functions);
    st_init_drawpixels_functions(functions);
    st_init_rasterpos_functions(functions);
 
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index d4724b46e0a..bd7cbccc20c 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -439,6 +439,7 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_base_instance),                PIPE_CAP_START_INSTANCE                   },
       { o(ARB_buffer_storage),               PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT   },
       { o(ARB_color_buffer_float),           PIPE_CAP_VERTEX_COLOR_UNCLAMPED           },
+      { o(ARB_copy_image),                   PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS },
       { o(ARB_depth_clamp),                  PIPE_CAP_DEPTH_CLIP_DISABLE               },
       { o(ARB_depth_texture),                PIPE_CAP_TEXTURE_SHADOW_MAP               },
       { o(ARB_draw_buffers_blend),           PIPE_CAP_INDEP_BLEND_FUNC                 },
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index 26e1c21f6c5..b3700406df0 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -61,6 +61,8 @@ compute_num_levels(struct gl_context *ctx,
 
    numLevels = texObj->BaseLevel + baseImage->MaxNumLevels;
    numLevels = MIN2(numLevels, (GLuint) texObj->MaxLevel + 1);
+   if (texObj->Immutable)
+      numLevels = MIN2(numLevels, texObj->NumLevels);
    assert(numLevels >= 1);
 
    return numLevels;
@@ -99,38 +101,40 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
     */
    stObj->lastLevel = lastLevel;
 
-   if (pt->last_level < lastLevel) {
-      /* The current gallium texture doesn't have space for all the
-       * mipmap levels we need to generate.  So allocate a new texture.
-       */
-      struct pipe_resource *oldTex = stObj->pt;
-
-      /* create new texture with space for more levels */
-      stObj->pt = st_texture_create(st,
-                                    oldTex->target,
-                                    oldTex->format,
-                                    lastLevel,
-                                    oldTex->width0,
-                                    oldTex->height0,
-                                    oldTex->depth0,
-                                    oldTex->array_size,
-                                    0,
-                                    oldTex->bind);
-
-      /* This will copy the old texture's base image into the new texture
-       * which we just allocated.
-       */
-      st_finalize_texture(ctx, st->pipe, texObj);
-
-      /* release the old tex (will likely be freed too) */
-      pipe_resource_reference(&oldTex, NULL);
-      st_texture_release_all_sampler_views(st, stObj);
-   }
-   else {
-      /* Make sure that the base texture image data is present in the
-       * texture buffer.
-       */
-      st_finalize_texture(ctx, st->pipe, texObj);
+   if (!texObj->Immutable) {
+      if (pt->last_level < lastLevel) {
+         /* The current gallium texture doesn't have space for all the
+         * mipmap levels we need to generate.  So allocate a new texture.
+         */
+         struct pipe_resource *oldTex = stObj->pt;
+
+         /* create new texture with space for more levels */
+         stObj->pt = st_texture_create(st,
+                                       oldTex->target,
+                                       oldTex->format,
+                                       lastLevel,
+                                       oldTex->width0,
+                                       oldTex->height0,
+                                       oldTex->depth0,
+                                       oldTex->array_size,
+                                       0,
+                                       oldTex->bind);
+
+         /* This will copy the old texture's base image into the new texture
+         * which we just allocated.
+         */
+         st_finalize_texture(ctx, st->pipe, texObj);
+
+         /* release the old tex (will likely be freed too) */
+         pipe_resource_reference(&oldTex, NULL);
+         st_texture_release_all_sampler_views(st, stObj);
+      }
+      else {
+         /* Make sure that the base texture image data is present in the
+         * texture buffer.
+         */
+         st_finalize_texture(ctx, st->pipe, texObj);
+      }
    }
 
    pt = stObj->pt;
diff --git a/src/mesa/tnl/t_rasterpos.c b/src/mesa/tnl/t_rasterpos.c
deleted file mode 100644
index 4bd9ac8539e..00000000000
--- a/src/mesa/tnl/t_rasterpos.c
+++ /dev/null
@@ -1,478 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-#include "c99_math.h"
-#include "main/glheader.h"
-#include "main/feedback.h"
-#include "main/light.h"
-#include "main/macros.h"
-#include "util/simple_list.h"
-#include "main/mtypes.h"
-#include "main/viewport.h"
-
-#include "math/m_matrix.h"
-#include "tnl/tnl.h"
-
-
-
-/**
- * Clip a point against the view volume.
- *
- * \param v vertex vector describing the point to clip.
- * 
- * \return zero if outside view volume, or one if inside.
- */
-static GLuint
-viewclip_point_xy( const GLfloat v[] )
-{
-   if (   v[0] > v[3] || v[0] < -v[3]
-       || v[1] > v[3] || v[1] < -v[3] ) {
-      return 0;
-   }
-   else {
-      return 1;
-   }
-}
-
-
-/**
- * Clip a point against the far/near Z clipping planes.
- *
- * \param v vertex vector describing the point to clip.
- * 
- * \return zero if outside view volume, or one if inside.
- */
-static GLuint
-viewclip_point_z( const GLfloat v[] )
-{
-   if (v[2] > v[3] || v[2] < -v[3] ) {
-      return 0;
-   }
-   else {
-      return 1;
-   }
-}
-
-
-/**
- * Clip a point against the user clipping planes.
- * 
- * \param ctx GL context.
- * \param v vertex vector describing the point to clip.
- * 
- * \return zero if the point was clipped, or one otherwise.
- */
-static GLuint
-userclip_point( struct gl_context *ctx, const GLfloat v[] )
-{
-   GLuint p;
-
-   for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
-      if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
-	 GLfloat dot = v[0] * ctx->Transform._ClipUserPlane[p][0]
-		     + v[1] * ctx->Transform._ClipUserPlane[p][1]
-		     + v[2] * ctx->Transform._ClipUserPlane[p][2]
-		     + v[3] * ctx->Transform._ClipUserPlane[p][3];
-         if (dot < 0.0F) {
-            return 0;
-         }
-      }
-   }
-
-   return 1;
-}
-
-
-/**
- * Compute lighting for the raster position.  RGB modes computed.
- * \param ctx the context
- * \param vertex vertex location
- * \param normal normal vector
- * \param Rcolor returned color
- * \param Rspec returned specular color (if separate specular enabled)
- */
-static void
-shade_rastpos(struct gl_context *ctx,
-              const GLfloat vertex[4],
-              const GLfloat normal[3],
-              GLfloat Rcolor[4],
-              GLfloat Rspec[4])
-{
-   /*const*/ GLfloat (*base)[3] = ctx->Light._BaseColor;
-   const struct gl_light *light;
-   GLfloat diffuseColor[4], specularColor[4];  /* for RGB mode only */
-
-   COPY_3V(diffuseColor, base[0]);
-   diffuseColor[3] = CLAMP( 
-      ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_DIFFUSE][3], 0.0F, 1.0F );
-   ASSIGN_4V(specularColor, 0.0, 0.0, 0.0, 1.0);
-
-   foreach (light, &ctx->Light.EnabledList) {
-      GLfloat attenuation = 1.0;
-      GLfloat VP[3]; /* vector from vertex to light pos */
-      GLfloat n_dot_VP;
-      GLfloat diffuseContrib[3], specularContrib[3];
-
-      if (!(light->_Flags & LIGHT_POSITIONAL)) {
-         /* light at infinity */
-	 COPY_3V(VP, light->_VP_inf_norm);
-	 attenuation = light->_VP_inf_spot_attenuation;
-      }
-      else {
-         /* local/positional light */
-	 GLfloat d;
-
-         /* VP = vector from vertex pos to light[i].pos */
-	 SUB_3V(VP, light->_Position, vertex);
-         /* d = length(VP) */
-	 d = (GLfloat) LEN_3FV( VP );
-	 if (d > 1.0e-6F) {
-            /* normalize VP */
-	    GLfloat invd = 1.0F / d;
-	    SELF_SCALE_SCALAR_3V(VP, invd);
-	 }
-
-         /* atti */
-	 attenuation = 1.0F / (light->ConstantAttenuation + d *
-			       (light->LinearAttenuation + d *
-				light->QuadraticAttenuation));
-
-	 if (light->_Flags & LIGHT_SPOT) {
-	    GLfloat PV_dot_dir = - DOT3(VP, light->_NormSpotDirection);
-
-	    if (PV_dot_dir<light->_CosCutoff) {
-	       continue;
-	    }
-	    else {
-               GLfloat spot = powf(PV_dot_dir, light->SpotExponent);
-	       attenuation *= spot;
-	    }
-	 }
-      }
-
-      if (attenuation < 1e-3F)
-	 continue;
-
-      n_dot_VP = DOT3( normal, VP );
-
-      if (n_dot_VP < 0.0F) {
-	 ACC_SCALE_SCALAR_3V(diffuseColor, attenuation, light->_MatAmbient[0]);
-	 continue;
-      }
-
-      /* Ambient + diffuse */
-      COPY_3V(diffuseContrib, light->_MatAmbient[0]);
-      ACC_SCALE_SCALAR_3V(diffuseContrib, n_dot_VP, light->_MatDiffuse[0]);
-
-      /* Specular */
-      {
-         const GLfloat *h;
-         GLfloat n_dot_h;
-
-         ASSIGN_3V(specularContrib, 0.0, 0.0, 0.0);
-
-	 if (ctx->Light.Model.LocalViewer) {
-	    GLfloat v[3];
-	    COPY_3V(v, vertex);
-	    NORMALIZE_3FV(v);
-	    SUB_3V(VP, VP, v);
-            NORMALIZE_3FV(VP);
-	    h = VP;
-	 }
-	 else if (light->_Flags & LIGHT_POSITIONAL) {
-	    ACC_3V(VP, ctx->_EyeZDir);
-            NORMALIZE_3FV(VP);
-	    h = VP;
-	 }
-         else {
-	    h = light->_h_inf_norm;
-	 }
-
-	 n_dot_h = DOT3(normal, h);
-
-	 if (n_dot_h > 0.0F) {
-	    GLfloat shine;
-	    GLfloat spec_coef;
-
-	    shine = ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_SHININESS][0];
-	    spec_coef = powf(n_dot_h, shine);
-
-	    if (spec_coef > 1.0e-10F) {
-               if (ctx->Light.Model.ColorControl==GL_SEPARATE_SPECULAR_COLOR) {
-                  ACC_SCALE_SCALAR_3V( specularContrib, spec_coef,
-                                       light->_MatSpecular[0]);
-               }
-               else {
-                  ACC_SCALE_SCALAR_3V( diffuseContrib, spec_coef,
-                                       light->_MatSpecular[0]);
-               }
-	    }
-	 }
-      }
-
-      ACC_SCALE_SCALAR_3V( diffuseColor, attenuation, diffuseContrib );
-      ACC_SCALE_SCALAR_3V( specularColor, attenuation, specularContrib );
-   }
-
-   Rcolor[0] = CLAMP(diffuseColor[0], 0.0F, 1.0F);
-   Rcolor[1] = CLAMP(diffuseColor[1], 0.0F, 1.0F);
-   Rcolor[2] = CLAMP(diffuseColor[2], 0.0F, 1.0F);
-   Rcolor[3] = CLAMP(diffuseColor[3], 0.0F, 1.0F);
-   Rspec[0] = CLAMP(specularColor[0], 0.0F, 1.0F);
-   Rspec[1] = CLAMP(specularColor[1], 0.0F, 1.0F);
-   Rspec[2] = CLAMP(specularColor[2], 0.0F, 1.0F);
-   Rspec[3] = CLAMP(specularColor[3], 0.0F, 1.0F);
-}
-
-
-/**
- * Do texgen needed for glRasterPos.
- * \param ctx  rendering context
- * \param vObj  object-space vertex coordinate
- * \param vEye  eye-space vertex coordinate
- * \param normal  vertex normal
- * \param unit  texture unit number
- * \param texcoord  incoming texcoord and resulting texcoord
- */
-static void
-compute_texgen(struct gl_context *ctx, const GLfloat vObj[4], const GLfloat vEye[4],
-               const GLfloat normal[3], GLuint unit, GLfloat texcoord[4])
-{
-   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-
-   /* always compute sphere map terms, just in case */
-   GLfloat u[3], two_nu, rx, ry, rz, m, mInv;
-   COPY_3V(u, vEye);
-   NORMALIZE_3FV(u);
-   two_nu = 2.0F * DOT3(normal, u);
-   rx = u[0] - normal[0] * two_nu;
-   ry = u[1] - normal[1] * two_nu;
-   rz = u[2] - normal[2] * two_nu;
-   m = rx * rx + ry * ry + (rz + 1.0F) * (rz + 1.0F);
-   if (m > 0.0F)
-      mInv = 0.5F * (1.0f / sqrtf(m));
-   else
-      mInv = 0.0F;
-
-   if (texUnit->TexGenEnabled & S_BIT) {
-      switch (texUnit->GenS.Mode) {
-         case GL_OBJECT_LINEAR:
-            texcoord[0] = DOT4(vObj, texUnit->GenS.ObjectPlane);
-            break;
-         case GL_EYE_LINEAR:
-            texcoord[0] = DOT4(vEye, texUnit->GenS.EyePlane);
-            break;
-         case GL_SPHERE_MAP:
-            texcoord[0] = rx * mInv + 0.5F;
-            break;
-         case GL_REFLECTION_MAP:
-            texcoord[0] = rx;
-            break;
-         case GL_NORMAL_MAP:
-            texcoord[0] = normal[0];
-            break;
-         default:
-            _mesa_problem(ctx, "Bad S texgen in compute_texgen()");
-            return;
-      }
-   }
-
-   if (texUnit->TexGenEnabled & T_BIT) {
-      switch (texUnit->GenT.Mode) {
-         case GL_OBJECT_LINEAR:
-            texcoord[1] = DOT4(vObj, texUnit->GenT.ObjectPlane);
-            break;
-         case GL_EYE_LINEAR:
-            texcoord[1] = DOT4(vEye, texUnit->GenT.EyePlane);
-            break;
-         case GL_SPHERE_MAP:
-            texcoord[1] = ry * mInv + 0.5F;
-            break;
-         case GL_REFLECTION_MAP:
-            texcoord[1] = ry;
-            break;
-         case GL_NORMAL_MAP:
-            texcoord[1] = normal[1];
-            break;
-         default:
-            _mesa_problem(ctx, "Bad T texgen in compute_texgen()");
-            return;
-      }
-   }
-
-   if (texUnit->TexGenEnabled & R_BIT) {
-      switch (texUnit->GenR.Mode) {
-         case GL_OBJECT_LINEAR:
-            texcoord[2] = DOT4(vObj, texUnit->GenR.ObjectPlane);
-            break;
-         case GL_EYE_LINEAR:
-            texcoord[2] = DOT4(vEye, texUnit->GenR.EyePlane);
-            break;
-         case GL_REFLECTION_MAP:
-            texcoord[2] = rz;
-            break;
-         case GL_NORMAL_MAP:
-            texcoord[2] = normal[2];
-            break;
-         default:
-            _mesa_problem(ctx, "Bad R texgen in compute_texgen()");
-            return;
-      }
-   }
-
-   if (texUnit->TexGenEnabled & Q_BIT) {
-      switch (texUnit->GenQ.Mode) {
-         case GL_OBJECT_LINEAR:
-            texcoord[3] = DOT4(vObj, texUnit->GenQ.ObjectPlane);
-            break;
-         case GL_EYE_LINEAR:
-            texcoord[3] = DOT4(vEye, texUnit->GenQ.EyePlane);
-            break;
-         default:
-            _mesa_problem(ctx, "Bad Q texgen in compute_texgen()");
-            return;
-      }
-   }
-}
-
-
-/**
- * glRasterPos transformation.  Typically called via ctx->Driver.RasterPos().
- * XXX some of this code (such as viewport xform, clip testing and setting
- * of ctx->Current.Raster* fields) could get lifted up into the
- * main/rasterpos.c code.
- *
- * \param vObj  vertex position in object space
- */
-void
-_tnl_RasterPos(struct gl_context *ctx, const GLfloat vObj[4])
-{
-   if (ctx->VertexProgram._Enabled) {
-      /* XXX implement this */
-      _mesa_problem(ctx, "Vertex programs not implemented for glRasterPos");
-      return;
-   }
-   else {
-      GLfloat eye[4], clip[4], ndc[3], d;
-      GLfloat *norm, eyenorm[3];
-      GLfloat *objnorm = ctx->Current.Attrib[VERT_ATTRIB_NORMAL];
-      float scale[3], translate[3];
-
-      /* apply modelview matrix:  eye = MV * obj */
-      TRANSFORM_POINT( eye, ctx->ModelviewMatrixStack.Top->m, vObj );
-      /* apply projection matrix:  clip = Proj * eye */
-      TRANSFORM_POINT( clip, ctx->ProjectionMatrixStack.Top->m, eye );
-
-      /* clip to view volume. */
-      if (!ctx->Transform.DepthClamp) {
-         if (viewclip_point_z(clip) == 0) {
-            ctx->Current.RasterPosValid = GL_FALSE;
-            return;
-         }
-      }
-      if (!ctx->Transform.RasterPositionUnclipped) {
-         if (viewclip_point_xy(clip) == 0) {
-            ctx->Current.RasterPosValid = GL_FALSE;
-            return;
-         }
-      }
-
-      /* clip to user clipping planes */
-      if (ctx->Transform.ClipPlanesEnabled && !userclip_point(ctx, clip)) {
-         ctx->Current.RasterPosValid = GL_FALSE;
-         return;
-      }
-
-      /* ndc = clip / W */
-      d = (clip[3] == 0.0F) ? 1.0F : 1.0F / clip[3];
-      ndc[0] = clip[0] * d;
-      ndc[1] = clip[1] * d;
-      ndc[2] = clip[2] * d;
-      /* wincoord = viewport_mapping(ndc) */
-      _mesa_get_viewport_xform(ctx, 0, scale, translate);
-      ctx->Current.RasterPos[0] = ndc[0] * scale[0] + translate[0];
-      ctx->Current.RasterPos[1] = ndc[1] * scale[1] + translate[1];
-      ctx->Current.RasterPos[2] = ndc[2] * scale[2] + translate[2];
-      ctx->Current.RasterPos[3] = clip[3];
-
-      if (ctx->Transform.DepthClamp) {
-	 ctx->Current.RasterPos[3] = CLAMP(ctx->Current.RasterPos[3],
-					   ctx->ViewportArray[0].Near,
-					   ctx->ViewportArray[0].Far);
-      }
-
-      /* compute raster distance */
-      if (ctx->Fog.FogCoordinateSource == GL_FOG_COORDINATE_EXT)
-         ctx->Current.RasterDistance = ctx->Current.Attrib[VERT_ATTRIB_FOG][0];
-      else
-         ctx->Current.RasterDistance =
-                        sqrtf( eye[0]*eye[0] + eye[1]*eye[1] + eye[2]*eye[2] );
-
-      /* compute transformed normal vector (for lighting or texgen) */
-      if (ctx->_NeedEyeCoords) {
-         const GLfloat *inv = ctx->ModelviewMatrixStack.Top->inv;
-         TRANSFORM_NORMAL( eyenorm, objnorm, inv );
-         norm = eyenorm;
-      }
-      else {
-         norm = objnorm;
-      }
-
-      /* update raster color */
-      if (ctx->Light.Enabled) {
-         /* lighting */
-         shade_rastpos( ctx, vObj, norm,
-                        ctx->Current.RasterColor,
-                        ctx->Current.RasterSecondaryColor );
-      }
-      else {
-         /* use current color */
-	 COPY_4FV(ctx->Current.RasterColor,
-		  ctx->Current.Attrib[VERT_ATTRIB_COLOR0]);
-	 COPY_4FV(ctx->Current.RasterSecondaryColor,
-		  ctx->Current.Attrib[VERT_ATTRIB_COLOR1]);
-      }
-
-      /* texture coords */
-      {
-         GLuint u;
-         for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
-            GLfloat tc[4];
-            COPY_4V(tc, ctx->Current.Attrib[VERT_ATTRIB_TEX0 + u]);
-            if (ctx->Texture.Unit[u].TexGenEnabled) {
-               compute_texgen(ctx, vObj, eye, norm, u, tc);
-            }
-            TRANSFORM_POINT(ctx->Current.RasterTexCoords[u],
-                            ctx->TextureMatrixStack[u].Top->m, tc);
-         }
-      }
-
-      ctx->Current.RasterPosValid = GL_TRUE;
-   }
-
-   if (ctx->RenderMode == GL_SELECT) {
-      _mesa_update_hitflag( ctx, ctx->Current.RasterPos[2] );
-   }
-}
diff --git a/src/mesa/vbo/vbo_context.h b/src/mesa/vbo/vbo_context.h
index e6b9d890d5f..6293a8b9edc 100644
--- a/src/mesa/vbo/vbo_context.h
+++ b/src/mesa/vbo/vbo_context.h
@@ -207,7 +207,8 @@ vbo_compute_max_verts(const struct vbo_exec_context *exec)
 {
    unsigned n = (VBO_VERT_BUFFER_SIZE - exec->vtx.buffer_used) /
       (exec->vtx.vertex_size * sizeof(GLfloat));
-   assert(n > 0);
+   if (n == 0)
+      return 0;
    /* Subtract one so we're always sure to have room for an extra
     * vertex for GL_LINE_LOOP -> GL_LINE_STRIP conversion.
     */
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index a23d5aa08aa..a614b26cae4 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -132,8 +132,7 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec )
 static void
 vbo_exec_vtx_wrap(struct vbo_exec_context *exec)
 {
-   fi_type *data = exec->vtx.copied.buffer;
-   GLuint i;
+   unsigned numComponents;
 
    /* Run pipeline on current vertices, copy wrapped vertices
     * to exec->vtx.copied.
@@ -149,13 +148,12 @@ vbo_exec_vtx_wrap(struct vbo_exec_context *exec)
     */
    assert(exec->vtx.max_vert - exec->vtx.vert_count > exec->vtx.copied.nr);
 
-   for (i = 0 ; i < exec->vtx.copied.nr ; i++) {
-      memcpy( exec->vtx.buffer_ptr, data, 
-	      exec->vtx.vertex_size * sizeof(GLfloat));
-      exec->vtx.buffer_ptr += exec->vtx.vertex_size;
-      data += exec->vtx.vertex_size;
-      exec->vtx.vert_count++;
-   }
+   numComponents = exec->vtx.copied.nr * exec->vtx.vertex_size;
+   memcpy(exec->vtx.buffer_ptr,
+          exec->vtx.copied.buffer,
+          numComponents * sizeof(fi_type));
+   exec->vtx.buffer_ptr += numComponents;
+   exec->vtx.vert_count += exec->vtx.copied.nr;
 
    exec->vtx.copied.nr = 0;
 }
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 34d2c1d3d6b..e27fdd90532 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -1807,13 +1807,20 @@ vbo_initialize_exec_dispatch(const struct gl_context *ctx,
       SET_EvalMesh2(exec, vbo_exec_EvalMesh2);
    }
 
-   if (_mesa_is_desktop_gl(ctx)) {
+   if (ctx->API != API_OPENGLES &&
+       ctx->Extensions.ARB_draw_elements_base_vertex) {
       SET_DrawElementsBaseVertex(exec, vbo_exec_DrawElementsBaseVertex);
-      SET_DrawRangeElementsBaseVertex(exec, vbo_exec_DrawRangeElementsBaseVertex);
       SET_MultiDrawElementsBaseVertex(exec, vbo_exec_MultiDrawElementsBaseVertex);
+
+      if (_mesa_is_desktop_gl(ctx) || _mesa_is_gles3(ctx)) {
+         SET_DrawRangeElementsBaseVertex(exec, vbo_exec_DrawRangeElementsBaseVertex);
+         SET_DrawElementsInstancedBaseVertex(exec, vbo_exec_DrawElementsInstancedBaseVertex);
+      }
+   }
+
+   if (_mesa_is_desktop_gl(ctx)) {
       SET_DrawArraysInstancedBaseInstance(exec, vbo_exec_DrawArraysInstancedBaseInstance);
       SET_DrawElementsInstancedBaseInstance(exec, vbo_exec_DrawElementsInstancedBaseInstance);
-      SET_DrawElementsInstancedBaseVertex(exec, vbo_exec_DrawElementsInstancedBaseVertex);
       SET_DrawElementsInstancedBaseVertexBaseInstance(exec, vbo_exec_DrawElementsInstancedBaseVertexBaseInstance);
    }
 
diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index d49aa15b1b7..97a1dfdeb3f 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -601,8 +601,7 @@ static void
 _save_wrap_filled_vertex(struct gl_context *ctx)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
-   fi_type *data = save->copied.buffer;
-   GLuint i;
+   unsigned numComponents;
 
    /* Emit a glEnd to close off the last vertex list.
     */
@@ -612,12 +611,12 @@ _save_wrap_filled_vertex(struct gl_context *ctx)
     */
    assert(save->max_vert - save->vert_count > save->copied.nr);
 
-   for (i = 0; i < save->copied.nr; i++) {
-      memcpy(save->buffer_ptr, data, save->vertex_size * sizeof(GLfloat));
-      data += save->vertex_size;
-      save->buffer_ptr += save->vertex_size;
-      save->vert_count++;
-   }
+   numComponents = save->copied.nr * save->vertex_size;
+   memcpy(save->buffer_ptr,
+          save->copied.buffer,
+          numComponents * sizeof(fi_type));
+   save->buffer_ptr += numComponents;
+   save->vert_count += save->copied.nr;
 }
 
 
diff --git a/src/vulkan/anv_device.c b/src/vulkan/anv_device.c
index 3ab2a245de4..26d0fe57a42 100644
--- a/src/vulkan/anv_device.c
+++ b/src/vulkan/anv_device.c
@@ -74,7 +74,7 @@ anv_physical_device_init(struct anv_physical_device *device,
    }
 
    device->name = brw_get_device_name(device->chipset_id);
-   device->info = brw_get_device_info(device->chipset_id, -1);
+   device->info = brw_get_device_info(device->chipset_id);
    if (!device->info) {
       result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
                          "failed to get device info");